def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    # first pass: fetch each URL and extract the readable article body
    for url in url_list:
        req = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(summary)
    # second pass: convert each extracted article to Markdown
    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title: %s \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
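# A minimal usage sketch for markdownify, assuming urllib2, html2text and
# readability's Document/build_doc are importable in this module. The URLs and
# the output filename are hypothetical placeholders.
if __name__ == '__main__':
    urls = ['http://example.com/article-page-1', 'http://example.com/article-page-2']
    md = markdownify(urls, paragraph_links=True, wrap_text=False, preamble=True)
    with open('article.md', 'wb') as out:
        out.write(md)  # markdownify returns UTF-8 encoded bytes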
def __init__(self, raw, ident):
    self.raw = raw
    self.ident = ident
    self.featval = {}
    self.html = build_doc(raw)
    self.doc = Document(raw)
    self.content = content(self.doc.summary())
    self.sents = sent_tokenize(self.content)
def __init__(self, html_string):
    # use readability's `build_doc` func to avoid encoding errors
    self.html, _ = htmls.build_doc(html_string)
    self.title = self.get_title(self.html)
    # use self.html when self.html.body does not exist
    try:
        self.body = cleaner.clean_html(self.html.body)
    except Exception as e:
        self.body = cleaner.clean_html(self.html)
        logger.warn(repr(e))
    self.prepend_newline()
def get_review_page(url, page):
    review_url = url + '/review_more?pageno=%s' % page
    html_source, _ = curl(review_url)
    if html_source:
        # strip <br> tags so each comment collapses onto a single line
        html_source = re.sub(r'<br\s*>|<br/>', ' ', html_source)
        unicode_source = html_source.decode('utf-8', 'ignore')
        doc = build_doc(unicode_source)
        comment_list = doc.xpath("//div[@class='J_brief-cont']/text()")
        time_list = doc.xpath("//span[@class='time']/text()")
        rank_list = map(lambda x: rank_dict[x],
                        doc.xpath("//div[@class='user-info']/span[1]/@title"))
        user_list = doc.xpath("//p[@class='name']/a[@class='J_card']/text()")
        return zip(user_list, rank_list, time_list, comment_list)
def get_reviews(xlsfile, url):
    start = datetime(2000, 7, 1)
    end = datetime(2013, 8, 16)
    review_url = url + '/review_more?pageno=1'
    html_source, _ = curl(review_url)
    if html_source:
        unicode_source = html_source.decode('utf-8', 'ignore')
        doc = build_doc(unicode_source)
        name = doc.xpath("//div[@class='info-name']/h2/a/text()")
        account = doc.xpath("//span[@class='active']/em[1]/text()")
        account = re.search(r'\d+', str(account))
        account = int(account.group(0)) if account else 0
        pages = how_many_pages(account, 20)
        xlsfile.append(name)
        for page in range(1, pages + 1):
            too_old = False
            lines = get_review_page(url, page)
            for line in lines:
                # parse the review date; 2-part dates lack a year and default to 2013
                time = re.findall(r'\d+', line[2])
                length = len(time)
                if length == 2 or length > 3:
                    month = int(time[0])
                    day = int(time[1])
                    year = 2013
                elif length == 3:
                    year = int('20' + time[0])
                    month = int(time[1])
                    day = int(time[2])
                comment_date = datetime(year, month, day)
                if comment_date < end:
                    if comment_date < start:
                        too_old = True
                        break
                    line = list(line)
                    line[2] = comment_date.date()
                    line.insert(0, name[0])
                    print line
                    print time
                    print year, month, day
                    xlsfile.append(line)
            sleep(2)
            if too_old:
                break
        xlsfile.append([' '])
        xlsfile.append([' '])
def _parse(self, input):
    doc, self.encoding = build_doc(input)
    doc = html_cleaner.clean_html(doc)
    base_href = self.url
    if base_href:
        # trying to guard against bad links like <a href="http://[http://...">
        try:
            # such support is added in lxml 3.3.0
            doc.make_links_absolute(base_href, resolve_base_href=True,
                                    handle_failures='discard')
        except TypeError:
            # make_links_absolute() got an unexpected keyword argument 'handle_failures'
            # then we have lxml < 3.3.0
            # please upgrade to lxml >= 3.3.0 if you're failing here!
            doc.make_links_absolute(base_href, resolve_base_href=True)
    else:
        doc.resolve_base_href()
    return doc
def split_page(domain, html_source, encode="utf-8"):
    doc = build_doc(html_source)
    if domain == 'mtime.com':
        sub_page_nodes = doc.xpath(".//div[@class='t_module']")
    elif domain == 'douban.com':
        sub_page_nodes = doc.xpath(".//div[@class='comment-item']")
    for sub_page_node in sub_page_nodes:
        html = u"""
        <html>
        <body>
        %s
        </body>
        </html>
        """ % tostring(sub_page_node).decode('utf-8')
        yield html.encode(encode)
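# A minimal sketch of how split_page might be consumed, assuming curl() returns
# (html_source, status) as in the surrounding functions; the douban.com domain key
# is one of the two handled above and the function name is hypothetical.
def count_comment_blocks(url):
    html_source, _ = curl(url)
    if not html_source:
        return 0
    # each yielded chunk is a standalone <html> document wrapping one comment node
    return sum(1 for _ in split_page('douban.com', html_source))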
def main():
    with open("stat.log", 'w') as f:
        for line in sys.stdin:
            url = line.rstrip('\n')
            html_source, _ = curl(url)
            if not html_source:
                print "ERROR:", url
                continue
            unicode_source = html_source.decode('gbk')
            # Tieba error pages: "the post you visited does not exist" / "the post you visited has been hidden"
            if u'你访问的贴子不存在' in unicode_source or u'你访问的贴子被隐藏' in unicode_source:
                print "ERROR:", url
                continue
            else:
                print url
            doc = build_doc(html_source)
            tie_ba = doc.xpath(XPATH)
            f.write(tie_ba[0] + '\n')
def predefined_site(url, html):
    print(url, "=====================predefined_site?")
    pds = predefine_sites_collection.find()
    for pd in pds:
        if re.compile(pd['url_pattern'], re.I).search(url) is not None:
            print(url, "=====================predefined_site")
            doc = htmls.build_doc(html)
            # scan every element and match its class/id string against the configured content selector
            for elem in doc.iter():
                s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
                if re.compile(pd['content_css']).search(s) is not None:
                    print(url, "=====================predefined_site",
                          htmls.get_keywords(doc) + ',' + ','.join(pd['tags']))
                    return Summary(get_clean_html(elem), '',
                                   short_title=htmls.shorten_title(doc),
                                   title=htmls.get_title(doc),
                                   description=htmls.get_description(doc),
                                   keywords=htmls.get_keywords(doc) + ',' + ','.join(pd['tags']))
    return None
def get_comments(url):
    # per-domain XPaths: entry [0] extracts comment text, entry [1] extracts pagination links
    xpaths = {
        '21cn.com': [("//td[@class='t_f']/text()",), ("//div[@class='pg']//a/@href",)],
        '55bbs.com': [("//div[@class='t_msgfont']/text()",), ("//div[@class='pages']//a/@href",)],
        #'lirenn.55bbs.com': [("//dd[@class='clearfix']//p/text()",), ],
        'lady8844.com': [("//td[@class='t_msgfont']/text()",), ("//div[@class='pages']//a/@href",)],
        'onlylady.com': [("//td[@class='t_f']/text()",), ("//div[@class='pg']//a/@href",)],
        'pclady.com.cn': [("//div[@class='replyBody']/text()",), ("//div[@class='pager']//a/@href",)],
        'yoka.com': [("//td[@class='con_content']/text()",), ("//dl[@class='bbs_Page']/dt//a/@href",)],  # ajax
    }
    domain = get_domain(url)
    parse_result = urlparse.urlparse(url)
    page_urls = []
    prefix = parse_result.scheme + '://' + parse_result.netloc
    if domain not in xpaths:
        return [], []
    if (domain == '55bbs.com' and 'liren' in url) or url in crawed_urls:
        return [], page_urls
    xpath = xpaths[domain]
    html_source, _ = curl(url)
    if not html_source:
        return [], []
    print domain, url
    crawed_urls.add(url)
    doc = build_doc(html_source)
    comments = doc.xpath(xpath[0][0])
    pages = doc.xpath(xpath[1][0])
    # normalize pagination links to absolute URLs and skip pages already crawled
    for page in pages:
        if page.startswith('/'):
            page = prefix + page
        if not page.startswith('http'):
            page = prefix + '/' + page
        if page not in crawed_urls:
            page_urls.append(page)
    return comments, page_urls
def get_comments(source_url):
    queue = deque()
    comments = []
    domain = get_domain(source_url)
    crawled_url = set()
    # phase 1: crawl the full-review listing pages and collect review URLs
    if domain == 'mtime.com':
        url = source_url + '/comment.html'
    elif domain == 'douban.com':
        url = source_url + "/reviews"
    queue.append(url)
    comment_urls = set()
    while len(queue):
        url = queue.popleft()
        if url in crawled_url:
            continue
        print url
        html_source, _ = curl(url)
        if not html_source:
            time.sleep(2 * 60)
            continue
        time.sleep(1.2)
        crawled_url.add(url)
        doc = build_doc(html_source)
        pages = doc.xpath(xpath[domain][0][1][0])
        pages = build_url(url, pages)
        queue.extend(pages)
        comment_urls |= set(doc.xpath(xpath[domain][0][0][0]))
    for comment_url in comment_urls:
        print comment_url
        comment_source, _ = curl(comment_url)
        if not comment_source:
            time.sleep(2 * 60)
            continue
        time.sleep(1.2)
        doc = build_doc(comment_source)
        title = u' '.join(doc.xpath(xpath[domain][0][2][0]))
        author = u" ".join(doc.xpath(xpath[domain][0][2][1]))
        created_on = u' '.join(doc.xpath(xpath[domain][0][2][2]))
        content = u' '.join(doc.xpath(xpath[domain][0][2][3]))
        #comments.append((title, author, created_on, content))
        yield (title, author, created_on, content)
    # phase 2: crawl the short-comment listing pages and parse each comment block
    if domain == 'mtime.com':
        url = source_url + '/newshortcomment.html'
    elif domain == 'douban.com':
        url = source_url + "/comments"
    queue.append(url)
    while len(queue):
        url = queue.popleft()
        if url in crawled_url:
            continue
        html_source, _ = curl(url)
        if not html_source:
            time.sleep(2 * 60)
            queue.appendleft(url)
            continue
        print url
        time.sleep(1.2)
        crawled_url.add(url)
        doc = build_doc(html_source)
        pages = doc.xpath(xpath[domain][0][1][0])
        pages = build_url(url, pages)
        queue.extend(pages)
        for page_source in split_page(domain, html_source):
            doc = build_doc(page_source)
            author = u' '.join(doc.xpath(xpath[domain][1][2][0]))
            created_on = u' '.join(doc.xpath(xpath[domain][1][2][1]))
            content = u' '.join(doc.xpath(xpath[domain][1][2][2]))
            #comments.append((u"", author, created_on, content))
            yield (u"", author, created_on, content)
def main():
    start = datetime(2013, 6, 16)
    end = datetime(2013, 6, 17)
    f = open('stat-0616.csv', 'w')  # newly generated output file, eventually sent to the consultant
    f2 = codecs.open(u'中国最强音.csv', 'r', 'gbk')  # input file generated by query_from_pg.py
    writer = csv.writer(f)
    parsed_set = {}
    for line in f2:
        row = line.split(',')
        print row[1], row[0]
        if 'tieba' not in row[0]:
            continue
        topic_url = get_url_with_qs(row[0])
        if topic_url not in parsed_set:
            html_source, _ = curl(topic_url)
            if html_source:
                unicode_source = html_source.decode('gbk', 'ignore')
                # Tieba error pages: "the post you visited does not exist" / "the post you visited has been hidden"
                if u'你访问的贴子不存在' in unicode_source or u'你访问的贴子被隐藏' in unicode_source:
                    print "ERROR:", row[0]
                    continue
                doc = build_doc(unicode_source)
                title = doc.xpath("//h1//text()")
                if "#" in row[0]:
                    title = u"[回复]" + title[0] if len(title) > 0 else ''
                else:
                    title = title[0] if len(title) > 0 else ''
                comment_num = doc.xpath("//span[@id='comment_num']/text()|(//li[@class='l_reply_num'])[3]/span/text()")
                comment_count = comment_num[0] if len(comment_num) > 0 else 0
                at_xpath = "//div[contains(@class,'l_post')]//@data-field"
                regex = re.compile(u'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2}).*?(?P<hour>\d{1,2}).*?(?P<minute>\d{1,2})')
                at = doc.xpath(at_xpath)
                matched = regex.search(str(at))
                kwargs = {}
                if matched:
                    for key, value in matched.groupdict().items():
                        if value:
                            kwargs[key] = int(value)
                    at_str = datetime(**kwargs).strftime("%Y-%m-%d %H:%M:%S")
                else:
                    at_str = ''
                line = [topic_url, title[4:], at_str, comment_count]
                line = [column.encode('gbk') if isinstance(column, unicode) else column for column in line]
                #line = map(lambda x: re.sub(u'\s+|\xa0', ' ', x.decode('utf-8')).encode('gbk', 'ignore') if isinstance(x, basestring) else x, line)
                parsed_set[topic_url] = {'title': title, 'comment_count': comment_count}
                at_time = datetime.strptime(at_str, "%Y-%m-%d %H:%M:%S")
                if start <= at_time < end:
                    writer.writerow(line)
                time.sleep(3)
        if topic_url in parsed_set:
            title = parsed_set[topic_url]['title']
            comment_count = parsed_set[topic_url]['comment_count']
            line = [row[0], title, row[1], comment_count]
            line = [column.encode('utf-8') if isinstance(column, unicode) else column for column in line]
            at_time = datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
            if start <= at_time < end:
                writer.writerow(line)
    f.close()
    f2.close()
def content(html):
    doc = build_doc(html)
    text = [clean_quotes(s.strip()) for s in doc.xpath('//text()')]
    text = [t.replace('\n', ' ') for t in text if t]
    text = ' '.join(text)
    return text
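# A small sketch pairing content() with readability's Document, mirroring the
# __init__ earlier in this section (content(self.doc.summary())). The sample
# helper name is hypothetical, and clean_quotes must be defined in this module.
def plain_text_summary(raw_html):
    # reduce the page to its main article, then flatten it to whitespace-normalized text
    return content(Document(raw_html).summary())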