# -*- coding: utf-8 -*-
# Assumed imports for this module. `headers`, `mobileurl`, `posturl`,
# `utils`, and `btc` are module-level names defined elsewhere in the repo.
import requests
from lxml import html, etree
from bs4 import BeautifulSoup


def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):

    def get_title(root):
        # Old-style posts keep the title in h3.tit_h3; Smart Editor posts
        # use h3.se_textarea, so fall back to that selector.
        try:
            title = root.xpath('//h3[@class="tit_h3"]/text()')[0].strip()
            if title:
                return title
        except IndexError:
            pass
        try:
            return root.xpath('//h3[@class="se_textarea"]/text()')[0].strip()
        except IndexError:
            return ''

    def get_page_html(url):
        try:
            page = requests.get(url, headers=headers)
            root = html.fromstring(page.content)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError as e:
            print 'Failed to fetch %s: %s' % (url, e)
            return (None, None)

    url = mobileurl % (blog_id, log_no)
    (doc, title) = get_page_html(url)

    if doc:
        crawled_time = utils.get_today_str()
        crawler_version = utils.get_version()
        post_tags = tags[(blog_id, log_no)]
        directory_seq = None    # NOTE: no directory sequence given for query crawler
        post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                                  crawler_version, title, written_time, url,
                                  post_tags, directory_seq)
        if not verbose:
            del post['directorySeq']
            del post['sympathyCount']
        return post
    else:
        print 'No doc in %s' % url
        return None
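# A minimal usage sketch for crawl_blog_post above. Everything here is a
# placeholder: the blog id and log number are invented, and `tags` is
# assumed to map (blog_id, log_no) tuples to tag lists, which is what the
# tags[(blog_id, log_no)] lookup inside crawl_blog_post implies.
def _demo_crawl_blog_post():
    sample_tags = {('example_blogger', '220000000000'): [u'travel', u'food']}
    post = crawl_blog_post('example_blogger', '220000000000', sample_tags,
                           written_time='2015-01-01', verbose=False)
    if post:
        print post['title']
    return post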
def make_structure(blog_id, log_no, raw, doc, crawled_time, crawler_version,
                   title, written_time, url, tags, directory_seq,
                   encoding='utf-8'):

    def extract_category(doc):
        # The category anchor carries either class or id "_categoryName",
        # depending on the post template.
        doc_node = doc.find("a", {"class": "_categoryName"})
        if doc_node is None:
            doc_node = doc.find("a", {"id": "_categoryName"})
        if doc_node is None:
            return ''
        return doc_node.get_text().encode(encoding)

    extract_content_html = lambda doc: doc.find("div", {"id": "viewTypeSelector"})

    def extract_sympathycount(doc):
        count_node = doc.find("em", {"id": "sympathyCount"})
        if count_node is None:
            return None
        return count_node.get_text()

    def extract_images(htmls=extract_content_html(doc)):
        # Appending 'w2' to the thumbnail URL requests a larger rendition
        # from the image server.
        image_urls = []
        for image in htmls.find_all("span", {"class": "_img _inl fx"}):
            image_urls.append(image['thumburl'] + 'w2')
        return image_urls

    return {u"blogId": blog_id,
            u"logNo": log_no,
            u"content": extract_content_html(doc).get_text().encode(encoding),
            u"contentHtml": str(extract_content_html(doc)),
            u"crawledTime": crawled_time,
            u"crawlerVersion": crawler_version,
            u"directorySeq": directory_seq,
            u"title": title,
            u"writtenTime": written_time,
            u"url": url,
            u"tags": tags,
            u"categoryName": extract_category(doc),
            u"sympathyCount": extract_sympathycount(doc),
            u"images": extract_images()}
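# A hedged sketch of calling make_structure directly on a saved post body.
# The HTML snippet is a fabricated minimal example shaped like the selectors
# above (div#viewTypeSelector, a._categoryName, em#sympathyCount); real
# Naver markup is considerably richer, and all argument values below are
# placeholders.
def _demo_make_structure():
    snippet = ('<div class="_postView">'
               '<a class="_categoryName">daily</a>'
               '<div id="viewTypeSelector">hello world</div>'
               '<em id="sympathyCount">3</em></div>')
    doc = BeautifulSoup(snippet, 'lxml')
    return make_structure('example_blogger', '220000000000', None, doc,
                          '2015-01-01', '0.1', u'hello', '2015-01-01',
                          'http://example.invalid/post', [u'daily'], None)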
# Variant of crawl_blog_post that lets lxml fetch and parse the URL
# directly, and that also accepts a full post URL in place of a blog id.
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):

    def get_title(root):
        return root.xpath('//h3[@class="tit_h3"]/text()')[0].strip()

    def get_page_html(url):
        try:
            root = html.parse(url)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError as e:
            print 'Failed to fetch %s: %s' % (url, e)
            return (None, None)

    if blog_id.startswith('http'):
        url = blog_id
    else:
        url = mobileurl % (blog_id, log_no)
    (doc, title) = get_page_html(url)

    if doc:
        crawled_time = utils.get_today_str()
        crawler_version = utils.get_version()
        url = posturl % (blog_id, log_no)
        post_tags = tags[(blog_id, log_no)]
        directory_seq = None    # NOTE: no directory sequence given for query crawler
        post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                                  crawler_version, title, written_time, url,
                                  post_tags, directory_seq)
        if not verbose:
            del post['directorySeq']
            del post['sympathyCount']
        return post
    else:
        print 'No doc in %s' % url
        return None
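# The variant above also accepts a full post URL as its first argument; a
# hypothetical call (the URL is a placeholder, and the tags dict is keyed
# by the URL because tags[(blog_id, log_no)] uses the first argument as-is):
#
#     url = 'http://m.blog.naver.com/example_blogger/220000000000'
#     sample_tags = {(url, '220000000000'): [u'travel']}
#     post = crawl_blog_post(url, '220000000000', sample_tags)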