def fetchArticleContent(self, html):
    """Fetch the article body text."""
    article_content_dict = {}
    try:
        # Try the known body layouts in order until one matches.
        article_content_elements = html.xpath(
            "//div[@class='article-body main-body']/p")
        if not article_content_elements:
            article_content_elements = html.xpath(
                "//div[@class='newsContent']/span[@id='newsCon']/p")
        if not article_content_elements:
            article_content_elements = html.xpath(
                "//div[@class='main-body']/p")
        if not article_content_elements:
            article_content_elements = html.xpath(
                "//div[@id='artibody']/p")
        article_contents = ""
        for article_content_element in article_content_elements:
            # Serialize each <p> back to markup and append it to the body.
            article_content = etree.tostring(article_content_element,
                                             encoding="UTF-8",
                                             pretty_print=False,
                                             method="html")
            article_contents = article_contents + article_content
        article_content_dict.setdefault("article_content", article_contents)
        article_content_dict.setdefault("article_content_size",
                                        len(article_contents))
    except Exception as ex:
        logging.error(ex)
        traceback.print_exc()
    return article_content_dict
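# Usage sketch for the extractor above: build an lxml tree from downloaded
# markup and pass it in. `processor` stands for an instance of the page
# processor this method belongs to; the driver function itself is
# hypothetical and only for illustration.
def _example_fetch_content(processor, page_source):
    from lxml import etree
    tree = etree.HTML(page_source)
    result = processor.fetchArticleContent(tree)
    print(result.get("article_content_size"))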
def fetchArticleHeader(self, html):
    """Fetch the article header info (title, publish time, source, comment count)."""
    article_header_dict = {}
    try:
        article_header_elements = html.xpath(
            "//div[@class='article-header clearfix']")
        if article_header_elements:
            article_header_element = article_header_elements[0]
            # Article title. Note the leading "." so the XPath is evaluated
            # relative to the header element rather than the document root.
            article_title = getElement(article_header_element, ".//h1/text()")
            article_header_dict.setdefault("article_title", article_title)
            # Publish time.
            source_time = getElement(
                article_header_element,
                ".//p[@class='source-time']/span[1]/text()")
            article_header_dict.setdefault("source_time", source_time)
            # News source.
            art_source = getElement(
                article_header_element,
                ".//p[@class='source-time']/span[2]/span[@id='art_source']/text()")
            article_header_dict.setdefault("art_source", art_source)
            # Comment count.
            mcom_num = getElement(
                article_header_element,
                ".//p[@class='source-time']/span[3]/a/b/@data-comment")
            article_header_dict.setdefault("mcom_num", mcom_num)
        else:
            # Fall back to the older page layout.
            article_title = html.xpath(
                "//div[@class='newsContent']/h2[@class='news_tit']/text()")
            if article_title:
                article_header_dict.setdefault("article_title",
                                               article_title[0])
            source_time = html.xpath(
                "//div[@class='newsContent']/div[@class='artInfo']/text()")
            if source_time:
                article_header_dict.setdefault("source_time", source_time[0])
            art_source = html.xpath(
                "//div[@class='newsContent']/div[@class='artInfo']/a/text()")
            if art_source:
                article_header_dict.setdefault("art_source", art_source[0])
            mcom_num = html.xpath("//em[@id='plcount']/text()")
            if mcom_num:
                article_header_dict.setdefault("mcom_num", mcom_num[0])
    except Exception as ex:
        logging.error(ex)
        traceback.print_exc()
    return article_header_dict
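# `getElement` is called throughout this section but not defined here. A
# minimal sketch of what it presumably does, inferred from its call sites
# (an element plus an XPath, yielding a single value); treat the body as
# an assumption, not the project's actual helper.
def getElement(element, xpath_expression):
    # Return the first node matched by the XPath, or None when nothing matches.
    results = element.xpath(xpath_expression)
    return results[0] if results else None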
def crawler_video(url, outpath, thread_count=10, duplicateRemover="bloom"):
    """Crawl video media from a site.

    url              address of the media listing to crawl
    outpath          output directory
    thread_count     number of crawler threads
    duplicateRemover deduplication strategy, defaults to "bloom"
    """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return
    Crawler(url, VedioPageProcess()) \
        .set_thread(thread_count) \
        .set_storage(MediaStorage(outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
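# Usage sketch: crawl all videos reachable from a listing page into a local
# directory. The url and output path are placeholders, not tested endpoints.
def _example_crawl_videos():
    crawler_video("http://video.example.com/list", "/tmp/videos",
                  thread_count=5)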
def download(url):
    """Download a page and return the decoded HTML content, or None on failure."""
    try:
        content = urllib2.urlopen(url, timeout=5).read()
        # Sniff the charset from the <meta ... Content-Type ... charset=...> tag.
        content_type_start_index = content.find("Content-Type")
        if content_type_start_index > -1:
            charset_start_index = content.find("charset=",
                                               content_type_start_index)
            content_type_end_index = content.find(">",
                                                  content_type_start_index)
            if -1 < charset_start_index < content_type_end_index:
                encoding = content[charset_start_index + len("charset="):
                                   content_type_end_index].strip("\"' /")
                # GB2312 pages often contain GBK characters; decode with
                # the superset codec.
                if encoding.upper() == "GB2312":
                    encoding = "gbk"
                content = unicode(content, encoding=encoding)
        return content
    except Exception as ex:
        logging.error(ex)
        return None
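# Usage sketch for download(): fetch a page and hand the decoded markup to
# lxml. The url is a placeholder and the driver function is hypothetical.
def _example_download():
    from lxml import etree
    page = download("http://hb.sina.com.cn/news/")
    if page is not None:
        tree = etree.HTML(page)
        print(tree.xpath("//title/text()"))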
def get_url(self):
    retry_counter = 0
    try:
        self.scheduler_lock.acquire()
        while True:
            if len(self.urls) == 0:
                # Wait briefly for more urls to arrive; note the scheduler
                # lock is held while sleeping. Give up after three retries.
                if retry_counter < 3:
                    time.sleep(1)
                    retry_counter += 1
                else:
                    raise TypeError("no more page urls")
            else:
                page_url = self.urls.pop()
                logging.info(page_url)
                return page_url
    except Exception as ex:
        logging.error(ex)
    finally:
        self.scheduler_lock.release()
    return None
def crawler_images(url, outpath, thread_count=10, storage="json",
                   duplicateRemover="bloom"):
    """Crawl image resources from a site.

    url              address of the image listing to crawl
    outpath          output directory
    thread_count     number of crawler threads
    storage          accepted for API symmetry; images are always written
                     through MediaStorage
    duplicateRemover deduplication strategy, defaults to "bloom"
    """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return
    Crawler(url, ImagePageProcess()) \
        .set_thread(thread_count) \
        .set_storage(MediaStorage(outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
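# Usage sketch: crawl images into a local directory. The url and output path
# are placeholders; note that images are written via MediaStorage regardless
# of the `storage` argument.
def _example_crawl_images():
    crawler_images("http://photo.example.com/list", "/tmp/images",
                   thread_count=5)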
def crawler_nsfocus_loophole(outpath, thread_count=10, storage="json",
                             duplicateRemover="bloom"):
    """Crawl NSFOCUS vulnerability bulletins
    from http://www.nsfocus.net/index.php?act=sec_bug.

    outpath          output directory
    thread_count     number of crawler threads
    storage          storage format, defaults to "json"
    duplicateRemover deduplication strategy, defaults to "bloom"
    """
    if not outpath:
        logging.error("outpath[%s] is empty!" % outpath)
        return
    Crawler("http://www.nsfocus.net/index.php?act=sec_bug",
            NsfocusLoopholePageProcess(),
            filter_url=["http://www.nsfocus.net/index.php?act=sec_bug",
                        "http://www.nsfocus.net/vulndb"]) \
        .set_thread(thread_count) \
        .set_storage(__get_storage__(storage, outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
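# Usage sketch: dump NSFOCUS vulnerability records as JSON files. Only the
# output directory is required; the entry url is hard-coded above. The
# output path is a placeholder.
def _example_crawl_nsfocus():
    crawler_nsfocus_loophole("/tmp/nsfocus", thread_count=5, storage="json")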
def storage(self, json_data):
    if not json_data:
        return
    # The full-file check runs inside the lock so that rotation never
    # produces an empty file.
    try:
        self.storage_lock.acquire()
        if self.current_counter >= self.file_counter:
            # Rotate: close the full file and open a new one named by the
            # current millisecond timestamp.
            self.json_file.close()
            self.json_file = open(
                os.path.join(self.json_path,
                             str(int(time.time() * 1000)) + ".json"), "w")
            self.current_counter = 0
        article = json.dumps(json_data, ensure_ascii=False)
        logging.info(article)
        self.json_file.write(article + "\n")
        self.current_counter += 1
    except Exception as ex:
        traceback.print_exc()
        logging.error(json_data)
    finally:
        self.storage_lock.release()
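# Illustration of the rotation above: each call appends one JSON line, and
# once `current_counter` reaches `file_counter` the writer switches to a new
# timestamp-named file. `json_storage` stands for an instance of the storage
# class this method belongs to; the driver is hypothetical.
def _example_json_storage(json_storage):
    for i in range(3):
        json_storage.storage({"article_title": "demo %d" % i})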
def fetchArticleMeta(self, html):
    """Fetch the page's <meta> data."""
    article_meta_dict = {}
    try:
        # Open Graph message type.
        meta_og_type = getElement(html, "//meta[@property='og:type']/@content")
        article_meta_dict.setdefault("meta_og_type", meta_og_type)
        # Open Graph title.
        meta_og_title = getElement(
            html, "//meta[@property='og:title']/@content")
        article_meta_dict.setdefault("meta_og_title", meta_og_title)
        # Open Graph description.
        meta_og_description = getElement(
            html, "//meta[@property='og:description']/@content")
        article_meta_dict.setdefault("meta_og_description",
                                     meta_og_description)
        # Open Graph url.
        meta_og_url = getElement(html, "//meta[@property='og:url']/@content")
        article_meta_dict.setdefault("meta_og_url", meta_og_url)
        # Article creation time (Weibo extension).
        meta_og_create = getElement(
            html, "//meta[@property='weibo: article:create_at']/@content")
        article_meta_dict.setdefault("meta_og_create", meta_og_create)
        # Keywords.
        meta_keywords = getElement(html, "//meta[@name='Keywords']/@content")
        article_meta_dict.setdefault("meta_keywords", meta_keywords)
        # Description.
        meta_description = getElement(
            html, "//meta[@name='Description']/@content")
        article_meta_dict.setdefault("meta_description", meta_description)
        # Tags.
        meta_tags = getElement(html, "//meta[@name='tags']/@content")
        article_meta_dict.setdefault("meta_tags", meta_tags)
    except Exception as ex:
        logging.error(ex)
        traceback.print_exc()
    return article_meta_dict
def storage(self, field_dict):
    if not field_dict:
        return
    for field_name in field_dict:
        field_values = field_dict.get(field_name)
        if not isinstance(field_values, list):
            field_values = [field_values]
        for field_value in field_values:
            try:
                logging.info(field_value)
                # Download the media content.
                content = urllib2.urlopen(field_value).read()
                # Write it to a file named after the last path segment
                # of the url.
                with open(os.path.join(self.out_path,
                                       os.path.split(field_value)[1]),
                          "wb") as media_file:
                    media_file.write(content)
            except Exception as ex:
                logging.error(ex)
def crawler_news(url, outpath, thread_count=10, storage="json",
                 duplicateRemover="bloom"):
    """Crawl Sina Hubei news data.

    url              news site address, e.g. http://hb.sina.com.cn/news/
    outpath          output directory
    thread_count     number of crawler threads
    storage          storage format, defaults to "json"
    duplicateRemover deduplication strategy, defaults to "bloom"
    """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return
    Crawler(url, SinaNewsPageProcessor()) \
        .set_thread(thread_count) \
        .set_storage(__get_storage__(storage, outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
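# Usage sketch: crawl Sina Hubei news into JSON files. The output directory
# is a placeholder; the entry url comes from the docstring above.
def _example_crawl_news():
    crawler_news("http://hb.sina.com.cn/news/", "/tmp/sina_news",
                 thread_count=10, storage="json", duplicateRemover="bloom")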