Example #1
    def fetchArticleContent(self, html):
        # Extract the article body text
        article_content_dict = {}
        try:
            article_content_elements = html.xpath(
                "//div[@class='article-body main-body']/p")
            if not article_content_elements:
                article_content_elements = html.xpath(
                    "//div[@class='newsContent']/span[@id='newsCon']/p")
            if not article_content_elements:
                article_content_elements = html.xpath(
                    "//div[@class='main-body']/p")
            if not article_content_elements:
                article_content_elements = html.xpath(
                    "//div[@id='artibody']/p")

            article_contents = ""
            for article_content_element in article_content_elements:
                article_content = etree.tostring(article_content_element,
                                                 encoding="UTF-8",
                                                 pretty_print=False,
                                                 method="html")
                article_contents = article_contents + article_content
            article_content_dict.setdefault("article_content",
                                            article_contents)
            article_content_dict.setdefault("article_content_size",
                                            len(article_contents))
        except Exception as ex:
            logging.error(ex)
            traceback.print_exc()
        return article_content_dict
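
A minimal driver sketch for the extractor above. It assumes the method lives on the SinaNewsPageProcessor class referenced in Example #11 and that the page HTML comes from the download() helper in Example #4; the article URL is a placeholder, not taken from the original source.

from lxml import etree

processor = SinaNewsPageProcessor()  # assumption: processor class from Example #11
page = download("http://hb.sina.com.cn/news/some-article.html")  # hypothetical URL
if page:
    tree = etree.HTML(page)
    content = processor.fetchArticleContent(tree)
    print(content.get("article_content_size"))
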
Example #2
 def fetchArticleHeader(self, html):
     """获取文章头信息"""
     article_header_dict = {}
     try:
         article_header_elements = html.xpath(
             "//div[@class='article-header clearfix']")
         if article_header_elements:
             article_header_element = article_header_elements[0]
             # Article title
             article_title = getElement(article_header_element,
                                        "//h1/text()")
             article_header_dict.setdefault("article_title", article_title)
             # Publication time
             source_time = getElement(
                 article_header_element,
                 "//p[@class='source-time']/span[1]/text()")
             article_header_dict.setdefault("source_time", source_time)
             # News source
             art_source = getElement(
                 article_header_element,
                 "//p[@class='source-time']/span[2]/span[@id='art_source']/text()"
             )
             article_header_dict.setdefault("art_source", art_source)
             # Comment count
             mcom_num = getElement(
                 article_header_element,
                 "//p[@class='source-time']/span[3]/a/b/@data-comment")
             article_header_dict.setdefault("mcom_num", mcom_num)
         else:
             article_title = html.xpath(
                 "//div[@class='newsContent']/h2[@class='news_tit']/text()")
             if article_title:
                 article_header_dict.setdefault("article_title",
                                                article_title[0])
             source_time = html.xpath(
                 "//div[@class='newsContent']/div[@class='artInfo']/text()")
             if source_time:
                 article_header_dict.setdefault("source_time",
                                                source_time[0])
             art_source = html.xpath(
                 "//div[@class='newsContent']/div[@class='artInfo']/a/text()"
             )
             if art_source:
                 article_header_dict.setdefault("art_source", art_source[0])
             mcom_num = html.xpath("//em[@id='plcount']/text()")
             if mcom_num:
                 article_header_dict.setdefault("mcom_num", mcom_num[0])
     except Exception as ex:
         logging.error(ex)
         traceback.print_exc()
     return article_header_dict
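
getElement is called throughout these snippets but is not part of the listing. A minimal reconstruction of what it presumably does (return the first XPath match, or None when nothing matches) is shown below; this is an assumption about its behavior, not the project's actual helper.

def getElement(element, xpath_expression):
    # Hypothetical helper: first XPath hit, or None if the expression matches nothing
    results = element.xpath(xpath_expression)
    return results[0] if results else None
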
Example #3
def crawler_video(url, outpath, thread_count=10, duplicateRemover="bloom"):
    """爬取多媒体视频
        url
            爬取的多媒体地址
        outpath
            输出文件目录
        thread_count
            爬虫线程数量
        duplicateRemover
            去除重复的方法,默认为bloom
        """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return
    Crawler(url, VedioPageProcess()) \
        .set_thread(thread_count) \
        .set_storage(MediaStorage(outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
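
A sketch of how this entry point might be called; the media URL and output directory are placeholders, while Crawler, VedioPageProcess and MediaStorage are the project's own classes used above.

crawler_video("http://example.com/videos/", "/tmp/videos", thread_count=5)
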
Example #4
def download(url):
    """下载html 返回html内容"""
    try:
        content = str(urllib2.urlopen(url, timeout=5).read())
        content_type_start_index = content.find("Content-Type")
        if content_type_start_index > -1:
            charset_start_index = content.find("charset=",
                                               content_type_start_index)
            content_type_end_index = content.find(">",
                                                  content_type_start_index)
            encoding = content[charset_start_index +
                               len("charset="):content_type_end_index - 3]
            if str(encoding).upper() == "GB2312":
                encoding = "gbk"
            content = unicode(content, encoding=encoding)
        return content
    except Exception as ex:
        logging.error(ex)
    return None
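
The slicing above is tuned to Sina's markup, where the charset declaration ends with a quote, a space and a slash before the closing angle bracket, so subtracting 3 from the tag-end index trims exactly those three characters. A small illustration on a hypothetical meta tag:

sample = '<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />'
start = sample.find("charset=") + len("charset=")
end = sample.find(">", sample.find("Content-Type")) - 3  # drop the trailing quote, space and slash
print(sample[start:end])  # gb2312
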
Example #5
 def get_url(self):
     retry_counter = 0
     try:
         self.scheduler_lock.acquire()
         while True:
             if len(self.urls) == 0:
                 if retry_counter < 3:
                     time.sleep(1)
                     retry_counter += 1
                 else:
                     raise TypeError("页面无数据")
             else:
                 page_url = self.urls.pop()
                 logging.info(page_url)
                 return page_url
     except Exception as ex:
         logging.error(ex)
     finally:
         self.scheduler_lock.release()
     return None
Example #6
def crawler_images(url, outpath, thread_count=10, storage="json", duplicateRemover="bloom"):
    """爬取网站图片资源
        url
            爬取的图片地址
        outpath
            输出文件目录
        thread_count
            爬虫线程数量
        duplicateRemover
            去除重复的方法,默认为bloom
        """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return

    Crawler(url, ImagePageProcess()) \
        .set_thread(thread_count) \
        .set_storage(MediaStorage(outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
Example #7
def crawler_nsfocus_loophole(outpath, thread_count=10, storage="json", duplicateRemover="bloom"):
    """绿盟漏洞爬取 http://www.nsfocus.net/index.php?act=sec_bug
        outpath
            输出文件目录
        thread_count
            爬虫线程数量
        storage
           存储方式 默认json
        duplicateRemover
            去除重复的方法,默认为bloom
        """
    if not outpath:
        logging.error("outpath[%s] is empty!" % outpath)
        return
    Crawler("http://www.nsfocus.net/index.php?act=sec_bug",
            NsfocusLoopholePageProcess(),
            filter_url=["http://www.nsfocus.net/index.php?act=sec_bug", "http://www.nsfocus.net/vulndb"]) \
        .set_thread(thread_count) \
        .set_storage(__get_storage__(storage, outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
Example #8
    def storage(self, json_data):
        if not json_data:
            return

        # Check whether the current file has reached its record limit; done under the lock to avoid creating empty files
        try:
            self.storage_lock.acquire()
            if self.current_counter >= self.file_counter:
                self.json_file.close()
                self.json_file = open(os.path.join(self.json_path, str(int(time.time() * 1000)) + ".json"), "w")
                self.current_counter = 0

            article = json.dumps(json_data, ensure_ascii=False)
            logging.info(article)
            self.json_file.write(article + "\n")
            self.current_counter += 1
        except Exception as ex:
            traceback.print_exc()
            logging.error(json_data)
        finally:
            self.storage_lock.release()
Example #9
 def fetchArticleMeta(self, html):
     """获取页面元数据"""
     article_meta_dict = {}
     try:
         # Content type (og:type)
         meta_og_type = getElement(html,
                                   "//meta[@property='og:type']/@content")
         article_meta_dict.setdefault("meta_og_type", meta_og_type)
         # Title from the og meta tags
         meta_og_title = getElement(
             html, "//meta[@property='og:title']/@content")
         article_meta_dict.setdefault("meta_og_title", meta_og_title)
         # Description from the og meta tags
         meta_og_description = getElement(
             html, "//meta[@property='og:description']/@content")
         article_meta_dict.setdefault("meta_og_description",
                                      meta_og_description)
         # Page URL from the og meta tags
         meta_og_url = getElement(html,
                                  "//meta[@property='og:url']/@content")
         article_meta_dict.setdefault("meta_og_url", meta_og_url)
         # Article creation time from the weibo meta tag
         meta_og_create = getElement(
             html, "//meta[@property='weibo: article:create_at']/@content")
         article_meta_dict.setdefault("meta_og_create", meta_og_create)
         # Page keywords
         meta_keywords = getElement(html,
                                    "//meta[@name='Keywords']/@content")
         article_meta_dict.setdefault("meta_keywords", meta_keywords)
         # Page description
         meta_description = getElement(
             html, "//meta[@name='Description']/@content")
         article_meta_dict.setdefault("meta_description", meta_description)
         # Page tags
         meta_tags = getElement(html, "//meta[@name='tags']/@content")
         article_meta_dict.setdefault("meta_tags", meta_tags)
     except Exception as ex:
         logging.error(ex)
         traceback.print_exc()
     return article_meta_dict
Example #10
    def storage(self, field_dict):
        if not field_dict:
            return

        for field_name in field_dict:
            field_values = field_dict.get(field_name)
            if not isinstance(field_values, list):
                field_values = [field_values]
            for field_value in field_values:
                # Use the last URL path segment as the local file name
                try:
                    logging.info(field_value)
                    # Download the resource
                    content = urllib2.urlopen(field_value).read()
                    # Write the downloaded bytes to a local file
                    with open(
                            os.path.join(self.out_path,
                                         os.path.split(field_value)[1]),
                            "wb") as media_file:
                        media_file.write(content)
                except Exception as ex:
                    logging.error(ex)
Example #11
def crawler_news(url, outpath, thread_count=10, storage="json", duplicateRemover="bloom"):
    """爬取新浪湖北新闻数据

    url
        爬取的新闻网站地址,http://hb.sina.com.cn/news/
    outpath
        输出文件目录
    thread_count
        爬虫线程数量
    storage
       存储文件格式,默认为json
    duplicateRemover
        去除重复的方法,默认为bloom
    """
    if not url or not outpath:
        logging.error("url[%s] or outpath[%s] is empty!" % (url, outpath))
        return

    Crawler(url, SinaNewsPageProcessor()) \
        .set_thread(thread_count) \
        .set_storage(__get_storage__(storage, outpath)) \
        .set_duplicate_remover(__get_duplicate_remover__(duplicateRemover)) \
        .run()
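
A hypothetical invocation of the news crawler, with a placeholder output path:

crawler_news("http://hb.sina.com.cn/news/", "/data/sina_news",
             thread_count=20, storage="json", duplicateRemover="bloom")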