def parse(self):
    """Parse the downloaded HTML and score the page as news.

    Extends the stock parse flow by accumulating ``self.weight`` from
    three signals — title length, publish-time precision, and body-text
    length — then deriving ``self.is_news`` from the total.

    Relies on module-level scoring constants ``ltitle_weight``,
    ``pubtime_weight`` and ``lcontent_weight`` (defined elsewhere in
    this file), plus the module-level ``logger``.

    :raises ArticleException: via ``throw_if_not_downloaded_verbose``
        when the article has not been downloaded yet.
    :return: None; results are stored on ``self``.
    """
    self.throw_if_not_downloaded_verbose()
    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)
    if self.doc is None:
        # `parse` call failed, return nothing
        return
    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5
    document_cleaner = DocumentCleaner(self.config)
    output_formatter = OutputFormatter(self.config)

    # Pre-initialize so the except handler (and the `if h1:` check
    # below) are safe: the original referenced these names in the
    # handler even though they are unbound when get_title() itself
    # raises, which would turn a ValueError into a NameError.
    title = siml = h1 = None
    try:
        title, siml, h1 = self.extractor.get_title(self.clean_doc)
        self.set_title(title)
        self.weight = siml
        ltitle = len(title)
        # Score by title length: mid-length titles (12-21 chars) look
        # most news-like; very short or very long ones score lowest.
        if ltitle >= 28 or ltitle <= 6:
            self.weight += ltitle_weight * 0.05
        elif ltitle <= 11 or ltitle >= 22:
            self.weight += ltitle_weight * 0.45
        else:
            self.weight += ltitle_weight * 0.6
    except ValueError:
        logger.error("title %s is_news %s h1 %s", title, siml, h1)
    if h1:
        self.h1 = h1[:self.config.MAX_TITLE]

    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)

    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.html, self.clean_doc)
    # Score publish time: no date is a near-zero signal; an exact
    # midnight timestamp usually means only a bare date was found,
    # so it scores lower than a full timestamp.
    if self.publish_date is None:
        self.weight += pubtime_weight * 0.01
    elif (self.publish_date.hour == 0
            and self.publish_date.minute == 0
            and self.publish_date.second == 0):
        self.weight += pubtime_weight * 0.35
    else:
        self.weight += pubtime_weight * 0.5

    # NOTE(review): deciding "is this news" from these fields alone is
    # unreliable; hand-curated allow/deny lists would be simpler and
    # more precise. Left as-is for now.

    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)
    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)
    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)
    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)
    canonical_link = self.extractor.get_canonical_link(
        self.url, self.clean_doc)
    self.set_canonical_link(canonical_link)
    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)
    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)
    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)

    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        # Unlike upstream, video extraction is gated on config so
        # callers can skip the extra work.
        if self.config.fetch_videos:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())
        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)
        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)

    # Score body length; empty text is a heavy penalty, and very long
    # text scores lower again (likely a listing/aggregation page).
    ltext = len(self.text)
    if ltext == 0:
        self.weight += lcontent_weight * -10
    else:
        for limit, factor in ((20, 0.2), (50, 0.3), (200, 0.4),
                              (800, 0.5), (1400, 0.55), (2000, 0.6)):
            if ltext <= limit:
                self.weight += lcontent_weight * factor
                break
        else:
            self.weight += lcontent_weight * 0.3

    # Lazy %-style args: avoids formatting when DEBUG is disabled.
    logger.debug("url:%s, weight:%s", self.url, self.weight)
    self.is_news = self.weight > 0.45
    if self.config.fetch_images:
        self.fetch_images()
    self.is_parsed = True
    self.release_resources()
def parse(self, clean_doc=True):
    """Extend the original newspaper3k Article parser.

    :param clean_doc: Controls whether to use the original
        ``DocumentCleaner`` or the modified ``Cleaner``. On some
        sources the original prevents the text from being parsed
        (special occasion — don't parse); however it should almost
        always be used, otherwise bad elements might slip through.
    :return: None; results are stored on ``self``.
    :raises ArticleException: via ``throw_if_not_downloaded_verbose``
        when the article has not been downloaded yet.
    """
    self.throw_if_not_downloaded_verbose()
    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)
    if self.doc is None:
        # `parse` call failed, return nothing
        return
    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5
    output_formatter = OutputFormatter(self.config)

    title = self.extractor.get_title(self.clean_doc)
    self.set_title(title)
    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)

    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)
    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)
    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)
    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)
    canonical_link = self.extractor.get_canonical_link(
        self.url, self.clean_doc)
    self.set_canonical_link(canonical_link)
    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)
    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)
    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)
    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.clean_doc)

    # Pick the cleaner: the stock DocumentCleaner by default, or the
    # extended Cleaner that does not remove certain DOM elements.
    # (The original duplicated the clean() call in both branches.)
    cleaner_cls = DocumentCleaner if clean_doc else Cleaner
    document_cleaner = cleaner_cls(self.config)
    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)

    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        video_extractor = VideoExtractor(self.config, self.top_node)
        self.set_movies(video_extractor.get_videos())
        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)
        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)

    self.fetch_images()
    self.is_parsed = True
    self.release_resources()