def newspaper_fulltext2(parser, language, url):
    '''
    This is a faster version of the function that uses internal newspaper3k
    functions so that the lxml parse tree doesn't need to be recreated.
    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter
    # note: `url` is not used in this version
    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = ''
        html = ''
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
def newspaper_fulltext(parser, language):
    '''
    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter
    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
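# A minimal usage sketch for the two helpers above, assuming `parser` is an
# already-parsed lxml document such as the tree produced by newspaper3k's own
# configured parser; `sample_html` is a stand-in for a real downloaded page.
from newspaper.configuration import Configuration

sample_html = '<html><body><p>%s</p></body></html>' % (
    'Plenty of article text with common stopwords in it. ' * 40)
doc = Configuration().get_parser().fromstring(sample_html)
result = newspaper_fulltext(doc, 'en')
print(result['pattern'], len(result['value']['text'] or ''))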
def fulltext(html, language='en'):
    """Takes an article HTML string as input and returns the full text.

    The input string is decoded via UnicodeDammit if needed.
    """
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    # WithTagOutputFormatter is a custom OutputFormatter subclass defined
    # elsewhere in this project, not part of newspaper3k itself.
    output_formatter = WithTagOutputFormatter(config)
    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    # note: no None check here; pages with too little body text will raise
    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
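# Usage sketch for fulltext(), assuming `page_html` holds the HTML of a real
# article page (with too little text, calculate_best_node returns None and the
# unguarded post_cleanup call above will fail). The file name below is a
# hypothetical placeholder.
with open('article.html') as f:  # hypothetical saved article page
    page_html = f.read()
text, article_html = fulltext(page_html, language='en')
print(text[:200])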
def get_data_from_html(html):
    import datetime
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter
    from newspaper.parsers import Parser

    result = {}
    parsed_html = Parser.fromstring(html)
    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)
    result['title'] = extractor.get_title(parsed_html)
    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        # fall back to "now" when no publication date is found
        publishing_date = datetime.datetime.now()
    result['published_at'] = publishing_date.isoformat()
    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)
    top_node = extractor.post_cleanup(top_node)
    result['content'], _ = formatter.get_formatted(top_node)
    return result
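# Usage sketch for get_data_from_html(), under the same assumption that the
# input is a full article page. The returned dict carries 'title',
# 'published_at' (ISO 8601, defaulting to the current time when no date is
# detected) and the extracted 'content'.
data = get_data_from_html(open('article.html').read())  # hypothetical saved page
print(data['title'], data['published_at'], len(data['content']))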
def modified_fulltext(parser, language, url):
    '''
    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser. The rm_ads, clean,
    calculate_best_node, post_cleanup and get_formatted helpers are
    module-level re-implementations defined elsewhere in this file; the
    original newspaper3k calls are kept alongside as comments.
    '''
    from urllib.parse import urlparse
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter
    url_parsed = urlparse(url)
    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = rm_ads(doc, url_parsed.hostname)
    doc = clean(document_cleaner, doc)
    #doc = document_cleaner.clean(doc)
    doc = calculate_best_node(extractor, doc)
    #doc = extractor.calculate_best_node(doc)
    if doc is not None:
        #doc = extractor.add_siblings(doc)
        doc = post_cleanup(doc)
        #doc = extractor.post_cleanup(doc)
        text, html = get_formatted(doc)
        #text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
class ArticleExtractionPipeline(object):
    def __init__(self):
        self.config = Configuration()  # sets meta config for article and parser
        self.parser = self.config.get_parser()  # parser
        self.extractor = ContentExtractor(self.config)  # extracts info (author, tags, text, etc.) from parsed article
        self.doc_cleaner = DocumentCleaner(self.config)  # cleans unwanted tags and nodes from DOM
        self.formatter = OutputFormatter(self.config)  # outputs formatted text from parsed xpath nodes

    # right now basically only works for RT
    # params: doc is parsed html from self.parser
    def find_date_from_html(self, doc):
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            for indicator in ["Edited", "Updated", "Published"]:
                if indicator in time_string:
                    # indicator probably followed by "at" or ":", actual time is after that
                    if "at" in time_string:
                        time_string = time_string.split("at", 1)[1]
                    elif ":" in time_string:
                        time_string = time_string.split(":", 1)[1]
                    break
            time = self.datetime_from_str(time_string)
            if time:
                times.append(time)
        if times:
            return min(times)
        else:
            return None

    def datetime_from_str(self, datetime_string):
        try:
            # strip tzinfo, otherwise we can't compare naive and (timezone) offset-aware times
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # params: doc is parsed html from self.parser
    # TODO: generalize
    def get_date(self, url, doc):
        raw_date = (
            self.extractor.get_publishing_date(url, doc) or  # telesur, africanews
            self.extractor.get_meta_content(doc, "meta[name='LastModifiedDate']") or  # aljazeera, Sun, 07 January 2018 18:36:49 GMT
            self.extractor.get_meta_content(doc, "meta[name='Last-Modified']") or  # times of india, Jan 9, 2018, 05:18 IST
            self.extractor.get_meta_content(doc, "meta[property='og:updated_time']")  # diplomat, "2018-01-05 23:22:46"
        )
        if raw_date:
            return self.datetime_from_str(raw_date)
        else:
            return self.find_date_from_html(doc)

    # params: date is a datetime object
    def recent_article(self, date, max_days_elapsed=3):
        return datetime.datetime.now() - date < datetime.timedelta(days=max_days_elapsed)

    def process_item(self, item, spider):
        doc = self.parser.fromstring(item["content"])
        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (self.extractor.get_meta_content(doc, "meta[name='news_keywords']") or
                            self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)
        # drop item if no date
        if not item["date"] or not self.recent_article(
                item["date"], max_days_elapsed=7):  # or not self.recent_article(item["date"])
            raise DropItem("Missing or invalid date for: {}".format(item["title"]))
        # clean:
        clean_doc = self.doc_cleaner.clean(doc)
        top_node = self.extractor.post_cleanup(
            self.extractor.calculate_best_node(clean_doc))
        item["content"] = self.formatter.get_formatted(top_node)[0]  # [1] returns html of article
        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))
        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item
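# To run ArticleExtractionPipeline in a Scrapy project, register it in the
# project's settings.py; the module path 'mybot.pipelines' below is a
# hypothetical placeholder for wherever the class actually lives.
ITEM_PIPELINES = {
    'mybot.pipelines.ArticleExtractionPipeline': 300,
}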
def parse(self):
    """
    Only change get_publish_date
    """
    # logger.debug("custom parse")
    self.throw_if_not_downloaded_verbose()
    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)
    if self.doc is None:
        # `parse` call failed, return nothing
        return
    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5
    document_cleaner = DocumentCleaner(self.config)
    output_formatter = OutputFormatter(self.config)
    try:
        title, siml, h1 = self.extractor.get_title(self.clean_doc)
        self.set_title(title)
        self.weight = siml
        ltitle = len(title)
        if ltitle >= 28 or ltitle <= 6:
            self.weight += ltitle_weight * 0.05
        elif ltitle <= 11 or ltitle >= 22:
            self.weight += ltitle_weight * 0.45
        else:
            self.weight += ltitle_weight * 0.6
    except ValueError:
        logger.error("title %s is_news %s h1 %s", title, siml, h1)
    if h1:
        self.h1 = h1[:self.config.MAX_TITLE]
    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)
    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.html, self.clean_doc)
    if self.publish_date is None:
        self.weight += pubtime_weight * 0.01
    elif (self.publish_date.hour == 0 and self.publish_date.minute == 0
            and self.publish_date.second == 0):
        self.weight += pubtime_weight * 0.35
    else:
        self.weight += pubtime_weight * 0.5
    # Deciding whether a page is news from these fields alone is far too
    # unreliable; manually curated whitelists and blacklists would be
    # simpler and higher quality.
    # if self.is_news == False:
    #     self.is_parsed = True
    #     return
    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)
    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)
    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)
    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)
    canonical_link = self.extractor.get_canonical_link(self.url, self.clean_doc)
    self.set_canonical_link(canonical_link)
    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)
    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)
    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)
    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    # dump(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        # Upstream does not gate video extraction; either comment it out
        # entirely or, as done here, gate it behind config.fetch_videos.
        if self.config.fetch_videos:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())
        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)
        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)
    ltext = len(self.text)
    if ltext == 0:
        self.weight += lcontent_weight * -10
    elif ltext <= 20:
        self.weight += lcontent_weight * 0.2
    elif ltext <= 50:
        self.weight += lcontent_weight * 0.3
    elif ltext <= 200:
        self.weight += lcontent_weight * 0.4
    elif ltext <= 800:
        self.weight += lcontent_weight * 0.5
    elif ltext <= 1400:
        self.weight += lcontent_weight * 0.55
    elif ltext <= 2000:
        self.weight += lcontent_weight * 0.6
    else:
        self.weight += lcontent_weight * 0.3
    logger.debug("url:{0}, weight:{1}".format(self.url, self.weight))
    self.is_news = self.weight > 0.45
    if self.config.fetch_images:
        self.fetch_images()
    self.is_parsed = True
    self.release_resources()
def parse(self):
    if not self.is_downloaded:
        raise ArticleException(self)
    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)
    if self.doc is None:
        # `parse` call failed, return nothing
        return
    # TODO(hieulq): Fix this, sync in our fix_url() method
    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    if CONF.news_detector.language not in meta_lang[0]:
        return
    if self.config.use_meta_language:
        self.extractor.update_language(meta_lang[0])
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5
    document_cleaner = DocumentCleaner(self.config)
    title = self.extractor.get_title(self.clean_doc)
    self.set_title(title)
    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)
    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)
    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)
    canonical_link = self.extractor.get_canonical_link(self.clean_doc)
    self.set_canonical_link(canonical_link)
    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)
    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)
    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)
    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.clean_doc)
    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        video_extractor = VideoExtractor(self.config, self.top_node)
        self.set_movies(video_extractor.get_videos())
        # NOTE: this passes the bound `xpath` method itself rather than the
        # node's text; most likely a bug in the original source, kept as-is.
        self.set_text(self.top_node.xpath)
    self.is_parsed = True
    self.release_resources()
def modified_fulltext(parser, language):
    '''
    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser. The nested
    calculate_best_node is a copy of ContentExtractor.calculate_best_node
    with the node selection replaced by a precompiled XPath; it is called as
    a plain function with the extractor passed explicitly as `self`.
    '''
    import lxml.etree
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        #nodes_to_check = self.nodes_to_check(doc)
        starting_boost = float(1.0)
        #cnt = 0
        #i = 0
        parent_nodes = []
        nodes_with_text = []
        #for node in nodes_to_check:
        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.language).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)
        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25
        #for node in nodes_with_text:
        for i, node in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                #if cnt >= 0:
                if i >= 0:
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.language).get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)
            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)
            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)
            # Parent of parent node
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            #cnt += 1
            #i += 1
        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)
            if score > top_node_score:
                top_node = e
                top_node_score = score
            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    #doc = document_cleaner.clean(doc)
    top_node = calculate_best_node(extractor, doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
def parse(self, clean_doc=True):
    """
    Extend the original newspaper3k Article parser.

    :param clean_doc: Controls whether to use the original DocumentCleaner
        or the modified one. On some sources the original prevents the text
        from being parsed (special occasions only; don't parse with it there).
        However, it should almost always be used, otherwise bad elements
        might slip through.
    :return:
    """
    self.throw_if_not_downloaded_verbose()
    self.doc = self.config.get_parser().fromstring(self.html)
    self.clean_doc = copy.deepcopy(self.doc)
    if self.doc is None:
        # `parse` call failed, return nothing
        return
    # TODO: Fix this, sync in our fix_url() method
    parse_candidate = self.get_parse_candidate()
    self.link_hash = parse_candidate.link_hash  # MD5
    output_formatter = OutputFormatter(self.config)
    title = self.extractor.get_title(self.clean_doc)
    self.set_title(title)
    authors = self.extractor.get_authors(self.clean_doc)
    self.set_authors(authors)
    meta_lang = self.extractor.get_meta_lang(self.clean_doc)
    self.set_meta_language(meta_lang)
    if self.config.use_meta_language:
        self.extractor.update_language(self.meta_lang)
        output_formatter.update_language(self.meta_lang)
    meta_favicon = self.extractor.get_favicon(self.clean_doc)
    self.set_meta_favicon(meta_favicon)
    meta_description = \
        self.extractor.get_meta_description(self.clean_doc)
    self.set_meta_description(meta_description)
    canonical_link = self.extractor.get_canonical_link(
        self.url, self.clean_doc)
    self.set_canonical_link(canonical_link)
    tags = self.extractor.extract_tags(self.clean_doc)
    self.set_tags(tags)
    meta_keywords = self.extractor.get_meta_keywords(self.clean_doc)
    self.set_meta_keywords(meta_keywords)
    meta_data = self.extractor.get_meta_data(self.clean_doc)
    self.set_meta_data(meta_data)
    self.publish_date = self.extractor.get_publishing_date(
        self.url, self.clean_doc)
    if clean_doc:
        document_cleaner = DocumentCleaner(self.config)
    else:
        # Use the extended cleaner that does not remove certain DOM elements
        document_cleaner = Cleaner(self.config)
    # Before any computations on the body, clean DOM object
    self.doc = document_cleaner.clean(self.doc)
    self.top_node = self.extractor.calculate_best_node(self.doc)
    if self.top_node is not None:
        video_extractor = VideoExtractor(self.config, self.top_node)
        self.set_movies(video_extractor.get_videos())
        self.top_node = self.extractor.post_cleanup(self.top_node)
        self.clean_top_node = copy.deepcopy(self.top_node)
        text, article_html = output_formatter.get_formatted(self.top_node)
        self.set_article_html(article_html)
        self.set_text(text)
    self.fetch_images()
    self.is_parsed = True
    self.release_resources()
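# Usage sketch: this extended parse() is a drop-in replacement for
# newspaper.Article.parse, so the usual download-then-parse flow applies.
# `ExtendedArticle` is a hypothetical Article subclass carrying the method above.
article = ExtendedArticle('https://example.com/some-article')  # hypothetical URL
article.download()
article.parse()  # clean_doc=False switches to the extended Cleaner for problem sources
print(article.title, len(article.text))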