def from_html(html, url=None, download_date=None):
    """
    Extracts relevant information from an HTML page given as a string.
    This function does not invoke scrapy but only uses the article extractor.
    If you have the original URL make sure to provide it as this helps
    NewsPlease to extract the publishing date and title.
    :param html: the raw HTML document to extract from
    :param url: the original URL of the document (optional, but improves extraction)
    :param download_date: the date the document was downloaded (optional)
    :return: the extracted article as a NewsArticle-style object
    """
    # BUG FIX: 'date_extractor' was missing from this list even though the
    # docstring promises publishing-date extraction; the sibling from_html
    # variants in this file include it, so restore it here for consistency.
    extractor = article_extractor.Extractor(
        ['newspaper_extractor', 'readability_extractor', 'date_extractor',
         'lang_detect_extractor'])

    title_encoded = ''.encode()
    if not url:
        url = ''

    # if an url was given, we can use that as the filename
    filename = urllib.parse.quote_plus(url) + '.json'

    item = NewscrawlerItem()
    item['spider_response'] = DotMap()
    item['spider_response'].body = html
    item['url'] = url
    # source_domain is stored as bytes; fall back to empty bytes when no URL
    item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
    item['html_title'] = title_encoded
    item['rss_title'] = title_encoded
    item['local_path'] = None
    item['filename'] = filename
    item['download_date'] = download_date
    item['modified_date'] = None
    item = extractor.extract(item)

    tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
    final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
    return final_article
def from_html(html, url=None):
    """
    Extracts relevant information from an HTML page given as a string.
    This function does not invoke scrapy but only uses the article extractor.
    :param html:
    :param url:
    :return:
    """
    # Chain of extractors to run over the raw HTML document.
    extractor = article_extractor.Extractor(
        ['newspaper_extractor', 'readability_extractor', 'date_extractor',
         'lang_detect_extractor'])

    if not url:
        url = ''

    empty_title = ''.encode()
    # if an url was given, we can use that as the filename
    filename = urllib.parse.quote_plus(url) + '.json'
    # Stored as bytes; empty bytes when no URL was supplied.
    domain_encoded = urlparse(url).hostname.encode() if url != '' else ''.encode()

    item = NewscrawlerItem()
    item['spider_response'] = DotMap()
    item['spider_response'].body = html
    item['url'] = url
    item['source_domain'] = domain_encoded
    item['html_title'] = empty_title
    item['rss_title'] = empty_title
    item['local_path'] = None
    item['filename'] = filename
    item['download_date'] = None
    item['modified_date'] = None

    item = extractor.extract(item)
    article = ExtractedInformationStorage.extract_relevant_info(item)
    return DotMap(article)
def from_html(html, url=None, download_date=None):
    """
    Extracts relevant information from an HTML page given as a string.
    This function does not invoke scrapy but only uses the article extractor.
    If you have the original URL make sure to provide it as this helps
    NewsPlease to extract the publishing date and title.
    :param html: the raw HTML document to extract from
    :param url: the original URL of the document (optional, but improves extraction)
    :param download_date: the date the document was downloaded (optional)
    :return: the extracted article as a NewsArticle-style object
    """
    extractor = article_extractor.Extractor(
        ['newspaper_extractor', 'readability_extractor', 'date_extractor',
         'lang_detect_extractor'])

    title_encoded = ''.encode()
    if not url:
        url = ''

    # if an url was given, we can use that as the filename
    filename = urllib.parse.quote_plus(url) + '.json'

    item = NewscrawlerItem()
    item['spider_response'] = DotMap()
    item['spider_response'].body = html
    item['url'] = url
    # source_domain is stored as bytes; fall back to empty bytes when no URL
    item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
    item['html_title'] = title_encoded
    item['rss_title'] = title_encoded
    item['local_path'] = None
    item['filename'] = filename
    item['download_date'] = download_date
    item['modified_date'] = None
    item = extractor.extract(item)

    tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
    # Dead commented-out code (a leftover DotMap conversion) removed here.
    final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
    return final_article