def test_convert_to_dict_most_fields_works():
    """Check which requested fields ``extractor.to_dict`` includes in its result.

    An ``Article`` is created from a fake URL and given authors, a top image,
    article HTML, images and meta data. Eleven field names are then requested;
    the result is expected to hold exactly seven of them, while ``keywords``,
    ``summary``, ``tags`` and ``meta_favicon`` are expected to be absent
    (presumably because this test never assigns them -- confirm against
    ``to_dict``).
    """
    fake = Faker()

    article = Article(url=fake.url())
    article.authors = [fake.name(), fake.name()]
    article.top_image = fake.image_url()
    article.article_html = fake.text()
    article.images = [fake.image_url(), fake.image_url()]
    article.meta_data = [fake.city(), fake.state(), fake.country()]

    # (field name, expected to appear in the result), listed in request order.
    expectations = (
        ("article_html", True),
        ("authors", True),
        ("images", True),
        ("keywords", False),
        ("meta_data", True),
        ("source_url", True),
        ("summary", False),
        ("top_image", True),
        ("url", True),
        ("tags", False),
        ("meta_favicon", False),
    )
    requested_fields = [field for field, _ in expectations]

    result = extractor.to_dict(article, *requested_fields)

    assert result
    assert len(result) == 7
    for field, should_be_present in expectations:
        if should_be_present:
            assert field in result
        else:
            assert field not in result
def build_news_article_from_url(source_url, sNLP):
    """Scrape ``source_url`` and wrap the result in a ``NewsArticle``.

    The page is built and NLP-processed through ``Article``; its text and
    article HTML are then overwritten with the output of an ``Extractor``
    run over the downloaded HTML.

    Args:
        source_url: URL of the news page to scrape.
        sNLP: passed through unchanged as the second argument of
            ``NewsArticle``.

    Returns:
        The constructed ``NewsArticle``, or ``None`` if any step raised an
        ``Exception`` (the failure and its reason are printed).
    """
    try:
        print('start to scrape from url: ', source_url)

        # pre-process news by NewsPaper3k and Boilerpipe library
        scraped = Article(source_url, keep_article_html=True)
        scraped.build()
        scraped.nlp()

        # Replace the text/HTML computed above with the extractor's output.
        content_extractor = Extractor(
            extractor='DefaultExtractor',
            html=scraped.html,
        )
        scraped.text = content_extractor.getText()
        scraped.article_html = content_extractor.getHTML()

        wrapped = NewsArticle(scraped, sNLP)
        print('success to scrape from url: ', source_url)
        return wrapped
    except Exception as err:
        # Best-effort boundary: report the problem and signal failure with None.
        print('fail to scrape from url: ', source_url)
        print('reason:', err)
        return None
#parse #title and try: article = Article(url, language='de', keep_article_html=True) article.download() article.parse() filename = feed[3] + ''.join( random.choices(string.ascii_uppercase + string.digits, k=8)) file = open("html/" + filename + ".html", "w") article.article_html = "<meta property='baseurl' content='" + feed[ 4] + "'>" + article.article_html article.article_html = "<script src='https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js'></script>" + article.article_html article.article_html = "<link href='https://fonts.googleapis.com/css?family=Slabo+27px' rel='stylesheet'>" + article.article_html article.article_html = "<img src='" + article.top_image + "' width='100%' >" + article.article_html article.article_html = "<h1 class='realTitle'>" + entrie.title + "</h1>" + article.article_html article.article_html = article.article_html + "<link type='text/css' rel='stylesheet' href='../css/renderStyles.css'/>" article.article_html = article.article_html + "<script src='../js/renderScript.js'></script>" article.article_html = "<div class='pr0news pr0-text text-orange " + feed[ 3] + "'>" + article.article_html + "</div>" file.write(article.article_html) file.close() if article.title != entrie.title: print( str(len(article.title)) + " || " +