Exemplo n.º 1
0
def get_data_news(exportCSV=True):
    """Scrape every configured newspaper and optionally export the results.

    For each entry of ``config["newspaperlist"]`` the article links are
    fetched, turned into article URLs, and each URL is resolved to a details
    dict. Every non-empty details dict is kept and a stats entry is derived
    from it via ``get_news_stats``.

    Args:
        exportCSV: When true, write one ``News <name> at <timestamp>.csv``
            and one ``Stats <name> at <timestamp>.csv`` file per newspaper
            (relative paths, UTF-8 with BOM, no index column). Rows
            containing missing values are dropped from the exported tables.

    Returns:
        A ``(arraynews, arraynews_stats)`` tuple with the details dicts and
        stats entries collected for the last newspaper that was processed,
        or two empty lists when no newspaper is configured.
    """
    config = get_config()

    # Bound before the loop so an empty newspaper list yields two empty lists
    # instead of raising UnboundLocalError at the return statement.
    arraynews = []
    arraynews_stats = []

    for newspaper in config["newspaperlist"]:
        newspaper_dict = config["newspaperlist"][newspaper]
        details_dict = newspaper_dict["XPATH_news_details"]

        links = get_links(newspaper_dict)
        news_urls = create_urlnews(newspaper_dict, links)

        # NOTE(review): the lists are reset per newspaper, so the value
        # returned below only reflects the last newspaper. Kept as-is for
        # backward compatibility -- confirm whether callers expect the
        # results of all newspapers combined.
        arraynews = []
        arraynews_stats = []

        for news_url in news_urls:
            news_details_dict = get_details_dict(details_dict, news_url)
            if news_details_dict:
                arraynews.append(news_details_dict)
                arraynews_stats.append(get_news_stats(news_details_dict))

        dfnews = pd.DataFrame(arraynews)
        dfnews.dropna(inplace=True)

        dfstats = pd.DataFrame(arraynews_stats)
        dfstats.dropna(inplace=True)

        if exportCSV:
            name = newspaper_dict['name']
            # Take the timestamp once so the News/Stats pair of a newspaper
            # always shares the same suffix, even across a second boundary.
            stamp = datetime.now().strftime(' %Y, %m, %d %H-%M-%S')
            dfnews.to_csv(
                f"News {name} at {stamp}.csv",
                encoding='utf-8-sig',
                index=False)
            dfstats.to_csv(
                f"Stats {name} at {stamp}.csv",
                encoding='utf-8-sig',
                index=False)

    return arraynews, arraynews_stats
Exemplo n.º 2
0
def test_eOnline_regex():
    """The eOnline link regex must match at least one story on its page."""
    browser = make_browser()
    site = sites['eOnline']
    # Fetch the page at the configured URL and collect the matching links.
    stories = scraper.get_links(browser, site['url'], site['link_regex'])
    story_count = len(stories)
    assert story_count > 0
Exemplo n.º 3
0
def test_nyTimes_regex():
    """The nyTimes link regex must match at least one story on its page."""
    browser = make_browser()
    site = sites['nyTimes']
    # Fetch the page at the configured URL and collect the matching links.
    stories = scraper.get_links(browser, site['url'], site['link_regex'])
    story_count = len(stories)
    assert story_count > 0
Exemplo n.º 4
0
def test_guardian_regex():
    """The guardian link regex must match at least one story on its page."""
    browser = make_browser()
    site = sites['guardian']
    # Fetch the page at the configured URL and collect the matching links.
    stories = scraper.get_links(browser, site['url'], site['link_regex'])
    story_count = len(stories)
    assert story_count > 0
Exemplo n.º 5
0
def test_nola_regex():
    """The nola link regex must match at least one story on its page."""
    browser = make_browser()
    site = sites['nola']
    # Fetch the page at the configured URL and collect the matching links.
    stories = scraper.get_links(browser, site['url'], site['link_regex'])
    story_count = len(stories)
    assert story_count > 0