Exemplo n.º 1
0
def scrape(site_name, *, fetch_rss=_fetch, fetch_html=_fetch):
    """Yield one ``NewsFlash`` per ``<item>`` of the RSS feed of *site_name*.

    The feed URL and the per-item parser are read from
    ``sites_config[site_name]`` (keys ``"rss"`` and ``"parser"``).  This is a
    generator, so nothing is fetched or parsed until iteration starts.

    Args:
        site_name: Key into ``sites_config``; also stored as ``source`` on
            every yielded item.
        fetch_rss: Callable that receives the feed URL and returns the feed
            text.
        fetch_html: Callable that receives an item's link and returns the
            markup of that page.

    Yields:
        A ``NewsFlash`` for each feed item, carrying the item's link, its
        parsed publication date, and the author/title/description returned by
        the configured parser.  ``accident`` is always ``False`` here.

    Raises:
        AssertionError: If the feed contains no ``<item>`` elements.
    """
    config = sites_config[site_name]
    rss_text = fetch_rss(config["rss"])

    # Patch RSS issue in walla: expose each item's <link> as <guid>, which is
    # the element read below.  This might create a duplicate `guid` field.
    # Only the tag markup is renamed, so the word "link" inside URLs, titles
    # and descriptions is left untouched.
    # NOTE(review): presumably needed because the lxml HTML parser treats
    # <link> as an empty element - confirm.
    rss_text = rss_text.replace("<link>", "<guid>")
    rss_text = rss_text.replace("</link>", "</guid>")

    rss_soup = BeautifulSoup(rss_text, features="lxml")
    rss_soup_items = rss_soup.find_all("item")

    if not rss_soup_items:
        # Raised explicitly (rather than via `assert`) so that the check is
        # not stripped when Python runs with -O.
        raise AssertionError(
            "no <item> elements found in the RSS feed of {!r}".format(site_name)
        )

    for item_rss_soup in rss_soup_items:
        link = item_rss_soup.guid.get_text()
        date = timezones.parse_creation_datetime(
            item_rss_soup.pubdate.get_text())

        html_text = fetch_html(link)
        item_html_soup = BeautifulSoup(html_text, "lxml")

        # The site-specific parser gets both the RSS item and the full page.
        author, title, description = config["parser"](item_rss_soup,
                                                      item_html_soup)
        yield NewsFlash(
            link=link,
            date=date,
            source=site_name,
            author=author,
            title=title,
            description=description,
            accident=False,
        )
Exemplo n.º 2
0
def parse_tweet(tweet, screen_name):
    """Build a ``NewsFlash`` from a single tweet mapping.

    Args:
        tweet: Mapping that provides the ``"id_str"``, ``"created_at"`` and
            ``"full_text"`` entries of the tweet.
        screen_name: Account name; used in the status URL and looked up in
            ``to_hebrew`` to obtain the author.

    Returns:
        A ``NewsFlash`` with ``source="twitter"`` and ``accident=False`` whose
        title and description are both the tweet's full text.
    """
    status_id = tweet["id_str"]
    status_url = "https://twitter.com/{}/status/{}".format(screen_name, status_id)
    created = timezones.parse_creation_datetime(tweet["created_at"])
    author_name = to_hebrew[screen_name]
    # The tweet body doubles as both title and description.
    body = tweet["full_text"]

    return NewsFlash(
        link=status_url,
        date=created,
        source="twitter",
        author=author_name,
        title=body,
        description=body,
        tweet_id=int(status_id),
        accident=False,
    )
Exemplo n.º 3
0
def test_timeparse():
    """Check that three spellings of one instant parse to equal values.

    The inputs are the same moment written with a ``+0000`` offset, a
    ``+0300`` offset and a ``GMT`` suffix.
    """
    parse = timezones.parse_creation_datetime

    twitter_stamp = parse("Sun May 31 08:26:18 +0000 2020")
    ynet_stamp = parse("Sun, 31 May 2020 11:26:18 +0300")
    walla_stamp = parse("Sun, 31 May 2020 08:26:18 GMT")

    assert twitter_stamp == ynet_stamp
    assert ynet_stamp == walla_stamp