def scrape(site_name, *, fetch_rss=_fetch, fetch_html=_fetch):
    """Yield NewsFlash items scraped from the given site's RSS feed.

    Args:
        site_name: key into ``sites_config`` selecting the feed URL and parser.
        fetch_rss: callable returning the RSS feed text for a URL
            (keyword-only; injectable for tests).
        fetch_html: callable returning the HTML text for an item link
            (keyword-only; injectable for tests).

    Yields:
        NewsFlash objects, one per RSS ``<item>``, with ``accident=False``.

    Raises:
        ValueError: if the feed contains no ``<item>`` elements.
    """
    config = sites_config[site_name]
    rss_text = fetch_rss(config["rss"])
    # Patch RSS issue in walla. This might create duplicate `guid` field.
    # NOTE(review): this replaces *every* occurrence of "link" in the raw
    # feed text (including any inside titles/descriptions), not only tag
    # names — kept as-is since downstream parsing relies on it; confirm
    # item text can never legitimately contain the substring "link".
    rss_text = rss_text.replace("link", "guid")
    rss_soup = BeautifulSoup(rss_text, features="lxml")
    rss_soup_items = rss_soup.find_all("item")
    # Raise instead of assert: asserts are stripped under `python -O`,
    # and an empty feed is a data error that must always be surfaced.
    if not rss_soup_items:
        raise ValueError(f"no RSS items found for site {site_name!r}")
    for item_rss_soup in rss_soup_items:
        link = item_rss_soup.guid.get_text()
        date = timezones.parse_creation_datetime(item_rss_soup.pubdate.get_text())
        html_text = fetch_html(link)
        item_html_soup = BeautifulSoup(html_text, "lxml")
        author, title, description = config["parser"](item_rss_soup, item_html_soup)
        yield NewsFlash(
            link=link,
            date=date,
            source=site_name,
            author=author,
            title=title,
            description=description,
            accident=False,
        )
def parse_tweet(tweet, screen_name):
    """Build a NewsFlash from a raw Twitter API tweet dict.

    Args:
        tweet: tweet payload with ``id_str``, ``created_at`` and
            ``full_text`` keys.
        screen_name: the account's screen name; must be a key of
            ``to_hebrew``.

    Returns:
        A NewsFlash with ``source="twitter"`` and ``accident=False``;
        the tweet text serves as both title and description.
    """
    tweet_id = tweet["id_str"]
    text = tweet["full_text"]
    return NewsFlash(
        link=f"https://twitter.com/{screen_name}/status/{tweet_id}",
        date=timezones.parse_creation_datetime(tweet["created_at"]),
        source="twitter",
        author=to_hebrew[screen_name],
        title=text,
        description=text,
        tweet_id=int(tweet_id),
        accident=False,
    )
def test_timeparse():
    """All three source timestamp formats parse to the same instant."""
    parsed = [
        timezones.parse_creation_datetime(stamp)
        for stamp in (
            "Sun May 31 08:26:18 +0000 2020",   # twitter format
            "Sun, 31 May 2020 11:26:18 +0300",  # ynet format
            "Sun, 31 May 2020 08:26:18 GMT",    # walla format
        )
    ]
    assert parsed[0] == parsed[1] == parsed[2]