def parse_source(self, existing_ids=None):
    article_urls = []
    feed_content = get_rss(self.VAL202_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        guid = feed_entry["guid"]
        if existing_ids and get_sha_hash(guid) in existing_ids:
            logger.debug("Skipping %s", guid)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        try:
            text = feed_entry["content"][0]["value"]
            # Strip HTML
            soup = bs4.BeautifulSoup(text, "html.parser")
            text = soup.text
        except KeyError:
            # Entry without inline content: skip just this entry instead of
            # returning and discarding the articles collected so far.
            continue
        title = feed_entry["title"]
        author = feed_entry.get("author", None)
        article_urls.append((link, {
            "guid": guid,
            "published": published_date,
            "title": title,
            "text": text,
            "author": author
        }))
    return article_urls
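# The helpers used throughout these parsers (get_rss, get_sha_hash, get_hash,
# time_to_datetime) are project utilities not shown in this section. Below is
# a minimal sketch of plausible implementations, assuming get_rss wraps
# feedparser and the hash helpers return hex digests of the UTF-8 encoded
# string; the real helpers may differ in digest algorithm and error handling.
import datetime
import hashlib
import time

import feedparser


def get_rss(url):
    # Fetch and parse an RSS/Atom feed; the result exposes .entries.
    return feedparser.parse(url)


def get_sha_hash(value):
    # Stable identifier for a GUID/URL (assumed SHA-1 hex digest here).
    return hashlib.sha1(value.encode("utf-8")).hexdigest()


def get_hash(value):
    # Legacy identifier kept for already-stored articles (assumed MD5 here).
    return hashlib.md5(value.encode("utf-8")).hexdigest()


def time_to_datetime(struct):
    # Convert feedparser's time.struct_time into a datetime object.
    return datetime.datetime.fromtimestamp(time.mktime(struct))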
def parse_source(self, existing_ids=None):
    news = []
    feed_content = get_rss(self.FINANCE_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and get_sha_hash(link) in existing_ids:
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        news.append((link, {"published": published_date}))
    return news
def parse_source(self, existing_ids=None):
    feed_content = get_rss(self.DELO_RSS_URL)
    article_urls = []
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and (get_hash(link) in existing_ids or
                             get_sha_hash(link) in existing_ids):
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        article_urls.append((link, {"published": published_date}))
    return article_urls
def parse_source(self, existing_ids=None):
    news = []
    for rss_feed in self.RTV_RSS_URLS:
        logger.debug("Parsing %s", rss_feed)
        feed_content = get_rss(rss_feed)
        for feed_entry in feed_content.entries:
            # Download article
            link = feed_entry["link"]
            if existing_ids and (get_hash(link) in existing_ids or
                                 get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue
            published_date = time_to_datetime(
                feed_entry["published_parsed"])
            news.append((link, {"published": published_date}))
    return news
def parse_source(self, existing_ids=None):
    article_urls = []
    feed_content = get_rss(self.MONITOR_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        guid = feed_entry["guid"]
        if existing_ids and get_sha_hash(guid) in existing_ids:
            logger.debug("Skipping %s", guid)
            # Skip only this entry; returning here would drop the remaining
            # (possibly new) feed entries, as the other parsers avoid doing.
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        title = feed_entry["title"]
        article_urls.append((link, {
            "guid": guid,
            "title": title,
            "published": published_date
        }))
    return article_urls
def parse_source(self, existing_ids=None):
    news = []
    feed_content = get_rss(self.DNEVNIK_RSS_URL)
    max_counter = 30
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and (get_hash(link) in existing_ids or
                             get_sha_hash(link) in existing_ids):
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        title = feed_entry["title"]
        news.append((link, {"published": published_date, "title": title}))
        max_counter -= 1
        if max_counter <= 0:
            break
    return news
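# A hypothetical caller sketch (not part of the original sources) showing how
# these parse_source implementations might be driven: hash the identifiers of
# already-stored articles into existing_ids, then collect whatever is new.
# The crawl() function, its parameters, and the parser list are assumptions
# for illustration only.
def crawl(parsers, stored_identifiers):
    # stored_identifiers: iterable of GUIDs/URLs already in the database.
    existing_ids = {get_sha_hash(identifier) for identifier in stored_identifiers}
    new_articles = []
    for parser in parsers:
        parsed = parser.parse_source(existing_ids=existing_ids)
        if parsed:
            # Each item is (url, metadata_dict), e.g. with "published", "title".
            new_articles.extend(parsed)
    return new_articles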