def detect_fetch_data(source):
    """Work out how content for *source* should be fetched.

    Tries strategies in priority order:
      1. the URL is itself a Twitter source,
      2. the URL is itself an RSS feed,
      3. the fetched page links to an RSS feed,
      4. the WordPress convention `<url>/?feed=rss` serves a feed,
      5. the fetched page links to a Twitter account.

    Returns a ``(fetch_data, feed)`` tuple. ``fetch_data`` is a dict
    describing how to fetch (or None when nothing usable was found);
    ``feed`` is the already-parsed feed when one was obtained along the
    way (so the caller can skip refetching), otherwise None.
    """
    url = util.first_present([source.fetch_url_override, source.url])

    # Is the URL itself a Twitter source?
    twitter_data = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if twitter_data:
        return twitter_data, None

    markup = util.url_fetch(url)
    if not markup:
        return None, None

    # Is this an RSS feed itself?
    feed = parse_as_feed(markup)
    if feed:
        return {"type": "rss", "url": url}, feed

    # Try finding some linked RSS:
    soup = bs4.BeautifulSoup(markup, 'lxml')
    feed_url = rss_tools.find_linked_rss(soup, url)
    if feed_url:
        return {"type": "rss", "url": feed_url}, None

    # WordPress convention: many WP sites serve a feed at /?feed=rss.
    wp_rss_link = url + "/?feed=rss"
    wp_markup = util.url_fetch(wp_rss_link)
    # Guard the fetch result before parsing, mirroring the check on the
    # main URL above — url_fetch returns a falsy value on failure.
    if wp_markup:
        feed = parse_as_feed(wp_markup)
        if feed:
            return {"type": "rss", "url": wp_rss_link}, feed

    # Is there a twitter account linked?
    twitter_data = twitter_source_fetch.linked_twitter_fetch_data(soup)
    if twitter_data:
        return twitter_data, None

    return None, None
def rss_fetch(data, feed_content):
    """Fetch and normalize the RSS feed described by *data*.

    Args:
        data: dict with a 'url' key pointing at the feed.
        feed_content: an already-parsed feed (as produced by
            parse_as_feed), or falsy to fetch and parse the URL here.

    Returns:
        FetchResult('rss', feed_title, entries), where each entry dict
        carries 'title', 'url' (absolutized against the feed URL) and
        'published' (a datetime, or None when the feed gives no usable
        timestamp), plus whatever the source entry processor adds.
        Returns None when the feed cannot be fetched or parsed.
    """
    url = data['url']

    if not feed_content:
        markup = util.url_fetch(url)
        if markup:
            feed_content = parse_as_feed(markup)
    if not feed_content:
        return None

    parsed = feed_content
    source_entry_processor = create_source_entry_processor(url)
    # .get(): some feeds omit a title; don't raise KeyError on them.
    feed_title = parsed['feed'].get('title')

    entries = []
    for entry in parsed['entries']:
        # Entries without both a link and a title are unusable; skip them.
        if 'link' in entry and 'title' in entry:
            link_url = urljoin(url, entry['link'].strip())
            title = entry['title']
            # `or` (rather than a .get default) so that a present-but-None
            # published_parsed still falls back to updated_parsed.
            pub_time = entry.get('published_parsed') or entry.get('updated_parsed')
            if pub_time:
                published = datetime.datetime.fromtimestamp(mktime(pub_time))
            else:
                published = None
            result_entry = {"title": title, "url": link_url, "published": published}
            source_entry_processor(result_entry, entry)
            entries.append(result_entry)

    return FetchResult('rss', feed_title, entries)