def _source_fetch(source):
    url = util.first_present([source.fetch_url_override, source.url])
    markup = url_fetch(url)
    if not markup:
        warning("SF: URL error fetching {0}".format(source.url))
        return None

    results = []
    rpcs = []

    def got_result(res):
        if res:
            results.append(res)

    def add_rpc(rpc):
        rpcs.append(rpc)

    # Run every fetch strategy for this source; each may report a result
    # synchronously via got_result or enqueue an async RPC via add_rpc.
    for fn in fetch_functions_for_source(source):
        fn(source, markup, url, add_rpc, got_result)

    # Block until all outstanding RPCs have completed:
    while rpcs:
        rpcs.pop(0).wait()

    result = results[0] if results else None
    if result:
        debug("SF: Fetched {0} as {1} source with {2} entries".format(url, result.method, len(result.entries)))
        debug("SF: starting brand fetch")
        result.brand = extract_brand(markup, source.url)
        debug("SF: done with brand fetch")
    else:
        warning("SF: Couldn't fetch {0} using any method".format(url))
    return result
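
# The loop in _source_fetch drives a small callback protocol: each strategy
# returned by fetch_functions_for_source receives (source, markup, url,
# add_rpc, got_result) and either reports a result synchronously through
# got_result or registers an object exposing .wait() through add_rpc. A
# minimal sketch of an async strategy under those assumptions; DeferredRpc
# and rss_parse_async are hypothetical names for illustration only:

class DeferredRpc(object):
    def __init__(self, fn):
        self.fn = fn

    def wait(self):
        # Runs the deferred work when _source_fetch drains the RPC queue.
        self.fn()

def fetch_rss_strategy(source, markup, url, add_rpc, got_result):
    def finish():
        result = rss_parse_async(source, markup, url)  # hypothetical helper
        got_result(result)  # got_result ignores None results
    add_rpc(DeferredRpc(finish))
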
def detect_fetch_data(source):
    url = util.first_present([source.fetch_url_override, source.url])
    twitter_data = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if twitter_data:
        return twitter_data, None

    markup = util.url_fetch(url)
    if not markup:
        return None, None

    # Is this an RSS feed itself?
    feed = parse_as_feed(markup)
    if feed:
        return {"type": "rss", "url": url}, feed

    # Try finding an RSS feed linked from the page markup:
    soup = bs4.BeautifulSoup(markup, 'lxml')
    feed_url = rss_tools.find_linked_rss(soup, url)
    if feed_url:
        return {"type": "rss", "url": feed_url}, None

    # Fall back to the conventional WordPress feed location:
    wp_rss_link = url + "/?feed=rss"
    feed = parse_as_feed(util.url_fetch(wp_rss_link))
    if feed:
        return {"type": "rss", "url": wp_rss_link}, feed

    # Is there a Twitter account linked?
    twitter_data = twitter_source_fetch.linked_twitter_fetch_data(soup)
    if twitter_data:
        return twitter_data, None

    return None, None
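
# parse_as_feed is called above but not defined in this file. A minimal
# sketch of what it might look like, assuming the feedparser library and
# treating "a feed" as a clean parse with at least one entry; this is an
# illustration, not necessarily the project's actual implementation:

import feedparser

def parse_as_feed(markup):
    if not markup:
        return None
    parsed = feedparser.parse(markup)
    # feedparser flags malformed input via `bozo`; also require at least one
    # entry so arbitrary HTML pages don't register as feeds.
    if parsed.bozo or not parsed.entries:
        return None
    return parsed
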
def fetch_normal():
    # Closure over `url`, `article`, and `content` from the enclosing scope.
    response = url_fetch(url, return_response_obj=True)
    if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
        markup = response.read()
    else:
        warning('BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE')
        markup = None
    if not markup:
        return False

    # Process the markup:
    markup_soup = BeautifulSoup(markup, 'lxml')
    og_title = find_meta_value(markup_soup, 'og:title')
    og_image = find_meta_value(markup_soup, 'og:image')
    og_description = find_meta_value(markup_soup, 'og:description')
    title_field = find_title(markup_soup)
    article.site_name = find_meta_value(markup_soup, 'og:site_name')

    # Find the author:
    article.author = find_author(markup_soup)

    # Parse and process the article content:
    content.html = article_extractor.extract(markup, article.url)
    doc_soup = BeautifulSoup(content.html, 'lxml')

    article.title = first_present([og_title, title_field, article.title])
    article.top_image = make_url_absolute(first_present([article.top_image, og_image]))

    populate_article_json(article, content)

    # Compute the description, preferring og:description over extracted text:
    description = None
    if og_description and len(og_description.strip()):
        description = truncate(og_description.strip(), words=40)
    elif content.text and len(content.text.strip()) > 0:
        description = truncate(content.text, words=40)
    article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None

    return True
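
# find_meta_value is used above to read the OpenGraph tags. A minimal sketch,
# assuming it looks up a <meta> tag by its `property` attribute (falling back
# to `name`) and returns its `content`; an illustration, not necessarily the
# project's actual helper:

def find_meta_value(soup, key):
    tag = soup.find('meta', attrs={'property': key}) or soup.find('meta', attrs={'name': key})
    if tag and tag.get('content'):
        return tag['content'].strip()
    return None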