import datetime

from google.appengine.api import taskqueue
from google.appengine.ext import ndb

# Article, source_search, canonical_url, shared_suffix, debug and _source_fetch
# are defined elsewhere in this module/package.


def source_fetch(source):
    debug("SF: Doing fetch for source: {0}".format(source.url))
    result = _source_fetch(source)
    debug("SF: Done with source fetch for {0}; result type: {1}".format(source.url, (result.method if result else None)))
    added_any = False
    now = datetime.datetime.now()
    to_put = []
    tasks_to_enqueue = []
    if result:
        if result.feed_title:
            source.title = result.feed_title
        if result.brand:
            source.brand = result.brand
        titles = [entry['title'] for entry in result.entries if entry['title']]
        source.shared_title_suffix = shared_suffix(titles)
        entries = result.entries[:25]  # cap each fetch at 25 entries
        entry_ids = [Article.id_for_article(entry['url'], source.url) for entry in entries]
        print "ENTRY IDs:", entry_ids
        print "Entry id lens:", str(map(len, entry_ids))
        article_futures = [Article.get_or_insert_async(entry_id) for entry_id in entry_ids]
        articles = [future.get_result() for future in article_futures]
        print "ARTICLE_OBJECTS:", articles
        for i, (entry, article) in enumerate(zip(entries, articles)):
            # A freshly created Article has no url yet; only populate new ones.
            if not article.url:
                added_any = True
                article.added_date = now
                article.added_order = i
                article.source = source.key
                article.url = canonical_url(entry.get('url'))
                article.submission_url = canonical_url(entry.get('submission_url'))
                if entry['published']:
                    article.published = entry['published']
                else:
                    article.published = datetime.datetime.now()
                if not article.title:
                    article.title = entry['title']
                to_put.append(article)
                delay = (i + 1) * 4  # stagger article fetch tasks 4 seconds apart
                tasks_to_enqueue.append(article.create_fetch_task(delay=delay))
    debug("SF: About to put {0} items".format(len(to_put)))
    if to_put:
        ndb.put_multi(to_put)
    debug("SF: About to enqueue")
    if tasks_to_enqueue:
        taskqueue.Queue('articles').add_async(tasks_to_enqueue)
    debug("SF: done enqueuing")
    if added_any:
        source.most_recent_article_added_date = now
        source_search.add_source_to_index(source)
    source.last_fetched = now
    source.put()
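# The shared_suffix() helper used above isn't shown in this section. The sketch
# below is a hypothetical illustration of the behavior the calling code relies
# on (longest suffix common to every title, e.g. " - Example News"); it is an
# assumption, not the module's actual implementation, and the name
# _shared_suffix_sketch is made up for illustration.
import os

def _shared_suffix_sketch(titles):
    # Longest common suffix == reversed longest common prefix of the reversed
    # strings; returns "" when the titles share no suffix (or the list is empty).
    if not titles:
        return ""
    return os.path.commonprefix([t[::-1] for t in titles])[::-1]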
def article_title_processor(articles):
    # Takes article JSON dicts and removes redundant title suffixes shared
    # across the batch (e.g. " - Site Name"), plus anything after " | ".
    good_titles = [a.get('title') for a in articles if a.get('fetch_failed') is False]
    suffix_to_strip = shared_suffix(good_titles) if len(good_titles) >= 2 else None

    def process(article):
        title = article.get('title') or ""
        if suffix_to_strip and len(title) > len(suffix_to_strip) and title.endswith(suffix_to_strip):
            title = title[:-len(suffix_to_strip)]
        title = title.split(u" | ")[0]
        article['title'] = title.strip()
        return article

    return map(process, articles)
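# Hypothetical usage sketch (illustrative only; the article dicts and site name
# below are invented, and it assumes shared_suffix() returns the longest common
# suffix, as in the sketch above):
#
#   articles = [
#       {'title': u'First story - Example Site', 'fetch_failed': False},
#       {'title': u'Second story | Sponsored - Example Site', 'fetch_failed': False},
#   ]
#   article_title_processor(articles)
#   # titles become u'First story' and u'Second story'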