def test_rel_feed_anchor(self): """Check that we follow the rel=feed when it's in an <a> tag instead of <link> """ self.expect_requests_get( "http://author", """ <html> <head> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> <body> <a href="try_this.html" rel="feed">full unfiltered feed</a> </body> </html>""", ) self.expect_requests_get( "http://author/try_this.html", """ <html class="h-feed"> <body> <div class="h-entry">Hi</div> </body> </html>""", ) self.mox.ReplayAll() discover(self.source, self.activity)
def test_rel_feed_link(self): """Check that we follow the rel=feed link when looking for the author's full feed URL """ self.expect_requests_get( "http://author", """ <html> <head> <link rel="feed" type="text/html" href="try_this.html"> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> </html>""", ) self.expect_requests_get( "http://author/try_this.html", """ <html class="h-feed"> <body> <div class="h-entry">Hi</div> </body> </html>""", ) self.mox.ReplayAll() discover(self.source, self.activity)
def test_avoid_permalink_with_bad_content_type(self): """Confirm that we don't follow u-url's that lead to anything that isn't text/html (e.g., PDF) """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] activity['object']['url'] = 'https://fa.ke/post/url' activity['object']['content'] = 'content without links' # head request to follow redirects on the post url self.expect_requests_head(activity['object']['url']) self.expect_requests_head('http://author') self.expect_requests_get('http://author', """ <html> <body> <div class="h-entry"> <a href="http://scholarly.com/paper.pdf">An interesting paper</a> </div> </body> </html> """) # and to check the content-type of the article self.expect_requests_head('http://scholarly.com/paper.pdf', response_headers={ 'content-type': 'application/pdf' }) # call to requests.get for permalink should be skipped self.mox.ReplayAll() original_post_discovery.discover(source, activity)
def get_webmention_targets(source, activity):
  """Returns a set of string target URLs to attempt to send webmentions to.

  Side effect: runs the original post discovery algorithm on the activity and
  adds the resulting URLs to the activity as tags, in place.

  Args:
    source: models.Source subclass
    activity: activity dict
  """
  original_post_discovery.discover(source, activity)

  targets = set()
  obj = activity.get('object') or activity

  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, domain, send = util.get_webmention_target(url)
      tag['url'] = url
      if send:
        targets.add(url)

  for url in obj.get('upstreamDuplicates', []):
    url, domain, send = util.get_webmention_target(url)
    if send:
      targets.add(url)

  return targets
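# The tag filtering and dedup in get_webmention_targets() above is the part
# that's easiest to get subtly wrong, so here is a minimal, self-contained
# sketch of the same idea on plain dicts. resolve() is a hypothetical stand-in
# for util.get_webmention_target(), not Bridgy's actual API.
def resolve(url):
  cleaned = url.rstrip('/')            # pretend this follows redirects etc.
  send = not cleaned.endswith('.pdf')  # pretend this checks the blocklist
  return cleaned, send

def targets_from_activity(activity):
  obj = activity.get('object') or activity
  targets = set()
  for tag in obj.get('tags', []):
    if tag.get('objectType') == 'article' and tag.get('url'):
      url, send = resolve(tag['url'])
      if send:
        targets.add(url)
  for url in obj.get('upstreamDuplicates', []):
    url, send = resolve(url)
    if send:
      targets.add(url)
  return targets

# targets_from_activity({'object': {
#   'tags': [{'objectType': 'article', 'url': 'http://author/post/'},
#            {'objectType': 'mention', 'url': 'http://elsewhere/'}],
#   'upstreamDuplicates': ['http://author/post'],
# }}) == {'http://author/post'}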
def _test_failed_post_permalink_fetch(self, raise_exception): """Make sure something reasonable happens when we're unable to fetch the permalink of an entry linked in the h-feed """ self.expect_requests_get( "http://author", """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="nonexistent.html"></a> </article> </html> """, ) if raise_exception: self.expect_requests_get("http://author/nonexistent.html").AndRaise(HTTPError()) else: self.expect_requests_get("http://author/nonexistent.html", status_code=410) self.mox.ReplayAll() discover(self.source, self.activity) # we should have saved placeholders to prevent us from trying the # syndication url or permalink again self.assert_syndicated_posts(("http://author/nonexistent.html", None), (None, "https://fa.ke/post/url"))
def test_existing_syndicated_posts(self): """Confirm that no additional requests are made if we already have a SyndicatedPost in the DB. """ original_url = 'http://author/notes/2014/04/24/1' syndication_url = 'https://fa.ke/post/url' source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] activity['object']['url'] = syndication_url activity['object']['content'] = 'content without links' # save the syndicated post ahead of time (as if it had been # discovered previously) SyndicatedPost(parent=source.key, original=original_url, syndication=syndication_url).put() self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) # should append the author note url, with no addt'l requests self.assertEquals([original_url], activity['object']['upstreamDuplicates'])
def test_match_facebook_username_url(self):
  """Facebook URLs use username and user id interchangeably, and one does
  not redirect to the other. Make sure we can still find the relationship
  if authors publish syndication links using their username.
  """
  source = FacebookPage.new(self.handler, auth_entity=self.auth_entity)
  source.domain_urls = ['http://author']
  activity = self.activities[0]
  # facebook activity comes to us with the numeric id
  activity['object']['url'] = 'http://facebook.com/212038/posts/314159'
  activity['object']['content'] = 'content without links'

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>
  </html>""")

  # user sensibly publishes syndication link using their username
  self.expect_requests_get('http://author/post/permalink', """
  <html class="h-entry">
    <a class="u-url" href="http://author/post/permalink"></a>
    <a class="u-syndication" href="http://facebook.com/snarfed.org/posts/314159"></a>
  </html>""")

  self.mox.ReplayAll()
  original_post_discovery.discover(source, activity)

  self.assertEquals(['http://author/post/permalink'],
                    activity['object']['upstreamDuplicates'])
def _test_failed_rel_feed_link_fetch(self, raise_exception): """An author page with an invalid rel=feed link. We should recover and use any h-entries on the main url as a fallback. """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] self.expect_requests_get('http://author', """ <html> <head> <link rel="feed" type="text/html" href="try_this.html"> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> <body> <div class="h-entry"> <a class="u-url" href="recover_and_fetch_this.html"></a> </div> </body> </html>""") # try to do this and fail if raise_exception: self.expect_requests_get('http://author/try_this.html').AndRaise(HTTPError()) else: self.expect_requests_get('http://author/try_this.html', status_code=404) # despite the error, should fallback on the main page's h-entries and # check the permalink self.expect_requests_get('http://author/recover_and_fetch_this.html') self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity)
def test_match_facebook_username_url(self):
  """Facebook URLs use username and user id interchangeably, and one does
  not redirect to the other. Make sure we can still find the relationship
  if authors publish syndication links using their username.
  """
  auth_entity = oauth_facebook.FacebookAuth(
    id='my_string_id', auth_code='my_code', access_token_str='my_token',
    user_json=json.dumps({'id': '212038', 'username': '******'}))
  auth_entity.put()

  source = FacebookPage.new(self.handler, auth_entity=auth_entity,
                            domain_urls=['http://author'])

  # facebook activity comes to us with the numeric id
  self.activity['object']['url'] = 'http://facebook.com/212038/posts/314159'

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>
  </html>""")

  # user sensibly publishes syndication link using their username
  self.expect_requests_get('http://author/post/permalink', """
  <html class="h-entry">
    <a class="u-url" href="http://author/post/permalink"></a>
    <a class="u-syndication" href="http://facebook.com/snarfed.org/posts/314159"></a>
  </html>""")

  self.mox.ReplayAll()
  original_post_discovery.discover(source, self.activity)

  self.assertEquals(['http://author/post/permalink'],
                    self.activity['object']['upstreamDuplicates'])
def test_avoid_permalink_with_bad_content_type(self): """Confirm that we don't follow u-url's that lead to anything that isn't text/html (e.g., PDF) """ # head request to follow redirects on the post url self.expect_requests_head(self.activity["object"]["url"]) self.expect_requests_head("http://author") self.expect_requests_get( "http://author", """ <html> <body> <div class="h-entry"> <a href="http://scholarly.com/paper.pdf">An interesting paper</a> </div> </body> </html> """, ) # and to check the content-type of the article self.expect_requests_head( "http://scholarly.com/paper.pdf", response_headers={"content-type": "application/pdf"} ) # call to requests.get for permalink should be skipped self.mox.ReplayAll() discover(self.source, self.activity)
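# test_avoid_permalink_with_bad_content_type above relies on checking a
# candidate permalink's content type with a cheap HEAD request before fetching
# it. A minimal sketch of that gate using the requests library; this is an
# illustration, not Bridgy's implementation (which lives in its util module).
import requests

def is_html(url, timeout=15):
  try:
    resp = requests.head(url, allow_redirects=True, timeout=timeout)
  except requests.RequestException:
    return False
  return resp.headers.get('content-type', '').split(';')[0].strip() == 'text/html'

# e.g. a URL whose server reports application/pdf (like the paper.pdf link in
# the test fixture) returns False, so the follow-up GET is skipped.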
def test_no_h_entries(self): """Make sure nothing bad happens when fetching a feed without h-entries """ activity = self.activities[0] activity['object']['content'] = 'post content without backlink' activity['object']['url'] = 'https://fa.ke/post/url' # silo domain is fa.ke source = self.sources[0] source.domain_urls = ['http://author'] self.expect_requests_get('http://author', """ <html class="h-feed"> <p>under construction</p> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) self.assert_equals( [(None, 'https://fa.ke/post/url')], [(relationship.original, relationship.syndication) for relationship in SyndicatedPost.query(ancestor=source.key)])
def test_do_not_fetch_hfeed(self):
  """Confirms behavior of discover() when fetch_hfeed=False.

  Discovery should only check the database for previously discovered matches.
  It should not make any GET requests
  """
  discover(self.source, self.activity, fetch_hfeed=False)
  self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
def test_rel_feed_link(self): """Check that we follow the rel=feed link when looking for the author's full feed URL """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] self.expect_requests_get('http://author', """ <html> <head> <link rel="feed" type="text/html" href="try_this.html"> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> </html>""") self.expect_requests_get('http://author/try_this.html', """ <html class="h-feed"> <body> <div class="h-entry">Hi</div> </body> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity)
def test_refetch_multiple_responses_same_activity(self): """Ensure that refetching a post that has several replies does not generate duplicate original -> None blank entries in the database. See https://github.com/snarfed/bridgy/issues/259 for details """ for activity in self.activities: activity['object']['content'] = 'post content without backlinks' activity['object']['url'] = 'https://fa.ke/post/url' author_feed = """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </div> </html>""" author_entry = """ <html class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </html>""" # original self.expect_requests_get('http://author', author_feed) self.expect_requests_get('http://author/post/permalink', author_entry) # refetch self.expect_requests_get('http://author', author_feed) self.expect_requests_get('http://author/post/permalink', author_entry) self.mox.ReplayAll() for activity in self.activities: discover(self.source, activity) refetch(self.source) self.assert_syndicated_posts(('http://author/post/permalink', None), (None, 'https://fa.ke/post/url'))
def test_feed_head_request_failed(self):
  """Confirm that we fall back to the author page's h-entries when the
  HEAD and GET requests for the rel=feed URL fail.
  """
  self.expect_requests_get('http://author', """
  <html>
    <head>
      <link rel="feed" href="/updates">
    </head>
    <body>
      <article class="h-entry">
        <a class="u-url" href="permalink"></a>
      </article>
    </body>
  </html>
  """)

  # head request to follow redirects on the post url
  self.expect_requests_head(self.activity['object']['url'])

  # and for the author url
  self.expect_requests_head('http://author')

  # try and fail to get the feed
  self.expect_requests_head('http://author/updates', status_code=400)
  self.expect_requests_get('http://author/updates', status_code=400)

  # fall back on the original page, and fetch the post permalink
  self.expect_requests_head('http://author/permalink')
  self.expect_requests_get('http://author/permalink', '<html></html>')

  self.mox.ReplayAll()
  discover(self.source, self.activity)
def test_rel_feed_link_error(self): """Author page has an h-feed link that raises an exception. We should recover and use the main page's h-entries as a fallback.""" self.expect_requests_get('http://author', """ <html> <head> <link rel="feed" type="text/html" href="try_this.html"> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> <body> <div class="h-entry"> <a class="u-url" href="recover_and_fetch_this.html"></a> </div> </body> </html>""") # try to do this and fail self.expect_requests_get('http://author/try_this.html', 'nope', status_code=404) # despite the error, should fallback on the main page's h-entries and # check the permalink self.expect_requests_get('http://author/recover_and_fetch_this.html', 'ok') self.mox.ReplayAll() discover(self.source, self.activity)
def test_rel_feed_anchor(self): """Check that we follow the rel=feed when it's in an <a> tag instead of <link> """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] self.expect_requests_get('http://author', """ <html> <head> <link rel="alternate" type="application/xml" href="not_this.html"> <link rel="alternate" type="application/xml" href="nor_this.html"> </head> <body> <a href="try_this.html" rel="feed">full unfiltered feed</a> </body> </html>""") self.expect_requests_get('http://author/try_this.html', """ <html class="h-feed"> <body> <div class="h-entry">Hi</div> </body> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity)
def _test_failed_post_permalink_fetch(self, raise_exception): """Make sure something reasonable happens when we're unable to fetch the permalink of an entry linked in the h-feed """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] activity['object']['url'] = 'https://fa.ke/post/url' activity['object']['content'] = 'content without links' self.expect_requests_get('http://author', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="nonexistent.html"></a> </article> </html> """) if raise_exception: self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError()) else: self.expect_requests_get('http://author/nonexistent.html', status_code=410) self.mox.ReplayAll() original_post_discovery.discover(source, activity) # we should have saved placeholders to prevent us from trying the # syndication url or permalink again self.assert_equals( set([('http://author/nonexistent.html', None), (None, 'https://fa.ke/post/url')]), set((relationship.original, relationship.syndication) for relationship in SyndicatedPost.query(ancestor=source.key)))
def test_syndication_url_in_hfeed(self): """Like test_single_post, but because the syndication URL is given in the h-feed we skip fetching the permalink. New behavior as of 2014-11-08 """ self.activity['object']['upstreamDuplicates'] = ['existing uD'] # silo domain is fa.ke self.expect_requests_get('http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> <a class="u-syndication" href="http://fa.ke/post/url"> </div> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', self.source, self.activity) original_post_discovery.discover(self.source, self.activity) # upstreamDuplicates = 1 original + 1 discovered self.assertEquals(['existing uD', 'http://author/post/permalink'], self.activity['object']['upstreamDuplicates']) origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)] self.assertEquals([u'http://author/post/permalink'], origurls) # for now only syndicated posts belonging to this source are stored syndurls = list(r.syndication for r in SyndicatedPost.query(ancestor=self.source.key)) self.assertEquals([u'https://fa.ke/post/url'], syndurls)
def test_no_author_url(self):
  """Make sure something reasonable happens when the author doesn't have a url
  at all.
  """
  self.source.domain_urls = []
  discover(self.source, self.activity)

  # nothing attempted, and no SyndicatedPost saved
  self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
def test_multiple_refetches(self): """Ensure that multiple refetches of the same post (with and without u-syndication) does not generate duplicate blank entries in the database. See https://github.com/snarfed/bridgy/issues/259 for details """ self.activities[0]['object'].update({ 'content': 'post content without backlinks', 'url': 'https://fa.ke/post/url', }) hfeed = """<html class="h-feed"> <a class="h-entry" href="/permalink"></a> </html>""" unsyndicated = """<html class="h-entry"> <a class="u-url" href="/permalink"></a> </html>""" syndicated = """<html class="h-entry"> <a class="u-url" href="/permalink"></a> <a class="u-syndication" href="https://fa.ke/post/url"></a> </html>""" # first attempt, no syndication url yet self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', unsyndicated) # refetch, still no syndication url self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', unsyndicated) # second refetch, has a syndication url this time self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', syndicated) self.mox.ReplayAll() original_post_discovery.discover(self.source, self.activities[0]) original_post_discovery.refetch(self.source) relations = list( SyndicatedPost.query( SyndicatedPost.original == 'http://author/permalink', ancestor=self.source.key).fetch()) self.assertEquals(1, len(relations)) self.assertEquals('http://author/permalink', relations[0].original) self.assertIsNone(relations[0].syndication) original_post_discovery.refetch(self.source) relations = list( SyndicatedPost.query( SyndicatedPost.original == 'http://author/permalink', ancestor=self.source.key).fetch()) self.assertEquals(1, len(relations)) self.assertEquals('http://author/permalink', relations[0].original) self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
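# test_multiple_refetches above guards against duplicate blank rows. The core
# idea is an idempotent upsert: insert a placeholder the first time a permalink
# is seen, upgrade it in place when a syndication link appears later, and never
# insert a second blank row. A self-contained sketch with a dict standing in
# for the SyndicatedPost table; names here are illustrative only.
def record_discovery(store, original, syndication=None):
  if original not in store:
    # first fetch: remember the permalink even without a syndication link,
    # so the next refetch doesn't create another blank row
    store[original] = syndication
  elif syndication and store[original] is None:
    # a later refetch found the syndication link: upgrade the blank row
    store[original] = syndication
  # otherwise leave the existing row alone

store = {}
record_discovery(store, 'http://author/permalink')    # blank placeholder
record_discovery(store, 'http://author/permalink')    # no-op, no duplicate
record_discovery(store, 'http://author/permalink', 'https://fa.ke/post/url')
# store == {'http://author/permalink': 'https://fa.ke/post/url'}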
def test_merge_front_page_and_h_feed(self): """Make sure we are correctly merging the front page and rel-feed by checking that we visit h-entries that are only the front page or only the rel-feed page. """ activity = self.activities[0] activity['object'].update({ 'content': 'post content without backlink', 'url': 'https://fa.ke/post/url', 'upstreamDuplicates': ['existing uD'], }) # silo domain is fa.ke source = self.sources[0] source.domain_urls = ['http://author'] self.expect_requests_get('http://author', """ <link rel="feed" href="/feed"> <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/only-on-frontpage"></a> </div> <div class="h-entry"> <a class="u-url" href="http://author/on-both"></a> </div> </html>""") self.expect_requests_get('http://author/feed', """ <link rel="feed" href="/feed"> <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/on-both"></a> </div> <div class="h-entry"> <a class="u-url" href="http://author/only-on-feed"></a> </div> </html>""") for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'): self.expect_requests_get('http://author%s' % orig, """<div class="h-entry"> <a class="u-url" href="%s"></a> </div>""" % orig).InAnyOrder() self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) # should be three blank SyndicatedPosts now for orig in ('http://author/only-on-frontpage', 'http://author/on-both', 'http://author/only-on-feed'): logging.debug('checking %s', orig) sp = SyndicatedPost.query( SyndicatedPost.original == orig, ancestor=source.key).get() self.assertTrue(sp) self.assertIsNone(sp.syndication)
def test_feed_type_unknown(self): """Confirm that we look for an h-feed with type=text/html even when the type is not given in <link>, and keep looking until we find one. """ self.expect_requests_get( "http://author", """ <html> <head> <link rel="feed" href="/updates.atom"> <link rel="feed" href="/updates.html"> <link rel="feed" href="/updates.rss"> </head> </html>""", ) # head request to follow redirects on the post url self.expect_requests_head(self.activity["object"]["url"]) # and for the author url self.expect_requests_head("http://author") # try to get the atom feed first self.expect_requests_head("http://author/updates.atom", content_type="application/xml") # keep looking for an html feed self.expect_requests_head("http://author/updates.html") # look at the rss feed last self.expect_requests_head("http://author/updates.rss", content_type="application/xml") # now fetch the html feed self.expect_requests_get( "http://author/updates.html", """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/permalink">should follow this</a> </article> </html>""", ) # should not try to get the rss feed at this point # but we will follow the post permalink # keep looking for an html feed self.expect_requests_head("http://author/permalink") self.expect_requests_get( "http://author/permalink", """ <html class="h-entry"> <p class="p-name">Title</p> </html>""", ) self.mox.ReplayAll() discover(self.source, self.activity)
def test_feed_type_unknown(self): """Confirm that we look for an h-feed with type=text/html even when the type is not given in <link>, and keep looking until we find one. """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] activity['object']['url'] = 'http://fa.ke/post/url' activity['object']['content'] = 'content without links' self.mox.StubOutWithMock(requests, 'head', use_mock_anything=True) self.expect_requests_get( 'http://author', """ <html> <head> <link rel="feed" href="/updates.atom"> <link rel="feed" href="/updates.html"> <link rel="feed" href="/updates.rss"> </head> </html>""") # head request to follow redirects on the post url self.expect_requests_head(activity['object']['url']) # and for the author url self.expect_requests_head('http://author') # try to get the atom feed first self.expect_requests_head( 'http://author/updates.atom', content_type='application/xml') # keep looking for an html feed self.expect_requests_head('http://author/updates.html') # now fetch the html feed self.expect_requests_get( 'http://author/updates.html', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/permalink">should follow this</a> </article> </html>""") # should not try to get the rss feed at this point # but we will follow the post permalink # keep looking for an html feed self.expect_requests_head('http://author/permalink') self.expect_requests_get( 'http://author/permalink', """ <html class="h-entry"> <p class="p-name">Title</p> </html>""") self.mox.ReplayAll() original_post_discovery.discover(source, activity)
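# Both versions of test_feed_type_unknown above exercise the same probing
# order: HEAD each candidate rel=feed URL and fetch the first one that reports
# text/html. A minimal sketch of that loop with the requests library, as an
# illustration rather than Bridgy's actual discovery code.
import requests

def first_html_feed(candidates, timeout=15):
  for url in candidates:
    try:
      resp = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
      continue
    if resp.headers.get('content-type', '').startswith('text/html'):
      return url
  return None

# first_html_feed(['http://author/updates.atom',
#                  'http://author/updates.html',
#                  'http://author/updates.rss'])
# skips the application/xml feeds and returns the .html candidate.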
def test_multiple_rel_feeds(self): """Make sure that we follow all rel=feed links, e.g. if notes and articles are in separate feeds.""" self.expect_requests_get('http://author', """ <html> <head> <link rel="feed" href="/articles" type="text/html"> <link rel="feed" href="/notes" type="text/html"> </head> </html>""") # fetches all feeds first self.expect_requests_get('http://author/articles', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/article-permalink"></a> </article> </html>""").InAnyOrder('feed') self.expect_requests_get('http://author/notes', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/note-permalink"></a> </article> </html>""").InAnyOrder('feed') # then the permalinks (in any order since they are hashed to # remove duplicates) self.expect_requests_get('http://author/article-permalink', """ <html class="h-entry"> <a class="u-url" href="/article-permalink"></a> <a class="u-syndication" href="https://fa.ke/article"></a> </html>""").InAnyOrder('permalink') self.expect_requests_get('http://author/note-permalink', """ <html class="h-entry"> <a class="u-url" href="/note-permalink"></a> <a class="u-syndication" href="https://fa.ke/note"></a> </html>""").InAnyOrder('permalink') self.mox.ReplayAll() original_post_discovery.discover(self.source, self.activity) note_rels = SyndicatedPost.query( SyndicatedPost.original == 'http://author/note-permalink', ancestor=self.source.key).fetch() self.assertEqual(1, len(note_rels)) self.assertEqual('https://fa.ke/note', note_rels[0].syndication) article_rels = SyndicatedPost.query( SyndicatedPost.original == 'http://author/article-permalink', ancestor=self.source.key).fetch() self.assertEqual(1, len(article_rels)) self.assertEqual('https://fa.ke/article', article_rels[0].syndication)
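# test_multiple_rel_feeds above depends on collecting *every* rel=feed link on
# the author page, not just the first. mf2py (the microformats2 parser Bridgy
# uses) exposes rel values under the parsed document's 'rels' key, with
# relative hrefs resolved against the page URL. A small illustrative sketch:
import mf2py

html = """
<html>
  <head>
    <link rel="feed" href="/articles" type="text/html">
    <link rel="feed" href="/notes" type="text/html">
  </head>
</html>"""

parsed = mf2py.parse(doc=html, url='http://author')
# parsed['rels'].get('feed', []) ==
#   ['http://author/articles', 'http://author/notes']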
def test_avoid_author_page_with_bad_content_type(self):
  """Confirm that we check the author page's content type before fetching and
  parsing it
  """
  # head request to follow redirects on the post url
  self.expect_requests_head(self.activity["object"]["url"])
  self.expect_requests_head("http://author", response_headers={
    "content-type": "application/xml",
  })

  # give up
  self.mox.ReplayAll()
  discover(self.source, self.activity)
def test_invalid_webmention_target(self):
  """Confirm that no additional requests are made if the author url is an
  invalid webmention target. Right now this pretty much just means they're on
  the blacklist. Eventually we want to filter out targets that don't have
  certain features, like a webmention endpoint or microformats.
  """
  self.source.domain_urls = ['http://amazon.com']
  discover(self.source, self.activity)

  # nothing attempted, but we should have saved a placeholder to prevent us
  # from trying again
  self.assert_syndicated_posts((None, 'https://fa.ke/post/url'))
def test_no_h_entries(self): """Make sure nothing bad happens when fetching a feed without h-entries. """ self.expect_requests_get('http://author', """ <html class="h-feed"> <p>under construction</p> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', self.source, self.activity) original_post_discovery.discover(self.source, self.activity) self.assert_syndicated_posts((None, 'https://fa.ke/post/url'))
def add_original_post_urls(self, post_id, obj, prop): """Extracts original post URLs and adds them to an object, in place. If the post object has upstreamDuplicates, *only* they are considered original post URLs and added as tags with objectType 'article', and the post's own links and 'article' tags are added with objectType 'mention'. Args: post_id: string post id obj: ActivityStreams post object prop: string property name in obj to add the original post URLs to """ post = None try: post = self.source.get_post(post_id) except: logging.warning('Error fetching source post %s', post_id, exc_info=True) return if not post: logging.warning('Source post %s not found', post_id) return original_post_discovery.discover(self.source, post, fetch_hfeed=False) tags = [tag for tag in post['object'].get('tags', []) if 'url' in tag and tag['objectType'] == 'article'] upstreams = post['object'].get('upstreamDuplicates', []) if not isinstance(obj.setdefault(prop, []), list): obj[prop] = [obj[prop]] if upstreams: obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams] obj.setdefault('tags', []).extend( [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags]) else: obj[prop] += tags # check for redirects, and if there are any follow them and add final urls # in addition to the initial urls. seen = set() for url_list in obj[prop], obj.get('tags', []): for url_obj in url_list: url = util.clean_webmention_url(url_obj.get('url', '')) if not url or url in seen: continue seen.add(url) # when debugging locally, replace my (snarfed.org) URLs with localhost url_obj['url'] = url = util.replace_test_domains_with_localhost(url) resolved, _, send = util.get_webmention_target(url) if send and resolved != url and resolved not in seen: seen.add(resolved) url_list.append({'url': resolved, 'objectType': url_obj.get('objectType')}) logging.info('After original post discovery, urls are: %s', seen)
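# add_original_post_urls() above treats upstreamDuplicates, when present, as
# the only true originals and demotes the post's own 'article' tags to
# 'mention'. A small sketch of that split on plain dicts; the helper name is
# illustrative, not part of the handler's API.
def originals_and_mentions(post_obj):
  article_tags = [t for t in post_obj.get('tags', [])
                  if t.get('url') and t.get('objectType') == 'article']
  upstreams = post_obj.get('upstreamDuplicates', [])
  if upstreams:
    originals = [{'url': u, 'objectType': 'article'} for u in upstreams]
    mentions = [{'url': t['url'], 'objectType': 'mention'} for t in article_tags]
  else:
    originals, mentions = article_tags, []
  return originals, mentions

# originals_and_mentions({
#   'upstreamDuplicates': ['http://author/post'],
#   'tags': [{'objectType': 'article', 'url': 'http://other/link'}],
# }) ==
# ([{'url': 'http://author/post', 'objectType': 'article'}],
#  [{'url': 'http://other/link', 'objectType': 'mention'}])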
def test_rel_feed_adds_to_domains(self):
  """rel=feed discovery should update Source.domains."""
  self.expect_requests_get('http://author', """
  <html>
    <head>
      <link rel="feed" type="text/html" href="http://other/domain">
    </head>
  </html>""")
  self.expect_requests_get('http://other/domain', 'foo')
  self.mox.ReplayAll()

  discover(self.source, self.activity)
  self.assertEquals(['author', 'other'], self.source.updates['domains'])
def retry(): entity = util.load_source() if not isinstance(entity, Webmentions): error(f'Unexpected key kind {entity.key.kind()}') source = entity.source.get() # run OPD to pick up any new SyndicatedPosts. note that we don't refetch # their h-feed, so if they've added a syndication URL since we last crawled, # retry won't make us pick it up. background in #524. if entity.key.kind() == 'Response': source = entity.source.get() for activity in [json_loads(a) for a in entity.activities_json]: originals, mentions = original_post_discovery.discover( source, activity, fetch_hfeed=False, include_redirect_sources=False) entity.unsent += original_post_discovery.targets_for_response( json_loads(entity.response_json), originals=originals, mentions=mentions) entity.restart() flash('Retrying. Refresh in a minute to see the results!') return redirect(request.values.get('redirect_to') or source.bridgy_url())
def post(self): entity = self.load_source(param='key') if not isinstance(entity, Webmentions): self.abort(400, 'Unexpected key kind %s', entity.key.kind()) # run OPD to pick up any new SyndicatedPosts. note that we don't refetch # their h-feed, so if they've added a syndication URL since we last crawled, # retry won't make us pick it up. background in #524. if entity.key.kind() == 'Response': source = entity.source.get() for activity in [json_loads(a) for a in entity.activities_json]: originals, mentions = original_post_discovery.discover( source, activity, fetch_hfeed=False, include_redirect_sources=False) entity.unsent += original_post_discovery.targets_for_response( json_loads(entity.response_json), originals=originals, mentions=mentions) entity.restart() self.messages.add('Retrying. Refresh in a minute to see the results!') self.redirect( self.request.get('redirect_to') or entity.source.get().bridgy_url(self))
def get_item(self, post_id, user_id, reaction_id): post = self.get_post(post_id) reaction = self.source.gr_source.get_reaction( self.source.key_id(), post_id, user_id, reaction_id, activity=post) if post: originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) self.merge_urls(reaction, 'object', originals) return reaction
def get_item(self, event_id, user_id): event = self.source.gr_source.get_event(event_id) rsvp = self.source.gr_source.get_rsvp( self.source.key_id(), event_id, user_id, event=event) if event: originals, mentions = original_post_discovery.discover( self.source, event, fetch_hfeed=False) self.merge_urls(rsvp, 'inReplyTo', originals) return rsvp
def get_item(self, post_id, user_id): post = self.get_post(post_id, fetch_likes=True) like = self.source.get_like(self.source.key_id(), post_id, user_id, activity=post) if post: originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) self.merge_urls(like, 'object', originals) return like
def get_item(self, post_id, id): post = self.get_post(post_id, fetch_replies=True) cmt = self.source.get_comment( id, activity_id=post_id, activity_author_id=self.source.key_id(), activity=post) if post: originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) self.merge_urls(cmt, 'inReplyTo', originals) self.merge_urls(cmt, 'tags', mentions, object_type='mention') return cmt
def get_item(self, id): posts = self.source.get_activities(activity_id=id, user_id=self.source.key_id()) if not posts: return None post = posts[0] originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) obj = post['object'] obj['upstreamDuplicates'] = list( set(util.get_list(obj, 'upstreamDuplicates')) | originals) self.merge_urls(obj, 'tags', mentions, object_type='mention') return obj
def get_item(self, post_id, share_id): post = self.get_post(post_id, fetch_shares=True) repost = self.source.gr_source.get_share( self.source.key_id(), post_id, share_id, activity=post) # webmention receivers don't want to see their own post in their # comments, so remove attachments before rendering. if repost and 'attachments' in repost: del repost['attachments'] if post: originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) self.merge_urls(repost, 'object', originals) return repost
def get_item(self, post_id, id): fetch_replies = not self.source.gr_source.OPTIMIZED_COMMENTS post = self.get_post(post_id, fetch_replies=fetch_replies) cmt = self.source.get_comment(id, activity_id=post_id, activity_author_id=self.source.key_id(), activity=post if fetch_replies else None) if post: originals, mentions = original_post_discovery.discover( self.source, post, fetch_hfeed=False) self.merge_urls(cmt, 'inReplyTo', originals) self.merge_urls(cmt, 'tags', mentions, object_type='mention') return cmt
def post(self): entity = ndb.Key(urlsafe=util.get_required_param(self, 'key')).get() if not entity: self.abort(400, 'key not found') # start all target URLs over if entity.status == 'complete': entity.status = 'new' targets = set(entity.unsent + entity.sent + entity.skipped + entity.error + entity.failed) entity.sent = entity.skipped = entity.error = entity.failed = [] # run OPD to pick up any new SyndicatedPosts. note that we don't refetch # their h-feed, so if they've added a syndication URL since we last crawled, # retry won't make us pick it up. background in #524. if entity.key.kind() == 'Response': source = entity.source.get() for activity in [json.loads(a) for a in entity.activities_json]: originals, mentions = original_post_discovery.discover( source, activity, fetch_hfeed=False, include_redirect_sources=False) targets |= original_post_discovery.targets_for_response( json.loads(entity.response_json), originals=originals, mentions=mentions) entity.unsent = targets entity.put() # clear any cached webmention endpoints memcache.delete_multi(util.webmention_endpoint_cache_key(url) for url in targets) if entity.key.kind() == 'Response': util.add_propagate_task(entity) elif entity.key.kind() == 'BlogPost': util.add_propagate_blogpost_task(entity) else: self.abort(400, 'Unexpected key kind %s', entity.key.kind()) self.messages.add('Retrying. Refresh in a minute to see the results!') self.redirect(self.request.get('redirect_to').encode('utf-8') or entity.source.get().bridgy_url(self))
def receive(self, email): addr = self.request.path.split('/')[-1] message_id = email.original.get('message-id').strip('<>') sender = getattr(email, 'sender', None) to = getattr(email, 'to', None) cc = getattr(email, 'cc', None) subject = getattr(email, 'subject', None) logging.info('Received %s from %s to %s (%s) cc %s: %s', message_id, sender, to, addr, cc, subject) addr = self.request.path.split('/')[-1] user = addr.split('@')[0] source = FacebookEmailAccount.query( FacebookEmailAccount.email_user == user).get() logging.info('Source for %s is %s', user, source) util.email_me(subject='New email from %s: %s' % (sender, subject), body='Source: %s' % (source.bridgy_url(self) if source else None)) htmls = list(body.decode() for _, body in email.bodies('text/html')) fbe = FacebookEmail.get_or_insert( message_id, source=source.key if source else None, htmls=htmls) logging.info('FacebookEmail created %s: %s', fbe.created, fbe.key.urlsafe()) if not source: self.response.status_code = 404 self.response.write( 'No Facebook email user found with address %s' % addr) return for html in htmls: obj = gr_facebook.Facebook.email_to_object(html) if obj: break else: self.response.status_code = 400 self.response.write('No HTML body could be parsed') return logging.info('Converted to AS1: %s', json.dumps(obj, indent=2)) base_obj = source.gr_source.base_object(obj) # note that this ignores the id query param (the post's user id) and uses # the source object's user id instead. base_obj['url'] = source.canonicalize_url(base_obj['url']) # also note that base_obj['id'] is not a tag URI, it's the raw Facebook post # id, eg '104790764108207'. we don't use it from activities_json much, # though, just in PropagateResponse.source_url(), which handles this fine. original_post_discovery.refetch(source) targets, mentions = original_post_discovery.discover(source, base_obj, fetch_hfeed=False) logging.info('Got targets %s mentions %s', targets, mentions) resp = Response(id=obj['id'], source=source.key, type=Response.get_type(obj), response_json=json.dumps(obj), activities_json=[json.dumps(base_obj)], unsent=targets) resp.get_or_save(source, restart=True) fbe.response = resp.key fbe.put()
def backfeed(self, source, responses=None, activities=None):
  """Processes responses and activities and generates propagate tasks.

  Stores property names and values to update in source.updates.

  Args:
    source: Source
    responses: dict mapping AS response id to AS object
    activities: dict mapping AS activity id to AS object
  """
  if responses is None:
    responses = {}
  if activities is None:
    activities = {}

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info('Found %d public activities: %s', len(public), public.keys())
  logging.info('Found %d private activities: %s', len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls(
    [a.get('published') for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates['last_public_post'] = \
        util.as_utc(util.parse_iso8601(max_published))

  source.updates['recent_private_posts'] = \
    len([a for a in private.values()
         if a.get('published', util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get('object') or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if obj.get('author', {}).get('id') != user_id and activity.get('verb') != 'share':
      for tag in obj.get('tags', []):
        urls = tag.get('urls')
        if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity['mentions'].update(u.get('value') for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get('attachments', []):
      if (att.get('objectType') in ('note', 'article')
          and att.get('author', {}).get('id') == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get('replies', {}).get('items', [])
    tags = obj.get('tags', [])
    likes = [t for t in tags if Response.get_type(t) == 'like']
    reactions = [t for t in tags if Response.get_type(t) == 'react']
    reposts = [t for t in tags if Response.get_type(t) == 'repost']
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get('id')
      if not id:
        logging.error('Skipping response without id: %s',
                      json_dumps(resp, indent=2))
        continue

      if source.is_blocked(resp):
        logging.info('Skipping response by blocked user: %s',
                     json_dumps(resp.get('author') or resp.get('actor'),
                                indent=2))
        continue

      resp.setdefault('activities', []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning('Got two different versions of same response!\n%s\n%s',
                          existing, resp)
        resp['activities'].extend(existing.get('activities', []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json_loads(source.seen_responses_cache_json):
      id = seen['id']
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  source.blocked_ids = None

  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop('activities', [])
    if not activities and resp_type == 'post':
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if 'originals' not in activity or 'mentions' not in activity:
        activity['originals'], activity['mentions'] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity['originals'], mentions=activity['mentions'])
      if targets:
        logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                     len(targets), ' '.join(targets))
        # new response to propagate! load block list if we haven't already
        if source.blocked_ids is None:
          source.load_blocklist()

      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json_dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json_dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get('originals', []))
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json_dumps(urls_to_activity)
    resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

  # update cache
  if pruned_responses:
    source.updates['seen_responses_cache_json'] = json_dumps(
      pruned_responses + unchanged_responses)
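# backfeed() above refuses to store webmention target URLs longer than the
# datastore string limit, recording a truncated marker in `failed` instead of
# `unsent`. A tiny self-contained sketch of that guard; the 500-character limit
# is an assumed example value, Bridgy defines its own _MAX_STRING_LENGTH.
MAX_LEN = 500

def split_targets(targets):
  unsent, too_long = [], set()
  for t in targets:
    if len(t) <= MAX_LEN:
      unsent.append(t)
    else:
      # keep a recognizable truncated marker so the failure stays visible
      too_long.add(t[:MAX_LEN - 4] + '...')
  return unsent, too_long

# split_targets(['http://ok.example/post', 'http://spam.example/' + 'x' * 600])
# -> one unsent target, one truncated entry in too_long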
class Poll(webapp2.RequestHandler):
  """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """

  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s', self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()

  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    #   * posts by the user
    #   * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50, etag=source.last_activities_etag,
        min_id=source.last_activity_id, cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}

    except Exception, e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({'poll_status': 'error', 'rate_limited': True})
        return
      elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps({
      k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids
    })

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls(
      [a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article')
            and att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s',
                        json.dumps(resp, indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                       len(targets), ' '.join(targets))

        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.warning('Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)

    source.updates.update({
      'last_polled': source.last_poll_attempt,
      'poll_status': 'ok',
    })
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException, e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout))
              or util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
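# Poll.post() above re-enqueues the next poll with a countdown jittered to
# +/- 20% of the source's poll period to avoid thundering herds. A tiny
# self-contained sketch of that calculation; the 30-minute period is an
# assumed example value, each Source defines its own poll_period().
import random
from datetime import timedelta

poll_period = timedelta(minutes=30)
countdown = poll_period.total_seconds() * random.uniform(.8, 1.2)
# countdown falls somewhere in [1440, 2160] seconds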