def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery.

  Split out from discover() so that it can be done inside of a
  transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  rel = SyndicatedPost.query_by_syndication(source, syndication_url)
  if not rel and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    rel = _process_author(source, author_url).get(syndication_url)

  if not rel:
    # No relationship was found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication_url, None)
  else:
    logging.debug('posse post discovery found relationship %s -> %s',
                  syndication_url, rel.original)
    if rel.original:
      # record the original on the activity so downstream code can
      # send webmentions to it
      obj = activity.get('object') or activity
      obj.setdefault('upstreamDuplicates', []).append(rel.original)

  return activity
def test_get_or_insert_by_syndication_replace(self):
  """Make sure we replace original=None with original=something
  when it is discovered"""
  r = SyndicatedPost.get_or_insert_by_syndication_url(
      self.source, 'http://silo/no-original',
      'http://original/newly-discovered')
  self.assertIsNotNone(r)
  # assertEqual rather than the deprecated assertEquals alias
  self.assertEqual('http://original/newly-discovered', r.original)

  # make sure it's in NDB
  rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      ancestor=self.source.key
  ).fetch()
  self.assertEqual(1, len(rs))
  self.assertEqual('http://original/newly-discovered', rs[0].original)
  self.assertEqual('http://silo/no-original', rs[0].syndication)
def test_get_or_insert_by_syndication_do_not_replace(self):
  """Make sure we don't replace original=something with
  original=something else (in practice, that would mean another task is
  running discovery concurrently and found a different url)
  """
  r = SyndicatedPost.get_or_insert_by_syndication_url(
      self.source, 'http://silo/post/url',
      'http://original/different/url')
  self.assertIsNotNone(r)
  # assertEqual rather than the deprecated assertEquals alias
  self.assertEqual('http://original/post/url', r.original)

  # make sure it's unchanged in NDB
  rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      ancestor=self.source.key
  ).fetch()
  self.assertEqual(1, len(rs))
  self.assertEqual('http://original/post/url', rs[0].original)
  self.assertEqual('http://silo/post/url', rs[0].syndication)
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source: models.Source subclass
    permalink: url of the unprocessed post
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Return:
    a dict from syndicated url to new models.SyndicatedPosts
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except Exception:
    # narrow from BaseException so KeyboardInterrupt/SystemExit propagate
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    # guard against a missing 'rels' key instead of assuming it exists
    relsynd = parsed.get('rels', {}).get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed.get('items', [])
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    # distinct name so we don't clobber the mf2py parse result above
    parsed_url = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed_url.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
          source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)
  return results