def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discover. It was split
  out from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  relationship = SyndicatedPost.query_by_syndication(source, syndication_url)
  if not relationship and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    relationship = results.get(syndication_url, None)

  if not relationship:
    # No relationship was found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication_url, None)
    return activity

  logging.debug('posse post discovery found relationship %s -> %s',
                syndication_url, relationship.original)

  if relationship.original:
    obj = activity.get('object') or activity
    obj.setdefault('upstreamDuplicates', []).append(relationship.original)

  return activity
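
# A rough sketch of the expected effect (urls and values below are
# hypothetical):
#
#   activity = {'object': {'url': 'https://silo/post/123'}}
#   activity = _posse_post_discovery(
#       source, activity, 'https://author.example/', 'https://silo/post/123',
#       fetch_hfeed=True)
#   # if a stored (or newly discovered) relationship maps the syndication
#   # url to an original, the object now carries it:
#   activity['object'].get('upstreamDuplicates')
#   # => ['https://author.example/2014/01/original-post']
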
  def test_get_or_insert_by_syndication_replace(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/newly-discovered', rs[0].original)
    self.assertEquals('http://silo/no-original', rs[0].syndication)
  def test_get_or_insert_by_syndication_do_not_replace(self):
    """Make sure we don't replace original=something with
    original=something else (in practice, that would mean another task
    is running discovery concurrently and found a different url)
    """
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/post/url', r.original)

    # make sure it's unchanged in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/post/url', rs[0].original)
    self.assertEquals('http://silo/post/url', rs[0].syndication)
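
# The two tests above pin down the get-or-insert semantics: a blank
# (original=None) row may be filled in once a real original is discovered,
# but an already-discovered original is never overwritten. A minimal sketch
# of an implementation with those semantics, assuming a plain NDB model with
# 'syndication' and 'original' string properties (illustration only, not the
# production code):

from google.appengine.ext import ndb

@ndb.transactional
def _get_or_insert_by_syndication_sketch(source, syndication, original):
  """Look up a SyndicatedPost by syndication url, creating or updating it."""
  entity = SyndicatedPost.query(
      SyndicatedPost.syndication == syndication,
      ancestor=source.key).get()
  if entity:
    # fill in a blank original, but never replace one that was already
    # discovered (e.g., by a concurrent discovery task)
    if original and not entity.original:
      entity.original = original
      entity.put()
    return entity
  entity = SyndicatedPost(parent=source.key, syndication=syndication,
                          original=original)
  entity.put()
  return entity
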
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: models.Source subclass
    permalink: url of the unprocessed post
    refetch_blanks: boolean, whether to ignore (and thus refetch) blank
      preexisting SyndicatedPosts
    preexisting: dict mapping original url to SyndicatedPost

  Returns:
    a dict mapping syndication url to newly discovered models.SyndicatedPost
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except Exception:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

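  # for reference, mf2py's to_dict() output is shaped roughly like
  # (urls here are hypothetical):
  #   {'items': [{'type': ['h-entry'],
  #               'properties': {'syndication': ['https://silo/post/123']}}],
  #    'rels': {'syndication': ['https://silo/post/123']}}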
  if parsed:
    relsynd = parsed.get('rels', {}).get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL (e.g., replace a facebook
    # username with the numeric id; see the sketch after this function)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source.
    # TODO: save future lookups by saving results for other sources too
    # (note: query the appropriate source subclass by author.domains,
    # rather than author.domain_urls)
    parsed_url = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed_url.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
          source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)

  return results
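
# For illustration of the canonicalization step in _process_entry: a source
# subclass might normalize syndication urls roughly like this. The username
# and numeric id below are hypothetical, and this is a sketch, not the real
# Facebook implementation.

import re

def _canonicalize_syndication_url_sketch(url, username='alice',
                                         user_id='314159'):
  """Normalize scheme and host, and swap a profile username for its
  numeric id so that equivalent urls compare equal."""
  url = url.replace('http://', 'https://')
  url = url.replace('https://www.facebook.com/', 'https://facebook.com/')
  return re.sub(r'facebook\.com/%s/' % re.escape(username),
                'facebook.com/%s/' % user_id, url)

# e.g. _canonicalize_syndication_url_sketch(
#     'http://www.facebook.com/alice/posts/123')
# => 'https://facebook.com/314159/posts/123'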