def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def get_webmention_targets(source, activity):
  """Returns a set of string target URLs to attempt to send webmentions to.

  Side effect: runs the original post discovery algorithm on the activity and
  adds the resulting URLs to the activity as tags, in place.

  Args:
    source: models.Source subclass
    activity: activity dict
  """
  original_post_discovery.discover(source, activity)

  obj = activity.get('object') or activity
  urls = []

  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, domain, send = util.get_webmention_target(url)
      tag['url'] = url
      if send:
        urls.append(url)

  for url in obj.get('upstreamDuplicates', []):
    url, domain, send = util.get_webmention_target(url)
    if send:
      urls.append(url)

  return util.dedupe_urls(urls)
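
# Illustration of the activity shape get_webmention_targets() above reads after
# original post discovery has added its tags in place. All URLs are made up.
example_activity = {
  'object': {
    'tags': [
      # candidate target: an 'article' tag with a url
      {'objectType': 'article', 'url': 'http://author.example/original-post'},
      # ignored: not an article
      {'objectType': 'person', 'url': 'http://friend.example/'},
    ],
    # also candidates
    'upstreamDuplicates': ['http://author.example/syndicated-copy'],
  },
}
# Each candidate is normalized and filtered through util.get_webmention_target(),
# then the surviving URLs are deduped with util.dedupe_urls().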
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
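
# Illustration only: the pieces of data new() above stitches together, with
# made-up values. site_info is whatever get_site_info() returns (assumed here
# to include an 'URL' key, as the code expects); user_json is the stored OAuth
# user record with an 'avatar_URL' field.
example_site_info = {'ID': 123, 'URL': 'http://blog.example.com/'}
example_user_json = '{"avatar_URL": "http://avatars.example/me.jpg"}'
# The resulting WordPress entity would be keyed on 'blog.example.com', with
# url='http://blog.example.com/' and picture set to the avatar_URL value.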
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json_loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
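
# A minimal, hypothetical Superfeedr notification that handle_feed() above would
# accept: one item with a permalink and HTML content containing one outbound
# link. Only the fields the code reads are shown; the values are made up.
import json

example_feed = json.dumps({
  'items': [{
    'permalinkUrl': 'http://blog.example.com/post/123',
    'id': 'http://blog.example.com/post/123',
    'content': '<p>Good read: <a href="http://other.example.org/article">this</a></p>',
  }],
})
# handle_feed(example_feed, source) would create a BlogPost keyed on the
# permalink with http://other.example.org/article in unsent, assuming
# other.example.org isn't one of the source's own domains.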
def test_dedupe_urls(self):
  self.assertEquals([], util.dedupe_urls([]))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo']))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo', 'http://foo']))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo', 'http://foo/']))
  self.assertEquals(['https://foo/'], util.dedupe_urls([
    'https://foo', 'http://foo', 'https://foo/', 'http://foo/']))
  self.assertEquals(['https://foo/'], util.dedupe_urls(['http://foo', 'https://foo/']))
  self.assertEquals(['http://foo/bar', 'http://foo/bar/'],
                    util.dedupe_urls(['http://foo/bar', 'http://foo/bar/']))
  self.assertEquals(['http://foo/'],
                    util.dedupe_urls(['http://foo', 'http://FOO/', 'http://FoO/']))
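
# The assertions above pin down dedupe_urls()'s contract: bare domains gain a
# trailing slash, host case and http vs https don't count as differences
# (https wins), but a trailing slash on a non-empty path does. Below is a
# minimal sketch that satisfies those cases; it is not the real oauth-dropins
# webutil implementation, which handles more normalization.
from urllib.parse import urlparse, urlunparse

def dedupe_urls_sketch(urls):
  """Dedupes URLs that differ only by scheme, host case, or a bare trailing
  slash, preferring https and preserving the original order."""
  result = []
  for url in urls:
    if not url:
      continue
    parsed = urlparse(url)
    # lower-case the host; give bare domains a trailing slash
    url = urlunparse(parsed._replace(netloc=parsed.netloc.lower(),
                                     path=parsed.path or '/'))
    for i, existing in enumerate(result):
      if urlparse(existing)._replace(scheme='') == urlparse(url)._replace(scheme=''):
        if urlparse(url).scheme == 'https':
          result[i] = url  # upgrade the earlier http version to https
        break
    else:
      result.append(url)
  return result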
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
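
# The "collapse to root" step in the loop above is easy to miss, so here it is
# in isolation: if a resolved profile URL has a path and the site's root page
# already lists that URL as a rel=me link, prefer the bare root URL. me_urls
# stands in for the parsed root page's rels['me'] list; this is an
# illustrative sketch, not a drop-in replacement.
import re

def collapse_to_root(final, me_urls):
  match = re.match(r'^(https?://[^/]+)/.+', final)
  if match and final in me_urls:
    # e.g. https://example.com/users/alice -> https://example.com
    return match.group(1)
  return final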
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  memcache.delete_multi(util.webmention_endpoint_cache_key(url)
                        for url in self.unsent)

  @ndb.transactional
  def finish():
    self.put()
    self.add_task(transactional=True)

  finish()
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  memcache.delete_multi(util.webmention_endpoint_cache_key(url)
                        for url in self.unsent)

  @ndb.transactional
  def finish():
    self.put()
    self.add_task(transactional=True)

  finish()
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (
    user.get('actor')  # for Instagram; its user_json is IndieAuth
    or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  with util.webmention_endpoint_cache_lock:
    for url in self.unsent:
      util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

  # this datastore put and task add should be transactional, but Cloud Tasks
  # doesn't support that :(
  # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
  self.put()
  self.add_task()
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  with util.webmention_endpoint_cache_lock:
    for url in self.unsent:
      util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

  # this datastore put and task add should be transactional, but Cloud Tasks
  # doesn't support that :(
  # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
  # https://github.com/googleapis/python-tasks/issues/26
  #
  # The new "bundled services" bridge for the old App Engine APIs still
  # supports them, but only because that's literally on the old backends,
  # which seems like a dead end.
  # https://groups.google.com/g/google-appengine/c/22BKInlWty0/m/05ObNEdsAgAJ
  self.put()
  self.add_task()
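
# Quick illustration of what restart() above does to an entity's target lists
# (attribute values are made up):
#   before: unsent=[], sent=['http://a/'], error=['http://b/'], skipped=['http://a/']
#   after:  unsent=['http://a/', 'http://b/'], sent=error=failed=skipped=[]
# dedupe_urls() collapses the duplicate http://a/, and the cached webmention
# endpoint for each retried URL is evicted so it gets rediscovered on the next
# propagate attempt.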
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  logging.debug('discovering original posts for: %s',
                activity.get('url') or activity.get('id'))

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.request_headers(source=source))

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logging.debug('no %s syndication url, cannot process h-entries',
                  source.SHORT_NAME)

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
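
# Worked example of the resolve() helper above, with include_redirect_sources
# set to True and made-up URLs: if http://t.example/abc redirects to
# https://author.example/post and is sendable, then
#   resolve({'http://t.example/abc'})
# returns {'https://author.example/post', 'http://t.example/abc'}, i.e. both
# the final destination and the original redirecting URL stay in the set.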
def post(self, source_short_name):
  logging.info('Params: %s', list(self.request.params.items()))
  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urllib.parse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error(
      'Home page webmentions are not currently supported.', status=202)
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      return self.error(
        '%s webmentions are not supported for URL path: %s' %
        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe().decode())

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, resp.url), data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, report=False)
    elif code or body:
      return self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json_dumps(self.entity.published))
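
# Hypothetical webmention that the handler above processes, e.g. for a
# registered WordPress source. The endpoint path and URLs are illustrative.
import requests

requests.post('https://brid.gy/webmention/wordpress', data={
  'source': 'https://commenter.example/reply#comment-3',  # fragment is stripped
  'target': 'https://blog.example.com/2020/01/post/?utm_source=feed',
})
# The handler defrags source, follows redirects on target and strips utm_*
# params, looks up the enabled source whose domains include blog.example.com,
# then publishes the mention as a comment via source.create_comment().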
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  label = activity.get('url') or activity.get('id')
  logger.debug(f'discovering original posts for: {label}')

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains,
    include_redirect_sources=include_redirect_sources,
    include_reserved_hosts=DEBUG,
    max_redirect_fetches=MAX_ORIGINAL_CANDIDATES,
    headers=util.request_headers(source=source))

  # only include mentions of the author themselves.
  # (mostly just for Mastodon; other silos' domains are all in the blocklist, so
  # their mention URLs get dropped later anyway.)
  # (these are originally added in Source._inject_user_urls() and in poll step 2.)
  obj = activity.get('object', {})
  other_user_mentions = set(
    t.get('url') for t in obj.get('tags', [])
    if t.get('objectType') == 'person' and t.get('url') not in source.domain_urls)
  originals -= other_user_mentions
  mentions -= other_user_mentions

  # original posts are only from the author themselves
  obj_author = obj.get('author', {})
  activity_author = activity.get('actor', {})
  author_id = obj_author.get('id') or activity_author.get('id')
  author_username = obj_author.get('username') or activity_author.get('username')
  if (author_id and author_id != source.user_tag_id()
      and author_username != source.key.id()):
    logger.info(f"Demoting original post links because user {source.user_tag_id()} doesn't match author id {author_id} username {author_username}")
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logger.debug(f"running original post discovery on attachment: {att.get('id')}")
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logger.debug(f'original post discovery found originals for attachment, {att_origs}')
      mentions.update(att_origs)

  if len(originals) > MAX_ORIGINAL_CANDIDATES:
    logging.info(f'{len(originals)} originals, pruning down to {MAX_ORIGINAL_CANDIDATES}')
    originals = sorted(originals)[:MAX_ORIGINAL_CANDIDATES]
  if len(mentions) > MAX_MENTION_CANDIDATES:
    logging.info(f'{len(mentions)} mentions, pruning down to {MAX_MENTION_CANDIDATES}')
    mentions = sorted(mentions)[:MAX_MENTION_CANDIDATES]

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, domain, send = util.get_webmention_target(url)
      if send and domain != source.gr_source.DOMAIN:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logger.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place.
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logger.debug(f'no {source.SHORT_NAME} syndication url, cannot process h-entries')

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())
  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1].get('rels', {}).get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urlparse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLACKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data.get('items', []))
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  SyndicatedPosts but will not do posse-post-discovery to find new ones.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  if not source.updates:
    source.updates = {}

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.USER_AGENT_HEADER)

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return originals, mentions

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_syndication_url(
      util.follow_redirects(syndication_url).url)
    originals.update(_posse_post_discovery(
      source, activity, syndication_url, fetch_hfeed))
    originals = set(util.dedupe_urls(originals))
  else:
    logging.debug('no syndication url, cannot process h-entries')

  return originals, mentions
def dispatch_request(self, site):
  logger.info(f'Params: {list(request.values.items())}')
  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
  self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    self.error(f'Could not parse target URL {self.target_url}')

  # look up source by domain
  source_cls = models.sources[site]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    self.error(
      f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    msg = 'Home page webmentions are not currently supported.'
    logger.info(msg)
    return {'error': msg}, 202
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
      logger.info(msg)
      return {'error': msg}, 202

  # create BlogWebmention entity
  id = f'{self.source_url} {self.target_url}'
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    return self.entity.published
  logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    self.error(f'Could not find target URL {self.target_url} in source page {resp.url}',
               data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = f'http://{domain}/'

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = get_first(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = get_first(author_props, 'name')
      author_url = get_first(author_props, 'url')

  # if present, u-url overrides source url
  u_url = get_first(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = f'Error: {code}: {e}; {body}'
    if code == '401':
      logger.warning(f'Disabling source due to: {e}', exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      self.error(msg, status=502, report=False)
    elif code or body:
      self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  return self.entity.published