Example no. 1
    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Only searches for root domain web site URLs! Skips URLs with paths; they
    tend to generate false positive results in G+'s search. Not sure why yet.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns:
      sequence of ActivityStreams activity dicts
    """
        urls = [
            '"%s"' % util.fragmentless(url) for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url))
            and urlparse.urlparse(url).path in ('', '/')
        ][:models.MAX_AUTHOR_URLS]

        if urls:
            return self.get_activities(search_query=' OR '.join(urls),
                                       group_id=gr_source.SEARCH,
                                       etag=self.last_activities_etag,
                                       fetch_replies=False,
                                       fetch_likes=False,
                                       fetch_shares=False,
                                       count=50)

        return []
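
A minimal standalone sketch of the root-domain filtering and quoted OR query this method builds. `fragmentless` and the blacklist are illustrative stand-ins for Bridgy's util helpers, and Python 3's urllib.parse replaces the Python 2 urlparse module used above:

from urllib.parse import urlparse

BLACKLIST = {'t.co'}  # illustrative only; the real list lives in util

def fragmentless(url):
    # stand-in for util.fragmentless: drop any #fragment
    return url.split('#')[0]

def root_domain_query(domain_urls, max_urls=4):
    # keep only root-domain URLs (empty or '/' path), quote each one,
    # and join them into a single OR query
    urls = ['"%s"' % fragmentless(u) for u in domain_urls
            if urlparse(u).hostname not in BLACKLIST
            and urlparse(u).path in ('', '/')][:max_urls]
    return ' OR '.join(urls)

print(root_domain_query(['http://snarfed.org/', 'http://snarfed.org/about']))
# => "http://snarfed.org/"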
Example no. 2
  def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: oauth_dropins.models.BaseAuth
      user_url: string, optional URL passed in when authorizing

    Returns: ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    urls = []
    for url in util.trim_nulls(util.uniquify(
        [user_url] + [actor.get('url')] +
        [u.get('value') for u in actor.get('urls', [])])):
      domain = util.domain_from_link(url)
      if domain and not util.in_webmention_blacklist(domain.lower()):
        urls.append(url)

    urls = util.dedupe_urls(urls)
    domains = [util.domain_from_link(url).lower() for url in urls]
    return urls, domains
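
The flattening above can be sketched without the util helpers. The actor shape ('url' plus 'urls' entries with 'value' keys) follows ActivityStreams 1; the blacklist here is a one-item stand-in:

from urllib.parse import urlparse

def urls_and_domains(actor, user_url=None, blacklist=frozenset(['t.co'])):
    candidates = ([user_url, actor.get('url')] +
                  [u.get('value') for u in actor.get('urls', [])])
    urls, seen = [], set()
    for url in candidates:
        if not url or url in seen:
            continue  # trim nulls and uniquify, like util.trim_nulls/uniquify
        seen.add(url)
        domain = urlparse(url).hostname
        if domain and domain.lower() not in blacklist:
            urls.append(url)
    return urls, [urlparse(u).hostname.lower() for u in urls]

actor = {'url': 'https://snarfed.org/',
         'urls': [{'value': 'https://t.co/shortlink'}]}
print(urls_and_domains(actor))
# => (['https://snarfed.org/'], ['snarfed.org'])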
Example no. 3
  def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns: sequence of ActivityStreams activity dicts
    """
    query = ' OR '.join(
      '"%s"' % util.fragmentless(url) for url in self.domain_urls
      if not util.in_webmention_blacklist(util.domain_from_link(url)))
    return self.get_activities(
      search_query=query, group_id=gr_source.SEARCH, etag=self.last_activities_etag,
      fetch_replies=False, fetch_likes=False, fetch_shares=False, count=50)
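
Note that unlike example no. 1, this version neither checks the URL path nor caps the number of URLs, and it never returns early: ' OR '.join over an empty generator yields an empty string, so if every domain were blacklisted the search would run with an empty query.

urls = []  # e.g. every domain URL was filtered out
query = ' OR '.join('"%s"' % u for u in urls)
print(repr(query))  # => ''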
Example no. 4
    def verify(self, force=False):
        """Checks that this source is ready to be used.

    For blog and listen sources, this fetches their front page HTML and
    discovers their webmention endpoint. For publish sources, this checks that
    they have a domain.

    May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

    Args:
      force: if True, fully verifies (e.g. re-fetches the blog's HTML and
        performs webmention discovery) even if we already think this source is
        verified.
    """
        author_urls = [
            u for u, d in zip(self.get_author_urls(), self.domains)
            if not util.in_webmention_blacklist(d)
        ]
        if ((self.verified() and not force) or self.status == 'disabled'
                or not self.features or not author_urls):
            return

        author_url = author_urls[0]
        logging.info('Attempting to discover webmention endpoint on %s',
                     author_url)
        mention = send.WebmentionSend('https://brid.gy/', author_url)
        mention.requests_kwargs = {
            'timeout': HTTP_TIMEOUT,
            'headers': util.REQUEST_HEADERS
        }
        try:
            mention._discoverEndpoint()
        except BaseException:
            logging.info('Error discovering webmention endpoint',
                         exc_info=True)
            mention.error = {'code': 'EXCEPTION'}

        self._fetched_html = getattr(mention, 'html', None)
        error = getattr(mention, 'error', None)
        endpoint = getattr(mention, 'receiver_endpoint', None)
        if error or not endpoint:
            logging.info("No webmention endpoint found: %s %r", error,
                         endpoint)
            self.webmention_endpoint = None
        else:
            logging.info("Discovered webmention endpoint %s", endpoint)
            self.webmention_endpoint = endpoint

        self.put()
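
_discoverEndpoint here is a private method of webmention-tools' WebmentionSend. A rough sketch of what endpoint discovery involves, per https://www.w3.org/TR/webmention/, is below: check the HTTP Link header, then fall back to a deliberately simplistic scan of the HTML. Relative-URL resolution and href-before-rel attribute ordering are elided:

import re
import requests

def discover_webmention_endpoint(author_url, timeout=15):
    # simplified sketch, not the webmention-tools implementation
    resp = requests.get(author_url, timeout=timeout)
    # first, the HTTP Link header: Link: <https://...>; rel="webmention"
    for link in requests.utils.parse_header_links(resp.headers.get('Link', '')):
        if 'webmention' in link.get('rel', '').split():
            return link['url']
    # then <link> or <a> tags; only handles rel appearing before href
    match = re.search(
        r'<(?:link|a)\b[^>]*rel=["\']?(?:[^"\'>]*\s)?webmention[^>]*'
        r'href=["\']([^"\']+)', resp.text)
    return match.group(1) if match else None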
Example no. 5
    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip the scheme from
    URLs, i.e. search for example.com instead of http://example.com/, and that
    also returns false positives, so we check that the returned tweets actually
    have matching links. https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
        urls = set(
            util.fragmentless(url) for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url)))
        if not urls:
            return []

        query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                            for url in urls)
        candidates = self.get_activities(search_query=query,
                                         group_id=gr_source.SEARCH,
                                         etag=self.last_activities_etag,
                                         fetch_replies=False,
                                         fetch_likes=False,
                                         fetch_shares=False,
                                         count=50)

        # filter out retweets and search false positives that don't actually link to us
        results = []
        for candidate in candidates:
            if candidate.get('verb') == 'share':
                continue
            obj = candidate['object']
            tags = obj.get('tags', [])
            atts = obj.get('attachments', [])
            for url in urls:
                if (url in obj.get('content', '') or any(
                        t.get('url', '').startswith(url)
                        for t in tags + atts)):
                    results.append(candidate)
                    break

        return results
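
A condensed sketch of the two tricks this method relies on: a hypothetical schemeless stand-in (the real helper is in Bridgy's util) to build the bare-domain query, and the content/tag check that weeds out search false positives:

import re

def schemeless(url, slashes=True):
    # stand-in for util.schemeless: strip the scheme, and optionally
    # the leading slashes too
    url = re.sub(r'^[a-zA-Z][a-zA-Z0-9+.-]*:', '', url)
    return url if slashes else url.lstrip('/')

urls = {'http://example.com'}
print(' OR '.join('"%s"' % schemeless(u, slashes=False) for u in urls))
# => "example.com"

tweet = {'object': {'content': 'new post: http://example.com/post',
                    'tags': [{'url': 'http://example.com/post'}]}}
obj = tweet['object']
linked = any(url in obj.get('content', '') or
             any(t.get('url', '').startswith(url)
                 for t in obj.get('tags', []) + obj.get('attachments', []))
             for url in urls)
print(linked)  # => True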
Example no. 6
    def test_in_webmention_blacklist(self):
        for bad in 't.co', 'x.t.co', 'x.y.t.co', 'abc.onion':
            self.assertTrue(util.in_webmention_blacklist(bad), bad)

        for good in 'snarfed.org', 'www.snarfed.org', 't.co.com':
            self.assertFalse(util.in_webmention_blacklist(good), good)
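
These assertions pin down the matching rule: an exact blacklist entry, any subdomain of one, or a blacklisted TLD is rejected, while a domain that merely ends in an entry's text without a dot boundary (t.co.com) is not. A hypothetical implementation consistent with the tests, with the real list abbreviated to two entries:

BLACKLIST = {'t.co'}          # abbreviated; the real list lives in util
BLACKLISTED_TLDS = {'onion'}  # ditto

def in_webmention_blacklist(domain):
    return (domain.split('.')[-1] in BLACKLISTED_TLDS or
            any(domain == entry or domain.endswith('.' + entry)
                for entry in BLACKLIST))

assert in_webmention_blacklist('x.y.t.co')
assert not in_webmention_blacklist('t.co.com')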