Example #1
# imports assumed from the snippet's surrounding module (this project serves
# its handlers with Flask, and util is oauth_dropins.webutil.util)
import urllib.parse

from flask import request
from oauth_dropins.webutil import util

def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string, dict, or list. Dicts and lists are unwrapped
    recursively.

    Strings that aren't wrapped URLs are left unchanged.

    Args:
      val: string, dict, or list

    Returns: unwrapped string, dict, or list
    """
    if isinstance(val, dict):
        return {k: redirect_unwrap(v) for k, v in val.items()}

    elif isinstance(val, list):
        return [redirect_unwrap(v) for v in val]

    elif isinstance(val, str):
        prefix = urllib.parse.urljoin(request.host_url, '/r/')
        if val.startswith(prefix):
            return util.follow_redirects(val[len(prefix):]).url
        elif val.startswith(request.host_url):
            # a URL on our own host that isn't /r/-wrapped: treat its path as
            # a domain and resolve that domain's redirects
            domain = util.domain_from_link(
                urllib.parse.urlparse(val).path.strip('/'))
            return util.follow_redirects(domain).url

    return val
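
A minimal, self-contained sketch of the same recursive-unwrap pattern, with a hard-coded prefix standing in for the Flask request context (names and URLs here are illustrative, not from the project):

REDIRECT_PREFIX = 'https://example.com/r/'  # hypothetical wrapper prefix

def unwrap(val):
    """Recursively strip REDIRECT_PREFIX from any wrapped string URLs."""
    if isinstance(val, dict):
        return {k: unwrap(v) for k, v in val.items()}
    elif isinstance(val, list):
        return [unwrap(v) for v in val]
    elif isinstance(val, str) and val.startswith(REDIRECT_PREFIX):
        return val[len(REDIRECT_PREFIX):]
    return val

assert unwrap({'url': 'https://example.com/r/https://a.com/post'}) == \
    {'url': 'https://a.com/post'}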
Example #2
File: util.py Project: snarfed/bridgy
def follow_redirects(url, cache=True):
  """Wraps :func:`oauth_dropins.webutil.util.follow_redirects` with our settings.

  ...specifically memcache and REQUEST_HEADERS.
  """
  return util.follow_redirects(url, cache=memcache if cache else None,
                               headers=request_headers(url=url))
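
For background, the wrapped helper resolves a URL by issuing a HEAD request and following the redirect chain (the docstrings below note that kwargs go to requests.head()). A rough, hypothetical sketch of that behavior with plain requests, not the project's actual implementation:

import requests

def resolve(url, headers=None):
    # response.url is the final URL after following any redirects
    return requests.head(url, allow_redirects=True, headers=headers, timeout=15)

final = resolve('http://example.com/', headers={'User-Agent': 'demo'})
print(final.url)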
Example #3
def follow_redirects(url, cache=True):
  """Wraps granary.source.follow_redirects and injects our settings.

  ...specifically memcache and USER_AGENT_HEADER.
  """
  return util.follow_redirects(url, cache=memcache if cache else None,
                               headers=USER_AGENT_HEADER)
Example #4
def redirect_unwrap(val):
    """Removes our redirect wrapping from a URL, if it's there.

    val may be a string, dict, or list. Dicts and lists are unwrapped
    recursively.

    Strings that aren't wrapped URLs are left unchanged.
    """
    if isinstance(val, dict):
        return {k: redirect_unwrap(v) for k, v in val.items()}

    elif isinstance(val, list):
        return [redirect_unwrap(v) for v in val]

    elif isinstance(val, basestring):  # Python 2-era snippet: basestring, urlparse, App Engine memcache
        if val.startswith(REDIRECT_PREFIX):
            return val[len(REDIRECT_PREFIX):]
        elif val.startswith(appengine_config.HOST_URL):
            return util.follow_redirects(
                util.domain_from_link(urlparse.urlparse(val).path.strip('/')),
                cache=memcache).url

    return val
Example #5
File: util.py Project: snarfed/bridgy
def follow_redirects(url):
    """Wraps :func:`oauth_dropins.webutil.util.follow_redirects` with our headers."""
    return util.follow_redirects(url, headers=request_headers(url=url))
Example #6
    def original_post_discovery(activity,
                                domains=None,
                                cache=None,
                                include_redirect_sources=True,
                                **kwargs):
        """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery . It
    differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in the
    text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects. Passed
        to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
        obj = activity.get('object') or activity
        content = obj.get('content', '').strip()

        # find all candidate URLs
        tags = [
            t.get('url')
            for t in obj.get('attachments', []) + obj.get('tags', [])
            if t.get('objectType') in ('article', 'mention', None)
        ]
        candidates = tags + util.extract_links(content) + obj.get(
            'upstreamDuplicates', [])

        # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
        # references to canonical copies of a given (usually syndicated) post, of
        # the form (DOMAIN PATH). We consider them an explicit original post link.
        candidates += [
            match.expand(r'http://\1/\2')
            for match in Source._PERMASHORTCITATION_RE.finditer(content)
        ]

        candidates = set(filter(None, (
            util.clean_url(url) for url in candidates
            # heuristic: ellipsized URLs are probably incomplete, so omit them.
            if url and not url.endswith('...') and not url.endswith(u'…'))))

        # check for redirects and add their final URLs
        redirects = {}  # maps final URL to original URL for redirects
        for url in list(candidates):
            resolved = util.follow_redirects(url, cache=cache, **kwargs)
            if (resolved.url != url and resolved.headers.get(
                    'content-type', '').startswith('text/html')):
                redirects[resolved.url] = url
                candidates.add(resolved.url)

        # use domains to determine which URLs are original post links vs mentions
        originals = set()
        mentions = set()
        for url in util.dedupe_urls(candidates):
            if url in redirects.values():
                # this is a redirected original URL. postpone and handle it when we hit
                # its final URL so that we know the final domain.
                continue
            domain = util.domain_from_link(url)
            which = (originals if not domains or util.domain_or_parent_in(
                domain, domains) else mentions)
            which.add(url)
            redirected_from = redirects.get(url)
            if redirected_from and include_redirect_sources:
                which.add(redirected_from)

        logging.info(
            'Original post discovery found original posts %s, mentions %s',
            originals, mentions)
        return originals, mentions
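
The permashortcitation step above expands "(DOMAIN PATH)" references into candidate URLs via match.expand. A toy illustration with a hypothetical stand-in pattern (Source._PERMASHORTCITATION_RE itself is defined elsewhere in granary):

import re

PSC_RE = re.compile(r'\((\S+\.[^\s)]+)[ /]([^\s)]+)\)')  # hypothetical stand-in

content = 'great post! (example.com p/123)'
print([m.expand(r'http://\1/\2') for m in PSC_RE.finditer(content)])
# prints ['http://example.com/p/123']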
Example #7
File: source.py Project: snarfed/granary
  def original_post_discovery(activity, domains=None, cache=None,
                              include_redirect_sources=True, **kwargs):
    """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery . It
    differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in the
    text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects. Passed
        to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # find all candidate URLs
    tags = [t.get('url') for t in obj.get('attachments', []) + obj.get('tags', [])
            if t.get('objectType') in ('article', 'mention', None)]
    candidates = tags + util.extract_links(content) + obj.get('upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
    # references to canonical copies of a given (usually syndicated) post, of
    # the form (DOMAIN PATH). We consider them an explicit original post link.
    candidates += [match.expand(r'http://\1/\2') for match in
                   Source._PERMASHORTCITATION_RE.finditer(content)]

    candidates = set(util.dedupe_urls(
      util.clean_url(url) for url in candidates
      # heuristic: ellipsized URLs are probably incomplete, so omit them.
      if url and not url.endswith('...') and not url.endswith('…')))

    # check for redirects and add their final URLs
    redirects = {}  # maps final URL to original URL for redirects
    for url in candidates:
      resolved = util.follow_redirects(url, cache=cache, **kwargs)
      if (resolved.url != url and
          resolved.headers.get('content-type', '').startswith('text/html')):
        redirects[resolved.url] = url

    candidates.update(redirects.keys())

    # use domains to determine which URLs are original post links vs mentions
    originals = set()
    mentions = set()
    for url in util.dedupe_urls(candidates):
      if url in redirects.values():
        # this is a redirected original URL. postpone and handle it when we hit
        # its final URL so that we know the final domain.
        continue
      domain = util.domain_from_link(url)
      which = (originals if not domains or util.domain_or_parent_in(domain, domains)
               else mentions)
      which.add(url)
      redirected_from = redirects.get(url)
      if redirected_from and include_redirect_sources:
        which.add(redirected_from)

    logging.info('Original post discovery found original posts %s, mentions %s',
                 originals, mentions)
    return originals, mentions
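
To make the final originals-vs-mentions split concrete, here is a stripped-down sketch of that loop, with an exact-membership domain check standing in for util.domain_or_parent_in (illustrative only):

from urllib.parse import urlparse

def classify(urls, domains=None):
    originals, mentions = set(), set()
    for url in urls:
        domain = urlparse(url).netloc
        # with no domains filter, every URL counts as an original
        (originals if not domains or domain in domains else mentions).add(url)
    return originals, mentions

print(classify(['https://alice.example/post', 'https://other.com/x'],
               domains={'alice.example'}))
# prints ({'https://alice.example/post'}, {'https://other.com/x'})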