Example #1
def get_webmention_target(url, resolve=True, replace_test_domains=True):
  """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, i.e. the boolean in the returned
  tuple will be True! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns: (string url, string pretty domain, boolean) tuple. The boolean is
    True if we should send a webmention, False otherwise, e.g. if it's a bad
    URL, not text/html, or in the blacklist.
  """
  url = util.clean_url(url)
  try:
    domain = domain_from_link(url).lower()
  except BaseException:
    logging.info('Dropping bad URL %s.', url)
    return url, None, False

  send = True
  if resolve:
    # this follows *all* redirects, until the end
    resolved = follow_redirects(url, cache=memcache)
    send = resolved.headers.get('content-type', '').startswith('text/html')
    url, domain, _ = get_webmention_target(
      resolved.url, resolve=False, replace_test_domains=replace_test_domains)

  send = send and domain and not in_webmention_blacklist(domain)
  if replace_test_domains:
    url = replace_test_domains_with_localhost(url)
  return url, domain, send
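
A minimal usage sketch (not from the original code): a caller feeds candidate links through get_webmention_target() and keeps only the ones worth sending a webmention to. The link list and variable names below are made up for illustration.

links = ['http://example.com/post', 'https://t.co/abc123', 'not a url']

targets = []
for link in links:
  url, domain, send = get_webmention_target(link)
  if send:
    targets.append((url, domain))
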
Example #2
def get_webmention_target(url, resolve=True, replace_test_domains=True):
  """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, i.e. the boolean in the returned
  tuple will be True! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns:
    (string url, string pretty domain, boolean) tuple. The boolean is
    True if we should send a webmention, False otherwise, e.g. if it's a bad
    URL, not text/html, or in the blacklist.
  """
  url = util.clean_url(url)
  try:
    domain = domain_from_link(url).lower()
  except BaseException:
    logging.info('Dropping bad URL %s.', url)
    return url, None, False

  send = True
  if resolve:
    # this follows *all* redirects, until the end
    resolved = follow_redirects(url, cache=memcache)
    send = resolved.headers.get('content-type', '').startswith('text/html')
    url, domain, _ = get_webmention_target(
      resolved.url, resolve=False, replace_test_domains=replace_test_domains)

  send = send and domain and not in_webmention_blacklist(domain)
  if replace_test_domains:
    url = replace_test_domains_with_localhost(url)
  return url, domain, send
Example #3
def get_webmention_target(url, resolve=True, replace_test_domains=True):
    """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, i.e. the boolean in the returned
  tuple will be True! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns:
    (string url, string pretty domain, boolean) tuple. The boolean is
    True if we should send a webmention, False otherwise, e.g. if it's a bad
    URL, not text/html, or in the blocklist.
  """
    url = util.clean_url(url)
    try:
        domain = domain_from_link(url).lower()
    except BaseException:
        logging.info('Dropping bad URL %s.', url)
        return url, None, False

    if domain in ('puzzleadventura.com', 'sweetgamesbox.com'):
        return url, domain, False

    send = True
    if resolve:
        # this follows *all* redirects, until the end
        resolved = follow_redirects(url)
        html = resolved.headers.get('content-type', '').startswith('text/html')
        send = html and resolved.status_code != util.HTTP_RESPONSE_TOO_BIG_STATUS_CODE
        url, domain, _ = get_webmention_target(
            resolved.url,
            resolve=False,
            replace_test_domains=replace_test_domains)

    scheme = urllib.parse.urlparse(url).scheme  # require http or https
    send = (send and domain and scheme in ('http', 'https')
            and not in_webmention_blocklist(domain))

    if replace_test_domains:
        url = replace_test_domains_with_localhost(url)

    return url, domain, send
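
A hedged sketch of the scheme filtering this version adds; the URLs are made up, and the exact return values depend on the helpers above (for example, whether the domain is in the blocklist).

get_webmention_target('https://example.com/post', resolve=False)
# -> ('https://example.com/post', 'example.com', True), assuming
#    example.com isn't in the blocklist

get_webmention_target('mailto:someone@example.com', resolve=False)
# -> send is False: either domain_from_link() fails (caught above) or the
#    scheme check rejects anything that isn't http or https
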
Example #4
    def original_post_discovery(activity,
                                domains=None,
                                cache=None,
                                include_redirect_sources=True,
                                **kwargs):
        """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery . It
    differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in the
    text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects. Passed
        to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
        obj = activity.get('object') or activity
        content = obj.get('content', '').strip()

        # find all candidate URLs
        tags = [
            t.get('url')
            for t in obj.get('attachments', []) + obj.get('tags', [])
            if t.get('objectType') in ('article', 'mention', None)
        ]
        candidates = tags + util.extract_links(content) + obj.get(
            'upstreamDuplicates', [])

        # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
        # references to canonical copies of a given (usually syndicated) post, of
        # the form (DOMAIN PATH). We consider them an explicit original post link.
        candidates += [
            match.expand(r'http://\1/\2')
            for match in Source._PERMASHORTCITATION_RE.finditer(content)
        ]

        candidates = set(
            filter(
                None,
                (
                    util.clean_url(url) for url in candidates
                    # heuristic: ellipsized URLs are probably incomplete, so omit them.
                    if url and not url.endswith('...')
                    and not url.endswith(u'…'))))

        # check for redirect and add their final urls
        redirects = {}  # maps final URL to original URL for redirects
        for url in list(candidates):
            resolved = util.follow_redirects(url, cache=cache, **kwargs)
            if (resolved.url != url and resolved.headers.get(
                    'content-type', '').startswith('text/html')):
                redirects[resolved.url] = url
                candidates.add(resolved.url)

        # use domains to determine which URLs are original post links vs mentions
        originals = set()
        mentions = set()
        for url in util.dedupe_urls(candidates):
            if url in redirects.values():
                # this is a redirected original URL. postpone and handle it when we hit
                # its final URL so that we know the final domain.
                continue
            domain = util.domain_from_link(url)
            which = (originals if not domains or util.domain_or_parent_in(
                domain, domains) else mentions)
            which.add(url)
            redirected_from = redirects.get(url)
            if redirected_from and include_redirect_sources:
                which.add(redirected_from)

        logging.info(
            'Original post discovery found original posts %s, mentions %s',
            originals, mentions)
        return originals, mentions
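
A usage sketch, assuming original_post_discovery() is exposed as a static method on the enclosing Source class (as the indentation suggests). The activity dict is made up, and in a real run follow_redirects() will issue HEAD requests for each candidate link.

activity = {
    'object': {
        'content': 'new post! originally at http://mysite.example/2015/08/post '
                   'and a mention of http://other.example/page',
        'upstreamDuplicates': [],
        'tags': [],
    },
}

originals, mentions = Source.original_post_discovery(
    activity, domains=['mysite.example'])
# originals should contain the mysite.example link, mentions the
# other.example one (modulo redirects and URL cleaning).
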
Example #5
    def original_post_discovery(activity, domains=None, cache=None, include_redirect_sources=True, **kwargs):
        """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery . It
    differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in the
    text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects. Passed
        to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns: ([string original post URLs], [string mention URLs]) tuple
    """
        obj = activity.get("object") or activity
        content = obj.get("content", "").strip()

        # find all candidate URLs
        tags = [
            t.get("url")
            for t in obj.get("attachments", []) + obj.get("tags", [])
            if t.get("objectType") in ("article", "mention", None)
        ]
        candidates = tags + util.extract_links(content) + obj.get("upstreamDuplicates", [])

        # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
        # references to canonical copies of a given (usually syndicated) post, of
        # the form (DOMAIN PATH). We consider them an explicit original post link.
        candidates += [match.expand(r"http://\1/\2") for match in Source._PERMASHORTCITATION_RE.finditer(content)]

        candidates = set(
            filter(
                None,
                (
                    util.clean_url(url)
                    for url in candidates
                    # heuristic: ellipsized URLs are probably incomplete, so omit them.
                    if url and not url.endswith("...") and not url.endswith(u"…")
                ),
            )
        )

        # check for redirect and add their final urls
        redirects = {}  # maps final URL to original URL for redirects
        for url in list(candidates):
            resolved = follow_redirects(url, cache=cache, **kwargs)
            if resolved.url != url and resolved.headers.get("content-type", "").startswith("text/html"):
                redirects[resolved.url] = url
                candidates.add(resolved.url)

        # use domains to determine which URLs are original post links vs mentions
        originals = set()
        mentions = set()
        for url in util.dedupe_urls(candidates):
            if url in redirects.values():
                # this is a redirected original URL. postpone and handle it when we hit
                # its final URL so that we know the final domain.
                continue
            which = originals if not domains or util.domain_from_link(url) in domains else mentions
            which.add(url)
            redirected_from = redirects.get(url)
            if redirected_from and include_redirect_sources:
                which.add(redirected_from)

        logging.info("Original post discovery found original posts %s, mentions %s", originals, mentions)
        return originals, mentions
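
Source._PERMASHORTCITATION_RE itself isn't shown in these snippets. A standalone sketch with a hypothetical stand-in regex of the same general shape, two capture groups for domain and path, illustrates how match.expand(r'http://\1/\2') turns a '(DOMAIN PATH)' citation into a URL:

import re

# Hypothetical stand-in for Source._PERMASHORTCITATION_RE.
PSC_RE = re.compile(r'\(([^\s()]+\.[^\s()]{2,})\s+([^\s()]+)\)')

content = 'syndicated copy (example.com p/abc123)'
urls = [m.expand(r'http://\1/\2') for m in PSC_RE.finditer(content)]
# urls == ['http://example.com/p/abc123']
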
Example #6
        kwargs.setdefault("timeout", appengine_config.HTTP_TIMEOUT)
        resolved = requests.head(url, allow_redirects=True, **kwargs)
        resolved.raise_for_status()
        if resolved.url != url:
            logging.debug("Resolved %s to %s", url, resolved.url)
        cache_time = 0  # forever
    except AssertionError:
        raise
    except BaseException as e:
        logging.warning("Couldn't resolve URL %s : %s", url, e)
        resolved = requests.Response()
        resolved.url = url
        resolved.status_code = 499  # not standard. i made this up.
        cache_time = FAILED_RESOLVE_URL_CACHE_TIME

    content_type = resolved.headers.get("content-type")
    if not content_type:
        type, _ = mimetypes.guess_type(resolved.url)
        resolved.headers["content-type"] = type or "text/html"

    refresh = resolved.headers.get("refresh")
    if refresh:
        for part in refresh.split(";"):
            if part.strip().startswith("url="):
                return follow_redirects(part.strip()[4:], cache=cache, **kwargs)

    resolved.url = util.clean_url(resolved.url)
    if cache is not None:
        cache.set_multi({cache_key: resolved, "R " + resolved.url: resolved}, time=cache_time)
    return resolved
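
The refresh handling above parses an HTTP Refresh header of the form 'N; url=...'. A self-contained restatement of just that parsing, with a made-up header value:

refresh = '5; url=https://example.com/final'

target = None
for part in refresh.split(';'):
    if part.strip().startswith('url='):
        target = part.strip()[4:]
# target == 'https://example.com/final', which follow_redirects() then
# resolves recursively.
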
Example #7
    except AssertionError:
        raise
    except BaseException as e:
        logging.warning("Couldn't resolve URL %s : %s", url, e)
        resolved = requests.Response()
        resolved.url = url
        resolved.status_code = 499  # not standard. i made this up.
        cache_time = FAILED_RESOLVE_URL_CACHE_TIME

    content_type = resolved.headers.get('content-type')
    if not content_type:
        type, _ = mimetypes.guess_type(resolved.url)
        resolved.headers['content-type'] = type or 'text/html'

    refresh = resolved.headers.get('refresh')
    if refresh:
        for part in refresh.split(';'):
            if part.strip().startswith('url='):
                return follow_redirects(part.strip()[4:],
                                        cache=cache,
                                        **kwargs)

    resolved.url = util.clean_url(resolved.url)
    if cache is not None:
        cache.set_multi({
            cache_key: resolved,
            'R ' + resolved.url: resolved
        },
                        time=cache_time)
    return resolved
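
Both of the last two snippets expect a memcache-style cache object with a set_multi(mapping, time=...) method (and, presumably, reads elsewhere in the function). A toy in-memory stand-in that satisfies what's visible here, purely for illustration:

class DictCache(object):
    """Toy cache exposing the memcache-style methods the snippets use."""

    def __init__(self):
        self.data = {}

    def set_multi(self, mapping, time=0):
        # `time` is the expiration in seconds; ignored in this stand-in.
        self.data.update(mapping)

    def get(self, key):
        return self.data.get(key)

    def get_multi(self, keys):
        return {k: self.data[k] for k in keys if k in self.data}

# Issues a real HEAD request when run; the URL is illustrative.
resolved = follow_redirects('http://example.com/some/path', cache=DictCache())
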