def fetch_mf2_func(url):
    """Return parsed microformats2 data for ``url``.

    When the URL's network location matches SILO_DOMAINS (as judged by
    ``util.domain_or_parent_in``), nothing is fetched: a minimal synthetic
    h-card whose only property is the URL itself is returned. Any other URL
    is handed to ``util.fetch_mf2`` with ``gateway=True``.

    Args:
      url: string URL to look up

    Returns:
      dict, either the stub ``{'items': [...]}`` document or whatever
      ``util.fetch_mf2`` returns
    """
    host = urllib.parse.urlparse(url).netloc
    if not util.domain_or_parent_in(host, SILO_DOMAINS):
        return util.fetch_mf2(url, gateway=True)

    # Silo URL: synthesize an h-card instead of fetching the page.
    stub_card = {'type': ['h-card'], 'properties': {'url': [url]}}
    return {'items': [stub_card]}
def fetch_mf2_func(url):
    """Return parsed microformats2 data for ``url``.

    When the URL's network location matches SILO_DOMAINS (as judged by
    ``util.domain_or_parent_in``), nothing is fetched: a minimal synthetic
    h-card whose only property is the URL itself is returned. Any other URL
    is fetched with ``self._fetch`` (``self`` comes from the enclosing scope)
    and the document is parsed with ``mf2py.parse``.

    Args:
      url: string URL to look up

    Returns:
      dict, either the stub ``{'items': [...]}`` document or the result of
      ``mf2py.parse``
    """
    netloc = urlparse.urlparse(url).netloc
    if not util.domain_or_parent_in(netloc, SILO_DOMAINS):
        # _fetch returns a pair; only the second element (the document) is used.
        _unused, document = self._fetch(url)
        return mf2py.parse(doc=document, url=url)

    # Silo URL: synthesize an h-card instead of fetching the page.
    stub_card = {'type': ['h-card'], 'properties': {'url': [url]}}
    return {'items': [stub_card]}
def host_url(path_query=None):
    """Build an absolute URL for the current request's host.

    The base is HOST_URL when the request's domain matches OTHER_DOMAINS
    (per ``util.domain_or_parent_in``); otherwise it is the request's own
    ``host_url``. ``path_query`` is then joined onto that base.

    Args:
      path_query: optional string path and/or query to join onto the base

    Returns:
      string, the result of ``urllib.parse.urljoin(base, path_query)``
    """
    request_domain = util.domain_from_link(request.host_url)
    if util.domain_or_parent_in(request_domain, OTHER_DOMAINS):
        root = HOST_URL
    else:
        root = request.host_url
    return urllib.parse.urljoin(root, path_query)
def in_webmention_blocklist(domain):
    """Tell whether webmentions to ``domain`` should be blocked.

    The domain is lower-cased, then checked against BLOCKLIST with
    ``util.domain_or_parent_in``. When LOCAL is falsy, membership in
    LOCAL_HOSTS also counts as blocked.

    Args:
      domain: string domain

    Returns:
      truthy if the domain is blocked, falsy otherwise
    """
    normalized = domain.lower()

    listed = util.domain_or_parent_in(normalized, BLOCKLIST)
    if listed:
        return listed

    # LOCAL_HOSTS only count as blocked when LOCAL is not set.
    return not LOCAL and normalized in LOCAL_HOSTS
def in_webmention_blacklist(domain):
    """Check a domain against BLACKLIST.

    Args:
      domain: string domain; it is lower-cased before the lookup

    Returns:
      the result of ``util.domain_or_parent_in`` for the lower-cased domain
      and BLACKLIST
    """
    lowered = domain.lower()
    return util.domain_or_parent_in(lowered, BLACKLIST)
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Find original post links and mention links for an activity.

    A variation on http://indiewebcamp.com/original-post-discovery that
    collects several candidate links rather than one and does not look for
    MF2 markup. Background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Candidates are taken from the object's attachments, tags and
    upstreamDuplicates fields, from links in its text content, and from
    permashortcitations in that content.

    Args:
      activity: activity dict. Its 'object' value is used if present and
        truthy, otherwise the activity itself.
      domains: optional sequence of domains. If provided, only URLs whose
        domain matches one of them (per util.domain_or_parent_in) are
        returned as originals; all others are returned as mentions.
      cache: optional cache object, forwarded to util.follow_redirects()
      include_redirect_sources: boolean, whether to return URLs that
        redirect as well as their final destination URLs
      kwargs: forwarded to util.follow_redirects()

    Returns:
      (originals, mentions) tuple of sets of string URLs
    """
    def _is_ellipsized(link):
        # heuristic: ellipsized URLs are probably incomplete
        return link.endswith('...') or link.endswith(u'…')

    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # Collect URLs from attachments and tags of the accepted object types.
    tagged = []
    for item in obj.get('attachments', []) + obj.get('tags', []):
        if item.get('objectType') in ('article', 'mention', None):
            tagged.append(item.get('url'))

    raw_links = tagged + util.extract_links(content)
    raw_links = raw_links + obj.get('upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) have
    # the form (DOMAIN PATH) and are expanded into http:// URLs here.
    for match in Source._PERMASHORTCITATION_RE.finditer(content):
        raw_links.append(match.expand(r'http://\1/\2'))

    # Drop empty and ellipsized links, clean the rest, and discard anything
    # clean_url() turns into a falsy value.
    candidates = set()
    for link in raw_links:
        if not link or _is_ellipsized(link):
            continue
        cleaned = util.clean_url(link)
        if cleaned:
            candidates.add(cleaned)

    # Resolve redirects. Iterate over a snapshot because the set grows as
    # final destination URLs are added.
    redirects = {}  # maps final URL to the URL that redirected to it
    for link in list(candidates):
        resolved = util.follow_redirects(link, cache=cache, **kwargs)
        if resolved.url != link:
            content_type = resolved.headers.get('content-type', '')
            if content_type.startswith('text/html'):
                redirects[resolved.url] = link
                candidates.add(resolved.url)

    # Sort URLs into originals and mentions based on their domain.
    originals, mentions = set(), set()
    for link in util.dedupe_urls(candidates):
        if link in redirects.values():
            # A redirecting URL: it is handled together with its final URL,
            # whose domain decides the bucket.
            continue

        domain = util.domain_from_link(link)
        if not domains or util.domain_or_parent_in(domain, domains):
            bucket = originals
        else:
            bucket = mentions
        bucket.add(link)

        source = redirects.get(link)
        if include_redirect_sources and source:
            bucket.add(source)

    logging.info(
        'Original post discovery found original posts %s, mentions %s',
        originals, mentions)
    return originals, mentions
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Find original post links and mention links for an activity.

    A variation on http://indiewebcamp.com/original-post-discovery that
    collects several candidate links rather than one and does not look for
    MF2 markup. Background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Candidates are taken from the object's attachments, tags and
    upstreamDuplicates fields, from links in its text content, and from
    permashortcitations in that content.

    Args:
      activity: activity dict. Its 'object' value is used if present and
        truthy, otherwise the activity itself.
      domains: optional sequence of domains. If provided, only URLs whose
        domain matches one of them (per util.domain_or_parent_in) are
        returned as originals; all others are returned as mentions.
      cache: optional cache object, forwarded to util.follow_redirects()
      include_redirect_sources: boolean, whether to return URLs that
        redirect as well as their final destination URLs
      kwargs: forwarded to util.follow_redirects()

    Returns:
      (originals, mentions) tuple of sets of string URLs
    """
    def _cleaned(links):
        # Lazily clean each usable link. Empty links are skipped, as are
        # ellipsized ones, which are probably incomplete.
        for link in links:
            if link and not (link.endswith('...') or link.endswith('…')):
                yield util.clean_url(link)

    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # Collect URLs from attachments and tags of the accepted object types.
    tagged = []
    for item in obj.get('attachments', []) + obj.get('tags', []):
        if item.get('objectType') in ('article', 'mention', None):
            tagged.append(item.get('url'))

    raw_links = tagged + util.extract_links(content)
    raw_links = raw_links + obj.get('upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) have
    # the form (DOMAIN PATH) and are expanded into http:// URLs here.
    for match in Source._PERMASHORTCITATION_RE.finditer(content):
        raw_links.append(match.expand(r'http://\1/\2'))

    candidates = set(util.dedupe_urls(_cleaned(raw_links)))

    # Resolve redirects, then add the final destination URLs in one step
    # once the loop over the set has finished.
    redirects = {}  # maps final URL to the URL that redirected to it
    for link in candidates:
        resolved = util.follow_redirects(link, cache=cache, **kwargs)
        if resolved.url != link:
            content_type = resolved.headers.get('content-type', '')
            if content_type.startswith('text/html'):
                redirects[resolved.url] = link
    candidates.update(redirects)

    # Sort URLs into originals and mentions based on their domain.
    originals, mentions = set(), set()
    for link in util.dedupe_urls(candidates):
        if link in redirects.values():
            # A redirecting URL: it is handled together with its final URL,
            # whose domain decides the bucket.
            continue

        domain = util.domain_from_link(link)
        if not domains or util.domain_or_parent_in(domain, domains):
            bucket = originals
        else:
            bucket = mentions
        bucket.add(link)

        source = redirects.get(link)
        if include_redirect_sources and source:
            bucket.add(source)

    logging.info('Original post discovery found original posts %s, mentions %s',
                 originals, mentions)
    return originals, mentions
def host_url(handler):
    """Pick the host URL to use for a request handler.

    Args:
      handler: object exposing ``request.host_url``

    Returns:
      HOST_URL when the request's domain matches OTHER_DOMAINS (per
      ``util.domain_or_parent_in``), otherwise the request's own host URL
    """
    request_domain = util.domain_from_link(handler.request.host_url)
    if util.domain_or_parent_in(request_domain, OTHER_DOMAINS):
        return HOST_URL
    return handler.request.host_url
def fetch_mf2_func(url):
    """Return parsed microformats2 data for ``url``.

    When the URL's network location matches SILO_DOMAINS (as judged by
    ``util.domain_or_parent_in``), nothing is fetched: a minimal synthetic
    h-card whose only property is the URL itself is returned. Any other URL
    is fetched with ``self._fetch`` (``self`` comes from the enclosing scope)
    and the document is parsed with ``mf2py.parse`` using
    ``img_with_alt=True``.

    Args:
      url: string URL to look up

    Returns:
      dict, either the stub ``{'items': [...]}`` document or the result of
      ``mf2py.parse``
    """
    netloc = urlparse.urlparse(url).netloc
    if not util.domain_or_parent_in(netloc, SILO_DOMAINS):
        # _fetch returns a pair; only the second element (the document) is used.
        _unused, document = self._fetch(url)
        return mf2py.parse(doc=document, url=url, img_with_alt=True)

    # Silo URL: synthesize an h-card instead of fetching the page.
    stub_card = {'type': ['h-card'], 'properties': {'url': [url]}}
    return {'items': [stub_card]}