def resolve_profile_url(url, resolve=True):
  """Resolves a profile URL to be added to a source.

  Args:
    url: string
    resolve: boolean, whether to make HTTP requests to follow redirects, etc.

  Returns:
    string, resolved URL, or None
  """
  final, _, ok = util.get_webmention_target(url, resolve=resolve)
  if not ok:
    return None

  final = final.lower()
  if util.schemeless(final).startswith(util.schemeless(url.lower())):
    # redirected to a deeper path. use the original higher level URL. #652
    final = url

  # If final has a path segment check if root has a matching rel=me.
  match = re.match(r'^(https?://[^/]+)/.+', final)
  if match and resolve:
    root = match.group(1)
    try:
      resp = util.requests_get(root)
      resp.raise_for_status()
      data = util.mf2py_parse(resp.text, root)
      me_urls = data.get('rels', {}).get('me', [])
      if final in me_urls:
        final = root
    except requests.RequestException:
      logging.warning("Couldn't fetch %s, preserving path in %s",
                      root, final, exc_info=True)

  return final
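# Illustration of the "deeper path" check in resolve_profile_url above, with
# hypothetical URLs (util.schemeless approximated inline). If
# http://example.com redirects to https://example.com/home, the resolved URL
# still starts with the original once schemes are stripped, so the shorter
# original URL is kept.
original = 'http://example.com'
final = 'https://example.com/home'
redirected_deeper = final.split(':', 1)[-1].startswith(original.split(':', 1)[-1])
# redirected_deeper == True, so the code keeps 'http://example.com'.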
def _find_source(self, source_cls, url, domain):
  """Returns the source that should publish a post URL, or None if not found.

  Args:
    source_cls: :class:`models.Source` subclass for this silo
    url: string
    domain: string, url's domain

  Returns:
    :class:`models.Source`
  """
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check "
      "that your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." % {
        'type': source_cls.GR_CLASS.NAME,
        'domain': domain,
      })
    return

  current_url = ''
  sources_ready = []
  best_match = None
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided,
      # including path. find the source with the closest match.
      sources_ready.append(source)
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          current_url = domain_url
          best_match = source

  if best_match:
    return best_match
  elif sources_ready:
    self.error(
      'No account found that matches %s. Check that <a href="%s/about#profile-link">the web site URL is in your silo profile</a>, then <a href="%s/">sign up again</a>.'
      % (util.pretty_link(url), self.request.host_url, self.request.host_url))
  else:
    self.error(
      'Publish is not enabled for your account. <a href="%s/">Try signing up!</a>'
      % self.request.host_url)
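# Self-contained sketch of the closest-match logic in _find_source above, with
# sources reduced to hypothetical (name, domain_urls) tuples. The source whose
# domain_url is the longest matching prefix of the post URL wins.
def closest_match(url, sources):
  def norm(u):
    # rough stand-in for util.schemeless(u.lower()).strip('/')
    return u.lower().split(':', 1)[-1].strip('/')

  best, best_len = None, 0
  target = norm(url)
  for name, domain_urls in sources:
    for domain_url in domain_urls:
      if target.startswith(norm(domain_url)) and len(domain_url) > best_len:
        best, best_len = name, len(domain_url)
  return best

# closest_match('https://example.com/blog/post',
#               [('site', ['https://example.com']),
#                ('blog', ['https://example.com/blog'])]) returns 'blog'.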
def test_schemeless(self):
  for expected, url in (
      ('', ''),
      ('/path', '/path'),
      ('//foo', '//foo'),
      ('//foo', 'http://foo'),
      ('//foo.bar/baz', 'http://foo.bar/baz'),
      ('//foo.bar/baz', 'https://foo.bar/baz'),
  ):
    self.assertEqual(expected, util.schemeless(url))

  self.assertEqual('foo', util.schemeless('http://foo/', slashes=False))
  self.assertEqual('foo/bar', util.schemeless('http://foo/bar/', slashes=False))
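# A minimal sketch of what util.schemeless might look like, inferred purely
# from the test cases above (illustrative; not the actual util module code):
import re

def schemeless(url, slashes=True):
  """Strips 'http:'/'https:' from a URL; with slashes=False, also strips
  leading and trailing slashes."""
  url = re.sub(r'^https?:', '', url)
  if not slashes:
    url = url.strip('/')
  return url

# schemeless('https://foo.bar/baz') == '//foo.bar/baz'
# schemeless('http://foo/bar/', slashes=False) == 'foo/bar'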
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _find_source(self, source_cls, url, domain):
  """Returns the source that should publish a post URL, or None if not found.

  Args:
    source_cls: :class:`models.Source` subclass for this silo
    url: string
    domain: string, url's domain

  Returns:
    :class:`models.Source`
  """
  domain = domain.lower()
  if util.domain_or_parent_in(domain, util.DOMAINS):
    return self.error(f'Source URL should be on your own site, not {domain}')

  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    msg = (f'Could not find <b>{source_cls.GR_CLASS.NAME}</b> account for '
           f'<b>{domain}</b>. Check that your {source_cls.GR_CLASS.NAME} '
           f'profile has {domain} in its <em>web site</em> or <em>link</em> '
           f'field, then try signing up again.')
    return self.error(msg, html=msg)

  current_url = ''
  sources_ready = []
  best_match = None
  for source in sources:
    logging.info(f'Source: {source.bridgy_url()} , features '
                 f'{source.features}, status {source.status}, poll status '
                 f'{source.poll_status}')
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided,
      # including path. find the source with the closest match.
      sources_ready.append(source)
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          current_url = domain_url
          best_match = source

  if best_match:
    return best_match

  if sources_ready:
    msg = (f'No account found that matches {util.pretty_link(url)}. Check '
           f'that <a href="{util.host_url("/about#profile-link")}">the web '
           f'site URL is in your silo profile</a>, then '
           f'<a href="{request.host_url}">sign up again</a>.')
  else:
    msg = (f'Publish is not enabled for your account. '
           f'<a href="{request.host_url}">Try signing up!</a>')
  self.error(msg, html=msg)
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Twitter search supports OR:
  https://dev.twitter.com/rest/public/search

  ...but it only returns complete(ish) results if we strip scheme from URLs,
  ie search for example.com instead of http://example.com/, and that also
  returns false positives, so we check that the returned tweets actually have
  matching links. https://github.com/snarfed/bridgy/issues/565

  Returns:
    sequence of ActivityStreams activity dicts
  """
  urls = set(util.schemeless(util.fragmentless(url), slashes=False)
             for url in self.domain_urls
             if not util.in_webmention_blacklist(util.domain_from_link(url)))
  if not urls:
    return []

  query = ' OR '.join(urls)
  candidates = self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)

  # filter out retweets and search false positives that don't actually link to us
  results = []
  for candidate in candidates:
    if candidate.get('verb') == 'share':
      continue
    obj = candidate['object']
    tags = obj.get('tags', [])
    atts = obj.get('attachments', [])
    for url in urls:
      if any(util.schemeless(t.get('url', ''), slashes=False).startswith(url)
             for t in tags + atts):
        results.append(candidate)
        break

  return results
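# Quick illustration of the query the Twitter variant above builds, with
# hypothetical domain_urls. fragmentless() drops '#...' and
# schemeless(..., slashes=False) strips the scheme and surrounding slashes
# (both approximated inline here), so the search terms are bare host/path
# strings OR'd together:
domain_urls = ['http://example.com/', 'https://foo.example.org/#top']
terms = set(u.split('#')[0].split('://', 1)[-1].strip('/') for u in domain_urls)
query = ' OR '.join(terms)
# e.g. 'example.com OR foo.example.org' (set order may vary)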
def _find_source(self, source_cls, url, domain):
  """Returns the source that should publish a post URL, or None if not found.

  Args:
    source_cls: :class:`models.Source` subclass for this silo
    url: string
    domain: string, url's domain

  Returns:
    :class:`models.Source`
  """
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check "
      "that your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})
    return

  current_url = ''
  sources_ready = []
  best_match = None
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided,
      # including path. find the source with the closest match.
      sources_ready.append(source)
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          current_url = domain_url
          best_match = source

  if best_match:
    return best_match
  elif sources_ready:
    self.error(
      'No account found that matches %s. Check that '
      '<a href="/about#profile-link">the web site URL is in your silo '
      'profile</a>, then <a href="/">sign up again</a>.' %
      util.pretty_link(url))
  else:
    self.error('Publish is not enabled for your account. '
               '<a href="/">Try signing up!</a>')
def test_schemeless(self):
  # pass the raw URL to schemeless() once; the expected values are literals
  # (the original double-applied util.schemeless to already-stripped input)
  for expected, url in (
      ('', ''),
      ('/path', '/path'),
      ('//foo', '//foo'),
      ('//foo', 'http://foo'),
      ('//foo.bar/baz', 'http://foo.bar/baz'),
      ('//foo.bar/baz', 'https://foo.bar/baz')):
    self.assertEqual(expected, util.schemeless(url))
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns:
    sequence of ActivityStreams activity dicts
  """
  urls = {util.schemeless(util.fragmentless(url), slashes=False)
          for url in self.domain_urls
          if not util.in_webmention_blocklist(util.domain_from_link(url))}
  if not urls:
    return []

  # Search syntax: https://www.reddit.com/wiki/search
  url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in urls)
  return self.get_activities(search_query=url_query, group_id=gr_source.SEARCH,
                             etag=self.last_activities_etag,
                             fetch_replies=False, fetch_likes=False,
                             fetch_shares=False, count=50)
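# Illustration of the Reddit search query built above, with hypothetical
# domains already stripped to schemeless form:
urls = ['example.com', 'example.org']
url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in urls)
# -> 'site:"example.com" OR selftext:"example.com" OR
#     site:"example.org" OR selftext:"example.org"'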
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns:
    sequence of ActivityStreams activity dicts
  """
  urls = set(util.schemeless(util.fragmentless(url), slashes=False)
             for url in self.domain_urls
             if not util.in_webmention_blocklist(util.domain_from_link(url)))
  if not urls:
    return []

  url_query = ' OR '.join([f'"{u}"' for u in urls])
  return self.get_activities(search_query=url_query, group_id=gr_source.SEARCH,
                             etag=self.last_activities_etag,
                             fetch_replies=True, fetch_likes=False,
                             fetch_shares=False, count=50)
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Twitter search supports OR:
  https://dev.twitter.com/rest/public/search

  ...but it only returns complete(ish) results if we strip scheme from URLs,
  ie search for example.com instead of http://example.com/, and that also
  returns false positives, so we check that the returned tweets actually have
  matching links. https://github.com/snarfed/bridgy/issues/565

  Returns:
    sequence of ActivityStreams activity dicts
  """
  urls = set(util.fragmentless(url) for url in self.domain_urls
             if not util.in_webmention_blacklist(util.domain_from_link(url)))
  if not urls:
    return []

  query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                      for url in urls)
  candidates = self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)

  # filter out retweets and search false positives that don't actually link to us
  results = []
  for candidate in candidates:
    if candidate.get('verb') == 'share':
      continue
    obj = candidate['object']
    tags = obj.get('tags', [])
    atts = obj.get('attachments', [])
    for url in urls:
      if (url in obj.get('content', '') or
          any(t.get('url', '').startswith(url) for t in tags + atts)):
        results.append(candidate)
        break

  return results
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check "
      "that your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. '
      'Please visit https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urlparse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc and
          parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Details: "
                      "https://github.com/snarfed/bridgy/issues/84")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      if not code:
        raise
      msg = 'Error from %s API or your site: %s %s' % (
        self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code,
                        mail=code not in ('502', '503', '504'))
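# Minimal sketch of the item traversal in _run above, on hypothetical mf2
# data: h-feeds are unwrapped into their children, and the first item whose
# types intersect the publishable set is returned. PUBLISHABLE here is an
# assumption standing in for PUBLISHABLE_TYPES.
import collections

PUBLISHABLE = {'h-entry', 'h-event'}

def first_publishable(items):
  queue = collections.deque(items)
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type', []))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
    elif item_types & PUBLISHABLE:
      return item
  return None

# first_publishable([{'type': ['h-feed'],
#                     'children': [{'type': ['h-entry']}]}])
# returns the inner h-entry item.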
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check "
      "that your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. '
      'Please visit https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      mail = True
      if (not code or code == 500) and util.is_connection_failure(e):
        code = 502
        mail = False
      msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code or 500, mail=mail)
def add_original_post_urls(self, post, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post: ActivityStreams post object to get original post URLs from
    obj: ActivityStreams post object to add original post URLs to
    prop: string property name in obj to add the original post URLs to
  """
  original_post_discovery.discover(self.source, post, fetch_hfeed=False)

  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]

  if upstreams:
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  tags = obj.get('tags', [])
  for url_list in obj[prop], tags:
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  # if the http version of a link is in upstreams but the https one is just a
  # mention, or vice versa, promote them both to upstream.
  # https://github.com/snarfed/bridgy/issues/290
  #
  # TODO: for links that came from resolving redirects above, this doesn't
  # also catch the initial pre-redirect link. ah well.
  prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
  prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

  for url_obj in copy.copy(tags):
    url = url_obj.get('url', '')
    schemeless = util.schemeless(url)
    if schemeless in prop_schemeless and url not in prop_schemeful:
      obj[prop].append(url_obj)
      tags.remove(url_obj)
      prop_schemeful.add(url)

  logging.info('After original post discovery, urls are: %s', seen)
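# Self-contained sketch of the http/https promotion at the end of
# add_original_post_urls, with hypothetical data. Stripping the scheme makes
# the http and https variants of a link compare equal, so a mention whose
# schemeless form matches an existing original post URL gets promoted.
def schemeless(url):
  # rough stand-in for util.schemeless()
  return url.split(':', 1)[-1]

originals = [{'url': 'http://example.com/post', 'objectType': 'article'}]
mentions = [{'url': 'https://example.com/post', 'objectType': 'mention'}]

schemeful = {o['url'] for o in originals}
schemeless_set = {schemeless(u) for u in schemeful}
for tag in list(mentions):
  url = tag.get('url', '')
  if schemeless(url) in schemeless_set and url not in schemeful:
    originals.append(tag)
    mentions.remove(tag)
    schemeful.add(url)

# originals now contains both scheme variants; mentions is empty.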