def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def get_webmention_targets(source, activity):
  """Returns a set of string target URLs to attempt to send webmentions to.

  Side effect: runs the original post discovery algorithm on the activity and
  adds the resulting URLs to the activity as tags, in place.

  Args:
    source: models.Source subclass
    activity: activity dict
  """
  original_post_discovery.discover(source, activity)

  obj = activity.get('object') or activity
  urls = []

  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, domain, send = util.get_webmention_target(url)
      tag['url'] = url
      if send:
        urls.append(url)

  for url in obj.get('upstreamDuplicates', []):
    url, domain, send = util.get_webmention_target(url)
    if send:
      urls.append(url)

  return util.dedupe_urls(urls)
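
# Illustration of the activity shape get_webmention_targets() above reads after
# original post discovery has added its tags in place. All URLs are made up.
example_activity = {
  'object': {
    'tags': [
      # candidate target: an 'article' tag with a url
      {'objectType': 'article', 'url': 'http://author.example/original-post'},
      # ignored: not an article
      {'objectType': 'person', 'url': 'http://friend.example/'},
    ],
    # also candidates
    'upstreamDuplicates': ['http://author.example/syndicated-copy'],
  },
}
# Each candidate is normalized and filtered through util.get_webmention_target(),
# then the surviving URLs are deduped with util.dedupe_urls().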
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
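
# Illustration only: the pieces of data new() above stitches together, with
# made-up values. site_info is whatever get_site_info() returns (assumed here
# to include an 'URL' key, as the code expects); user_json is the stored OAuth
# user record with an 'avatar_URL' field.
example_site_info = {'ID': 123, 'URL': 'http://blog.example.com/'}
example_user_json = '{"avatar_URL": "http://avatars.example/me.jpg"}'
# The resulting WordPress entity would be keyed on 'blog.example.com', with
# url='http://blog.example.com/' and picture set to the avatar_URL value.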
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json_loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
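
# A minimal, hypothetical Superfeedr notification that handle_feed() above would
# accept: one item with a permalink and HTML content containing one outbound
# link. Only the fields the code reads are shown; the values are made up.
import json

example_feed = json.dumps({
  'items': [{
    'permalinkUrl': 'http://blog.example.com/post/123',
    'id': 'http://blog.example.com/post/123',
    'content': '<p>Good read: <a href="http://other.example.org/article">this</a></p>',
  }],
})
# handle_feed(example_feed, source) would create a BlogPost keyed on the
# permalink with http://other.example.org/article in unsent, assuming
# other.example.org isn't one of the source's own domains.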
def test_dedupe_urls(self):
  self.assertEquals([], util.dedupe_urls([]))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo']))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo', 'http://foo']))
  self.assertEquals(['http://foo/'], util.dedupe_urls(['http://foo', 'http://foo/']))
  self.assertEquals(['https://foo/'], util.dedupe_urls([
    'https://foo', 'http://foo', 'https://foo/', 'http://foo/']))
  self.assertEquals(['https://foo/'], util.dedupe_urls(['http://foo', 'https://foo/']))
  self.assertEquals(['http://foo/bar', 'http://foo/bar/'],
                    util.dedupe_urls(['http://foo/bar', 'http://foo/bar/']))
  self.assertEquals(['http://foo/'],
                    util.dedupe_urls(['http://foo', 'http://FOO/', 'http://FoO/']))
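
# The assertions above pin down dedupe_urls()'s contract: bare domains gain a
# trailing slash, host case and http vs https don't count as differences
# (https wins), but a trailing slash on a non-empty path does. Below is a
# minimal sketch that satisfies those cases; it is not the real oauth-dropins
# webutil implementation, which handles more normalization.
from urllib.parse import urlparse, urlunparse

def dedupe_urls_sketch(urls):
  """Dedupes URLs that differ only by scheme, host case, or a bare trailing
  slash, preferring https and preserving the original order."""
  result = []
  for url in urls:
    if not url:
      continue
    parsed = urlparse(url)
    # lower-case the host; give bare domains a trailing slash
    url = urlunparse(parsed._replace(netloc=parsed.netloc.lower(),
                                     path=parsed.path or '/'))
    for i, existing in enumerate(result):
      if urlparse(existing)._replace(scheme='') == urlparse(url)._replace(scheme=''):
        if urlparse(url).scheme == 'https':
          result[i] = url  # upgrade the earlier http version to https
        break
    else:
      result.append(url)
  return result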
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
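
# The "collapse to root" step in the loop above is easy to miss, so here it is
# in isolation: if a resolved profile URL has a path and the site's root page
# already lists that URL as a rel=me link, prefer the bare root URL. me_urls
# stands in for the parsed root page's rels['me'] list; this is an
# illustrative sketch, not a drop-in replacement.
import re

def collapse_to_root(final, me_urls):
  match = re.match(r'^(https?://[^/]+)/.+', final)
  if match and final in me_urls:
    # e.g. https://example.com/users/alice -> https://example.com
    return match.group(1)
  return final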
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  memcache.delete_multi(util.webmention_endpoint_cache_key(url)
                        for url in self.unsent)

  @ndb.transactional
  def finish():
    self.put()
    self.add_task(transactional=True)

  finish()
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  memcache.delete_multi(util.webmention_endpoint_cache_key(url)
                        for url in self.unsent)

  @ndb.transactional
  def finish():
    self.put()
    self.add_task(transactional=True)

  finish()
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (
    user.get('actor')  # for Instagram; its user_json is IndieAuth
    or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  with util.webmention_endpoint_cache_lock:
    for url in self.unsent:
      util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

  # this datastore put and task add should be transactional, but Cloud Tasks
  # doesn't support that :(
  # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
  self.put()
  self.add_task()
def restart(self):
  """Moves status and targets to 'new' and adds a propagate task."""
  self.status = 'new'
  self.unsent = util.dedupe_urls(self.unsent + self.sent + self.error +
                                 self.failed + self.skipped)
  self.sent = self.error = self.failed = self.skipped = []

  # clear any cached webmention endpoints
  with util.webmention_endpoint_cache_lock:
    for url in self.unsent:
      util.webmention_endpoint_cache.pop(util.webmention_endpoint_cache_key(url), None)

  # this datastore put and task add should be transactional, but Cloud Tasks
  # doesn't support that :(
  # https://cloud.google.com/appengine/docs/standard/python/taskqueue/push/migrating-push-queues#features-not-available
  # https://github.com/googleapis/python-tasks/issues/26
  #
  # The new "bundled services" bridge for the old App Engine APIs still
  # supports them, but only because that's literally on the old backends,
  # which seems like a dead end.
  # https://groups.google.com/g/google-appengine/c/22BKInlWty0/m/05ObNEdsAgAJ
  self.put()
  self.add_task()
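
# Quick illustration of what restart() above does to an entity's target lists
# (attribute values are made up):
#   before: unsent=[], sent=['http://a/'], error=['http://b/'], skipped=['http://a/']
#   after:  unsent=['http://a/', 'http://b/'], sent=error=failed=skipped=[]
# dedupe_urls() collapses the duplicate http://a/, and the cached webmention
# endpoint for each retried URL is evicted so it gets rediscovered on the next
# propagate attempt.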
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  logging.debug('discovering original posts for: %s',
                activity.get('url') or activity.get('id'))

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.request_headers(source=source))

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logging.debug('no %s syndication url, cannot process h-entries',
                  source.SHORT_NAME)

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
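
# Worked example of the resolve() helper above, with include_redirect_sources
# set to True and made-up URLs: if http://t.example/abc redirects to
# https://author.example/post and is sendable, then
#   resolve({'http://t.example/abc'})
# returns {'https://author.example/post', 'http://t.example/abc'}, i.e. both
# the final destination and the original redirecting URL stay in the set.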
def post(self, source_short_name):
  logging.info('Params: %s', list(self.request.params.items()))
  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urllib.parse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error(
      'Home page webmentions are not currently supported.', status=202)
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      return self.error(
        '%s webmentions are not supported for URL path: %s' %
        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe().decode())

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, resp.url), data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, report=False)
    elif code or body:
      return self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json_dumps(self.entity.published))
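
# Hypothetical webmention that the handler above processes, e.g. for a
# registered WordPress source. The endpoint path and URLs are illustrative.
import requests

requests.post('https://brid.gy/webmention/wordpress', data={
  'source': 'https://commenter.example/reply#comment-3',  # fragment is stripped
  'target': 'https://blog.example.com/2020/01/post/?utm_source=feed',
})
# The handler defrags source, follows redirects on target and strips utm_*
# params, looks up the enabled source whose domains include blog.example.com,
# then publishes the mention as a comment via source.create_comment().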
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  label = activity.get('url') or activity.get('id')
  logger.debug(f'discovering original posts for: {label}')

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains,
    include_redirect_sources=include_redirect_sources,
    include_reserved_hosts=DEBUG,
    max_redirect_fetches=MAX_ORIGINAL_CANDIDATES,
    headers=util.request_headers(source=source))

  # only include mentions of the author themselves.
  # (mostly just for Mastodon; other silos' domains are all in the blocklist, so
  # their mention URLs get dropped later anyway.)
  # (these are originally added in Source._inject_user_urls() and in poll step 2.)
  obj = activity.get('object', {})
  other_user_mentions = set(
    t.get('url') for t in obj.get('tags', [])
    if t.get('objectType') == 'person' and t.get('url') not in source.domain_urls)
  originals -= other_user_mentions
  mentions -= other_user_mentions

  # original posts are only from the author themselves
  obj_author = obj.get('author', {})
  activity_author = activity.get('actor', {})
  author_id = obj_author.get('id') or activity_author.get('id')
  author_username = obj_author.get('username') or activity_author.get('username')
  if (author_id and author_id != source.user_tag_id()
      and author_username != source.key.id()):
    logger.info(f"Demoting original post links because user {source.user_tag_id()} doesn't match author id {author_id} username {author_username}")
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logger.debug(f"running original post discovery on attachment: {att.get('id')}")
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logger.debug(f'original post discovery found originals for attachment, {att_origs}')
      mentions.update(att_origs)

  if len(originals) > MAX_ORIGINAL_CANDIDATES:
    logging.info(f'{len(originals)} originals, pruning down to {MAX_ORIGINAL_CANDIDATES}')
    originals = sorted(originals)[:MAX_ORIGINAL_CANDIDATES]
  if len(mentions) > MAX_MENTION_CANDIDATES:
    logging.info(f'{len(mentions)} mentions, pruning down to {MAX_MENTION_CANDIDATES}')
    mentions = sorted(mentions)[:MAX_MENTION_CANDIDATES]

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, domain, send = util.get_webmention_target(url)
      if send and domain != source.gr_source.DOMAIN:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logger.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place.
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logger.debug(f'no {source.SHORT_NAME} syndication url, cannot process h-entries')

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())
  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1].get('rels', {}).get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urlparse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLACKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data.get('items', []))
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  SyndicatedPosts but will not do posse-post-discovery to find new ones.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  if not source.updates:
    source.updates = {}

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.USER_AGENT_HEADER)

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return originals, mentions

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_syndication_url(
      util.follow_redirects(syndication_url).url)
    originals.update(_posse_post_discovery(
      source, activity, syndication_url, fetch_hfeed))
    originals = set(util.dedupe_urls(originals))
  else:
    logging.debug('no syndication url, cannot process h-entries')

  return originals, mentions
def dispatch_request(self, site):
  logger.info(f'Params: {list(request.values.items())}')
  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
  self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    self.error(f'Could not parse target URL {self.target_url}')

  # look up source by domain
  source_cls = models.sources[site]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    self.error(
      f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    msg = 'Home page webmentions are not currently supported.'
    logger.info(msg)
    return {'error': msg}, 202
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
      logger.info(msg)
      return {'error': msg}, 202

  # create BlogWebmention entity
  id = f'{self.source_url} {self.target_url}'
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    return self.entity.published
  logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    self.error(f'Could not find target URL {self.target_url} in source page {resp.url}',
               data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = f'http://{domain}/'

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = get_first(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = get_first(author_props, 'name')
      author_url = get_first(author_props, 'url')

  # if present, u-url overrides source url
  u_url = get_first(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = f'Error: {code}: {e}; {body}'
    if code == '401':
      logger.warning(f'Disabling source due to: {e}', exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      self.error(msg, status=502, report=False)
    elif code or body:
      self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  return self.entity.published