def user_to_actor(self, user):
  """Converts a user or page to an actor.

  Args:
    user: dict, a decoded JSON Facebook user or page

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  handle = username or id
  if not handle:
    return {}

  # facebook implements this as a 302 redirect
  actor = {
    # FB only returns the type field if you fetch the object with ?metadata=1
    # https://developers.facebook.com/docs/graph-api/using-graph-api/v2.2#introspection
    'objectType': 'page' if user.get('type') == 'page' else 'person',
    'displayName': user.get('name') or username,
    'id': self.tag_uri(handle),
    'updated': util.maybe_iso8601_to_rfc3339(user.get('updated_time')),
    'username': username,
    'description': user.get('bio') or user.get('description'),
    'summary': user.get('about'),
  }

  # numeric_id is our own custom field that always has the source's numeric
  # user id, if available.
  if util.is_int(id):
    actor.update({
      'numeric_id': id,
      'image': {
        'url': 'https://graph.facebook.com/v2.2/%s/picture?type=large' % id,
      },
    })

  # extract web site links. extract_links uniquifies and preserves order
  urls = util.extract_links(user.get('website'))
  if not urls:
    urls = util.extract_links(user.get('link')) or [self.user_url(handle)]
  actor['url'] = urls[0]
  if len(urls) > 1:
    actor['urls'] = [{'value': u} for u in urls]

  location = user.get('location')
  if location:
    actor['location'] = {'id': location.get('id'),
                         'displayName': location.get('name')}

  return util.trim_nulls(actor)
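
# Usage sketch (illustrative, not from the original module). The import and
# constructor below assume granary's Facebook source class; the field values
# are invented, not real API output.
from granary import facebook

fb = facebook.Facebook()
actor = fb.user_to_actor({
  'id': '212038',
  'username': 'alice',
  'name': 'Alice',
  'website': 'https://alice.example.com',
})
# the numeric id yields 'numeric_id' and a Graph API picture URL, and the
# website link wins over the profile URL:
#   actor['url'] == 'https://alice.example.com'
#   actor['image']['url'].startswith('https://graph.facebook.com/v2.2/212038/')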
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  pscs = set(match.expand(r'http://\1/\2') for match in
             Source._PERMASHORTCITATION_RE.finditer(content))

  attachments = set(a.get('url') for a in obj.get('attachments', [])
                    if a.get('objectType') == 'article')

  urls = util.trim_nulls(util.extract_links(content) | attachments | pscs)
  obj.setdefault('tags', []).extend({'objectType': 'article', 'url': u}
                                    for u in urls)

  return activity
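
# Illustration (added; invented activity) of the in-place tagging, assuming
# the permashortcitation regex matches the (DOMAIN PATH) form shown here.
activity = {'object': {
  'content': 'originally posted at https://alice.example.com/post '
             'and also (alice.example.com p123)',
}}
original_post_discovery(activity)
# the explicit link and the permashortcitation both become article tags on
# the object (order not guaranteed, since candidates are collected in a set):
#   {'objectType': 'article', 'url': 'https://alice.example.com/post'}
#   {'objectType': 'article', 'url': 'http://alice.example.com/p123'}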
def user_to_actor(self, user):
  """Converts a GitHub user to an ActivityStreams actor.

  Handles both v4 GraphQL and v3 REST API user objects.

  https://developer.github.com/v4/object/user/
  https://developer.github.com/v3/users/

  Args:
    user: dict, decoded JSON GitHub user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  actor = self._to_object(user)
  if not actor:
    return actor

  username = user.get('login')
  desc = user.get('bio') or user.get('description')
  actor.update({
    # TODO: orgs, bots
    'objectType': 'person',
    'displayName': user.get('name') or username,
    'username': username,
    'email': user.get('email'),
    'description': desc,
    'summary': desc,
    'image': {'url': user.get('avatarUrl') or user.get('avatar_url') or
                     user.get('url')},
    'location': {'displayName': user.get('location')},
  })

  # extract web site links. extract_links uniquifies and preserves order
  urls = sum((util.extract_links(user.get(field)) for field in (
    'html_url',    # REST
    'url',         # both
    'websiteUrl',  # GraphQL
    'blog',        # REST
    'bio',         # both
  )), [])
  urls = [u for u in urls if util.domain_from_link(u) != 'api.github.com']
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]

  return self.postprocess_object(actor)
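
# Sample conversion (illustrative; field values invented, shapes follow the
# REST API). Assumes `gh` is an instance of the class defining user_to_actor
# above and that its _to_object accepts this dict.
actor = gh.user_to_actor({
  'login': 'octocat',
  'name': 'The Octocat',
  'html_url': 'https://github.com/octocat',
  'url': 'https://api.github.com/users/octocat',  # dropped by the domain filter
  'blog': 'https://example.com',
})
# actor['url'] == 'https://github.com/octocat'; actor['urls'] also includes
# the blog, while the api.github.com URL is filtered out.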
def praw_to_object(self, thing, type):
  """Converts a praw object to an object.

  Currently only returns public content.

  Note that this will make external API calls to lazily load some attrs.

  Args:
    thing: a praw object, Submission or Comment
    type: string to denote whether to get submission or comment content

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  id = getattr(thing, 'id', None)
  if not id:
    return {}

  published = util.maybe_timestamp_to_iso8601(getattr(thing, 'created_utc', None))

  obj = {
    'id': self.tag_uri(id),
    'published': published,
    'to': [{
      'objectType': 'group',
      'alias': '@public',
    }],
  }

  user = getattr(thing, 'author', None)
  if user:
    obj['author'] = self.praw_to_actor(user)
    username = obj['author'].get('username')

  obj['url'] = self.BASE_URL + thing.permalink

  if type == 'submission':
    obj['content'] = getattr(thing, 'title', None)
    obj['objectType'] = 'note'
    obj['tags'] = [{
      'objectType': 'article',
      'url': t,
      'displayName': t,
    } for t in util.extract_links(getattr(thing, 'selftext', None))]
  elif type == 'comment':
    obj['content'] = getattr(thing, 'body_html', None)
    obj['objectType'] = 'comment'
    reply_to = thing.parent()
    if reply_to:
      obj['inReplyTo'] = [{
        'id': self.tag_uri(getattr(reply_to, 'id', None)),
        'url': self.BASE_URL + getattr(reply_to, 'permalink', None),
      }]

  return self.postprocess_object(obj)
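
# Usage sketch (illustrative). praw.Reddit's keyword arguments below are the
# real ones; the credentials, the post id, and the `source` converter
# instance are placeholders.
import praw

reddit = praw.Reddit(client_id='...', client_secret='...',
                     user_agent='granary example')
submission = reddit.submission(id='39zje0')  # hypothetical post id
obj = source.praw_to_object(submission, 'submission')
# accessing submission.title etc. inside the conversion is what triggers
# praw's lazy network fetch that the docstring warns about.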
def praw_to_object(self, thing, type):
  """Converts a praw object to an object.

  Currently only returns public content.

  Args:
    thing: a praw object, Submission or Comment
    type: string to denote whether to get submission or comment content

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  id = thing.id
  if not id:
    return {}

  published = util.maybe_timestamp_to_iso8601(thing.created_utc)

  obj = {
    'id': self.tag_uri(id),
    'published': published,
    'to': [{
      'objectType': 'group',
      'alias': '@public',
    }],
  }

  user = thing.author
  if user:
    obj['author'] = self.praw_to_actor(user)
    username = obj['author'].get('username')

  obj['url'] = self.BASE_URL + thing.permalink

  if type == 'submission':
    obj['content'] = thing.title
    obj['objectType'] = 'note'
    obj['tags'] = [{
      'objectType': 'article',
      'url': t,
      'displayName': t,
    } for t in util.extract_links(thing.selftext)]
  elif type == 'comment':
    obj['content'] = thing.body
    obj['objectType'] = 'comment'
    reply_to = thing.parent()
    if reply_to:
      obj['inReplyTo'] = [{
        'id': self.tag_uri(reply_to.id),
        'url': self.BASE_URL + reply_to.permalink,
      }]

  return self.postprocess_object(obj)
def user_to_actor(self, user):
  """Converts a dict user to an actor.

  Args:
    user: JSON user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  username = user.get('name')
  if not username:
    return {}

  # trying my best to grab all the urls from the profile description
  description = ''
  subreddit = user.get('subreddit')
  if subreddit:
    user_url = self.BASE_URL + subreddit.get('url')
    urls = [user_url]
    description = subreddit.get('public_description')
    profile_urls = util.extract_links(description)
    urls += util.trim_nulls(profile_urls)
  else:
    urls = [self.BASE_URL + '/user/' + username]

  image = user.get('icon_img')

  return util.trim_nulls({
    'objectType': 'person',
    'displayName': username,
    'image': {'url': image},
    'id': self.tag_uri(username),
    # numeric_id is our own custom field that always has the source's numeric
    # user id, if available.
    'numeric_id': user.get('id'),
    'published': util.maybe_timestamp_to_iso8601(user.get('created_utc')),
    'url': urls[0],
    'urls': [{'value': u} for u in urls] if len(urls) > 1 else None,
    'username': username,
    'description': description,
  })
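
# Illustration (invented values): a redditor whose profile is exposed as a
# "subreddit" object. `source` stands for the converter instance (assumed).
user = {
  'name': 'alice',
  'id': 'abc123',
  'created_utc': 1576879425,
  'icon_img': 'https://styles.redditmedia.com/alice.png',
  'subreddit': {
    'url': '/user/alice/',
    'public_description': 'I write at https://alice.example.com',
  },
}
actor = source.user_to_actor(user)
# actor['url'] is the reddit profile URL; actor['urls'] also carries the
# https://alice.example.com link pulled out of the description.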
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
    'objectType': 'person',
  }
  if not id or not username:
    return actor

  urls = [self.user_url(username)] + sum(
    (util.extract_links(user.get(field)) for field in ('website', 'bio')), [])
  actor.update({
    'url': urls[0],
    'urls': [{'value': u} for u in urls] if len(urls) > 1 else None,
  })

  private = user.get('is_private')
  if private is not None:
    actor['to'] = [{
      'objectType': 'group',
      'alias': '@private' if private else '@public',
    }]

  pic_url = user.get('profile_picture') or user.get('profile_pic_url') or ''
  actor.update({
    'displayName': user.get('full_name') or username,
    'image': {'url': pic_url.replace('\/', '/')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
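
# Illustration (invented values) of the privacy mapping and URL ordering.
# `ig` stands for the converter instance (assumed).
actor = ig.user_to_actor({
  'id': '420',
  'username': 'alice',
  'full_name': 'Alice',
  'bio': 'photos. more at https://alice.example.com',
  'is_private': False,
})
# actor['to'] == [{'objectType': 'group', 'alias': '@public'}], and
# actor['urls'] lists the profile URL first, then the bio link.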
def user_to_actor(self, account):
  """Converts a Mastodon account to an AS1 actor.

  Args:
    account: dict, Mastodon account

  Returns:
    dict, AS1 actor
  """
  domain = self.DOMAIN
  username = account.get('username')

  # parse acct. it's just username for local accounts but fully qualified
  # address for remote accounts, eg [email protected].
  acct = account.get('acct') or ''
  split = acct.split('@')
  if len(split) in (2, 3):
    acct_username, acct_domain = split[-2:]
    if acct_domain:
      domain = acct_domain
    if not username:
      username = acct_username
    elif acct_username and username != acct_username:
      raise ValueError('username %s and acct %s conflict!' % (username, acct))

  if not username:
    return {}

  url = account.get('url')
  # mastodon's 'Web site' fields are HTML links, so extract their URLs
  web_sites = sum((util.extract_links(f.get('value'))
                   for f in (account.get('fields') or [])), [])

  # account.created_at is string ISO8601 in Mastodon, int timestamp in Pixelfed
  published = account.get('created_at')
  if util.is_int(published) or util.is_float(published):
    published = util.maybe_timestamp_to_iso8601(published)

  return util.trim_nulls({
    'objectType': 'person',
    'id': util.tag_uri(domain, username),
    'numeric_id': account.get('id'),
    'username': username,
    'displayName': account.get('display_name') or acct or username,
    'url': url,
    'urls': [{'value': u} for u in [url] + web_sites],
    'image': {'url': account.get('avatar')},
    'published': published,
    'description': account.get('note'),
  })
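
# Illustration (invented values): a remote account, where acct carries the
# domain that ends up in the tag URI. `m` is the converter instance (assumed).
actor = m.user_to_actor({
  'id': '123',
  'username': 'alice',
  'acct': 'alice@remote.example',
  'display_name': 'Alice',
  'url': 'https://remote.example/@alice',
  'created_at': '2017-04-03T00:00:00Z',
  'fields': [{'name': 'Web site',
              'value': '<a href="https://alice.example.com">alice.example.com</a>'}],
})
# actor['id'] is tagged with remote.example (from acct), not m.DOMAIN, and
# actor['urls'] holds the profile URL plus the extracted web site link.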
def user_to_actor(cls, user):
  """Converts a GitHub user to an ActivityStreams actor.

  Handles both v4 GraphQL and v3 REST API user objects.

  https://developer.github.com/v4/object/user/
  https://developer.github.com/v3/users/

  Args:
    user: dict, decoded JSON GitHub user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  actor = cls._to_object(user)
  if not actor:
    return actor

  username = user.get('login')
  desc = user.get('bio') or user.get('description')
  actor.update({
    # TODO: orgs, bots
    'objectType': 'person',
    'displayName': user.get('name') or username,
    'username': username,
    'email': user.get('email'),
    'description': desc,
    'summary': desc,
    'image': {'url': user.get('avatarUrl') or user.get('avatar_url') or
                     user.get('url')},
    'location': {'displayName': user.get('location')},
  })

  # extract web site links. extract_links uniquifies and preserves order
  urls = sum((util.extract_links(user.get(field)) for field in (
    'html_url',    # REST
    'url',         # both
    'websiteUrl',  # GraphQL
    'blog',        # REST
    'bio',         # both
  )), [])
  urls = [u for u in urls if util.domain_from_link(u) != 'api.github.com']
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]

  return cls.postprocess_object(actor)
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  def article_urls(field):
    return set(util.trim_nulls(a.get('url') for a in obj.get(field, [])
                               if a.get('objectType') == 'article'))

  attachments = article_urls('attachments')
  tags = article_urls('tags')
  urls = attachments | set(util.extract_links(content))

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  #
  # We consider them an explicit original post link, so we store them in
  # upstreamDuplicates to signal that.
  # http://activitystrea.ms/specs/json/1.0/#id-comparison
  for match in Source._PERMASHORTCITATION_RE.finditer(content):
    http = match.expand(r'http://\1/\2')
    https = match.expand(r'https://\1/\2')
    uds = obj.setdefault('upstreamDuplicates', [])
    if (http not in uds and https not in uds
        # heuristic: ellipsized URLs are probably incomplete, so omit them.
        and not http.endswith('...') and not http.endswith(u'…')):
      uds.append(http)

  obj.setdefault('tags', []).extend(
    {'objectType': 'article', 'url': u} for u in urls
    # same heuristic from above
    if not u.endswith('...') and not u.endswith(u'…'))

  return activity
def original_post_discovery(activity):
  """Discovers original post links and stores them as tags, in place.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it.

  Args:
    activity: activity dict
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  def article_urls(field):
    return set(util.trim_nulls(a.get('url') for a in obj.get(field, [])
                               if a.get('objectType') == 'article'))

  attachments = article_urls('attachments')
  tags = article_urls('tags')
  urls = attachments | set(util.extract_links(content))

  # Permashortcitations are short references to canonical copies of a given
  # (usually syndicated) post, of the form (DOMAIN PATH). Details:
  # http://indiewebcamp.com/permashortcitation
  #
  # We consider them an explicit original post link, so we store them in
  # upstreamDuplicates to signal that.
  # http://activitystrea.ms/specs/json/1.0/#id-comparison
  for match in Source._PERMASHORTCITATION_RE.finditer(content):
    http = match.expand(r'http://\1/\2')
    https = match.expand(r'https://\1/\2')
    uds = obj.setdefault('upstreamDuplicates', [])
    if http not in uds and https not in uds:
      uds.append(http)

  obj.setdefault('tags', []).extend(
    {'objectType': 'article', 'url': u} for u in urls
    # heuristic: ellipsized URLs are probably incomplete, so omit them.
    if not u.endswith('...') and not u.endswith(u'…'))

  return activity
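
# Illustration (invented values) of where each kind of link ends up in this
# revision, assuming the permashortcitation regex matches the form shown.
activity = {'object': {
  'content': 'new post! https://alice.example.com/full (alice.example.com p123)',
}}
original_post_discovery(activity)
obj = activity['object']
# the explicit link becomes an article tag:
#   obj['tags'] -> [{'objectType': 'article', 'url': 'https://alice.example.com/full'}]
# the (DOMAIN PATH) permashortcitation is treated as an explicit original post
# reference and lands in obj['upstreamDuplicates']:
#   ['http://alice.example.com/p123']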
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
    'objectType': 'person',
  }
  if not id or not username:
    return actor

  urls = sum((util.extract_links(user.get(field))
              for field in ('website', 'bio')), [])
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]
  else:
    actor['url'] = self.user_url(username)

  actor.update({
    'displayName': user.get('full_name') or username,
    'image': {'url': user.get('profile_picture')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: JSON object from the Instagram API

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  actor = {
    'id': self.tag_uri(id or username),
    'username': username,
  }
  if not id or not username:
    return actor

  urls = sum((util.extract_links(user.get(field))
              for field in ('website', 'bio')), [])
  if urls:
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]
  else:
    actor['url'] = self.user_url(username)

  actor.update({
    'objectType': 'person',
    'displayName': user.get('full_name') or username,
    'image': {'url': user.get('profile_picture')},
    'description': user.get('bio'),
  })

  return util.trim_nulls(actor)
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
  """Discovers original post links.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it. More background:
  https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

  Original post candidates come from the upstreamDuplicates, attachments, and
  tags fields, as well as links and permashortlinks/permashortcitations in the
  text content.

  Args:
    activity: activity dict
    domains: optional sequence of domains. If provided, only links to these
      domains will be considered original and stored in upstreamDuplicates.
      (Permashortcitations are exempt.)
    cache: optional, a cache object for storing resolved URL redirects. Passed
      to follow_redirects().
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    kwargs: passed to requests.head() when following redirects

  Returns:
    ([string original post URLs], [string mention URLs]) tuple
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # find all candidate URLs
  tags = [t.get('url') for t in obj.get('attachments', []) + obj.get('tags', [])
          if t.get('objectType') in ('article', 'mention', None)]
  candidates = tags + util.extract_links(content) + obj.get('upstreamDuplicates', [])

  # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
  # references to canonical copies of a given (usually syndicated) post, of
  # the form (DOMAIN PATH). We consider them an explicit original post link.
  candidates += [match.expand(r'http://\1/\2') for match in
                 Source._PERMASHORTCITATION_RE.finditer(content)]

  candidates = set(filter(None,
    (util.clean_url(url) for url in candidates
     # heuristic: ellipsized URLs are probably incomplete, so omit them.
     if url and not url.endswith('...') and not url.endswith(u'…'))))

  # check for redirects and add their final URLs
  redirects = {}  # maps final URL to original URL for redirects
  for url in list(candidates):
    resolved = util.follow_redirects(url, cache=cache, **kwargs)
    if (resolved.url != url and
        resolved.headers.get('content-type', '').startswith('text/html')):
      redirects[resolved.url] = url
      candidates.add(resolved.url)

  # use domains to determine which URLs are original post links vs mentions
  originals = set()
  mentions = set()
  for url in util.dedupe_urls(candidates):
    if url in redirects.values():
      # this is a redirected original URL. postpone and handle it when we hit
      # its final URL so that we know the final domain.
      continue
    domain = util.domain_from_link(url)
    which = (originals if not domains or util.domain_or_parent_in(domain, domains)
             else mentions)
    which.add(url)
    redirected_from = redirects.get(url)
    if redirected_from and include_redirect_sources:
      which.add(redirected_from)

  logging.info('Original post discovery found original posts %s, mentions %s',
               originals, mentions)
  return originals, mentions
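
# Illustration (invented values). Note that each candidate URL is resolved
# via util.follow_redirects, so this version hits the network unless cached.
activity = {'object': {
  'content': 'wrote this up at https://alice.example.com/post, '
             'cc https://other.example/thing',
}}
originals, mentions = original_post_discovery(
  activity, domains=['alice.example.com'])
# links whose domain is in (or under) `domains` are originals, the rest are
# mentions:
#   originals -> {'https://alice.example.com/post'}
#   mentions  -> {'https://other.example/thing'}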
def _create(self, obj, preview=None, include_link=False):
  """Creates or previews creating a tweet, reply tweet, retweet, or favorite.

  https://dev.twitter.com/docs/api/1.1/post/statuses/update
  https://dev.twitter.com/docs/api/1.1/post/statuses/retweet/:id
  https://dev.twitter.com/docs/api/1.1/post/favorites/create

  Args:
    obj: ActivityStreams object
    preview: boolean
    include_link: boolean

  Returns:
    a CreationResult

    If preview is True, the content will be a unicode string HTML snippet.
    If False, it will be a dict with 'id' and 'url' keys for the newly
    created Twitter object.
  """
  # TODO: validation, error handling
  assert preview in (False, True)
  type = obj.get('objectType')
  verb = obj.get('verb')
  base_id, base_url = self.base_object(obj)

  content = self._content_for_create(obj)
  if not content:
    if type == 'activity':
      content = verb
    else:
      return source.creation_result(
        abort=False,  # keep looking for things to publish,
        error_plain='No content text found.',
        error_html='No content text found.')

  is_reply = type == 'comment' or 'inReplyTo' in obj
  if is_reply and base_url:
    # extract username from in-reply-to URL so we can @-mention it, if it's
    # not already @-mentioned, since Twitter requires that to make our new
    # tweet a reply.
    # https://dev.twitter.com/docs/api/1.1/post/statuses/update#api-param-in_reply_to_status_id
    # TODO: this doesn't handle an in-reply-to username that's a prefix of
    # another username already mentioned, e.g. in reply to @foo when content
    # includes @foobar.
    parsed = urlparse.urlparse(base_url)
    parts = parsed.path.split('/')
    if len(parts) < 2 or not parts[1]:
      raise ValueError('Could not determine author of in-reply-to URL %s' % base_url)
    mention = '@' + parts[1]
    if mention not in content:
      content = mention + ' ' + content

    # the embed URL in the preview can't start with mobile. or www., so just
    # hard-code it to twitter.com. index #1 is netloc.
    parsed = list(parsed)
    parsed[1] = self.DOMAIN
    base_url = urlparse.urlunparse(parsed)

  # need a base_url with the tweet id for the embed HTML below. do this
  # *after* checking the real base_url for in-reply-to author username.
  if base_id and not base_url:
    base_url = 'https://twitter.com/-/statuses/' + base_id

  # truncate and ellipsize content if it's over the character count. URLs will
  # be t.co-wrapped, so include that when counting.
  links = set(util.extract_links(content))
  max = MAX_TWEET_LENGTH
  include_url = obj.get('url') if include_link else None
  if include_url:
    max -= TCO_LENGTH + 3

  length = 0
  tokens = content.split()
  for i, token in enumerate(tokens):
    # extract_links() strips trailing slashes from URLs, so do the same here
    # so we can compare.
    as_url = token[:-1] if token.endswith('/') else token
    length += (TCO_LENGTH if as_url in links else len(token))
    if i > 0:
      length += 1  # space between tokens
    if length > max:
      break
  else:
    i = len(tokens)

  # normalize whitespace
  # TODO: user opt in to preserve original whitespace (newlines, etc)
  content = ' '.join(tokens[:i])
  if i < len(tokens):
    content += u'…'
  if include_url:
    content += ' (%s)' % include_url
  # linkify defaults to Twitter's link shortening behavior
  preview_content = util.linkify(content, pretty=True)

  if is_reply:
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to reply to.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/reply">reply to</a>. '
        'Check that your post has an <a href="http://indiewebcamp.com/comment">in-reply-to</a> '
        'link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">@-reply</span>:<br /><br />\n<em>%s</em>\n'
        '<br /><br />...to <a href="%s">this tweet</a>:\n%s' %
        (preview_content, base_url, EMBED_TWEET % base_url))
    else:
      content = unicode(content).encode('utf-8')
      data = urllib.urlencode({'status': content,
                               'in_reply_to_status_id': base_id})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'comment'

  elif type == 'activity' and verb == 'like':
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to like.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/favorite">favorite</a>. '
        'Check that your post has a like-of link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">favorite</span> <a href="%s">this tweet</a>:\n%s' %
        (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      self.urlopen(API_POST_FAVORITE_URL, data=data).read()
      resp = {'type': 'like'}

  elif type == 'activity' and verb == 'share':
    if not base_url:
      return source.creation_result(
        abort=True,
        error_plain='Could not find a tweet to retweet.',
        error_html='Could not find a tweet to <a href="http://indiewebcamp.com/repost">retweet</a>. '
        'Check that your post has a repost-of link to a Twitter URL or to an original post that publishes a '
        '<a href="http://indiewebcamp.com/rel-syndication">rel-syndication</a> link to Twitter.')
    if preview:
      return source.creation_result(
        'will <span class="verb">retweet</span> <a href="%s">this tweet</a>:\n%s' %
        (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      resp = json.loads(self.urlopen(API_POST_RETWEET_URL % base_id,
                                     data=data).read())
      resp['type'] = 'repost'

  elif type in ('note', 'article') and obj.get('image'):
    image_url = obj.get('image').get('url')
    if preview:
      return source.creation_result(
        'will <span class="verb">tweet</span> with photo:<br /><br />'
        '<em>%s</em><br /><img src="%s"/><br />' % (preview_content, image_url))
    else:
      content = unicode(content).encode('utf-8')
      data = {'status': content}
      files = {'media[]': urllib2.urlopen(image_url)}
      headers = twitter_auth.auth_header(API_POST_MEDIA_URL,
                                         self.access_token_key,
                                         self.access_token_secret, 'POST')
      resp = json.loads(requests.post(API_POST_MEDIA_URL, data=data,
                                      files=files, headers=headers,
                                      timeout=HTTP_TIMEOUT).text)
      resp['type'] = 'post'

  elif type in ('note', 'article'):
    if preview:
      return source.creation_result(
        'will <span class="verb">tweet</span>:<br /><br />'
        '<em>%s</em><br />' % preview_content)
    else:
      content = unicode(content).encode('utf-8')
      data = urllib.urlencode({'status': content})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'post'

  elif (verb and verb.startswith('rsvp-')) or verb == 'invite':
    return source.creation_result(
      abort=True,
      error_plain='Cannot publish RSVPs to Twitter.',
      error_html='This looks like an <a href="http://indiewebcamp.com/rsvp">RSVP</a>. '
      'Publishing events or RSVPs to Twitter is not supported.')

  else:
    return source.creation_result(
      abort=False,
      error_plain='Cannot publish type=%s, verb=%s to Twitter' % (type, verb),
      error_html='Cannot publish type=%s, verb=%s to Twitter' % (type, verb))

  id_str = resp.get('id_str')
  if id_str:
    resp.update({'id': id_str, 'url': self.tweet_url(resp)})
  elif 'url' not in resp:
    resp['url'] = base_url

  return source.creation_result(resp)
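
# Usage sketch (illustrative). `tw` stands for an authenticated instance of
# the class defining _create above; calling the private method directly is
# just for demonstration, since public wrappers normally front it.
reply = {
  'objectType': 'comment',
  'content': 'great post!',
  'inReplyTo': [{'url': 'https://twitter.com/alice/status/123'}],
}
preview = tw._create(reply, preview=True)   # HTML: "will @-reply ... @alice"
result = tw._create(reply, preview=False)   # posts '@alice great post!' and
                                            # returns a CreationResult whose
                                            # content carries 'id' and 'url'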
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery .
    It differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in
    the text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects.
        Passed to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
    obj = activity.get("object") or activity
    content = obj.get("content", "").strip()

    # find all candidate URLs
    tags = [
        t.get("url")
        for t in obj.get("attachments", []) + obj.get("tags", [])
        if t.get("objectType") in ("article", "mention", None)
    ]
    candidates = tags + util.extract_links(content) + obj.get("upstreamDuplicates", [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are
    # short references to canonical copies of a given (usually syndicated)
    # post, of the form (DOMAIN PATH). We consider them an explicit original
    # post link.
    candidates += [
        match.expand(r"http://\1/\2")
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]

    candidates = set(
        filter(
            None,
            (
                util.clean_url(url)
                for url in candidates
                # heuristic: ellipsized URLs are probably incomplete, so omit them.
                if url and not url.endswith("...") and not url.endswith(u"…")
            ),
        )
    )

    # check for redirects and add their final URLs
    redirects = {}  # maps final URL to original URL for redirects
    for url in list(candidates):
        resolved = follow_redirects(url, cache=cache, **kwargs)
        if resolved.url != url and resolved.headers.get("content-type", "").startswith("text/html"):
            redirects[resolved.url] = url
            candidates.add(resolved.url)

    # use domains to determine which URLs are original post links vs mentions
    originals = set()
    mentions = set()
    for url in util.dedupe_urls(candidates):
        if url in redirects.values():
            # this is a redirected original URL. postpone and handle it when we
            # hit its final URL so that we know the final domain.
            continue
        which = originals if not domains or util.domain_from_link(url) in domains else mentions
        which.add(url)
        redirected_from = redirects.get(url)
        if redirected_from and include_redirect_sources:
            which.add(redirected_from)

    logging.info("Original post discovery found original posts %s, mentions %s", originals, mentions)
    return originals, mentions
def _create(self, obj, preview=None, include_link=False):
  """Creates or previews creating a tweet, reply tweet, retweet, or favorite.

  https://dev.twitter.com/docs/api/1.1/post/statuses/update
  https://dev.twitter.com/docs/api/1.1/post/statuses/retweet/:id
  https://dev.twitter.com/docs/api/1.1/post/favorites/create

  Args:
    obj: ActivityStreams object
    preview: boolean
    include_link: boolean

  Returns:
    If preview is True, a string HTML snippet. If False, a dict with 'id' and
    'url' keys for the newly created Twitter object.
  """
  # TODO: validation, error handling
  assert preview in (False, True)
  type = obj.get('objectType')
  verb = obj.get('verb')
  base_id, base_url = self.base_object(obj)
  content = obj.get('content', '').strip()

  is_reply = (type == 'comment' or 'inReplyTo' in obj) and base_url
  if is_reply:
    # extract username from in-reply-to URL so we can @-mention it, if it's
    # not already @-mentioned, since Twitter requires that to make our new
    # tweet a reply.
    # https://dev.twitter.com/docs/api/1.1/post/statuses/update#api-param-in_reply_to_status_id
    # TODO: this doesn't handle an in-reply-to username that's a prefix of
    # another username already mentioned, e.g. in reply to @foo when content
    # includes @foobar.
    parsed = urlparse.urlparse(base_url)
    parts = parsed.path.split('/')
    if len(parts) < 2 or not parts[1]:
      raise ValueError('Could not determine author of in-reply-to URL %s' % base_url)
    mention = '@' + parts[1]
    if mention not in content:
      content = mention + ' ' + content

    # the embed URL in the preview can't start with mobile. or www., so just
    # hard-code it to twitter.com. index #1 is netloc.
    parsed = list(parsed)
    parsed[1] = self.DOMAIN
    base_url = urlparse.urlunparse(parsed)

  # need a base_url with the tweet id for the embed HTML below. do this
  # *after* checking the real base_url for in-reply-to author username.
  if base_id and not base_url:
    base_url = 'https://twitter.com/-/statuses/' + base_id

  # truncate and ellipsize content if it's over the character count. URLs will
  # be t.co-wrapped, so include that when counting.
  links = set(util.extract_links(content))
  max = MAX_TWEET_LENGTH
  include_url = obj.get('url') if include_link else None
  if include_url:
    max -= TCO_LENGTH + 3

  length = 0
  tokens = content.split()
  for i, token in enumerate(tokens):
    # extract_links() strips trailing slashes from URLs, so do the same here
    # so we can compare.
    as_url = token[:-1] if token.endswith('/') else token
    length += (TCO_LENGTH if as_url in links else len(token))
    if i > 0:
      length += 1  # space between tokens
    if length > max:
      break
  else:
    i = len(tokens)

  # normalize whitespace
  # TODO: user opt in to preserve original whitespace (newlines, etc)
  content = ' '.join(tokens[:i])
  if i < len(tokens):
    content += u'…'
  if include_url:
    content += ' (%s)' % include_url
  content = unicode(content).encode('utf-8')
  # linkify defaults to Twitter's link shortening behavior
  preview_content = util.linkify(content, pretty=True)

  if is_reply:
    if preview:
      return ('will <span class="verb">@-reply</span>:<br /><br />\n<em>%s</em>\n'
              '<br /><br />...to <a href="%s">this tweet</a>:\n%s' %
              (preview_content, base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'status': content,
                               'in_reply_to_status_id': base_id})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'comment'
  elif type == 'activity' and verb == 'like':
    if preview:
      return ('will <span class="verb">favorite</span> <a href="%s">this tweet</a>:\n%s' %
              (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      self.urlopen(API_POST_FAVORITE_URL, data=data).read()
      resp = {'type': 'like'}
  elif type == 'activity' and verb == 'share':
    if preview:
      return ('will <span class="verb">retweet</span> <a href="%s">this tweet</a>:\n%s' %
              (base_url, EMBED_TWEET % base_url))
    else:
      data = urllib.urlencode({'id': base_id})
      resp = json.loads(self.urlopen(API_POST_RETWEET_URL % base_id,
                                     data=data).read())
      resp['type'] = 'repost'
  elif type in ('note', 'article', 'comment'):
    if preview:
      return ('will <span class="verb">tweet</span>:<br /><br />'
              '<em>%s</em><br />' % preview_content)
    else:
      data = urllib.urlencode({'status': content})
      resp = json.loads(self.urlopen(API_POST_TWEET_URL, data=data).read())
      resp['type'] = 'post'
  else:
    raise NotImplementedError()

  id_str = resp.get('id_str')
  if id_str:
    resp.update({'id': id_str, 'url': self.tweet_url(resp)})
  elif 'url' not in resp:
    resp['url'] = base_url

  return resp