def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's *attending fields.

    Emits one RSVP activity per actor found in each event collection named
    in RSVP_TO_EVENT.

    Args:
      event: ActivityStreams event object

    Returns:
      sequence of ActivityStreams RSVP activity objects. Empty if the event
      has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get("id")
    if not event_tag:
        return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
        return []
    domain, event_id = parsed
    url = event.get("url")

    rsvps = []
    for verb, field in RSVP_TO_EVENT.items():
        for actor in event.get(field, []):
            rsvp = {"objectType": "activity", "verb": verb, "actor": actor, "url": url}
            if event_id and "id" in actor:
                # parse_tag_uri() can return a falsy value (the event id check
                # above relies on that), so don't unpack its result blindly.
                parsed_actor = util.parse_tag_uri(actor["id"])
                if parsed_actor:
                    actor_id = parsed_actor[1]
                    rsvp["id"] = util.tag_uri(
                        domain, "%s_rsvp_%s" % (event_id, actor_id))
                    if url:
                        # per-actor URL: event URL plus the actor id fragment
                        rsvp["url"] = "#".join((url, actor_id))
            rsvps.append(rsvp)
    return rsvps
def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's *attending fields.

    Emits one RSVP activity per actor found in each event collection named
    in RSVP_TO_EVENT.

    Args:
      event: ActivityStreams event object

    Returns:
      sequence of ActivityStreams RSVP activity objects. Empty if the event
      has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
        return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
        return []
    domain, event_id = parsed

    rsvps = []
    for verb, field in RSVP_TO_EVENT.items():
        for actor in event.get(field, []):
            rsvp = {
                'objectType': 'activity',
                'verb': verb,
                'actor': actor,
            }
            if event_id and 'id' in actor:
                # parse_tag_uri() can return a falsy value (the event id check
                # above relies on that), so don't unpack its result blindly.
                parsed_actor = util.parse_tag_uri(actor['id'])
                if parsed_actor:
                    rsvp['id'] = util.tag_uri(
                        domain, '%s_rsvp_%s' % (event_id, parsed_actor[1]))
            rsvps.append(rsvp)
    return rsvps
def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's *attending fields.

    Emits one RSVP activity per actor found in each event collection named
    in RSVP_TO_EVENT.

    Args:
      event: ActivityStreams event object

    Returns:
      sequence of ActivityStreams RSVP activity objects. Empty if the event
      has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
        return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
        return []
    domain, event_id = parsed
    url = event.get('url')

    rsvps = []
    for verb, field in RSVP_TO_EVENT.items():
        for actor in event.get(field, []):
            rsvp = {
                'objectType': 'activity',
                'verb': verb,
                'actor': actor,
                'url': url,
            }
            if event_id and 'id' in actor:
                # parse_tag_uri() can return a falsy value (the event id check
                # above relies on that), so don't unpack its result blindly.
                parsed_actor = util.parse_tag_uri(actor['id'])
                if parsed_actor:
                    actor_id = parsed_actor[1]
                    rsvp['id'] = util.tag_uri(
                        domain, '%s_rsvp_%s' % (event_id, actor_id))
                    if url:
                        # per-actor URL: event URL plus the actor id fragment
                        rsvp['url'] = '#'.join((url, actor_id))
            rsvps.append(rsvp)
    return rsvps
def base_object(self, obj):
    """Returns id and URL of the 'base' silo object that an object operates on.

    For example, if the object is a comment, this returns the post that it's a
    comment on. If it's an RSVP, this returns the event. The id in the returned
    tuple is silo-specific, ie not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns:
      (string id, string URL) tuple. Both may be None. If the base object's id
      is not a tag URI, it is returned unchanged.
    """
    # look at in-reply-tos first, then objects (for likes and reposts).
    # technically, the ActivityStreams 'object' field is always supposed to be
    # singular, but microformats2.json_to_object() sometimes returns activities
    # that have a list value, e.g. likes or reposts of multiple objects.
    candidates = []
    for field in ('inReplyTo', 'object'):
        objs = obj.get(field, [])
        if isinstance(objs, dict):
            candidates.append(objs)
        else:
            candidates += objs

    # pick the first candidate whose domain (from its tag URI id, else from its
    # URL host minus a leading www./mobile.) matches this silo.
    for base_obj in candidates:
        parsed_id = util.parse_tag_uri(base_obj.get('id', ''))
        if parsed_id:
            domain = parsed_id[0]
        else:
            domain = urlparse.urlparse(base_obj.get('url', '')).netloc
        for subdomain in 'www.', 'mobile.':
            if domain.startswith(subdomain):
                domain = domain[len(subdomain):]
        if domain == self.DOMAIN:
            break
    else:
        # no candidate belongs to this silo
        return (None, None)

    id = base_obj.get('id')
    url = base_obj.get('url')
    if id:
        # parsed_id still refers to the matched candidate. the match may have
        # come from the URL rather than the id, so only strip the tag URI
        # wrapper when the id actually parsed, instead of raising TypeError.
        if parsed_id:
            id = parsed_id[1]
    elif url:
        # fall back to the last path segment of the URL
        path = urlparse.urlparse(url).path
        if path.endswith('/'):
            path = path[:-1]
        id = path.rsplit('/', 1)[-1]

    return (id, url)
def base_object(self, obj):
    """Finds the 'base' silo object that obj acts on.

    A comment's base object is the post it replies to; an RSVP's is the event.
    The returned object's id is the silo-specific id, not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, minimal ActivityStreams object. Usually has at least id; may also
      have url, author, etc. Empty dict if no candidate is on self.DOMAIN.
    """
    # inReplyTo values take priority over object values (likes, reposts).
    # 'object' is nominally singular in ActivityStreams, but
    # microformats2.json_to_object() sometimes produces a list, e.g. for likes
    # or reposts of multiple objects, so accept both shapes.
    candidates = []
    for field in ('inReplyTo', 'object'):
        value = obj.get(field, [])
        candidates.extend([value] if isinstance(value, dict) else value)

    match = None
    for candidate in candidates:
        tag = util.parse_tag_uri(candidate.get('id', ''))
        domain = (tag[0] if tag
                  else util.domain_from_link(candidate.get('url', '')))
        if domain == self.DOMAIN:
            match = candidate
            break

    if match is None:
        return {}

    # work on a copy so the caller's object is left untouched
    result = copy.deepcopy(match)
    tag_id = result.get('id')
    link = result.get('url')
    if tag_id:
        pieces = util.parse_tag_uri(tag_id)
        if pieces:
            result['id'] = pieces[1]
    elif link:
        result['id'] = self.base_id(link)

    return result
def base_object(self, obj):
    """Finds the 'base' silo object that obj acts on.

    A comment's base object is the post it replies to; an RSVP's is the event.
    The returned object's id is the silo-specific id, not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, minimal ActivityStreams object. Usually has at least id; may also
      have url, author, etc. Empty dict if no candidate is on self.DOMAIN.
    """
    def domain_of(candidate):
        # prefer the domain inside a tag URI id; otherwise use the URL's domain
        tag = util.parse_tag_uri(candidate.get("id", ""))
        if tag:
            return tag[0]
        return util.domain_from_link(candidate.get("url", ""))

    # inReplyTo values take priority over object values (likes, reposts).
    # 'object' is nominally singular in ActivityStreams, but
    # microformats2.json_to_object() sometimes produces a list, e.g. for likes
    # or reposts of multiple objects, so accept both shapes.
    candidates = []
    for field in ("inReplyTo", "object"):
        value = obj.get(field, [])
        if isinstance(value, dict):
            value = [value]
        candidates += value

    found = next((c for c in candidates if domain_of(c) == self.DOMAIN), None)
    if found is None:
        return {}

    # work on a copy so the caller's object is left untouched
    result = copy.deepcopy(found)
    raw_id = result.get("id")
    link = result.get("url")
    if raw_id:
        tag = util.parse_tag_uri(raw_id)
        if tag:
            result["id"] = tag[1]
    elif link:
        # last path segment of the URL, ignoring trailing slashes
        trimmed = urlparse.urlparse(link).path.rstrip("/")
        result["id"] = trimmed.rsplit("/", 1)[-1]

    return result
def base_object(self, obj):
    """Returns id and URL of the 'base' silo object that an object operates on.

    For example, if the object is a comment, this returns the post that it's a
    comment on. If it's an RSVP, this returns the event. The id in the returned
    tuple is silo-specific, ie not a tag URI.

    Args:
      obj: ActivityStreams object

    Returns:
      (string id, string URL) tuple. Both may be None. If the base object's id
      is not a tag URI, it is returned unchanged.
    """
    # the first in-reply-to wins; otherwise fall back to the object field
    reply_to = obj.get('inReplyTo')
    base_obj = reply_to[0] if reply_to else obj.get('object')
    if not base_obj:
        return (None, None)

    id = base_obj.get('id')
    url = base_obj.get('url')
    if id:
        # only strip the tag URI wrapper when the id actually is a tag URI;
        # subscripting an unparsed (falsy) result would raise TypeError.
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
    elif url:
        # fall back to the last path segment of the URL
        path = urlparse.urlparse(url).path
        if path.endswith('/'):
            path = path[:-1]
        id = path.rsplit('/', 1)[-1]

    return (id, url)
def merge_scraped_reactions(self, scraped, activity):
    """Converts scraped likes and merges them into an activity.

    Likes found under data.shortcode_media.edge_liked_by.edges are converted
    with like_to_object() and merged into the activity object's 'tags' via
    source.merge_by_id().

    Args:
      scraped: str or dict, scraped JSON likes
      activity: dict, AS activity to merge these reactions into

    Returns:
      list of dict AS like tag objects converted from scraped. Empty if
      scraped has no shortcode_media.

    Raises:
      ValueError: if scraped is not valid JSON (presumably raised by
        json_loads; verify)
    """
    if isinstance(scraped, str):
        scraped = json_loads(scraped)

    media = scraped.get('data', {}).get('shortcode_media', {})
    if not media:
        return []

    activity_id = util.parse_tag_uri(activity['id'])[1]
    media_url = self.media_url(media['shortcode'])

    likes = []
    for edge in media.get('edge_liked_by', {}).get('edges', []):
        node = edge.get('node', {})
        likes.append(self.like_to_object(node, activity_id, media_url))

    source.merge_by_id(activity['object'], 'tags', likes)
    return likes
def fetch_replies(self, activities, min_id=None):
    """Fetches and injects Twitter replies into a list of activities, in place.

    Includes indirect replies ie reply chains, not just direct replies. Searches
    for @-mentions, matches them to the original tweets with
    in_reply_to_status_id_str, and recurses until it's walked the entire tree.

    Args:
      activities: list of activity dicts
      min_id: optional; if not None, passed to the search as since_id

    Returns:
      same activities list
    """
    # cache searches for @-mentions for individual users. maps username to the
    # 'statuses' list from that user's @-mention search response.
    mentions = {}

    # find replies
    for activity in activities:
        # list of ActivityStreams reply activities and set of seen tweet ids.
        # seed with the original tweet; we'll filter it out later.
        replies = [activity]
        _, tweet_id = util.parse_tag_uri(activity['id'])
        seen_ids = {tweet_id}

        for reply in replies:
            # get mentions of this tweet's author so we can search them for
            # replies to this tweet. can't use statuses/mentions_timeline
            # because i'd need to auth as the user being mentioned.
            # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
            #
            # note that these HTTP requests are synchronous. you can make async
            # requests by using urlfetch.fetch() directly, but not with urllib2.
            # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
            author = reply['actor']['username']
            if author not in mentions:
                url = API_SEARCH % {
                    'q': urllib.quote_plus('@' + author.encode('utf-8')),
                    'count': 100,
                }
                if min_id is not None:
                    url = util.add_query_params(url, {'since_id': min_id})
                mentions[author] = self.urlopen(url)['statuses']

            # look for replies. add any we find to the end of replies. this
            # makes us recursively follow reply chains to their end. (python
            # supports appending to a sequence while you're iterating over it.)
            for mention in mentions[author]:
                mention_id = mention['id_str']
                if (mention.get('in_reply_to_status_id_str') in seen_ids and
                        mention_id not in seen_ids):
                    replies.append(self.tweet_to_activity(mention))
                    seen_ids.add(mention_id)

        items = [r['object'] for r in replies[1:]]  # filter out seed activity
        activity['object']['replies'] = {
            'items': items,
            'totalItems': len(items),
        }

    # the docstring has always promised this; previously None was returned
    return activities
def fetch_replies(self, activities, min_id=None):
    """Fetches and injects Twitter replies into a list of activities, in place.

    Includes indirect replies ie reply chains, not just direct replies. Searches
    for @-mentions, matches them to the original tweets with
    in_reply_to_status_id_str, and recurses until it's walked the entire tree.

    Args:
      activities: list of activity dicts
      min_id: optional; if not None, passed to the search as since_id

    Returns:
      same activities list
    """
    # cache searches for @-mentions for individual users. maps username to the
    # 'statuses' list from that user's @-mention search response.
    mentions = {}

    # find replies
    for activity in activities:
        # list of ActivityStreams reply activities and set of seen tweet ids.
        # seed with the original tweet; we'll filter it out later.
        replies = [activity]
        _, tweet_id = util.parse_tag_uri(activity['id'])
        seen_ids = {tweet_id}

        for reply in replies:
            # get mentions of this tweet's author so we can search them for
            # replies to this tweet. can't use statuses/mentions_timeline
            # because i'd need to auth as the user being mentioned.
            # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
            #
            # note that these HTTP requests are synchronous. you can make async
            # requests by using urlfetch.fetch() directly, but not with urllib2.
            # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
            author = reply['actor']['username']
            if author not in mentions:
                url = API_SEARCH_URL % {
                    'q': urllib.quote_plus('@' + author),
                    'count': 100,
                }
                if min_id is not None:
                    url = util.add_query_params(url, {'since_id': min_id})
                mentions[author] = self.urlopen(url)['statuses']

            # look for replies. add any we find to the end of replies. this
            # makes us recursively follow reply chains to their end. (python
            # supports appending to a sequence while you're iterating over it.)
            for mention in mentions[author]:
                mention_id = mention['id_str']
                if (mention.get('in_reply_to_status_id_str') in seen_ids and
                        mention_id not in seen_ids):
                    replies.append(self.tweet_to_activity(mention))
                    seen_ids.add(mention_id)

        items = [r['object'] for r in replies[1:]]  # filter out seed activity
        activity['object']['replies'] = {
            'items': items,
            'totalItems': len(items),
        }

    # the docstring has always promised this; previously None was returned
    return activities
def base_object(self, obj):
    """Finds the 'base' silo object that obj acts on.

    A comment's base object is the post it replies to; an RSVP's is the event.
    The returned object's id is the silo-specific id, not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, minimal ActivityStreams object. Usually has at least id; may also
      have url, author, etc. Empty dict if no candidate is on self.DOMAIN.
    """
    # candidates in priority order: in-reply-tos, then objects (likes and
    # reposts), then targets. 'object' is nominally singular in
    # ActivityStreams, but microformats2.json_to_object() sometimes produces a
    # list, so every field is read through util.get_list().
    candidates = [
        item
        for field in ('inReplyTo', 'object', 'target')
        for item in util.get_list(obj, field)
    ]

    chosen = None
    for candidate in candidates:
        tag = util.parse_tag_uri(candidate.get('id', ''))
        if tag:
            domain = tag[0]
        else:
            domain = util.domain_from_link(candidate.get('url', ''))
        if domain == self.DOMAIN:
            chosen = candidate
            break

    if chosen is None:
        return {}

    # work on a copy so the caller's object is left untouched
    result = copy.deepcopy(chosen)
    raw_id = result.get('id')
    link = result.get('url')
    if raw_id:
        tag = util.parse_tag_uri(raw_id)
        if tag:
            result['id'] = tag[1]
    elif link:
        result['id'] = self.base_id(link)

    return result
def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's attending fields.

    Emits one RSVP activity per person found in each event collection named
    in RSVP_VERB_TO_COLLECTION. For 'invite' activities the person is the
    object and the event's author, if any, is the actor; for every other verb
    the person is the actor.

    Args:
      event: ActivityStreams event object

    Returns:
      sequence of ActivityStreams RSVP activity objects. Empty if the event
      has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
        return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
        return []
    domain, event_id = parsed
    url = event.get('url')
    author = event.get('author')

    rsvps = []
    for verb, field in RSVP_VERB_TO_COLLECTION.items():
        for actor in event.get(field, []):
            rsvp = {
                'objectType': 'activity',
                'verb': verb,
                'object' if verb == 'invite' else 'actor': actor,
                'url': url,
            }

            if event_id and 'id' in actor:
                # parse_tag_uri() can return a falsy value (the event id check
                # above relies on that), so don't unpack its result blindly.
                parsed_actor = util.parse_tag_uri(actor['id'])
                if parsed_actor:
                    actor_id = parsed_actor[1]
                    rsvp['id'] = util.tag_uri(
                        domain, '%s_rsvp_%s' % (event_id, actor_id))
                    if url:
                        # per-person URL: event URL plus the person's id
                        rsvp['url'] = '#'.join((url, actor_id))

            if verb == 'invite' and author:
                rsvp['actor'] = author

            rsvps.append(rsvp)

    return rsvps
def _postprocess_base_object(cls, obj): obj = copy.deepcopy(obj) id = obj.get('id') url = obj.get('url') if id: parsed = util.parse_tag_uri(id) if parsed: obj['id'] = parsed[1] elif url: obj['id'] = cls.base_id(url) return obj
def base_object(self, obj):
    """Extends the default base_object() to avoid using shortcodes as object ids.

    If the inherited result's id isn't all digits (ignoring underscores), it
    is dropped. It is then replaced with the part of obj's own parsed tag URI
    id before the first underscore, when obj has such an id.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, base object from the superclass, with id possibly removed or
      replaced
    """
    base_obj = super(Instagram, self).base_object(obj)

    base_id = base_obj.get('id')
    if not base_id or base_id.replace('_', '').isdigit():
        # no id, or it already looks like a numeric id; nothing to fix
        return base_obj

    # this isn't id. it's probably a shortcode.
    del base_obj['id']

    own_id = obj.get('id')
    if not own_id:
        return base_obj

    tag = util.parse_tag_uri(own_id)
    if tag and '_' in tag[1]:
        base_obj['id'] = tag[1].split('_')[0]

    return base_obj
def get_rsvp(self, activity_user_id, event_id, user_id, event=None):
    """Returns an ActivityStreams RSVP activity object.

    Args:
      activity_user_id: string id of the user who posted the event. unused.
      event_id: string event id
      user_id: string user id
      event: AS event activity (optional). Fetched with get_event() if omitted.

    Returns:
      dict, the first RSVP activity from the event whose actor or object id
      matches user_id, or None if the event can't be fetched or nothing
      matches.
    """
    if not event:
        event = self.get_event(event_id)
        if not event:
            return None

    for rsvp in self.get_rsvps_from_event(event['object']):
        # the user is the object of invites and the actor of other RSVPs
        for field in 'actor', 'object':
            tag = rsvp.get(field, {}).get('id')
            if not tag:
                continue
            # skip ids that aren't tag URIs instead of raising TypeError on
            # the subscript
            parsed = util.parse_tag_uri(tag)
            if parsed and user_id == parsed[1]:
                return rsvp

    return None
def _fetch_replies(self, r, activities): """Fetches and injects comments into a list of activities, in place. limitations: Only includes top level comments Args: r: PRAW API object for querying submissions in activities activities: list of activity dicts """ for activity in activities: subm = r.submission(id=util.parse_tag_uri(activity.get('id'))[1]) # for v0 we will use just the top level comments because threading is hard. # feature request: https://github.com/snarfed/bridgy/issues/1014 subm.comments.replace_more() replies = [] for top_level_comment in subm.comments: replies.append( self.praw_to_activity(top_level_comment, 'comment')) items = [r.get('object') for r in replies] activity['object']['replies'] = { 'items': items, 'totalItems': len(items), }
def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    Builds a source instance for the requested site from request parameters,
    fetches either the blocklist or an activities response from it, and
    writes the response.

    Raises:
      exc.HTTPNotFound: if the path is too long or the site is unknown
      exc.HTTPBadRequest: if a tag URI path element's domain doesn't match
        the source's domain
    """
    # parse path
    args = urllib.parse.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
        raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                               (MAX_PATH_LEN, len(args)))

    # NOTE(review): hard-coded rejection for one specific user id path
    # element; the reason isn't visible here. confirm it's still needed.
    if len(args) > 1 and args[1] == 'nederland20':
        return self.abort(
            401,
            'To protect our users from spam and other malicious activity, this account is temporarily locked. Please log in to https://twitter.com to unlock your account.'
        )

    # make source instance. sites with a branch here read their credentials
    # from named request params; everything else is looked up in
    # source.sources and constructed from all of the request params.
    site = args.pop(0)
    if site == 'twitter':
        src = twitter.Twitter(access_token_key=util.get_required_param(
            self, 'access_token_key'),
                              access_token_secret=util.get_required_param(
                                  self, 'access_token_secret'))
    elif site == 'facebook':
        # presumably self.abort() raises, so src is never read unset below;
        # verify.
        self.abort(
            400,
            'Sorry, Facebook is no longer available in the REST API. Try the library instead!'
        )
    elif site == 'flickr':
        src = flickr.Flickr(access_token_key=util.get_required_param(
            self, 'access_token_key'),
                            access_token_secret=util.get_required_param(
                                self, 'access_token_secret'))
    elif site == 'github':
        src = github.GitHub(
            access_token=util.get_required_param(self, 'access_token'))
    elif site == 'instagram':
        # only served when the request opts in with interactive=true
        if self.request.get('interactive').lower() == 'true':
            src = instagram.Instagram(scrape=True)
        else:
            self.abort(
                400,
                'Sorry, Instagram is not currently available in the REST API. Try https://instagram-atom.appspot.com/ instead!'
            )
    elif site == 'mastodon':
        src = mastodon.Mastodon(
            instance=util.get_required_param(self, 'instance'),
            access_token=util.get_required_param(self, 'access_token'),
            user_id=util.get_required_param(self, 'user_id'))
    elif site == 'meetup':
        src = meetup.Meetup(access_token_key=util.get_required_param(
            self, 'access_token_key'),
                            access_token_secret=util.get_required_param(
                                self, 'access_token_secret'))
    elif site == 'pixelfed':
        src = pixelfed.Pixelfed(
            instance=util.get_required_param(self, 'instance'),
            access_token=util.get_required_param(self, 'access_token'),
            user_id=util.get_required_param(self, 'user_id'))
    elif site == 'reddit':
        src = reddit.Reddit(refresh_token=util.get_required_param(
            self, 'refresh_token'
        ))  # the refresh_token should be returned but is not appearing
    else:
        src_cls = source.sources.get(site)
        if not src_cls:
            raise exc.HTTPNotFound('Unknown site %r' % site)
        src = src_cls(**self.request.params)

    # decode tag URI ids: replace each tag URI path element with its bare id,
    # after checking that it belongs to this source's domain
    for i, arg in enumerate(args):
        parsed = util.parse_tag_uri(arg)
        if parsed:
            domain, id = parsed
            if domain != src.DOMAIN:
                raise exc.HTTPBadRequest(
                    'Expected domain %s in tag URI %s, found %s' %
                    (src.DOMAIN, arg, domain))
            args[i] = id

    # handle default path elements: a value listed in PATH_DEFAULTS for its
    # position becomes None
    args = [
        None if a in defaults else a
        for a, defaults in zip(args, PATH_DEFAULTS)
    ]
    user_id = args[0] if args else None

    # get activities (etc)
    try:
        if len(args) >= 2 and args[1] == '@blocks':
            try:
                response = {'items': src.get_blocklist()}
            except source.RateLimited as e:
                # serve the partial blocklist if the exception carries one;
                # otherwise respond 429
                if not e.partial:
                    self.abort(429, str(e))
                response = {'items': e.partial}
        else:
            response = src.get_activities_response(*args, **self.get_kwargs())
    except (NotImplementedError, ValueError) as e:
        self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

    # fetch actor if necessary
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
        # atom needs actor
        actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)
def _scrape(self, user_id=None, group_id=None, activity_id=None, cookie=None,
            count=None, fetch_extras=False, cache=None, shortcode=None):
    """Scrapes a user's profile or feed and converts the media to activities.

    Args:
      user_id: string
      group_id: string; the profile page is only fetched when this equals
        source.SELF
      activity_id: string, e.g. '1020355224898358984_654594'
      count: integer, number of activities to fetch and return, None for all
      fetch_extras: boolean
      cookie: string; 'sessionid=' is prepended if it's missing
      cache: optional object with get_multi() and set_multi(), used to
        remember like and comment counts between calls
      shortcode: string, e.g. '4pB6vEx87I'

    Returns:
      dict activities API response

    Raises:
      requests.HTTPError: with status code 401 if the response looks logged
        out or redirects to the login page
    """
    assert user_id or activity_id or shortcode or cookie
    assert not (activity_id and shortcode)

    if not shortcode:
        shortcode = self.id_to_shortcode(activity_id)

    # single media page if we have a shortcode, profile page for a user's own
    # posts, otherwise the base URL
    url = (HTML_MEDIA % shortcode if shortcode else
           HTML_PROFILE % user_id if user_id and group_id == source.SELF else
           HTML_BASE_URL)
    kwargs = {}
    if cookie:
        if not cookie.startswith('sessionid='):
            cookie = 'sessionid=' + cookie
        kwargs = {'headers': {'Cookie': cookie}}

    # redirects aren't followed so that a login redirect can be detected below
    resp = util.requests_get(url, allow_redirects=False, **kwargs)
    if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302) and
             '/accounts/login' in resp.headers.get('Location', ''))):
        resp.status_code = 401
        raise requests.HTTPError('401 Unauthorized', response=resp)
    elif resp.status_code == 404:
        if activity_id:
            # retry, treating the activity id as a shortcode
            return self._scrape(shortcode=activity_id, cookie=cookie, count=count)
        # otherwise not found, fall through and return empty response
    else:
        resp.raise_for_status()

    activities, actor = self.html_to_activities(resp.text, cookie=cookie,
                                                count=count)

    if fetch_extras:
        # batch get cached counts of comments and likes for all activities
        cached = {}
        # don't update the cache until the end, in case we hit an error before
        cache_updates = {}

        if cache is not None:
            keys = []
            for activity in activities:
                _, id = util.parse_tag_uri(activity['id'])
                keys.extend(['AIL ' + id, 'AIC ' + id])
            cached = cache.get_multi(keys)

        for i, activity in enumerate(activities):
            obj = activity['object']
            _, id = util.parse_tag_uri(activity['id'])
            likes = obj.get('ig_like_count') or 0
            comments = obj.get('replies', {}).get('totalItems') or 0
            likes_key = 'AIL %s' % id
            comments_key = 'AIC %s' % id

            # only re-fetch an activity whose nonzero like or comment count
            # differs from the cached count
            if (likes and likes != cached.get(likes_key) or
                    comments and comments != cached.get(comments_key)):
                if not activity_id and not shortcode:
                    url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
                    resp = util.requests_get(url)
                    resp.raise_for_status()
                # otherwise resp is a fetch of just this activity; reuse it

                full_activity, _ = self.html_to_activities(
                    resp.text, cookie=cookie, count=count,
                    fetch_extras=fetch_extras)
                if full_activity:
                    activities[i] = full_activity[0]
                    cache_updates.update({
                        likes_key: likes,
                        comments_key: comments
                    })

        if cache_updates and cache is not None:
            cache.set_multi(cache_updates)

    # note: resp is rebound here from the HTTP response to the API response
    resp = self.make_activities_base_response(activities)
    resp['actor'] = actor
    return resp
def _scrape(self, user_id=None, activity_id=None, cookie=None,
            fetch_extras=False, cache=None):
    """Scrapes a user's profile or feed and converts the media to activities.

    Args:
      user_id: string
      activity_id: string
      fetch_extras: boolean; if true and no activity_id is given, re-fetches
        each activity whose like or comment count differs from the cached one
      cookie: string, sent as the Cookie header
      cache: optional object with get_multi() and set_multi(), used to
        remember like and comment counts between calls

    Returns:
      the response from make_activities_base_response() for the scraped
      activities, with 'actor' set

    Raises:
      requests.HTTPError: with status code 401 if the response looks logged
        out or redirects to the login page
    """
    assert user_id or activity_id or cookie

    url = (HTML_MEDIA % self.id_to_shortcode(activity_id) if activity_id
           else self.user_url(user_id) if user_id
           else self.BASE_URL)
    kwargs = {}
    if cookie:
        kwargs = {'headers': {'Cookie': cookie}}

    # redirects aren't followed so that a login redirect can be detected below
    resp = util.requests_get(url, allow_redirects=False, **kwargs)
    # requests status codes are ints; comparing against the strings '301' and
    # '302' never matched, so login redirects went undetected.
    if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302) and
             '/accounts/login' in resp.headers.get('Location', ''))):
        resp.status_code = 401
        raise requests.HTTPError('401 Unauthorized', response=resp)

    activities, actor = self.html_to_activities(resp.text)

    if fetch_extras and not activity_id:
        # batch get cached counts of comments and likes for all activities
        cached = {}
        # don't update the cache until the end, in case we hit an error before
        cache_updates = {}

        if cache is not None:
            keys = []
            for activity in activities:
                _, media_id = util.parse_tag_uri(activity['id'])
                keys.extend(['AIL ' + media_id, 'AIC ' + media_id])
            cached = cache.get_multi(keys)

        for i, activity in enumerate(activities):
            obj = activity['object']
            _, media_id = util.parse_tag_uri(activity['id'])
            likes = obj.get('ig_like_count') or 0
            comments = obj.get('replies', {}).get('totalItems') or 0
            likes_key = 'AIL %s' % media_id
            comments_key = 'AIC %s' % media_id

            # only re-fetch an activity whose nonzero like or comment count
            # differs from the cached count
            if (likes and likes != cached.get(likes_key) or
                    comments and comments != cached.get(comments_key)):
                full_activity, _ = self.html_to_activities(
                    util.requests_get(activity['url']).text)
                if full_activity:
                    activities[i] = full_activity[0]
                    cache_updates.update({
                        likes_key: likes,
                        comments_key: comments,
                    })

        if cache_updates and cache is not None:
            cache.set_multi(cache_updates)

    response = self.make_activities_base_response(activities)
    response['actor'] = actor
    return response
def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    Builds a source instance for the requested site, serves a memcached
    response if one exists (unless the request passes cache=false), and
    otherwise fetches, writes, and caches a fresh response.

    Raises:
      exc.HTTPNotFound: if the path is too long or the site is unknown
      exc.HTTPBadRequest: if a tag URI path element's domain doesn't match
        the source's domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
        raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                               (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
        src = twitter.Twitter(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'facebook':
        src = facebook.Facebook(
            access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
        src = flickr.Flickr(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'instagram':
        src = instagram.Instagram(scrape=True)
    elif site == 'google+':
        auth_entity = util.get_required_param(self, 'auth_entity')
        src = googleplus.GooglePlus(
            auth_entity=ndb.Key(urlsafe=auth_entity).get())
    else:
        src_cls = source.sources.get(site)
        if not src_cls:
            raise exc.HTTPNotFound('Unknown site %r' % site)
        src = src_cls(**self.request.params)

    # decode tag URI ids: replace each tag URI path element with its bare id,
    # after checking that it belongs to this source's domain
    for i, arg in enumerate(args):
        parsed = util.parse_tag_uri(arg)
        if parsed:
            domain, id = parsed
            if domain != src.DOMAIN:
                raise exc.HTTPBadRequest(
                    'Expected domain %s in tag URI %s, found %s' %
                    (src.DOMAIN, arg, domain))
            args[i] = id

    # check if request is cached
    # NOTE(review): the cache key is built from the request path only, so
    # requests that differ only in query params (credentials, format, etc.)
    # share one cache entry. confirm that's intended.
    cache = self.request.get('cache', '').lower() != 'false'
    if cache:
        cache_key = 'R %s' % self.request.path
        cached = memcache.get(cache_key)
        if cached:
            logging.info('Serving cached response %r', cache_key)
            self.write_response(cached['response'], actor=cached['actor'],
                                url=src.BASE_URL)
            return

    # handle default path elements: a value listed in PATH_DEFAULTS for its
    # position becomes None
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities (etc)
    try:
        if len(args) >= 2 and args[1] == '@blocks':
            response = {'items': src.get_blocklist()}
        else:
            response = src.get_activities_response(*args,
                                                   **self.get_kwargs(src))
    except (NotImplementedError, ValueError) as e:
        self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

    # fetch actor if necessary. args and user_id were already normalized
    # above, so they aren't recomputed here.
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
        # atom needs actor
        actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)

    # cache response
    if cache:
        logging.info('Caching response in %r', cache_key)
        memcache.set(cache_key, {
            'response': response,
            'actor': actor,
        }, src.RESPONSE_CACHE_TIME)
def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    Builds a source instance for the requested site from request parameters,
    fetches either the blocklist or an activities response from it, and
    writes the response.

    Raises:
      exc.HTTPNotFound: if the path is too long or the site is unknown
      exc.HTTPBadRequest: if a tag URI path element's domain doesn't match
        the source's domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
        raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                               (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
        src = twitter.Twitter(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'facebook':
        src = facebook.Facebook(
            access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
        src = flickr.Flickr(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'github':
        src = github.GitHub(
            access_token=util.get_required_param(self, 'access_token'))
    elif site == 'instagram':
        src = instagram.Instagram(scrape=True)
    else:
        src_cls = source.sources.get(site)
        if not src_cls:
            raise exc.HTTPNotFound('Unknown site %r' % site)
        src = src_cls(**self.request.params)

    # decode tag URI ids: replace each tag URI path element with its bare id,
    # after checking that it belongs to this source's domain
    for i, arg in enumerate(args):
        parsed = util.parse_tag_uri(arg)
        if parsed:
            domain, id = parsed
            if domain != src.DOMAIN:
                raise exc.HTTPBadRequest(
                    'Expected domain %s in tag URI %s, found %s' %
                    (src.DOMAIN, arg, domain))
            args[i] = id

    # handle default path elements: a value listed in PATH_DEFAULTS for its
    # position becomes None
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities (etc)
    try:
        if len(args) >= 2 and args[1] == '@blocks':
            try:
                response = {'items': src.get_blocklist()}
            except source.RateLimited as e:
                # serve the partial blocklist if the exception carries one;
                # otherwise respond 429
                if not e.partial:
                    self.abort(429, str(e))
                response = {'items': e.partial}
        else:
            response = src.get_activities_response(*args, **self.get_kwargs())
    except (NotImplementedError, ValueError) as e:
        self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

    # fetch actor if necessary. args was already normalized above and isn't
    # read again, so it isn't recomputed here.
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
        # atom needs actor
        actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)
def _scrape(self, user_id=None, activity_id=None, cookie=None,
            fetch_extras=False, cache=None, shortcode=None):
    """Scrapes a profile, feed, or single media page into activities.

    Args:
      user_id: string
      activity_id: string, e.g. '1020355224898358984_654594'
      fetch_extras: boolean
      cookie: string
      cache: optional object with get_multi() and set_multi(), used to
        remember like and comment counts between calls
      shortcode: string, e.g. '4pB6vEx87I'

    Returns:
      dict activities API response

    Raises:
      requests.HTTPError: with status code 401 if the response looks logged
        out or redirects to the login page
    """
    assert user_id or activity_id or shortcode or cookie
    assert not (activity_id and shortcode)

    if not shortcode:
        shortcode = self.id_to_shortcode(activity_id)

    # single media page beats profile page beats the base URL
    if shortcode:
        url = HTML_MEDIA % shortcode
    elif user_id:
        url = HTML_PROFILE % user_id
    else:
        url = HTML_BASE_URL

    request_kwargs = {'headers': {'Cookie': cookie}} if cookie else {}

    # redirects aren't followed so that a login redirect can be detected below
    resp = util.requests_get(url, allow_redirects=False, **request_kwargs)

    logged_out = cookie and 'not-logged-in' in resp.text
    if not logged_out:
        logged_out = (resp.status_code in (301, 302) and
                      '/accounts/login' in resp.headers.get('Location', ''))

    if logged_out:
        resp.status_code = 401
        raise requests.HTTPError('401 Unauthorized', response=resp)
    elif resp.status_code == 404:
        if activity_id:
            # retry, treating the activity id as a shortcode
            return self._scrape(shortcode=activity_id, cookie=cookie)
        # otherwise not found, fall through and return empty response
    else:
        resp.raise_for_status()

    activities, actor = self.html_to_activities(resp.text)

    if fetch_extras and not activity_id:
        # cached like and comment counts for every activity, fetched in one
        # batch
        cached = {}
        # collected here and written once at the end, so that an error part
        # way through doesn't leave the cache half updated
        cache_updates = {}

        if cache is not None:
            keys = []
            for activity in activities:
                media_id = util.parse_tag_uri(activity['id'])[1]
                keys.extend(['AIL ' + media_id, 'AIC ' + media_id])
            cached = cache.get_multi(keys)

        for index, activity in enumerate(activities):
            obj = activity['object']
            media_id = util.parse_tag_uri(activity['id'])[1]
            likes = obj.get('ig_like_count') or 0
            comments = obj.get('replies', {}).get('totalItems') or 0
            likes_key = 'AIL %s' % media_id
            comments_key = 'AIC %s' % media_id

            unchanged = not (
                likes and likes != cached.get(likes_key) or
                comments and comments != cached.get(comments_key))
            if unchanged:
                continue

            # counts changed since last time: fetch the full activity
            url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
            resp = util.requests_get(url)
            resp.raise_for_status()
            full_activity, _ = self.html_to_activities(resp.text)
            if full_activity:
                activities[index] = full_activity[0]
                cache_updates.update({likes_key: likes,
                                      comments_key: comments})

        if cache_updates and cache is not None:
            cache.set_multi(cache_updates)

    resp = self.make_activities_base_response(activities)
    resp['actor'] = actor
    return resp
def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    Builds a source instance for the requested site, serves a memcached
    response if one exists (unless the request passes cache=false), and
    otherwise fetches, writes, and caches a fresh response.

    Raises:
      exc.HTTPNotFound: if the path is too long or the site is unknown
      exc.HTTPBadRequest: if a tag URI path element's domain doesn't match
        the source's domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
        raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                               (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
        src = twitter.Twitter(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'facebook':
        src = facebook.Facebook(
            access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
        src = flickr.Flickr(
            access_token_key=util.get_required_param(self, 'access_token_key'),
            access_token_secret=util.get_required_param(
                self, 'access_token_secret'))
    elif site == 'instagram':
        src = instagram.Instagram(scrape=True)
    elif site == 'google+':
        auth_entity = util.get_required_param(self, 'auth_entity')
        src = googleplus.GooglePlus(
            auth_entity=ndb.Key(urlsafe=auth_entity).get())
    else:
        src_cls = source.sources.get(site)
        if not src_cls:
            raise exc.HTTPNotFound('Unknown site %r' % site)
        src = src_cls(**self.request.params)

    # decode tag URI ids: replace each tag URI path element with its bare id,
    # after checking that it belongs to this source's domain
    for i, arg in enumerate(args):
        parsed = util.parse_tag_uri(arg)
        if parsed:
            domain, id = parsed
            if domain != src.DOMAIN:
                raise exc.HTTPBadRequest(
                    'Expected domain %s in tag URI %s, found %s' %
                    (src.DOMAIN, arg, domain))
            args[i] = id

    # check if request is cached
    # NOTE(review): the cache key is built from the request path only, so
    # requests that differ only in query params (credentials, format, etc.)
    # share one cache entry. confirm that's intended.
    cache = self.request.get('cache', '').lower() != 'false'
    if cache:
        cache_key = 'R %s' % self.request.path
        cached = memcache.get(cache_key)
        if cached:
            logging.info('Serving cached response %r', cache_key)
            self.write_response(cached['response'], actor=cached['actor'],
                                url=src.BASE_URL)
            return

    # handle default path elements: a value listed in PATH_DEFAULTS for its
    # position becomes None
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities
    try:
        response = src.get_activities_response(*args, **self.get_kwargs(src))
    except NotImplementedError as e:
        self.abort(400, str(e))
    except Exception as e:
        if util.is_connection_failure(e):
            # HTTP 504 Gateway Timeout
            self.abort(504, str(e))
        raise

    # fetch actor if necessary. args and user_id were already normalized
    # above, so they aren't recomputed here.
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
        # atom needs actor
        actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)

    # cache response
    if cache:
        logging.info('Caching response in %r', cache_key)
        memcache.set(cache_key, {'response': response, 'actor': actor},
                     src.RESPONSE_CACHE_TIME)