Пример #1
0
    def get_rsvps_from_event(event):
        """Returns RSVP objects for an event's *attending fields.

        One RSVP activity is built for every actor found in each event field
        named by RSVP_TO_EVENT's values, using the corresponding key as the
        verb.

        Args:
          event: ActivityStreams event object

        Returns: sequence of ActivityStreams RSVP activity objects. Empty if
          the event has no id or its id can't be parsed as a tag URI.
        """
        event_tag = event.get("id")
        if not event_tag:
            return []
        parsed = util.parse_tag_uri(event_tag)
        if not parsed:
            return []
        domain, event_id = parsed
        url = event.get("url")

        rsvps = []
        for verb, field in RSVP_TO_EVENT.items():
            for actor in event.get(field, []):
                rsvp = {"objectType": "activity", "verb": verb, "actor": actor, "url": url}
                if event_id and "id" in actor:
                    # parse_tag_uri() returns a falsy value for ids that aren't
                    # tag URIs (see the event id check above). Unpacking that
                    # directly would raise TypeError, so such RSVPs are emitted
                    # without their own id and keep the event's URL.
                    parsed_actor = util.parse_tag_uri(actor["id"])
                    if parsed_actor:
                        _, actor_id = parsed_actor
                        rsvp["id"] = util.tag_uri(domain, "%s_rsvp_%s" % (event_id, actor_id))
                        if url:
                            # per-RSVP URL: the event URL with the actor id as fragment
                            rsvp["url"] = "#".join((url, actor_id))
                rsvps.append(rsvp)

        return rsvps
Пример #2
0
  def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's *attending fields.

    One RSVP activity is built for every actor found in each event field named
    by RSVP_TO_EVENT's values, using the corresponding key as the verb.

    Args:
      event: ActivityStreams event object

    Returns: sequence of ActivityStreams RSVP activity objects. Empty if the
      event has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
      return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
      return []
    domain, event_id = parsed

    rsvps = []
    for verb, field in RSVP_TO_EVENT.items():
      for actor in event.get(field, []):
        rsvp = {'objectType': 'activity',
                'verb': verb,
                'actor': actor,
                }
        if event_id and 'id' in actor:
          # parse_tag_uri() returns a falsy value for ids that aren't tag URIs
          # (see the event id check above); unpacking that directly would
          # raise TypeError, so such RSVPs are emitted without an id.
          parsed_actor = util.parse_tag_uri(actor['id'])
          if parsed_actor:
            _, actor_id = parsed_actor
            rsvp['id'] = util.tag_uri(domain, '%s_rsvp_%s' % (event_id, actor_id))
        rsvps.append(rsvp)

    return rsvps
Пример #3
0
  def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's *attending fields.

    One RSVP activity is built for every actor found in each event field named
    by RSVP_TO_EVENT's values, using the corresponding key as the verb.

    Args:
      event: ActivityStreams event object

    Returns: sequence of ActivityStreams RSVP activity objects. Empty if the
      event has no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
      return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
      return []
    domain, event_id = parsed
    url = event.get('url')

    rsvps = []
    for verb, field in RSVP_TO_EVENT.items():
      for actor in event.get(field, []):
        rsvp = {'objectType': 'activity',
                'verb': verb,
                'actor': actor,
                'url': url,
                }
        if event_id and 'id' in actor:
          # parse_tag_uri() returns a falsy value for ids that aren't tag URIs
          # (see the event id check above); unpacking that directly would
          # raise TypeError, so such RSVPs are emitted without their own id
          # and keep the event's URL.
          parsed_actor = util.parse_tag_uri(actor['id'])
          if parsed_actor:
            _, actor_id = parsed_actor
            rsvp['id'] = util.tag_uri(domain, '%s_rsvp_%s' % (event_id, actor_id))
            if url:
              # per-RSVP URL: the event URL with the actor id as fragment
              rsvp['url'] = '#'.join((url, actor_id))
        rsvps.append(rsvp)

    return rsvps
Пример #4
0
  def base_object(self, obj):
    """Returns id and URL of the 'base' silo object that an object operates on.

    For example, if the object is a comment, this returns the post that it's a
    comment on. If it's an RSVP, this returns the event. The id in the returned
    tuple is silo-specific, ie not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns: (string id, string URL) tuple. Both may be None. If the base
      object's id is not a parseable tag URI, it is returned unchanged.
    """
    # look at in-reply-tos first, then objects (for likes and reposts).
    # technically, the ActivityStreams 'object' field is always supposed to be
    # singular, but microformats2.json_to_object() sometimes returns activities
    # that have a list value, e.g. likes or reposts of multiple objects.
    candidates = []
    for field in ('inReplyTo', 'object'):
      objs = obj.get(field, [])
      if isinstance(objs, dict):
        candidates.append(objs)
      else:
        candidates += objs

    # pick the first candidate whose domain (from its tag URI id, else from its
    # URL) matches this silo, ignoring a leading www. or mobile. subdomain.
    for base_obj in candidates:
      parsed_id = util.parse_tag_uri(base_obj.get('id', ''))
      if parsed_id:
        domain = parsed_id[0]
      else:
        domain = urlparse.urlparse(base_obj.get('url', '')).netloc
      for subdomain in 'www.', 'mobile.':
        if domain.startswith(subdomain):
          domain = domain[len(subdomain):]
      if domain == self.DOMAIN:
        break
    else:
      return (None, None)

    obj_id = base_obj.get('id')
    url = base_obj.get('url')

    if obj_id:
      # a candidate can match on its URL while carrying an id that isn't a tag
      # URI; parse_tag_uri() is falsy for those, so don't index into it blindly.
      parsed = util.parse_tag_uri(obj_id)
      if parsed:
        obj_id = parsed[1]
    elif url:
      # no id: fall back to the last path segment of the URL
      path = urlparse.urlparse(url).path
      if path.endswith('/'):
        path = path[:-1]
      obj_id = path.rsplit('/', 1)[-1]

    return (obj_id, url)
Пример #5
0
    def base_object(self, obj):
        """Returns the 'base' silo object that an object operates on.

        For a comment this is the post being commented on; for an RSVP it is
        the event. The returned object's id is silo-specific, ie not a tag URI.

        Subclasses may override this.

        Args:
          obj: ActivityStreams object

        Returns:
          dict, minimal ActivityStreams object. Usually has at least id; may
          also have url, author, etc. Empty if no candidate belongs to this
          silo's domain.
        """
        # in-reply-tos take precedence over objects (likes and reposts). the
        # 'object' field is supposed to be singular, but
        # microformats2.json_to_object() sometimes produces a list, e.g. for
        # likes or reposts of multiple objects, so both shapes are accepted.
        candidates = []
        for field in ('inReplyTo', 'object'):
            value = obj.get(field, [])
            if isinstance(value, dict):
                value = [value]
            candidates += value

        # first candidate whose domain (tag URI id first, URL otherwise) is ours
        match = None
        for candidate in candidates:
            tag = util.parse_tag_uri(candidate.get('id', ''))
            domain = tag[0] if tag else util.domain_from_link(
                candidate.get('url', ''))
            if domain == self.DOMAIN:
                match = candidate
                break
        if match is None:
            return {}

        # work on a copy so the caller's object is left untouched
        result = copy.deepcopy(match)
        raw_id = result.get('id')
        url = result.get('url')

        if raw_id:
            tag = util.parse_tag_uri(raw_id)
            if tag:
                result['id'] = tag[1]
        elif url:
            result['id'] = self.base_id(url)

        return result
Пример #6
0
    def base_object(self, obj):
        """Returns the 'base' silo object that an object operates on.

        For a comment this is the post being commented on; for an RSVP it is
        the event. The returned object's id is silo-specific, ie not a tag URI.

        Subclasses may override this.

        Args:
          obj: ActivityStreams object

        Returns: dict, minimal ActivityStreams object. Usually has at least id;
          may also have url, author, etc. Empty if no candidate belongs to this
          silo's domain.
        """
        # in-reply-tos are considered before objects (likes and reposts). the
        # 'object' field should be singular, but
        # microformats2.json_to_object() sometimes yields a list value, e.g.
        # likes or reposts of multiple objects.
        candidates = []
        for field in ("inReplyTo", "object"):
            found = obj.get(field, [])
            candidates.extend([found] if isinstance(found, dict) else found)

        for candidate in candidates:
            tag = util.parse_tag_uri(candidate.get("id", ""))
            domain = tag[0] if tag else util.domain_from_link(candidate.get("url", ""))
            if domain == self.DOMAIN:
                break
        else:
            # nothing in this object points at our silo
            return {}

        # copy before rewriting the id so the input object isn't mutated
        result = copy.deepcopy(candidate)
        url = result.get("url")

        if result.get("id"):
            tag = util.parse_tag_uri(result.get("id"))
            if tag:
                result["id"] = tag[1]
        elif url:
            # no id: use the last path segment of the URL, ignoring trailing slashes
            trimmed = urlparse.urlparse(url).path.rstrip("/")
            result["id"] = trimmed.rsplit("/", 1)[-1]

        return result
  def base_object(self, obj):
    """Returns id and URL of the 'base' silo object that an object operates on.

    For example, if the object is a comment, this returns the post that it's a
    comment on. If it's an RSVP, this returns the event. The id in the returned
    tuple is silo-specific, ie not a tag URI.

    The first inReplyTo entry is used if there is one; otherwise the 'object'
    field is used.

    Args:
      obj: ActivityStreams object

    Returns: (string id, string URL) tuple. Both may be None. If the base
      object's id is not a parseable tag URI, it is returned unchanged.
    """
    reply_to = obj.get('inReplyTo')
    base_obj = reply_to[0] if reply_to else obj.get('object')
    if not base_obj:
      return (None, None)

    obj_id = base_obj.get('id')
    url = base_obj.get('url')

    if obj_id:
      # guard the parse result instead of indexing it directly, so an id that
      # isn't a parseable tag URI doesn't crash; it is passed through as is.
      parsed = util.parse_tag_uri(obj_id)
      if parsed:
        obj_id = parsed[1]
    elif url:
      # no id: fall back to the last path segment of the URL
      path = urlparse.urlparse(url).path
      if path.endswith('/'):
        path = path[:-1]
      obj_id = path.rsplit('/', 1)[-1]

    return (obj_id, url)
Пример #8
0
    def merge_scraped_reactions(self, scraped, activity):
        """Converts and merges scraped likes and reactions into an activity.

        Likes found under data.shortcode_media.edge_liked_by.edges are converted
        with like_to_object() and merged into the activity object's 'tags' via
        source.merge_by_id().

        Args:
          scraped: str or dict, scraped JSON likes
          activity: dict, AS activity to merge these reactions into

        Returns:
          list of dict AS like tag objects converted from scraped. Empty if
          scraped has no shortcode_media.

        Raises:
          ValueError: if scraped is not valid JSON (presumably raised by
            json_loads(), which is defined elsewhere)
        """
        payload = json_loads(scraped) if isinstance(scraped, str) else scraped

        media = payload.get('data', {}).get('shortcode_media', {})
        if not media:
            return []

        activity_id = util.parse_tag_uri(activity['id'])[1]
        media_url = self.media_url(media['shortcode'])

        likes = []
        for edge in media.get('edge_liked_by', {}).get('edges', []):
            likes.append(
                self.like_to_object(edge.get('node', {}), activity_id, media_url))

        source.merge_by_id(activity['object'], 'tags', likes)
        return likes
Пример #9
0
  def fetch_replies(self, activities, min_id=None):
    """Fetches and injects Twitter replies into a list of activities, in place.

    Includes indirect replies ie reply chains, not just direct replies. Searches
    for @-mentions, matches them to the original tweets with
    in_reply_to_status_id_str, and recurses until it's walked the entire tree.

    Each activity's object gets a 'replies' dict with 'items' and 'totalItems'.

    Args:
      activities: list of activity dicts
      min_id: optional; if not None, it is passed to the search API as since_id

    Returns:
      same activities list
    """

    # cache searches for @-mentions for individual users, so each author is
    # only searched once per call. maps username to the list of tweets in the
    # search response's 'statuses' field.
    mentions = {}

    # find replies
    for activity in activities:
      # list of ActivityStreams reply object dict and set of seen tweet ids.
      # seed with the original tweet; we'll filter it out later.
      replies = [activity]
      _, tweet_id = util.parse_tag_uri(activity['id'])
      seen_ids = set([tweet_id])

      for reply in replies:
        # get mentions of this tweet's author so we can search them for replies to
        # this tweet. can't use statuses/mentions_timeline because i'd need to
        # auth as the user being mentioned.
        # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
        #
        # note that these HTTP requests are synchronous. you can make async
        # requests by using urlfetch.fetch() directly, but not with urllib2.
        # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
        author = reply['actor']['username']
        if author not in mentions:
          url = API_SEARCH % {
            'q': urllib.quote_plus('@' + author.encode('utf-8')),
            'count': 100,
          }
          if min_id is not None:
            url = util.add_query_params(url, {'since_id': min_id})
          mentions[author] = self.urlopen(url)['statuses']

        # look for replies. add any we find to the end of replies. this makes us
        # recursively follow reply chains to their end. (python supports
        # appending to a sequence while you're iterating over it.)
        for mention in mentions[author]:
          mention_id = mention['id_str']
          if (mention.get('in_reply_to_status_id_str') in seen_ids and
              mention_id not in seen_ids):
            replies.append(self.tweet_to_activity(mention))
            seen_ids.add(mention_id)

      items = [r['object'] for r in replies[1:]]  # filter out seed activity
      activity['object']['replies'] = {
        'items': items,
        'totalItems': len(items),
        }

    # the docstring has always promised the list back; actually return it.
    return activities
Пример #10
0
  def fetch_replies(self, activities, min_id=None):
    """Fetches and injects Twitter replies into a list of activities, in place.

    Includes indirect replies ie reply chains, not just direct replies. Searches
    for @-mentions, matches them to the original tweets with
    in_reply_to_status_id_str, and recurses until it's walked the entire tree.

    Each activity's object gets a 'replies' dict with 'items' and 'totalItems'.

    Args:
      activities: list of activity dicts
      min_id: optional; if not None, it is passed to the search API as since_id

    Returns:
      same activities list
    """

    # cache searches for @-mentions for individual users, so each author is
    # only searched once per call. maps username to the list of tweets in the
    # search response's 'statuses' field.
    mentions = {}

    # find replies
    for activity in activities:
      # list of ActivityStreams reply object dict and set of seen tweet ids.
      # seed with the original tweet; we'll filter it out later.
      replies = [activity]
      _, tweet_id = util.parse_tag_uri(activity['id'])
      seen_ids = set([tweet_id])

      for reply in replies:
        # get mentions of this tweet's author so we can search them for replies to
        # this tweet. can't use statuses/mentions_timeline because i'd need to
        # auth as the user being mentioned.
        # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
        #
        # note that these HTTP requests are synchronous. you can make async
        # requests by using urlfetch.fetch() directly, but not with urllib2.
        # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
        author = reply['actor']['username']
        if author not in mentions:
          url = API_SEARCH_URL % {
            'q': urllib.quote_plus('@' + author),
            'count': 100,
          }
          if min_id is not None:
            url = util.add_query_params(url, {'since_id': min_id})
          mentions[author] = self.urlopen(url)['statuses']

        # look for replies. add any we find to the end of replies. this makes us
        # recursively follow reply chains to their end. (python supports
        # appending to a sequence while you're iterating over it.)
        for mention in mentions[author]:
          mention_id = mention['id_str']
          if (mention.get('in_reply_to_status_id_str') in seen_ids and
              mention_id not in seen_ids):
            replies.append(self.tweet_to_activity(mention))
            seen_ids.add(mention_id)

      items = [r['object'] for r in replies[1:]]  # filter out seed activity
      activity['object']['replies'] = {
        'items': items,
        'totalItems': len(items),
        }

    # the docstring has always promised the list back; actually return it.
    return activities
Пример #11
0
  def base_object(self, obj):
    """Returns the 'base' silo object that an object operates on.

    For a comment this is the post being commented on; for an RSVP it is the
    event. The returned object's id is silo-specific, ie not a tag URI.

    Subclasses may override this.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, minimal ActivityStreams object. Usually has at least id; may
      also have url, author, etc. Empty if no candidate belongs to this silo's
      domain.
    """
    # in-reply-tos come first, then objects (for likes and reposts), then
    # targets. the 'object' field is supposed to be singular, but
    # microformats2.json_to_object() sometimes returns a list value, e.g. likes
    # or reposts of multiple objects.
    candidates = [item
                  for field in ('inReplyTo', 'object', 'target')
                  for item in util.get_list(obj, field)]

    def silo_domain(candidate):
      # the domain comes from the tag URI id when there is one, else the URL
      tag = util.parse_tag_uri(candidate.get('id', ''))
      if tag:
        return tag[0]
      return util.domain_from_link(candidate.get('url', ''))

    match = next((c for c in candidates if silo_domain(c) == self.DOMAIN), None)
    if match is None:
      return {}

    # copy before rewriting the id so the input object isn't mutated
    result = copy.deepcopy(match)
    raw_id = result.get('id')
    url = result.get('url')

    if raw_id:
      tag = util.parse_tag_uri(raw_id)
      if tag:
        result['id'] = tag[1]
    elif url:
      result['id'] = self.base_id(url)

    return result
Пример #12
0
  def get_rsvps_from_event(event):
    """Returns RSVP objects for an event's attending fields.

    One RSVP activity is built for every person found in each event field named
    by RSVP_VERB_TO_COLLECTION's values, using the corresponding key as the
    verb. For 'invite', the person is the activity's object and the event's
    author (if any) is its actor; for every other verb the person is the actor.

    Args:
      event: ActivityStreams event object

    Returns:
      sequence of ActivityStreams RSVP activity objects. Empty if the event has
      no id or its id can't be parsed as a tag URI.
    """
    event_tag = event.get('id')
    if not event_tag:
      return []
    parsed = util.parse_tag_uri(event_tag)
    if not parsed:
      return []
    domain, event_id = parsed
    url = event.get('url')
    author = event.get('author')

    rsvps = []
    for verb, field in RSVP_VERB_TO_COLLECTION.items():
      for actor in event.get(field, []):
        rsvp = {'objectType': 'activity',
                'verb': verb,
                'object' if verb == 'invite' else 'actor': actor,
                'url': url,
                }

        if event_id and 'id' in actor:
          # parse_tag_uri() returns a falsy value for ids that aren't tag URIs
          # (see the event id check above); unpacking that directly would
          # raise TypeError, so such RSVPs are emitted without their own id
          # and keep the event's URL.
          parsed_actor = util.parse_tag_uri(actor['id'])
          if parsed_actor:
            _, actor_id = parsed_actor
            rsvp['id'] = util.tag_uri(domain, '%s_rsvp_%s' % (event_id, actor_id))
            if url:
              # per-RSVP URL: the event URL with the person's id as fragment
              rsvp['url'] = '#'.join((url, actor_id))

        if verb == 'invite' and author:
          rsvp['actor'] = author

        rsvps.append(rsvp)

    return rsvps
Пример #13
0
    def _postprocess_base_object(cls, obj):
        """Returns a deep copy of obj whose id is silo-specific where possible.

        If obj has an id that parses as a tag URI, the copy's id becomes the
        second element of the parsed result; an id that doesn't parse is left
        unchanged. If obj has no id but has a url, the id is derived with
        cls.base_id(url).

        Args:
          obj: dict, ActivityStreams object. Not modified.

        Returns:
          dict, the processed copy
        """
        result = copy.deepcopy(obj)
        tag_id, link = result.get('id'), result.get('url')

        if not tag_id:
            if link:
                result['id'] = cls.base_id(link)
            return result

        parsed = util.parse_tag_uri(tag_id)
        if parsed:
            result['id'] = parsed[1]
        return result
Пример #14
0
  def base_object(self, obj):
    """Extends the default base_object() to avoid using shortcodes as object ids.

    If the inherited result's id isn't all digits once underscores are removed,
    it is dropped. It is then replaced by the part of obj's own tag URI id
    before the first underscore, when obj has such an id.

    Args:
      obj: ActivityStreams object

    Returns:
      dict, the base object from the superclass, possibly with its id removed
      or replaced
    """
    result = super(Instagram, self).base_object(obj)

    candidate = result.get('id')
    if not candidate or candidate.replace('_', '').isdigit():
      # no id at all, or it already looks like a numeric id: keep as is
      return result

    # this isn't an id. it's probably a shortcode.
    del result['id']
    own_id = obj.get('id')
    parsed = util.parse_tag_uri(own_id) if own_id else None
    if parsed and '_' in parsed[1]:
      result['id'] = parsed[1].split('_')[0]

    return result
Пример #15
0
    def base_object(self, obj):
        """Extends the default base_object() to avoid using shortcodes as object ids.

        An inherited id that contains anything other than digits and underscores
        is discarded. When obj's own id is a tag URI whose second parsed element
        contains an underscore, the text before the first underscore is used as
        the base object's id instead.

        Args:
          obj: ActivityStreams object

        Returns:
          dict, the base object from the superclass, possibly with its id
          removed or replaced
        """
        found = super(Instagram, self).base_object(obj)
        inherited_id = found.get('id')
        looks_numeric = bool(inherited_id) and inherited_id.replace('_', '').isdigit()

        if inherited_id and not looks_numeric:
            # this isn't an id. it's probably a shortcode.
            del found['id']
            source_id = obj.get('id')
            if source_id:
                pieces = util.parse_tag_uri(source_id)
                if pieces and '_' in pieces[1]:
                    found['id'] = pieces[1].partition('_')[0]

        return found
Пример #16
0
    def get_rsvp(self, activity_user_id, event_id, user_id, event=None):
        """Returns an ActivityStreams RSVP activity object.

        Looks through the RSVPs generated from the event for one whose actor or
        object id, once parsed as a tag URI, matches user_id.

        Args:
          activity_user_id: string id of the user who posted the event. unused.
          event_id: string event id
          user_id: string user id
          event: AS event activity (optional). Fetched with get_event(event_id)
            if not provided.

        Returns:
          dict, the matching RSVP activity, or None if the event can't be
          fetched or no RSVP matches
        """
        if not event:
            event = self.get_event(event_id)
            if not event:
                return None

        for rsvp in self.get_rsvps_from_event(event['object']):
            for field in 'actor', 'object':
                field_id = rsvp.get(field, {}).get('id')
                # ids that aren't parseable tag URIs can't match; skip them
                # rather than indexing into a failed parse result.
                parsed = util.parse_tag_uri(field_id) if field_id else None
                if parsed and user_id == parsed[1]:
                    return rsvp

        return None
Пример #17
0
  def get_rsvp(self, activity_user_id, event_id, user_id, event=None):
    """Returns an ActivityStreams RSVP activity object.

    Looks through the RSVPs generated from the event for one whose actor or
    object id, once parsed as a tag URI, matches user_id.

    Args:
      activity_user_id: string id of the user who posted the event. unused.
      event_id: string event id
      user_id: string user id
      event: AS event activity (optional). Fetched with get_event(event_id) if
        not provided.

    Returns:
      dict, the matching RSVP activity, or None if the event can't be fetched
      or no RSVP matches
    """
    if not event:
      event = self.get_event(event_id)
      if not event:
        return None

    for rsvp in self.get_rsvps_from_event(event['object']):
      for field in 'actor', 'object':
        field_id = rsvp.get(field, {}).get('id')
        # ids that aren't parseable tag URIs can't match; skip them rather than
        # indexing into a failed parse result.
        parsed = util.parse_tag_uri(field_id) if field_id else None
        if parsed and user_id == parsed[1]:
          return rsvp

    return None
Пример #18
0
    def _fetch_replies(self, r, activities):
        """Fetches and injects comments into a list of activities, in place.

        Limitations: only includes top level comments.

        Args:
          r: PRAW API object for querying submissions in activities
          activities: list of activity dicts. Each one's object gets a 'replies'
            dict with 'items' and 'totalItems'.
        """
        for activity in activities:
            submission_id = util.parse_tag_uri(activity.get('id'))[1]
            subm = r.submission(id=submission_id)

            # for v0 we will use just the top level comments because threading is hard.
            # feature request: https://github.com/snarfed/bridgy/issues/1014
            subm.comments.replace_more()
            converted = [
                self.praw_to_activity(comment, 'comment')
                for comment in subm.comments
            ]

            items = [reply.get('object') for reply in converted]
            activity['object']['replies'] = {
                'items': items,
                'totalItems': len(items),
            }
Пример #19
0
    def get(self):
        """Handles an API GET.

        Request path is of the form /site/user_id/group_id/app_id/activity_id ,
        where each element except site is an optional string object id.

        Builds a source instance for the requested site from request
        parameters, fetches activities (or the blocklist, when the second
        remaining path element is '@blocks'), and writes the response.

        Raises:
          exc.HTTPNotFound: if there are too many path elements or the site is
            unknown
          exc.HTTPBadRequest: if a tag URI path element has a different domain
            than the source's
        """
        # parse path
        # NOTE(review): str.split() always returns at least one element, so
        # `not args` is never true here; an empty path yields [''] instead.
        args = urllib.parse.unquote(self.request.path).strip('/').split('/')
        if not args or len(args) > MAX_PATH_LEN:
            raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                                   (MAX_PATH_LEN, len(args)))

        # hard-coded refusal for this one user id, regardless of site
        if len(args) > 1 and args[1] == 'nederland20':
            return self.abort(
                401,
                'To protect our users from spam and other malicious activity, this account is temporarily locked. Please log in to https://twitter.com to unlock your account.'
            )

        # make source instance
        site = args.pop(0)
        if site == 'twitter':
            src = twitter.Twitter(access_token_key=util.get_required_param(
                self, 'access_token_key'),
                                  access_token_secret=util.get_required_param(
                                      self, 'access_token_secret'))
        elif site == 'facebook':
            # NOTE(review): src is never assigned in this branch; presumably
            # abort() raises so the code below is not reached - confirm.
            self.abort(
                400,
                'Sorry, Facebook is no longer available in the REST API. Try the library instead!'
            )
        elif site == 'flickr':
            src = flickr.Flickr(access_token_key=util.get_required_param(
                self, 'access_token_key'),
                                access_token_secret=util.get_required_param(
                                    self, 'access_token_secret'))
        elif site == 'github':
            src = github.GitHub(
                access_token=util.get_required_param(self, 'access_token'))
        elif site == 'instagram':
            # only scraping is offered, and only when interactive=true is passed
            if self.request.get('interactive').lower() == 'true':
                src = instagram.Instagram(scrape=True)
            else:
                self.abort(
                    400,
                    'Sorry, Instagram is not currently available in the REST API. Try https://instagram-atom.appspot.com/ instead!'
                )
        elif site == 'mastodon':
            src = mastodon.Mastodon(
                instance=util.get_required_param(self, 'instance'),
                access_token=util.get_required_param(self, 'access_token'),
                user_id=util.get_required_param(self, 'user_id'))
        elif site == 'meetup':
            src = meetup.Meetup(access_token_key=util.get_required_param(
                self, 'access_token_key'),
                                access_token_secret=util.get_required_param(
                                    self, 'access_token_secret'))
        elif site == 'pixelfed':
            src = pixelfed.Pixelfed(
                instance=util.get_required_param(self, 'instance'),
                access_token=util.get_required_param(self, 'access_token'),
                user_id=util.get_required_param(self, 'user_id'))
        elif site == 'reddit':
            src = reddit.Reddit(refresh_token=util.get_required_param(
                self, 'refresh_token'
            ))  # the refresh_token should be returned but is not appearing
        else:
            # any other site: look up its class in source.sources and pass all
            # request parameters to its constructor as keyword arguments
            src_cls = source.sources.get(site)
            if not src_cls:
                raise exc.HTTPNotFound('Unknown site %r' % site)
            src = src_cls(**self.request.params)

        # decode tag URI ids
        for i, arg in enumerate(args):
            parsed = util.parse_tag_uri(arg)
            if parsed:
                domain, id = parsed
                if domain != src.DOMAIN:
                    raise exc.HTTPBadRequest(
                        'Expected domain %s in tag URI %s, found %s' %
                        (src.DOMAIN, arg, domain))
                args[i] = id

        # handle default path elements
        # each element that appears in its position's PATH_DEFAULTS entry
        # becomes None. zip() also truncates args to PATH_DEFAULTS's length.
        args = [
            None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)
        ]
        user_id = args[0] if args else None

        # get activities (etc)
        try:
            if len(args) >= 2 and args[1] == '@blocks':
                try:
                    response = {'items': src.get_blocklist()}
                except source.RateLimited as e:
                    # when rate limited, serve whatever partial blocklist came
                    # back; only fail with 429 if there is none
                    if not e.partial:
                        self.abort(429, str(e))
                    response = {'items': e.partial}
            else:
                response = src.get_activities_response(*args,
                                                       **self.get_kwargs())
        except (NotImplementedError, ValueError) as e:
            self.abort(400, str(e))
            # other exceptions are handled by webutil.handlers.handle_exception(),
            # which uses interpret_http_exception(), etc.

        # fetch actor if necessary
        actor = response.get('actor')
        if not actor and self.request.get('format') == 'atom':
            # atom needs actor
            actor = src.get_actor(user_id) if src else {}

        self.write_response(response, actor=actor, url=src.BASE_URL)
Пример #20
0
    def _scrape(self,
                user_id=None,
                group_id=None,
                activity_id=None,
                cookie=None,
                count=None,
                fetch_extras=False,
                cache=None,
                shortcode=None):
        """Scrapes a user's profile or feed and converts the media to activities.

        At least one of user_id, activity_id, shortcode and cookie is required,
        and activity_id and shortcode may not both be given (both enforced
        with assert).

        Args:
          user_id: string
          group_id: string; the profile page is only fetched when this is
            source.SELF and user_id is set
          activity_id: string, e.g. '1020355224898358984_654594'
          count: integer, number of activities to fetch and return, None for all
          fetch_extras: boolean
          cookie: string; 'sessionid=' is prepended if it's missing
          cache: optional object with get_multi() and set_multi(), used to
            remember per-activity like and comment counts between calls
          shortcode: string, e.g. '4pB6vEx87I'

        Returns:
          dict activities API response

        Raises:
          requests.HTTPError: with status 401 if the response looks logged out
            or redirects to the login page; also via raise_for_status() for
            error statuses other than 404
        """
        assert user_id or activity_id or shortcode or cookie
        assert not (activity_id and shortcode)

        if not shortcode:
            shortcode = self.id_to_shortcode(activity_id)

        # URL choice, in priority order: single media page if we have a
        # shortcode, else the user's profile if user_id is set and group_id is
        # SELF, else the base URL
        url = (
            HTML_MEDIA % shortcode if shortcode else HTML_PROFILE %
            user_id if user_id and group_id == source.SELF else HTML_BASE_URL)
        kwargs = {}
        if cookie:
            if not cookie.startswith('sessionid='):
                cookie = 'sessionid=' + cookie
            kwargs = {'headers': {'Cookie': cookie}}
        # redirects are not followed so that a redirect to the login page can
        # be detected below
        resp = util.requests_get(url, allow_redirects=False, **kwargs)
        if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302)
             and '/accounts/login' in resp.headers.get('Location', ''))):
            resp.status_code = 401
            raise requests.HTTPError('401 Unauthorized', response=resp)
        elif resp.status_code == 404:
            if activity_id:
                # retry, this time treating activity_id itself as the shortcode.
                # note that fetch_extras and cache are not passed along.
                return self._scrape(shortcode=activity_id,
                                    cookie=cookie,
                                    count=count)
            # otherwise not found, fall through and return empty response
        else:
            resp.raise_for_status()

        activities, actor = self.html_to_activities(resp.text,
                                                    cookie=cookie,
                                                    count=count)

        if fetch_extras:
            # batch get cached counts of comments and likes for all activities
            cached = {}
            # don't update the cache until the end, in case we hit an error before
            cache_updates = {}
            if cache is not None:
                keys = []
                for activity in activities:
                    _, id = util.parse_tag_uri(activity['id'])
                    # 'AIL <id>' holds the like count, 'AIC <id>' the comment count
                    keys.extend(['AIL ' + id, 'AIC ' + id])
                cached = cache.get_multi(keys)

            for i, activity in enumerate(activities):
                obj = activity['object']
                _, id = util.parse_tag_uri(activity['id'])
                likes = obj.get('ig_like_count') or 0
                comments = obj.get('replies', {}).get('totalItems') or 0
                likes_key = 'AIL %s' % id
                comments_key = 'AIC %s' % id

                # only re-parse the full activity when it has likes or comments
                # and a count differs from the cached one
                if (likes and likes != cached.get(likes_key)
                        or comments and comments != cached.get(comments_key)):
                    if not activity_id and not shortcode:
                        url = activity['url'].replace(self.BASE_URL,
                                                      HTML_BASE_URL)
                        resp = util.requests_get(url)
                        resp.raise_for_status()
                    # otherwise resp is a fetch of just this activity; reuse it

                    full_activity, _ = self.html_to_activities(
                        resp.text,
                        cookie=cookie,
                        count=count,
                        fetch_extras=fetch_extras)
                    if full_activity:
                        activities[i] = full_activity[0]
                        cache_updates.update({
                            likes_key: likes,
                            comments_key: comments
                        })

            if cache_updates and cache is not None:
                cache.set_multi(cache_updates)

        # from here on resp is the API response dict, not the HTTP response
        resp = self.make_activities_base_response(activities)
        resp['actor'] = actor
        return resp
Пример #21
0
    def _scrape(self,
                user_id=None,
                activity_id=None,
                cookie=None,
                fetch_extras=False,
                cache=None):
        """Scrapes a user's profile or feed and converts the media to activities.

        At least one of user_id, activity_id and cookie is required (enforced
        with assert).

        Args:
          user_id: string
          activity_id: string
          fetch_extras: boolean; if true and activity_id is not set, activities
            whose like or comment counts differ from the cached ones are
            re-fetched individually
          cookie: string, sent as the Cookie request header
          cache: optional object with get_multi() and set_multi(), used to
            remember per-activity like and comment counts between calls

        Returns: dict activities API response, including 'actor'

        Raises:
          requests.HTTPError: with status 401 if the response looks logged out
            or redirects to the login page
        """
        assert user_id or activity_id or cookie

        url = (HTML_MEDIA % self.id_to_shortcode(activity_id) if activity_id
               else self.user_url(user_id) if user_id else self.BASE_URL)
        kwargs = {}
        if cookie:
            kwargs = {'headers': {'Cookie': cookie}}
        # redirects are not followed so that a redirect to the login page can
        # be detected below
        resp = util.requests_get(url, allow_redirects=False, **kwargs)
        # requests exposes status_code as an int, so compare against ints;
        # comparing against the strings '301'/'302' never matched.
        if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302)
             and '/accounts/login' in resp.headers.get('Location', ''))):
            resp.status_code = 401
            raise requests.HTTPError('401 Unauthorized', response=resp)

        activities, actor = self.html_to_activities(resp.text)

        if fetch_extras and not activity_id:
            # batch get cached counts of comments and likes for all activities
            cached = {}
            # don't update the cache until the end, in case we hit an error before
            cache_updates = {}
            if cache is not None:
                keys = []
                for activity in activities:
                    _, id = util.parse_tag_uri(activity['id'])
                    # 'AIL <id>' holds the like count, 'AIC <id>' the comment count
                    keys.extend(['AIL ' + id, 'AIC ' + id])
                cached = cache.get_multi(keys)

            for i, activity in enumerate(activities):
                obj = activity['object']
                _, id = util.parse_tag_uri(activity['id'])
                likes = obj.get('ig_like_count') or 0
                comments = obj.get('replies', {}).get('totalItems') or 0
                likes_key = 'AIL %s' % id
                comments_key = 'AIC %s' % id

                # only re-fetch the full activity when it has likes or comments
                # and a count differs from the cached one
                if (likes and likes != cached.get(likes_key)
                        or comments and comments != cached.get(comments_key)):
                    full_activity, _ = self.html_to_activities(
                        util.requests_get(activity['url']).text)
                    if full_activity:
                        activities[i] = full_activity[0]
                        cache_updates.update({
                            likes_key: likes,
                            comments_key: comments
                        })

            if cache_updates and cache is not None:
                cache.set_multi(cache_updates)

        # from here on resp is the API response dict, not the HTTP response
        resp = self.make_activities_base_response(activities)
        resp['actor'] = actor
        return resp
Пример #22
0
    def get(self):
        """Handles an API GET.

        Request path is of the form /site/user_id/group_id/app_id/activity_id ,
        where each element except site is an optional string object id.

        The response is served from memcache when a cached copy exists, unless
        the `cache` query parameter is 'false'; otherwise it is fetched from the
        source, written out, and then cached. Aborts with HTTP 400 if the source
        raises NotImplementedError or ValueError.

        Raises:
          exc.HTTPNotFound: if the path has an unexpected number of elements or
            the site is unknown
          exc.HTTPBadRequest: if a tag URI path element has the wrong domain
        """
        # parse path
        args = urllib.unquote(self.request.path).strip('/').split('/')
        if not args or len(args) > MAX_PATH_LEN:
            raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                                   (MAX_PATH_LEN, len(args)))

        # make source instance
        site = args.pop(0)
        if site == 'twitter':
            src = twitter.Twitter(
                access_token_key=util.get_required_param(
                    self, 'access_token_key'),
                access_token_secret=util.get_required_param(
                    self, 'access_token_secret'))
        elif site == 'facebook':
            src = facebook.Facebook(
                access_token=util.get_required_param(self, 'access_token'))
        elif site == 'flickr':
            src = flickr.Flickr(
                access_token_key=util.get_required_param(
                    self, 'access_token_key'),
                access_token_secret=util.get_required_param(
                    self, 'access_token_secret'))
        elif site == 'instagram':
            src = instagram.Instagram(scrape=True)
        elif site == 'google+':
            auth_entity = util.get_required_param(self, 'auth_entity')
            src = googleplus.GooglePlus(auth_entity=ndb.Key(
                urlsafe=auth_entity).get())
        else:
            src_cls = source.sources.get(site)
            if not src_cls:
                raise exc.HTTPNotFound('Unknown site %r' % site)
            src = src_cls(**self.request.params)

        # decode tag URI ids
        for i, arg in enumerate(args):
            parsed = util.parse_tag_uri(arg)
            if parsed:
                domain, object_id = parsed
                if domain != src.DOMAIN:
                    raise exc.HTTPBadRequest(
                        'Expected domain %s in tag URI %s, found %s' %
                        (src.DOMAIN, arg, domain))
                args[i] = object_id

        # check if request is cached
        # NOTE(review): the cache key is built from the path only, so requests
        # that differ only in query parameters (e.g. access tokens or format)
        # share one cached entry. Confirm this is intended.
        cache = self.request.get('cache', '').lower() != 'false'
        if cache:
            cache_key = 'R %s' % self.request.path
            cached = memcache.get(cache_key)
            if cached:
                logging.info('Serving cached response %r', cache_key)
                self.write_response(cached['response'],
                                    actor=cached['actor'],
                                    url=src.BASE_URL)
                return

        # handle default path elements
        args = [
            None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)
        ]
        user_id = args[0] if args else None

        # get activities (etc)
        try:
            if len(args) >= 2 and args[1] == '@blocks':
                response = {'items': src.get_blocklist()}
            else:
                response = src.get_activities_response(*args,
                                                       **self.get_kwargs(src))
        except (NotImplementedError, ValueError) as e:
            self.abort(400, str(e))
            # other exceptions are handled by webutil.handlers.handle_exception(),
            # which uses interpret_http_exception(), etc.

        # fetch actor if necessary
        actor = response.get('actor')
        if not actor and self.request.get('format') == 'atom':
            # atom needs actor. Default path elements were already replaced
            # above, so user_id is reused as is.
            actor = src.get_actor(user_id) if src else {}

        self.write_response(response, actor=actor, url=src.BASE_URL)

        # cache response
        if cache:
            logging.info('Caching response in %r', cache_key)
            memcache.set(cache_key, {
                'response': response,
                'actor': actor
            }, src.RESPONSE_CACHE_TIME)
Пример #23
0
  def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    A second path element of '@blocks' returns the source's block list instead
    of activities. If fetching it is rate limited, any partial result carried
    by the exception is returned; otherwise the request aborts with HTTP 429.
    Aborts with HTTP 400 if the source raises NotImplementedError or
    ValueError.

    Raises:
      exc.HTTPNotFound: if the path has an unexpected number of elements or
        the site is unknown
      exc.HTTPBadRequest: if a tag URI path element has the wrong domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
      raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                             (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
      src = twitter.Twitter(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'facebook':
      src = facebook.Facebook(
        access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
      src = flickr.Flickr(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'github':
      src = github.GitHub(
        access_token=util.get_required_param(self, 'access_token'))
    elif site == 'instagram':
      src = instagram.Instagram(scrape=True)
    else:
      src_cls = source.sources.get(site)
      if not src_cls:
        raise exc.HTTPNotFound('Unknown site %r' % site)
      src = src_cls(**self.request.params)

    # decode tag URI ids
    for i, arg in enumerate(args):
      parsed = util.parse_tag_uri(arg)
      if parsed:
        domain, object_id = parsed
        if domain != src.DOMAIN:
          raise exc.HTTPBadRequest('Expected domain %s in tag URI %s, found %s' %
                                   (src.DOMAIN, arg, domain))
        args[i] = object_id

    # handle default path elements
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities (etc)
    try:
      if len(args) >= 2 and args[1] == '@blocks':
        try:
          response = {'items': src.get_blocklist()}
        except source.RateLimited as e:
          if not e.partial:
            self.abort(429, str(e))
          # rate limited part way through; return what we got
          response = {'items': e.partial}
      else:
        response = src.get_activities_response(*args, **self.get_kwargs())
    except (NotImplementedError, ValueError) as e:
      self.abort(400, str(e))
      # other exceptions are handled by webutil.handlers.handle_exception(),
      # which uses interpret_http_exception(), etc.

    # fetch actor if necessary
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
      # atom needs actor. Default path elements were already replaced above,
      # so user_id is reused as is.
      actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)
Пример #24
0
  def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    A second path element of '@blocks' returns the source's block list instead
    of activities. If fetching it is rate limited, any partial result carried
    by the exception is returned; otherwise the request aborts with HTTP 429.
    Aborts with HTTP 400 if the source raises NotImplementedError or
    ValueError.

    Raises:
      exc.HTTPNotFound: if the path has an unexpected number of elements or
        the site is unknown
      exc.HTTPBadRequest: if a tag URI path element has the wrong domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
      raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                             (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
      src = twitter.Twitter(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'facebook':
      src = facebook.Facebook(
        access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
      src = flickr.Flickr(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'github':
      src = github.GitHub(
        access_token=util.get_required_param(self, 'access_token'))
    elif site == 'instagram':
      src = instagram.Instagram(scrape=True)
    else:
      src_cls = source.sources.get(site)
      if not src_cls:
        raise exc.HTTPNotFound('Unknown site %r' % site)
      src = src_cls(**self.request.params)

    # decode tag URI ids
    for i, arg in enumerate(args):
      parsed = util.parse_tag_uri(arg)
      if parsed:
        domain, object_id = parsed
        if domain != src.DOMAIN:
          raise exc.HTTPBadRequest('Expected domain %s in tag URI %s, found %s' %
                                   (src.DOMAIN, arg, domain))
        args[i] = object_id

    # handle default path elements
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities (etc)
    try:
      if len(args) >= 2 and args[1] == '@blocks':
        try:
          response = {'items': src.get_blocklist()}
        except source.RateLimited as e:
          if not e.partial:
            self.abort(429, str(e))
          # rate limited part way through; return what we got
          response = {'items': e.partial}
      else:
        response = src.get_activities_response(*args, **self.get_kwargs())
    except (NotImplementedError, ValueError) as e:
      self.abort(400, str(e))
      # other exceptions are handled by webutil.handlers.handle_exception(),
      # which uses interpret_http_exception(), etc.

    # fetch actor if necessary
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
      # atom needs actor. Default path elements were already replaced above,
      # so user_id is reused as is.
      actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)
Пример #25
0
  def _scrape(self, user_id=None, activity_id=None, cookie=None,
              fetch_extras=False, cache=None, shortcode=None):
    """Scrapes a media page, profile, or the base page into activities.

    At most one of activity_id and shortcode may be given. When neither
    resolves to a shortcode, the user's profile is fetched if user_id is set,
    otherwise the base URL.

    Args:
      user_id: string
      activity_id: string, e.g. '1020355224898358984_654594'
      fetch_extras: boolean, whether to re-fetch each activity whose nonzero
        like or comment count differs from the cached count
      cookie: string, sent as the Cookie header of the initial request
      cache: optional object with get_multi() and set_multi() methods
      shortcode: string, e.g. '4pB6vEx87I'

    Returns:
      dict activities API response

    Raises:
      requests.HTTPError: wrapping a 401 response, if the page says we're not
        logged in or redirects to the login page
    """
    assert user_id or activity_id or shortcode or cookie
    assert not (activity_id and shortcode)

    shortcode = shortcode or self.id_to_shortcode(activity_id)

    if shortcode:
      url = HTML_MEDIA % shortcode
    elif user_id:
      url = HTML_PROFILE % user_id
    else:
      url = HTML_BASE_URL

    request_kwargs = {'headers': {'Cookie': cookie}} if cookie else {}
    resp = util.requests_get(url, allow_redirects=False, **request_kwargs)

    # either signal means the request wasn't authenticated
    login_required = cookie and 'not-logged-in' in resp.text
    if not login_required:
      login_required = (resp.status_code in (301, 302) and
                        '/accounts/login' in resp.headers.get('Location', ''))
    if login_required:
      resp.status_code = 401
      raise requests.HTTPError('401 Unauthorized', response=resp)

    if resp.status_code != 404:
      resp.raise_for_status()
    elif activity_id:
      # not found by id; retry, treating the given id as a shortcode
      return self._scrape(shortcode=activity_id, cookie=cookie)
    # otherwise not found; fall through and convert whatever came back

    activities, actor = self.html_to_activities(resp.text)

    if fetch_extras and not activity_id:
      # look up the stored like and comment counts for every activity at once
      stored_counts = {}
      # collect new counts here and only write them once everything succeeded
      pending_counts = {}
      if cache is not None:
        lookup_keys = []
        for activity in activities:
          _, tag_id = util.parse_tag_uri(activity['id'])
          lookup_keys.extend(['AIL ' + tag_id, 'AIC ' + tag_id])
        stored_counts = cache.get_multi(lookup_keys)

      for index, activity in enumerate(activities):
        obj = activity['object']
        _, tag_id = util.parse_tag_uri(activity['id'])
        num_likes = obj.get('ig_like_count') or 0
        num_comments = obj.get('replies', {}).get('totalItems') or 0
        likes_key = 'AIL %s' % tag_id
        comments_key = 'AIC %s' % tag_id

        changed = num_likes and num_likes != stored_counts.get(likes_key)
        if not changed:
          changed = (num_comments and
                     num_comments != stored_counts.get(comments_key))
        if not changed:
          continue

        media_url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
        media_resp = util.requests_get(media_url)
        media_resp.raise_for_status()
        full_activity, _ = self.html_to_activities(media_resp.text)
        if full_activity:
          activities[index] = full_activity[0]
          pending_counts.update({likes_key: num_likes,
                                 comments_key: num_comments})

      if pending_counts and cache is not None:
        cache.set_multi(pending_counts)

    response = self.make_activities_base_response(activities)
    response['actor'] = actor
    return response
Пример #26
0
  def get(self):
    """Handles an API GET.

    Request path is of the form /site/user_id/group_id/app_id/activity_id ,
    where each element except site is an optional string object id.

    The response is served from memcache when a cached copy exists, unless the
    `cache` query parameter is 'false'; otherwise it is fetched from the
    source, written out, and then cached. Aborts with HTTP 400 if the source
    raises NotImplementedError, and with HTTP 504 if the failure looks like a
    connection failure; any other exception propagates.

    Raises:
      exc.HTTPNotFound: if the path has an unexpected number of elements or
        the site is unknown
      exc.HTTPBadRequest: if a tag URI path element has the wrong domain
    """
    # parse path
    args = urllib.unquote(self.request.path).strip('/').split('/')
    if not args or len(args) > MAX_PATH_LEN:
      raise exc.HTTPNotFound('Expected 1-%d path elements; found %d' %
                             (MAX_PATH_LEN, len(args)))

    # make source instance
    site = args.pop(0)
    if site == 'twitter':
      src = twitter.Twitter(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'facebook':
      src = facebook.Facebook(
        access_token=util.get_required_param(self, 'access_token'))
    elif site == 'flickr':
      src = flickr.Flickr(
        access_token_key=util.get_required_param(self, 'access_token_key'),
        access_token_secret=util.get_required_param(self, 'access_token_secret'))
    elif site == 'instagram':
      src = instagram.Instagram(scrape=True)
    elif site == 'google+':
      auth_entity = util.get_required_param(self, 'auth_entity')
      src = googleplus.GooglePlus(auth_entity=ndb.Key(urlsafe=auth_entity).get())
    else:
      src_cls = source.sources.get(site)
      if not src_cls:
        raise exc.HTTPNotFound('Unknown site %r' % site)
      src = src_cls(**self.request.params)

    # decode tag URI ids
    for i, arg in enumerate(args):
      parsed = util.parse_tag_uri(arg)
      if parsed:
        domain, object_id = parsed
        if domain != src.DOMAIN:
          raise exc.HTTPBadRequest('Expected domain %s in tag URI %s, found %s' %
                                   (src.DOMAIN, arg, domain))
        args[i] = object_id

    # check if request is cached
    # NOTE(review): the cache key is built from the path only, so requests that
    # differ only in query parameters (e.g. access tokens or format) share one
    # cached entry. Confirm this is intended.
    cache = self.request.get('cache', '').lower() != 'false'
    if cache:
      cache_key = 'R %s' % self.request.path
      cached = memcache.get(cache_key)
      if cached:
        logging.info('Serving cached response %r', cache_key)
        self.write_response(cached['response'], actor=cached['actor'],
                            url=src.BASE_URL)
        return

    # handle default path elements
    args = [None if a in defaults else a
            for a, defaults in zip(args, PATH_DEFAULTS)]
    user_id = args[0] if args else None

    # get activities
    try:
      response = src.get_activities_response(*args, **self.get_kwargs(src))
    except NotImplementedError as e:
      self.abort(400, str(e))
    except Exception as e:
      if util.is_connection_failure(e):
        # HTTP 504 Gateway Timeout
        self.abort(504, str(e))
      raise

    # fetch actor if necessary
    actor = response.get('actor')
    if not actor and self.request.get('format') == 'atom':
      # atom needs actor. Default path elements were already replaced above,
      # so user_id is reused as is.
      actor = src.get_actor(user_id) if src else {}

    self.write_response(response, actor=actor, url=src.BASE_URL)

    # cache response
    if cache:
      logging.info('Caching response in %r', cache_key)
      memcache.set(cache_key, {'response': response, 'actor': actor},
                   src.RESPONSE_CACHE_TIME)