Example #1
  def source_url(self, target_url):
    # parse the response id. (we know Response key ids are always tag URIs)
    _, response_id = util.parse_tag_uri(self.entity.key.string_id())
    if self.entity.type in ('like', 'repost', 'rsvp'):
      response_id = response_id.split('_')[-1]

    # determine which activity to use
    activity = self.activities[0]
    if self.entity.urls_to_activity:
      urls_to_activity = json.loads(self.entity.urls_to_activity)
      if urls_to_activity:
        activity = self.activities[urls_to_activity[target_url]]

    # generate source URL
    id = activity['id']
    parsed = util.parse_tag_uri(id)
    post_id = parsed[1] if parsed else id
    # prefer brid-gy.appspot.com to brid.gy because non-browsers (e.g. OpenSSL)
    # currently have problems with brid.gy's SSL cert. details:
    # https://github.com/snarfed/bridgy/issues/20
    if (self.request.host_url.endswith('brid.gy') or
        self.request.host_url.endswith('brid-gy.appspot.com')):
      host_url = 'https://brid-gy.appspot.com'
    else:
      host_url = self.request.host_url

    return '%s/%s/%s/%s/%s/%s' % (
      host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
      self.entity.source.string_id(), post_id, response_id)
Example #2
    def source_url(self, target_url):
        # determine which activity to use
        try:
            activity = self.activities[0]
            if self.entity.urls_to_activity:
                urls_to_activity = json.loads(self.entity.urls_to_activity)
                if urls_to_activity:
                    activity = self.activities[urls_to_activity[target_url]]
        except (KeyError, IndexError):
            logging.warning(
                """\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError/IndexError!
target url %s not in urls_to_activity: %s
activities: %s""",
                target_url,
                self.entity.urls_to_activity,
                self.activities,
            )
            self.abort(ERROR_HTTP_RETURN_CODE)

        # generate source URL
        id = activity["id"]
        parsed = util.parse_tag_uri(id)
        post_id = parsed[1] if parsed else id
        # prefer brid-gy.appspot.com to brid.gy because non-browsers (e.g. OpenSSL)
        # currently have problems with brid.gy's SSL cert. details:
        # https://github.com/snarfed/bridgy/issues/20
        if self.request.host_url.endswith("brid.gy") or self.request.host_url.endswith("brid-gy.appspot.com"):
            host_url = "https://brid-gy.appspot.com"
        else:
            host_url = self.request.host_url

        path = [
            host_url,
            self.entity.type,
            self.entity.source.get().SHORT_NAME,
            self.entity.source.string_id(),
            post_id,
        ]

        if self.entity.type != "post":
            # parse and add response id. (we know Response key ids are always tag URIs)
            _, response_id = util.parse_tag_uri(self.entity.key.string_id())
            reaction_id = response_id
            if self.entity.type in ("like", "react", "repost", "rsvp"):
                response_id = response_id.split("_")[-1]  # extract responder user id
            path.append(response_id)
            if self.entity.type == "react":
                path.append(reaction_id)

        return "/".join(path)
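For orientation, a hypothetical walk-through of the URL this method builds. The source name, user id, post id, and response id below are invented, and the real handler's output may differ:

# Hypothetical values, for illustration only: a Twitter 'comment' Response
# whose post id parsed to '123' and response id to '999' joins into a
# Bridgy handler URL roughly like this.
print("/".join(["https://brid-gy.appspot.com", "comment", "twitter",
                "alice", "123", "999"]))
# -> https://brid-gy.appspot.com/comment/twitter/alice/123/999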
Example #3
    def dispatch_request(self, *args):
        source = self.auth()

        gr_src = self.gr_source()
        id = request.values['id']

        # validate request
        parsed_id = util.parse_tag_uri(id)
        if not parsed_id:
            self.error(f'Scrape error: expected id to be tag URI; got {id}')

        activity = Activity.get_by_id(id)
        if not activity:
            self.error(f'No {gr_src.NAME} post found for id {id}', 404)
        elif activity.source != source.key:
            self.error(
                f'Activity {id} is owned by {activity.source}, not {source.key}',
                403)

        activity_data = json_loads(activity.activity_json)

        # convert new extras to AS, merge into existing activity
        try:
            new_extras = getattr(gr_src, self.MERGE_METHOD)(
                request.get_data(as_text=True), activity_data)
        except ValueError as e:
            self.error(f"Scrape error: couldn't parse extras: {e}")

        activity.activity_json = json_dumps(activity_data)
        activity.put()

        extra_ids = ' '.join(c['id'] for c in new_extras)
        logger.info(f"Stored extras for activity {id}: {extra_ids}")
        return jsonify(new_extras)
Example #4
    def source_url(self, target_url):
        # determine which activity to use
        activity = self.activities[0]
        if self.entity.urls_to_activity:
            urls_to_activity = json.loads(self.entity.urls_to_activity)
            if urls_to_activity:
                try:
                    activity = self.activities[urls_to_activity[target_url]]
                except KeyError:
                    logging.warning(
                        """\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, urls_to_activity, self.activities)
                    self.abort(ERROR_HTTP_RETURN_CODE)

        # generate source URL
        id = activity['id']
        parsed = util.parse_tag_uri(id)
        post_id = parsed[1] if parsed else id
        # prefer brid-gy.appspot.com to brid.gy because non-browsers (e.g. OpenSSL)
        # currently have problems with brid.gy's SSL cert. details:
        # https://github.com/snarfed/bridgy/issues/20
        if (self.request.host_url.endswith('brid.gy')
                or self.request.host_url.endswith('brid-gy.appspot.com')):
            host_url = 'https://brid-gy.appspot.com'
        else:
            host_url = self.request.host_url

        path = [
            host_url, self.entity.type,
            self.entity.source.get().SHORT_NAME,
            self.entity.source.string_id(), post_id
        ]

        if self.entity.type != 'post':
            # parse and add response id. (we know Response key ids are always tag URIs)
            _, response_id = util.parse_tag_uri(self.entity.key.string_id())
            reaction_id = response_id
            if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
                response_id = response_id.split('_')[-1]  # extract responder user id
            path.append(response_id)
            if self.entity.type == 'react':
                path.append(reaction_id)

        return '/'.join(path)
Example #5
 def get_comment(self, comment_id, activity=None, **kwargs):
     """Uses the activity passed in the activity kwarg."""
     if activity:
         for reply in activity.get('object', {}).get('replies',
                                                     {}).get('items', []):
             parsed = util.parse_tag_uri(reply.get('id', ''))
             if parsed and parsed[1] == comment_id:
                 return reply
Example #6
 def remove_bad_ids(objs, label):
   ret = []
   for o in objs:
     id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '')
     if id and ':' in id[1]:
       logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1])
     else:
       ret.append(o)
   return ret
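A hedged demo of remove_bad_ids() on invented data. It assumes util.parse_tag_uri() returns everything after the second colon as the id, consistent with the tests in Example #15 below:

# Hypothetical demo: 'tag:fb.com,2013:1_2:3' is assumed to parse to
# ('fb.com', '1_2:3'); the colon in the parsed id marks it as bad, so that
# object is dropped and only the clean one survives.
objs = [{'id': 'tag:fb.com,2013:12345'},  # kept
        {'id': 'tag:fb.com,2013:1_2:3'}]  # dropped: colon in parsed id
assert remove_bad_ids(objs, 'activity') == [{'id': 'tag:fb.com,2013:12345'}]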
Example #7
    def post(self):
        logging.debug('Params: %s', self.request.params)

        type = self.request.get('type')
        if type:
            assert type in ('event', )

        source = util.load_source(self)
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        post_id = util.get_required_param(self, 'post_id')
        source.updates = {}

        try:
            if type == 'event':
                activities = [source.gr_source.get_event(post_id)]
            else:
                activities = source.get_activities(fetch_replies=True,
                                                   fetch_likes=True,
                                                   fetch_shares=True,
                                                   activity_id=post_id,
                                                   user_id=source.key.id())

            if not activities or not activities[0]:
                logging.info('Post %s not found.', post_id)
                return
            assert len(activities) == 1, activities
            self.backfeed(source,
                          activities={activities[0]['id']: activities[0]})

            obj = activities[0].get('object') or activities[0]
            in_reply_to = util.get_first(obj, 'inReplyTo')
            if in_reply_to:
                parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
                if parsed:
                    util.add_discover_task(source, parsed[1])

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if (code and (code in source.RATE_LIMIT_HTTP_CODES
                          or code in ('400', '404') or int(code) // 100 == 5)
                    or util.is_connection_failure(e)):
                logging.error('API call failed; giving up. %s: %s\n%s', code,
                              body, e)
                self.abort(util.ERROR_HTTP_RETURN_CODE)
            else:
                raise
Example #8
    def dispatch_request(self):
        logger.debug(f'Params: {list(request.values.items())}')
        g.TRANSIENT_ERROR_HTTP_CODES = ('400', '404')

        type = request.values.get('type')
        if type:
            assert type in ('event', )

        source = g.source = util.load_source()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logger.error('Source not found or disabled. Dropping task.')
            return ''
        logger.info(
            f'Source: {source.label()} {source.key_id()}, {source.bridgy_url()}'
        )

        post_id = request.values['post_id']
        source.updates = {}

        if type == 'event':
            activities = [source.gr_source.get_event(post_id)]
        else:
            activities = source.get_activities(fetch_replies=True,
                                               fetch_likes=True,
                                               fetch_shares=True,
                                               activity_id=post_id,
                                               user_id=source.key_id())

        if not activities or not activities[0]:
            logger.info(f'Post {post_id} not found.')
            return ''
        assert len(activities) == 1, activities
        activity = activities[0]
        activities = {activity['id']: activity}

        # STATE: propagate tasks created by backfeed() here can get started
        # before their Response entities get created/updated, so they fail with
        # https://github.com/snarfed/bridgy/issues/237 . That's a red herring,
        # though: the real problem is that activities_json and urls_to_activity
        # are empty. Is poll transactional somehow, and this isn't?
        # There are no more transactional tasks:
        # https://github.com/googleapis/python-tasks/issues/26
        # They're still supported in the new "bundled services" offering, but
        # that seems like a dead end.
        # https://groups.google.com/g/google-appengine/c/22BKInlWty0/m/05ObNEdsAgAJ
        self.backfeed(source, responses=activities, activities=activities)

        obj = activity.get('object') or activity
        in_reply_to = util.get_first(obj, 'inReplyTo')
        if in_reply_to:
            parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
            if parsed:
                util.add_discover_task(source, parsed[1])

        return 'OK'
Example #9
  def source_url(self, target_url):
    # parse the response id. (we know Response key ids are always tag URIs)
    _, response_id = util.parse_tag_uri(self.entity.key.string_id())
    if self.entity.type in ('like', 'repost', 'rsvp'):
      response_id = response_id.split('_')[-1]

    # determine which activity to use
    activity = self.activities[0]
    if self.entity.urls_to_activity:
      urls_to_activity = json.loads(self.entity.urls_to_activity)
      if urls_to_activity:
        try:
          activity = self.activities[urls_to_activity[target_url]]
        except KeyError:
          logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, urls_to_activity, self.activities)
          self.abort(ERROR_HTTP_RETURN_CODE)

    # generate source URL
    id = activity['id']
    parsed = util.parse_tag_uri(id)
    post_id = parsed[1] if parsed else id
    # prefer brid-gy.appspot.com to brid.gy because non-browsers (e.g. OpenSSL)
    # currently have problems with brid.gy's SSL cert. details:
    # https://github.com/snarfed/bridgy/issues/20
    if (self.request.host_url.endswith('brid.gy') or
        self.request.host_url.endswith('brid-gy.appspot.com')):
      host_url = 'https://brid-gy.appspot.com'
    else:
      host_url = self.request.host_url

    path = [host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
            self.entity.source.string_id(), post_id]
    if self.entity.type != 'post':
      path.append(response_id)
    return '/'.join(path)
Example #10
 def get_like(self,
              activity_user_id,
              activity_id,
              like_user_id,
              activity=None,
              **kwargs):
     """Uses the activity passed in the activity kwarg."""
     if activity:
         for tag in activity.get('object', {}).get('tags', []):
             if tag.get('verb') == 'like':
                 parsed = util.parse_tag_uri(
                     tag.get('author', {}).get('id', ''))
                 if parsed and parsed[1] == like_user_id:
                     return tag
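A hypothetical call showing the activity shape the loop above expects. The ids are invented, and source stands in for whatever object defines get_like():

# Hypothetical data: get_like() walks object.tags, keeps tags whose verb is
# 'like', and matches on the parsed author id.
activity = {'object': {'tags': [{
    'verb': 'like',
    'author': {'id': 'tag:twitter.com,2013:alice'},
}]}}
# parse_tag_uri(...)[1] == 'alice' matches like_user_id, so the tag is returned.
like = source.get_like('bob', '123', 'alice', activity=activity)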
Example #11
    def source_url(self, target_url):
        # determine which activity to use
        try:
            activity = self.activities[0]
            if self.entity.urls_to_activity:
                urls_to_activity = json_loads(self.entity.urls_to_activity)
                if urls_to_activity:
                    activity = self.activities[urls_to_activity[target_url]]
        except (KeyError, IndexError):
            logging.warning(
                """\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError/IndexError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, self.entity.urls_to_activity, self.activities)
            self.abort(util.ERROR_HTTP_RETURN_CODE)

        # generate source URL
        id = activity['id']
        parsed = util.parse_tag_uri(id)
        post_id = parsed[1] if parsed else id
        parts = [
            util.host_url(self), self.entity.type, self.source.SHORT_NAME,
            self.source.key.string_id(), post_id
        ]

        if self.entity.type != 'post':
            # parse and add response id. (we know Response key ids are always tag URIs)
            _, response_id = util.parse_tag_uri(self.entity.key.string_id())
            reaction_id = response_id
            if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
                response_id = response_id.split('_')[-1]  # extract responder user id
            parts.append(response_id)
            if self.entity.type == 'react':
                parts.append(reaction_id)

        return '/'.join(parts)
Example #12
  def post(self):
    logging.debug('Params: %s', self.request.params)

    type = self.request.get('type')
    if type:
      assert type in ('event',)

    key = util.get_required_param(self, 'source_key')
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    post_id = util.get_required_param(self, 'post_id')
    source.updates = {}

    try:
      if type == 'event':
        activities = [source.gr_source.get_event(post_id)]
      else:
        activities = source.get_activities(
          fetch_replies=True, fetch_likes=True, fetch_shares=True,
          activity_id=post_id, user_id=source.key.id())

      if not activities or not activities[0]:
        logging.info('Post %s not found.', post_id)
        return
      assert len(activities) == 1, activities
      self.backfeed(source, activities={activities[0]['id']: activities[0]})

      in_reply_to = util.get_first(activities[0]['object'], 'inReplyTo')
      if in_reply_to:
        parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
        if parsed:
          util.add_discover_task(source, parsed[1])

    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if (code and (code in util.HTTP_RATE_LIMIT_CODES or
                    code in ('400', '404') or
                    int(code) // 100 == 5)
            or util.is_connection_failure(e)):
        logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #13
    def post(self):
        logging.debug('Params: %s', list(self.request.params.items()))

        type = self.request.get('type')
        if type:
            assert type in ('event', )

        source = self.source = util.load_source(self)
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(), source.key_id(),
                     source.bridgy_url(self))

        post_id = util.get_required_param(self, 'post_id')
        source.updates = {}

        if type == 'event':
            activities = [source.gr_source.get_event(post_id)]
        else:
            activities = source.get_activities(fetch_replies=True,
                                               fetch_likes=True,
                                               fetch_shares=True,
                                               activity_id=post_id,
                                               user_id=source.key_id())

        if not activities or not activities[0]:
            logging.info('Post %s not found.', post_id)
            return
        assert len(activities) == 1, activities
        activity = activities[0]
        activities = {activity['id']: activity}
        self.backfeed(source, responses=activities, activities=activities)

        obj = activity.get('object') or activity
        in_reply_to = util.get_first(obj, 'inReplyTo')
        if in_reply_to:
            parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
            if parsed:
                util.add_discover_task(source, parsed[1])
Example #14
    def post(self, *args):
        source = self.auth()

        gr_src = self.gr_source()
        id = util.get_required_param(self, 'id')

        # validate request
        parsed_id = util.parse_tag_uri(id)
        if not parsed_id:
            self.abort(400, f'Expected id to be tag URI; got {id}')

        activity = Activity.get_by_id(id)
        if not activity:
            self.abort(404, f'No {gr_src.NAME} post found for id {id}')
        elif activity.source != source.key:
            self.abort(
                403,
                f'Activity {id} is owned by {activity.source}, not {source.key}'
            )

        activity_data = json_loads(activity.activity_json)

        # convert new reactions to AS, merge into existing activity
        try:
            new_reactions = gr_src.merge_scraped_reactions(
                self.request.text, activity_data)
        except ValueError as e:
            msg = "Couldn't parse scraped reactions: %s" % e
            logging.error(msg, stack_info=True)
            self.abort(400, msg)

        activity.activity_json = json_dumps(activity_data)
        activity.put()

        reaction_ids = ' '.join(r['id'] for r in new_reactions)
        logging.info(f"Stored reactions for activity {id}: {reaction_ids}")
        self.output(new_reactions)
Example #15
 def test_parse_tag_uri(self):
   self.assertEqual(('x.com', 'foo'), util.parse_tag_uri('tag:x.com,2013:foo'))
   self.assertEqual(('x.com', 'foo'), util.parse_tag_uri('tag:x.com:foo'))
   self.assertIsNone(util.parse_tag_uri('asdf'))
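These assertions pin down the contract: 'tag:DOMAIN[,YEAR]:ID' yields (domain, id), and anything else yields None. A minimal sketch that satisfies them, keeping in mind that granary's real util.parse_tag_uri() may differ in detail:

import re

def parse_tag_uri_sketch(uri):
    """Minimal sketch, not granary's actual code: parses an RFC 4151-style
    tag URI 'tag:DOMAIN[,YEAR]:ID' into (domain, id), or returns None."""
    match = re.match(r'^tag:([^,:]+)(?:,\d+)?:(.+)$', uri)
    return match.groups() if match else None

assert parse_tag_uri_sketch('tag:x.com,2013:foo') == ('x.com', 'foo')
assert parse_tag_uri_sketch('tag:x.com:foo') == ('x.com', 'foo')
assert parse_tag_uri_sketch('asdf') is None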
Example #16
    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json_loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(fetch_replies=True,
                                              fetch_likes=True,
                                              fetch_shares=True,
                                              fetch_mentions=True,
                                              count=50,
                                              etag=source.last_activities_etag,
                                              min_id=source.last_activity_id,
                                              cache=cache)
        etag = resp.get('etag')  # used later
        user_activities = resp.get('items', [])

        # these map ids to AS objects
        responses = {a['id']: a for a in links}
        activities = {a['id']: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = str(id) > str(last_activity_id)
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json_dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        self.backfeed(source, responses, activities=activities)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Possibly refetch updated syndication urls.
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if ('BadRequestError' in str(e.__class__)
                            or 'Timeout' in str(e.__class__)
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     stack_info=True)
                    else:
                        raise
        else:
            logging.info(
                'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
                source.last_syndication_url, source.last_hfeed_refetch)
Example #17
    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(
            fetch_replies=True,
            fetch_likes=True,
            fetch_shares=True,
            fetch_mentions=True,
            count=50,
            etag=source.last_activities_etag,
            min_id=source.last_activity_id,
            cache=cache,
        )
        etag = resp.get("etag")  # used later
        user_activities = resp.get("items", [])

        # these map ids to AS objects
        responses = {a["id"]: a for a in links}
        activities = {a["id"]: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates["last_activity_id"] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates["last_activities_cache_json"] = json.dumps(
            {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
        )

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info("Found %d public activities: %s", len(public), public.keys())
        logging.info("Found %d private activities: %s", len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls([a.get("published") for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published))

        source.updates["recent_private_posts"] = len(
            [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post]
        )

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get("object") or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get("author", {}).get("id") != user_id:
                for tag in obj.get("tags", []):
                    urls = tag.get("urls")
                    if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                        activity["mentions"].update(u.get("value") for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get("attachments", []):
                if (
                    att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()
                ):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if "originals" not in activity or "mentions" not in activity:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get("replies", {}).get("items", [])
            tags = obj.get("tags", [])
            likes = [t for t in tags if Response.get_type(t) == "like"]
            reactions = [t for t in tags if Response.get_type(t) == "react"]
            reposts = [t for t in tags if Response.get_type(t) == "repost"]
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get("id")
                if not id:
                    logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                    continue

                resp.setdefault("activities", []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp)
                    resp["activities"].extend(existing.get("activities", []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen["id"]
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop("activities", [])
            if not activities and resp_type == "post":
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity["originals"], mentions=activity["mentions"]
                )
                if targets:
                    logging.info(
                        "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets)
                    )
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t)
                        too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get("originals", []),
            )
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses)

        source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
        if etag and etag != source.last_activities_etag:
            source.updates["last_activities_etag"] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info("refetching h-feed for source %s", source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates["last_hfeed_refetch"] = now

            if relationships:
                logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if isinstance(
                        e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                    ) or util.is_connection_failure(e):
                        logging.info("Timeout while repropagating responses.", exc_info=True)
                    else:
                        raise
Example #18
    def get(self, type, source_short_name, string_id, *ids):
        source_cls = models.sources.get(source_short_name)
        if not source_cls:
            self.abort(
                400, "Source type '%s' not found. Known sources: %s" %
                (source_short_name, filter(None, models.sources.keys())))

        self.source = source_cls.get_by_id(string_id)
        if not self.source:
            self.abort(
                400, 'Source %s %s not found' % (source_short_name, string_id))
        elif (self.source.status == 'disabled'
              or ('listen' not in self.source.features
                  and 'email' not in self.source.features)):
            self.abort(
                400, 'Source %s is disabled for backfeed' %
                self.source.bridgy_path())

        format = self.request.get('format', 'html')
        if format not in ('html', 'json'):
            self.abort(400,
                       'Invalid format %s, expected html or json' % format)

        for id in ids:
            if not self.VALID_ID.match(id):
                self.abort(404, 'Invalid id %s' % id)

        label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
        cache_key = 'H ' + label
        obj = memcache.get(cache_key)
        if obj and not appengine_config.DEBUG:
            logging.info('Using cached object for %s', label)
        else:
            logging.info('Fetching %s', label)
            try:
                obj = self.get_item(*ids)
            except models.DisableSource as e:
                self.abort(
                    401,
                    "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!"
                )
            except ValueError as e:
                self.abort(400,
                           '%s error:\n%s' % (self.source.GR_CLASS.NAME, e))
            except Exception as e:
                # pass through all API HTTP errors if we can identify them
                code, body = util.interpret_http_exception(e)
                # temporary, trying to debug a flaky test failure
                # eg https://circleci.com/gh/snarfed/bridgy/769
                if code:
                    self.response.status_int = int(code)
                    self.response.headers['Content-Type'] = 'text/plain'
                    self.response.write('%s error:\n%s' %
                                        (self.source.GR_CLASS.NAME, body))
                    return
                else:
                    raise
            memcache.set(cache_key, obj, time=CACHE_TIME)

        if not obj:
            self.abort(404, label)

        if self.source.is_blocked(obj):
            self.abort(410, 'That user is currently blocked')

        # use https for profile pictures so we don't cause SSL mixed mode errors
        # when serving over https.
        author = obj.get('author', {})
        image = author.get('image', {})
        url = image.get('url')
        if url:
            image['url'] = util.update_scheme(url, self)

        mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

        # try to include the author's silo profile url
        author = first_props(mf2_json.get('properties', {})).get('author', {})
        author_uid = first_props(author.get('properties', {})).get('uid', '')
        if author_uid:
            parsed = util.parse_tag_uri(author_uid)
            if parsed:
                silo_url = self.source.gr_source.user_url(parsed[1])
                urls = author.get('properties', {}).setdefault('url', [])
                if silo_url not in microformats2.get_string_urls(urls):
                    urls.append(silo_url)

        # write the response!
        self.response.headers['Access-Control-Allow-Origin'] = '*'
        if format == 'html':
            self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
            url = obj.get('url', '')
            self.response.out.write(
                TEMPLATE.substitute({
                    'refresh':
                    (('<meta http-equiv="refresh" content="0;url=%s">' %
                      url) if url else ''),
                    'url':
                    url,
                    'body':
                    microformats2.json_to_html(mf2_json),
                    'title':
                    self.get_title(obj),
                }))
        elif format == 'json':
            self.response.headers[
                'Content-Type'] = 'application/json; charset=utf-8'
            self.response.out.write(json.dumps(mf2_json, indent=2))
Example #19
  def get(self, type, source_short_name, string_id, *ids):
    source_cls = models.sources.get(source_short_name)
    if not source_cls:
      self.abort(400, "Source type '%s' not found. Known sources: %s" %
                 (source_short_name, filter(None, models.sources.keys())))

    self.source = source_cls.get_by_id(string_id)
    if not self.source:
      self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))
    elif self.source.status == 'disabled' or 'listen' not in self.source.features:
      self.abort(400, 'Source %s is disabled for backfeed' % self.source.bridgy_path())

    format = self.request.get('format', 'html')
    if format not in ('html', 'json'):
      self.abort(400, 'Invalid format %s, expected html or json' % format)

    for id in ids:
      if not self.VALID_ID.match(id):
        self.abort(404, 'Invalid id %s' % id)

    label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
    cache_key = 'H ' + label
    obj = memcache.get(cache_key)
    if obj:
      logging.info('Using cached object for %s', label)
    else:
      logging.info('Fetching %s', label)
      try:
        obj = self.get_item(*ids)
      except models.DisableSource as e:
        self.abort(401, "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!")
      except Exception as e:
        # pass through all API HTTP errors if we can identify them
        code, body = util.interpret_http_exception(e)
        if not code and util.is_connection_failure(e):
          code = 503
          body = str(e)
        if code:
          self.response.status_int = int(code)
          self.response.headers['Content-Type'] = 'text/plain'
          self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
          return
        else:
          raise
      memcache.set(cache_key, obj, time=CACHE_TIME)

    if not obj:
      self.abort(404, label)

    # use https for profile pictures so we don't cause SSL mixed mode errors
    # when serving over https.
    author = obj.get('author', {})
    image = author.get('image', {})
    url = image.get('url')
    if url:
      image['url'] = util.update_scheme(url, self)

    mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

    # try to include the author's silo profile url
    author = first_props(mf2_json.get('properties', {})).get('author', {})
    author_uid = first_props(author.get('properties', {})).get('uid', '')
    if author_uid:
      parsed = util.parse_tag_uri(author_uid)
      if parsed:
        silo_url = self.source.gr_source.user_url(parsed[1])
        urls = author.get('properties', {}).setdefault('url', [])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)

    # write the response!
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if format == 'html':
      self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
      self.response.out.write(TEMPLATE.substitute({
            'url': obj.get('url', ''),
            'body': microformats2.json_to_html(mf2_json),
            'title': self.get_title(obj),
            }))
    elif format == 'json':
      self.response.headers['Content-Type'] = 'application/json; charset=utf-8'
      self.response.out.write(json.dumps(mf2_json, indent=2))
Example #20
  def get_activities_response(self, **kwargs):
    # TODO: use batch API to get photos, events, etc in one request
    # https://developers.facebook.com/docs/graph-api/making-multiple-requests
    try:
      resp = self.gr_source.get_activities_response(group_id=SELF, **kwargs)

      # if it's requesting one specific activity, then we're done
      if 'activity_id' in kwargs:
        return resp

      # also get uploaded photos manually since facebook sometimes collapses
      # multiple photos into albums, and the album post object won't have the
      # post content, comments, etc. from the individual photo posts.
      # http://stackoverflow.com/questions/12785120
      #
      # TODO: save and use ETag for all of these extra calls
      photos = self.get_data(API_PHOTOS)

      # also get events and RSVPs
      # https://developers.facebook.com/docs/graph-api/reference/user/events/
      # https://developers.facebook.com/docs/graph-api/reference/event#edges
      # TODO: also fetch and use API_USER_RSVPS_DECLINED
      user_rsvps = self.get_data(API_USER_RSVPS)

      # have to re-fetch the events because the user rsvps response doesn't
      # include the event description, which we need for original post links.
      events = [self.gr_source.urlopen(API_EVENT % r['id'])
                for r in user_rsvps if r.get('id')]

      # also, only process events that the user is the owner of. avoids (but
      # doesn't prevent) processing big non-indieweb events with tons of
      # attendees that put us over app engine's instance memory limit. details:
      # https://github.com/snarfed/bridgy/issues/77
      events_and_rsvps = [(e, self.get_data(API_EVENT_RSVPS % e['id']))
                          for e in events
                          if e.get('owner', {}).get('id') == self.key.id()]

    except urllib2.HTTPError as e:
      # Facebook API error details:
      # https://developers.facebook.com/docs/graph-api/using-graph-api/#receiving-errorcodes
      # https://developers.facebook.com/docs/reference/api/errors/
      exc_type, _, exc_traceback = sys.exc_info()
      body = e.read()
      exc_copy = exc_type(e.filename, e.code, e.msg, e.hdrs, cStringIO.StringIO(body))

      try:
        body_json = json.loads(body)
      except ValueError:
        logging.exception('Non-JSON response body: %s', body)
        # response isn't JSON. ignore and re-raise the original exception
        raise exc_type, exc_copy, exc_traceback

      error = body_json.get('error', {})
      if error.get('code') in (102, 190):
        subcode = error.get('error_subcode')
        if subcode == 458:  # revoked
          raise models.DisableSource()
        elif subcode in (463, 460):  # expired, changed password
          # ask the user to reauthenticate
          self.gr_source.create_notification(
            self.key.id(),
            "Brid.gy's access to your account has expired. Click here to renew it now!",
            'https://www.brid.gy/facebook/start')
          raise models.DisableSource()

      # other error. re-raise original exception
      raise exc_type, exc_copy, exc_traceback

    # add photos. they show up as both a post and a photo, each with a separate
    # id. the post's object_id field points to the photo's id. de-dupe by
    # switching the post to use the fb_object_id when it's provided.
    activities = resp.setdefault('items', [])
    activities_by_fb_id = {}
    for activity in activities:
      obj = activity.get('object', {})
      fb_id = obj.get('fb_object_id')
      if not fb_id:
        continue

      activities_by_fb_id[fb_id] = activity
      for x in activity, obj:
        parsed = util.parse_tag_uri(x.get('id', ''))
        if parsed:
          _, orig_id = parsed
          x['id'] = self.gr_source.tag_uri(fb_id)
          x['url'] = x.get('url', '').replace(orig_id, fb_id)

    # merge comments and likes from existing photo objects, and add new ones.
    for photo in photos:
      photo_activity = self.gr_source.post_to_activity(photo)
      existing = activities_by_fb_id.get(photo.get('id'))
      if existing:
        existing['object'].setdefault('replies', {}).setdefault('items', []).extend(
          photo_activity['object'].get('replies', {}).get('items', []))
        existing['object'].setdefault('tags', []).extend(
            [t for t in photo_activity['object'].get('tags', [])
             if t.get('verb') == 'like'])
      else:
        activities.append(photo_activity)

    # add events
    activities += [self.gr_source.event_to_activity(e, rsvps=r)
                   for e, r in events_and_rsvps]

    # TODO: remove once we're confident in our id parsing. (i'm going to canary
    # with just a few users before i do it for everyone.)
    #
    # discard objects with ids with colons in them. Background:
    # https://github.com/snarfed/bridgy/issues/305
    def remove_bad_ids(objs, label):
      ret = []
      for o in objs:
        id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '')
        if id and ':' in id[1]:
          logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1])
        else:
          ret.append(o)
      return ret

    resp['items'] = remove_bad_ids(activities, 'activity')
    for activity in resp['items']:
      obj = activity.get('object', {})
      obj['tags'] = remove_bad_ids(obj.setdefault('tags', []), 'tag/like')
      replies = obj.get('replies', {})
      items = replies.get('items')
      if items:
        replies['items'] = remove_bad_ids(items, 'comment')
        replies['totalItems'] = len(replies['items'])

    return util.trim_nulls(resp)
Example #21
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif (code
                  and int(code) // 100 == 5) or util.is_connection_failure(e):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if (tag.get('objectType') == 'person'
                            and tag.get('id') == user_id and urls):
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article')
                        and att.get('author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
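            # urls_to_activity only needs to be persisted when this response
            # spans multiple activities; with a single activity there is
            # nothing to disambiguate, so consumers can fall back to
            # activities[0].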
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
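
The last_activity_id bookkeeping above compares ids numerically before falling
back to string comparison, since silo activity ids are usually, but not always,
numeric. A minimal standalone sketch of that ordering logic (the helper name is
hypothetical, not part of Bridgy):

def newest_activity_id(ids, last_id=None):
    """Hypothetical helper: return the largest id, preferring numeric order.

    int('100') > int('99') is True, while '100' > '99' is False
    lexicographically, so numeric comparison is tried first and string
    comparison is only the fallback for non-numeric ids.
    """
    newest = last_id
    for id in ids:
        try:
            greater = int(id) > int(newest)
        except (TypeError, ValueError):
            greater = newest is None or id > newest
        if greater:
            newest = id
    return newest

# e.g. newest_activity_id(['99', '100', '12']) returns '100', where plain
# string comparison would have picked '99'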
Exemplo n.º 23
0
  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    self.backfeed(source, responses, activities=activities)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException as e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
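
The cache-trimming step in both versions of poll() leans on
get_activities_response() using cache keys shaped like 'PREFIX ACTIVITY_ID', as
the WARNING comments note. A small sketch of that filter with made-up keys and
ids:

import json

cache = {
    'AR 123': 4,  # hypothetical entries in the
    'AL 123': 9,  # 'PREFIX ACTIVITY_ID' key format
    'AR 456': 2,
}
silo_activity_ids = {'123'}

# keep only entries whose trailing token is an activity id we just fetched
trimmed = {k: v for k, v in cache.items()
           if k.split()[-1] in silo_activity_ids}
print(json.dumps(trimmed))  # the 'AR 456' entry is dropped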
Exemplo n.º 24
0
class ItemHandler(webapp2.RequestHandler):
    """Fetches a post, repost, like, or comment and serves it as mf2 HTML or JSON.
  """
    handle_exception = handlers.handle_exception
    source = None

    VALID_ID = re.compile(r'^[\w.+:@-]+$')

    def head(self, *args):
        """Return an empty 200 with no caching directives."""

    def get_item(self, id):
        """Fetches and returns an object from the given source.

    To be implemented by subclasses.

    Args:
      source: bridgy.Source subclass
      id: string

    Returns: ActivityStreams object dict
    """
        raise NotImplementedError()

    def get_title(self, obj):
        """Returns the string to be used in the <title> tag.

    Args:
      obj: ActivityStreams object
    """
        return obj.get('title') or obj.get('content') or 'Bridgy Response'

    def get_post(self, id, **kwargs):
        """Fetch a post.

    Args:
      id: string, site-specific post id
      is_event: bool
      kwargs: passed through to get_activities

    Returns: ActivityStreams object dict
    """
        try:
            posts = self.source.get_activities(activity_id=id,
                                               user_id=self.source.key.id(),
                                               **kwargs)
            if posts:
                return posts[0]
            logging.warning('Source post %s not found', id)
        except Exception as e:
            util.interpret_http_exception(e)

    def get(self, type, source_short_name, string_id, *ids):
        source_cls = models.sources.get(source_short_name)
        if not source_cls:
            self.abort(
                400, "Source type '%s' not found. Known sources: %s" %
                (source_short_name, list(filter(None, models.sources.keys()))))

        self.source = source_cls.get_by_id(string_id)
        if not self.source:
            self.abort(
                400, 'Source %s %s not found' % (source_short_name, string_id))

        format = self.request.get('format', 'html')
        if format not in ('html', 'json'):
            self.abort(400,
                       'Invalid format %s, expected html or json' % format)

        for id in ids:
            if not self.VALID_ID.match(id):
                self.abort(404, 'Invalid id %s' % id)

        label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
        cache_key = 'H ' + label
        obj = memcache.get(cache_key)
        if obj:
            logging.info('Using cached object for %s', label)
        else:
            logging.info('Fetching %s', label)
            try:
                obj = self.get_item(*ids)
            except Exception as e:
                # pass through all API HTTP errors if we can identify them
                code, body = util.interpret_http_exception(e)
                if not code and util.is_connection_failure(e):
                    code = 503
                    body = str(e)
                if code:
                    self.response.status_int = int(code)
                    self.response.headers['Content-Type'] = 'text/plain'
                    self.response.write('%s error:\n%s' %
                                        (self.source.GR_CLASS.NAME, body))
                    return
                else:
                    raise
            memcache.set(cache_key, obj, time=CACHE_TIME)

        if not obj:
            self.abort(404, label)

        # use https for profile pictures so we don't cause SSL mixed mode errors
        # when serving over https.
        author = obj.get('author', {})
        image = author.get('image', {})
        url = image.get('url')
        if url:
            image['url'] = util.update_scheme(url, self)

        mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

        # try to include the author's silo profile url
        author = first_props(mf2_json.get('properties', {})).get('author', {})
        author_uid = first_props(author.get('properties', {})).get('uid', '')
        if author_uid:
            parsed = util.parse_tag_uri(author_uid)
            if parsed:
                silo_url = self.source.gr_source.user_url(parsed[1])
                urls = author.get('properties', {}).setdefault('url', [])
                if silo_url not in microformats2.get_string_urls(urls):
                    urls.append(silo_url)

        # write the response!
        self.response.headers['Access-Control-Allow-Origin'] = '*'
        if format == 'html':
            self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
            self.response.out.write(TEMPLATE.substitute({
                'url': obj.get('url', ''),
                'body': microformats2.json_to_html(mf2_json),
                'title': self.get_title(obj),
            }))
        elif format == 'json':
            self.response.headers['Content-Type'] = 'application/json; charset=utf-8'
            self.response.out.write(json.dumps(mf2_json, indent=2))
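
ItemHandler above (like the Flask variant in the next example) recovers the
author's silo user id from a tag URI uid via util.parse_tag_uri. A rough
stand-in showing the idea, assuming the 'tag:DOMAIN,YEAR:NAME' format (the
regex and sample values here are illustrative, not the real implementation):

import re

def parse_tag_uri(uri):
    """Return (domain, name) for a 'tag:DOMAIN,YEAR:NAME' URI, else None."""
    match = re.match(r'^tag:([^,]+),\d{4}:(.+)$', uri)
    return (match.group(1), match.group(2)) if match else None

parsed = parse_tag_uri('tag:fake.com,2013:someuser')
if parsed:
    print(parsed[1])  # 'someuser', which user_url() turns into a profile URL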
Exemplo n.º 25
0
    def dispatch_request(self, site, key_id, **kwargs):
        """Handle HTTP request."""
        source_cls = models.sources.get(site)
        if not source_cls:
            error(
                f"Source type '{site}' not found. Known sources: {[s for s in models.sources.keys() if s]}"
            )

        self.source = source_cls.get_by_id(key_id)
        if not self.source:
            error(f'Source {site} {key_id} not found')
        elif (self.source.status == 'disabled'
              or 'listen' not in self.source.features):
            error(
                f'Source {self.source.bridgy_path()} is disabled for backfeed')

        format = request.values.get('format', 'html')
        if format not in ('html', 'json'):
            error(f'Invalid format {format}, expected html or json')

        for id in kwargs.values():
            if not self.VALID_ID.match(id):
                error(f'Invalid id {id}', 404)

        try:
            obj = self.get_item(**kwargs)
        except models.DisableSource:
            error(
                "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!",
                401)
        except ValueError as e:
            error(f'{self.source.GR_CLASS.NAME} error: {e}')

        if not obj:
            error(f'Not found: {site}:{key_id} {kwargs}', 404)

        if self.source.is_blocked(obj):
            error('That user is currently blocked', 410)

        # use https for profile pictures so we don't cause SSL mixed mode errors
        # when serving over https.
        author = obj.get('author', {})
        image = author.get('image', {})
        url = image.get('url')
        if url:
            image['url'] = util.update_scheme(url, request)

        mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

        # try to include the author's silo profile url
        author = first_props(mf2_json.get('properties', {})).get('author', {})
        author_uid = first_props(author.get('properties', {})).get('uid', '')
        if author_uid:
            parsed = util.parse_tag_uri(author_uid)
            if parsed:
                urls = author.get('properties', {}).setdefault('url', [])
                try:
                    silo_url = self.source.gr_source.user_url(parsed[1])
                    if silo_url not in microformats2.get_string_urls(urls):
                        urls.append(silo_url)
                except NotImplementedError:  # from gr_source.user_url()
                    pass

        # write the response!
        if format == 'html':
            url = obj.get('url', '')
            return TEMPLATE.substitute({
                'refresh': (f'<meta http-equiv="refresh" content="0;url={url}">'
                            if url else ''),
                'url': url,
                'body': microformats2.json_to_html(mf2_json),
                'title': obj.get('title') or obj.get('content') or 'Bridgy Response',
            })
        elif format == 'json':
            return mf2_json
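
Both item handlers gate every path id through the VALID_ID pattern before doing
any fetching. A quick illustration of what that character class accepts and
rejects (sample ids are made up):

import re

VALID_ID = re.compile(r'^[\w.+:@-]+$')

for id in ('10100823_998', 'user@host:post-1.2+x', 'a/b', 'a b'):
    print(id, bool(VALID_ID.match(id)))
# the first two match; '/' and ' ' fall outside [\w.+:@-], so path traversal
# and whitespace tricks get a 404 before any lookup happens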
Exemplo n.º 26
0
    # use https for profile pictures so we don't cause SSL mixed mode errors
    # when serving over https.
    author = obj.get('author', {})
    image = author.get('image', {})
    url = image.get('url')
    if url:
      image['url'] = util.update_scheme(url, self)

    mf2_json = microformats2.object_to_json(obj)

    # try to include the author's silo profile url
    author = first_props(mf2_json.get('properties', {})).get('author', {})
    author_uid = first_props(author.get('properties', {})).get('uid', '')
    if author_uid:
      parsed = util.parse_tag_uri(author_uid)
      if parsed:
        silo_url = self.source.gr_source.user_url(parsed[1])
        urls = author.get('properties', {}).setdefault('url', [])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)

    # write the response!
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if format == 'html':
      self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
      self.response.out.write(TEMPLATE.substitute({
            'url': obj.get('url', ''),
            'body': microformats2.json_to_html(mf2_json),
            'title': obj.get('title') or obj.get('content') or 'Bridgy Response',
            }))
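
The profile-picture fixup that recurs in these handlers rewrites the author
image URL so the page doesn't trigger mixed-content warnings when served over
https. A simplified stand-in for util.update_scheme (the real helper derives
the scheme from the current request and has silo-specific special cases; this
shows only the core idea):

import urllib.parse

def force_https(url):
    """Return url with its scheme replaced by https."""
    parsed = urllib.parse.urlparse(url)
    return urllib.parse.urlunparse(('https',) + tuple(parsed[1:]))

print(force_https('http://example.com/pic.jpg'))
# https://example.com/pic.jpg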