Example No. 1
  def test_as_utc(self):
    dt = datetime.datetime(2000, 1, 1)  # naive
    self.assertEqual(dt, util.as_utc(dt))

    tzinfo = util.SimpleTzinfo()
    tzinfo.offset = datetime.timedelta(minutes=-390)
    dt = datetime.datetime(2000, 1, 1, tzinfo=tzinfo)  # aware

    got = util.as_utc(dt)
    self.assertEqual(datetime.datetime(2000, 1, 1, 6, 30), got)
    # result is already naive UTC, so converting again should be a no-op
    self.assertEqual(got, util.as_utc(got))
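The assertions above pin down what this test expects from util.as_utc: naive datetimes pass through untouched, while aware datetimes come back converted to UTC and stripped of their tzinfo (which is why they compare equal to naive expected values). A minimal sketch of both helpers under that assumption, written only as an illustration and not taken from Bridgy's real util module:

import datetime

class SimpleTzinfo(datetime.tzinfo):
  """Fixed-offset tzinfo with a settable offset, as the test assumes."""
  offset = datetime.timedelta(0)

  def utcoffset(self, dt):
    return self.offset

  def dst(self, dt):
    return datetime.timedelta(0)

def as_utc(dt):
  """Convert an aware datetime to naive UTC; return naive datetimes unchanged."""
  if dt.utcoffset() is None:
    return dt
  return dt.replace(tzinfo=None) - dt.utcoffset()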
Example No. 2
    def backfeed(self, source, responses=None, activities=None):
        """Processes responses and activities and generates propagate tasks.

    Stores property names and values to update in source.updates.

    Args:
      source: Source
      responses: dict mapping AS response id to AS object
      activities: dict mapping AS activity id to AS object
    """
        if responses is None:
            responses = {}
        if activities is None:
            activities = {}

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if (obj.get('author', {}).get('id') != user_id
                    and activity.get('verb') != 'share'):
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if (tag.get('objectType') == 'person'
                            and tag.get('id') == user_id and urls):
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article')
                        and att.get('author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json_dumps(resp, indent=2))
                    continue

                if source.is_blocked(resp):
                    logging.info(
                        'Skipping response by blocked user: %s',
                        json_dumps(resp.get('author') or resp.get('actor'),
                                   indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json_loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        source.blocked_ids = None

        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                    # new response to propagate! load block list if we haven't already
                    if source.blocked_ids is None:
                        source.load_blocklist()

                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.info(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json_dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json_dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json_dumps(urls_to_activity)
            resp_entity.get_or_save(source,
                                    restart=self.RESTART_EXISTING_TASKS)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json_dumps(
                pruned_responses + unchanged_responses)
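The heart of backfeed() is the coalescing loop in step 2: every reply, like, reaction, repost, and RSVP is keyed by id, each copy records the activity it was found on, and a later copy wins while inheriting the earlier copy's activities. A standalone sketch of just that step, with made-up minimal dicts standing in for AS objects:

def coalesce(responses, new_resps, activity):
  """Merge new_resps into responses keyed by id, as step 2 above does."""
  for resp in new_resps:
    id = resp.get('id')
    if not id:
      continue  # backfeed() logs and skips responses without ids
    resp.setdefault('activities', []).append(activity)
    existing = responses.get(id)
    if existing:
      resp['activities'].extend(existing.get('activities', []))
    responses[id] = resp
  return responses

resps = coalesce({}, [{'id': 'tag:x,2013:like1'}], {'id': 'tag:x,2013:post1'})
resps = coalesce(resps, [{'id': 'tag:x,2013:like1'}], {'id': 'tag:x,2013:post2'})
# resps['tag:x,2013:like1']['activities'] now lists post2 first, then post1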
Example No. 3
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(
            fetch_replies=True,
            fetch_likes=True,
            fetch_shares=True,
            fetch_mentions=True,
            count=50,
            etag=source.last_activities_etag,
            min_id=source.last_activity_id,
            cache=cache,
        )
        etag = resp.get("etag")  # used later
        user_activities = resp.get("items", [])

        # these map ids to AS objects
        responses = {a["id"]: a for a in links}
        activities = {a["id"]: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates["last_activity_id"] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates["last_activities_cache_json"] = json.dumps(
            {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
        )

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info("Found %d public activities: %s", len(public), public.keys())
        logging.info("Found %d private activities: %s", len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls([a.get("published") for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published))

        source.updates["recent_private_posts"] = len(
            [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post]
        )

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get("object") or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get("author", {}).get("id") != user_id:
                for tag in obj.get("tags", []):
                    urls = tag.get("urls")
                    if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                        activity["mentions"].update(u.get("value") for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get("attachments", []):
                if (
                    att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()
                ):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if "originals" not in activity or "mentions" not in activity:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get("replies", {}).get("items", [])
            tags = obj.get("tags", [])
            likes = [t for t in tags if Response.get_type(t) == "like"]
            reactions = [t for t in tags if Response.get_type(t) == "react"]
            reposts = [t for t in tags if Response.get_type(t) == "repost"]
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get("id")
                if not id:
                    logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                    continue

                resp.setdefault("activities", []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp)
                    resp["activities"].extend(existing.get("activities", []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen["id"]
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop("activities", [])
            if not activities and resp_type == "post":
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity["originals"], mentions=activity["mentions"]
                )
                if targets:
                    logging.info(
                        "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets)
                    )
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t)
                        too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get("originals", []),
            )
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses)

        source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
        if etag and etag != source.last_activities_etag:
            source.updates["last_activities_etag"] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info("refetching h-feed for source %s", source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates["last_hfeed_refetch"] = now

            if relationships:
                logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if isinstance(
                        e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                    ) or util.is_connection_failure(e):
                        logging.info("Timeout while repropagating responses.", exc_info=True)
                    else:
                        raise
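One detail worth calling out in step 1 is how poll() advances last_activity_id: ids are compared numerically when both sides parse as integers, and lexicographically otherwise, after tag URIs have been reduced to their bare id. A standalone sketch of that comparison (the util.parse_tag_uri handling is omitted, and the explicit None guard is an assumption added so the sketch runs on Python 3):

def newest_id(ids, last_id=None):
  """Return the largest id, preferring numeric comparison over string order."""
  for id in ids:
    try:
      greater = int(id) > int(last_id)
    except (TypeError, ValueError):
      greater = last_id is None or id > last_id
    if greater:
      last_id = id
  return last_id

assert newest_id(['9', '10'], last_id='2') == '10'  # numeric: 10 > 9 > 2
assert newest_id(['abc', 'abd']) == 'abd'           # falls back to string order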
Example No. 4
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif ((code and int(code) // 100 == 5)
                  or util.is_connection_failure(e)):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if (tag.get('objectType') == 'person'
                            and tag.get('id') == user_id and urls):
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article')
                        and att.get('author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
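After each poll, Poll.post() above schedules the next task with the source's poll period jittered by +/-20% so that many sources don't all wake up at the same instant. A standalone sketch of just that scheduling math (util.add_poll_task itself is not reproduced here):

import datetime
import random

def next_poll_countdown(poll_period):
  """Seconds until the next poll task, jittered +/-20% as in Poll.post() above."""
  return poll_period.total_seconds() * random.uniform(.8, 1.2)

countdown = next_poll_countdown(datetime.timedelta(minutes=30))
assert 1440 <= countdown <= 2160  # 30 min = 1800 s, so within [0.8, 1.2] * 1800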
Example No. 5
  def backfeed(self, source, responses=None, activities=None):
    """Processes responses and activities and generates propagate tasks.

    Stores property names and values to update in source.updates.

    Args:
      source: Source
      responses: dict mapping AS response id to AS object
      activities: dict mapping AS activity id to AS object
    """
    if responses is None:
      responses = {}
    if activities is None:
      activities = {}

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article')
                and att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s', json.dumps(resp, indent=2))
          continue

        if source.is_blocked(resp):
          logging.info('Skipping response by blocked user: %s',
                       json.dumps(resp.get('author') or resp.get('actor'), indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                       len(targets), ' '.join(targets))
        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.info('Giving up on target URL over %s chars! %s',
                         _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)
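Step 4 above also guards against webmention target URLs that are too long to store: targets that fit within the datastore string limit go into urls_to_activity, keyed by the index of the activity they belong to, and become the Response entity's unsent list, while over-long ones are truncated with '...' and recorded as failed. A standalone sketch, with an assumed value for _MAX_STRING_LENGTH:

_MAX_STRING_LENGTH = 500  # assumed limit; the real constant comes from the datastore layer

def split_targets(targets, activity_index):
  """Partition targets into sendable URLs and truncated too-long ones."""
  urls_to_activity, too_long = {}, set()
  for t in targets:
    if len(t) <= _MAX_STRING_LENGTH:
      urls_to_activity[t] = activity_index
    else:
      too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')
  return urls_to_activity, too_long

ok, failed = split_targets(['http://example.com/post',
                            'http://example.com/' + 'x' * 600], 0)
# ok == {'http://example.com/post': 0}; failed holds one truncated URL ending in '...'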