def test_as_utc(self):
  dt = datetime.datetime(2000, 1, 1)  # naive
  self.assertEqual(dt, util.as_utc(dt))

  tzinfo = util.SimpleTzinfo()
  tzinfo.offset = datetime.timedelta(minutes=-390)
  dt = datetime.datetime(2000, 1, 1, tzinfo=tzinfo)  # aware
  got = util.as_utc(dt)
  self.assertEqual(datetime.datetime(2000, 1, 1, 6, 30), got)
  # converting again should be a no-op, ie as_utc is idempotent
  self.assertEqual(got, util.as_utc(got))
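# For context, a minimal sketch (an assumption, not the real implementation) of
# what the test above relies on: util.SimpleTzinfo as a fixed-offset tzinfo
# driven by an `offset` timedelta, and util.as_utc converting aware datetimes to
# naive UTC while passing naive ones through unchanged.
import datetime  # repeated here so the sketch is self-contained


class SimpleTzinfoSketch(datetime.tzinfo):
  """Fixed-offset timezone; `offset` is a datetime.timedelta."""
  offset = datetime.timedelta(0)

  def utcoffset(self, dt):
    return self.offset

  def dst(self, dt):
    return datetime.timedelta(0)


def as_utc_sketch(dt):
  """Returns a naive UTC datetime; naive inputs are assumed to already be UTC."""
  if dt.utcoffset() is None:
    return dt
  return (dt - dt.utcoffset()).replace(tzinfo=None)

# Example, matching the assertions in test_as_utc: with offset =
# timedelta(minutes=-390), local midnight 2000-01-01 converts to
# datetime(2000, 1, 1, 6, 30) naive UTC.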
def backfeed(self, source, responses=None, activities=None):
  """Processes responses and activities and generates propagate tasks.

  Stores property names and values to update in source.updates.

  Args:
    source: Source
    responses: dict mapping AS response id to AS object
    activities: dict mapping AS activity id to AS object
  """
  if responses is None:
    responses = {}
  if activities is None:
    activities = {}

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info('Found %d public activities: %s', len(public), public.keys())
  logging.info('Found %d private activities: %s', len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls(
    [a.get('published') for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates['last_public_post'] = \
        util.as_utc(util.parse_iso8601(max_published))

  source.updates['recent_private_posts'] = \
    len([a for a in private.values()
         if a.get('published', util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get('object') or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if (obj.get('author', {}).get('id') != user_id and
        activity.get('verb') != 'share'):
      for tag in obj.get('tags', []):
        urls = tag.get('urls')
        if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity['mentions'].update(u.get('value') for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get('attachments', []):
      if (att.get('objectType') in ('note', 'article') and
          att.get('author', {}).get('id') == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get('replies', {}).get('items', [])
    tags = obj.get('tags', [])
    likes = [t for t in tags if Response.get_type(t) == 'like']
    reactions = [t for t in tags if Response.get_type(t) == 'react']
    reposts = [t for t in tags if Response.get_type(t) == 'repost']
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get('id')
      if not id:
        logging.error('Skipping response without id: %s',
                      json_dumps(resp, indent=2))
        continue

      if source.is_blocked(resp):
        logging.info('Skipping response by blocked user: %s',
                     json_dumps(resp.get('author') or resp.get('actor'),
                                indent=2))
        continue

      resp.setdefault('activities', []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning('Got two different versions of same response!\n%s\n%s',
                          existing, resp)
        resp['activities'].extend(existing.get('activities', []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json_loads(source.seen_responses_cache_json):
      id = seen['id']
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  source.blocked_ids = None

  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop('activities', [])
    if not activities and resp_type == 'post':
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if 'originals' not in activity or 'mentions' not in activity:
        activity['originals'], activity['mentions'] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity['originals'], mentions=activity['mentions'])
      if targets:
        logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                     len(targets), ' '.join(targets))
        # new response to propagate! load block list if we haven't already
        if source.blocked_ids is None:
          source.load_blocklist()

      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json_dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json_dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get('originals', []))
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json_dumps(urls_to_activity)
    resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

  # update cache
  if pruned_responses:
    source.updates['seen_responses_cache_json'] = json_dumps(
      pruned_responses + unchanged_responses)
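# Worked illustration (not project code) of the last_public_post /
# recent_private_posts bookkeeping near the top of backfeed() above: the
# 'published' values are ISO 8601 strings in a consistent format, so they
# compare correctly as plain strings. The timestamps and the EPOCH_ISO
# stand-in below are made-up assumptions for the example.
def _demo_private_post_bookkeeping():
  epoch_iso = '1970-01-01T00:00:00'  # stand-in for util.EPOCH_ISO
  public = [{'published': '2023-05-01T12:00:00'},
            {'published': '2023-06-15T08:30:00'}]
  private = [{'published': '2023-06-20T09:00:00'}, {}]  # second has no date

  last_public_post = max(p['published'] for p in public)
  recent_private = len([a for a in private
                        if a.get('published', epoch_iso) > last_public_post])
  assert last_public_post == '2023-06-15T08:30:00'
  assert recent_private == 1  # only the dated private post is newer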
def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
        logging.debug("Using ETag %s, last activity id %s",
                      source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
        cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
        fetch_replies=True,
        fetch_likes=True,
        fetch_shares=True,
        fetch_mentions=True,
        count=50,
        etag=source.last_activities_etag,
        min_id=source.last_activity_id,
        cache=cache,
    )
    etag = resp.get("etag")  # used later
    user_activities = resp.get("items", [])

    # these map ids to AS objects
    responses = {a["id"]: a for a in links}
    activities = {a["id"]: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
        # maybe replace stored last activity id
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        silo_activity_ids.add(id)
        try:
            # try numeric comparison first
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            greater = id > last_activity_id
        if greater:
            last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
        source.updates["last_activity_id"] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates["last_activities_cache_json"] = json.dumps(
        {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
    )

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
        (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info("Found %d public activities: %s", len(public), public.keys())
    logging.info("Found %d private activities: %s", len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get("published") for a in public.values()])
    if public_published:
        max_published = max(public_published)
        if max_published > last_public_post:
            last_public_post = max_published
            source.updates["last_public_post"] = util.as_utc(
                util.parse_iso8601(max_published)
            )

    source.updates["recent_private_posts"] = len(
        [a for a in private.values()
         if a.get("published", util.EPOCH_ISO) > last_public_post]
    )

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
        obj = activity.get("object") or activity

        # handle user mentions
        user_id = source.user_tag_id()
        if obj.get("author", {}).get("id") != user_id:
            for tag in obj.get("tags", []):
                urls = tag.get("urls")
                if (tag.get("objectType") == "person"
                        and tag.get("id") == user_id and urls):
                    activity["originals"], activity["mentions"] = \
                        original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    activity["mentions"].update(u.get("value") for u in urls)
                    responses[id] = activity
                    break

        # handle quote mentions
        for att in obj.get("attachments", []):
            if (att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()):
                # now that we've confirmed that one exists, OPD will dig
                # into the actual attachments
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = \
                        original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                responses[id] = activity
                break

        # extract replies, likes, reactions, reposts, and rsvps
        replies = obj.get("replies", {}).get("items", [])
        tags = obj.get("tags", [])
        likes = [t for t in tags if Response.get_type(t) == "like"]
        reactions = [t for t in tags if Response.get_type(t) == "react"]
        reposts = [t for t in tags if Response.get_type(t) == "repost"]
        rsvps = Source.get_rsvps_from_event(obj)

        # coalesce responses. drop any without ids
        for resp in replies + likes + reactions + reposts + rsvps:
            id = resp.get("id")
            if not id:
                logging.error("Skipping response without id: %s",
                              json.dumps(resp, indent=2))
                continue

            resp.setdefault("activities", []).append(activity)

            # when we find two responses with the same id, the earlier one may have
            # come from a link post or user mention, and this one is probably better
            # since it probably came from the user's activity, so prefer this one.
            # background: https://github.com/snarfed/bridgy/issues/533
            existing = responses.get(id)
            if existing:
                if source.gr_source.activity_changed(resp, existing, log=True):
                    logging.warning(
                        "Got two different versions of same response!\n%s\n%s",
                        existing, resp)
                resp["activities"].extend(existing.get("activities", []))

            responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
        for seen in json.loads(source.seen_responses_cache_json):
            id = seen["id"]
            resp = responses.get(id)
            if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                unchanged_responses.append(seen)
                del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
        resp_type = Response.get_type(resp)
        activities = resp.pop("activities", [])
        if not activities and resp_type == "post":
            activities = [resp]
        too_long = set()
        urls_to_activity = {}
        for i, activity in enumerate(activities):
            # we'll usually have multiple responses for the same activity, and the
            # objects in resp['activities'] are shared, so cache each activity's
            # discovered webmention targets inside its object.
            if "originals" not in activity or "mentions" not in activity:
                activity["originals"], activity["mentions"] = \
                    original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

            targets = original_post_discovery.targets_for_response(
                resp, originals=activity["originals"], mentions=activity["mentions"]
            )
            if targets:
                logging.info("%s has %d webmention target(s): %s",
                             activity.get("url"), len(targets), " ".join(targets))
            for t in targets:
                if len(t) <= _MAX_STRING_LENGTH:
                    urls_to_activity[t] = i
                else:
                    logging.warning("Giving up on target URL over %s chars! %s",
                                    _MAX_STRING_LENGTH, t)
                    too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

        # store/update response entity. the prune_*() calls are important to
        # remove circular references in link responses, which are their own
        # activities. details in the step 2 comment above.
        pruned_response = util.prune_response(resp)
        pruned_responses.append(pruned_response)
        resp_entity = Response(
            id=id,
            source=source.key,
            activities_json=[json.dumps(util.prune_activity(a, source))
                             for a in activities],
            response_json=json.dumps(pruned_response),
            type=resp_type,
            unsent=list(urls_to_activity.keys()),
            failed=list(too_long),
            original_posts=resp.get("originals", []),
        )
        if urls_to_activity and len(activities) > 1:
            resp_entity.urls_to_activity = json.dumps(urls_to_activity)
        resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
        source.updates["seen_responses_cache_json"] = json.dumps(
            pruned_responses + unchanged_responses)

    source.updates.update({"last_polled": source.last_poll_attempt,
                           "poll_status": "ok"})
    if etag and etag != source.last_activities_etag:
        source.updates["last_activities_etag"] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
        logging.info("refetching h-feed for source %s", source.label())
        relationships = original_post_discovery.refetch(source)

        now = util.now_fn()
        source.updates["last_hfeed_refetch"] = now

        if relationships:
            logging.info("refetch h-feed found new rel=syndication relationships: %s",
                         relationships)
            try:
                self.repropagate_old_responses(source, relationships)
            except BaseException as e:
                if isinstance(
                    e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                ) or util.is_connection_failure(e):
                    logging.info("Timeout while repropagating responses.",
                                 exc_info=True)
                else:
                    raise
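# Illustration (made-up ids, not project code) of the last_activity_id update
# rule in Step 1 of poll() above: tag URIs are reduced to their bare silo id,
# then ids are compared numerically when both parse as ints, falling back to
# plain string comparison otherwise.
def _newer_activity_id(candidate, current):
    """True if candidate should replace current as last_activity_id."""
    try:
        return int(candidate) > int(current)  # numeric comparison first
    except (TypeError, ValueError):
        return candidate > current  # lexicographic fallback


def _demo_newer_activity_id():
    assert _newer_activity_id("123", "99")         # numeric: 123 > 99
    assert not _newer_activity_id("99", "123")
    assert _newer_activity_id("post-b", "post-a")  # non-numeric: string order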
class Poll(webapp2.RequestHandler):
  """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """

  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s',
                 self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()

  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50,
        etag=source.last_activities_etag, min_id=source.last_activity_id,
        cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}

    except Exception, e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({
          'poll_status': 'error',
          'rate_limited': True,
        })
        return
      elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps({
      k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls(
      [a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article') and
            att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s',
                        json.dumps(resp, indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                       len(targets), ' '.join(targets))
        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.warning('Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)

    source.updates.update({
      'last_polled': source.last_poll_attempt,
      'poll_status': 'ok',
    })
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info(
          'refetch h-feed found new rel=syndication relationships: %s',
          relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException, e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
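# Small illustration (not project code) of the jittered rescheduling in
# Poll.post() above: the next poll's countdown is the source's poll period
# scaled by a random factor in [0.8, 1.2], which spreads tasks out and avoids
# thundering herds. The 30 minute period is just an example value.
def _demo_jittered_countdown(poll_period=None):
  import datetime
  import random
  if poll_period is None:
    poll_period = datetime.timedelta(minutes=30)
  seconds = poll_period.total_seconds()
  countdown = seconds * random.uniform(.8, 1.2)
  assert .8 * seconds <= countdown <= 1.2 * seconds
  return countdown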
def backfeed(self, source, responses=None, activities=None):
  """Processes responses and activities and generates propagate tasks.

  Stores property names and values to update in source.updates.

  Args:
    source: Source
    responses: dict mapping AS response id to AS object
    activities: dict mapping AS activity id to AS object
  """
  if responses is None:
    responses = {}
  if activities is None:
    activities = {}

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info('Found %d public activities: %s', len(public), public.keys())
  logging.info('Found %d private activities: %s', len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls(
    [a.get('published') for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates['last_public_post'] = \
        util.as_utc(util.parse_iso8601(max_published))

  source.updates['recent_private_posts'] = \
    len([a for a in private.values()
         if a.get('published', util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get('object') or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if obj.get('author', {}).get('id') != user_id:
      for tag in obj.get('tags', []):
        urls = tag.get('urls')
        if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity['mentions'].update(u.get('value') for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get('attachments', []):
      if (att.get('objectType') in ('note', 'article') and
          att.get('author', {}).get('id') == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get('replies', {}).get('items', [])
    tags = obj.get('tags', [])
    likes = [t for t in tags if Response.get_type(t) == 'like']
    reactions = [t for t in tags if Response.get_type(t) == 'react']
    reposts = [t for t in tags if Response.get_type(t) == 'repost']
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get('id')
      if not id:
        logging.error('Skipping response without id: %s',
                      json.dumps(resp, indent=2))
        continue

      if source.is_blocked(resp):
        logging.info('Skipping response by blocked user: %s',
                     json.dumps(resp.get('author') or resp.get('actor'),
                                indent=2))
        continue

      resp.setdefault('activities', []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning('Got two different versions of same response!\n%s\n%s',
                          existing, resp)
        resp['activities'].extend(existing.get('activities', []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json.loads(source.seen_responses_cache_json):
      id = seen['id']
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop('activities', [])
    if not activities and resp_type == 'post':
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if 'originals' not in activity or 'mentions' not in activity:
        activity['originals'], activity['mentions'] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity['originals'], mentions=activity['mentions'])
      if targets:
        logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                     len(targets), ' '.join(targets))
      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json.dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json.dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get('originals', []))
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json.dumps(urls_to_activity)
    resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

  # update cache
  if pruned_responses:
    source.updates['seen_responses_cache_json'] = json.dumps(
      pruned_responses + unchanged_responses)
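# Illustration (not project code) of the too-long target handling in Step 4 of
# backfeed() above: webmention target URLs longer than the datastore string
# limit are left out of `unsent` and recorded in `failed` as truncated,
# '...'-suffixed markers. The 500 character limit below is an assumed stand-in
# for _MAX_STRING_LENGTH.
def _demo_classify_targets(targets, max_len=500):
  unsent, too_long = [], set()
  for t in targets:
    if len(t) <= max_len:
      unsent.append(t)
    else:
      too_long.add(t[:max_len - 4] + '...')
  return unsent, too_long

# e.g. _demo_classify_targets(['https://example.com/post',
#                              'https://example.com/' + 'x' * 600])
# returns one sendable URL and one truncated marker.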