def import_exchange_data(exchange, operation_key_field, operation_timestamp_field,
                         request_json, source, topic_id, func):
    """Import operations for one exchange, resuming from a start date.

    The start date is ``request_json['since-date']`` when the request carries
    one. Otherwise it is the ``operation_timestamp_field`` value of the newest
    operation stored under source -> exchange in the datastore, or ``None``
    when nothing is stored yet.

    ``func(client, since_date)`` is called with a BitMEX client to fetch the
    results, which are then passed to ``store_results``.

    Returns:
      Whatever ``store_results`` returns (named ``count`` by the original code).
    """
    if request_json and 'since-date' in request_json:
        since_date = parse_date(request_json['since-date'])
    else:
        since_date = None
        # no explicit date requested: resume from the newest stored operation
        db = datastore.Client(namespace=assert_env('NAMESPACE_PORTFOLIO'))
        exchange_key = client_key = None  # placeholders, assigned just below
        client_key = db.key(FieldStoreKind.SOURCE.value, source)
        exchange_key = db.key(FieldStoreKind.EXCHANGE.value, exchange,
                              parent=client_key)
        newest_query = db.query(kind=FieldStoreKind.OPERATION.value,
                                ancestor=exchange_key)
        # descending order on the timestamp field so the first hit is the latest
        newest_query.order = ['-' + operation_timestamp_field]
        newest = list(newest_query.fetch(limit=1))
        if newest:
            latest_entry = newest[0]
            print('loaded latest entry: {}'.format(latest_entry))
            since_date = parse_iso8601(latest_entry[operation_timestamp_field])

    print('importing since date {}'.format(since_date))

    access_key = assert_env('BITMEX_API_ACCESS_KEY')
    secret_key = assert_env('BITMEX_API_SECRET_KEY')
    bitmex = agg.bitmex_client(access_key, secret_key)

    fetched = func(bitmex, since_date)
    return store_results(fetched, exchange, topic_id, source,
                         operation_key_field, operation_timestamp_field)
def get_paging_param(param):
    """Return the request value ``param`` parsed as ISO 8601, or None.

    Args:
      param: name of the request value to read from ``request.values``

    Returns:
      The result of ``util.parse_iso8601`` for the value, or None when the
      value is missing or empty. On a parse failure ``error()`` is called with
      a message naming the parameter and the offending value.
    """
    val = request.values.get(param)
    if not val:
        return None

    try:
        # spaces are turned back into '+' before parsing
        # NOTE(review): presumably undoes URL decoding of a '+' UTC offset
        # sign - confirm against callers.
        return util.parse_iso8601(val.replace(' ', '+'))
    except Exception:
        # Exception rather than BaseException, so that SystemExit and
        # KeyboardInterrupt are not reported as a bad timestamp.
        error(f"Couldn't parse {param}, {val!r} as ISO8601")
def get_paging_param(param):
    """Return the request parameter ``param`` parsed as ISO 8601, or None.

    Args:
      param: name of the parameter to read via ``self.request.get``

    Returns:
      The result of ``util.parse_iso8601`` for the value, or None when the
      value is missing or empty. On a parse failure the exception is logged
      and the request is aborted with HTTP 400.
    """
    val = self.request.get(param)
    if not val:
        return None

    try:
        return util.parse_iso8601(val)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not converted into a 400 response.
        msg = "Couldn't parse %s %r as ISO8601" % (param, val)
        logging.exception(msg)
        self.abort(400, msg)
def test_parse_iso8601(self):
    """util.parse_iso8601 handles naive timestamps and several offset forms.

    Each case pairs an input string with its expected UTC offset in minutes,
    or None when the offset is not checked.
    """
    cases = (
        ('2012-07-23T05:54:49', None),
        ('2012-07-23T05:54:49+0000', 0),
        ('2012-07-23T05:54:49-0000', 0),
        ('2012-07-23T05:54:49+0130', 90),
        ('2012-07-23T05:54:49-1300', -780),
        ('2012-07-23T05:54:49-13:00', -780),
    )
    # written as 7, not 07: a leading-zero literal is a syntax error on Python 3
    expected_naive = datetime.datetime(2012, 7, 23, 5, 54, 49)

    for text, offset_minutes in cases:
        dt = util.parse_iso8601(text)
        # the wall-clock fields must match regardless of the offset
        self.assertEqual(expected_naive, dt.replace(tzinfo=None))
        if offset_minutes is not None:
            self.assertEqual(datetime.timedelta(minutes=offset_minutes),
                             dt.utcoffset())
def from_json(cls, json, video, user=None):
    """Build an instance from a JSON-style dict. Testing convenience only.

    This is called only by code that runs exclusively in development mode.
    Do not rely on it in production code; if a new feature requires breaking
    it, feel free.

    Args:
      json: dict holding the field values ('video_title', 'time_watched', ...)
      video: value stored as the ``video`` field
      user: optional user; when falsy, one is created from ``json['user']``
    """
    if not user:
        user = users.User(json['user'])

    fields = dict(
        user=user,
        video=video,
        video_title=json['video_title'],
        time_watched=util.parse_iso8601(json['time_watched']),
        seconds_watched=int(json['seconds_watched']),
        last_second_watched=int(json['last_second_watched']),
        points_earned=int(json['points_earned']),
        playlist_titles=json['playlist_titles'],
    )
    return cls(**fields)
def from_json(cls, json, user_data):
    """Build an instance from a JSON-style dict. Testing convenience only.

    This is called only by code that runs exclusively in development mode.
    Do not rely on it in production code; if a new feature requires breaking
    it, feel free.

    Args:
      json: dict holding the field values; ``json['video']['readable_id']``
        selects the video via ``Video.get_for_readable_id``
      user_data: object supplying ``user`` and used to derive the key name
    """
    video = Video.get_for_readable_id(json['video']['readable_id'])

    fields = dict(
        key_name=UserVideo.get_key_name(video, user_data),
        user=user_data.user,
        video=video,
        last_watched=util.parse_iso8601(json['last_watched']),
        last_second_watched=int(json['last_second_watched']),
        seconds_watched=int(json['seconds_watched']),
        duration=int(json['duration']),
        completed=bool(json['completed']),
    )
    return cls(**fields)
def get_departures():
    """Fetch the next departures for the configured quay from the Entur API.

    Posts a GraphQL query asking for 3 estimated calls on the quay whose id is
    ``config.get("quay_id")``. The response is always closed, including when
    decoding its JSON body fails.

    Returns:
      A list with one ``(departure_time, has_incident)`` tuple per estimated
      call: ``departure_time`` is ``parse_iso8601`` applied to
      ``expectedDepartureTime``, and ``has_incident`` is True when any of the
      call's situations has ``reportType == "incident"``.
    """
    ENTUR_API = "https://api.entur.io/journey-planner/v2/graphql"
    USER_AGENT = "jark_technology - departure-iot"
    QUERY = ''' query Depatures($quay_id: String!, $numberOfDepartures: Int) { quay(id: $quay_id) { estimatedCalls(numberOfDepartures: $numberOfDepartures) { expectedDepartureTime situations { reportType } } } } '''

    response = urequests.post(
        ENTUR_API,
        headers={'Accept': 'application/json', "ET-Client-Name": USER_AGENT},
        json=dict(query=QUERY,
                  variables={"quay_id": config.get("quay_id"),
                             "numberOfDepartures": 3}))
    try:
        # a decoding error propagates unchanged; finally still closes the
        # response
        result = response.json()
    finally:
        response.close()

    if __debug__:
        log.debug(result)

    return [
        (
            parse_iso8601(call["expectedDepartureTime"]),
            any(situation['reportType'] == "incident"
                for situation in call['situations']),
        )
        for call in result["data"]["quay"]["estimatedCalls"]
    ]
def backfeed(self, source, responses=None, activities=None):
    """Processes responses and activities and generates propagate tasks.

    Stores property names and values to update in source.updates. Nothing is
    returned; results are recorded in source.updates and in the Response
    entities that are saved.

    Note that the responses dict is modified in place: mentions, quote
    mentions and extracted responses are added to it, and responses that
    match the source's seen-responses cache are deleted from it.

    Args:
      source: Source
      responses: dict mapping AS response id to AS object
      activities: dict mapping AS activity id to AS object
    """
    if responses is None:
        responses = {}
    if activities is None:
        activities = {}

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
        (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    # published timestamps are compared as strings below, so the stored
    # datetime is converted to its ISO string form first
    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls(
        [a.get('published') for a in public.values()])
    if public_published:
        max_published = max(public_published)
        if max_published > last_public_post:
            last_public_post = max_published
            source.updates['last_public_post'] = \
                util.as_utc(util.parse_iso8601(max_published))

    # count private posts published after the most recent public post
    source.updates['recent_private_posts'] = \
        len([a for a in private.values()
             if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
        obj = activity.get('object') or activity

        # handle user mentions
        user_id = source.user_tag_id()
        if obj.get('author', {}).get('id') != user_id and activity.get('verb') != 'share':
            for tag in obj.get('tags', []):
                urls = tag.get('urls')
                if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
                    activity['originals'], activity['mentions'] = \
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    activity['mentions'].update(u.get('value') for u in urls)
                    # the mentioning activity itself becomes a response
                    responses[id] = activity
                    break

        # handle quote mentions
        for att in obj.get('attachments', []):
            if (att.get('objectType') in ('note', 'article')
                    and att.get('author', {}).get('id') == source.user_tag_id()):
                # now that we've confirmed that one exists, OPD will dig
                # into the actual attachments
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                responses[id] = activity
                break

        # extract replies, likes, reactions, reposts, and rsvps
        replies = obj.get('replies', {}).get('items', [])
        tags = obj.get('tags', [])
        likes = [t for t in tags if Response.get_type(t) == 'like']
        reactions = [t for t in tags if Response.get_type(t) == 'react']
        reposts = [t for t in tags if Response.get_type(t) == 'repost']
        rsvps = Source.get_rsvps_from_event(obj)

        # coalesce responses. drop any without ids
        for resp in replies + likes + reactions + reposts + rsvps:
            id = resp.get('id')
            if not id:
                logging.error('Skipping response without id: %s',
                              json_dumps(resp, indent=2))
                continue

            if source.is_blocked(resp):
                logging.info(
                    'Skipping response by blocked user: %s',
                    json_dumps(resp.get('author') or resp.get('actor'), indent=2))
                continue

            resp.setdefault('activities', []).append(activity)

            # when we find two responses with the same id, the earlier one may have
            # come from a link post or user mention, and this one is probably better
            # since it probably came from the user's activity, so prefer this one.
            # background: https://github.com/snarfed/bridgy/issues/533
            existing = responses.get(id)
            if existing:
                if source.gr_source.activity_changed(resp, existing, log=True):
                    logging.warning(
                        'Got two different versions of same response!\n%s\n%s',
                        existing, resp)
                # keep the activities already attached to the earlier version
                resp['activities'].extend(existing.get('activities', []))

            responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
        for seen in json_loads(source.seen_responses_cache_json):
            id = seen['id']
            resp = responses.get(id)
            if resp and not source.gr_source.activity_changed(
                    seen, resp, log=True):
                unchanged_responses.append(seen)
                del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    # reset so that the block list is loaded at most once, and only if some
    # response actually has webmention targets (see below)
    source.blocked_ids = None

    for id, resp in responses.items():
        resp_type = Response.get_type(resp)
        activities = resp.pop('activities', [])
        if not activities and resp_type == 'post':
            # a post with no attached activities is treated as its own activity
            activities = [resp]
        too_long = set()
        # maps each target URL to the index (in activities) that produced it
        urls_to_activity = {}
        for i, activity in enumerate(activities):
            # we'll usually have multiple responses for the same activity, and the
            # objects in resp['activities'] are shared, so cache each activity's
            # discovered webmention targets inside its object.
            if 'originals' not in activity or 'mentions' not in activity:
                activity['originals'], activity['mentions'] = \
                    original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

            targets = original_post_discovery.targets_for_response(
                resp, originals=activity['originals'], mentions=activity['mentions'])
            if targets:
                logging.info('%s has %d webmention target(s): %s',
                             activity.get('url'), len(targets), ' '.join(targets))
                # new response to propagate! load block list if we haven't already
                if source.blocked_ids is None:
                    source.load_blocklist()

            for t in targets:
                if len(t) <= _MAX_STRING_LENGTH:
                    urls_to_activity[t] = i
                else:
                    logging.info(
                        'Giving up on target URL over %s chars! %s',
                        _MAX_STRING_LENGTH, t)
                    # record a truncated form of the URL as failed
                    too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

        # store/update response entity. the prune_*() calls are important to
        # remove circular references in link responses, which are their own
        # activities. details in the step 2 comment above.
        pruned_response = util.prune_response(resp)
        pruned_responses.append(pruned_response)
        resp_entity = Response(id=id,
                               source=source.key,
                               activities_json=[
                                   json_dumps(
                                       util.prune_activity(a, source))
                                   for a in activities
                               ],
                               response_json=json_dumps(pruned_response),
                               type=resp_type,
                               unsent=list(urls_to_activity.keys()),
                               failed=list(too_long),
                               original_posts=resp.get('originals', []))
        # the URL -> activity index map is only stored when there is more than
        # one activity
        if urls_to_activity and len(activities) > 1:
            resp_entity.urls_to_activity = json_dumps(urls_to_activity)
        resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

    # update cache
    if pruned_responses:
        source.updates['seen_responses_cache_json'] = json_dumps(
            pruned_responses + unchanged_responses)
def poll(self, source):
    """Actually runs the poll.

    Fetches the source's activities and link posts, extracts responses from
    the public ones, drops responses that are unchanged since the last poll,
    and saves a Response entity for each remaining one. Finally, if
    source.should_refetch() says so, refetches the author's h-feed and
    repropagates old responses for any new syndication relationships.

    Stores property names and values to update in source.updates.

    Args:
      source: the source entity to poll
    """
    if source.last_activities_etag or source.last_activity_id:
        logging.debug(
            "Using ETag %s, last activity id %s",
            source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
        cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
        fetch_replies=True,
        fetch_likes=True,
        fetch_shares=True,
        fetch_mentions=True,
        count=50,
        etag=source.last_activities_etag,
        min_id=source.last_activity_id,
        cache=cache,
    )
    etag = resp.get("etag")  # used later
    user_activities = resp.get("items", [])

    # these map ids to AS objects
    responses = {a["id"]: a for a in links}
    activities = {a["id"]: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
        # maybe replace stored last activity id
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        silo_activity_ids.add(id)
        try:
            # try numeric comparison first
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            # fall back to comparing the raw ids when either is not numeric
            greater = id > last_activity_id
        if greater:
            last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
        source.updates["last_activity_id"] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates["last_activities_cache_json"] = json.dumps(
        {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
    )

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
        (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info("Found %d public activities: %s", len(public), public.keys())
    logging.info("Found %d private activities: %s", len(private), private.keys())

    # published timestamps are compared as strings below, so the stored
    # datetime is converted to its ISO string form first
    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get("published") for a in public.values()])
    if public_published:
        max_published = max(public_published)
        if max_published > last_public_post:
            last_public_post = max_published
            source.updates["last_public_post"] = util.as_utc(
                util.parse_iso8601(max_published))

    # count private posts published after the most recent public post
    source.updates["recent_private_posts"] = len(
        [a for a in private.values()
         if a.get("published", util.EPOCH_ISO) > last_public_post]
    )

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
        obj = activity.get("object") or activity

        # handle user mentions
        user_id = source.user_tag_id()
        if obj.get("author", {}).get("id") != user_id:
            for tag in obj.get("tags", []):
                urls = tag.get("urls")
                if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )
                    activity["mentions"].update(u.get("value") for u in urls)
                    # the mentioning activity itself becomes a response
                    responses[id] = activity
                    break

        # handle quote mentions
        for att in obj.get("attachments", []):
            if (
                att.get("objectType") in ("note", "article")
                and att.get("author", {}).get("id") == source.user_tag_id()
            ):
                # now that we've confirmed that one exists, OPD will dig
                # into the actual attachments
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )
                responses[id] = activity
                break

        # extract replies, likes, reactions, reposts, and rsvps
        replies = obj.get("replies", {}).get("items", [])
        tags = obj.get("tags", [])
        likes = [t for t in tags if Response.get_type(t) == "like"]
        reactions = [t for t in tags if Response.get_type(t) == "react"]
        reposts = [t for t in tags if Response.get_type(t) == "repost"]
        rsvps = Source.get_rsvps_from_event(obj)

        # coalesce responses. drop any without ids
        for resp in replies + likes + reactions + reposts + rsvps:
            id = resp.get("id")
            if not id:
                logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                continue

            resp.setdefault("activities", []).append(activity)

            # when we find two responses with the same id, the earlier one may have
            # come from a link post or user mention, and this one is probably better
            # since it probably came from the user's activity, so prefer this one.
            # background: https://github.com/snarfed/bridgy/issues/533
            existing = responses.get(id)
            if existing:
                if source.gr_source.activity_changed(resp, existing, log=True):
                    logging.warning(
                        "Got two different versions of same response!\n%s\n%s",
                        existing, resp)
                # keep the activities already attached to the earlier version
                resp["activities"].extend(existing.get("activities", []))

            responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
        for seen in json.loads(source.seen_responses_cache_json):
            id = seen["id"]
            resp = responses.get(id)
            if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                unchanged_responses.append(seen)
                del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
        resp_type = Response.get_type(resp)
        activities = resp.pop("activities", [])
        if not activities and resp_type == "post":
            # a post with no attached activities is treated as its own activity
            activities = [resp]
        too_long = set()
        # maps each target URL to the index (in activities) that produced it
        urls_to_activity = {}
        for i, activity in enumerate(activities):
            # we'll usually have multiple responses for the same activity, and the
            # objects in resp['activities'] are shared, so cache each activity's
            # discovered webmention targets inside its object.
            if "originals" not in activity or "mentions" not in activity:
                activity["originals"], activity["mentions"] = original_post_discovery.discover(
                    source,
                    activity,
                    fetch_hfeed=True,
                    include_redirect_sources=False,
                    already_fetched_hfeeds=fetched_hfeeds,
                )

            targets = original_post_discovery.targets_for_response(
                resp, originals=activity["originals"], mentions=activity["mentions"]
            )
            if targets:
                logging.info(
                    "%s has %d webmention target(s): %s",
                    activity.get("url"), len(targets), " ".join(targets)
                )
            for t in targets:
                if len(t) <= _MAX_STRING_LENGTH:
                    urls_to_activity[t] = i
                else:
                    logging.warning(
                        "Giving up on target URL over %s chars! %s",
                        _MAX_STRING_LENGTH, t)
                    # record a truncated form of the URL as failed
                    too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

        # store/update response entity. the prune_*() calls are important to
        # remove circular references in link responses, which are their own
        # activities. details in the step 2 comment above.
        pruned_response = util.prune_response(resp)
        pruned_responses.append(pruned_response)
        resp_entity = Response(
            id=id,
            source=source.key,
            activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
            response_json=json.dumps(pruned_response),
            type=resp_type,
            unsent=list(urls_to_activity.keys()),
            failed=list(too_long),
            original_posts=resp.get("originals", []),
        )
        # the URL -> activity index map is only stored when there is more than
        # one activity
        if urls_to_activity and len(activities) > 1:
            resp_entity.urls_to_activity = json.dumps(urls_to_activity)
        resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
        source.updates["seen_responses_cache_json"] = json.dumps(
            pruned_responses + unchanged_responses)

    source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
    if etag and etag != source.last_activities_etag:
        source.updates["last_activities_etag"] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
        logging.info("refetching h-feed for source %s", source.label())
        relationships = original_post_discovery.refetch(source)

        now = util.now_fn()
        source.updates["last_hfeed_refetch"] = now

        if relationships:
            logging.info(
                "refetch h-feed found new rel=syndication relationships: %s",
                relationships)
            try:
                self.repropagate_old_responses(source, relationships)
            except BaseException as e:
                # "as e" (not ", e") so this parses on Python 3 as well as 2.6+.
                # datastore and connection failures are logged and tolerated;
                # anything else propagates.
                if isinstance(
                    e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                ) or util.is_connection_failure(e):
                    logging.info("Timeout while repropagating responses.", exc_info=True)
                else:
                    raise
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

    Request parameters:
      source_key: string key of source entity
      last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

    Inserts a propagate task for each response that hasn't been seen before.
    """

    def post(self, *path_args):
        """Handles one poll task: polls the source and schedules the next poll.

        The task is dropped without polling when the source is missing,
        disabled, or lacks the 'listen' feature, and when the last_polled
        request parameter does not match the source's stored value (which is
        treated as a duplicate task).

        Whatever poll() stored in source.updates is always written back, even
        if poll() raised. The next poll task is added only when this method
        does not exit with an exception.
        """
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            # deliberately catches everything: the error status is recorded
            # for any failure and the exception is then re-raised unchanged.
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.

        Args:
          source: the source entity to poll

        Raises:
          models.DisableSource: when fetching activities fails with HTTP 401.

        Fetch failures are otherwise handled as follows: a rate-limit code
        marks the poll as an error and returns early; a 5xx code or a
        connection failure aborts the request with ERROR_HTTP_RETURN_CODE;
        any other exception propagates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True, fetch_likes=True, fetch_shares=True,
                fetch_mentions=True, count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id, cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            # "as e" (not ", e") so this parses on Python 3 as well as 2.6+
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            # floor division: with true division only code 500 itself would
            # equal 5, so 501-599 would fall through to the bare raise
            elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                # fall back to comparing the raw ids when either is not numeric
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info('Found %d public activities: %s', len(public), public.keys())
        logging.info('Found %d private activities: %s', len(private), private.keys())

        # published timestamps are compared as strings below, so the stored
        # datetime is converted to its ISO string form first
        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                    util.as_utc(util.parse_iso8601(max_published))

        # count private posts published after the most recent public post
        source.updates['recent_private_posts'] = \
            len([a for a in private.values()
                 if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                            original_post_discovery.discover(
                                source, activity, fetch_hfeed=True,
                                include_redirect_sources=False,
                                already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        # the mentioning activity itself becomes a response
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article')
                        and att.get('author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                            original_post_discovery.discover(
                                source, activity, fetch_hfeed=True,
                                include_redirect_sources=False,
                                already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    # keep the activities already attached to the earlier version
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                # a post with no attached activities is treated as its own activity
                activities = [resp]
            too_long = set()
            # maps each target URL to the index (in activities) that produced it
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        # record a truncated form of the URL as failed
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            # the URL -> activity index map is only stored when there is more
            # than one activity
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    # datastore and connection failures are logged and
                    # tolerated; anything else propagates.
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
def backfeed(self, source, responses=None, activities=None):
    """Processes responses and activities and generates propagate tasks.

    Stores property names and values to update in source.updates:
    'last_public_post' (when a newer public post is seen),
    'recent_private_posts', and 'seen_responses_cache_json' (when any new or
    changed response was stored).

    Side effects on the arguments: ``responses`` is modified in place
    (entries are added in step 2 and unchanged ones are deleted in step 3),
    the 'activities' key is popped off each remaining response in step 4, and
    activities get 'originals' and 'mentions' keys caching their discovered
    webmention targets.

    Args:
      source: Source
      responses: dict mapping AS response id to AS object
      activities: dict mapping AS activity id to AS object
    """
    if responses is None:
        responses = {}
    if activities is None:
        activities = {}

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for activity_id, activity in activities.items():
        bucket = public if source.is_activity_public(activity) else private
        bucket[activity_id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    # published timestamps are compared as ISO 8601 strings
    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls(
        [a.get('published') for a in public.values()])
    if public_published:
        max_published = max(public_published)
        if max_published > last_public_post:
            last_public_post = max_published
            source.updates['last_public_post'] = util.as_utc(
                util.parse_iso8601(max_published))

    # 'published' may be present but null. Fall back to the epoch in that case
    # too (not only when the key is missing) so that the string comparison
    # can't raise TypeError.
    source.updates['recent_private_posts'] = sum(
        1 for a in private.values()
        if (a.get('published') or util.EPOCH_ISO) > last_public_post)

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for activity_id, activity in public.items():
        obj = activity.get('object') or activity

        # handle user mentions
        user_id = source.user_tag_id()
        if obj.get('author', {}).get('id') != user_id:
            for tag in obj.get('tags', []):
                urls = tag.get('urls')
                if (tag.get('objectType') == 'person'
                        and tag.get('id') == user_id and urls):
                    activity['originals'], activity['mentions'] = (
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds))
                    activity['mentions'].update(u.get('value') for u in urls)
                    responses[activity_id] = activity
                    break

        # handle quote mentions
        for att in obj.get('attachments', []):
            if (att.get('objectType') in ('note', 'article')
                    and att.get('author', {}).get('id') == source.user_tag_id()):
                # now that we've confirmed that one exists, OPD will dig
                # into the actual attachments
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = (
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds))
                responses[activity_id] = activity
                break

        # extract replies, likes, reactions, reposts, and rsvps
        replies = obj.get('replies', {}).get('items', [])
        tags = obj.get('tags', [])
        likes = [t for t in tags if Response.get_type(t) == 'like']
        reactions = [t for t in tags if Response.get_type(t) == 'react']
        reposts = [t for t in tags if Response.get_type(t) == 'repost']
        rsvps = Source.get_rsvps_from_event(obj)

        # coalesce responses. drop any without ids
        for resp in replies + likes + reactions + reposts + rsvps:
            # separate name from activity_id: the response id must not clobber
            # the enclosing loop's activity id
            resp_id = resp.get('id')
            if not resp_id:
                logging.error('Skipping response without id: %s',
                              json.dumps(resp, indent=2))
                continue

            if source.is_blocked(resp):
                logging.info('Skipping response by blocked user: %s',
                             json.dumps(resp.get('author') or resp.get('actor'),
                                        indent=2))
                continue

            resp.setdefault('activities', []).append(activity)

            # when we find two responses with the same id, the earlier one may have
            # come from a link post or user mention, and this one is probably better
            # since it probably came from the user's activity, so prefer this one.
            # background: https://github.com/snarfed/bridgy/issues/533
            existing = responses.get(resp_id)
            if existing:
                if source.gr_source.activity_changed(resp, existing, log=True):
                    logging.warning(
                        'Got two different versions of same response!\n%s\n%s',
                        existing, resp)
                resp['activities'].extend(existing.get('activities', []))

            responses[resp_id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
        for seen in json.loads(source.seen_responses_cache_json):
            seen_id = seen['id']
            resp = responses.get(seen_id)
            if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                unchanged_responses.append(seen)
                del responses[seen_id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for resp_id, resp in responses.items():
        resp_type = Response.get_type(resp)
        resp_activities = resp.pop('activities', [])
        if not resp_activities and resp_type == 'post':
            # a post with no activities of its own is its own activity
            resp_activities = [resp]

        too_long = set()
        urls_to_activity = {}
        for i, activity in enumerate(resp_activities):
            # we'll usually have multiple responses for the same activity, and the
            # objects in resp['activities'] are shared, so cache each activity's
            # discovered webmention targets inside its object.
            if 'originals' not in activity or 'mentions' not in activity:
                activity['originals'], activity['mentions'] = (
                    original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds))

            targets = original_post_discovery.targets_for_response(
                resp, originals=activity['originals'],
                mentions=activity['mentions'])
            if targets:
                logging.info('%s has %d webmention target(s): %s',
                             activity.get('url'), len(targets), ' '.join(targets))
            for t in targets:
                if len(t) <= _MAX_STRING_LENGTH:
                    # maps target URL to the index of its activity
                    urls_to_activity[t] = i
                else:
                    logging.info('Giving up on target URL over %s chars! %s',
                                 _MAX_STRING_LENGTH, t)
                    # record a truncated copy so the stored value stays
                    # within _MAX_STRING_LENGTH
                    too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

        # store/update response entity. the prune_*() calls are important to
        # remove circular references in link responses, which are their own
        # activities. details in the step 2 comment above.
        pruned_response = util.prune_response(resp)
        pruned_responses.append(pruned_response)
        resp_entity = Response(
            id=resp_id,
            source=source.key,
            activities_json=[json.dumps(util.prune_activity(a, source))
                             for a in resp_activities],
            response_json=json.dumps(pruned_response),
            type=resp_type,
            unsent=list(urls_to_activity),
            failed=list(too_long),
            original_posts=resp.get('originals', []))
        if urls_to_activity and len(resp_activities) > 1:
            resp_entity.urls_to_activity = json.dumps(urls_to_activity)
        resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

    # update cache
    if pruned_responses:
        source.updates['seen_responses_cache_json'] = json.dumps(
            pruned_responses + unchanged_responses)
def test_parse_iso_date(self):
    """Checks the year parsed from a timestamp with microseconds and a +00:00 offset."""
    parsed = parse_iso8601('2021-02-05T20:21:28.674000+00:00')
    self.assertEqual(2021, parsed.date().year)