def rebuild_from_query(cls, force=False):
    logging.info("Loading Index")
    if cls._is_ndb():
        db_query = cls.obj_type.query(*cls._get_query_params_for_indexing())
    else:
        db_query = cls.obj_type.all()
    object_keys = db_query.fetch(MAX_OBJECTS, keys_only=True)
    object_ids = set(cls._get_id(x) for x in object_keys)
    logging.info("Loaded %s objects for indexing", len(object_ids))
    if len(object_ids) >= MAX_OBJECTS:
        logging.critical('Found %s objects. Increase the MAX_OBJECTS limit to search more events.', MAX_OBJECTS)
    doc_index = cls.real_index()
    docs_per_group = search.MAXIMUM_DOCUMENTS_PER_PUT_REQUEST

    logging.info("Deleting Expired docs")
    start_id = '0'
    doc_ids_to_delete = set()
    while True:
        doc_ids = [x.doc_id for x in doc_index.get_range(ids_only=True, start_id=start_id, include_start_object=False)]
        if not doc_ids:
            break
        new_ids_to_delete = set(doc_ids).difference(object_ids)
        doc_ids_to_delete.update(new_ids_to_delete)
        logging.info("Looking at %s doc_id candidates for deletion, will delete %s entries.", len(doc_ids), len(new_ids_to_delete))
        start_id = doc_ids[-1]
    if not force and len(doc_ids_to_delete) and len(doc_ids_to_delete) > len(object_ids) * cls.delete_threshold:
        logging.critical(
            "Deleting %s docs, more than %d%% of total %s docs", len(doc_ids_to_delete), cls.delete_threshold * 100, len(object_ids)
        )
        return
    logging.info("Deleting %s docs", len(doc_ids_to_delete))
    cls.delete_ids(list(doc_ids_to_delete))

    # Add all events
    logging.info("Loading %s docs, in groups of %s", len(object_ids), docs_per_group)
    object_ids_list = sorted(object_ids)
    for x in object_ids_list:
        logging.info('index: %s', x)
    for i in range(0, len(object_ids_list), docs_per_group):
        group_object_ids = object_ids_list[i:i + docs_per_group]
        deferred.defer(cls._save_ids, group_object_ids)
def post(self):
    event_id = self.request.get('event_id')
    remapped_address = self.request.get('remapped_address')
    override_address = self.request.get('override_address')

    if self.request.get('delete'):
        e = eventdata.DBEvent.get_by_id(event_id)
        # This e will be None if the user submits a deletion-form twice
        if e:
            event_updates.delete_event(e)
        self.user.add_message("Event deleted!")
        return self.redirect('/events/admin_edit?event_id=%s' % event_id)

    # We could be looking at a potential event for something that is inaccessible to our admin.
    # So we want to grab the cached value here if possible, which should exist given the admin-edit flow.
    fb_event = get_fb_event(self.fbl, event_id)
    logging.info("Fetched fb_event %s", fb_event)
    if not fb_events.is_public_ish(fb_event):
        self.add_error('Cannot add secret/closed events to dancedeets!')
    self.errors_are_fatal()

    if self.request.get('background'):
        deferred.defer(
            add_entities.add_update_event,
            fb_event,
            self.fbl,
            creating_uid=self.user.fb_uid,
            remapped_address=remapped_address,
            override_address=override_address,
            creating_method=eventdata.CM_ADMIN
        )
        self.response.out.write("<title>Added!</title>Added!")
    else:
        try:
            add_entities.add_update_event(
                fb_event,
                self.fbl,
                creating_uid=self.user.fb_uid,
                remapped_address=remapped_address,
                override_address=override_address,
                creating_method=eventdata.CM_ADMIN
            )
        except Exception as e:
            logging.exception('Error adding event')
            self.add_error(str(e))
        self.errors_are_fatal()
        self.user.add_message("Changes saved!")
        return self.redirect('/events/admin_edit?event_id=%s' % event_id)
def process_event_source_ids(discovered_list, fbl):
    # TODO(lambert): maybe trim any ids from posts with dates "past" the last time we scraped? tricky to get correct though
    logging.info("Processing %s discovered events", len(discovered_list))
    event_pipeline.process_discovered_events(fbl, discovered_list)

    # TODO: Should only run this code on events that we actually decide are worth adding
    if False:
        potential_new_source_ids = set([x.extra_source_id for x in discovered_list if x.extra_source_id])
        existing_source_ids = set([x.graph_id for x in thing_db.Source.get_by_key_name(potential_new_source_ids) if x])
        new_source_ids = set([x for x in potential_new_source_ids if x not in existing_source_ids])
        for source_id in new_source_ids:
            # TODO(lambert): we know it doesn't exist, why does create_source_from_id check datastore?
            s = thing_db.Source(key_name=source_id)
            s.put()
        logging.info("Found %s new sources", len(new_source_ids))

        # initiate an out-of-band scrape for the new sources we found
        if new_source_ids:
            deferred.defer(scrape_events_from_source_ids, fbl, new_source_ids)
def function_migrate_thing_to_new_id(fbapi_obj, old_source_id, new_source_id):
    old_source = thing_db.Source.get_by_key_name(old_source_id)
    # Maybe we got two of these and it already ran in parallel, so ignore this one
    if not old_source:
        return
    fbl = fb_api.FBLookup(None, fbapi_obj.access_token_list)
    fbl.fb.raise_on_page_redirect = True
    try:
        results = fbl.get(fb_api.LookupThingCommon, new_source_id)
    except fb_api.PageRedirectException as e:
        # If our forwarding address in turn has its own forwarding address,
        # repoint the old thing further down the chain
        deferred.defer(function_migrate_thing_to_new_id, fbl.fb, old_source_id, e.to_id)
        return

    new_source = thing_db.create_source_from_id(fbl, new_source_id)
    new_source.creating_fb_uid = new_source.creating_fb_uid or old_source.creating_fb_uid
    new_source.creation_time = new_source.creation_time or old_source.creation_time
    new_source.last_scrape_time = new_source.last_scrape_time or old_source.last_scrape_time
    new_source.num_all_events = (new_source.num_all_events or 0) + (old_source.num_all_events or 0)
    new_source.num_potential_events = (new_source.num_potential_events or 0) + (old_source.num_potential_events or 0)
    new_source.num_real_events = (new_source.num_real_events or 0) + (old_source.num_real_events or 0)
    new_source.num_false_negatives = (new_source.num_false_negatives or 0) + (old_source.num_false_negatives or 0)

    # Who has pointers to sources??
    migrate_potential_events(old_source_id, new_source_id)

    new_source.put()
    old_source.delete()
def post(self):
    if self.json_body['scrapinghub_key'] != keys.get('scrapinghub_key'):
        self.response.status = 403
        return

    events_to_update = []
    new_ids = set()
    for json_body in self.json_body['items']:
        event_id = eventdata.DBEvent.generate_id(json_body['namespace'], json_body['namespaced_id'])
        e = eventdata.DBEvent.get_or_insert(event_id)
        if e.creating_method is None:
            new_ids.add(event_id)
            e.creating_method = eventdata.CM_WEB_SCRAPE
        events_to_update.append((e, json_body))
    event_updates.update_and_save_web_events(events_to_update)

    for event_id in new_ids:
        logging.info("New event, publishing to twitter/facebook: %s", event_id)
        deferred.defer(pubsub.eventually_publish_event, event_id)

    process_upload_finalization(self.json_body['studio_name'])
    self.response.status = 200
def migrate_potential_events(old_source_id, new_source_id):
    #STR_ID_MIGRATE
    potential_event_list = potential_events.PotentialEvent.gql("WHERE source_ids = %s" % long(old_source_id)).fetch(100)
    for pe in potential_event_list:
        logging.info("old pe %s has ids: %s", pe.fb_event_id, [x.id for x in pe.sources()])
        source_infos = set()
        for source in pe.sources():
            # remap ids
            if source.id == old_source_id:  #STR_ID_MIGRATE
                source = source.copy()
                source.id = new_source_id
            source_infos.add(source)
        pe.set_sources(source_infos)
        logging.info("new pe %s has ids: %s", pe.fb_event_id, [x.id for x in pe.sources()])
        pe.put()
    if len(potential_event_list):
        # Tail recursion via task queues!
        deferred.defer(migrate_potential_events, old_source_id, new_source_id)
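
# A minimal sketch of the "tail recursion via task queues" pattern used by
# migrate_potential_events above: process a bounded batch per task, then re-enqueue
# the same function until nothing is left. The _SketchItem model and its 'processed'
# flag are hypothetical illustrations, not part of this codebase.
from google.appengine.ext import deferred, ndb


class _SketchItem(ndb.Model):
    processed = ndb.BooleanProperty(default=False)


def _sketch_process_unprocessed_items():
    # Grab at most one batch's worth of work, so each deferred task stays well under its deadline.
    batch = _SketchItem.query(_SketchItem.processed == False).fetch(100)
    for item in batch:
        item.processed = True
    ndb.put_multi(batch)
    # If this batch was non-empty, there may be more work: defer another run of ourselves.
    if batch:
        deferred.defer(_sketch_process_unprocessed_items)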
def setup_login_state(self, request):
    #TODO(lambert): change fb api to not request access token, and instead pull it from the user
    # only request the access token from FB when it's been longer than a day, and do it out-of-band to fetch-and-update-db-and-memcache
    self.fb_uid = None
    self.user = None
    self.access_token = None

    if len(request.get_all('nt')) > 1:
        logging.error('Have too many nt= parameters, something is Very Wrong!')
        for k, v in request.cookies.iteritems():
            logging.info("DEBUG: cookie %r = %r", k, v)

    # Load Facebook cookie
    try:
        response = facebook.parse_signed_request_cookie(request.cookies)
    except Cookie.CookieError:
        logging.exception("Error processing cookie")
        return
    fb_cookie_uid = None
    if response:
        fb_cookie_uid = response['user_id']
    logging.info("fb cookie id is %s", fb_cookie_uid)

    # Normally, our trusted source of login id is the FB cookie,
    # though we may override it below in the case of access_token_md5
    trusted_cookie_uid = fb_cookie_uid

    # for k, v in self.request.cookies.iteritems():
    #     logging.info('cookie %s = %s', k, v)

    # Load our dancedeets logged-in user/state
    our_cookie_uid = None
    user_login_string = self.get_login_cookie()
    if user_login_string:
        user_login_cookie = json.loads(urllib.unquote(user_login_string))
        logging.info("Got login cookie: %s", user_login_cookie)
        if validate_hashed_userlogin(user_login_cookie):
            our_cookie_uid = user_login_cookie['uid']
            # If we have a browser cookie that's verified via access_token_md5,
            # let's trust it as authoritative here and ignore the fb cookie
            if not trusted_cookie_uid and user_login_cookie.get('access_token_md5'):
                trusted_cookie_uid = our_cookie_uid
                logging.info("Validated cookie, logging in as %s", our_cookie_uid)

    if self.request.cookies.get('user_login', ''):
        logging.info("Deleting old-style user_login cookie")
        self.response.set_cookie('user_login', '', max_age=0, path='/', domain=self._get_login_cookie_domain())

    # If the user has changed facebook users, let's automatically re-login at dancedeets
    if trusted_cookie_uid and trusted_cookie_uid != our_cookie_uid:
        self.set_login_cookie(trusted_cookie_uid)
        our_cookie_uid = trusted_cookie_uid

    # Don't force-logout the user if there is an our_cookie_uid but not a trusted_cookie_uid
    # The fb cookie probably expired after a couple hours, and we'd prefer to keep our users logged-in

    # Logged-out view, just return without setting anything up
    if not our_cookie_uid:
        return

    self.fb_uid = our_cookie_uid
    self.user = users.User.get_by_id(self.fb_uid)

    # If we have a user, grab the access token
    if self.user:
        if trusted_cookie_uid:
            # Long-lived tokens should last "around" 60 days, so let's refresh-renew if there's only 40 days left
            if self.user.fb_access_token_expires:
                token_expires_soon = (self.user.fb_access_token_expires - datetime.datetime.now()) < datetime.timedelta(days=40)
            else:
                # These are either infinite-access tokens (which won't expire soon)
                # or they are ancient tokens (in which case, our User reload mapreduce has already set user.expired_oauth_token)
                token_expires_soon = False
            # Update the access token if necessary
            if self.user.expired_oauth_token or token_expires_soon or self.request.get('update_fb_access_token'):
                try:
                    access_token, access_token_expires = self.get_long_lived_token_and_expires(request)
                except TypeError:
                    logging.info("Could not access cookie")
                except facebook.AlreadyHasLongLivedToken:
                    logging.info("Already have long-lived token, FB wouldn't give us a new one, so no need to refresh anything.")
                else:
                    logging.info("New access token from cookie: %s, expires %s", access_token, access_token_expires)
                    if access_token:
                        self.user = users.User.get_by_id(self.fb_uid)
                        self.user.fb_access_token = access_token
                        self.user.fb_access_token_expires = access_token_expires
                        self.user.expired_oauth_token = False
                        self.user.expired_oauth_token_reason = None
                        # this also sets to memcache
                        self.user.put()
                        logging.info("Stored the new access_token to the User db")
                    else:
                        logging.error("Got a cookie, but no access_token. Using the one from the existing user. Strange!")
            if 'web' not in self.user.clients:
                self.user = users.User.get_by_id(self.fb_uid)
                self.user.clients.append('web')
                self.user.put()
                logging.info("Added the web client to the User db")
            self.access_token = self.user.fb_access_token
        else:
            self.access_token = self.user.fb_access_token
            logging.info("Have dd login cookie but no fb login cookie")
            if self.user.expired_oauth_token:
                self.fb_uid = None
                self.user = None
                self.access_token = None
                return
    elif trusted_cookie_uid:
        # if we don't have a user but do have a token, the user has granted us permissions, so let's construct the user now
        try:
            access_token, access_token_expires = self.get_long_lived_token_and_expires(request)
        except facebook.AlreadyHasLongLivedToken:
            logging.warning(
                "Don't have user, just trusted_cookie_uid. And unable to get long lived token for the incoming request. Giving up and doing logged-out"
            )
            self.fb_uid = None
            self.access_token = None
            self.user = None
            return
        self.access_token = access_token
        # Fix this ugly import hack:
        fbl = fb_api.FBLookup(self.fb_uid, self.access_token)
        fbl.debug = 'fbl' in self.debug_list
        fb_user = fbl.get(fb_api.LookupUser, self.fb_uid)

        referer = self.get_cookie('User-Referer')
        city = self.request.get('city') or self.get_location_from_headers() or get_location(fb_user)
        logging.info("User passed in a city of %r, facebook city is %s", self.request.get('city'), get_location(fb_user))
        ip = ips.get_remote_ip(self.request)
        user_creation.create_user_with_fbuser(
            self.fb_uid, fb_user, self.access_token, access_token_expires, city, ip, send_email=True, referer=referer, client='web'
        )
        # TODO(lambert): handle this MUUUCH better
        logging.info("Not a /login request and there is no user object, constructed one realllly-quick, and continuing on.")
        self.user = users.User.get_by_id(self.fb_uid)
        # Should not happen:
        if not self.user:
            logging.error("We still don't have a user!")
            self.fb_uid = None
            self.access_token = None
            self.user = None
            return
    else:
        # no user, no trusted_cookie_uid, but we have fb_uid from the user_login cookie
        logging.error("We have a user_login cookie, but no user, and no trusted_cookie_uid. Acting as logged-out")
        self.fb_uid = None
        self.access_token = None
        self.user = None
        return

    logging.info("Logged in uid %s with name %s and token %s", self.fb_uid, self.user.full_name, self.access_token)

    # Track last-logged-in state
    hour_ago = datetime.datetime.now() - datetime.timedelta(hours=1)
    if not getattr(self.user, 'last_login_time', None) or self.user.last_login_time < hour_ago:
        # Do this in a separate request so we don't increase latency on this call
        deferred.defer(update_last_login_time, self.user.fb_uid, datetime.datetime.now(), _queue='slow-queue')
    backgrounder.load_users([self.fb_uid], allow_cache=False)
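
# A minimal sketch, not the actual implementation, of what a hashed login-cookie check like
# validate_hashed_userlogin() above could look like: recompute an HMAC over the cookie's
# fields and compare it against the digest carried in the cookie before trusting the uid.
# The secret, field layout, and 'hash' key are assumptions for illustration only.
import hashlib
import hmac

_LOGIN_COOKIE_SECRET = 'replace-with-a-real-secret'


def _sketch_validate_hashed_userlogin(user_login_cookie):
    expected = hmac.new(
        _LOGIN_COOKIE_SECRET,
        '%s.%s' % (user_login_cookie['uid'], user_login_cookie.get('access_token_md5', '')),
        hashlib.sha256,
    ).hexdigest()
    # compare_digest avoids leaking timing information about how much of the digest matched
    return hmac.compare_digest(expected, user_login_cookie.get('hash', ''))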
def _fetch_object_keys(self, object_keys_to_lookup):
    logging.info("BatchLookup: Fetching IDs from FB: %s", object_keys_to_lookup)
    # initiate RPCs
    object_keys_to_rpcs = {}
    for object_key in object_keys_to_lookup:
        cls, oid = break_key(object_key)
        cls.track_lookup()
        parts_to_urls = cls.get_lookups(oid)
        batch_list = [
            dict(method='GET', name=part_key, relative_url=url, omit_response_on_success=False) for (part_key, url) in parts_to_urls
        ]
        rpc, token = self._create_rpc_for_batch(batch_list, cls.use_access_token)
        object_keys_to_rpcs[object_key] = rpc, token

    # fetch RPCs
    fetched_objects = {}
    for object_key, (object_rpc, object_token) in object_keys_to_rpcs.iteritems():
        cls, oid = break_key(object_key)
        parts_to_urls = cls.get_lookups(oid)
        mini_batch_list = [dict(name=part_key, relative_url=url) for (part_key, url) in parts_to_urls]
        this_object = {}
        this_object['empty'] = None
        object_is_bad = False
        rpc_results = self._map_rpc_to_data(object_rpc)
        if isinstance(rpc_results, list):
            named_results = zip(mini_batch_list, rpc_results)
        elif rpc_results is None:
            logging.warning("BatchLookup: Has empty rpc_results, perhaps due to URL fetch timeout")
            object_is_bad = True
            named_results = []
        else:
            error_code = rpc_results.get('error', {}).get('code')
            error_type = rpc_results.get('error', {}).get('type')
            error_message = rpc_results.get('error', {}).get('message')
            # expired/invalidated OAuth token for User objects. We use one OAuth token per BatchLookup, so no use continuing...
            # we don't trigger on UserEvents objects since those are often optional and we don't want to break on those,
            # or set invalid bits on those (get it from the User failures instead)
            if error_code == 190 and error_type == 'OAuthException':
                logging.warning("Error with expired token: %s", object_token)
                raise ExpiredOAuthToken(error_message)
            logging.error("BatchLookup: Error occurred on response, rpc_results is %s", rpc_results)
            object_is_bad = True
            named_results = []
        for batch_item, result in named_results:
            object_rpc_name = batch_item['name']
            if result is None:
                logging.warning("BatchLookup: Got timeout when requesting %s", batch_item)
                if object_rpc_name not in cls.optional_keys:
                    object_is_bad = True
                continue
            object_result_code = result['code']
            try:
                object_json = json.loads(result['body'])
            except:
                logging.error('Error parsing result body for %r: %r', batch_item, result)
                raise
            if object_result_code in [200, 400] and object_json is not None:
                error_code = None
                if type(object_json) == dict and ('error_code' in object_json or 'error' in object_json):
                    error_code = object_json.get('error_code', object_json.get('error', {}).get('code', None))
                if error_code == 100:
                    # This means the event exists, but the current access_token is insufficient to query it
                    this_object['empty'] = EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS
                elif error_code == 21:
                    message = object_json['error']['message']
                    # Facebook gave us a huge hack when they decided to rename/merge page ids,
                    # and so we are forced to deal with remapping by parsing strings at this lowest level.
                    # "Page ID 289919164441106 was migrated to page ID 175608368718. Please update your API calls to the new ID"
                    # But only do it once per object, so rely on object_is_bad to tell us whether we've been through this before
                    if not object_is_bad and re.search('Page ID \d+ was migrated to page ID \d+.', message):
                        from_id, to_id = re.findall(r'ID (\d+)', message)
                        if self.raise_on_page_redirect:
                            raise PageRedirectException(from_id, to_id)
                        else:
                            from event_scraper import thing_db_fixer
                            from util import deferred
                            logging.warning(message)
                            logging.warning("Executing deferred call to migrate to new ID, returning None here.")
                            deferred.defer(thing_db_fixer.function_migrate_thing_to_new_id, self, from_id, to_id)
                    object_is_bad = True
                elif error_code in [
                    2,  # Temporary API error: An unexpected error has occurred. Please retry your request later.
                    2500,  # Dependent-lookup on non-existing field: Cannot specify an empty identifier.
                ]:
                    # Handle errors as documented here: https://developers.facebook.com/docs/graph-api/using-graph-api/v2.0#errors
                    logging.warning("BatchLookup: Error code from FB server for %s: %s: %s", object_rpc_name, error_code, object_json)
                    if object_rpc_name not in cls.optional_keys:
                        object_is_bad = True
                elif error_code:
                    logging.error("BatchLookup: Error code from FB server for %s: %s: %s", object_rpc_name, error_code, object_json)
                    if object_rpc_name not in cls.optional_keys:
                        object_is_bad = True
                elif object_json == False:
                    this_object['empty'] = EMPTY_CAUSE_DELETED
                else:
                    this_object[object_rpc_name] = object_json
            else:
                logging.warning("BatchLookup: Got code %s when requesting %s: %s", object_result_code, batch_item, result)
                if object_rpc_name not in cls.optional_keys:
                    object_is_bad = True
        if object_is_bad:
            logging.warning("BatchLookup: Failed to complete object: %s, only have keys %s", object_key, this_object.keys())
        else:
            fetched_objects[object_key] = this_object
    return fetched_objects
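
# For reference, a rough sketch of the Graph API batch call that _create_rpc_for_batch above
# presumably issues: the per-part dicts (method/name/relative_url/omit_response_on_success)
# are JSON-encoded into a single 'batch' POST parameter, and each element of the response
# array carries the 'code' and 'body' fields that the loop above unpacks. The endpoint,
# parameters, and synchronous urllib2 call here are simplifying assumptions.
import json
import urllib
import urllib2


def _sketch_graph_batch_request(access_token, batch_list):
    payload = urllib.urlencode({
        'access_token': access_token,
        'batch': json.dumps(batch_list),
    })
    # Each response item looks roughly like {"code": 200, "body": "{...}"}, or null on timeout.
    response = urllib2.urlopen('https://graph.facebook.com/', payload)
    return json.loads(response.read())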
def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        disable_updates = params['disable_updates']
        only_if_updated = params['only_if_updated']
    else:
        disable_updates = []
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
    for web_event in web_events:
        if event_updates.need_forced_update(web_event):
            events_to_update.append((web_event, web_event.web_event))
    event_updates.update_and_save_web_events(events_to_update, disable_updates=disable_updates)

    # Now process fb_events
    db_events = [x for x in all_events if x.is_fb_event]
    logging.info("loading db events %s", [db_event.fb_event_id for db_event in db_events])

    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in db_events])
    fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events])
    # We load these too, just in case we want to check up on our auto-attendee criteria for events
    fbl.request_multi(fb_api.LookupEventAttendingMaybe, [x.fb_event_id for x in db_events])
    # fbl.request_multi(fb_api.LookupEventPageComments, [x.fb_event_id for x in db_events])
    fbl.batch_fetch()

    events_to_update = []
    empty_fb_event_ids = []
    for db_event in db_events:
        try:
            real_fb_event = fbl.fetched_data(fb_api.LookupEvent, db_event.fb_event_id)
            # If it's an empty fb_event with our main access token, and we have other tokens we'd like to try...
            # If there are no visible_to_fb_uids and we don't have permissions, then we don't do this...
            #
            # TODO: This would happen on event deletion?
            #
            # TODO: Also, who sets visible_to_fb_uids? Why didn't this event have any?
            # TODO: Who re-sets visible_to_fb_uids after it goes empty? Can we ensure that keeps going?
            #
            # TODO: And what happens if we have a deleted event, with visible_to_fb_uids, that we attempt to run and query, and nothing happens?
            # Should we distinguish between deleted (and inaccessible) and permissions-lost-to-token (and inaccessible)?
            #
            # TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned
            if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids:
                empty_fb_event_ids.append(db_event.fb_event_id)
            else:
                # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here.
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)

    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids)
        deferred.defer(
            load_fb_events_using_backup_tokens,
            empty_fb_event_ids,
            allow_cache=fbl.allow_cache,
            only_if_updated=only_if_updated,
            disable_updates=disable_updates
        )

    logging.info("Updating events: %s", [x[0].id for x in events_to_update])
    # And then re-save all the events in here
    if events_to_update:
        event_updates.update_and_save_fb_events(events_to_update, disable_updates=disable_updates)