def mark_stage_done(user_id, stage, last_processed_ts):
    """
    Record that the current run of `stage` for `user_id` finished.

    The in-progress run timestamp (curr_run_ts) is copied into last_ts_run,
    last_processed_ts is overwritten when a value is supplied, curr_run_ts is
    cleared, and the state is saved to the pipeline state db.

    :param user_id: the user whose pipeline state is updated
    :param stage: the pipeline stage that just completed
    :param last_processed_ts: the timestamp up to which the stage processed
        data, or None to leave the stored value untouched
    """
    # We move failed entries to the error timeseries. So usercache runs never fail.
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    state.last_ts_run = state.curr_run_ts

    # We cannot assume that everything up to the start of the last run has
    # been processed: because of connectivity problems, phones may still hold
    # data collected before that run started, and advancing the marker that far
    # would skip it. That holds for every marker derived from client collected
    # data (trip start ts, etc); only server generated data is safe. So the
    # stage itself tells us how far it got.
    if last_processed_ts is None:
        logging.info("For stage %s, last_ts_processed is unchanged" % stage)
    else:
        logging.info("For stage %s, last_ts_processed = %s" %
                     (stage,
                      pydt.datetime.utcfromtimestamp(last_processed_ts).isoformat()))
        state.last_processed_ts = last_processed_ts

    # No run is in progress any more
    state.curr_run_ts = None
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
def mark_stage_done(user_id, stage, last_processed_ts):
    """
    Record that the current run of `stage` for `user_id` finished.

    The in-progress run timestamp (curr_run_ts) is copied into last_ts_run,
    last_processed_ts is overwritten when a value is supplied, curr_run_ts is
    cleared, and the state is saved to the pipeline state db.

    :param user_id: the user whose pipeline state is updated
    :param stage: the pipeline stage that just completed
    :param last_processed_ts: the timestamp up to which the stage processed
        data, or None to leave the stored value untouched
    """
    # We move failed entries to the error timeseries. So usercache runs never fail.
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    state.last_ts_run = state.curr_run_ts

    # We cannot assume that everything up to the start of the last run has
    # been processed: because of connectivity problems, phones may still hold
    # data collected before that run started, and advancing the marker that far
    # would skip it. That holds for every marker derived from client collected
    # data (trip start ts, etc); only server generated data is safe. So the
    # stage itself tells us how far it got.
    if last_processed_ts is None:
        logging.info("For stage %s, last_ts_processed is unchanged" % stage)
    else:
        logging.info("For stage %s, last_ts_processed = %s" %
                     (stage,
                      pydt.datetime.utcfromtimestamp(last_processed_ts).isoformat()))
        state.last_processed_ts = last_processed_ts

    # No run is in progress any more
    state.curr_run_ts = None
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
def clearRelatedDb(self):
    """
    Delete every timeseries, analysis timeseries and pipeline state document
    that belongs to either of the two test users.
    """
    collection_getters = (
        edb.get_timeseries_db,
        edb.get_analysis_timeseries_db,
        edb.get_pipeline_state_db,
    )
    # First user completely, then the second one
    for test_uuid in (self.testUUID, self.testUUID1):
        for get_collection in collection_getters:
            get_collection().delete_many({"user_id": test_uuid})
def clearRelatedDb(self):
    """
    Delete every timeseries, analysis timeseries and pipeline state document
    that belongs to the android or the iOS test user.
    """
    collection_getters = (
        edb.get_timeseries_db,
        edb.get_analysis_timeseries_db,
        edb.get_pipeline_state_db,
    )
    for test_uuid in (self.androidUUID, self.iosUUID):
        for get_collection in collection_getters:
            # delete_many replaces Collection.remove, which is deprecated in
            # PyMongo 3 and no longer exists in PyMongo 4
            get_collection().delete_many({"user_id": test_uuid})
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage

    The result is a time query on "write_ts" running from the stored
    last_processed_ts (None when nothing is stored) to a point 5 seconds
    in the past. The end point is also saved as curr_run_ts, which marks the
    stage as currently running.
    """
    state = get_current_state(user_id, stage)
    if state is not None:
        start_ts = state.last_processed_ts
        if start_ts is not None:
            logging.info("For stage %s, start_ts = %s" %
                         (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))
        else:
            logging.info("For stage %s, start_ts is None" % stage)
    else:
        # First run of this stage for this user: start from a blank state
        start_ts = None
        state = ps.PipelineState()
        state.user_id = user_id
        state.pipeline_stage = stage
        state.curr_run_ts = None
        state.last_processed_ts = None
        state.last_ts_run = None

    # A non-None curr_run_ts means that a previous run was never marked as
    # done or failed
    assert state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % state.curr_run_ts

    # Let's pick a point 5 secs in the past to avoid race conditions
    end_ts = time.time() - 5
    ret_query = enua.UserCache.TimeQuery("write_ts", start_ts, end_ts)

    state.curr_run_ts = end_ts
    # NOTE(review): Collection.save is deprecated in PyMongo 3 and removed in
    # PyMongo 4 - confirm which driver version this code runs against
    edb.get_pipeline_state_db().save(state)
    return ret_query
def clearRelatedDb(self):
    """
    Delete every timeseries, analysis timeseries and pipeline state document
    that belongs to the android or the iOS test user.
    """
    collection_getters = (
        edb.get_timeseries_db,
        edb.get_analysis_timeseries_db,
        edb.get_pipeline_state_db,
    )
    for test_uuid in (self.androidUUID, self.iosUUID):
        for get_collection in collection_getters:
            # delete_many replaces Collection.remove, which is deprecated in
            # PyMongo 3 and no longer exists in PyMongo 4
            get_collection().delete_many({"user_id": test_uuid})
def tearDown(self):
    """
    Remove the analysis config file used by the test and delete all
    timeseries, pipeline state and analysis timeseries documents of the
    android and iOS test users.
    """
    os.remove(self.analysis_conf_path)
    collection_getters = (
        edb.get_timeseries_db,
        edb.get_pipeline_state_db,
        edb.get_analysis_timeseries_db,
    )
    for get_collection in collection_getters:
        for test_uuid in (self.androidUUID, self.iosUUID):
            # delete_many replaces Collection.remove, which is deprecated in
            # PyMongo 3 and no longer exists in PyMongo 4
            get_collection().delete_many({"user_id": test_uuid})
def mark_stage_failed(user_id, stage):
    """
    Record that the current run of `stage` for `user_id` did not succeed.

    Only curr_run_ts is cleared; the rest of the stored state is saved back
    as it was read.
    """
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    # The run failed, so last_ts_run is deliberately left alone and the next
    # query starts again from the start_ts of this run. Clearing curr_run_ts
    # indicates that nothing is running at the moment.
    state.curr_run_ts = None
    # NOTE(review): Collection.save is deprecated in PyMongo 3 and removed in
    # PyMongo 4 - confirm which driver version this code runs against
    edb.get_pipeline_state_db().save(state)
def load_pipeline_states(file_prefix, all_uuid_list):
    """
    Load the saved pipeline states of every user into the pipeline state db.

    For each uuid the states are read from the gzipped JSON file
    "<file_prefix>_pipelinestate_<uuid>.gz" and inserted in one batch.

    :param file_prefix: prefix of the pipeline state file names
    :param all_uuid_list: the uuids whose pipeline states should be loaded
    """
    import emission.core.get_database as edb

    for curr_uuid in all_uuid_list:
        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_prefix, curr_uuid)
        print("Loading pipeline state for %s from %s" % (curr_uuid, pipeline_filename))
        # Open the per-user file computed above; the previous code referred
        # to an undefined name `filename` here
        with gzip.open(pipeline_filename) as gfd:
            states = json.load(gfd, object_hook=bju.object_hook)
        if states:
            edb.get_pipeline_state_db().insert_many(states)
        else:
            # insert_many rejects an empty document list, so skip the insert
            print("No pipeline states found for %s, skipping load" % curr_uuid)
def clearRelatedDb(self):
    """
    Delete every timeseries, analysis timeseries and pipeline state document
    that belongs to either of the two test users.
    """
    collection_getters = (
        edb.get_timeseries_db,
        edb.get_analysis_timeseries_db,
        edb.get_pipeline_state_db,
    )
    # First user completely, then the second one
    for test_uuid in (self.testUUID, self.testUUID1):
        for get_collection in collection_getters:
            get_collection().delete_many({"user_id": test_uuid})
def mark_stage_failed(user_id, stage):
    """
    Record that the current run of `stage` for `user_id` did not succeed.

    Only curr_run_ts is cleared; the rest of the stored state is saved back
    as it was read.
    """
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    # The run failed, so last_ts_run is deliberately left alone and the next
    # query starts again from the start_ts of this run. Clearing curr_run_ts
    # indicates that nothing is running at the moment.
    state.curr_run_ts = None
    # NOTE(review): Collection.save is deprecated in PyMongo 3 and removed in
    # PyMongo 4 - confirm which driver version this code runs against
    edb.get_pipeline_state_db().save(state)
def reset_pipeline_for_stage(stage, user_id, day_ts):
    """
    Delete the stored pipeline state of `stage`.

    Only the "reset everything" case is handled: when day_ts is None the
    state documents of the stage are deleted, for one user when user_id is
    given and for all users otherwise. When day_ts is set nothing happens.

    :param stage: the pipeline stage to reset; its `.value` is what is
        matched against the stored "pipeline_stage" field
    :param user_id: restrict the reset to this user, or None for all users
    :param day_ts: must be None for anything to be deleted
    """
    if day_ts is not None:
        # NOTE(review): resetting to a specific day is not implemented in the
        # code visible here - confirm that doing nothing is intended
        return

    if user_id is not None:
        print("day_ts is None, deleting stage %s for user %s" % (stage, user_id))
        del_query = {'user_id': user_id, 'pipeline_stage': stage.value}
    else:
        print("day_ts is None, deleting stage %s for all users" % (stage))
        del_query = {'pipeline_stage': stage.value}

    # delete_many replaces the deprecated Collection.remove; raw_result is the
    # server response document that remove used to return directly
    print(edb.get_pipeline_state_db().delete_many(del_query).raw_result)
def mark_stage_failed(user_id, stage):
    """
    Record that the current run of `stage` for `user_id` did not succeed.

    Only curr_run_ts is cleared; the rest of the stored state is saved back
    as it was read.
    """
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    # The run failed, so last_ts_run is deliberately left alone and the next
    # query starts again from the start_ts of this run. Clearing curr_run_ts
    # indicates that nothing is running at the moment.
    state.curr_run_ts = None
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
def mark_stage_failed(user_id, stage):
    """
    Record that the current run of `stage` for `user_id` did not succeed.

    Only curr_run_ts is cleared; the rest of the stored state is saved back
    as it was read.
    """
    state = get_current_state(user_id, stage)
    assert state is not None
    assert state.curr_run_ts is not None
    # The run failed, so last_ts_run is deliberately left alone and the next
    # query starts again from the start_ts of this run. Clearing curr_run_ts
    # indicates that nothing is running at the moment.
    state.curr_run_ts = None
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
def load_pipeline_states(file_prefix, all_uuid_list):
    """
    Load the saved pipeline states of every user into the pipeline state db.

    For each uuid the states are read from the gzipped JSON file
    "<file_prefix>_pipelinestate_<uuid>.gz". Files without any states are
    skipped, since there is nothing to insert.
    """
    import emission.core.get_database as edb

    for user_uuid in all_uuid_list:
        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_prefix, user_uuid)
        print("Loading pipeline state for %s from %s" % (user_uuid, pipeline_filename))
        with gzip.open(pipeline_filename) as gfd:
            states = json.load(gfd, object_hook=bju.object_hook)
            # `args` is the module level parsed command line
            if args.verbose:
                logging.debug("Loading states of length %s" % len(states))
            if len(states) == 0:
                logging.info("No pipeline states found, skipping load")
            else:
                edb.get_pipeline_state_db().insert_many(states)
def post_check(unique_user_list, all_rerun_list):
    """
    Log a summary of what was loaded and whether the intake pipeline has to
    be run.

    :param unique_user_list: the users whose entries were loaded
    :param all_rerun_list: one truthy/falsy flag per loaded timeline saying
        whether it needs a pipeline rerun
    """
    import emission.core.get_database as edb
    import numpy as np

    def _count_for_users(collection):
        # Number of documents in `collection` owned by any of the users
        return collection.count_documents(
            {"user_id": {"$in": list(unique_user_list)}})

    logging.info(
        "For %s users, loaded %s raw entries, %s processed entries and %s pipeline states"
        % (len(unique_user_list),
           _count_for_users(edb.get_timeseries_db()),
           _count_for_users(edb.get_analysis_timeseries_db()),
           _count_for_users(edb.get_pipeline_state_db())))

    rerun_flags = np.array(all_rerun_list)

    # No entry needs a rerun <=> every flag is False <=> every negated flag
    # is True. In that case we are done.
    if np.all(np.logical_not(rerun_flags)):
        logging.info(
            "all entries in the timeline contain analysis results, no need to run the intake pipeline")
    # If every entry needs a rerun, we must have had raw data throughout
    elif np.all(rerun_flags):
        logging.info(
            "all entries in the timeline contain only raw data, need to run the intake pipeline")
    else:
        logging.info(
            "timeline contains a mixture of analysis results and raw data - complain to shankari!")
def del_objects_after(user_id, reset_ts, is_dry_run):
    """
    Delete the mode inference results of `user_id` that start after
    `reset_ts` and move the MODE_INFERENCE pipeline state back accordingly.

    :param user_id: the user whose results are deleted
    :param reset_ts: entries whose data.start_ts is greater than this are
        deleted
    :param is_dry_run: when true, only log what would be done
    """
    # All objects inserted by this stage have start_ts and end_ts and are
    # trip-like, so data.start_ts can be used to select them
    del_query = {
        "user_id": user_id,
        "metadata.key": {"$in": ["inference/prediction",
                                 "analysis/inferred_section"]},
        "data.start_ts": {"$gt": reset_ts},
    }
    logging.debug("After all updates, del_query = %s" % del_query)

    reset_pipeline_query = {"user_id": user_id,
                            "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
    # The stored timestamp is fuzzed by 5 mins; see
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    # NOTE(review): that issue comment talks about TRIP_SEGMENTATION - confirm
    # that the fuzz is also wanted for MODE_INFERENCE
    FUZZ_FACTOR = 5 * 60
    new_last_processed_ts = reset_ts + FUZZ_FACTOR
    reset_pipeline_update = {'$set': {'last_processed_ts': new_last_processed_ts}}
    # Log the value that is actually written, not the unfuzzed reset_ts
    logging.info("About to reset stage %s to %s" %
                 (ecwp.PipelineStages.MODE_INFERENCE, new_last_processed_ts))

    # count_documents replaces Cursor.count, which is deprecated in PyMongo 3.7
    # and removed in PyMongo 4
    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().count_documents(del_query))
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

    if is_dry_run:
        logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
        return

    # delete_many replaces the deprecated Collection.remove
    result = edb.get_analysis_timeseries_db().delete_many(del_query)
    logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result.raw_result)
    result = edb.get_pipeline_state_db().update_one(reset_pipeline_query, reset_pipeline_update)
    logging.info("this is not a dry-run, result of updating pipeline state is %s" % result.raw_result)
def purge_entries_for_user(curr_uuid, is_purge_state, db_array=None):
    """
    Delete everything stored for one user.

    Entries are removed from the timeseries, the analysis timeseries and the
    uuid db, and additionally from the pipeline state db when requested.

    :param curr_uuid: the user to purge
    :param is_purge_state: also delete the pipeline states of the user
    :param db_array: optional [timeseries, analysis timeseries, uuid,
        pipeline state] collections to operate on; looked up from the
        database module when not given
    """
    logging.info("For uuid = %s, deleting entries from the timeseries" % curr_uuid)
    if db_array is not None:
        [ts_db, ats_db, udb, psdb] = db_array
        logging.debug("db_array passed in with databases %s" % db_array)
    else:
        import emission.core.get_database as edb

        ts_db = edb.get_timeseries_db()
        ats_db = edb.get_analysis_timeseries_db()
        udb = edb.get_uuid_db()
        psdb = edb.get_pipeline_state_db()
        logging.debug("db_array not passed in, looking up databases")

    # delete_many replaces Collection.remove, which is deprecated in PyMongo 3
    # and removed in PyMongo 4; raw_result is the server response document
    # that remove used to return directly
    timeseries_del_result = ts_db.delete_many({"user_id": curr_uuid})
    logging.info("result = %s" % timeseries_del_result.raw_result)

    logging.info("For uuid = %s, deleting entries from the analysis_timeseries" % curr_uuid)
    analysis_timeseries_del_result = ats_db.delete_many({"user_id": curr_uuid})
    logging.info("result = %s" % analysis_timeseries_del_result.raw_result)

    # The uuid db identifies the user by "uuid" rather than "user_id"
    logging.info("For uuid %s, deleting entries from the user_db" % curr_uuid)
    user_db_del_result = udb.delete_many({"uuid": curr_uuid})
    logging.info("result = %s" % user_db_del_result.raw_result)

    if is_purge_state:
        logging.info("For uuid %s, deleting entries from the pipeline_state_db" % curr_uuid)
        psdb_del_result = psdb.delete_many({"user_id": curr_uuid})
        logging.info("result = %s" % psdb_del_result.raw_result)
def get_current_state(user_id, stage):
    """
    Look up the stored pipeline state of `stage` for `user_id`.

    :return: the matching document wrapped in a ps.PipelineState, or None
        when no document is stored for that user and stage
    """
    state_query = {"user_id": user_id, "pipeline_stage": stage.value}
    state_doc = edb.get_pipeline_state_db().find_one(state_query)
    if state_doc is None:
        return None
    return ps.PipelineState(state_doc)
def purge_entries_for_user(curr_uuid, is_purge_state, db_array=None):
    """
    Delete everything stored for one user.

    Entries are removed from the timeseries, the analysis timeseries and the
    uuid db, and additionally from the pipeline state db when requested.

    :param curr_uuid: the user to purge
    :param is_purge_state: also delete the pipeline states of the user
    :param db_array: optional [timeseries, analysis timeseries, uuid,
        pipeline state] collections to operate on; looked up from the
        database module when not given
    """
    logging.info("For uuid = %s, deleting entries from the timeseries" % curr_uuid)
    if db_array is not None:
        [ts_db, ats_db, udb, psdb] = db_array
        logging.debug("db_array passed in with databases %s" % db_array)
    else:
        import emission.core.get_database as edb

        ts_db = edb.get_timeseries_db()
        ats_db = edb.get_analysis_timeseries_db()
        udb = edb.get_uuid_db()
        psdb = edb.get_pipeline_state_db()
        logging.debug("db_array not passed in, looking up databases")

    # delete_many replaces Collection.remove, which is deprecated in PyMongo 3
    # and removed in PyMongo 4; raw_result is the server response document
    # that remove used to return directly
    timeseries_del_result = ts_db.delete_many({"user_id": curr_uuid})
    logging.info("result = %s" % timeseries_del_result.raw_result)

    logging.info("For uuid = %s, deleting entries from the analysis_timeseries" % curr_uuid)
    analysis_timeseries_del_result = ats_db.delete_many({"user_id": curr_uuid})
    logging.info("result = %s" % analysis_timeseries_del_result.raw_result)

    # The uuid db identifies the user by "uuid" rather than "user_id"
    logging.info("For uuid %s, deleting entries from the user_db" % curr_uuid)
    user_db_del_result = udb.delete_many({"uuid": curr_uuid})
    logging.info("result = %s" % user_db_del_result.raw_result)

    if is_purge_state:
        logging.info("For uuid %s, deleting entries from the pipeline_state_db" % curr_uuid)
        psdb_del_result = psdb.delete_many({"user_id": curr_uuid})
        logging.info("result = %s" % psdb_del_result.raw_result)
def del_all_objects(is_dry_run):
    """
    Delete the mode inference results and the MODE_INFERENCE pipeline states
    of all users.

    :param is_dry_run: when true, only log what would be deleted
    """
    del_query = {
        "metadata.key": {
            "$in": ["inference/prediction", "analysis/inferred_section"]
        }
    }
    # count_documents replaces Cursor.count, which is deprecated in PyMongo 3.7
    # and removed in PyMongo 4
    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().count_documents(del_query))
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct(
                     "metadata.key"))

    del_pipeline_query = {
        "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value
    }
    logging.info("About to delete pipeline entries for stage %s" %
                 ecwp.PipelineStages.MODE_INFERENCE)

    if is_dry_run:
        # The message used to name del_objects_after, which is a different
        # function
        logging.info(
            "this is a dry-run, returning from del_all_objects without modifying anything"
        )
        return

    result = edb.get_analysis_timeseries_db().delete_many(del_query)
    logging.info(
        "this is not a dry-run, result of deleting analysis entries is %s" %
        result.raw_result)
    result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
    logging.info(
        "this is not a dry-run, result of deleting pipeline state is %s" %
        result.raw_result)
def reset_pipeline_state(user_id, reset_ts, is_dry_run):
    """
    Move the pipeline states of `user_id` back to `reset_ts`.

    Only states whose last_processed_ts lies after the reset point are
    touched. The TRIP_SEGMENTATION state is reset to reset_ts plus a 5 minute
    fuzz, every other stage to reset_ts itself.

    :param user_id: the user whose pipeline states are reset
    :param reset_ts: the timestamp to reset to
    :param is_dry_run: when true, only log what would be changed
    """
    # Fuzz the TRIP_SEGMENTATION stage 5 mins because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    FUZZ_FACTOR = 5 * 60
    trip_seg_reset_ts = reset_ts + FUZZ_FACTOR

    # The query dicts used to list 'last_processed_ts' twice ($ne: None, then
    # $gt); Python keeps only the last value of a repeated key, so the $gt
    # clause below is the filter that was effectively in use all along.
    trip_seg_reset_pipeline_query = {
        'user_id': user_id,
        # only reset entries that are after the reset_ts
        # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312958309
        'last_processed_ts': {'$gt': trip_seg_reset_ts},
        'pipeline_stage': ecwp.PipelineStages.TRIP_SEGMENTATION.value}
    trip_seg_update_pipeline_query = {'$set': {'last_processed_ts': trip_seg_reset_ts}}
    logging.debug("trip_seg_reset_pipeline_query = %s" % trip_seg_reset_pipeline_query)
    logging.debug("trip_seg_update_pipeline_query = %s" % trip_seg_update_pipeline_query)
    # count_documents replaces Cursor.count, which is deprecated in PyMongo 3.7
    # and removed in PyMongo 4
    logging.info("resetting %s trip_seg_pipeline states for user %s to %s" %
                 (edb.get_pipeline_state_db().count_documents(trip_seg_reset_pipeline_query),
                  user_id, trip_seg_reset_ts))

    # Don't fuzz the others because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312917119
    reset_pipeline_query = {
        'user_id': user_id,
        # only reset entries that are after the reset_ts
        # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312958309
        'last_processed_ts': {'$gt': reset_ts},
        'pipeline_stage': {'$ne': ecwp.PipelineStages.TRIP_SEGMENTATION.value}}
    update_pipeline_query = {'$set': {'last_processed_ts': reset_ts}}
    logging.debug("reset_pipeline_query = %s" % reset_pipeline_query)
    logging.debug("update_pipeline_query = %s" % update_pipeline_query)
    logging.info("out of %s total, resetting %s pipeline states for user %s to %s" %
                 (edb.get_pipeline_state_db().count_documents({'user_id': user_id}),
                  edb.get_pipeline_state_db().count_documents(reset_pipeline_query),
                  user_id, reset_ts))

    if is_dry_run:
        logging.info("this is a dry run, returning from reset_pipeline_state without modifying anything")
        return

    # update_one / update_many replace the deprecated Collection.update: the
    # first call never passed multi=True, so it only ever changed one document
    result = edb.get_pipeline_state_db().update_one(
        trip_seg_reset_pipeline_query,
        trip_seg_update_pipeline_query,
        upsert=False)
    logging.debug("this is not a dry run, result of updating trip_segmentation stage in reset_pipeline_state = %s" % result.raw_result)

    result = edb.get_pipeline_state_db().update_many(
        reset_pipeline_query,
        update_pipeline_query,
        upsert=False)
    logging.debug("this is not a dry run, result of updating all other stages in reset_pipeline_state = %s" % result.raw_result)
def get_current_state(user_id, stage):
    """
    Look up the stored pipeline state of `stage` for `user_id`.

    :return: the matching document wrapped in a ps.PipelineState, or None
        when no document is stored for that user and stage
    """
    state_query = {"user_id": user_id, "pipeline_stage": stage.value}
    state_doc = edb.get_pipeline_state_db().find_one(state_query)
    if state_doc is None:
        return None
    return ps.PipelineState(state_doc)
def testMoveDuplicateKey(self):
    """
    Moving entries to long term a second time must ignore entries that are
    already in the timeseries and still move the new ones.
    """
    def cache_count(usercache):
        # Number of messages currently waiting in the given usercache
        return len(usercache.getMessage())

    def timeseries_count():
        # Number of entries in the timeseries of user1
        return len(list(self.ts1.find_entries()))

    def move_user1_to_long_term():
        enuah.UserCacheHandler.getUserCacheHandler(
            self.testUserUUID1).moveToLongTerm()

    # 5 mins of data, every 30 secs = 10 entries per entry type. There are
    # 3 entry types, so 30 entries, all of them in the usercache at first
    self.assertEqual(cache_count(self.uc1), 30)
    self.assertEqual(timeseries_count(), 0)

    # Keep a copy of the entries so that they can be re-inserted later
    entries_before_move = self.uc1.getMessage()

    # After the first move everything of user1 is in long term
    move_user1_to_long_term()
    self.assertEqual(cache_count(self.uc1), 0)
    self.assertEqual(timeseries_count(), 30)

    # Put the same entries (with the same object IDs) into the cache again
    edb.get_usercache_db().insert_many(entries_before_move)
    self.assertEqual(cache_count(self.uc1), 30)
    self.assertEqual(cache_count(self.uc2), 30)

    # Hand the user2 cache over to user1 for a fresh supply of entries
    update_result = edb.get_usercache_db().update_many(
        {"user_id": self.testUserUUID2},
        {"$set": {"user_id": self.testUserUUID1}})
    logging.debug("update_result = %s" % update_result)

    # 60 entries in the usercache now (30 duplicates + 30 from user2)
    self.assertEqual(cache_count(self.uc1), 60)
    self.assertEqual(timeseries_count(), 30)

    edb.get_pipeline_state_db().delete_many(
        {"user_id": self.testUserUUID1})

    # Second move: duplicates are ignored, the new entries end up in the
    # timeseries
    move_user1_to_long_term()
    self.assertEqual(cache_count(self.uc1), 0)
    self.assertEqual(timeseries_count(), 60)
def export_timeline(user_id, start_day_str, end_day_str, file_name):
    """
    Dump the timeline of a user between two days, plus the user's pipeline
    states, into two gzipped JSON files.

    The entries go to "<file_name>_<user_id>.gz" and the pipeline states to
    "<file_name>_pipelinestate_<user_id>.gz". Nothing is written when no
    entries (or only 'stats/pipeline_time' entries) are found.

    :param user_id: the user to export
    :param start_day_str: start of the range, parsed with arrow.get
    :param end_day_str: end of the range, parsed with arrow.get
    :param file_name: prefix of the output files
    """
    logging.info("Extracting timeline for user %s day %s -> %s and saving to file %s" %
                 (user_id, start_day_str, end_day_str, file_name))

    start_day_ts = arrow.get(start_day_str).timestamp
    end_day_ts = arrow.get(end_day_str).timestamp
    logging.debug("start_day_ts = %s (%s), end_day_ts = %s (%s)" %
                  (start_day_ts, arrow.get(start_day_ts),
                   end_day_ts, arrow.get(end_day_ts)))

    ts = esta.TimeSeries.get_time_series(user_id)

    # Each kind of entry is selected through its own timestamp field
    loc_time_query = estt.TimeQuery("data.ts", start_day_ts, end_day_ts)
    loc_entries = list(estcs.find_entries(user_id, key_list=None,
                                          time_query=loc_time_query))

    ma_time_query = estt.TimeQuery("metadata.write_ts", start_day_ts, end_day_ts)
    ma_entries = list(estcs.find_entries(user_id,
                                         key_list=["background/motion_activity"],
                                         time_query=ma_time_query))

    trip_time_query = estt.TimeQuery("data.start_ts", start_day_ts, end_day_ts)
    trip_entries = list(ts.find_entries(key_list=None, time_query=trip_time_query))

    place_time_query = estt.TimeQuery("data.enter_ts", start_day_ts, end_day_ts)
    place_entries = list(ts.find_entries(key_list=None, time_query=place_time_query))

    # Handle the case of the first place, which has no enter_ts and won't be
    # matched by the default query
    first_place_extra_query = {'$and': [{'data.enter_ts': {'$exists': False}},
                                        {'data.exit_ts': {'$exists': True}}]}
    first_place_entries = list(ts.find_entries(
        key_list=None, time_query=None,
        extra_query_list=[first_place_extra_query]))
    logging.info("First place entry list = %s" % first_place_entries)

    combined_list = (loc_entries + ma_entries + trip_entries +
                     place_entries + first_place_entries)
    logging.info("Found %d loc entries, %d motion entries, %d trip-like entries, %d place-like entries = %d total entries" %
                 (len(loc_entries), len(ma_entries), len(trip_entries),
                  len(place_entries), len(combined_list)))

    validate_truncation(loc_entries, trip_entries, place_entries)

    unique_key_list = {entry["metadata"]["key"] for entry in combined_list}
    logging.info("timeline has unique keys = %s" % unique_key_list)
    if len(combined_list) == 0 or unique_key_list == {'stats/pipeline_time'}:
        logging.info("No entries found in range for user %s, skipping save" % user_id)
        return

    def _dump_gz(target_filename, payload):
        # Write `payload` as indented JSON into a gzipped text file
        with gzip.open(target_filename, "wt") as gfd:
            json.dump(payload, gfd, default=bju.default,
                      allow_nan=False, indent=4)

    _dump_gz("%s_%s.gz" % (file_name, user_id), combined_list)

    # Also dump the pipeline state, since that's where we have analysis
    # results upto. This allows us to copy data to a different *live system*,
    # not just duplicate for analysis
    import emission.core.get_database as edb

    pipeline_state_list = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.info("Found %d pipeline states %s" %
                 (len(pipeline_state_list),
                  [state["pipeline_stage"] for state in pipeline_state_list]))
    _dump_gz("%s_pipelinestate_%s.gz" % (file_name, user_id), pipeline_state_list)
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage

    The result is a time query on "metadata.write_ts" running from the stored
    last_processed_ts (None when nothing is stored) to slightly before now.
    The end point is also saved as curr_run_ts, which marks the stage as
    currently running.
    """
    state = get_current_state(user_id, stage)
    if state is not None:
        start_ts = state.last_processed_ts
        if start_ts is not None:
            logging.info(
                "For stage %s, start_ts = %s" %
                (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))
        else:
            logging.info("For stage %s, start_ts is None" % stage)
    else:
        # First run of this stage for this user: start from a blank state
        start_ts = None
        state = ps.PipelineState()
        state.user_id = user_id
        state.pipeline_stage = stage
        state.curr_run_ts = None
        state.last_processed_ts = None
        state.last_ts_run = None

    # A non-None curr_run_ts means that a previous run was never marked as
    # done or failed
    assert state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % state.curr_run_ts

    # Stop a little before "now". Reading right up to the current ts can lose
    # data: say we read 4 entries written at t1, and by the time the copy is
    # finished there are 6 entries for t1. Deleting everything up to t1 would
    # then drop 2 entries that were never copied.
    end_ts = time.time() - END_FUZZ_AVOID_LTE
    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
    return ret_query
def _del_entries_for_query(del_query, is_dry_run):
    """
    This is much easier. The steps are:
    - delete all analysis objects matching the query
    - delete all pipeline states matching the query

    :param del_query: the filter applied to both the analysis timeseries and
        the pipeline state db
    :param is_dry_run: when true, only log what would be deleted
    """
    # count_documents replaces Cursor.count, which is deprecated in PyMongo 3.7
    # and removed in PyMongo 4
    logging.info("About to delete %s analysis results" %
                 edb.get_analysis_timeseries_db().count_documents(del_query))
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))
    logging.info("About to delete %s pipeline states" %
                 (edb.get_pipeline_state_db().count_documents(del_query)))

    if is_dry_run:
        logging.info("this is a dry run, returning from reset_user_to-start without modifying anything")
        return

    # delete_many replaces the deprecated Collection.remove; raw_result is the
    # server response document that remove used to return directly
    result = edb.get_analysis_timeseries_db().delete_many(del_query)
    logging.info("this is not a dry run, result of removing analysis objects = %s" % result.raw_result)
    result = edb.get_pipeline_state_db().delete_many(del_query)
    logging.info("this is not a dry run, result of removing pipeline states = %s" % result.raw_result)
def testMoveDuplicateKey(self):
    """
    Moving entries to long term a second time must ignore entries that are
    already in the timeseries and still move the new ones.
    """
    # 5 mins of data, every 30 secs = 10 entries per entry type. There are
    # 3 entry types, so 30 entries

    # First all the entries are in the usercache
    self.assertEqual(len(self.uc1.getMessage()), 30)
    self.assertEqual(len(list(self.ts1.find_entries())), 0)

    # Store the entries before the move so that we can duplicate them later
    entries_before_move = self.uc1.getMessage()

    # Then we move entries for user1 into longterm
    enuah.UserCacheHandler.getUserCacheHandler(self.testUserUUID1).moveToLongTerm()

    # So we end up with all user1 entries in longterm
    self.assertEqual(len(self.uc1.getMessage()), 0)
    self.assertEqual(len(list(self.ts1.find_entries())), 30)

    # Put the same entries (with the same object IDs) into the cache again.
    # insert_many / update_many / delete_many replace the legacy insert /
    # update(multi=True) / remove calls, which are deprecated in PyMongo 3
    # and removed in PyMongo 4
    edb.get_usercache_db().insert_many(entries_before_move)
    self.assertEqual(len(self.uc1.getMessage()), 30)

    self.assertEqual(len(self.uc2.getMessage()), 30)
    # Also reset the user2 cache to be user1 so that we have a fresh supply of entries
    update_result = edb.get_usercache_db().update_many(
        {"user_id": self.testUserUUID2},
        {"$set": {"user_id": self.testUserUUID1}})
    logging.debug("update_result = %s" % update_result)

    # Now, we should have 60 entries in the usercache (30 duplicates + 30 from user2)
    self.assertEqual(len(self.uc1.getMessage()), 60)
    self.assertEqual(len(list(self.ts1.find_entries())), 30)

    edb.get_pipeline_state_db().delete_many({"user_id": self.testUserUUID1})

    # Then we move entries for user1 into longterm again
    enuah.UserCacheHandler.getUserCacheHandler(self.testUserUUID1).moveToLongTerm()

    # All the duplicates should have been ignored, and the new entries moved into the timeseries
    self.assertEqual(len(self.uc1.getMessage()), 0)
    self.assertEqual(len(list(self.ts1.find_entries())), 60)
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage

    The result is a time query on "metadata.write_ts" running from the stored
    last_processed_ts (None when nothing is stored) to slightly before now.
    The end point is also saved as curr_run_ts, which marks the stage as
    currently running.
    """
    state = get_current_state(user_id, stage)
    if state is not None:
        start_ts = state.last_processed_ts
        if start_ts is not None:
            logging.info(
                "For stage %s, start_ts = %s" %
                (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))
        else:
            logging.info("For stage %s, start_ts is None" % stage)
    else:
        # First run of this stage for this user: start from a blank state
        start_ts = None
        state = ps.PipelineState()
        state.user_id = user_id
        state.pipeline_stage = stage
        state.curr_run_ts = None
        state.last_processed_ts = None
        state.last_ts_run = None

    # A non-None curr_run_ts means that a previous run was never marked as
    # done or failed
    assert state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % state.curr_run_ts

    # Stop a little before "now". Reading right up to the current ts can lose
    # data: say we read 4 entries written at t1, and by the time the copy is
    # finished there are 6 entries for t1. Deleting everything up to t1 would
    # then drop 2 entries that were never copied.
    end_ts = time.time() - END_FUZZ_AVOID_LTE
    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % state)
    edb.save(edb.get_pipeline_state_db(), state)
    saved_states = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
    logging.debug("After saving state %s, list is %s" % (state, saved_states))
    return ret_query
def del_all_objects(is_dry_run):
    """
    Delete the mode inference results and the MODE_INFERENCE pipeline states
    of all users.

    :param is_dry_run: when true, only log what would be deleted
    """
    del_query = {
        "metadata.key": {
            "$in": ["inference/prediction", "analysis/inferred_section"]
        }
    }
    # count_documents replaces Cursor.count, which is deprecated in PyMongo 3.7
    # and removed in PyMongo 4
    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().count_documents(del_query))
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct(
                     "metadata.key"))

    del_pipeline_query = {
        "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value
    }
    logging.info("About to delete pipeline entries for stage %s" %
                 ecwp.PipelineStages.MODE_INFERENCE)

    if is_dry_run:
        # The message used to name del_objects_after, which is a different
        # function
        logging.info(
            "this is a dry-run, returning from del_all_objects without modifying anything"
        )
        return

    result = edb.get_analysis_timeseries_db().delete_many(del_query)
    logging.info(
        "this is not a dry-run, result of deleting analysis entries is %s" %
        result.raw_result)
    result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
    logging.info(
        "this is not a dry-run, result of deleting pipeline state is %s" %
        result.raw_result)
def tearDown(self):
    """Delete the timeseries and pipeline-state documents of the test user."""
    import emission.core.get_database as edb

    # delete_many is the supported replacement for the legacy
    # Collection.remove(query); both delete every matching document.
    edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
    edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
reset_collection(edb.get_analysis_timeseries_db(), user.uuid, new_uuid) logging.debug("Resetting client...") reset_collection(edb.get_client_db(), user.uuid, new_uuid) logging.debug("Resetting client_stats_backup...") reset_collection(edb.get_client_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting server_stats_backup...") reset_collection(edb.get_server_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting result_stats_backup...") reset_collection(edb.get_result_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting edb.get_common_place_db...") reset_collection(edb.get_common_place_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_common_trip_db...") reset_collection(edb.get_common_trip_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_habitica_db...") reset_collection(edb.get_habitica_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_pipeline_state_db...") reset_collection(edb.get_pipeline_state_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_profile_db...") reset_collection(edb.get_profile_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_timeseries_db...") reset_collection(edb.get_timeseries_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_timeseries_error_db...") reset_collection(edb.get_timeseries_error_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_usercache_db...") reset_collection(edb.get_usercache_db(), user.uuid, new_uuid)
parser.add_argument("-i", "--info-only", default=False, action='store_true', help="only print entry analysis") parser.add_argument("-p", "--pipeline-purge", default=False, action='store_true', help="purge the pipeline state as well") args = parser.parse_args() fn = args.timeline_filename logging.info("Loading file or prefix %s" % fn) sel_file_list = common.read_files_with_prefix(fn) ts_db = edb.get_timeseries_db() ats_db = edb.get_analysis_timeseries_db() udb = edb.get_uuid_db() psdb = edb.get_pipeline_state_db() db_array = [ts_db, ats_db, udb, psdb] for i, filename in enumerate(sel_file_list): if "pipelinestate" in filename: continue logging.info("=" * 50) logging.info("Deleting data from file %s" % filename) entries = json.load(gzip.open(filename), object_hook = bju.object_hook) # Obtain uuid and rerun information from entries curr_uuid_list, needs_rerun = common.analyse_timeline(entries) if len(curr_uuid_list) > 1: logging.warning("Found %d users, %s in filename, aborting! " %
# Removes all materialized views and the pipeline state.
# This will cause us to reprocess the pipeline from scratch.
# As history begins to accumulate, we may want to specify a point to reset the
# pipeline to instead of deleting everything.

import logging
logging.basicConfig(level=logging.DEBUG)

import emission.core.get_database as edb

if __name__ == '__main__':
    # print(...) with a single argument behaves the same on Python 2 and 3.
    # delete_many({}) matches every document, like the legacy no-argument
    # remove(); raw_result is the server response that gets printed.
    print("Deleting all trips")
    print(edb.get_trip_new_db().delete_many({}).raw_result)
    print("Deleting all sections")
    print(edb.get_section_new_db().delete_many({}).raw_result)
    print("Deleting pipeline state")
    print(edb.get_pipeline_state_db().delete_many({}).raw_result)
def tearDown(self):
    """Delete every document in the pipeline-state collection."""
    # delete_many({}) matches all documents, which is what the legacy
    # no-argument remove() did.
    edb.get_pipeline_state_db().delete_many({})
edb.get_uuid_db().update({"uuid" : user.uuid}, {"$set": {"uuid" : new_uuid}}) logging.debug("Resetting alternatives...") reset_collection(edb.get_alternatives_db(), user.uuid, new_uuid) logging.debug("Resetting analysis...") reset_collection(edb.get_analysis_timeseries_db(), user.uuid, new_uuid) logging.debug("Resetting client...") reset_collection(edb.get_client_db(), user.uuid, new_uuid) logging.debug("Resetting client_stats_backup...") reset_collection(edb.get_client_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting server_stats_backup...") reset_collection(edb.get_server_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting result_stats_backup...") reset_collection(edb.get_result_stats_db_backup(), user.uuid, new_uuid) logging.debug("Resetting edb.get_common_place_db...") reset_collection(edb.get_common_place_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_common_trip_db...") reset_collection(edb.get_common_trip_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_habitica_db...") reset_collection(edb.get_habitica_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_pipeline_state_db...") reset_collection(edb.get_pipeline_state_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_profile_db...") reset_collection(edb.get_profile_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_timeseries_db...") reset_collection(edb.get_timeseries_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_timeseries_error_db...") reset_collection(edb.get_timeseries_error_db(), user.uuid, new_uuid) logging.debug("Resetting edb.get_usercache_db...") reset_collection(edb.get_usercache_db(), user.uuid, new_uuid)
def testStartProcessingTwiceTwoStates(self):
    """
    Take three pipeline stages through two full start/stop cycles and check
    the state stored for each stage while the second run is in progress and
    after it has been marked done.
    """
    TEST_DONE_TS_BASE = 999999
    user = self.testUUID

    # Each stage is paired with the exact log format used when it is started
    stage_formats = [
        (ewps.PipelineStages.USERCACHE,
         "starting stage usercache %s"),
        (ewps.PipelineStages.TRIP_SEGMENTATION,
         "starting stage trip_segmentation %s "),
        (ewps.PipelineStages.SECTION_SEGMENTATION,
         "starting stage section_segmentation %s "),
    ]
    stages = [stage for stage, _ in stage_formats]

    def log_states(fmt):
        # Dump everything currently stored for the test user
        logging.debug(fmt % list(edb.get_pipeline_state_db().find({"user_id": user})))

    def start_all():
        for stage, fmt in stage_formats:
            logging.debug(fmt % epq.get_time_range_for_stage(user, stage))

    def stop_all(first_offset):
        # Successive stages get successive processed timestamps
        for offset, stage in enumerate(stages, first_offset):
            epq.mark_stage_done(user, stage, TEST_DONE_TS_BASE + offset)

    # Nothing is stored for any stage before the first run
    for stage in stages:
        self.assertIsNone(epq.get_current_state(user, stage))

    logging.debug("About to start processing for the first time")
    start_all()
    log_states("After first time processing, states = %s")

    logging.debug("About to stop processing for the first time")
    stop_all(0)
    log_states("After first time stopping, states = %s")

    logging.debug("About to start processing for the second time")
    start_all()
    log_states("After second time starting, states = %s")

    # First set of checks: a run is in progress and a previous run is recorded
    second_run_ts = []
    for stage in stages:
        state = epq.get_current_state(user, stage)
        self.assertIsNotNone(state)
        self.assertIsNotNone(state.curr_run_ts)
        self.assertIsNotNone(state.last_ts_run)
        second_run_ts.append(state.curr_run_ts)

    logging.debug("About to stop processing for the second time")
    stop_all(10)
    log_states("After second time stopping, states = %s")

    # Second set of checks: the run is closed, last_ts_run is the timestamp of
    # the second run and last_processed_ts is the value passed when stopping
    for offset, (stage, run_ts) in enumerate(zip(stages, second_run_ts), 10):
        state = epq.get_current_state(user, stage)
        self.assertIsNotNone(state)
        self.assertIsNone(state.curr_run_ts)
        self.assertEqual(state.last_ts_run, run_ts)
        self.assertEqual(state.last_processed_ts, TEST_DONE_TS_BASE + offset)
def tearDown(self):
    """Remove the test user's timeseries entries, then its pipeline states."""
    import emission.core.get_database as edb

    for open_collection in (edb.get_timeseries_db, edb.get_pipeline_state_db):
        open_collection().delete_many({"user_id": self.testUUID})
def tearDown(self):
    """
    Remove the test user's timeseries entries and pipeline states, then
    delete the file at self.analysis_conf_path.
    """
    import emission.core.get_database as edb

    for open_collection in (edb.get_timeseries_db, edb.get_pipeline_state_db):
        open_collection().delete_many({"user_id": self.testUUID})

    conf_path = self.analysis_conf_path
    os.remove(conf_path)
def tearDown(self):
    """Delete the pipeline-state documents stored for this test's user."""
    state_coll = edb.get_pipeline_state_db()
    own_states = {'user_id': self.testUUID}
    state_coll.delete_many(own_states)
def setUp(self):
    """Pick a random user id and clear any pipeline state stored under it."""
    fresh_id = uuid.uuid4()
    self.testUUID = fresh_id
    state_coll = edb.get_pipeline_state_db()
    state_coll.delete_many({'user_id': fresh_id})
def setUp(self):
    """Pick a random user id and empty the whole pipeline-state collection."""
    self.testUUID = uuid.uuid4()
    # delete_many({}) matches all documents, which is what the legacy
    # no-argument remove() did.
    edb.get_pipeline_state_db().delete_many({})
def purgeAnalysisData():
    """
    Delete every document from the analysis timeseries, common place,
    common trip and pipeline-state collections, printing the server response
    of each delete.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    # delete_many({}) matches all documents, like the legacy no-argument
    # remove(); raw_result is the server response that gets printed.
    print(edb.get_analysis_timeseries_db().delete_many({}).raw_result)
    print(edb.get_common_place_db().delete_many({}).raw_result)
    print(edb.get_common_trip_db().delete_many({}).raw_result)
    print(edb.get_pipeline_state_db().delete_many({}).raw_result)
def testStartProcessingTwiceTwoStates(self):
    """
    Take the USERCACHE, TRIP_SEGMENTATION and SECTION_SEGMENTATION stages
    through two full start/stop cycles and check the state stored for each
    stage while the second run is in progress and after it is marked done.
    """
    TEST_DONE_TS_BASE = 999999

    # Nothing is stored for any stage before the first run
    self.assertIsNone(epq.get_current_state(self.testUUID, ewps.PipelineStages.USERCACHE))
    self.assertIsNone(epq.get_current_state(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION))
    self.assertIsNone(epq.get_current_state(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION))

    logging.debug("About to start processing for the first time")
    logging.debug("starting stage usercache %s" %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.USERCACHE))
    logging.debug("starting stage trip_segmentation %s " %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION))
    logging.debug("starting stage section_segmentation %s " %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION))
    logging.debug("After first time processing, states = %s" %
                  list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

    logging.debug("About to stop processing for the first time")
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.USERCACHE, TEST_DONE_TS_BASE)
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION, TEST_DONE_TS_BASE + 1)
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION, TEST_DONE_TS_BASE + 2)
    logging.debug("After first time stopping, states = %s" %
                  list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

    logging.debug("About to start processing for the second time")
    logging.debug("starting stage usercache %s" %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.USERCACHE))
    logging.debug("starting stage trip_segmentation %s " %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION))
    logging.debug("starting stage section_segmentation %s " %
                  epq.get_time_range_for_stage(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION))
    logging.debug("After second time starting, states = %s" %
                  list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

    # First set of checks: a run is in progress and a previous run is recorded
    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.USERCACHE)
    self.assertIsNotNone(new_state)
    self.assertIsNotNone(new_state.curr_run_ts)
    self.assertIsNotNone(new_state.last_ts_run)
    uc_ts = new_state.curr_run_ts

    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION)
    self.assertIsNotNone(new_state)
    self.assertIsNotNone(new_state.curr_run_ts)
    self.assertIsNotNone(new_state.last_ts_run)
    ts_ts = new_state.curr_run_ts

    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION)
    self.assertIsNotNone(new_state)
    self.assertIsNotNone(new_state.curr_run_ts)
    self.assertIsNotNone(new_state.last_ts_run)
    ss_ts = new_state.curr_run_ts

    logging.debug("About to stop processing for the second time")
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.USERCACHE, TEST_DONE_TS_BASE + 10)
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION, TEST_DONE_TS_BASE + 11)
    epq.mark_stage_done(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION, TEST_DONE_TS_BASE + 12)
    logging.debug("After second time stopping, states = %s" %
                  list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

    # Second set of checks: the run is closed, last_ts_run is the timestamp of
    # the second run and last_processed_ts is the value passed when stopping.
    # assertEqual is used because the assertEquals alias is deprecated.
    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.USERCACHE)
    self.assertIsNotNone(new_state)
    self.assertIsNone(new_state.curr_run_ts)
    self.assertEqual(new_state.last_ts_run, uc_ts)
    self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 10)

    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION)
    self.assertIsNotNone(new_state)
    self.assertIsNone(new_state.curr_run_ts)
    self.assertEqual(new_state.last_ts_run, ts_ts)
    self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 11)

    new_state = epq.get_current_state(self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION)
    self.assertIsNotNone(new_state)
    self.assertIsNone(new_state.curr_run_ts)
    self.assertEqual(new_state.last_ts_run, ss_ts)
    self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 12)
def setUp(self):
    """Pick a random user id and empty the whole pipeline-state collection."""
    self.testUUID = uuid.uuid4()
    # delete_many({}) matches all documents, which is what the legacy
    # no-argument remove() did.
    edb.get_pipeline_state_db().delete_many({})
def tearDown(self):
    """Delete the pipeline-state documents stored for this test's user."""
    state_coll = edb.get_pipeline_state_db()
    own_states = {'user_id': self.testUUID}
    state_coll.delete_many(own_states)
def purgeAnalysisData():
    """
    Delete every document from the analysis timeseries, common place,
    common trip and pipeline-state collections, printing the server response
    of each delete.
    """
    # delete_many({}) matches all documents, like the legacy no-argument
    # remove(); raw_result is the server response that gets printed.
    print(edb.get_analysis_timeseries_db().delete_many({}).raw_result)
    print(edb.get_common_place_db().delete_many({}).raw_result)
    print(edb.get_common_trip_db().delete_many({}).raw_result)
    print(edb.get_pipeline_state_db().delete_many({}).raw_result)
def tearDown(self):
    """Delete every document in the pipeline-state collection."""
    # delete_many({}) matches all documents, which is what the legacy
    # no-argument remove() did.
    edb.get_pipeline_state_db().delete_many({})
def reset_pipeline_for_stage(stage, user_id, day_ts):
    """
    Reset or delete the stored pipeline state of one stage.

    :param stage: pipeline stage; its ``value`` attribute is matched against
        the ``pipeline_stage`` field.
    :param user_id: restrict the change to this user, or None for all users.
    :param day_ts: new ``last_processed_ts``. When None, the matching state
        documents are deleted instead of updated.

    The server response of the update or delete is printed.
    """
    state_db = edb.get_pipeline_state_db()

    # A reset leaves the stage "not running" in both the per-user and the
    # all-users case. Storing a non-None curr_run_ts would make the stage look
    # like it is still in progress the next time it is started.
    reset_update = {'$set': {'last_processed_ts': day_ts, 'curr_run_ts': None}}

    if user_id is not None:
        stage_query = {'user_id': user_id, 'pipeline_stage': stage.value}
        if day_ts is not None:
            print("Setting new pipeline stage %s for %s to %d" % (stage, user_id, day_ts))
            print(state_db.update_many(stage_query, reset_update, upsert=False).raw_result)
        else:
            print("day_ts is None, deleting stage %s for user %s" % (stage, user_id))
            print(state_db.delete_many(stage_query).raw_result)
    else:
        stage_query = {'pipeline_stage': stage.value}
        if day_ts is not None:
            print("Setting new pipeline stage %s for all users to %d" % (stage, day_ts))
            # update_many is required here: a single-document update would
            # reset only one user's state.
            print(state_db.update_many(stage_query, reset_update, upsert=False).raw_result)
        else:
            print("day_ts is None, deleting stage %s for all users" % (stage))
            print(state_db.delete_many(stage_query).raw_result)
def setUp(self):
    """Pick a random user id and clear any pipeline state stored under it."""
    fresh_id = uuid.uuid4()
    self.testUUID = fresh_id
    state_coll = edb.get_pipeline_state_db()
    state_coll.delete_many({'user_id': fresh_id})
parser.add_argument("-p", "--pipeline-purge", default=False, action='store_true', help="purge the pipeline state as well") args = parser.parse_args() fn = args.timeline_filename logging.info("Loading file or prefix %s" % fn) sel_file_list = common.read_files_with_prefix(fn) ts_db = edb.get_timeseries_db() ats_db = edb.get_analysis_timeseries_db() udb = edb.get_uuid_db() psdb = edb.get_pipeline_state_db() for i, filename in enumerate(sel_file_list): logging.info("=" * 50) logging.info("Deleting data from file %s" % filename) entries = json.load(gzip.open(filename), object_hook=bju.object_hook) # Obtain uuid and rerun information from entries curr_uuid_list, needs_rerun = common.analyse_timeline(entries) if len(curr_uuid_list) > 1: logging.warning("Found %d users, %s in filename, aborting! " % (len(curr_uuid_list), curr_uuid_list)) raise RuntimeException( "Found %d users, %s in filename, expecting 1, %s" % (len(curr_uuid_list), curr_uuid_list,