def mark_stage_done(user_id, stage, last_processed_ts):
    # We move failed entries to the error timeseries. So usercache runs never fail.
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    curr_state.last_ts_run = curr_state.curr_run_ts
    # It is incorrect to assume that we have processed all the data until the
    # start of the last run. In particular, due to network connectivity or
    # other issues, it is possible that there is outstanding data on phones
    # that was collected before the last run started. And if we set this, then
    # that data will simply be skipped. The same logic applies to all
    # decorators that are based on client collected data (trip start ts, etc) -
    # it is only accurate for server generated data. So for maximum generality,
    # let's allow the stage to pass in last_processed_ts.
    if last_processed_ts is not None:
        logging.info("For stage %s, last_ts_processed = %s" %
                     (stage, pydt.datetime.utcfromtimestamp(last_processed_ts).isoformat()))
        curr_state.last_processed_ts = last_processed_ts
    else:
        logging.info("For stage %s, last_ts_processed is unchanged" % stage)
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" % (curr_state,
        list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
 def clearRelatedDb(self):
     edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
     edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
     edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
     edb.get_timeseries_db().delete_many({"user_id": self.testUUID1})
     edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID1})
     edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID1})
 def clearRelatedDb(self):
     edb.get_timeseries_db().remove({"user_id": self.androidUUID})
     edb.get_analysis_timeseries_db().remove({"user_id": self.androidUUID})
     edb.get_pipeline_state_db().remove({"user_id": self.androidUUID})
     edb.get_timeseries_db().remove({"user_id": self.iosUUID})
     edb.get_analysis_timeseries_db().remove({"user_id": self.iosUUID})
     edb.get_pipeline_state_db().remove({"user_id": self.iosUUID})
Example #5
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage
    """
    curr_state = get_current_state(user_id, stage)

    if curr_state is None:
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None
    else:
        start_ts = curr_state.last_processed_ts

    if start_ts is None:
        logging.info("For stage %s, start_ts is None" % stage)
    else:
        logging.info("For stage %s, start_ts = %s" % (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))

    assert curr_state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts

    end_ts = time.time() - 5 # Let's pick a point 5 secs in the past to avoid race conditions

    ret_query = enua.UserCache.TimeQuery("write_ts", start_ts, end_ts)

    curr_state.curr_run_ts = end_ts
    edb.get_pipeline_state_db().save(curr_state)
    return ret_query
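The returned TimeQuery brackets the window the stage has just reserved; a stage processes the entries in that window and then reports how far it actually got. A sketch, assuming the TimeQuery attributes are named timeType/startTs/endTs and using a hypothetical fetch_pending helper:

tq = get_time_range_for_stage(user_id, stage)
logging.debug("processing %s in [%s, %s]" % (tq.timeType, tq.startTs, tq.endTs))
processed_ts_list = [e["metadata"]["write_ts"] for e in fetch_pending(tq)]
if len(processed_ts_list) > 0:
    mark_stage_done(user_id, stage, max(processed_ts_list))
else:
    # passing None leaves last_processed_ts unchanged (see mark_stage_done above)
    mark_stage_done(user_id, stage, None)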
Example #7
 def tearDown(self):
     os.remove(self.analysis_conf_path)
     edb.get_timeseries_db().remove({"user_id": self.androidUUID}) 
     edb.get_timeseries_db().remove({"user_id": self.iosUUID})
     edb.get_pipeline_state_db().remove({"user_id": self.androidUUID})
     edb.get_pipeline_state_db().remove({"user_id": self.iosUUID})
     edb.get_analysis_timeseries_db().remove({"user_id": self.androidUUID})
     edb.get_analysis_timeseries_db().remove({"user_id": self.iosUUID}) 
Example #8
def mark_stage_failed(user_id, stage):
    curr_state = get_current_state(user_id, stage)
    assert (curr_state is not None)
    assert (curr_state.curr_run_ts is not None)
    # last_ts_run remains unchanged since this run did not succeed
    # the next query will start from the start_ts of this run
    # we also reset the curr_run_ts to indicate that we are not currently running
    curr_state.curr_run_ts = None
    edb.get_pipeline_state_db().save(curr_state)
Example #9
def load_pipeline_states(file_prefix, all_uuid_list):
    import emission.core.get_database as edb
    for curr_uuid in all_uuid_list:
        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_prefix, curr_uuid)
        print("Loading pipeline state for %s from %s" %
              (curr_uuid, pipeline_filename))
        with gzip.open(pipeline_filename) as gfd:
            states = json.load(gfd, object_hook=bju.object_hook)
            edb.get_pipeline_state_db().insert_many(states)
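A hypothetical invocation; the prefix and uuids must match files written by an export such as the export_timeline function later in this listing, which produces <file_name>_pipelinestate_<uuid>.gz:

import gzip
import json
import uuid

import bson.json_util as bju  # the object_hook alias used above

curr_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")  # made-up uuid
load_pipeline_states("/tmp/timeline_dump", [curr_uuid])        # made-up prefix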
Example #12
def reset_pipeline_for_stage(stage, user_id, day_ts):
    reset_query = {}

    if user_id is not None:
        if day_ts is None:
            print("day_ts is None, deleting stage %s for user %s" % (stage, user_id))
            print(edb.get_pipeline_state_db().remove({'user_id': user_id,
                    'pipeline_stage': stage.value}))
    else:
        if day_ts is None:
            print("day_ts is None, deleting stage %s for all users" % stage)
            print(edb.get_pipeline_state_db().remove({'pipeline_stage': stage.value}))
Example #13
def mark_stage_failed(user_id, stage):
    curr_state = get_current_state(user_id, stage)
    assert(curr_state is not None)
    assert(curr_state.curr_run_ts is not None)
    # last_ts_run remains unchanged since this run did not succeed
    # the next query will start from the start_ts of this run
    # we also reset the curr_run_ts to indicate that we are not currently running
    curr_state.curr_run_ts = None
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug("After saving state %s, list is %s" % (curr_state,
        list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
def load_pipeline_states(file_prefix, all_uuid_list):
    import emission.core.get_database as edb
    for curr_uuid in all_uuid_list:
        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_prefix, curr_uuid)
        print("Loading pipeline state for %s from %s" %
              (curr_uuid, pipeline_filename))
        with gzip.open(pipeline_filename) as gfd:
            states = json.load(gfd, object_hook=bju.object_hook)
            if args.verbose:
                logging.debug("Loading states of length %s" % len(states))
            if len(states) > 0:
                edb.get_pipeline_state_db().insert_many(states)
            else:
                logging.info("No pipeline states found, skipping load")
Example #16
def post_check(unique_user_list, all_rerun_list):
    import emission.core.get_database as edb
    import numpy as np

    logging.info(
        "For %s users, loaded %s raw entries, %s processed entries and %s pipeline states"
        % (len(unique_user_list), edb.get_timeseries_db().count_documents(
            {"user_id": {
                "$in": list(unique_user_list)
            }}), edb.get_analysis_timeseries_db().count_documents(
                {"user_id": {
                    "$in": list(unique_user_list)
                }}), edb.get_pipeline_state_db().count_documents({
                    "user_id": {
                        "$in": list(unique_user_list)
                    }
                })))

    all_rerun_arr = np.array(all_rerun_list)

    # check whether no entry needs a rerun; if so, we are done
    # "no entry needs a rerun" = all flags are False = all negated flags are True
    if np.all(np.logical_not(all_rerun_arr)):
        logging.info(
            "all entries in the timeline contain analysis results, no need to run the intake pipeline"
        )
    # if all entries need to be re-run, we must have had raw data throughout
    elif np.all(all_rerun_arr):
        logging.info(
            "all entries in the timeline contain only raw data, need to run the intake pipeline"
        )
    else:
        logging.info(
            "timeline contains a mixture of analysis results and raw data - complain to shankari!"
        )
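A tiny worked example of the rerun check: np.all over the negated array is True only when every flag is False, i.e. nothing needs a rerun.

import numpy as np

all_rerun_list = [False, False, True]
all_rerun_arr = np.array(all_rerun_list)
print(np.all(np.logical_not(all_rerun_arr)))  # False: one entry needs a rerun
print(np.all(all_rerun_arr))                  # False: but not all of them do
# both checks False -> post_check logs the "mixture" warning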
Example #17
def del_objects_after(user_id, reset_ts, is_dry_run):
    del_query = {}
    # handle the user
    del_query.update({"user_id": user_id})

    del_query.update({"metadata.key": {"$in": ["inference/prediction", "analysis/inferred_section"]}})
    # all objects inserted here have start_ts and end_ts and are trip-like
    del_query.update({"data.start_ts": {"$gt": reset_ts}})
    logging.debug("After all updates, del_query = %s" % del_query)

    reset_pipeline_query = {"user_id": user_id, "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
    # Fuzz the TRIP_SEGMENTATION stage 5 mins because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    FUZZ_FACTOR = 5 * 60
    reset_pipeline_update = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
    logging.info("About to reset stage %s to %s" 
        % (ecwp.PipelineStages.MODE_INFERENCE, reset_ts))
    

    logging.info("About to delete %d entries" 
        % edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" 
        % edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))
    
    if is_dry_run:
        logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
    else:
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)
        result = edb.get_pipeline_state_db().update_one(reset_pipeline_query, reset_pipeline_update)
        logging.info("this is not a dry-run, result of updating pipeline state is %s" % result.raw_result)
Example #18
def purge_entries_for_user(curr_uuid, is_purge_state, db_array=None):
    logging.info("For uuid = %s, deleting entries from the timeseries" %
                 curr_uuid)
    if db_array is not None:
        [ts_db, ats_db, udb, psdb] = db_array
        logging.debug("db_array passed in with databases %s" % db_array)
    else:
        import emission.core.get_database as edb

        ts_db = edb.get_timeseries_db()
        ats_db = edb.get_analysis_timeseries_db()
        udb = edb.get_uuid_db()
        psdb = edb.get_pipeline_state_db()
        logging.debug("db_array not passed in, looking up databases")

    timeseries_del_result = ts_db.remove({"user_id": curr_uuid})
    logging.info("result = %s" % timeseries_del_result)

    logging.info(
        "For uuid = %s, deleting entries from the analysis_timeseries" %
        curr_uuid)
    analysis_timeseries_del_result = ats_db.remove({"user_id": curr_uuid})
    logging.info("result = %s" % analysis_timeseries_del_result)

    logging.info("For uuid %s, deleting entries from the user_db" % curr_uuid)
    user_db_del_result = udb.remove({"uuid": curr_uuid})
    logging.info("result = %s" % user_db_del_result)

    if is_purge_state:
        logging.info(
            "For uuid %s, deleting entries from the pipeline_state_db" %
            curr_uuid)
        psdb_del_result = psdb.remove({"user_id": curr_uuid})
        logging.info("result = %s" % psdb_del_result)
Example #21
def del_all_objects(is_dry_run):
    del_query = {}
    del_query.update({
        "metadata.key": {
            "$in": ["inference/prediction", "analysis/inferred_section"]
        }
    })
    logging.info("About to delete %d entries" %
                 edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" %
                 edb.get_analysis_timeseries_db().find(del_query).distinct(
                     "metadata.key"))

    del_pipeline_query = {
        "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value
    }
    logging.info("About to delete pipeline entries for stage %s" %
                 ecwp.PipelineStages.MODE_INFERENCE)

    if is_dry_run:
        logging.info(
            "this is a dry-run, returning from del_objects_after without modifying anything"
        )
    else:
        result = edb.get_analysis_timeseries_db().delete_many(del_query)
        logging.info(
            "this is not a dry-run, result of deleting analysis entries is %s"
            % result.raw_result)
        result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
        logging.info(
            "this is not a dry-run, result of deleting pipeline state is %s" %
            result.raw_result)
Example #22
def reset_pipeline_state(user_id, reset_ts, is_dry_run):
    stages_list = ecwp.PipelineStages

    # Fuzz the TRIP_SEGMENTATION stage 5 mins because of
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
    FUZZ_FACTOR = 5 * 60
    trip_seg_reset_pipeline_query = {'user_id': user_id,
    # only reset entries that are after the reset_ts
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312958309
                                     'last_processed_ts': {'$ne': None,
                                                           '$gt': reset_ts + FUZZ_FACTOR},
                                     'pipeline_stage': ecwp.PipelineStages.TRIP_SEGMENTATION.value}
    trip_seg_update_pipeline_query = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
    logging.debug("trip_seg_reset_pipeline_query = %s" % trip_seg_reset_pipeline_query)
    logging.debug("trip_seg_update_pipeline_query = %s" % trip_seg_update_pipeline_query)
    logging.info("resetting %s trip_seg_pipeline states for user %s to %s" % 
            (edb.get_pipeline_state_db().find(trip_seg_reset_pipeline_query).count(),
            user_id, reset_ts + FUZZ_FACTOR))

    # Don't fuzz the others because of 
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312917119
    reset_pipeline_query = {'user_id': user_id,
    # only reset entries that are after the reset_ts
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312958309
                            'last_processed_ts': {'$ne': None,
                                                  '$gt': reset_ts},
                            'pipeline_stage': {'$ne': ecwp.PipelineStages.TRIP_SEGMENTATION.value}}
    update_pipeline_query = {'$set': {'last_processed_ts': reset_ts}}
    logging.debug("reset_pipeline_query = %s" % reset_pipeline_query)
    logging.debug("update_pipeline_query = %s" % update_pipeline_query)
    logging.info("out of %s total, resetting %s pipeline states for user %s to %s" % 
            (edb.get_pipeline_state_db().find({'user_id': user_id}).count(),
            edb.get_pipeline_state_db().find(reset_pipeline_query).count(),
            user_id, reset_ts))

    if is_dry_run:
        logging.info("this is a dry run, returning from reset_pipeline_state without modifying anything")
    else:
        result = edb.get_pipeline_state_db().update(
                    trip_seg_reset_pipeline_query, trip_seg_update_pipeline_query,
                    upsert=False)
        logging.debug("this is not a dry run, result of updating trip_segmentation stage in reset_pipeline_state = %s" % result)

        result = edb.get_pipeline_state_db().update(
                    reset_pipeline_query, update_pipeline_query,
                    upsert=False, multi=True)
        logging.debug("this is not a dry run, result of updating all other stages in reset_pipeline_state = %s" % result)
Example #23
def get_current_state(user_id, stage):
    curr_state_doc = edb.get_pipeline_state_db().find_one({"user_id": user_id,
                                                            "pipeline_stage": stage.value})
    #logging.debug("returning curr_state_doc  %s for stage %s " % (curr_state_doc, stage))
    if curr_state_doc is not None:
        return ps.PipelineState(curr_state_doc)
    else:
        return None
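Since get_current_state returns either a PipelineState wrapper or None, it is easy to build guards on top of it; a sketch using the ewps alias from the tests in this listing:

state = get_current_state(user_id, ewps.PipelineStages.TRIP_SEGMENTATION)
if state is None:
    logging.info("stage has never run for %s" % user_id)
elif state.curr_run_ts is not None:
    logging.info("stage currently running, started at %s" % state.curr_run_ts)
else:
    logging.info("stage idle, processed up to %s" % state.last_processed_ts)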
    def testMoveDuplicateKey(self):
        # 5 mins of data, every 30 secs = 10 entries per entry type. There are
        # 3 entry types, so 30 entries

        # First all the entries are in the usercache
        self.assertEqual(len(self.uc1.getMessage()), 30)
        self.assertEqual(len(list(self.ts1.find_entries())), 0)

        # Store the entries before the move so that we can duplicate them later
        entries_before_move = self.uc1.getMessage()

        # Then we move entries for user1 into longterm
        enuah.UserCacheHandler.getUserCacheHandler(
            self.testUserUUID1).moveToLongTerm()

        # So we end up with all user1 entries in longterm
        self.assertEqual(len(self.uc1.getMessage()), 0)
        self.assertEqual(len(list(self.ts1.find_entries())), 30)

        # Put the same entries (with the same object IDs) into the cache again
        edb.get_usercache_db().insert_many(entries_before_move)
        self.assertEqual(len(self.uc1.getMessage()), 30)

        self.assertEqual(len(self.uc2.getMessage()), 30)
        # Also reset the user2 cache to be user1 so that we have a fresh supply of entries
        update_result = edb.get_usercache_db().update_many(
            {"user_id": self.testUserUUID2},
            {"$set": {
                "user_id": self.testUserUUID1
            }})
        logging.debug("update_result = %s" % update_result)

        # Now, we should have 60 entries in the usercache (30 duplicates + 30 from user2)
        self.assertEqual(len(self.uc1.getMessage()), 60)
        self.assertEqual(len(list(self.ts1.find_entries())), 30)

        edb.get_pipeline_state_db().delete_many(
            {"user_id": self.testUserUUID1})

        # Then we move entries for user1 into longterm again
        enuah.UserCacheHandler.getUserCacheHandler(
            self.testUserUUID1).moveToLongTerm()

        # All the duplicates should have been ignored, and the new entries moved into the timeseries
        self.assertEqual(len(self.uc1.getMessage()), 0)
        self.assertEqual(len(list(self.ts1.find_entries())), 60)
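The duplicate handling that this test exercises can be reproduced directly with pymongo: re-inserting documents whose _id already exists raises BulkWriteError, and with ordered=False the non-duplicates still land. A sketch of that pattern, not necessarily the handler's actual implementation:

import pymongo

try:
    edb.get_usercache_db().insert_many(entries_before_move, ordered=False)
except pymongo.errors.BulkWriteError as e:
    # duplicate _ids show up as writeErrors; all other entries were inserted
    logging.debug("skipped %d duplicates" % len(e.details["writeErrors"]))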
def export_timeline(user_id, start_day_str, end_day_str, file_name):
    logging.info("Extracting timeline for user %s day %s -> %s and saving to file %s" %
                 (user_id, start_day_str, end_day_str, file_name))

    # day_dt = pydt.datetime.strptime(day_str, "%Y-%m-%d").date()
    start_day_ts = arrow.get(start_day_str).timestamp
    end_day_ts = arrow.get(end_day_str).timestamp
    logging.debug("start_day_ts = %s (%s), end_day_ts = %s (%s)" % 
        (start_day_ts, arrow.get(start_day_ts),
         end_day_ts, arrow.get(end_day_ts)))

    ts = esta.TimeSeries.get_time_series(user_id)
    loc_time_query = estt.TimeQuery("data.ts", start_day_ts, end_day_ts)
    loc_entry_list = list(estcs.find_entries(user_id, key_list=None, time_query=loc_time_query))
    ma_time_query = estt.TimeQuery("metadata.write_ts", start_day_ts, end_day_ts)
    ma_entry_list = list(estcs.find_entries(user_id, key_list=["background/motion_activity"], time_query=ma_time_query))
    trip_time_query = estt.TimeQuery("data.start_ts", start_day_ts, end_day_ts)
    trip_entry_list = list(ts.find_entries(key_list=None, time_query=trip_time_query))
    place_time_query = estt.TimeQuery("data.enter_ts", start_day_ts, end_day_ts)
    place_entry_list = list(ts.find_entries(key_list=None, time_query=place_time_query))
    # Handle the case of the first place, which has no enter_ts and won't be
    # matched by the default query
    first_place_extra_query = {'$and': [{'data.enter_ts': {'$exists': False}},
                                        {'data.exit_ts': {'$exists': True}}]}
    first_place_entry_list = list(ts.find_entries(key_list=None, time_query=None, extra_query_list=[first_place_extra_query]))
    logging.info("First place entry list = %s" % first_place_entry_list)

    combined_list = loc_entry_list + ma_entry_list + trip_entry_list + place_entry_list + first_place_entry_list
    logging.info("Found %d loc entries, %d motion entries, %d trip-like entries, %d place-like entries = %d total entries" %
        (len(loc_entry_list), len(ma_entry_list), len(trip_entry_list), len(place_entry_list), len(combined_list)))

    validate_truncation(loc_entry_list, trip_entry_list, place_entry_list)

    unique_key_list = set([e["metadata"]["key"] for e in combined_list])
    logging.info("timeline has unique keys = %s" % unique_key_list)
    if len(combined_list) == 0 or unique_key_list == set(['stats/pipeline_time']):
        logging.info("No entries found in range for user %s, skipping save" % user_id)
    else:
        # Also dump the pipeline state, since that records how far the
        # analysis results go. This allows us to copy data to a different
        # *live system*, not just duplicate it for analysis
        combined_filename = "%s_%s.gz" % (file_name, user_id)
        with gzip.open(combined_filename, "wt") as gcfd:
            json.dump(combined_list,
                gcfd, default=bju.default, allow_nan=False, indent=4)

        import emission.core.get_database as edb

        pipeline_state_list = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
        logging.info("Found %d pipeline states %s" %
            (len(pipeline_state_list),
             list([ps["pipeline_stage"] for ps in pipeline_state_list])))

        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_name, user_id)
        with gzip.open(pipeline_filename, "wt") as gpfd:
            json.dump(pipeline_state_list,
                gpfd, default=bju.default, allow_nan=False, indent=4)
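A hypothetical invocation; it writes <file_name>_<uuid>.gz plus <file_name>_pipelinestate_<uuid>.gz, which is exactly what load_pipeline_states above expects to read back:

import uuid

test_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")
export_timeline(test_uuid, "2016-07-22", "2016-07-27", "/tmp/timeline_dump")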
Example #26
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage
    """
    curr_state = get_current_state(user_id, stage)

    if curr_state is None:
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None
    else:
        start_ts = curr_state.last_processed_ts

    if start_ts is None:
        logging.info("For stage %s, start_ts is None" % stage)
    else:
        logging.info(
            "For stage %s, start_ts = %s" %
            (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))

    assert curr_state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts
    # Let's pick a point 5 secs in the past. If we don't do this, then we will
    # read all entries up to the current ts and this may lead to lost data. For
    # example, let us say that the current ts is t1. At the time that we read
    # the data, we have 4 entries for t1. By the time we finish copying, we
    # have 6 entries for t1, we will end up deleting all 6, which will lose 2
    # entries.
    end_ts = time.time() - END_FUZZ_AVOID_LTE

    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    curr_state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug(
        "After saving state %s, list is %s" %
        (curr_state,
         list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
    return ret_query
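END_FUZZ_AVOID_LTE is defined at module level in the source; judging from the comment above and the earlier variant that hardcodes time.time() - 5, it is the 5-second safety margin:

END_FUZZ_AVOID_LTE = 5  # seconds; see the lost-entries race described above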
Example #27
def _del_entries_for_query(del_query, is_dry_run):
    """
        This is much easier. The steps are:
        - delete all analysis objects for this user
        - delete all pipeline states for this user
    """
    logging.info("About to delete %s analysis results" %
                    edb.get_analysis_timeseries_db().find(del_query).count())
    logging.info("About to delete entries with keys %s" 
        % edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))
    logging.info("About to delete %s pipeline states" % 
            (edb.get_pipeline_state_db().find(del_query).count()))

    if is_dry_run:
        logging.info("this is a dry run, returning from reset_user_to-start without modifying anything")
    else: 
        result = edb.get_analysis_timeseries_db().remove(del_query)
        logging.info("this is not a dry run, result of removing analysis objects = %s" % result)
        result = edb.get_pipeline_state_db().remove(del_query)
        logging.info("this is not a dry run, result of removing pipeline states = %s" % result)
    def testMoveDuplicateKey(self):
        # 5 mins of data, every 30 secs = 10 entries per entry type. There are
        # 3 entry types, so 30 entries

        # First all the entries are in the usercache
        self.assertEqual(len(self.uc1.getMessage()), 30)
        self.assertEqual(len(list(self.ts1.find_entries())), 0)

        # Store the entries before the move so that we can duplicate them later
        entries_before_move = self.uc1.getMessage()

        # Then we move entries for user1 into longterm
        enuah.UserCacheHandler.getUserCacheHandler(self.testUserUUID1).moveToLongTerm()

        # So we end up with all user1 entries in longterm
        self.assertEqual(len(self.uc1.getMessage()), 0)
        self.assertEqual(len(list(self.ts1.find_entries())), 30)

        # Put the same entries (with the same object IDs) into the cache again
        edb.get_usercache_db().insert(entries_before_move)
        self.assertEqual(len(self.uc1.getMessage()), 30)

        self.assertEqual(len(self.uc2.getMessage()), 30)
        # Also reset the user2 cache to be user1 so that we have a fresh supply of entries
        update_result = edb.get_usercache_db().update({"user_id": self.testUserUUID2},
                                      {"$set": {"user_id": self.testUserUUID1}},
                                      multi=True)
        logging.debug("update_result = %s" % update_result)

        # Now, we should have 60 entries in the usercache (30 duplicates + 30 from user2)
        self.assertEqual(len(self.uc1.getMessage()), 60)
        self.assertEqual(len(list(self.ts1.find_entries())), 30)

        edb.get_pipeline_state_db().remove({"user_id": self.testUserUUID1})

        # Then we move entries for user1 into longterm again
        enuah.UserCacheHandler.getUserCacheHandler(self.testUserUUID1).moveToLongTerm()

        # All the duplicates should have been ignored, and the new entries moved into the timeseries
        self.assertEqual(len(self.uc1.getMessage()), 0)
        self.assertEqual(len(list(self.ts1.find_entries())), 60)
Example #31
 def tearDown(self):
     import emission.core.get_database as edb
     edb.get_timeseries_db().remove({"user_id": self.testUUID})
     edb.get_pipeline_state_db().remove({"user_id": self.testUUID})
    parser.add_argument("-i", "--info-only", default=False, action='store_true',
        help="only print entry analysis")

    parser.add_argument("-p", "--pipeline-purge", default=False, action='store_true',
        help="purge the pipeline state as well")

    args = parser.parse_args()
    fn = args.timeline_filename
    logging.info("Loading file or prefix %s" % fn)
    sel_file_list = common.read_files_with_prefix(fn)

    ts_db = edb.get_timeseries_db()
    ats_db = edb.get_analysis_timeseries_db()
    udb = edb.get_uuid_db()
    psdb = edb.get_pipeline_state_db()
    db_array = [ts_db, ats_db, udb, psdb]

    for i, filename in enumerate(sel_file_list):
        if "pipelinestate" in filename:
            continue

        logging.info("=" * 50)
        logging.info("Deleting data from file %s" % filename)

        entries = json.load(gzip.open(filename), object_hook = bju.object_hook)

        # Obtain uuid and rerun information from entries
        curr_uuid_list, needs_rerun = common.analyse_timeline(entries)
        if len(curr_uuid_list) > 1:
            logging.warning("Found %d users, %s in filename, aborting! " % 
# Removes all materialized views and the pipeline state.
# This will cause us to reprocess the pipeline from scratch
# As history begins to accumulate, we may want to specify a point to reset the
# pipeline to instead of deleting everything
import logging
logging.basicConfig(level=logging.DEBUG)

import emission.core.get_database as edb

if __name__ == '__main__':
    print "Deleting all trips"
    print edb.get_trip_new_db().remove()
    print "Deleting all sections"
    print edb.get_section_new_db().remove()
    print "Deleting pipeline state"
    print edb.get_pipeline_state_db().remove()
 def tearDown(self):
     edb.get_pipeline_state_db().remove()
Example #36
            edb.get_uuid_db().update({"uuid" : user.uuid},
                                     {"$set": {"uuid" : new_uuid}})
            logging.debug("Resetting alternatives...")
            reset_collection(edb.get_alternatives_db(), user.uuid, new_uuid)
            logging.debug("Resetting analysis...")
            reset_collection(edb.get_analysis_timeseries_db(), user.uuid, new_uuid)
            logging.debug("Resetting client...")
            reset_collection(edb.get_client_db(), user.uuid, new_uuid)
            logging.debug("Resetting client_stats_backup...")
            reset_collection(edb.get_client_stats_db_backup(), user.uuid, new_uuid)
            logging.debug("Resetting server_stats_backup...")
            reset_collection(edb.get_server_stats_db_backup(), user.uuid, new_uuid)
            logging.debug("Resetting result_stats_backup...")
            reset_collection(edb.get_result_stats_db_backup(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_common_place_db...")
            reset_collection(edb.get_common_place_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_common_trip_db...")
            reset_collection(edb.get_common_trip_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_habitica_db...")
            reset_collection(edb.get_habitica_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_pipeline_state_db...")
            reset_collection(edb.get_pipeline_state_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_profile_db...")
            reset_collection(edb.get_profile_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_timeseries_db...")
            reset_collection(edb.get_timeseries_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_timeseries_error_db...")
            reset_collection(edb.get_timeseries_error_db(), user.uuid, new_uuid)
            logging.debug("Resetting edb.get_usercache_db...")
            reset_collection(edb.get_usercache_db(), user.uuid, new_uuid)
Example #37
    def testStartProcessingTwiceTwoStates(self):
        TEST_DONE_TS_BASE = 999999

        self.assertIsNone(
            epq.get_current_state(self.testUUID,
                                  ewps.PipelineStages.USERCACHE))
        self.assertIsNone(
            epq.get_current_state(self.testUUID,
                                  ewps.PipelineStages.TRIP_SEGMENTATION))
        self.assertIsNone(
            epq.get_current_state(self.testUUID,
                                  ewps.PipelineStages.SECTION_SEGMENTATION))

        logging.debug("About to start processing for the first time")
        logging.debug("starting stage usercache %s" %
                      epq.get_time_range_for_stage(
                          self.testUUID, ewps.PipelineStages.USERCACHE))
        logging.debug(
            "starting stage trip_segmentation %s " %
            epq.get_time_range_for_stage(
                self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION))
        logging.debug(
            "starting stage section_segmentation %s " %
            epq.get_time_range_for_stage(
                self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION))
        logging.debug(
            "After first time processing, states = %s" %
            list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

        logging.debug("About to stop processing for the first time")
        epq.mark_stage_done(self.testUUID, ewps.PipelineStages.USERCACHE,
                            TEST_DONE_TS_BASE)
        epq.mark_stage_done(self.testUUID,
                            ewps.PipelineStages.TRIP_SEGMENTATION,
                            TEST_DONE_TS_BASE + 1)
        epq.mark_stage_done(self.testUUID,
                            ewps.PipelineStages.SECTION_SEGMENTATION,
                            TEST_DONE_TS_BASE + 2)
        logging.debug(
            "After first time stopping, states = %s" %
            list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

        logging.debug("About to start processing for the second time")
        logging.debug("starting stage usercache %s" %
                      epq.get_time_range_for_stage(
                          self.testUUID, ewps.PipelineStages.USERCACHE))
        logging.debug(
            "starting stage trip_segmentation %s " %
            epq.get_time_range_for_stage(
                self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION))
        logging.debug(
            "starting stage section_segmentation %s " %
            epq.get_time_range_for_stage(
                self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION))
        logging.debug(
            "After second time starting, states = %s" %
            list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

        # First set of checks
        new_state = epq.get_current_state(self.testUUID,
                                          ewps.PipelineStages.USERCACHE)
        self.assertIsNotNone(new_state)
        self.assertIsNotNone(new_state.curr_run_ts)
        self.assertIsNotNone(new_state.last_ts_run)
        uc_ts = new_state.curr_run_ts

        new_state = epq.get_current_state(
            self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION)
        self.assertIsNotNone(new_state)
        self.assertIsNotNone(new_state.curr_run_ts)
        self.assertIsNotNone(new_state.last_ts_run)
        ts_ts = new_state.curr_run_ts

        new_state = epq.get_current_state(
            self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION)
        self.assertIsNotNone(new_state)
        self.assertIsNotNone(new_state.curr_run_ts)
        self.assertIsNotNone(new_state.last_ts_run)
        ss_ts = new_state.curr_run_ts

        logging.debug("About to stop processing for the second time")
        epq.mark_stage_done(self.testUUID, ewps.PipelineStages.USERCACHE,
                            TEST_DONE_TS_BASE + 10)
        epq.mark_stage_done(self.testUUID,
                            ewps.PipelineStages.TRIP_SEGMENTATION,
                            TEST_DONE_TS_BASE + 11)
        epq.mark_stage_done(self.testUUID,
                            ewps.PipelineStages.SECTION_SEGMENTATION,
                            TEST_DONE_TS_BASE + 12)
        logging.debug(
            "After second time stopping, states = %s" %
            list(edb.get_pipeline_state_db().find({"user_id": self.testUUID})))

        new_state = epq.get_current_state(self.testUUID,
                                          ewps.PipelineStages.USERCACHE)
        self.assertIsNotNone(new_state)
        self.assertIsNone(new_state.curr_run_ts)
        self.assertEqual(new_state.last_ts_run, uc_ts)
        self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 10)

        new_state = epq.get_current_state(
            self.testUUID, ewps.PipelineStages.TRIP_SEGMENTATION)
        self.assertIsNotNone(new_state)
        self.assertIsNone(new_state.curr_run_ts)
        self.assertEqual(new_state.last_ts_run, ts_ts)
        self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 11)

        new_state = epq.get_current_state(
            self.testUUID, ewps.PipelineStages.SECTION_SEGMENTATION)
        self.assertIsNotNone(new_state)
        self.assertIsNone(new_state.curr_run_ts)
        self.assertEqual(new_state.last_ts_run, ss_ts)
        self.assertEqual(new_state.last_processed_ts, TEST_DONE_TS_BASE + 12)
Example #38
 def tearDown(self):
     import emission.core.get_database as edb
     edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
     edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
 def tearDown(self):
     import emission.core.get_database as edb
     edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
     edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
     os.remove(self.analysis_conf_path)
Example #40
 def tearDown(self):
     edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})
 def setUp(self):
     self.testUUID = uuid.uuid4()
     edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})
 def setUp(self):
     self.testUUID = uuid.uuid4()
     edb.get_pipeline_state_db().remove()
def purgeAnalysisData():
  print(edb.get_analysis_timeseries_db().remove())
  print(edb.get_common_place_db().remove())
  print(edb.get_common_trip_db().remove())
  print(edb.get_pipeline_state_db().remove())
Example #49
def reset_pipeline_for_stage(stage, user_id, day_ts):
    reset_query = {}

    if user_id is not None:
        if day_ts is not None:
            print "Setting new pipeline stage %s for %s to %d" % (stage, user_id, day_ts)
            print edb.get_pipeline_state_db().update({'user_id': user_id,
                    'pipeline_stage': stage.value},
                    {'$set': {'last_processed_ts': day_ts}}, upsert=False)
            print edb.get_pipeline_state_db().update({'user_id': user_id,
                    'pipeline_stage': stage.value},
                    {'$set': {'curr_run_ts': None}}, upsert=False)
        else:
            print "day_ts is None, deleting stage %s for user %s" % (stage, user_id)
            print edb.get_pipeline_state_db().remove({'user_id': user_id,
                    'pipeline_stage': stage.value})
    else:
        if day_ts is not None:
            print "Setting new pipeline stage %s for all users to %d" % (stage, day_ts)
            print edb.get_pipeline_state_db().update({'pipeline_stage': stage.value},
                    {'$set': {'last_processed_ts': day_ts}}, upsert=False)
            print edb.get_pipeline_state_db().update({'pipeline_stage': stage.value},
                    {'$set': {'curr_run_ts': day_ts}}, upsert=False)
        else:
            print "day_ts is None, deleting stage %s for all users" % (stage)
            print edb.get_pipeline_state_db().remove({'pipeline_stage': stage.value})
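Hypothetical invocations covering both branches per user scope (the ecwp alias and import path are assumed from the other snippets; the date is made up):

import arrow
import emission.core.wrapper.pipelinestates as ecwp  # assumed import path

day_ts = arrow.get("2016-07-22").timestamp
# rewind one user's stage to day_ts
reset_pipeline_for_stage(ecwp.PipelineStages.TRIP_SEGMENTATION, user_id, day_ts)
# delete the stage state for all users
reset_pipeline_for_stage(ecwp.PipelineStages.TRIP_SEGMENTATION, None, None)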
Example #50
    parser.add_argument("-p",
                        "--pipeline-purge",
                        default=False,
                        action='store_true',
                        help="purge the pipeline state as well")

    args = parser.parse_args()
    fn = args.timeline_filename
    logging.info("Loading file or prefix %s" % fn)
    sel_file_list = common.read_files_with_prefix(fn)

    ts_db = edb.get_timeseries_db()
    ats_db = edb.get_analysis_timeseries_db()
    udb = edb.get_uuid_db()
    psdb = edb.get_pipeline_state_db()

    for i, filename in enumerate(sel_file_list):
        logging.info("=" * 50)
        logging.info("Deleting data from file %s" % filename)

        entries = json.load(gzip.open(filename), object_hook=bju.object_hook)

        # Obtain uuid and rerun information from entries
        curr_uuid_list, needs_rerun = common.analyse_timeline(entries)
        if len(curr_uuid_list) > 1:
            logging.warning("Found %d users, %s in filename, aborting! " %
                            (len(curr_uuid_list), curr_uuid_list))
            raise RuntimeError(
                "Found %d users, %s in filename, expecting 1, %s" %
                (len(curr_uuid_list), curr_uuid_list,