def export_timeline(user_id, start_day_str, end_day_str, file_name):
    logging.info("Extracting timeline for user %s day %s -> %s and saving to file %s" %
                 (user_id, start_day_str, end_day_str, file_name))

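    # Note: `.timestamp` is a property on arrow < 1.0; on arrow >= 1.0 it
    # became a method, and the equivalent property is `.int_timestamp`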
    start_day_ts = arrow.get(start_day_str).timestamp
    end_day_ts = arrow.get(end_day_str).timestamp
    logging.debug("start_day_ts = %s (%s), end_day_ts = %s (%s)" % 
        (start_day_ts, arrow.get(start_day_ts),
         end_day_ts, arrow.get(end_day_ts)))

    ts = esta.TimeSeries.get_time_series(user_id)
    loc_time_query = estt.TimeQuery("data.ts", start_day_ts, end_day_ts)
    loc_entry_list = list(estcs.find_entries(user_id, key_list=None, time_query=loc_time_query))
    ma_time_query = estt.TimeQuery("metadata.write_ts", start_day_ts, end_day_ts)
    ma_entry_list = list(estcs.find_entries(user_id, key_list=["background/motion_activity"], time_query=ma_time_query))
    trip_time_query = estt.TimeQuery("data.start_ts", start_day_ts, end_day_ts)
    trip_entry_list = list(ts.find_entries(key_list=None, time_query=trip_time_query))
    place_time_query = estt.TimeQuery("data.enter_ts", start_day_ts, end_day_ts)
    place_entry_list = list(ts.find_entries(key_list=None, time_query=place_time_query))
    # Handle the case of the first place, which has no enter_ts and won't be
    # matched by the default query
    first_place_extra_query = {'$and': [{'data.enter_ts': {'$exists': False}},
                                        {'data.exit_ts': {'$exists': True}}]}
    first_place_entry_list = list(ts.find_entries(key_list=None, time_query=None, extra_query_list=[first_place_extra_query]))
    logging.info("First place entry list = %s" % first_place_entry_list)

    combined_list = loc_entry_list + ma_entry_list + trip_entry_list + place_entry_list + first_place_entry_list
    logging.info("Found %d loc entries, %d motion entries, %d trip-like entries, %d place-like entries = %d total entries" %
        (len(loc_entry_list), len(ma_entry_list), len(trip_entry_list), len(place_entry_list), len(combined_list)))

    validate_truncation(loc_entry_list, trip_entry_list, place_entry_list)

    unique_key_list = set([e["metadata"]["key"] for e in combined_list])
    logging.info("timeline has unique keys = %s" % unique_key_list)
    if len(combined_list) == 0 or unique_key_list == set(['stats/pipeline_time']):
        logging.info("No entries found in range for user %s, skipping save" % user_id)
    else:
        # Also dump the pipeline state, since that records how far the
        # analysis has progressed. This allows us to copy data to a
        # different *live system*, not just duplicate it for analysis
        combined_filename = "%s_%s.gz" % (file_name, user_id)
        with gzip.open(combined_filename, "wt") as gcfd:
            json.dump(combined_list,
                gcfd, default=bju.default, allow_nan=False, indent=4)

        import emission.core.get_database as edb

        pipeline_state_list = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
        logging.info("Found %d pipeline states %s" %
            (len(pipeline_state_list),
             list([ps["pipeline_stage"] for ps in pipeline_state_list])))

        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_name, user_id)
        with gzip.open(pipeline_filename, "wt") as gpfd:
            json.dump(pipeline_state_list,
                gpfd, default=bju.default, allow_nan=False, indent=4)
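# Usage sketch (illustrative, not from the original source): the UUID and the
# date strings below are placeholders.
if __name__ == "__main__":
    import uuid
    logging.basicConfig(level=logging.DEBUG)
    sample_user_id = uuid.UUID("00000000-0000-0000-0000-000000000000")
    # Writes /tmp/timeline_<uuid>.gz and /tmp/timeline_pipelinestate_<uuid>.gz
    export_timeline(sample_user_id, "2016-07-20", "2016-07-27", "/tmp/timeline")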
Example #2
def incident_heatmap(user_uuid, modes, time_query, region):
    """
    Return a list of geojson points with properties for the time and the stress level
    related to incidents. This should not return full entries because that can
    expose the user_id in the aggregate case. Maybe it can return the data part only?
    Or should we put the other entries into the properties?
    :param modes: The modes that we want to query for
    :param time_query: The time query, in either local date or timestamp
    :param region: The region of interest
    :return: list of `incident` objects, with all metadata stripped out
    """

    if region is None:
        geo_query = None
    else:
        geo_query = estg.GeoQuery(["data.loc"], region)

    extra_query_list = []
    if modes is not None:
        mode_enum_list = [ecwm.MotionTypes[mode] for mode in modes]
        extra_query_list.append(esdlq.get_mode_query(mode_enum_list))

    if user_uuid is None:
        incident_entry_list = esda.get_entries(MANUAL_INCIDENT_KEY, user_id=None,
                                          time_query=time_query, geo_query=geo_query,
                                          extra_query_list=extra_query_list)
    else:
        # We don't support aggregate queries on the usercache. And that is
        # actually fine, because we don't expect immediate results for the
        # aggregate case. We just want to query the usercache to ensure that
        # the incidents don't magically disappear just because they got pushed
        # to the server but are not yet processed
        incident_entry_list = estc.find_entries([MANUAL_INCIDENT_KEY], time_query)
    return {"incidents": [e.data for e in incident_entry_list]}
Example #3
def get_user_input_from_cache_series(user_id, trip_obj, user_input_key):
    tq = estt.TimeQuery("data.start_ts", trip_obj.data.start_ts,
                        trip_obj.data.end_ts)
    ts = esta.TimeSeries.get_time_series(user_id)
    potential_candidates = estsc.find_entries(user_id, [user_input_key], tq)
    return final_candidate(valid_user_input(ts, trip_obj),
                           potential_candidates)
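# valid_user_input and final_candidate are helpers defined elsewhere in this
# module (not shown here); valid_user_input(ts, trip_obj) is assumed to return
# a predicate that checks whether a candidate input belongs to the trip, and
# final_candidate is assumed to pick the best (most recent) match among the
# candidates that satisfy it.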
Example #4
def getTimeseriesEntries(time_type):
    if 'user' not in request.json:
        abort(401, "only a user can read his/her data")

    user_uuid = getUUID(request)

    key_list = request.json['key_list']
    if 'from_local_date' in request.json and 'to_local_date' in request.json:
        start_time = request.json['from_local_date']
        end_time = request.json['to_local_date']
        time_query = esttc.TimeComponentQuery("metadata.write_ts",
                                              start_time,
                                              end_time)
    else:
        start_time = request.json['start_time']
        end_time = request.json['end_time']
        time_query = estt.TimeQuery("metadata.write_ts",
                                              start_time,
                                              end_time)
    # Note that queries from the usercache are limited to 100,000 entries
    # and queries from the timeseries are limited to 250,000 entries, so we
    # will return at most 350,000 entries and need no additional filtering;
    # this limit should be documented in the API
    data_list = esdc.find_entries(user_uuid, key_list, time_query)
    return {'phone_data': data_list}
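# Example request body (illustrative): the endpoint expects a JSON payload
# with the caller's auth token plus either timestamp or local-date bounds.
# {
#     "user": "<opaque auth token>",
#     "key_list": ["background/location"],
#     "start_time": 1469000000,
#     "end_time": 1469100000
# }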
Example #5
def getPublicData():
    ids = request.json['phone_ids']
    # Use list comprehensions instead of map() so the results are real lists
    # that can be iterated more than once under Python 3
    all_uuids = [UUID(phone_id) for phone_id in ids]
    uuids = [u for u in all_uuids if u in estag.TEST_PHONE_IDS]

    from_ts = request.query.from_ts
    to_ts = request.query.to_ts

    time_range = estt.TimeQuery("metadata.write_ts", float(from_ts),
                                float(to_ts))
    time_query = time_range.get_query()

    user_queries = [{'user_id': u} for u in uuids]

    for q in user_queries:
        q.update(time_query)

    num_entries_ts = [edb.get_timeseries_db().find(q).count()
                      for q in user_queries]
    num_entries_uc = [edb.get_usercache_db().find(q).count()
                      for q in user_queries]
    total_entries = sum(num_entries_ts + num_entries_uc)
    logging.debug("Total entries requested: %d" % total_entries)

    threshold = 200000
    if total_entries > threshold:
        data_list = None
    else:
        # Return data only for the recognized test phones so that non-test
        # user data is never exposed through this public endpoint
        data_list = [esdc.find_entries(u, None, time_range) for u in uuids]

    return {'phone_data': data_list}
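# Example request (illustrative): phone ids are sent in the JSON body while
# the timestamp bounds are query parameters; ids that are not in
# estag.TEST_PHONE_IDS are filtered out before any data is returned.
# POST <public-data endpoint>?from_ts=1469000000&to_ts=1469100000
# {"phone_ids": ["00000000-0000-0000-0000-000000000000"]}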
Example #8
def get_user_input_from_cache_series(user_id, trip_obj, user_input_key):
    tq = estt.TimeQuery("data.start_ts", trip_obj.data.start_ts, trip_obj.data.end_ts)
    potential_candidates = estsc.find_entries(user_id, [user_input_key], tq)
    if len(potential_candidates) == 0:
        return None
    # Sort by write time so that the newest user input wins
    sorted_pc = sorted(potential_candidates, key=lambda c: c["metadata"]["write_ts"])
    most_recent_entry = sorted_pc[-1]
    logging.debug("most recent entry has id %s" % most_recent_entry["_id"])
    logging.debug("and is mapped to entry %s" % most_recent_entry)
    return ecwe.Entry(most_recent_entry)
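# Usage sketch (illustrative, not from the original source): look up the most
# recent user input of a given type queued in the usercache for a trip; the
# key below is a placeholder for whatever user input key the caller stores.
def _example_user_input_lookup(user_id, confirmed_trip_entry):
    return get_user_input_from_cache_series(user_id, confirmed_trip_entry,
                                            "manual/mode_confirm")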
Example #9
def getTimeseriesEntries(time_type):
    if 'user' not in request.json:
        abort(401, "only a user can read his/her data")

    user_uuid = getUUID(request)

    key_list = request.json['key_list']
    if 'from_local_date' in request.json and 'to_local_date' in request.json:
        start_time = request.json['from_local_date']
        end_time = request.json['to_local_date']
        time_key = request.json.get('key_local_date', 'metadata.write_ts')
        time_query = esttc.TimeComponentQuery(time_key,
                                              start_time,
                                              end_time)
    else:
        start_time = request.json['start_time']
        end_time = request.json['end_time']
        time_key = request.json.get('key_time', 'metadata.write_ts')
        time_query = estt.TimeQuery(time_key,
                                    start_time,
                                    end_time)
    # Note that queries from the usercache are limited to 100,000 entries
    # and queries from the timeseries are limited to 250,000 entries, so we
    # will return at most 350,000 entries and need no additional filtering;
    # this limit should be documented in the API
    data_list = esdc.find_entries(user_uuid, key_list, time_query)
    if 'max_entries' in request.json:
        me = request.json['max_entries']
        if not isinstance(me, int):
            logging.error("aborting: max entry count is %s, type %s, expected int" % (me, type(me)))
            abort(500, "Invalid max_entries %s" % me)

        if len(data_list) > me:
            # The three truncation methods are mutually exclusive, so they
            # must form a single if/elif chain; otherwise a 'first' request
            # would fall through to the error branch below
            if request.json['trunc_method'] == 'first':
                logging.debug("first n entries is %s" % me)
                data_list = data_list[:me]
            elif request.json['trunc_method'] == 'last':
                logging.debug("last n entries is %s" % me)
                data_list = data_list[-me:]
            elif request.json["trunc_method"] == "sample":
                sample_rate = len(data_list)//me + 1
                logging.debug("sampling rate is %s" % sample_rate)
                data_list = data_list[::sample_rate]
            else:
                logging.error("aborting: unexpected truncation method %s" % request.json["trunc_method"])
                abort(500, "invalid truncation method while retrieving limited data")
        else:
            logging.debug("Found %d entries < %s, no truncation" % (len(data_list), me))
    logging.debug("successfully returning list of size %s" % len(data_list))
    return {'phone_data': data_list}
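# Example request body (illustrative): cap the response at 500 entries,
# evenly sampled across the matched range; "first" and "last" are the other
# supported trunc_method values.
# {
#     "user": "<opaque auth token>",
#     "key_list": ["background/location"],
#     "start_time": 1469000000,
#     "end_time": 1469100000,
#     "max_entries": 500,
#     "trunc_method": "sample"
# }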