def evaluate_bins(self):
    """Score the current binning with the silhouette coefficient.

    ``self.labels`` is rebuilt so that it holds, for every trip index stored
    in ``self.bins``, the position of the bin that contains it (in bin
    order). Every binned trip is then turned into a
    ``[start_lat, start_lon, end_lat, end_lon]`` row, using the cleaned start
    and end places fetched through ``esda.get_entry``, and the rows plus the
    labels are passed to ``metrics.silhouette_score``.

    :return: the silhouette score, or ``None`` when ``self.data`` or
        ``self.bins`` is empty, or when fewer than two trips are binned.
    """
    # Label by position. ``self.bins.index(...)`` rescans the bin list for
    # every trip and returns the first *equal* bin, so two bins with the same
    # contents would end up sharing one label.
    self.labels = [
        label for label, members in enumerate(self.bins) for _ in members
    ]
    if not self.data or not self.bins:
        return None
    if len(self.labels) < 2:
        logging.debug('Everything is in one bin.')
        return None

    labels = numpy.array(self.labels)
    points = []
    for members in self.bins:
        for trip_index in members:
            trip = self.data[trip_index]
            start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                         trip.data.start_place)
            end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                       trip.data.end_place)
            # The stored coordinates are read as [lon, lat]; the feature row
            # is built lat-first.
            start_lon = start_place.data.location["coordinates"][0]
            start_lat = start_place.data.location["coordinates"][1]
            end_lon = end_place.data.location["coordinates"][0]
            end_lat = end_place.data.location["coordinates"][1]
            points.append([start_lat, start_lon, end_lat, end_lon])

    logging.debug("number of labels are %d, number of points are = %d"
                  % (len(labels), len(points)))
    score = metrics.silhouette_score(numpy.array(points), labels)
    logging.debug('number of bins is %d' % len(self.bins))
    # %s rather than %d: %d would truncate the float score to an integer.
    logging.debug('silhouette score is %s' % score)
    return score
def __init__(self, data, radius):
    """Store the trips to be binned and drop the ones that cannot be used.

    A trip is removed from the data when ``self.distance`` returns a truthy
    value for its start and end places (the log message below calls these
    "trips that are points"), or when looking up those places raises.

    :param data: list of trip entries. A falsy value (``None``, empty list)
        is replaced by a new empty list. A non-empty list is filtered in
        place, so the caller's list shrinks as well.
    :param radius: value convertible to ``float``; stored as ``self.radius``.
    """
    self.data = data if data else []
    self.bins = []
    self.radius = float(radius)

    # Iterate over a snapshot: removing from the list that is being iterated
    # shifts the remaining items left, which makes the loop skip the trip
    # that follows every removed one.
    for t in list(self.data):
        logging.debug("Considering trip %s" % t)
        try:
            start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                         t.data.start_place)
            end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                       t.data.end_place)
            start_lon = start_place.data.location["coordinates"][0]
            start_lat = start_place.data.location["coordinates"][1]
            end_lon = end_place.data.location["coordinates"][0]
            end_lat = end_place.data.location["coordinates"][1]
            logging.debug("endpoints are = (%s, %s) and (%s, %s)" %
                          (start_lon, start_lat, end_lon, end_lat))
            is_point = self.distance(start_lat, start_lon, end_lat, end_lon)
        except Exception:
            # Best effort: a trip whose places cannot be resolved is dropped.
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            logging.exception(
                "exception while getting start and end places for %s" % t)
            self.data.remove(t)
        else:
            if is_point:
                self.data.remove(t)

    logging.debug(
        'After removing trips that are points, there are %s data points'
        % len(self.data))
    self.size = len(self.data)
def get_reps(self):
    """Build one representative trip per non-empty cluster.

    ``self.reps`` is reset to an empty list first and stays empty when
    ``self.data`` is empty. Otherwise, for every non-empty cluster in
    ``self.clusters``, the start/end latitudes and longitudes of its trips
    are averaged and a ``Trip`` whose two ``Coordinate`` endpoints are those
    means is appended to ``self.reps``.

    When ``self.is_old`` is true the coordinates are read from each trip's
    ``trip_start_location`` / ``trip_end_location``; otherwise they come from
    the cleaned start and end places fetched through ``esda.get_entry``.
    """
    self.reps = []
    if not self.data:
        return
    for cluster in self.clusters:
        # An empty cluster has nothing to average: numpy.mean would yield NaN
        # coordinates, so no representative is created for it.
        if len(cluster) == 0:
            logging.info("Cluster %s has length %d, skipping"
                         % (cluster, len(cluster)))
            continue
        # Rows: start lat, start lon, end lat, end lon.
        points = [[], [], [], []]
        for c in cluster:
            if self.is_old:
                points[0].append(c.trip_start_location.lat)
                points[1].append(c.trip_start_location.lon)
                points[2].append(c.trip_end_location.lat)
                points[3].append(c.trip_end_location.lon)
            else:
                # We want (lat, lon) to be consistent with old above.
                # But in the new, our data is in geojson so it is (lon, lat).
                # Fix it by flipping the order of the indices.
                # Note also that we want to use the locations of the start
                # and end places, not of the start point of the trip, which
                # may be some distance away due to geofencing.
                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                             c.data.start_place)
                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                           c.data.end_place)
                start_coords = start_place.data.location["coordinates"]
                end_coords = end_place.data.location["coordinates"]
                points[0].append(start_coords[1])
                points[1].append(start_coords[0])
                points[2].append(end_coords[1])
                points[3].append(end_coords[0])
                logging.debug("in representatives, endpoints are = %s"
                              % points)
        # Mean of each row -> [start lat, start lon, end lat, end lon].
        centers = numpy.mean(points, axis=1)
        a = Trip(None, None, None, None, None, None,
                 Coordinate(centers[0], centers[1]),
                 Coordinate(centers[2], centers[3]))
        self.reps.append(a)
def get_reps(self):
    """Create one representative cleaned-trip entry per non-empty cluster.

    ``self.reps`` is reset to an empty list; nothing else happens when
    ``self.data`` is empty. For each non-empty cluster the start and end
    places of its trips are fetched through ``esda.get_entry`` and their
    latitudes/longitudes averaged. The means are wrapped in an ``ecwt.Trip``
    with ``start_loc`` / ``end_loc`` points given as ``[lng, lat]``, and an
    ``"analysis/cleaned_trip"`` entry is created for it with the ``user_id``
    of the last trip in the cluster.
    """
    self.reps = []
    if not self.data:
        return

    for i, cluster in enumerate(self.clusters):
        logging.debug("Considering cluster %d = %s" % (i, cluster))

        # If this cluster has no points, we skip it
        if len(cluster) == 0:
            logging.info("Cluster %d = %s, has length %d, skipping"
                         % (i, cluster, len(cluster)))
            continue

        start_lats, start_lngs, end_lats, end_lngs = [], [], [], []
        points = [start_lats, start_lngs, end_lats, end_lngs]
        for j, c in enumerate(cluster):
            logging.debug("Consider point %d = %s" % (j, c))
            start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                         c.data.start_place)
            end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                       c.data.end_place)
            # Stored coordinates are indexed [lng, lat].
            start_coords = start_place.data.location["coordinates"]
            end_coords = end_place.data.location["coordinates"]
            start_lats.append(start_coords[1])
            start_lngs.append(start_coords[0])
            end_lats.append(end_coords[1])
            end_lngs.append(end_coords[0])
            logging.debug("in representatives, endpoints have len = %s"
                          % len(points))

        # Row-wise mean -> [start lat, start lng, end lat, end lng].
        centers = numpy.mean(points, axis=1)
        logging.debug("For cluster %d, centers are %s" % (i, centers))
        start_loc = gj.Point([centers[1], centers[0]])
        end_loc = gj.Point([centers[3], centers[2]])
        t = ecwt.Trip({"start_loc": start_loc, "end_loc": end_loc})
        # ``c`` is the last trip visited in this cluster.
        self.reps.append(
            ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip", t))
def __init__(self, data, radius):
    """Keep the usable trips from ``data`` and record the binning radius.

    Trips are dropped when ``self.distance`` returns a truthy value for
    their start and end places (logged below as "trips that are points"),
    or when resolving those places raises an exception.

    :param data: list of trip entries. A falsy value is replaced by a new
        empty list; a non-empty list is filtered in place, so the caller
        sees the removals too.
    :param radius: value convertible to ``float``; stored as ``self.radius``.
    """
    self.data = data if data else []
    self.bins = []
    self.radius = float(radius)

    # Walk a copy of the list. Calling remove() on the list being iterated
    # moves later items down one slot, so the item right after each removed
    # trip would never be examined.
    for t in list(self.data):
        logging.debug("Considering trip %s" % t)
        try:
            start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                         t.data.start_place)
            end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                       t.data.end_place)
            start_lon = start_place.data.location["coordinates"][0]
            start_lat = start_place.data.location["coordinates"][1]
            end_lon = end_place.data.location["coordinates"][0]
            end_lat = end_place.data.location["coordinates"][1]
            logging.debug("endpoints are = (%s, %s) and (%s, %s)" %
                          (start_lon, start_lat, end_lon, end_lat))
            drop_trip = self.distance(start_lat, start_lon, end_lat, end_lon)
        except Exception:
            # Unresolvable trips are discarded on purpose; catching Exception
            # instead of using a bare except lets KeyboardInterrupt and
            # SystemExit through.
            logging.exception("exception while getting start and end places for %s" % t)
            drop_trip = True
        if drop_trip:
            self.data.remove(t)

    logging.debug('After removing trips that are points, there are %s data points' % len(self.data))
    self.size = len(self.data)
def evaluate_bins(self):
    """Compute the silhouette score of the trips grouped in ``self.bins``.

    ``self.labels`` is regenerated with one label per binned trip index; the
    label is the position of the bin in ``self.bins``. Each binned trip
    contributes a ``[start_lat, start_lon, end_lat, end_lon]`` row built from
    its cleaned start and end places (``esda.get_entry``), and the rows and
    labels are scored with ``metrics.silhouette_score``.

    :return: the score, or ``None`` if ``self.data`` or ``self.bins`` is
        empty or if fewer than two trips are binned.
    """
    # enumerate() gives each bin its own position as label in a single pass;
    # looking the bin up with list.index() is quadratic and maps bins with
    # equal contents onto the same (first) label.
    self.labels = []
    for label, members in enumerate(self.bins):
        self.labels.extend(label for _ in members)

    if not self.data or not self.bins:
        return None
    if len(self.labels) < 2:
        logging.debug('Everything is in one bin.')
        return None

    labels = numpy.array(self.labels)
    points = []
    for members in self.bins:
        for trip_index in members:
            tb = self.data[trip_index]
            start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                         tb.data.start_place)
            end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                       tb.data.end_place)
            # Coordinates are read as [lon, lat]; rows are stored lat-first.
            start_lon = start_place.data.location["coordinates"][0]
            start_lat = start_place.data.location["coordinates"][1]
            end_lon = end_place.data.location["coordinates"][0]
            end_lat = end_place.data.location["coordinates"][1]
            points.append([start_lat, start_lon, end_lat, end_lon])

    logging.debug("number of labels are %d, number of points are = %d"
                  % (len(labels), len(points)))
    score = metrics.silhouette_score(numpy.array(points), labels)
    logging.debug('number of bins is %d' % len(self.bins))
    # The score is a float; formatting it with %d would log a truncated int.
    logging.debug('silhouette score is %s' % score)
    return score
def get_reps(self):
    """Populate ``self.reps`` with one averaged trip entry per cluster.

    ``self.reps`` starts out empty and is left empty if ``self.data`` is
    empty. Clusters of length zero are logged and skipped. For the others,
    the cleaned start/end places of every trip are looked up with
    ``esda.get_entry``, the four coordinate series are averaged, and an
    ``"analysis/cleaned_trip"`` entry wrapping an ``ecwt.Trip`` (with
    ``start_loc`` / ``end_loc`` given as ``[lng, lat]`` points) is appended.
    The entry is created with the ``user_id`` of the cluster's last trip.
    """
    self.reps = []
    if not self.data:
        return

    for i, cluster in enumerate(self.clusters):
        logging.debug("Considering cluster %d = %s" % (i, cluster))
        # Index 0/1: start lat/lng, index 2/3: end lat/lng.
        points = [[], [], [], []]

        # If this cluster has no points, we skip it
        if len(cluster) == 0:
            logging.info("Cluster %d = %s, has length %d, skipping" % (i, cluster, len(cluster)))
            continue

        for j, c in enumerate(cluster):
            logging.debug("Consider point %d = %s" % (j, c))
            places = (
                esda.get_entry(esda.CLEANED_PLACE_KEY, c.data.start_place),
                esda.get_entry(esda.CLEANED_PLACE_KEY, c.data.end_place),
            )
            for offset, place in zip((0, 2), places):
                # Stored coordinates are indexed [lng, lat].
                points[offset].append(place.data.location["coordinates"][1])
                points[offset + 1].append(place.data.location["coordinates"][0])
            logging.debug("in representatives, endpoints have len = %s" % len(points))

        centers = numpy.mean(points, axis=1)
        logging.debug("For cluster %d, centers are %s" % (i, centers))
        t = ecwt.Trip({
            "start_loc": gj.Point([centers[1], centers[0]]),
            "end_loc": gj.Point([centers[3], centers[2]]),
        })
        # ``c`` still refers to the last trip of the cluster here.
        rep_entry = ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip", t)
        self.reps.append(rep_entry)
def _addIfNotExists(self, place_id):
    """
    Return the place entry for ``place_id``, fetching and caching it on
    first use.

    If ``place_id`` is not yet a key of ``self.id_map``, the entry is
    retrieved with ``esda.get_entry(self.place_key, place_id)``, appended to
    ``self.places`` and stored in ``self.id_map``. Otherwise the cached entry
    is returned and neither collection is modified.

    :param place_id: id of the place to look up
    :return: the place entry associated with ``place_id``
    """
    if place_id in self.id_map:
        return self.id_map[place_id]

    logging.debug("place id %s is not in the map, searching in database" % place_id)
    place_entry = esda.get_entry(self.place_key, place_id)
    self.places.append(place_entry)
    self.id_map[place_id] = place_entry
    logging.debug("retrieved object %s and added to id_map" % place_entry)
    return place_entry
def _addIfNotExists(self, place_id):
    """
    Look up a place by id, loading it into the place list and place map the
    first time it is requested.

    A ``place_id`` already present in ``self.id_map`` is answered from the
    map. Any other id is fetched with
    ``esda.get_entry(self.place_key, place_id)``; the fetched entry is
    appended to ``self.places`` and recorded in ``self.id_map`` before being
    returned.

    :param place_id: id of the place to look up
    :return: the place entry associated with ``place_id``
    """
    try:
        return self.id_map[place_id]
    except KeyError:
        logging.debug(
            "place id %s is not in the map, searching in database" % place_id)

    place_entry = esda.get_entry(self.place_key, place_id)
    self.places.append(place_entry)
    self.id_map[place_id] = place_entry
    logging.debug("retrieved object %s and added to id_map" % place_entry)
    return place_entry
def testGetLastPlace(self):
    """A raw place stops being the "last place" once its exit_ts is saved."""
    # Insert a raw place that has an enter_ts but no exit_ts, then re-save
    # the stored entry.
    open_place = ecwrp.Rawplace()
    open_place.enter_ts = 5
    insert_series = esta.TimeSeries.get_time_series(self.testUserId)
    open_place_id = insert_series.insert_data(
        self.testUserId, "segmentation/raw_place", open_place)
    open_place_entry = esda.get_entry(esda.RAW_PLACE_KEY, open_place_id)
    logging.debug("old place entry is %s "% open_place_entry)
    esta.TimeSeries.get_time_series(self.testUserId).update(open_place_entry)

    # The place saved in the previous step has no exit_ts set, so it is the
    # last place
    current_last = esdp.get_last_place_entry(esda.RAW_PLACE_KEY,
                                             self.testUserId)
    current_last["data"]["exit_ts"] = 6
    logging.debug("About to update entry to %s" % current_last)
    esta.TimeSeries.get_time_series(self.testUserId).update(current_last)

    # Now that I have set the exit_ts and saved it, there is no last place
    self.assertIsNone(
        esdp.get_last_place_entry(esda.RAW_PLACE_KEY, self.testUserId))
def testGetLastPlace(self):
    """Closing the only open raw place leaves the user with no last place."""
    user = self.testUserId

    # Step 1: store a raw place with only an enter_ts and re-save its entry.
    place = ecwrp.Rawplace()
    place.enter_ts = 5
    place_id = esta.TimeSeries.get_time_series(user).insert_data(
        user, "segmentation/raw_place", place)
    place_entry = esda.get_entry(esda.RAW_PLACE_KEY, place_id)
    logging.debug("old place entry is %s " % place_entry)
    esta.TimeSeries.get_time_series(user).update(place_entry)

    # Step 2: The place saved in the previous step has no exit_ts set, so it
    # is the last place. Give it an exit_ts and save it again.
    last_entry = esdp.get_last_place_entry(esda.RAW_PLACE_KEY, user)
    last_entry["data"]["exit_ts"] = 6
    logging.debug("About to update entry to %s" % last_entry)
    esta.TimeSeries.get_time_series(user).update(last_entry)

    # Step 3: Now that I have set the exit_ts and saved it, there is no last
    # place
    last_entry = esdp.get_last_place_entry(esda.RAW_PLACE_KEY, user)
    self.assertIsNone(last_entry)
def get_last_place_before(place_key, reset_ts, user_id):
    """
    Unlike `get_last_place_entry`, which returns the last place in the
    timeline, this returns the last place before a particular timestamp. Used
    to reset the pipeline, for example.

    To implement this, we can't just look for places before that timestamp,
    because then we will get a list. And we don't want to retrieve all of them
    and sort either.

    We can look for places that exit after that timestamp, but that will also
    give a list. But hopefully, a shorter list, so that we don't have to sort
    as much. I can't think of an alternative that doesn't require sorting.

    Oh wait! There is an alternative!

    We can look for the place that has an enter timestamp before the ts and an
    exit timestamp after, or a trip that has a start timestamp before the ts
    and an end timestamp after. We should only find one. And if we find the
    trip then the place is its start place.

    Note that these correspond to the two use cases in
    https://github.com/e-mission/e-mission-server/issues/333

    :param place_key: metadata key of the place entries to search
    :param reset_ts: timestamp that the place (or trip) must straddle
    :param user_id: user whose analysis timeseries is searched
    :return: an ``ecwe.Entry`` for the straddling place (or for the start
        place of the straddling trip). When nothing straddles ``reset_ts``:
        the entry from ``get_last_place_entry`` if it was entered before
        ``reset_ts``, or ``None`` if there is no such entry or it has no
        ``enter_ts``.
    :raises ValueError: if nothing straddles ``reset_ts`` and the last place
        was not entered before it
    """
    trip_key_query = _get_trip_key_query(place_key)
    logging.debug("Looking for last place before %s" % reset_ts)
    ts = esta.TimeSeries.get_time_series(user_id)

    # This listing only feeds a debug message, so skip the extra query over
    # all of the user's places unless debug logging is actually enabled.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        all_user_places = list(edb.get_analysis_timeseries_db().find(
            {"user_id": user_id, "metadata.key": place_key},
            {"_id": True, "data.enter_fmt_time": True,
             "data.exit_fmt_time": True}))
        logging.debug("all places for this user = %s" % all_user_places)

    ret_place_doc = ts.analysis_timeseries_db.find_one({
        'user_id': user_id,
        'metadata.key': place_key,
        'data.exit_ts': {'$gt': reset_ts},
        'data.enter_ts': {'$lt': reset_ts},
    })
    logging.debug("last place doc for user %s = %s" % (user_id, ret_place_doc))

    ret_trip_doc = ts.analysis_timeseries_db.find_one({
        'user_id': user_id,
        'metadata.key': trip_key_query,
        'data.end_ts': {'$gt': reset_ts},
        'data.start_ts': {'$lt': reset_ts},
    })
    logging.debug("last trip doc for user %s = %s" % (user_id, ret_trip_doc))

    if ret_place_doc is None and ret_trip_doc is None:
        # Check to see if the pipeline ended before this
        last_place = get_last_place_entry(place_key, user_id)
        logging.debug("last_place = %s, reset_ts = %s" % (last_place, reset_ts))
        if last_place is None:
            return None
        elif last_place.data.enter_ts is None:
            # A place without an enter_ts cannot be ordered against reset_ts
            # (None < number raises TypeError on Python 3).
            return None
        elif last_place.data.enter_ts < reset_ts:
            return last_place
        else:
            raise ValueError("No trip or place straddling time %s for user %s"
                             % (reset_ts, user_id))

    if ret_place_doc is None:
        # Only a straddling trip was found: use the place it started from.
        assert ret_trip_doc is not None
        logging.info("ret_trip_doc start = %s, end = %s" %
                     (ret_trip_doc["data"]["start_fmt_time"],
                      ret_trip_doc["data"]["end_fmt_time"]))
        ret_place_doc = esda.get_entry(place_key,
                                       ret_trip_doc["data"]['start_place'])

    assert ret_place_doc is not None
    ret_place = ecwe.Entry(ret_place_doc)
    return ret_place
def get_last_place_before(place_key, reset_ts, user_id):
    """
    Unlike `get_last_place_entry`, which returns the last place in the
    timeline, this returns the last place before a particular timestamp. Used
    to reset the pipeline, for example.

    To implement this, we can't just look for places before that timestamp,
    because then we will get a list. And we don't want to retrieve all of them
    and sort either.

    We can look for places that exit after that timestamp, but that will also
    give a list. But hopefully, a shorter list, so that we don't have to sort
    as much. I can't think of an alternative that doesn't require sorting.

    Oh wait! There is an alternative!

    We can look for the place that has an enter timestamp before the ts and an
    exit timestamp after, or a trip that has a start timestamp before the ts
    and an end timestamp after. We should only find one. And if we find the
    trip then the place is its start place.

    Note that these correspond to the two use cases in
    https://github.com/e-mission/e-mission-server/issues/333

    :param place_key: metadata key of the place entries to search
    :param reset_ts: timestamp that the place (or trip) must straddle
    :param user_id: user whose analysis timeseries is searched
    :return: an ``ecwe.Entry`` for the straddling place (or for the start
        place of the straddling trip). When nothing straddles ``reset_ts``:
        the entry from ``get_last_place_entry`` if it was entered before
        ``reset_ts``, or ``None`` if there is no such entry or it has no
        ``enter_ts``.
    :raises ValueError: if nothing straddles ``reset_ts`` and the last place
        was not entered before it
    """
    trip_key_query = _get_trip_key_query(place_key)
    logging.debug("Looking for last place before %s" % reset_ts)
    ts = esta.TimeSeries.get_time_series(user_id)

    # The full list of the user's places is only used for a debug message;
    # do not pay for the query when debug logging is disabled.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        place_projection = {"_id": True,
                            "data.enter_fmt_time": True,
                            "data.exit_fmt_time": True}
        all_user_places = list(edb.get_analysis_timeseries_db().find(
            {"user_id": user_id, "metadata.key": place_key},
            place_projection))
        logging.debug("all places for this user = %s" % all_user_places)

    # A place that was entered before reset_ts and exited after it ...
    ret_place_doc = ts.analysis_timeseries_db.find_one({
        'user_id': user_id,
        'metadata.key': place_key,
        'data.exit_ts': {'$gt': reset_ts},
        'data.enter_ts': {'$lt': reset_ts},
    })
    logging.debug("last place doc for user %s = %s" % (user_id, ret_place_doc))

    # ... or a trip that started before reset_ts and ended after it.
    ret_trip_doc = ts.analysis_timeseries_db.find_one({
        'user_id': user_id,
        'metadata.key': trip_key_query,
        'data.end_ts': {'$gt': reset_ts},
        'data.start_ts': {'$lt': reset_ts},
    })
    logging.debug("last trip doc for user %s = %s" % (user_id, ret_trip_doc))

    if ret_place_doc is None and ret_trip_doc is None:
        # Check to see if the pipeline ended before this
        last_place = get_last_place_entry(place_key, user_id)
        logging.debug("last_place = %s, reset_ts = %s" % (last_place, reset_ts))
        if last_place is None:
            return None
        elif last_place.data.enter_ts is None:
            return None
        elif last_place.data.enter_ts < reset_ts:
            return last_place
        else:
            raise ValueError("No trip or place straddling time %s for user %s"
                             % (reset_ts, user_id))

    if ret_place_doc is None:
        # Only a straddling trip was found: use the place it started from.
        assert ret_trip_doc is not None
        logging.info("ret_trip_doc start = %s, end = %s" %
                     (ret_trip_doc["data"]["start_fmt_time"],
                      ret_trip_doc["data"]["end_fmt_time"]))
        ret_place_doc = esda.get_entry(place_key,
                                       ret_trip_doc["data"]['start_place'])

    assert ret_place_doc is not None
    ret_place = ecwe.Entry(ret_place_doc)
    return ret_place
def reset_user_to_ts(user_id, ts, is_dry_run):
    """
    Reset a user's analysis results back to the place that precedes ``ts``.

    Deleting objects must leave an open connection to the prior chain so that
    the newly created chain can be attached to it. For example, to delete
    everything after 2016-07-23, the place entered on 2016-07-22 is retained
    but reopened (no starting trip), so that the next identified trip can be
    joined to it.

    The use cases are documented under
    https://github.com/e-mission/e-mission-server/issues/333
    In short:
    a) find the place before the time
    b) clear all analysis results after it
    c) open the place
    d) reset pipeline states to its enter_ts

    FYI: this is how we did the query earlier
    edb.get_analysis_timeseries_db().find(first_affected_query).sort('data.exit_ts').limit(1)

    :param user_id: user to reset; ``None`` is logged and ignored
    :param ts: timestamp to reset to
    :param is_dry_run: passed through to every delete/reset helper
    :raises ValueError: re-raised from ``esdp.get_last_place_before`` unless
        the user's first cleaned place exits after ``ts`` (in which case the
        user is reset to the start instead)
    """
    if user_id is None:
        logging.info("user_id = None, skipping reset...")
        return

    # Find the place before the time
    try:
        last_cleaned_place = esdp.get_last_place_before(
            esda.CLEANED_PLACE_KEY, ts, user_id)
        logging.debug("last_cleaned_place = %s" % last_cleaned_place)
        if last_cleaned_place is None or last_cleaned_place.data.exit_ts is None:
            logging.info("Data collection for user %s stopped before reset time, early return" % user_id)
            return
    except ValueError:
        first_cleaned_place = esdp.get_first_place_entry(
            esda.CLEANED_PLACE_KEY, user_id)
        # Only recoverable when ts falls before the end of the very first
        # place; anything else is propagated unchanged.
        if first_cleaned_place is None or not (first_cleaned_place.data.exit_ts > ts):
            raise
        logging.info("first_cleaned_place.exit = %s (%s), resetting to start" %
                     (first_cleaned_place.data.exit_ts,
                      first_cleaned_place.data.exit_fmt_time))
        reset_user_to_start(user_id, is_dry_run)
        return

    raw_place_ids = last_cleaned_place["data"]["raw_places"]

    last_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, raw_place_ids[-1])
    logging.debug("last_raw_place = %s" % last_raw_place)

    # Reason for using first_raw_place is
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312735236
    first_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, raw_place_ids[0])
    logging.debug("first_raw_place = %s" % first_raw_place)

    reset_ts = first_raw_place.data.enter_ts
    logging.debug("last_place_enter_ts = %s" % reset_ts)
    logging.debug("reset_ts = %s" % reset_ts)

    # clear all analysis results after it
    del_objects_after(user_id, reset_ts, is_dry_run)

    # open the raw and cleaned places
    for place in (last_cleaned_place, last_raw_place):
        reset_last_place(place, is_dry_run)

    # reset pipeline states to its enter_ts
    reset_pipeline_state(user_id, reset_ts, is_dry_run)
if last_cleaned_place is None or last_cleaned_place.data.exit_ts is None: logging.info("Data collection for user %s stopped before reset time, early return" % user_id) return except ValueError, e: first_cleaned_place = esdp.get_first_place_entry(esda.CLEANED_PLACE_KEY, user_id) if first_cleaned_place is not None and first_cleaned_place.data.exit_ts > ts: logging.info("first_cleaned_place.exit = %s (%s), resetting to start" % (first_cleaned_place.data.exit_ts, first_cleaned_place.data.exit_fmt_time)) reset_user_to_start(user_id, is_dry_run) return else: raise last_raw_place_id = last_cleaned_place["data"]["raw_places"][-1] last_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, last_raw_place_id) logging.debug("last_raw_place = %s" % last_raw_place) # Reason for using first_raw_place is # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312735236 first_raw_place_id = last_cleaned_place["data"]["raw_places"][0] first_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, first_raw_place_id) logging.debug("first_raw_place = %s" % first_raw_place) last_place_enter_ts = first_raw_place.data.enter_ts logging.debug("last_place_enter_ts = %s" % last_place_enter_ts) reset_ts = last_place_enter_ts logging.debug("reset_ts = %s" % last_place_enter_ts) # clear all analysis results after it