def continue_just_ended(self, idx, currPoint, filtered_points_df):
        """
        Decide whether the current point should be appended to the trip that
        has just ended instead of becoming the start of a new trip.

        The point immediately before ``idx`` in ``filtered_points_df`` is
        compared with ``currPoint``. If the two are closer than
        ``self.distance_threshold``, the current point is treated as a
        leftover of the just-ended trip. The time gap between the two points
        is only reported in the debug log; it does not influence the decision,
        because with a distance filter noisy points may show up arbitrarily
        late.

        :param idx: Index of the current point
        :param currPoint: current point
        :param filtered_points_df: dataframe of filtered points
        :return: True if we should continue the just ended trip, False otherwise
        """
        # The very first point has no predecessor to compare against
        if idx == 0:
            return False

        previous = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        within_distance = pf.calDistance(previous, currPoint) < self.distance_threshold
        within_time = currPoint.ts - previous.ts <= self.time_threshold
        logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" %
                      (previous, within_distance, within_time))

        # Only the distance matters here; the time check above is informational
        if not within_distance:
            return False

        logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                     (previous, currPoint))
        return True
    def continue_just_ended(self, idx, currPoint, filtered_points_df):
        """
        Decide whether the current point should be appended to the trip that
        has just ended instead of becoming the start of a new trip.

        Occasionally a couple of points arrive after the trip end detected
        here. Such a point is joined to the just-ended trip when it is closer
        than ``self.distance_threshold`` to the preceding point and at most
        60 seconds after it.

        :param idx: Index of the current point
        :param currPoint: current point
        :param filtered_points_df: dataframe of filtered points
        :return: True if we should continue the just ended trip, False otherwise
        """
        # The very first point has no predecessor to compare against
        if idx == 0:
            return False

        previous = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        logging.debug("Comparing with prev_point = %s" % previous)

        is_close = pf.calDistance(previous, currPoint) < self.distance_threshold
        if not is_close:
            return False

        # Evaluated only for nearby points, mirroring the short-circuit `and`
        is_recent = currPoint.ts - previous.ts <= 60
        if not is_recent:
            return False

        logging.info("Points %s and %s are within the distance filter and only 1 min apart so part of the same trip" %
                     (previous, currPoint))
        return True
Exemplo n.º 3
0
def recalc_speed(points_df):
    """
    Rebuild the "distance" and "speed" columns of a points dataframe.

    The incoming dataframe must already carry "speed" and "distance" columns;
    both are discarded and recomputed from consecutive pairs of rows.
    Each row holds the distance/speed between itself and the previous row,
    and the first row gets zero for both.
    """
    base_df = points_df.drop(["speed", "distance"], axis=1)
    logging.debug("columns in points_df = %s" % points_df.columns)

    points = [ad.AttrDict(record) for record in points_df.to_dict('records')]
    # (previous, current) pairs; one fewer than the number of points
    consecutive_pairs = list(zip(points, points[1:]))

    distance_col = pd.Series(
        [0] + [pf.calDistance(prev, curr) for (prev, curr) in consecutive_pairs],
        index=points_df.index, name="distance")
    speed_col = pd.Series(
        [0] + [pf.calSpeed(prev, curr) for (prev, curr) in consecutive_pairs],
        index=points_df.index, name="speed")

    # All three pieces share points_df's index, so this is a plain column append
    return pd.concat([base_df, distance_col, speed_col], axis=1)
Exemplo n.º 4
0
    def continue_just_ended(self, idx, currPoint, filtered_points_df):
        """
        Decide whether the current point should be appended to the trip that
        has just ended instead of becoming the start of a new trip.

        Occasionally a couple of points arrive after the trip end detected
        here. Such a point is joined to the just-ended trip when it is closer
        than ``self.distance_threshold`` to the preceding point and at most
        60 seconds after it.

        :param idx: Index of the current point
        :param currPoint: current point
        :param filtered_points_df: dataframe of filtered points
        :return: True if we should continue the just ended trip, False otherwise
        """
        # Nothing precedes the first point, so it can never continue a trip
        if idx == 0:
            return False

        preceding = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        logging.debug("Comparing with prev_point = %s" % preceding)

        nearby = pf.calDistance(preceding, currPoint) < self.distance_threshold
        # The time gap is only looked at for nearby points (short-circuit)
        if nearby and currPoint.ts - preceding.ts <= 60:
            message = "Points %s and %s are within the distance filter and only 1 min apart so part of the same trip"
            logging.info(message % (preceding, currPoint))
            return True
        return False
Exemplo n.º 5
0
def add_dist_heading_speed(points_df):
    # type: (pandas.DataFrame) -> pandas.DataFrame
    """
    Return a new dataframe with "distance", "speed" and "heading" columns
    appended to the columns of ``points_df``.

    Each value is computed between a row and the row before it; the first
    row gets zero for all three. An existing "heading" column is replaced
    by the recomputed one.
    """
    points = [ad.AttrDict(record) for record in points_df.to_dict('records')]
    # (previous, current) pairs; one fewer than the number of points
    consecutive_pairs = list(zip(points, points[1:]))

    distance_col = pd.Series(
        [0] + [pf.calDistance(prev, curr) for (prev, curr) in consecutive_pairs],
        name="distance")
    speed_col = pd.Series(
        [0] + [pf.calSpeed(prev, curr) for (prev, curr) in consecutive_pairs],
        name="speed")
    heading_col = pd.Series(
        [0] + [pf.calHeading(prev, curr) for (prev, curr) in consecutive_pairs],
        name="heading")

    augmented_df = pd.concat([points_df, distance_col, speed_col], axis=1)
    # Drop any stale heading so that the recomputed column is the only one
    if "heading" in augmented_df.columns:
        augmented_df = augmented_df.drop("heading", axis=1)
    return pd.concat([augmented_df, heading_col], axis=1)
    def continue_just_ended(self, idx, currPoint, filtered_points_df):
        """
        Decide whether the current point should be appended to the trip that
        has just ended instead of becoming the start of a new trip.

        The point immediately before ``idx`` in ``filtered_points_df`` is
        compared with ``currPoint``. If the two are closer than
        ``self.distance_threshold``, the current point is treated as a
        leftover of the just-ended trip. The time gap is only written to the
        debug log: with a distance filter even noisy points can occur at an
        arbitrary time in the future, so time is not part of the decision.

        :param idx: Index of the current point
        :param currPoint: current point
        :param filtered_points_df: dataframe of filtered points
        :return: True if we should continue the just ended trip, False otherwise
        """
        # Nothing precedes the first point, so it can never continue a trip
        if idx == 0:
            return False

        preceding = ad.AttrDict(filtered_points_df.iloc[idx - 1])
        is_close = pf.calDistance(preceding, currPoint) < self.distance_threshold
        is_recent = currPoint.ts - preceding.ts <= self.time_threshold
        debug_template = "Comparing with lastPoint = %s, distance = %s, time = %s"
        logging.debug(debug_template % (preceding, is_close, is_recent))

        if is_close:
            info_template = "Points %s and %s are within the distance filter so part of the same trip"
            logging.info(info_template % (preceding, currPoint))
            return True
        return False
def recalc_speed(points_df):
    """
    The input dataframe already has "speed" and "distance" columns.
    Drop them and recalculate speeds from the first point onwards.
    The speed column has the speed between each point and its previous point.
    The first row has a speed of zero.

    :param points_df: dataframe of points with "speed" and "distance" columns
    :return: a new dataframe with the same index and freshly computed
        "distance" and "speed" columns
    """
    stripped_df = points_df.drop("speed", axis=1).drop("distance", axis=1)
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    # Materialize the pairs: on Python 3 zip() is a one-shot iterator, so
    # iterating it for the distances would leave nothing for the speeds
    zipped_points_list = list(zip(point_list, point_list[1:]))
    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    # The first point has no predecessor, so its distance is zero
    distances.insert(0, 0)
    with_speeds_df = pd.concat([stripped_df, pd.Series(distances, index=points_df.index, name="distance")], axis=1)
    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    # Likewise, the first point has a speed of zero
    speeds.insert(0, 0)
    with_speeds_df = pd.concat([with_speeds_df, pd.Series(speeds, index=points_df.index, name="speed")], axis=1)
    return with_speeds_df
    def has_trip_ended(self, prev_point, curr_point, last10PointsDistances, last5MinsDistances, last5MinTimes):
        """
        Decide whether the trip in progress has ended at ``curr_point``.

        Two checks are made, in order:

        1. If there is a previous point, the gap to it exceeds twice
           ``self.time_threshold`` and the implied speed is below
           ``distance_threshold / time_threshold``, the trip has ended.
        2. Otherwise, provided there is enough recent data to decide, the
           trip has ended when every distance in both windows is below
           ``self.distance_threshold``.

        :param prev_point: previous point, or None if there is none
        :param curr_point: current point
        :param last10PointsDistances: distances for the recent-points window
            (``len()`` and ``.max()`` are used)
        :param last5MinsDistances: distances for the recent-time window
            (``len()`` and ``.max()`` are used)
        :param last5MinTimes: time deltas for the recent-time window
            (``.max()`` is used)
        :return: True if the trip has ended, False otherwise
        """
        # Another mismatch between phone and server. Phone stops tracking too soon,
        # so the distance is still greater than the threshold at the end of the trip.
        # But then the next point is a long time away, so we can split again (similar to a distance filter)
        if prev_point is None:
            logging.debug("prev_point is None, continuing trip")
        else:
            timeDelta = curr_point.ts - prev_point.ts
            distDelta = pf.calDistance(prev_point, curr_point)
            if timeDelta > 0:
                speedDelta = distDelta / timeDelta
            else:
                # NaN compares False against the threshold below
                speedDelta = np.nan
            speedThreshold = float(self.distance_threshold) / self.time_threshold
            if (timeDelta > 2 * self.time_threshold and # We have been here for a while
                 speedDelta < speedThreshold): # we haven't moved very much
                logging.debug("prev_point.ts = %s, curr_point.ts = %s, threshold = %s, large gap = %s, ending trip" %
                              (prev_point.ts, curr_point.ts,self.time_threshold, curr_point.ts - prev_point.ts))
                return True
            else:
                logging.debug("prev_point.ts = %s, curr_point.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                              (prev_point.ts, curr_point.ts,
                               timeDelta, self.time_threshold,
                               distDelta, self.distance_threshold,
                               speedDelta, speedThreshold))

        # The -30 is a fuzz factor intended to compensate for older clients
        # where data collection stopped after 5 mins, so that we never actually
        # see 5 mins of data

        if (len(last10PointsDistances) < self.point_threshold - 1 or
                    len(last5MinsDistances) == 0 or
                    last5MinTimes.max() < self.time_threshold - 30):
            logging.debug("Too few points to make a decision, continuing")
            return False

        # Normal end-of-trip case
        logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                      (last5MinsDistances.max(), last10PointsDistances.max()))
        if (last5MinsDistances.max() < self.distance_threshold and
            last10PointsDistances.max() < self.distance_threshold):
                return True

        # Still moving: return an explicit boolean rather than falling off
        # the end of the method with None
        return False
def add_dist_heading_speed(points_df):
    """
    Returns a new dataframe with added "distance", "speed" and "heading"
    columns.
    Each column has the value between each point and its previous point.
    The first row has a value of zero in all three columns.

    :param points_df: dataframe of points
    :return: a new dataframe with the three columns appended
    """
    point_list = [ad.AttrDict(row) for row in points_df.to_dict('records')]
    # Materialize the pairs: on Python 3 zip() is a one-shot iterator, so
    # the speed and heading comprehensions would otherwise see no pairs
    zipped_points_list = list(zip(point_list, point_list[1:]))

    distances = [pf.calDistance(p1, p2) for (p1, p2) in zipped_points_list]
    # The first point has no predecessor, so it gets zero in each column
    distances.insert(0, 0)
    speeds = [pf.calSpeed(p1, p2) for (p1, p2) in zipped_points_list]
    speeds.insert(0, 0)
    headings = [pf.calHeading(p1, p2) for (p1, p2) in zipped_points_list]
    headings.insert(0, 0)

    with_distances_df = pd.concat([points_df, pd.Series(distances, name="distance")], axis=1)
    with_speeds_df = pd.concat([with_distances_df, pd.Series(speeds, name="speed")], axis=1)
    with_headings_df = pd.concat([with_speeds_df, pd.Series(headings, name="heading")], axis=1)
    return with_headings_df
    def has_trip_ended(self, lastPoint, currPoint, timeseries):
        """
        Decide whether the trip in progress ended between ``lastPoint`` and
        ``currPoint``.

        Nothing is considered unless the time gap between the two points
        exceeds ``self.time_threshold``. Past that, the trip is ended when any
        of the following holds, checked in this order: tracking was restarted
        in the gap, there was no ongoing motion in the gap, the gap exceeds
        twelve hours, or the implied speed is below the speed threshold.

        :param lastPoint: the previous point
        :param currPoint: the current point
        :param timeseries: timeseries passed through to the restart and
            ongoing-motion lookups
        :return: True if the trip has ended, False otherwise
        """
        # So we must not have been moving for the last _time filter_
        # points. So the trip must have ended
        # Since this is a distance filter, we detect that the last
        # trip has ended at the time that the new trip starts. So
        # if the last_trip_end_point is lastPoint, then
        # curr_trip_start_point should be currPoint. But then we will
        # have problems with the spurious, noisy points that are
        # generated until the geofence is turned on, if ever
        # So we will continue to defer new trip starting until we
        # have worked through all of those.
        timeDelta = currPoint.ts - lastPoint.ts
        distDelta = pf.calDistance(lastPoint, currPoint)
        logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                      (lastPoint, timeDelta, distDelta))
        if timeDelta > self.time_threshold:
            # We have been at this location for more than the time filter.
            # This could be because we have not been moving for the last
            # _time filter_ points, or because we didn't get points for
            # that duration, (e.g. because we were underground)
            if timeDelta > 0:
                speedDelta = distDelta / timeDelta
            else:
                # NaN compares False against the threshold below
                speedDelta = np.nan
            # this is way too slow. On ios, we use 5meters in 10 minutes.
            # On android, we use 10 meters in 5 mins, which seems to work better
            # for this kind of test
            speedThreshold = float(self.distance_threshold * 2) / (self.time_threshold / 2)

            if eaisr.is_tracking_restarted_in_range(lastPoint.ts, currPoint.ts, timeseries):
                logging.debug("tracking was restarted, ending trip")
                return True

            # In general, we get multiple locations between each motion activity. If we see a bunch of motion activities
            # between two location points, and there is a large gap between the last location and the first
            # motion activity as well, let us just assume that there was a restart
            ongoing_motion_check = len(eaisr.get_ongoing_motion_in_range(lastPoint.ts, currPoint.ts, timeseries)) > 0
            if timeDelta > self.time_threshold and not ongoing_motion_check:
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip" %
                              (lastPoint.ts, currPoint.ts,self.time_threshold, currPoint.ts - lastPoint.ts, ongoing_motion_check))
                return True

            # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
            # Longest flight is 17 hours, which is the longest you can go without cell reception
            # And even if you split an air flight that long into two, you will get some untracked time in the
            # middle, so that's good.
            TWELVE_HOURS = 12 * 60 * 60
            if timeDelta > TWELVE_HOURS:
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip" %
                              (lastPoint.ts, currPoint.ts, TWELVE_HOURS, currPoint.ts - lastPoint.ts))
                return True

            if (timeDelta > self.time_threshold and # We have been here for a while
                        speedDelta < speedThreshold): # we haven't moved very much
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ending trip" %
                              (lastPoint.ts, currPoint.ts,self.time_threshold, currPoint.ts - lastPoint.ts))
                return True
            else:
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                              (lastPoint.ts, currPoint.ts,
                               timeDelta, self.time_threshold,
                               distDelta, self.distance_threshold,
                               speedDelta, speedThreshold))
                return False

        # The gap is within the time threshold: the trip continues. Return an
        # explicit boolean rather than falling off the end with None
        return False
    def has_trip_ended(self, lastPoint, currPoint, timeseries):
        """
        Decide whether the trip in progress ended between ``lastPoint`` and
        ``currPoint``.

        Nothing is considered unless the time gap between the two points
        exceeds ``self.time_threshold``. Past that, the trip is ended when any
        of the following holds, checked in this order: tracking was restarted
        in the gap, there was no ongoing motion in the gap, the gap exceeds
        twelve hours, or the implied speed is below the speed threshold.

        In the last case, if the gap is identified as a huge invalid timestamp
        offset, the current point is instead marked invalid (both in
        ``self.filtered_points_df`` and through
        ``timeseries.invalidate_raw_entry``) and the trip continues.

        :param lastPoint: the previous point
        :param currPoint: the current point; its ``idx`` is used as a
            positional row index into ``self.filtered_points_df``
        :param timeseries: timeseries used for the restart and ongoing-motion
            lookups and for invalidating raw entries
        :return: True if the trip has ended, False otherwise
        """
        # So we must not have been moving for the last _time filter_
        # points. So the trip must have ended
        # Since this is a distance filter, we detect that the last
        # trip has ended at the time that the new trip starts. So
        # if the last_trip_end_point is lastPoint, then
        # curr_trip_start_point should be currPoint. But then we will
        # have problems with the spurious, noisy points that are
        # generated until the geofence is turned on, if ever
        # So we will continue to defer new trip starting until we
        # have worked through all of those.
        timeDelta = currPoint.ts - lastPoint.ts
        distDelta = pf.calDistance(lastPoint, currPoint)
        logging.debug(
            "lastPoint = %s, time difference = %s dist difference %s" %
            (lastPoint, timeDelta, distDelta))
        if timeDelta > self.time_threshold:
            # We have been at this location for more than the time filter.
            # This could be because we have not been moving for the last
            # _time filter_ points, or because we didn't get points for
            # that duration, (e.g. because we were underground)
            if timeDelta > 0:
                speedDelta = old_div(distDelta, timeDelta)
            else:
                # NaN compares False against the threshold below
                speedDelta = np.nan
            # this is way too slow. On ios, we use 5meters in 10 minutes.
            # On android, we use 10 meters in 5 mins, which seems to work better
            # for this kind of test
            speedThreshold = old_div(float(self.distance_threshold * 2),
                                     (old_div(self.time_threshold, 2)))

            if eaisr.is_tracking_restarted_in_range(lastPoint.ts, currPoint.ts,
                                                    timeseries):
                logging.debug("tracking was restarted, ending trip")
                return True

            # In general, we get multiple locations between each motion activity. If we see a bunch of motion activities
            # between two location points, and there is a large gap between the last location and the first
            # motion activity as well, let us just assume that there was a restart
            ongoing_motion_in_range = eaisr.get_ongoing_motion_in_range(
                lastPoint.ts, currPoint.ts, timeseries)
            ongoing_motion_check = len(ongoing_motion_in_range) > 0
            if timeDelta > self.time_threshold and not ongoing_motion_check:
                logging.debug(
                    "lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip"
                    % (lastPoint.ts, currPoint.ts, self.time_threshold,
                       currPoint.ts - lastPoint.ts, ongoing_motion_check))
                return True

            # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
            # Longest flight is 17 hours, which is the longest you can go without cell reception
            # And even if you split an air flight that long into two, you will get some untracked time in the
            # middle, so that's good.
            TWELVE_HOURS = 12 * 60 * 60
            if timeDelta > TWELVE_HOURS:
                logging.debug(
                    "lastPoint.ts = %s, currPoint.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip"
                    % (lastPoint.ts, currPoint.ts, TWELVE_HOURS,
                       currPoint.ts - lastPoint.ts))
                return True

            if (timeDelta > self.time_threshold
                    and  # We have been here for a while
                    speedDelta < speedThreshold):  # we haven't moved very much
                # This can happen even during ongoing trips due to spurious points
                # generated on some iOS phones
                # https://github.com/e-mission/e-mission-server/issues/577#issuecomment-376379460
                if eaistc.is_huge_invalid_ts_offset(self, lastPoint, currPoint,
                                                    timeseries,
                                                    ongoing_motion_in_range):
                    # invalidate from memory and the database.
                    logging.debug("About to set valid column for index = %s" %
                                  (currPoint.idx))
                    # Assign through a single .iloc call on the dataframe.
                    # The chained form (df.valid.iloc[i] = ...) may write to a
                    # temporary copy and leave the dataframe unchanged
                    valid_col = self.filtered_points_df.columns.get_loc("valid")
                    self.filtered_points_df.iloc[currPoint.idx, valid_col] = False
                    logging.debug("After dropping %d, filtered points = %s" %
                                  (currPoint.idx, self.filtered_points_df.
                                   iloc[currPoint.idx - 5:currPoint.idx +
                                        5][["valid", "fmt_time"]]))
                    logging.debug("remove huge invalid ts offset point = %s" %
                                  currPoint)
                    timeseries.invalidate_raw_entry(currPoint["_id"])
                    # We currently re-retrieve the last point every time, so
                    # searching upwards is good enough but if we use
                    # lastPoint = currPoint, we should update currPoint here
                    return False
                else:
                    logging.debug(
                        "lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ending trip"
                        % (lastPoint.ts, currPoint.ts, self.time_threshold,
                           currPoint.ts - lastPoint.ts))
                    return True
            else:
                logging.debug(
                    "lastPoint.ts = %s, currPoint.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip"
                    % (lastPoint.ts, currPoint.ts, timeDelta,
                       self.time_threshold, distDelta, self.distance_threshold,
                       speedDelta, speedThreshold))
                return False

        # The gap is within the time threshold: the trip continues. Return an
        # explicit boolean rather than falling off the end with None
        return False
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        As a side effect, ``self.last_ts_processed`` is reset to None and then
        updated to the ``metadata_write_ts`` of the last point that was
        consumed into a completed (or continued) trip.

        :param timeseries: source of the "background/filtered_location" and
            "statemachine/transition" dataframes
        :param time_query: time range passed through to ``get_data_df``
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_pre_ts_diff_df = timeseries.get_data_df("background/filtered_location", time_query)
        # Sometimes, we can get bogus points because data.ts and
        # metadata.write_ts are off by a lot. If we don't do this, we end up
        # appearing to travel back in time
        # https://github.com/e-mission/e-mission-server/issues/457
        # The index is reset (old index kept as a column) so that positions and
        # labels agree below; done as a new frame rather than in place on a slice
        filtered_points_df = filtered_points_pre_ts_diff_df[(filtered_points_pre_ts_diff_df.metadata_write_ts - filtered_points_pre_ts_diff_df.ts) < 1000].reset_index()
        transition_df = timeseries.get_data_df("statemachine/transition", time_query)
        if len(transition_df) > 0:
            logging.debug("transition_df = %s" % transition_df[["fmt_time", "transition"]])
        else:
            logging.debug("no transitions found. This can happen for continuous sensing")

        self.last_ts_processed = None

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        # Starting in the "just ended" state makes the first point a trip start
        just_ended = True
        prevPoint = None
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                logging.debug("Appending currPoint because the current start point is None")
                # segmentation_points.append(currPoint)

            if just_ended:
                if self.continue_just_ended(idx, currPoint, filtered_points_df):
                    # We have "processed" the currPoint by deciding to glom it
                    self.last_ts_processed = currPoint.metadata_write_ts
                    continue
                # else:
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False

            # Points of the current trip within time_threshold before currPoint
            last5MinsPoints_df = filtered_points_df[np.logical_and(
                                                        np.logical_and(
                                                                filtered_points_df.ts > currPoint.ts - self.time_threshold,
                                                                filtered_points_df.ts < currPoint.ts),
                                                        filtered_points_df.ts >= curr_trip_start_point.ts)]
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            # We are going to use the last 8 points for now.
            # TODO: Change this back to last 10 points once we normalize phone and this
            last10Points_df = filtered_points_df.iloc[max(idx-self.point_threshold, curr_trip_start_point.idx):idx+1]
            distanceToLast = lambda row: pf.calDistance(ad.AttrDict(row), currPoint)
            timeToLast = lambda row: currPoint.ts - ad.AttrDict(row).ts
            last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
            # .values instead of .as_matrix(), which was removed in pandas 1.0
            logging.debug("last5MinsDistances = %s with length %d" % (last5MinsDistances.values, len(last5MinsDistances)))
            last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
            logging.debug("last10PointsDistances = %s with length %d, shape %s" % (last10PointsDistances.values,
                                                                           len(last10PointsDistances),
                                                                           last10PointsDistances.shape))

            # Fix for https://github.com/e-mission/e-mission-server/issues/348
            last5MinTimes = last5MinsPoints_df.apply(timeToLast, axis=1)

            logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                  (len(last10PointsDistances), len(last5MinsDistances)))
            # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
            logging.debug("last5MinsTimes.max() = %s, time_threshold = %s" %
                          (last5MinTimes.max() if len(last5MinTimes) > 0 else np.nan, self.time_threshold))

            if self.has_trip_ended(prevPoint, currPoint, timeseries, last10PointsDistances, last5MinsDistances, last5MinTimes):
                (ended_before_this, last_trip_end_point) = self.get_last_trip_end_point(filtered_points_df,
                                                                                       last10Points_df, last5MinsPoints_df)
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                # We have processed everything up to the trip end by marking it as a completed trip
                self.last_ts_processed = currPoint.metadata_write_ts
                if ended_before_this:
                    # in this case, we end a trip at the previous point, and the next trip starts at this
                    # point, not the next one
                    just_ended = False
                    prevPoint = currPoint
                    curr_trip_start_point = currPoint
                    logging.debug("Setting new trip start point %s with idx %s" %
                                  (currPoint, currPoint.idx))
                else:
                    # We end a trip at the current point, and the next trip starts at the next point
                    just_ended = True
                    prevPoint = None
            else:
                prevPoint = currPoint

        logging.debug("Iterated over all points, just_ended = %s, len(transition_df) = %s" %
                      (just_ended, len(transition_df)))
        # A trip is still open: close it if a transition with code 2 was
        # recorded after the last point
        if not just_ended and len(transition_df) > 0:
            stopped_moving_after_last = transition_df[(transition_df.ts > currPoint.ts) & (transition_df.transition == 2)]
            logging.debug("looking after %s, found transitions %s" %
                          (currPoint.ts, stopped_moving_after_last))
            if len(stopped_moving_after_last) > 0:
                (unused, last_trip_end_point) = self.get_last_trip_end_point(filtered_points_df,
                                                                             last10Points_df, None)
                segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                logging.debug("Found trip end at %s" % last_trip_end_point.fmt_time)
                # We have processed everything up to the trip end by marking it as a completed trip
                self.last_ts_processed = currPoint.metadata_write_ts

        return segmentation_points
    def has_trip_ended(self, prev_point, curr_point, timeseries, last10PointsDistances, last5MinsDistances, last5MinTimes):
        """
        Decide whether the trip in progress has ended at ``curr_point``.

        Two groups of checks are applied in order:

        1. Gap checks between ``prev_point`` and ``curr_point`` (skipped when
           ``prev_point`` is None). The trip ends if tracking was restarted
           in between, if the gap exceeds twice the time threshold with no
           ongoing motion, if the gap exceeds twelve hours, or if the gap
           exceeds twice the time threshold and the speed over the gap is
           below ``distance_threshold / time_threshold``.
        2. The normal dwell check: the trip ends if the maximum of both
           ``last5MinsDistances`` and ``last10PointsDistances`` is below
           ``self.distance_threshold``.

        :param prev_point: previous point (its ``ts`` is read), or None
        :param curr_point: current point (its ``ts`` is read)
        :param timeseries: passed through to the ``eaisr`` range queries
        :param last10PointsDistances: sized collection supporting ``max()``;
            presumably distances to the most recent points - confirm with caller
        :param last5MinsDistances: sized collection supporting ``max()``;
            presumably distances to the points in the time window - confirm with caller
        :param last5MinTimes: collection supporting ``max()``; presumably
            time deltas for the points in the time window - confirm with caller
        :return: True if the trip has ended, False otherwise
        """
        # Another mismatch between phone and server. Phone stops tracking too soon,
        # so the distance is still greater than the threshold at the end of the trip.
        # But then the next point is a long time away, so we can split again (similar to a distance filter)
        if prev_point is None:
            logging.debug("prev_point is None, continuing trip")
        else:
            timeDelta = curr_point.ts - prev_point.ts
            distDelta = pf.calDistance(prev_point, curr_point)
            if timeDelta > 0:
                speedDelta = old_div(distDelta, timeDelta)
            else:
                # No meaningful speed for a non-positive gap; comparisons
                # against NaN below evaluate to False.
                speedDelta = np.nan
            speedThreshold = old_div(float(self.distance_threshold), self.time_threshold)

            if eaisr.is_tracking_restarted_in_range(prev_point.ts, curr_point.ts, timeseries):
                logging.debug("tracking was restarted, ending trip")
                return True

            ongoing_motion_check = len(eaisr.get_ongoing_motion_in_range(prev_point.ts, curr_point.ts, timeseries)) > 0
            if timeDelta > 2 * self.time_threshold and not ongoing_motion_check:
                logging.debug("lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip" %
                              (prev_point.ts, curr_point.ts, self.time_threshold, timeDelta, ongoing_motion_check))
                return True

            # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
            # Longest flight is 17 hours, which is the longest you can go without cell reception
            # And even if you split an air flight that long into two, you will get some untracked time in the
            # middle, so that's good.
            TWELVE_HOURS = 12 * 60 * 60
            if timeDelta > TWELVE_HOURS:
                logging.debug("prev_point.ts = %s, curr_point.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip" %
                              (prev_point.ts, curr_point.ts, TWELVE_HOURS, timeDelta))
                return True

            if (timeDelta > 2 * self.time_threshold and # We have been here for a while
                 speedDelta < speedThreshold): # we haven't moved very much
                logging.debug("prev_point.ts = %s, curr_point.ts = %s, threshold = %s, large gap = %s, ending trip" %
                              (prev_point.ts, curr_point.ts, self.time_threshold, timeDelta))
                return True
            else:
                logging.debug("prev_point.ts = %s, curr_point.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip" %
                              (prev_point.ts, curr_point.ts,
                               timeDelta, self.time_threshold,
                               distDelta, self.distance_threshold,
                               speedDelta, speedThreshold))

        # The -30 is a fuzz factor intended to compensate for older clients
        # where data collection stopped after 5 mins, so that we never actually
        # see 5 mins of data
        if (len(last10PointsDistances) < self.point_threshold - 1 or
                    len(last5MinsDistances) == 0 or
                    last5MinTimes.max() < self.time_threshold - 30):
            logging.debug("Too few points to make a decision, continuing")
            return False

        # Normal end-of-trip case
        logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                      (last5MinsDistances.max(), last10PointsDistances.max()))
        if (last5MinsDistances.max() < self.distance_threshold and
                last10PointsDistances.max() < self.distance_threshold):
            return True

        # Return an explicit False instead of falling off the end (None), so
        # that every exit of this method yields a bool.
        return False
Exemplo n.º 14
0
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        Side effect: sets ``self.last_ts_processed`` to the
        ``metadata_write_ts`` of the last filtered location in the range, or
        to None when the range contains no filtered locations.

        :param timeseries: object providing ``get_data_df(key, time_query)``
        :param time_query: range passed through to ``get_data_df``
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_df = timeseries.get_data_df(
            "background/filtered_location", time_query)
        transition_df = timeseries.get_data_df("statemachine/transition",
                                               time_query)

        if len(filtered_points_df) == 0:
            self.last_ts_processed = None
        else:
            # TODO: Decide whether we should return the write_ts in the entry,
            # or whether we should search by timestamp instead.
            # Depends on final direction for the timequery
            self.last_ts_processed = filtered_points_df.iloc[
                -1].metadata_write_ts

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        # NOTE: idx is used positionally with .iloc below. This relies on the
        # index having been reset upstream so that labels equal positions.
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                logging.debug(
                    "Appending currPoint because the current start point is None"
                )

            if just_ended:
                # The very first point has no predecessor. Without this guard
                # iloc[idx - 1] is iloc[-1], which wraps around to the *last*
                # row, so the first point could be wrongly merged into a
                # non-existent previous trip and skipped.
                if idx > 0:
                    lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                    within_distance = pf.calDistance(
                        lastPoint, currPoint) < self.distance_threshold
                    logging.debug(
                        "Comparing with lastPoint = %s, distance = %s, time = %s" %
                        (lastPoint, within_distance,
                         currPoint.ts - lastPoint.ts <= self.time_threshold))
                    # Unlike the time filter, with the distance filter, we concatenate all points
                    # that are within the distance threshold with the previous trip
                    # end, since because of the distance filter, even noisy points
                    # can occur at an arbitrary time in the future
                    if within_distance:
                        logging.info(
                            "Points %s and %s are within the distance filter so part of the same trip"
                            % (lastPoint, currPoint))
                        continue
                # Here's where we deal with the start trip. At this point, the
                # distance is greater than the filter (or this is the first point).
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" %
                              (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False
            else:
                # idx >= 1 here, because a trip start must have been set on
                # an earlier iteration.
                lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                logging.debug(
                    "lastPoint = %s, time difference = %s dist difference %s" %
                    (lastPoint, currPoint.ts - lastPoint.ts,
                     pf.calDistance(lastPoint, currPoint)))
                if currPoint.ts - lastPoint.ts > self.time_threshold:
                    # We have been at this location for more than the time filter.
                    # So we must not have been moving for the last _time filter_
                    # points. So the trip must have ended
                    # Since this is a distance filter, we detect that the last
                    # trip has ended at the time that the new trip starts. So
                    # if the last_trip_end_point is lastPoint, then
                    # curr_trip_start_point should be currPoint. But then we will
                    # have problems with the spurious, noisy points that are
                    # generated until the geofence is turned on, if ever
                    # So we will continue to defer new trip starting until we
                    # have worked through all of those.
                    last_trip_end_point = lastPoint
                    logging.debug(
                        "Appending last_trip_end_point %s with index %s " %
                        (last_trip_end_point, idx - 1))
                    segmentation_points.append(
                        (curr_trip_start_point, last_trip_end_point))
                    logging.info("Found trip end at %s" %
                                 last_trip_end_point.fmt_time)
                    just_ended = True
        # Since we only end a trip when we start a new trip, this means that
        # the last trip that was pushed is ignored. Consider the example of
        # 2016-02-22 when I took kids to karate. We arrived shortly after 4pm,
        # so during that remote push, a trip end was not detected. And we got
        # back home shortly after 5pm, so the trip end was only detected on the
        # phone at 6pm. At that time, the following points were pushed:
        # ..., 2016-02-22T16:04:02, 2016-02-22T16:49:34, 2016-02-22T16:49:50,
        # ..., 2016-02-22T16:57:04
        # Then, on the server, while iterating through the points, we detected
        # a trip end at 16:04, and a new trip start at 16:49. But we did not
        # detect the trip end at 16:57, because we didn't start a new point.
        # This has two issues:
        # - we won't see this trip until the next trip start, which may be on the next day
        # - we won't see this trip at all, because when we run the pipeline the
        # next time, we will only look at points from that time onwards. These
        # points have been marked as "processed", so they won't even be considered.

        # There are multiple potential fixes:
        # - we can mark only the completed trips as processed. This will solve (2) above, but not (1)
        # - we can mark a trip end based on the fact that we only push data
        # when a trip ends, so if we have data, it means that the trip has been
        # detected as ended on the phone.
        # This seems a bit fragile - what if we start pushing incomplete trip
        # data for efficiency reasons? Therefore, we also check to see if there
        # is a trip_end_detected in this timeframe after the last point. If so,
        # then we end the trip at the last point that we have.
        if not just_ended and len(transition_df) > 0:
            # currPoint is the last point iterated over; it is always bound
            # here because just_ended can only be False after an iteration.
            stopped_moving_after_last = transition_df[
                (transition_df.ts > currPoint.ts)
                & (transition_df.transition == 2)]
            if len(stopped_moving_after_last) > 0:
                segmentation_points.append((curr_trip_start_point, currPoint))
        return segmentation_points
Exemplo n.º 15
0
 def end_points_distance(segment):
     """
     Return the distance between the first and last rows of
     ``segment.segment_df``, as computed by ``pf.calDistance``.

     :param segment: object with ``start``, ``end`` and ``segment_df``
     :raises RuntimeError: if ``segment.start`` equals ``segment.end``
     """
     if segment.start == segment.end:
         raise RuntimeError("This is messed up segment. Investigate further")
     points_df = segment.segment_df
     first_point = points_df.iloc[0]
     last_point = points_df.iloc[-1]
     return pf.calDistance(first_point, last_point)
Exemplo n.º 16
0
 def end_points_distance(segment):
     """
     Compute the distance from the first to the last row of
     ``segment.segment_df`` using ``pf.calDistance``.

     :param segment: object with ``start``, ``end`` and ``segment_df``
     :raises RuntimeError: if ``segment.start`` equals ``segment.end``
     """
     degenerate = segment.start == segment.end
     if degenerate:
         raise RuntimeError("This is messed up segment. Investigate further")
     rows = segment.segment_df
     head_row, tail_row = rows.iloc[0], rows.iloc[-1]
     return pf.calDistance(head_row, tail_row)
Exemplo n.º 17
0
    def has_trip_ended(self, lastPoint, currPoint, timeseries):
        """
        Decide whether the trip has ended between ``lastPoint`` and
        ``currPoint``.

        The trip can only end when the time gap between the two points is
        greater than ``self.time_threshold``. In that case it ends if
        tracking was restarted in the gap, if there was no ongoing motion in
        the gap, if the gap is longer than twelve hours, or if the speed over
        the gap is below the speed threshold computed in the body.

        :param lastPoint: previous point (its ``ts`` is read)
        :param currPoint: current point (its ``ts`` is read)
        :param timeseries: passed through to the ``eaisr`` range queries
        :return: True if the trip has ended, False otherwise
        """
        # Since this is a distance filter, we detect that the last
        # trip has ended at the time that the new trip starts. So
        # if the last_trip_end_point is lastPoint, then
        # curr_trip_start_point should be currPoint. But then we will
        # have problems with the spurious, noisy points that are
        # generated until the geofence is turned on, if ever
        # So we will continue to defer new trip starting until we
        # have worked through all of those.
        timeDelta = currPoint.ts - lastPoint.ts
        distDelta = pf.calDistance(lastPoint, currPoint)
        logging.debug(
            "lastPoint = %s, time difference = %s dist difference %s" %
            (lastPoint, timeDelta, distDelta))

        # Written as "not >" (rather than "<=") so that a NaN gap takes the
        # same path as before. Return an explicit False instead of falling
        # off the end of the method (None).
        if not timeDelta > self.time_threshold:
            return False

        # We have been at this location for more than the time filter.
        # This could be because we have not been moving for the last
        # _time filter_ points, or because we didn't get points for
        # that duration, (e.g. because we were underground)
        if timeDelta > 0:
            speedDelta = distDelta / timeDelta
        else:
            speedDelta = np.nan
        # this is way too slow. On ios, we use 5meters in 10 minutes.
        # On android, we use 10 meters in 5 mins, which seems to work better
        # for this kind of test
        speedThreshold = float(
            self.distance_threshold * 2) / (self.time_threshold / 2)

        if eaisr.is_tracking_restarted_in_range(lastPoint.ts, currPoint.ts,
                                                timeseries):
            logging.debug("tracking was restarted, ending trip")
            return True

        # In general, we get multiple locations between each motion activity. If we see a bunch of motion activities
        # between two location points, and there is a large gap between the last location and the first
        # motion activity as well, let us just assume that there was a restart
        ongoing_motion_check = len(
            eaisr.get_ongoing_motion_in_range(lastPoint.ts, currPoint.ts,
                                              timeseries)) > 0
        # The gap is already known to exceed the time threshold here, so
        # only the motion check remains.
        if not ongoing_motion_check:
            logging.debug(
                "lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip"
                % (lastPoint.ts, currPoint.ts, self.time_threshold,
                   timeDelta, ongoing_motion_check))
            return True

        # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
        # Longest flight is 17 hours, which is the longest you can go without cell reception
        # And even if you split an air flight that long into two, you will get some untracked time in the
        # middle, so that's good.
        TWELVE_HOURS = 12 * 60 * 60
        if timeDelta > TWELVE_HOURS:
            logging.debug(
                "lastPoint.ts = %s, currPoint.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip"
                % (lastPoint.ts, currPoint.ts, TWELVE_HOURS, timeDelta))
            return True

        # We have been here for a while (checked above); have we also not
        # moved very much?
        if speedDelta < speedThreshold:
            logging.debug(
                "lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ending trip"
                % (lastPoint.ts, currPoint.ts, self.time_threshold,
                   timeDelta))
            return True

        logging.debug(
            "lastPoint.ts = %s, currPoint.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip"
            % (lastPoint.ts, currPoint.ts, timeDelta,
               self.time_threshold, distDelta, self.distance_threshold,
               speedDelta, speedThreshold))
        return False
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        Side effect: sets ``self.last_ts_processed`` to the
        ``metadata_write_ts`` of the last filtered location in the range, or
        to None when the range contains no filtered locations.

        :param timeseries: object providing ``get_data_df(key, time_query)``
        :param time_query: range passed through to ``get_data_df``
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)

        if len(filtered_points_df) == 0:
            self.last_ts_processed = None
        else:
            # TODO: Decide whether we should return the write_ts in the entry,
            # or whether we should search by timestamp instead.
            # Depends on final direction for the timequery
            self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        # NOTE: idx is used positionally with .iloc below. This relies on the
        # index having been reset upstream so that labels equal positions.
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                logging.debug("Appending currPoint because the current start point is None")

            if just_ended:
                # The very first point has no predecessor. Without this guard
                # iloc[idx - 1] is iloc[-1], which wraps around to the *last*
                # row, so the first point could be wrongly merged into a
                # non-existent previous trip and skipped.
                if idx > 0:
                    lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                    within_distance = pf.calDistance(lastPoint, currPoint) < self.distance_threshold
                    logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" %
                        (lastPoint, within_distance,
                         currPoint.ts - lastPoint.ts <= self.time_threshold))
                    # Unlike the time filter, with the distance filter, we concatenate all points
                    # that are within the distance threshold with the previous trip
                    # end, since because of the distance filter, even noisy points
                    # can occur at an arbitrary time in the future
                    if within_distance:
                        logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                                     (lastPoint, currPoint))
                        continue
                # Here's where we deal with the start trip. At this point, the
                # distance is greater than the filter (or this is the first point).
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False
            else:
                # idx >= 1 here, because a trip start must have been set on
                # an earlier iteration.
                lastPoint = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                    (lastPoint, currPoint.ts - lastPoint.ts, pf.calDistance(lastPoint, currPoint)))
                if currPoint.ts - lastPoint.ts > self.time_threshold:
                    # We have been at this location for more than the time filter.
                    # So we must not have been moving for the last _time filter_
                    # points. So the trip must have ended
                    # Since this is a distance filter, we detect that the last
                    # trip has ended at the time that the new trip starts. So
                    # if the last_trip_end_point is lastPoint, then
                    # curr_trip_start_point should be currPoint. But then we will
                    # have problems with the spurious, noisy points that are
                    # generated until the geofence is turned on, if ever
                    # So we will continue to defer new trip starting until we
                    # have worked through all of those.
                    last_trip_end_point = lastPoint
                    logging.debug("Appending last_trip_end_point %s with index %s " %
                        (last_trip_end_point, idx - 1))
                    segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                    logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                    just_ended = True
        return segmentation_points
Exemplo n.º 19
0
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        A trip is considered ended at a point when every point in the
        preceding ``self.time_threshold`` window and every one of the last
        ``self.point_threshold`` points lies within
        ``self.distance_threshold`` of it.

        Side effect: sets ``self.last_ts_processed`` to the
        ``metadata_write_ts`` of the last filtered location in the range, or
        to None when the range contains no filtered locations.

        :param timeseries: object providing ``get_data_df(key, time_query)``
        :param time_query: range passed through to ``get_data_df``
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_df = timeseries.get_data_df(
            "background/filtered_location", time_query)

        if len(filtered_points_df) == 0:
            self.last_ts_processed = None
        else:
            # TODO: Decide whether we should return the write_ts in the entry,
            # or whether we should search by timestamp instead.
            # Depends on final direction for the timequery
            self.last_ts_processed = filtered_points_df.iloc[
                -1].metadata_write_ts

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                logging.debug(
                    "Appending currPoint because the current start point is None"
                )

            if just_ended:
                # Normally, at this point, since the logic here and the
                # logic on the phone are the same, if we have detected a trip
                # end, any points after this are part of the new trip.
                #
                # However, in some circumstances, notably in my data from 27th
                # August, there appears to be a mismatch and we get a couple of
                # points past the end that we detected here.  So let's look for
                # points that are within the distance filter, and are at a
                # delta of at most a minute, and ignore them instead of using
                # them to start the new trip
                #
                # The very first point has no predecessor. Without the guard
                # iloc[idx - 1] is iloc[-1], which wraps around to the *last*
                # row and could cause the first point to be skipped.
                if idx > 0:
                    prev_point = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                    logging.debug("Comparing with prev_point = %s" % prev_point)
                    if pf.calDistance(prev_point, currPoint) < self.distance_threshold and \
                            currPoint.ts - prev_point.ts <= 60:
                        logging.info(
                            "Points %s and %s are within the distance filter and only 1 min apart so part of the same trip"
                            % (prev_point, currPoint))
                        continue
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" %
                              (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False

            # Points of the current trip strictly before currPoint and within
            # the time threshold of it.
            last5MinsPoints_df = filtered_points_df[np.logical_and(
                np.logical_and(
                    filtered_points_df.ts > currPoint.ts - self.time_threshold,
                    filtered_points_df.ts < currPoint.ts),
                filtered_points_df.ts >= curr_trip_start_point.ts)]
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            # We are going to use the last 8 points for now.
            # TODO: Change this back to last 10 points once we normalize phone and this
            last10Points_df = filtered_points_df.iloc[
                max(idx -
                    self.point_threshold, curr_trip_start_point.idx):idx + 1]

            # Plain nested function: the tuple-parameter form ``lambda (row):``
            # is a syntax error on Python 3.
            def distanceToLast(row):
                return pf.calDistance(ad.AttrDict(row), currPoint)

            last5MinsDistances = last5MinsPoints_df.apply(distanceToLast,
                                                          axis=1)
            # ``.values`` replaces ``.as_matrix()``, which was removed from pandas.
            logging.debug(
                "last5MinsDistances = %s with length %d" %
                (last5MinsDistances.values, len(last5MinsDistances)))
            last10PointsDistances = last10Points_df.apply(distanceToLast,
                                                          axis=1)
            logging.debug(
                "last10PointsDistances = %s with length %d, shape %s" %
                (last10PointsDistances.values, len(last10PointsDistances),
                 last10PointsDistances.shape))

            logging.debug(
                "len(last10PointsDistances) = %d, len(last5MinsDistances) = %d"
                % (len(last10PointsDistances), len(last5MinsDistances)))
            if (len(last10PointsDistances) < self.point_threshold - 1
                    or len(last5MinsDistances) == 0):
                logging.debug("Too few points to make a decision, continuing")
            else:
                logging.debug(
                    "last5MinsDistances.max() = %s, last10PointsDistance.max() = %s"
                    % (last5MinsDistances.max(), last10PointsDistances.max()))
                if (last5MinsDistances.max() < self.distance_threshold and
                        last10PointsDistances.max() < self.distance_threshold):
                    # End the trip at the earlier of the two window medians.
                    last_trip_end_index = int(
                        min(np.median(last5MinsPoints_df.index),
                            np.median(last10Points_df.index)))
                    last_trip_end_point_row = filtered_points_df.iloc[
                        last_trip_end_index]
                    last_trip_end_point = ad.AttrDict(last_trip_end_point_row)
                    logging.debug(
                        "Appending last_trip_end_point %s with index %s " %
                        (last_trip_end_point, last_trip_end_point_row.name))
                    segmentation_points.append(
                        (curr_trip_start_point, last_trip_end_point))
                    logging.info("Found trip end at %s" %
                                 last_trip_end_point.fmt_time)
                    just_ended = True
        return segmentation_points
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.
        """
        filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
        transition_df = timeseries.get_data_df("statemachine/transition", time_query)

        if len(filtered_points_df) == 0:
            self.last_ts_processed = None
        else:
            # TODO: Decide whether we should return the write_ts in the entry,
            # or whether we should search by timestamp instead.
            # Depends on final direction for the timequery
            self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                logging.debug("Appending currPoint because the current start point is None")
                # segmentation_points.append(currPoint)

            if just_ended:
                lastPoint = ad.AttrDict(filtered_points_df.iloc[idx-1])
                logging.debug("Comparing with lastPoint = %s, distance = %s, time = %s" % 
                    (lastPoint, pf.calDistance(lastPoint, currPoint) < self.distance_threshold,
                     currPoint.ts - lastPoint.ts <= self.time_threshold))
                # Unlike the time filter, with the distance filter, we concatenate all points
                # that are within the distance threshold with the previous trip
                # end, since because of the distance filter, even noisy points
                # can occur at an arbitrary time in the future
                if pf.calDistance(lastPoint, currPoint) < self.distance_threshold:
                    logging.info("Points %s and %s are within the distance filter so part of the same trip" %
                                 (lastPoint, currPoint))
                    continue
                # else: 
                # Here's where we deal with the start trip. At this point, the
                # distance is greater than the filter. 
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False
            else:
                # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
                # Using .iloc just ends up including points after this one.
                # So we reset_index upstream and use it here.
                last10Points_df = filtered_points_df.iloc[max(idx-self.point_threshold, curr_trip_start_point.idx):idx+1]
                lastPoint = ad.AttrDict(filtered_points_df.iloc[idx-1])
                logging.debug("lastPoint = %s, time difference = %s dist difference %s" %
                    (lastPoint, currPoint.ts - lastPoint.ts, pf.calDistance(lastPoint, currPoint)))
                if currPoint.ts - lastPoint.ts > self.time_threshold:
                    # We have been at this location for more than the time filter.
                    # So we must not have been moving for the last _time filter_
                    # points. So the trip must have ended
                    # Since this is a distance filter, we detect that the last
                    # trip has ended at the time that the new trip starts. So
                    # if the last_trip_end_point is lastPoint, then
                    # curr_trip_start_point should be currPoint. But then we will
                    # have problems with the spurious, noisy points that are
                    # generated until the geofence is turned on, if ever
                    # So we will continue to defer new trip starting until we
                    # have worked through all of those.
                    last_trip_end_point = lastPoint
                    logging.debug("Appending last_trip_end_point %s with index %s " %
                        (last_trip_end_point, idx-1))
                    segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                    logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                    just_ended = True
        # Since we only end a trip when we start a new trip, this means that
        # the last trip that was pushed is ignored. Consider the example of
        # 2016-02-22 when I took kids to karate. We arrived shortly after 4pm,
        # so during that remote push, a trip end was not detected. And we got
        # back home shortly after 5pm, so the trip end was only detected on the
        # phone at 6pm. At that time, the following points were pushed:
        # ..., 2016-02-22T16:04:02, 2016-02-22T16:49:34, 2016-02-22T16:49:50,
        # ..., 2016-02-22T16:57:04
        # Then, on the server, while iterating through the points, we detected
        # a trip end at 16:04, and a new trip start at 16:49. But we did not
        # detect the trip end at 16:57, because we didn't start a new point.
        # This has two issues:
        # - we won't see this trip until the next trip start, which may be on the next day
        # - we won't see this trip at all, because when we run the pipeline the
        # next time, we will only look at points from that time onwards. These
        # points have been marked as "processed", so they won't even be considered.

        # There are multiple potential fixes:
        # - we can mark only the completed trips as processed. This will solve (2) above, but not (1)
        # - we can mark a trip end based on the fact that we only push data
        # when a trip ends, so if we have data, it means that the trip has been
        # detected as ended on the phone.
        # This seems a bit fragile - what if we start pushing incomplete trip
        # data for efficiency reasons? Therefore, we also check to see if there
        # is a trip_end_detected in this timeframe after the last point. If so,
        # then we end the trip at the last point that we have.
        if not just_ended and len(transition_df) > 0:
            stopped_moving_after_last = transition_df[(transition_df.ts > currPoint.ts) & (transition_df.transition == 2)]
            if len(stopped_moving_after_last) > 0:
                segmentation_points.append((curr_trip_start_point, currPoint))
        return segmentation_points
Exemplo n.º 21
0
    def has_trip_ended(self, prev_point, curr_point, timeseries,
                       last10PointsDistances, last5MinsDistances,
                       last5MinTimes):
        """
        Decide whether the trip in progress ends at ``curr_point``.

        Two families of checks are applied in order:

        1. Gap checks between ``prev_point`` and ``curr_point`` (skipped when
           ``prev_point`` is None): tracking was restarted in the gap, the gap
           exceeds twice the time threshold with no ongoing motion, the gap
           exceeds ``TWELVE_HOURS``, or the gap exceeds twice the time
           threshold at a speed below distance_threshold / time_threshold.
        2. The normal dwell check: there are enough recent points, and every
           distance in both windows is below ``self.distance_threshold``.

        :param prev_point: previous point, or None; must expose ``ts``
        :param curr_point: point being examined; must expose ``ts``
        :param timeseries: handed unchanged to the ``eaisr`` range queries
        :param last10PointsDistances: distances for the point-count window
            (needs ``len()`` and ``max()``)
        :param last5MinsDistances: distances for the time window
            (needs ``len()`` and ``max()``)
        :param last5MinTimes: time deltas for the time window (needs ``max()``)
        :return: True if the trip has ended, False otherwise
        """
        # Another mismatch between phone and server. Phone stops tracking too soon,
        # so the distance is still greater than the threshold at the end of the trip.
        # But then the next point is a long time away, so we can split again (similar to a distance filter)
        if prev_point is None:
            logging.debug("prev_point is None, continuing trip")
        else:
            timeDelta = curr_point.ts - prev_point.ts
            distDelta = pf.calDistance(prev_point, curr_point)
            if timeDelta > 0:
                speedDelta = old_div(distDelta, timeDelta)
            else:
                # nan compares False against any threshold, so the speed
                # based check below cannot fire for a non-positive time gap
                speedDelta = np.nan
            speedThreshold = old_div(float(self.distance_threshold),
                                     self.time_threshold)

            if eaisr.is_tracking_restarted_in_range(prev_point.ts,
                                                    curr_point.ts, timeseries):
                logging.debug("tracking was restarted, ending trip")
                return True

            ongoing_motion_check = len(
                eaisr.get_ongoing_motion_in_range(prev_point.ts, curr_point.ts,
                                                  timeseries)) > 0
            if timeDelta > 2 * self.time_threshold and not ongoing_motion_check:
                logging.debug(
                    "lastPoint.ts = %s, currPoint.ts = %s, threshold = %s, large gap = %s, ongoing_motion_in_range = %s, ending trip"
                    % (prev_point.ts, curr_point.ts, self.time_threshold,
                       curr_point.ts - prev_point.ts, ongoing_motion_check))
                return True

            # http://www.huffingtonpost.com/hoppercom/the-worlds-20-longest-non-stop-flights_b_5994268.html
            # Longest flight is 17 hours, which is the longest you can go without cell reception
            # And even if you split an air flight that long into two, you will get some untracked time in the
            # middle, so that's good.
            # NOTE(review): the constant assumes ``ts`` is in seconds - confirm
            TWELVE_HOURS = 12 * 60 * 60
            if timeDelta > TWELVE_HOURS:
                logging.debug(
                    "prev_point.ts = %s, curr_point.ts = %s, TWELVE_HOURS = %s, large gap = %s, ending trip"
                    % (prev_point.ts, curr_point.ts, TWELVE_HOURS,
                       curr_point.ts - prev_point.ts))
                return True

            if (timeDelta > 2 * self.time_threshold
                    and  # We have been here for a while
                    speedDelta < speedThreshold):  # we haven't moved very much
                logging.debug(
                    "prev_point.ts = %s, curr_point.ts = %s, threshold = %s, large gap = %s, ending trip"
                    % (prev_point.ts, curr_point.ts, self.time_threshold,
                       curr_point.ts - prev_point.ts))
                return True
            else:
                logging.debug(
                    "prev_point.ts = %s, curr_point.ts = %s, time gap = %s (vs %s), distance_gap = %s (vs %s), speed_gap = %s (vs %s) continuing trip"
                    % (prev_point.ts, curr_point.ts, timeDelta,
                       self.time_threshold, distDelta, self.distance_threshold,
                       speedDelta, speedThreshold))

        # The -30 is a fuzz factor intended to compensate for older clients
        # where data collection stopped after 5 mins, so that we never actually
        # see 5 mins of data

        if (len(last10PointsDistances) < self.point_threshold - 1
                or len(last5MinsDistances) == 0
                or last5MinTimes.max() < self.time_threshold - 30):
            logging.debug("Too few points to make a decision, continuing")
            return False

        # Normal end-of-trip case
        logging.debug(
            "last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
            (last5MinsDistances.max(), last10PointsDistances.max()))
        if (last5MinsDistances.max() < self.distance_threshold
                and last10PointsDistances.max() < self.distance_threshold):
            return True

        # Still moving: return an explicit boolean instead of falling off the
        # end of the method with an implicit None
        return False
Exemplo n.º 22
0
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        As a side effect, ``self.last_ts_processed`` is reset to None and then
        set to the ``metadata_write_ts`` of the last point that was consumed,
        either by gluing it onto a just-ended trip or by closing a trip.

        :param timeseries: object providing ``get_data_df(key, time_query)``
        :param time_query: query restricting the range to examine
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_pre_ts_diff_df = timeseries.get_data_df(
            "background/filtered_location", time_query)
        # Sometimes, we can get bogus points because data.ts and
        # metadata.write_ts are off by a lot. If we don't do this, we end up
        # appearing to travel back in time
        # https://github.com/e-mission/e-mission-server/issues/457
        if len(filtered_points_pre_ts_diff_df) > 0:
            write_lag_ok = (filtered_points_pre_ts_diff_df.metadata_write_ts -
                            filtered_points_pre_ts_diff_df.ts) < 1000
            filtered_points_df = filtered_points_pre_ts_diff_df[write_lag_ok]
        else:
            # NOTE(review): an empty result may not carry the ts columns at
            # all - confirm what get_data_df returns when there is no data.
            # There is nothing to filter either way.
            filtered_points_df = filtered_points_pre_ts_diff_df
        # Rebind instead of resetting in place so that we never mutate a
        # slice of the frame returned by the timeseries
        filtered_points_df = filtered_points_df.reset_index()
        transition_df = timeseries.get_data_df("statemachine/transition",
                                               time_query)
        if len(transition_df) > 0:
            logging.debug("transition_df = %s" %
                          transition_df[["fmt_time", "transition"]])
        else:
            logging.debug(
                "no transitions found. This can happen for continuous sensing")

        self.last_ts_processed = None

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        prevPoint = None
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                # Only logged: nothing is appended here, the start point is
                # selected in the just_ended branch below
                logging.debug(
                    "Appending currPoint because the current start point is None"
                )

            if just_ended:
                if self.continue_just_ended(idx, currPoint,
                                            filtered_points_df):
                    # We have "processed" the currPoint by deciding to glom it
                    self.last_ts_processed = currPoint.metadata_write_ts
                    continue
                # else:
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" %
                              (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False

            # Points of the current trip that fall strictly inside the time
            # window that ends just before the current point
            last5MinsPoints_df = filtered_points_df[np.logical_and(
                np.logical_and(
                    filtered_points_df.ts > currPoint.ts - self.time_threshold,
                    filtered_points_df.ts < currPoint.ts),
                filtered_points_df.ts >= curr_trip_start_point.ts)]
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            # We are going to use the last 8 points for now.
            # TODO: Change this back to last 10 points once we normalize phone and this
            last10Points_df = filtered_points_df.iloc[
                max(idx -
                    self.point_threshold, curr_trip_start_point.idx):idx + 1]

            def distanceToLast(row):
                return pf.calDistance(ad.AttrDict(row), currPoint)

            def timeToLast(row):
                return currPoint.ts - ad.AttrDict(row).ts

            last5MinsDistances = last5MinsPoints_df.apply(distanceToLast,
                                                          axis=1)
            logging.debug(
                "last5MinsDistances = %s with length %d" %
                (last5MinsDistances.to_numpy(), len(last5MinsDistances)))
            last10PointsDistances = last10Points_df.apply(distanceToLast,
                                                          axis=1)
            logging.debug(
                "last10PointsDistances = %s with length %d, shape %s" %
                (last10PointsDistances.to_numpy(), len(last10PointsDistances),
                 last10PointsDistances.shape))

            # Fix for https://github.com/e-mission/e-mission-server/issues/348
            last5MinTimes = last5MinsPoints_df.apply(timeToLast, axis=1)

            logging.debug(
                "len(last10PointsDistances) = %d, len(last5MinsDistances) = %d"
                % (len(last10PointsDistances), len(last5MinsDistances)))
            # np.nan rather than np.NaN: the capitalised alias was removed in
            # NumPy 2.0
            logging.debug(
                "last5MinsTimes.max() = %s, time_threshold = %s" %
                (last5MinTimes.max() if len(last5MinTimes) > 0 else np.nan,
                 self.time_threshold))

            if self.has_trip_ended(prevPoint, currPoint, timeseries,
                                   last10PointsDistances, last5MinsDistances,
                                   last5MinTimes):
                (ended_before_this,
                 last_trip_end_point) = self.get_last_trip_end_point(
                     filtered_points_df, last10Points_df, last5MinsPoints_df)
                segmentation_points.append(
                    (curr_trip_start_point, last_trip_end_point))
                logging.info("Found trip end at %s" %
                             last_trip_end_point.fmt_time)
                # We have processed everything up to the trip end by marking it as a completed trip
                self.last_ts_processed = currPoint.metadata_write_ts
                if ended_before_this:
                    # in this case, we end a trip at the previous point, and the next trip starts at this
                    # point, not the next one
                    just_ended = False
                    prevPoint = currPoint
                    curr_trip_start_point = currPoint
                    logging.debug(
                        "Setting new trip start point %s with idx %s" %
                        (currPoint, currPoint.idx))
                else:
                    # We end a trip at the current point, and the next trip starts at the next point
                    just_ended = True
                    prevPoint = None
            else:
                prevPoint = currPoint

        logging.debug(
            "Iterated over all points, just_ended = %s, len(transition_df) = %s"
            % (just_ended, len(transition_df)))
        # just_ended only becomes False inside the loop, after currPoint and
        # last10Points_df have been bound, so both names are defined here
        if not just_ended and len(transition_df) > 0:
            stopped_moving_after_last = transition_df[
                (transition_df.ts > currPoint.ts)
                & (transition_df.transition == 2)]
            logging.debug("looking after %s, found transitions %s" %
                          (currPoint.ts, stopped_moving_after_last))
            if len(stopped_moving_after_last) > 0:
                (unused, last_trip_end_point) = self.get_last_trip_end_point(
                    filtered_points_df, last10Points_df, None)
                segmentation_points.append(
                    (curr_trip_start_point, last_trip_end_point))
                logging.debug("Found trip end at %s" %
                              last_trip_end_point.fmt_time)
                # We have processed everything up to the trip end by marking it as a completed trip
                self.last_ts_processed = currPoint.metadata_write_ts

        return segmentation_points
    def segment_into_trips(self, timeseries, time_query):
        """
        Examines the timeseries database for a specific range and returns the
        segmentation points. Note that the input is the entire timeseries and
        the time range. This allows algorithms to use whatever combination of
        data that they want from the sensor streams in order to determine the
        segmentation points.

        As a side effect, ``self.last_ts_processed`` is set to the
        ``metadata_write_ts`` of the last filtered location in the range, or
        to None when the range has no points.

        :param timeseries: object providing ``get_data_df(key, time_query)``
        :param time_query: query restricting the range to examine
        :return: list of ``(trip_start_point, trip_end_point)`` tuples
        """
        filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)

        if len(filtered_points_df) == 0:
            self.last_ts_processed = None
        else:
            # TODO: Decide whether we should return the write_ts in the entry,
            # or whether we should search by timestamp instead.
            # Depends on final direction for the timequery
            self.last_ts_processed = filtered_points_df.iloc[-1].metadata_write_ts

        logging.info("Last ts processed = %s" % self.last_ts_processed)

        segmentation_points = []
        last_trip_end_point = None
        curr_trip_start_point = None
        just_ended = True
        for idx, row in filtered_points_df.iterrows():
            currPoint = ad.AttrDict(row)
            currPoint.update({"idx": idx})
            logging.debug("-" * 30 + str(currPoint.fmt_time) + "-" * 30)
            if curr_trip_start_point is None:
                # Only logged: nothing is appended here, the start point is
                # selected in the just_ended branch below
                logging.debug("Appending currPoint because the current start point is None")

            if just_ended:
                # Normally, at this point, since the logic here and the
                # logic on the phone are the same, if we have detected a trip
                # end, any points after this are part of the new trip.
                #
                # However, in some circumstances, notably in my data from 27th
                # August, there appears to be a mismatch and we get a couple of
                # points past the end that we detected here.  So let's look for
                # points that are within the distance filter, and are at most
                # 60 secs after the previous point, and ignore them instead of
                # using them to start the new trip.
                #
                # The very first point has no predecessor: iloc[idx - 1] would
                # wrap around to the last row, whose later timestamp always
                # passes the "<= 60" test, so the first point could be dropped
                # by mistake. Skip the comparison in that case.
                if idx > 0:
                    prev_point = ad.AttrDict(filtered_points_df.iloc[idx - 1])
                    logging.debug("Comparing with prev_point = %s" % prev_point)
                    if (pf.calDistance(prev_point, currPoint) < self.distance_threshold
                            and currPoint.ts - prev_point.ts <= 60):
                        logging.info("Points %s and %s are within the distance filter and only 1 min apart so part of the same trip" %
                                     (prev_point, currPoint))
                        continue
                # else:
                sel_point = currPoint
                logging.debug("Setting new trip start point %s with idx %s" % (sel_point, sel_point.idx))
                curr_trip_start_point = sel_point
                just_ended = False

            # Points of the current trip that fall strictly inside the time
            # window that ends just before the current point
            last5MinsPoints_df = filtered_points_df[np.logical_and(
                np.logical_and(
                    filtered_points_df.ts > currPoint.ts - self.time_threshold,
                    filtered_points_df.ts < currPoint.ts),
                filtered_points_df.ts >= curr_trip_start_point.ts)]
            # Using .loc here causes problems if we have filtered out some points and so the index is non-consecutive.
            # Using .iloc just ends up including points after this one.
            # So we reset_index upstream and use it here.
            # We are going to use the last 8 points for now.
            # TODO: Change this back to last 10 points once we normalize phone and this
            last10Points_df = filtered_points_df.iloc[max(idx - self.point_threshold, curr_trip_start_point.idx):idx + 1]

            # A plain nested function: the old "lambda(row):" form is
            # Python 2 only tuple-parameter syntax and does not parse on
            # Python 3
            def distanceToLast(row):
                return pf.calDistance(ad.AttrDict(row), currPoint)

            last5MinsDistances = last5MinsPoints_df.apply(distanceToLast, axis=1)
            # to_numpy() replaces as_matrix(), which was removed in pandas 1.0
            logging.debug("last5MinsDistances = %s with length %d" % (last5MinsDistances.to_numpy(), len(last5MinsDistances)))
            last10PointsDistances = last10Points_df.apply(distanceToLast, axis=1)
            logging.debug("last10PointsDistances = %s with length %d, shape %s" % (last10PointsDistances.to_numpy(),
                                                                           len(last10PointsDistances),
                                                                           last10PointsDistances.shape))

            logging.debug("len(last10PointsDistances) = %d, len(last5MinsDistances) = %d" %
                  (len(last10PointsDistances), len(last5MinsDistances)))
            if (len(last10PointsDistances) < self.point_threshold - 1 or len(last5MinsDistances) == 0):
                logging.debug("Too few points to make a decision, continuing")
            else:
                logging.debug("last5MinsDistances.max() = %s, last10PointsDistance.max() = %s" %
                  (last5MinsDistances.max(), last10PointsDistances.max()))
                if (last5MinsDistances.max() < self.distance_threshold and
                    last10PointsDistances.max() < self.distance_threshold):
                    # End the trip at the earlier of the two window medians
                    last_trip_end_index = int(min(np.median(last5MinsPoints_df.index),
                                               np.median(last10Points_df.index)))
                    last_trip_end_point_row = filtered_points_df.iloc[last_trip_end_index]
                    last_trip_end_point = ad.AttrDict(last_trip_end_point_row)
                    logging.debug("Appending last_trip_end_point %s with index %s " %
                        (last_trip_end_point, last_trip_end_point_row.name))
                    segmentation_points.append((curr_trip_start_point, last_trip_end_point))
                    logging.info("Found trip end at %s" % last_trip_end_point.fmt_time)
                    just_ended = True
        return segmentation_points