def testPointFilteringRichmondJump(self):
    """
    Run SmoothZigzag over the sections of ``self.trip_entries[6]``.

    The trip is expected to have a single section, and the filter is
    expected to mark exactly one point of it for deletion (id
    ``55e86dbb7d65cb39ee987e09``).
    """
    classicJumpTrip1 = self.trip_entries[6]
    self.loadPointsForTrip(classicJumpTrip1.get_id())
    classicJumpSections1 = [
        s for s in self.section_entries
        if s.data.trip_id == classicJumpTrip1.get_id()
    ]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag(False, 100)

    for i, section_entry in enumerate(classicJumpSections1):
        logging.debug("-" * 20 + "Considering section %s" % i + "-" * 20)

        section_df = self.ts.get_data_df(
            "background/filtered_location",
            esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                              section_entry.get_id()))
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)

        # The threshold is only logged; it is not used by the assertions.
        maxSpeed = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (i, maxSpeed))

        jump_algo.filter(with_speeds_df)
        # Convert the mask to a plain ndarray before np.nonzero, the same way
        # the other point filtering tests do, instead of handing numpy a
        # pandas Series directly.
        logging.debug("Retaining points %s" %
                      np.nonzero(jump_algo.inlier_mask_.to_numpy()))

        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" %
                      np.nonzero(to_delete_mask.to_numpy()))

        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)

        # There is only one section
        self.assertEqual(i, 0)
        # The bad section, should have the third point filtered
        self.assertEqual(np.count_nonzero(to_delete_mask), 1)
        self.assertEqual([str(point_id) for point_id in delete_ids],
                         ["55e86dbb7d65cb39ee987e09"])
def filter_jumps(user_id, section_id):
    """
    Find the jump points of one section and record them in the time series.

    The section's "background/filtered_location" points are handed to
    get_points_to_filter together with a BoxplotOutlier and a SmoothZigzag
    instance. If that returns a dataframe, the ``_id`` values of its rows
    are stored in a new "analysis/smoothing" entry.

    :param user_id: the user whose time series is read and written
    :param section_id: the section whose points are examined
    :return: none. Nothing is stored when get_points_to_filter returns None.
    """
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))

    outlier_algo = eaico.BoxplotOutlier()
    filtering_algo = eaicj.SmoothZigzag()
    time_query = esds.get_time_query_for_section(section_id)
    time_series = esta.TimeSeries.get_time_series(user_id)

    points_df = time_series.get_data_df("background/filtered_location",
                                        time_query)
    logging.debug("len(section_points_df) = %s" % len(points_df))

    ignored_df = get_points_to_filter(points_df, outlier_algo, filtering_algo)
    if ignored_df is None:
        # Nothing to delete, so no smoothing entry is written
        return

    dropped_ids = list(ignored_df._id)
    logging.debug("deleted %s points" % len(dropped_ids))

    smoothing = ecws.Smoothresults()
    smoothing.section = section_id
    smoothing.deleted_points = dropped_ids
    smoothing.outlier_algo = "BoxplotOutlier"
    smoothing.filtering_algo = "SmoothZigzag"

    time_series.insert(
        ecwe.Entry.create_entry(user_id, "analysis/smoothing", smoothing))
def testPointFilteringZigzag(self):
    """
    Run SmoothZigzag over the sections of ``self.trip_entries[8]``.

    The first section is expected to lose eleven specific points; every
    other section is expected to be left untouched.
    """
    zigzag_trip = self.trip_entries[8]
    self.loadPointsForTrip(zigzag_trip.get_id())
    zigzag_sections = [
        entry for entry in self.section_entries
        if entry.data.trip_id == zigzag_trip.get_id()
    ]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag(False, 100)
    separator = "-" * 20

    for i, section_entry in enumerate(zigzag_sections):
        logging.debug(separator + "Considering section %s" % i + separator)

        section_query = esda.get_time_query_for_trip_like(
            esda.RAW_SECTION_KEY, section_entry.get_id())
        section_df = self.ts.get_data_df("background/filtered_location",
                                         section_query)
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)

        # Logged for reference only; the assertions do not depend on it
        speed_threshold = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (i, speed_threshold))

        jump_algo.filter(with_speeds_df)
        logging.debug("Retaining points %s" %
                      np.nonzero(jump_algo.inlier_mask_.to_numpy()))

        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" %
                      np.nonzero(to_delete_mask.to_numpy()))

        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)

        if i != 0:
            # Any section after the first should keep all of its points
            self.assertEqual(
                len(np.nonzero(to_delete_mask.to_numpy())[0]), 0)
            self.assertEqual(len(delete_ids), 0)
            continue

        # this is the zigzag section
        self.assertEqual(
            np.nonzero(to_delete_mask.to_numpy())[0].tolist(),
            [25, 64, 114, 115, 116, 117, 118, 119, 120, 123, 126])
        self.assertEqual(delete_ids, [
            boi.ObjectId('55edafe77d65cb39ee9882ff'),
            boi.ObjectId('55edcc157d65cb39ee98836e'),
            boi.ObjectId('55edcc1f7d65cb39ee988400'),
            boi.ObjectId('55edcc1f7d65cb39ee988403'),
            boi.ObjectId('55edcc1f7d65cb39ee988406'),
            boi.ObjectId('55edcc1f7d65cb39ee988409'),
            boi.ObjectId('55edcc1f7d65cb39ee98840c'),
            boi.ObjectId('55edcc207d65cb39ee988410'),
            boi.ObjectId('55edcc207d65cb39ee988412'),
            boi.ObjectId('55edcc217d65cb39ee98841f'),
            boi.ObjectId('55edcc217d65cb39ee988429'),
        ])
def filter(self, with_speeds_df):
    """
    Compute ``self.inlier_mask_`` for the given points.

    The mask starts out all True. The points are split into segments by
    ``find_segments``; if there is more than one segment, a start segment
    is marked GOOD, the remaining segments are classified to its right and
    left by ``mark_segment_states``, and the points of every BAD segment
    are set to False in the mask. Afterwards the speeds of the retained
    points are recomputed and any that still exceed a freshly computed
    BoxplotOutlier threshold are logged (not removed).

    :param with_speeds_df: dataframe of points; it is stored on the
        instance as ``self.with_speeds_df`` and must have one row per point.
        After recomputation a ``speed`` column is read from the result of
        ``recalc_speed``.
    :return: none. The result is left in ``self.inlier_mask_``.
    :raises AssertionError: if any segment is still in the UNKNOWN state
        after both marking passes.
    """
    self.inlier_mask_ = pd.Series([True] * with_speeds_df.shape[0])
    self.with_speeds_df = with_speeds_df
    self.find_segments()
    logging.debug("After splitting, segment list is %s with size %s" %
                  (self.segment_list, len(self.segment_list)))

    if len(self.segment_list) == 1:
        # there were no jumps, so there's nothing to do
        logging.info("No jumps, nothing to filter")
        return

    start_segment_idx = self.find_start_segment(self.segment_list)
    self.segment_list[start_segment_idx].state = Segment.State.GOOD
    self.mark_segment_states(start_segment_idx, SmoothZigzag.Direction.RIGHT)
    self.mark_segment_states(start_segment_idx, SmoothZigzag.Direction.LEFT)

    unknown_segments = [
        segment for segment in self.segment_list
        if segment.state == Segment.State.UNKNOWN
    ]
    logging.debug("unknown_segments = %s" % unknown_segments)
    # Internal invariant: both passes together must classify every segment
    assert len(unknown_segments) == 0, \
        "Found %s unknown segments - early termination of loop?" % len(
            unknown_segments)

    bad_segments = [
        segment for segment in self.segment_list
        if segment.state == Segment.State.BAD
    ]
    logging.debug("bad_segments = %s" % bad_segments)
    for segment in bad_segments:
        self.inlier_mask_[segment.start:segment.end] = False

    # Convert to a plain ndarray before np.nonzero rather than passing the
    # pandas Series to numpy directly.
    logging.debug("after setting values, outlier_mask = %s" %
                  np.nonzero(np.logical_not(self.inlier_mask_).to_numpy()))

    # TODO: This is not the right place for this - adds too many dependencies
    # Should do this in the outer class in general so that we can do
    # multiple passes of any filtering algorithm
    import emission.analysis.intake.cleaning.cleaning_methods.speed_outlier_detection as cso
    import emission.analysis.intake.cleaning.location_smoothing as ls

    recomputed_speeds_df = ls.recalc_speed(
        self.with_speeds_df[self.inlier_mask_])
    recomputed_threshold = cso.BoxplotOutlier(
        ignore_zeros=True).get_threshold(recomputed_speeds_df)

    # Remaining outliers are only reported. An assert on this same condition
    # was previously here and has been disabled.
    remaining_outliers_df = recomputed_speeds_df[
        recomputed_speeds_df.speed > recomputed_threshold]
    if remaining_outliers_df.shape[0] != 0:
        logging.info("After first round, still have outliers %s" %
                     remaining_outliers_df)
def filter_jumps(user_id, section_id):
    """
    Find the jump points of one section and record them in the time series.

    The section's "background/filtered_location" points are loaded. If the
    non-null values of their ``filter`` column are exactly ``["distance"]``
    the section is treated as iOS and passed through _ios_fill_fake_data
    first. get_points_to_filter then selects the points to drop, and the
    non-null ``_id`` values of those points are stored in a new
    "analysis/smoothing" entry.

    :param user_id: the user whose time series is read and written
    :param section_id: the section whose points are examined
    :return: none. Nothing is stored when get_points_to_filter returns None.
    """
    logging.debug("filter_jumps(%s, %s) called" % (user_id, section_id))

    outlier_algo = eaico.BoxplotOutlier()
    time_query = esda.get_time_query_for_trip_like(esda.RAW_SECTION_KEY,
                                                   section_id)
    time_series = esta.TimeSeries.get_time_series(user_id)
    section_points_df = time_series.get_data_df(
        "background/filtered_location", time_query)

    distinct_filters = section_points_df["filter"].dropna().unique().tolist()
    is_ios = distinct_filters == ["distance"]
    if is_ios:
        logging.debug("Found iOS section, filling in gaps with fake data")
        section_points_df = _ios_fill_fake_data(section_points_df)

    filtering_algo = eaicj.SmoothZigzag(is_ios, DEFAULT_SAME_POINT_DISTANCE)

    logging.debug("len(section_points_df) = %s" % len(section_points_df))
    points_to_ignore_df = get_points_to_filter(section_points_df,
                                               outlier_algo, filtering_algo)
    if points_to_ignore_df is None:
        # Nothing to delete, so no smoothing entry is written
        return

    # Keep only the points that carry an _id. Fuzzed points should not
    # really end up here since they represent 100m in 60 secs, but the log
    # line below shows whether any were dropped at this step.
    ignored_ids = points_to_ignore_df._id.dropna()
    logging.debug("after filtering ignored points, %s -> %s" %
                  (len(points_to_ignore_df), len(ignored_ids)))

    deleted_point_id_list = list(ignored_ids)
    logging.debug("deleted %s points" % len(deleted_point_id_list))

    smoothing = ecws.Smoothresults()
    smoothing.section = section_id
    smoothing.deleted_points = deleted_point_id_list
    smoothing.outlier_algo = "BoxplotOutlier"
    smoothing.filtering_algo = "SmoothZigzag"

    time_series.insert(
        ecwe.Entry.create_entry(user_id, "analysis/smoothing", smoothing))
def testPointFilteringShanghaiJump(self):
    """
    Run SmoothZigzag over the sections of ``self.trip_entries[0]``.

    Only the section at index 2 is expected to lose a point (id
    ``55d8c4837d65cb39ee983cb4``); all other sections must stay intact.
    """
    jump_trip = self.trip_entries[0]
    self.loadPointsForTrip(jump_trip.get_id())
    jump_sections = [
        entry for entry in self.section_entries
        if entry.data.trip_id == jump_trip.get_id()
    ]
    outlier_algo = eaics.BoxplotOutlier()
    jump_algo = eaicj.SmoothZigzag(False, 100)
    separator = "-" * 20

    for i, section_entry in enumerate(jump_sections):
        logging.debug(separator + "Considering section %s" % i + separator)

        section_query = esda.get_time_query_for_trip_like(
            esda.RAW_SECTION_KEY, section_entry.get_id())
        section_df = self.ts.get_data_df("background/filtered_location",
                                         section_query)
        with_speeds_df = eaicl.add_dist_heading_speed(section_df)

        # Logged for reference only; the assertions do not depend on it
        speed_threshold = outlier_algo.get_threshold(with_speeds_df)
        logging.debug("Max speed for section %s = %s" % (i, speed_threshold))

        jump_algo.filter(with_speeds_df)
        logging.debug("Retaining points %s" %
                      np.nonzero(jump_algo.inlier_mask_.to_numpy()))

        to_delete_mask = np.logical_not(jump_algo.inlier_mask_)
        logging.debug("Deleting points %s" %
                      np.nonzero(to_delete_mask.to_numpy()))

        delete_ids = list(with_speeds_df[to_delete_mask]._id)
        logging.debug("Deleting ids %s" % delete_ids)

        # Automated checks. Might be able to remove logging statements later
        if i == 2:
            # The bad section, should have the third point filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 1)
            self.assertEqual([str(point_id) for point_id in delete_ids],
                             ["55d8c4837d65cb39ee983cb4"])
        else:
            # Not the bad section. Should not be filtered
            self.assertEqual(np.count_nonzero(to_delete_mask), 0)
            self.assertEqual(len(delete_ids), 0)