def testJul22SplitAroundReboot(self):
    """Process 2016-07-22 and 2016-07-25 in two separate pipeline runs.

    The first day is set up and run through the intake pipeline, then the
    second day's entries are added and the pipeline is run again. The
    geojson returned for each day is compared against that day's
    ``.ground_truth`` file.
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly instead of
        # being left for the garbage collector.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(
            ad.AttrDict({'result': api_result}).result,
            ad.AttrDict(ground_truth).data)

    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # First batch: day 1 on its own
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)

    # Second batch: day 2 added on top of the already-processed day 1
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Although we process the data in two batches, each day should still
    # match its ground truth
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testZeroDurationPlaceInterpolationSingleSync(self):
    """Process 2016-01-12 and 2016-01-13 with a single pipeline run.

    Test for 545114feb5ac15caac4110d39935612525954b71.

    Both days are set up first and the intake pipeline is run only once;
    the geojson for each day is then compared against its
    ``.ground_truth`` file.
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(
            ad.AttrDict({'result': api_result}).result,
            ad.AttrDict(ground_truth).data)

    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-01-12"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-01-13"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 12})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 13})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Set up both days before running the pipeline: this is the
    # single-sync variant, so there is exactly one pipeline run.
    etc.setupRealExample(self, dataFile_1)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testZeroDurationPlaceInterpolationMultiSync(self):
    """Process 2016-01-12 and 2016-01-13 in two separate pipeline runs.

    Test for 545114feb5ac15caac4110d39935612525954b71.

    The first day is set up and run through the intake pipeline, then the
    second day's entries are added and the pipeline is run again. Each
    day's geojson is compared against its ``.ground_truth`` file.
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-01-12"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-01-13"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 12})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 1, 'day': 13})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # First sync: day 1 only
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)

    # Second sync: day 2
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Although we process the data in two batches, we should get the same
    # result as the ground truth for each day
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testJul22SplitAroundReboot(self):
    """Process 2016-07-22 and 2016-07-25 in two separate pipeline runs.

    Day 1 is set up and processed, then day 2's entries are added and the
    pipeline runs again. The geojson for each day must match the
    corresponding ``.ground_truth`` file.
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # First batch: day 1
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)

    # Second batch: day 2
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Although we process the data in two batches, we should get the same
    # result as the ground truth for each day
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testResetToStart(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to start
    - Verify that there is no analysis data
    - Re-run pipelines
    - Verify that all is well
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def geojson_for(local_date):
        return gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)

    def assert_matches(local_date, ground_truth):
        api_result = geojson_for(local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Reset pipeline to start
    epr.reset_user_to_start(self.testUUID, is_dry_run=False)

    # Now there are no results
    self.assertEqual(geojson_for(start_ld_1), [])
    self.assertEqual(geojson_for(start_ld_2), [])

    # Re-run the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should be back to ground truth
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testResetToPast(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to a date before both
    - Verify that analysis data for both days is removed
    - Re-run pipelines
    - Verify that all is well
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def geojson_for(local_date):
        return gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)

    def assert_matches(local_date, ground_truth):
        api_result = geojson_for(local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Verify that all is well
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Reset to a date well before the two days (a full year earlier)
    reset_ts = arrow.get("2015-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Data should be completely deleted
    self.assertEqual(geojson_for(start_ld_1), [])
    self.assertEqual(geojson_for(start_ld_2), [])

    # Re-running the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should reconstruct everything
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testAug10MultiSyncEndNotDetected(self):
    """Replay 2016-08-10 as two syncs, with the first sync cut short.

    This exercises online versus offline analysis and segmentation with
    potentially partial data: entries written up to 10:30 (minus the last
    two) are processed first, the remaining entries afterwards, and the
    final geojson for Aug 9-10 is compared approximately against the
    combined ground truth.
    """
    day_file = "emission/tests/data/real_examples/shankari_2016-08-10"
    first_day = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 9})
    last_day = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
    cache_key = "diary/trips-2016-08-10"

    with open("emission/tests/data/real_examples/shankari_2016-08-910.ground_truth") as truth_fp:
        expected = json.load(truth_fp, object_hook=bju.object_hook)

    db_size = edb.get_timeseries_db().estimated_document_count()
    logging.info("Before loading, timeseries db size = %s" % db_size)

    with open(day_file) as entries_fp:
        all_entries = json.load(entries_fp, object_hook=bju.object_hook)

    ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").timestamp
    converted_back = arrow.get(ts_1030).to("America/Los_Angeles")
    logging.debug("ts_1030 = %s, converted back = %s" % (ts_1030, converted_back))

    # Partition the entries by write timestamp around the 10:30 sync point
    before_1030_entries = []
    after_1030_entries = []
    for entry in all_entries:
        write_ts = ad.AttrDict(entry).metadata.write_ts
        if write_ts <= ts_1030:
            before_1030_entries.append(entry)
        elif write_ts > ts_1030:
            after_1030_entries.append(entry)

    # First load all data from the 9th. Otherwise, the missed trip is the
    # first trip, and we don't set the last_ts_processed.
    # See the code around
    # "logging.debug("len(segmentation_points) == 0, early return")"
    etc.setupRealExample(
        self, "emission/tests/data/real_examples/shankari_2016-08-09")

    # Sync at 10:30 to capture the points on the trip *to* the optometrist,
    # dropping the last two entries so that the trip end is skipped
    self.entries = before_1030_entries[:-2]
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    # Intermediate query; its result is not checked
    gfc.get_geojson_for_dt(self.testUUID, first_day, last_day)

    # Then sync everything written after 10:30
    self.entries = after_1030_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    final_result = gfc.get_geojson_for_dt(self.testUUID, first_day, last_day)
    self.persistGroundTruthIfNeeded(final_result, day_file, first_day, cache_key)

    # Although we process the day's data in two batches, we should get the
    # same result (within the given time/distance fuzz)
    actual = ad.AttrDict({'result': final_result}).result
    self.compare_approx_result(actual,
                               ad.AttrDict(expected).data,
                               time_fuzz=60,
                               distance_fuzz=100)
def setUp(self):
    """Set up one android and one iOS example user for the tests.

    The android example is set up first and its UUID remembered as
    ``self.androidUUID``. ``self.testUUID`` is then replaced by a fixed
    UUID, the iOS entries are set up under it, and the accuracy filter is
    run for the iOS user only.
    """
    etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-aug-27")
    self.androidUUID = self.testUUID

    self.testUUID = uuid.UUID("c76a0487-7e5a-3b17-a449-47be666b36f6")
    # Context manager so the data file handle is closed promptly
    with open("emission/tests/data/real_examples/iphone_2015-11-06") as ios_fp:
        self.entries = json.load(ios_fp, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    self.iosUUID = self.testUUID
    eaicf.filter_accuracy(self.iosUUID)

    logging.debug("androidUUID = %s, iosUUID = %s",
                  self.androidUUID, self.iosUUID)
def setUp(self):
    """Set up one android and one iOS example user, both accuracy-filtered.

    The android example is set up and filtered first. ``self.testUUID`` is
    then replaced by a fixed UUID, the iOS entries are set up under it,
    and the accuracy filter is run for the iOS user as well.
    """
    etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-aug-27")
    self.androidUUID = self.testUUID
    eaicf.filter_accuracy(self.androidUUID)

    self.testUUID = uuid.UUID("c76a0487-7e5a-3b17-a449-47be666b36f6")
    # Context manager so the data file handle is closed promptly
    with open("emission/tests/data/real_examples/iphone_2015-11-06") as ios_fp:
        self.entries = json.load(ios_fp, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    self.iosUUID = self.testUUID
    eaicf.filter_accuracy(self.iosUUID)
def testOct07MultiSyncSpuriousEndDetected(self):
    """Replay the issue_436_assertion_error data for 2016-10-07 as two syncs.

    This exercises online versus offline analysis and segmentation with
    potentially partial data: entries are split by write timestamp at
    2016-10-07T18:33:11-07:00, each half is processed by its own pipeline
    run, and the final geojson is compared approximately against the
    ground truth.
    """
    data_path = "emission/tests/data/real_examples/issue_436_assertion_error"
    the_day = {'year': 2016, 'month': 10, 'day': 7}
    first_day = ecwl.LocalDate(dict(the_day))
    last_day = ecwl.LocalDate(dict(the_day))
    cache_key = "diary/trips-2016-10-07"

    with open(data_path + ".ground_truth") as truth_fp:
        expected = json.load(truth_fp, object_hook=bju.object_hook)

    db_size = edb.get_timeseries_db().estimated_document_count()
    logging.info("Before loading, timeseries db size = %s" % db_size)

    with open(data_path) as entries_fp:
        all_entries = json.load(entries_fp, object_hook=bju.object_hook)

    # Split point between the two syncs. The log label below still says
    # "ts_1800", but the split is at 18:33:11 local time.
    split_ts = arrow.get("2016-10-07T18:33:11-07:00").timestamp
    converted_back = arrow.get(split_ts).to("America/Los_Angeles")
    logging.debug("ts_1800 = %s, converted back = %s" % (split_ts, converted_back))

    # Partition the entries by write timestamp around the split point
    first_batch = []
    second_batch = []
    for entry in all_entries:
        write_ts = ad.AttrDict(entry).metadata.write_ts
        if write_ts <= split_ts:
            first_batch.append(entry)
        elif write_ts > split_ts:
            second_batch.append(entry)

    # First sync: everything written up to the split point (nothing is
    # trimmed from the end here)
    etc.createAndFillUUID(self)
    self.entries = first_batch
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    # Intermediate query; its result is not checked
    gfc.get_geojson_for_dt(self.testUUID, first_day, last_day)

    # Second sync: everything written after the split point
    self.entries = second_batch
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    final_result = gfc.get_geojson_for_dt(self.testUUID, first_day, last_day)
    self.persistGroundTruthIfNeeded(final_result, data_path, first_day, cache_key)

    # Although we process the day's data in two batches, we should get the
    # same result (within the given time/distance fuzz)
    actual = ad.AttrDict({'result': final_result}).result
    self.compare_approx_result(actual,
                               ad.AttrDict(expected).data,
                               time_fuzz=60,
                               distance_fuzz=100)
def setUp(self):
    """Enable accuracy filtering and set up android and iOS example users.

    The value returned by ``etc.set_analysis_config`` is stored in
    ``self.analysis_conf_path``. Both the android and the iOS user are
    run through ``eaicf.filter_accuracy``.
    """
    self.analysis_conf_path = etc.set_analysis_config(
        "intake.cleaning.filter_accuracy.enable", True)

    etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-aug-27")
    self.androidUUID = self.testUUID
    eaicf.filter_accuracy(self.androidUUID)

    self.testUUID = uuid.UUID("c76a0487-7e5a-3b17-a449-47be666b36f6")
    # Context manager so the data file handle is closed promptly
    with open("emission/tests/data/real_examples/iphone_2015-11-06") as ios_fp:
        self.entries = json.load(ios_fp, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    self.iosUUID = self.testUUID
    eaicf.filter_accuracy(self.iosUUID)
def setUp(self):
    """Enable accuracy filtering and set up android and iOS example users.

    The value returned by ``etc.set_analysis_config`` is stored in
    ``self.analysis_conf_path``. Only the iOS user is run through
    ``eaicf.filter_accuracy`` here.
    """
    self.analysis_conf_path = etc.set_analysis_config(
        "intake.cleaning.filter_accuracy.enable", True)

    etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-aug-27")
    self.androidUUID = self.testUUID

    self.testUUID = uuid.UUID("c76a0487-7e5a-3b17-a449-47be666b36f6")
    # Context manager so the data file handle is closed promptly
    with open("emission/tests/data/real_examples/iphone_2015-11-06") as ios_fp:
        self.entries = json.load(ios_fp, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    self.iosUUID = self.testUUID
    eaicf.filter_accuracy(self.iosUUID)

    logging.debug("androidUUID = %s, iosUUID = %s",
                  self.androidUUID, self.iosUUID)
def testResetToFuture(self):
    """
    - Load data for both days
    - Run pipelines
    - Reset to a date after the two
    - Verify that all is well
    - Re-run pipelines and ensure that there are no errors
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(
            ad.AttrDict({'result': api_result}).result,
            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Reset to a date well after the two days (a full year later)
    reset_ts = arrow.get("2017-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Data should be untouched because of early return
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Re-running the pipeline again should not affect anything; this only
    # checks that the run completes without raising
    etc.runIntakePipeline(self.testUUID)
def testFeb22MultiSyncEndNotDetected(self):
    """Replay the iphone_2016-02-22 data as two syncs, the first cut short.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data: entries written up to the
    split point (minus the last two) are processed first, the rest
    afterwards, and the final geojson is compared approximately against
    the ground truth.
    """
    import uuid

    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    dataFile = "emission/tests/data/real_examples/iphone_2016-02-22"
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 2, 'day': 22})
    ground_truth = load_json(dataFile + ".ground_truth")

    # NOTE(review): Collection.count() is deprecated in newer pymongo
    # releases; confirm the installed version before upgrading.
    logging.info("Before loading, timeseries db size = %s",
                 edb.get_timeseries_db().count())
    all_entries = load_json(dataFile)

    # Split at 18:00:30 because the transition was at
    # 2016-02-22T18:00:09.623404-08:00, so right after 18:00
    ts_1800 = arrow.get("2016-02-22T18:00:30-08:00").timestamp
    logging.debug("ts_1800 = %s, converted back = %s",
                  ts_1800, arrow.get(ts_1800).to("America/Los_Angeles"))
    before_1800_entries = [
        e for e in all_entries
        if ad.AttrDict(e).metadata.write_ts <= ts_1800
    ]
    after_1800_entries = [
        e for e in all_entries
        if ad.AttrDict(e).metadata.write_ts > ts_1800
    ]

    # First sync: everything up to the split point, skipping the last two
    # entries to ensure that the trip end is skipped
    self.testUUID = uuid.uuid4()
    self.entries = before_1800_entries[0:-2]
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    # Intermediate query; its result is overwritten below and not checked
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Then sync after 18:00
    self.entries = after_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Although we process the day's data in two batches, we should get the
    # same result (within the given time/distance fuzz)
    self.compare_approx_result(ad.AttrDict({'result': api_result}).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60,
                               distance_fuzz=100)
def testResetToFuture(self):
    """
    - Load data for both days
    - Run pipelines
    - Reset to a date after the two
    - Verify that all is well
    - Re-run pipelines and ensure that there are no errors
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def assert_matches(local_date, ground_truth):
        api_result = gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Reset to a date well after the two days (a full year later)
    reset_ts = arrow.get("2017-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Data should be untouched because of early return
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Re-running the pipeline again should not affect anything; this only
    # checks that the run completes without raising
    etc.runIntakePipeline(self.testUUID)
def testAug10MultiSyncEndDetected(self):
    """Replay 2016-08-10 as two syncs split at 10:30 local time.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data: all entries written up to
    10:30 are processed first, the remaining entries afterwards, and the
    final geojson for Aug 9-10 is compared approximately against the
    combined ground truth.
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    dataFile = "emission/tests/data/real_examples/shankari_2016-08-10"
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 9})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 8, 'day': 10})
    ground_truth = load_json(
        "emission/tests/data/real_examples/shankari_2016-08-910.ground_truth")

    # NOTE(review): Collection.count() is deprecated in newer pymongo
    # releases; confirm the installed version before upgrading.
    logging.info("Before loading, timeseries db size = %s",
                 edb.get_timeseries_db().count())
    all_entries = load_json(dataFile)

    ts_1030 = arrow.get("2016-08-10T10:30:00-07:00").timestamp
    logging.debug("ts_1030 = %s, converted back = %s",
                  ts_1030, arrow.get(ts_1030).to("America/Los_Angeles"))
    before_1030_entries = [e for e in all_entries
                           if ad.AttrDict(e).metadata.write_ts <= ts_1030]
    after_1030_entries = [e for e in all_entries
                          if ad.AttrDict(e).metadata.write_ts > ts_1030]

    # First load all data from the 9th. Otherwise, the missed trip is the
    # first trip, and we don't set the last_ts_processed.
    # See the code around
    # "logging.debug("len(segmentation_points) == 0, early return")"
    etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2016-08-09")

    # Sync at 10:30 to capture all the points on the trip *to* the optometrist
    self.entries = before_1030_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    # Intermediate query; its result is overwritten below and not checked
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Then sync after 10:30
    self.entries = after_1030_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Although we process the day's data in two batches, we should get the
    # same result (within the given time/distance fuzz)
    self.compare_approx_result(ad.AttrDict({'result': api_result}).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60, distance_fuzz=100)
def checkConfirmedTripsAndSections(self, dataFile, ld, preload=False):
    """Run the pipeline with user inputs and check the confirmed trips.

    Args:
        dataFile: path prefix of the example; the suffixes
            ``.ground_truth``, ``.user_inputs`` and
            ``.expected_confirmed_trips`` are appended to find the
            companion files.
        ld: not referenced in this method.
        preload: when true, the user inputs are set up before the single
            pipeline run. Otherwise the pipeline runs once without them,
            the user inputs are set up, and the pipeline runs a second
            time.
    """
    def setup_user_inputs():
        # Context manager so the file handle is closed promptly.
        with open(dataFile + ".user_inputs") as uifp:
            self.entries = json.load(uifp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)

    # The ground truth is not referenced below; loading it still fails the
    # test early if the file is missing or malformed.
    with open(dataFile + ".ground_truth") as gfp:
        ground_truth = json.load(gfp, object_hook=bju.object_hook)

    etc.setupRealExample(self, dataFile)
    if preload:
        setup_user_inputs()
    etc.runIntakePipeline(self.testUUID)
    if not preload:
        setup_user_inputs()
        etc.runIntakePipeline(self.testUUID)

    ts = esta.TimeSeries.get_time_series(self.testUUID)
    confirmed_trips = list(
        ts.find_entries(["analysis/confirmed_trip"], None))
    with open(dataFile + ".expected_confirmed_trips") as dect:
        expected_confirmed_trips = json.load(dect,
                                             object_hook=bju.object_hook)
    self.compare_trip_result(confirmed_trips, expected_confirmed_trips)
def testOct07MultiSyncSpuriousEndDetected(self):
    """Replay the issue_436_assertion_error data for 2016-10-07 as two syncs.

    This tests the effect of online versus offline analysis and
    segmentation with potentially partial data: entries are split by
    write timestamp at 2016-10-07T18:33:11-07:00, each half is processed
    by its own pipeline run, and the final geojson is compared
    approximately against the ground truth.
    """
    import uuid

    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    dataFile = "emission/tests/data/real_examples/issue_436_assertion_error"
    # Plain decimal 7: a leading-zero literal such as 07 is a syntax error
    # on Python 3.
    start_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
    end_ld = ecwl.LocalDate({'year': 2016, 'month': 10, 'day': 7})
    ground_truth = load_json(dataFile + ".ground_truth")

    # NOTE(review): Collection.count() is deprecated in newer pymongo
    # releases; confirm the installed version before upgrading.
    logging.info("Before loading, timeseries db size = %s",
                 edb.get_timeseries_db().count())
    all_entries = load_json(dataFile)

    # Split point between the two syncs. The variable is still called
    # ts_1800, but the split is at 18:33:11 local time.
    ts_1800 = arrow.get("2016-10-07T18:33:11-07:00").timestamp
    logging.debug("ts_1800 = %s, converted back = %s",
                  ts_1800, arrow.get(ts_1800).to("America/Los_Angeles"))
    before_1800_entries = [e for e in all_entries
                           if ad.AttrDict(e).metadata.write_ts <= ts_1800]
    after_1800_entries = [e for e in all_entries
                          if ad.AttrDict(e).metadata.write_ts > ts_1800]

    # First sync: everything written up to the split point (nothing is
    # trimmed from the end here)
    self.testUUID = uuid.uuid4()
    self.entries = before_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    # Intermediate query; its result is overwritten below and not checked
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Second sync: everything written after the split point
    self.entries = after_1800_entries
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, end_ld)

    # Although we process the day's data in two batches, we should get the
    # same result (within the given time/distance fuzz)
    self.compare_approx_result(ad.AttrDict({'result': api_result}).result,
                               ad.AttrDict(ground_truth).data,
                               time_fuzz=60, distance_fuzz=100)
def testResetToTsInMiddleOfPlace(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to a date between the two
    - Verify that analysis data for the second day does not exist
      (the first-day check is pending, see the TODO below)
    - Re-run pipelines
    - Verify that all is well
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def geojson_for(local_date):
        return gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)

    def assert_matches(local_date, ground_truth):
        api_result = geojson_for(local_date)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Reset pipeline to july 23.
    # The timestamp is midnight UTC, which is still the afternoon of the
    # 22nd in Pacific time, so this is partway through the 22nd
    reset_ts = arrow.get("2016-07-23").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # First day is unchanged, except that the last place doesn't have
    # exit data.
    # TODO: Modify ground truth to capture this change. Until then,
    # comparing day 1 against ground_truth_1 at this point is known to
    # fail, so that check is intentionally skipped.

    # Second day does not exist
    api_result = geojson_for(start_ld_2)
    logging.debug(json.dumps(api_result, indent=4, default=bju.default))
    self.assertEqual(api_result, [])

    # Re-run the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should be back to ground truth
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testResetToTsInMiddleOfPlace(self):
    """
    - Load data for both days
    - Run pipelines
    - Verify that all is well
    - Reset to a date between the two
    - Verify that analysis data for the second day does not exist
      (the first-day check is pending, see the TODO below)
    - Re-run pipelines
    - Verify that all is well
    """
    def load_json(path):
        # Context manager so the file handle is closed promptly.
        with open(path) as fp:
            return json.load(fp, object_hook=bju.object_hook)

    def geojson_for(local_date):
        return gfc.get_geojson_for_dt(self.testUUID, local_date, local_date)

    def assert_matches(local_date, ground_truth):
        api_result = geojson_for(local_date)
        self.compare_result(
            ad.AttrDict({'result': api_result}).result,
            ad.AttrDict(ground_truth).data)

    # Load all data
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})
    ground_truth_1 = load_json(dataFile_1 + ".ground_truth")
    ground_truth_2 = load_json(dataFile_2 + ".ground_truth")

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    self.entries = load_json(dataFile_2)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)

    # Reset pipeline to july 23.
    # The timestamp is midnight UTC, which is still the afternoon of the
    # 22nd in Pacific time, so this is partway through the 22nd
    reset_ts = arrow.get("2016-07-23").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # First day is unchanged, except that the last place doesn't have
    # exit data.
    # TODO: Modify ground truth to capture this change. Until then,
    # comparing day 1 against ground_truth_1 at this point is known to
    # fail, so that check is intentionally skipped.

    # Second day does not exist
    api_result = geojson_for(start_ld_2)
    logging.debug(json.dumps(api_result, indent=4, default=bju.default))
    self.assertEqual(api_result, [])

    # Re-run the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should be back to ground truth
    assert_matches(start_ld_1, ground_truth_1)
    assert_matches(start_ld_2, ground_truth_2)
def testResetToTsInMiddleOfTrip(self):
    """
    Reset the pipeline to a timestamp between the two days and check that
    re-running the pipeline restores the ground truth.

    - Load data for both days
    - Run pipelines
    - Verify that both days match their ground truth
    - Reset to a timestamp between the two days
    - Verify that analysis data for the second day does not exist
    - Re-run pipelines
    - Verify that both days match their ground truth again

    NOTE(review): this file contains another definition named
    testResetToTsInMiddleOfTrip further down. If both live in the same
    class, the later definition replaces this one and this copy never
    runs -- confirm and remove one of them.
    """
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})

    # Use context managers so the fixture files are closed as soon as they
    # have been parsed instead of leaking the handles.
    with open(dataFile_1 + ".ground_truth") as gt_file_1:
        ground_truth_1 = json.load(gt_file_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_file_2:
        ground_truth_2 = json.load(gt_file_2, object_hook=bju.object_hook)

    def check_matches_ground_truth(start_ld, ground_truth):
        # Fetch the geojson for a single local date and compare it with
        # the stored ground truth for that date.
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, start_ld)
        self.compare_result(ad.AttrDict({'result': api_result}).result,
                            ad.AttrDict(ground_truth).data)

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as entries_file:
        self.entries = json.load(entries_file, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    check_matches_ground_truth(start_ld_1, ground_truth_1)
    check_matches_ground_truth(start_ld_2, ground_truth_2)

    # Reset pipeline to july 24.
    # The string has no timezone, so this is midnight UTC, i.e. 17:00 PDT
    # (UTC-7) on the 23rd.
    # This will reset in the middle of the untracked time, which is
    # technically a trip, and will allow us to test the trip resetting
    # code
    reset_ts = arrow.get("2016-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Second day does not exist
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    logging.debug(json.dumps(api_result, indent=4, default=bju.default))
    self.assertEqual(api_result, [])

    # Re-run the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should be back to ground truth
    check_matches_ground_truth(start_ld_1, ground_truth_1)
    check_matches_ground_truth(start_ld_2, ground_truth_2)
def testResetToTsInMiddleOfTrip(self):
    """
    Reset the pipeline to a timestamp between the two days and check that
    re-running the pipeline restores the ground truth.

    - Load data for both days
    - Run pipelines
    - Verify that both days match their ground truth
    - Reset to a timestamp between the two days
    - Verify that analysis data for the second day does not exist
    - Re-run pipelines
    - Verify that both days match their ground truth again

    NOTE(review): an earlier definition named testResetToTsInMiddleOfTrip
    appears in this file. If both live in the same class, this later
    definition shadows the earlier one -- confirm and remove one of them.
    """
    dataFile_1 = "emission/tests/data/real_examples/shankari_2016-07-22"
    dataFile_2 = "emission/tests/data/real_examples/shankari_2016-07-25"
    start_ld_1 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 22})
    start_ld_2 = ecwl.LocalDate({'year': 2016, 'month': 7, 'day': 25})

    # Use context managers so the fixture files are closed as soon as they
    # have been parsed instead of leaking the handles.
    with open(dataFile_1 + ".ground_truth") as gt_file_1:
        ground_truth_1 = json.load(gt_file_1, object_hook=bju.object_hook)
    with open(dataFile_2 + ".ground_truth") as gt_file_2:
        ground_truth_2 = json.load(gt_file_2, object_hook=bju.object_hook)

    def check_matches_ground_truth(start_ld, ground_truth):
        # Fetch the geojson for a single local date and compare it with
        # the stored ground truth for that date.
        api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld, start_ld)
        self.compare_result(
            ad.AttrDict({
                'result': api_result
            }).result,
            ad.AttrDict(ground_truth).data)

    # Run both pipelines
    etc.setupRealExample(self, dataFile_1)
    etc.runIntakePipeline(self.testUUID)
    with open(dataFile_2) as entries_file:
        self.entries = json.load(entries_file, object_hook=bju.object_hook)
    etc.setupRealExampleWithEntries(self)
    etc.runIntakePipeline(self.testUUID)

    # Check results: so far, so good
    check_matches_ground_truth(start_ld_1, ground_truth_1)
    check_matches_ground_truth(start_ld_2, ground_truth_2)

    # Reset pipeline to july 24.
    # The string has no timezone, so this is midnight UTC, i.e. 17:00 PDT
    # (UTC-7) on the 23rd.
    # This will reset in the middle of the untracked time, which is
    # technically a trip, and will allow us to test the trip resetting
    # code
    reset_ts = arrow.get("2016-07-24").timestamp
    epr.reset_user_to_ts(self.testUUID, reset_ts, is_dry_run=False)

    # Second day does not exist
    api_result = gfc.get_geojson_for_dt(self.testUUID, start_ld_2, start_ld_2)
    logging.debug(json.dumps(api_result, indent=4, default=bju.default))
    self.assertEqual(api_result, [])

    # Re-run the pipeline again
    etc.runIntakePipeline(self.testUUID)

    # Should be back to ground truth
    check_matches_ground_truth(start_ld_1, ground_truth_1)
    check_matches_ground_truth(start_ld_2, ground_truth_2)