def test_event_score_column_names(exemplar_level_param):
    """EventScore's column_names property should agree with head(0)'s columns."""
    level = exemplar_level_param["level"]
    if level not in JoinToLocation.allowed_levels:
        pytest.skip(f"{level} not valid for this test")
    event_score = EventScore(
        start="2016-01-01", stop="2016-01-05", **exemplar_level_param
    )
    assert event_score.column_names == event_score.head(0).columns.tolist()
def test_whether_score_that_do_not_cover_domain_raises(get_dataframe):
    """
    Test whether scoring rules that do not cover the whole domain is an error.
    """
    # Neither rule covers its full domain (all 24 hours / all 7 weekdays),
    # so each construction must raise.
    for incomplete_scorer in ({"score_hour": {0: 0}}, {"score_dow": {"monday": 0}}):
        with pytest.raises(ValueError):
            EventScore(start="2016-01-01", stop="2016-01-05", **incomplete_scorer)
def test_meaningful_locations_od_results(get_dataframe):
    """
    Test that OD on MeaningfulLocations returns expected results and counts
    clusters per subscriber correctly.
    """
    # FIXME: Because of the nature of the test data, we can't actually test much
    # for most admin levels because the counts will always be below 15, and
    # hence get redacted!
    def meaningful_locations_for(start, stop):
        # Build one day of meaningful locations at versioned-site resolution.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=labels,
            label="unknown",
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=meaningful_locations_for("2016-01-01", "2016-01-02"),
        meaningful_locations_b=meaningful_locations_for("2016-01-02", "2016-01-03"),
        spatial_unit=make_spatial_unit("admin", level=1),
    )
    mfl_od_df = get_dataframe(mfl_od)
    # Aggregate should not include any counts below 15
    assert all(mfl_od_df.total > 15)
    # Smoke test one admin1 region gets the expected result
    region_mask = (mfl_od_df.pcod_from == "524 1") & (mfl_od_df.pcod_to == "524 4")
    assert mfl_od_df[region_mask].total[0] == pytest.approx(16.490_807)
    assert mfl_od_df.total.sum() == pytest.approx(350.806_012)
def test_meaningful_locations_od_results(get_dataframe, meaningful_locations_labels):
    """
    Test that OD on MeaningfulLocations returns expected results and counts
    clusters per subscriber correctly.
    """
    def build_meaningful_locations(start, stop):
        # One day of meaningful locations at versioned-site resolution.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label="unknown",
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=build_meaningful_locations("2016-01-01", "2016-01-02"),
        meaningful_locations_b=build_meaningful_locations("2016-01-02", "2016-01-03"),
        spatial_unit=make_spatial_unit("admin", level=1),
    )
    mfl_od_df = get_dataframe(mfl_od)
    # Smoke test one admin1 region gets the expected result
    region_mask = (mfl_od_df.pcod_from == "524 1") & (mfl_od_df.pcod_to == "524 4")
    regional_flow = mfl_od_df[region_mask].value.tolist()[0]
    assert regional_flow == pytest.approx(16.490_807)
    assert mfl_od_df.value.sum() == pytest.approx(454.0)
def test_meaningful_locations_od_redaction(get_dataframe, meaningful_locations_labels):
    """
    Test that OD on MeaningfulLocations is redacted to >15.
    """
    def build_meaningful_locations(start, stop):
        # One day of meaningful locations at versioned-site resolution.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label="unknown",
        )

    redacted_od = RedactedMeaningfulLocationsOD(
        meaningful_locations_od=MeaningfulLocationsOD(
            meaningful_locations_a=build_meaningful_locations(
                "2016-01-01", "2016-01-02"
            ),
            meaningful_locations_b=build_meaningful_locations(
                "2016-01-02", "2016-01-03"
            ),
            spatial_unit=make_spatial_unit("admin", level=1),
        )
    )
    redacted_df = get_dataframe(redacted_od)
    # Redaction must strip every count of 15 or fewer
    assert all(redacted_df.value > 15)
def test_column_names_meaningful_locations_od(
    exemplar_spatial_unit_param, get_column_names_from_run, meaningful_locations_labels
):
    """Test that column_names property matches head(0) for an od matrix between meaningful locations"""
    if not exemplar_spatial_unit_param.is_polygon:
        pytest.xfail(
            f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for ODs between MeaningfulLocations."
        )

    def build_meaningful_locations(label):
        # Same one-day window for both sides of the OD; only the label differs.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start="2016-01-01",
                        stop="2016-01-02",
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start="2016-01-01",
                stop="2016-01-02",
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label=label,
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=build_meaningful_locations("evening"),
        meaningful_locations_b=build_meaningful_locations("unknown"),
        spatial_unit=exemplar_spatial_unit_param,
    )
    assert get_column_names_from_run(mfl_od) == mfl_od.column_names
def test_whether_zero_score_returns_only_zero(get_dataframe):
    """
    Test whether passing a scoring rule where all events are scored with 0
    returns only 0 scores.
    """
    days_of_week = (
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
    )
    zero_scored = EventScore(
        start="2016-01-01",
        stop="2016-01-05",
        score_hour=[0 for _ in range(24)],
        score_dow={day: 0 for day in days_of_week},
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    score_columns = get_dataframe(zero_scored)[["score_hour", "score_dow"]]
    # With an all-zero rule, every produced score must be exactly 0.
    assert (score_columns == 0).all().all()
def test_out_of_bounds_score_raises(scorer, out_of_bounds_val, flowmachine_connect):
    """
    Test whether passing a scoring rule which is out of bounds errors.
    """
    weekdays = {
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
    }
    scorers = {
        "score_hour": dict.fromkeys(range(24), 0),
        "score_dow": dict.fromkeys(weekdays, 0),
    }
    # Swap one (arbitrary) entry of the targeted rule for the bad value.
    bad_key, _ = scorers[scorer].popitem()
    scorers[scorer][bad_key] = out_of_bounds_val
    with pytest.raises(ValueError):
        EventScore(
            start="2016-01-01",
            stop="2016-01-05",
            spatial_unit=make_spatial_unit("versioned-site"),
            **scorers,
        )
def test_column_names_meaningful_locations_aggregate(
    exemplar_level_param, get_column_names_from_run
):
    """Test that column_names property matches head(0) for aggregated meaningful locations"""
    level = exemplar_level_param["level"]
    if level not in MeaningfulLocationsAggregate.allowed_levels:
        pytest.xfail(
            f'The level "{level}" is not supported as an aggregation unit for MeaningfulLocations.'
        )
    mfl_agg = MeaningfulLocationsAggregate(
        meaningful_locations=MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=subscriber_locations(
                        start="2016-01-01", stop="2016-01-02", level="versioned-site"
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start="2016-01-01", stop="2016-01-02", level="versioned-site"
            ),
            labels=labels,
            label="evening",
        ),
        **exemplar_level_param,
    )
    assert get_column_names_from_run(mfl_agg) == mfl_agg.column_names
def test_meaningful_locations_aggregation_results(exemplar_level_param, get_dataframe):
    """
    Test that aggregating MeaningfulLocations returns expected results and
    redacts values below 15.
    """
    level = exemplar_level_param["level"]
    if level not in MeaningfulLocationsAggregate.allowed_levels:
        pytest.xfail(
            f'The level "{level}" is not supported as an aggregation unit for MeaningfulLocations.'
        )
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )
    aggregated = MeaningfulLocationsAggregate(
        meaningful_locations=meaningful, **exemplar_level_param
    )
    meaningful_df = get_dataframe(meaningful)
    aggregated_df = get_dataframe(aggregated)
    # Aggregate should not include any counts below 15
    assert all(aggregated_df.total > 15)
    # Sum of aggregate should be less than the number of unique subscribers
    assert aggregated_df.total.sum() < meaningful_df.subscriber.nunique()
def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises(
    meaningful_locations_labels,
):
    """Test that a bad spatial unit raises an InvalidSpatialUnitError"""
    with pytest.raises(InvalidSpatialUnitError):
        MeaningfulLocationsAggregate(
            meaningful_locations=MeaningfulLocations(
                clusters=HartiganCluster(
                    calldays=CallDays(
                        subscriber_locations=SubscriberLocations(
                            start="2016-01-01",
                            stop="2016-01-02",
                            spatial_unit=make_spatial_unit("versioned-site"),
                        )
                    ),
                    radius=1,
                ),
                scores=EventScore(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                ),
                labels=meaningful_locations_labels,
                label="evening",
            ),
            # lon-lat is the disallowed aggregation unit under test here.
            spatial_unit=make_spatial_unit("lon-lat"),
        )
def test_whether_passing_reserved_label_fails():
    """
    Test whether passing the reserved label 'unknown' fails.
    """
    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-05",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    # 'unknown' is reserved, so labelling with it must raise.
    reserved_labels = {
        "unknown": {
            "type": "Polygon",
            "coordinates": [
                [
                    [-1.1, -1.1],
                    [-1.0, 1.1],
                    [1.1, 1.1],
                    [1.1, -1.1],
                    [-1.1, -1.1],
                ]
            ],
        }
    }
    with pytest.raises(ValueError):
        LabelEventScore(scores=scores, labels=reserved_labels)
def test_constructor_overlaps_bounds_dict_raises():
    """
    Should raise an error if an overlap is found in constructor.
    """
    # These labelled score regions overlap, which the constructor must reject.
    overlapping_bounds = {
        "evening": [
            {
                "hour_lower_bound": 0,
                "hour_upper_bound": 1,
                "day_of_week_upper_bound": 1,
                "day_of_week_lower_bound": 0.5,
            },
            {
                "hour_lower_bound": 0.00001,
                "hour_upper_bound": 1,
                "day_of_week_upper_bound": -0.5,
                "day_of_week_lower_bound": -1,
            },
        ],
        "day": [
            {
                "hour_lower_bound": -1,
                "hour_upper_bound": 0,
                "day_of_week_upper_bound": 0.5,
                "day_of_week_lower_bound": -0.5,
            }
        ],
    }
    with pytest.raises(ValueError):
        LabelEventScore(
            scores=EventScore(start="2016-01-01", stop="2016-01-05"),
            labels=overlapping_bounds,
        )
def test_existing_enumerated_type_initialization_fails():
    """
    Tests whether initializing an existing enumerated type in the database with extra
    arguments fail.
    """
    es = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    # First construction with the 'location_type' enum name succeeds.
    ls = LabelEventScore(
        es,
        {
            "evening": "(score_hour > 0) AND (score_dow > 0.5 OR score_dow < -0.5)",
            "daytime": "(score_hour < 0) AND (score_dow < 0.5 AND score_dow > -0.5)",
        },
        "location_type",
        "evening",
    )
    # Re-using the same enum name with an extra label ('new_label') must raise.
    # NOTE(review): the "(score_hour > 1" expression is also missing a closing
    # paren -- presumably the ValueError comes from the extra label, not the
    # malformed expression; confirm against LabelEventScore's validation order.
    with pytest.raises(ValueError):
        ls = LabelEventScore(
            es,
            {
                "evening": "(score_hour > 0) AND (score_dow > 0.5 OR score_dow < -0.5)",
                "daytime": "(score_hour < 0) AND (score_dow < 0.5 AND score_dow > -0.5)",
                "new_label": "(score_hour > 1",
            },
            "location_type",
            "evening",
        )
    # The second construction raised before rebinding `ls`, so this runs the
    # *first* LabelEventScore and checks it is still usable.
    ls.head()
def test_meaningful_locations_aggregation_results(
    exemplar_spatial_unit_param, get_dataframe, meaningful_locations_labels
):
    """
    Test that aggregating MeaningfulLocations returns expected results and
    redacts values below 15.
    """
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=meaningful_locations_labels,
        label="evening",
    )
    redacted_aggregate = RedactedMeaningfulLocationsAggregate(
        meaningful_locations_aggregate=MeaningfulLocationsAggregate(
            meaningful_locations=meaningful,
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    redacted_df = get_dataframe(redacted_aggregate)
    # Aggregate should not include any counts below 15
    assert all(redacted_df.value > 15)
def test_join_returns_the_same_clusters():
    """
    Test whether joining to another table for which the start and stop time are
    the same yields the same clusters.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    hartigan = HartiganCluster(calldays=call_days, radius=50)
    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    sort_columns = ["subscriber", "rank", "calldays"]
    # Same date window on both sides, so the join should be a no-op on clusters.
    joined = (
        hartigan.join_to_cluster_components(scores)
        .to_geopandas()
        .sort_values(sort_columns)
        .reset_index(drop=True)
    )
    clusters = hartigan.to_geopandas().sort_values(sort_columns).reset_index(drop=True)
    compare_columns = ["subscriber", "geometry", "rank", "calldays"]
    assert (joined[compare_columns] == clusters[compare_columns]).all().all()
def test_meaningful_locations_od_raises_for_bad_spatial_unit(
    exemplar_spatial_unit_param, get_dataframe
):
    """
    Test that od on meaningful locations raises an InvalidSpatialUnitError for a
    bad spatial unit.
    """
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label="evening",
    )
    # lon-lat is not a valid OD aggregation unit, so construction must raise.
    with pytest.raises(InvalidSpatialUnitError):
        MeaningfulLocationsOD(
            meaningful_locations_a=meaningful,
            meaningful_locations_b=meaningful,
            spatial_unit=make_spatial_unit("lon-lat"),
        )
def test_meaningful_locations_aggregation_results(
    exemplar_spatial_unit_param, get_dataframe
):
    """
    Test that aggregating MeaningfulLocations returns expected results and
    redacts values below 15.
    """
    if not exemplar_spatial_unit_param.is_polygon:
        pytest.xfail(
            f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for MeaningfulLocations."
        )
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label="evening",
    )
    aggregated = MeaningfulLocationsAggregate(
        meaningful_locations=meaningful, spatial_unit=exemplar_spatial_unit_param
    )
    meaningful_df = get_dataframe(meaningful)
    aggregated_df = get_dataframe(aggregated)
    # Aggregate should not include any counts below 15
    assert all(aggregated_df.total > 15)
    # Sum of aggregate should be less than the number of unique subscribers
    assert aggregated_df.total.sum() < meaningful_df.subscriber.nunique()
def test_meaningful_locations_results(
    label, expected_number_of_clusters, get_dataframe
):
    """
    Test that MeaningfulLocations returns expected results and counts clusters
    per subscriber correctly.
    """
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label=label,
    )
    meaningful_df = get_dataframe(meaningful)
    assert len(meaningful_df) == expected_number_of_clusters
    per_subscriber = meaningful_df.groupby(
        ["subscriber", "label", "n_clusters"], as_index=False
    ).count()
    # Check that query has correctly counted the number of clusters per subscriber
    assert all(per_subscriber.n_clusters == per_subscriber.cluster)
def test_labelled_event_score_column_names(
    exemplar_level_param, get_column_names_from_run
):
    """LabelEventScore's column_names should match the columns from a run."""
    level = exemplar_level_param["level"]
    if level not in JoinToLocation.allowed_levels:
        pytest.skip(f"{level} not valid for this test")
    scores = EventScore(start="2016-01-01", stop="2016-01-05", **exemplar_level_param)
    labelled = LabelEventScore(scores=scores, required="evening")
    assert get_column_names_from_run(labelled) == labelled.column_names
def test_labelled_event_score_column_names(
    exemplar_spatial_unit_param, get_column_names_from_run
):
    """LabelEventScore's column_names should match the columns from a run."""
    labelled = LabelEventScore(
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-05",
            spatial_unit=exemplar_spatial_unit_param,
        ),
        required="evening",
    )
    assert get_column_names_from_run(labelled) == labelled.column_names
def test_joined_hartigan_column_names():
    """Test that Hartigan has correct column_names property."""
    call_days = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    joined = HartiganCluster(call_days, 50).join_to_cluster_components(scores)
    assert joined.column_names == joined.head(0).columns.tolist()
def test_constructor_raises_value_error(bad_bound):
    """Constructor should raise a ValueError when given a malformed bound."""
    dummy_scores = EventScore(start="2016-01-01", stop="2016-01-05")
    with pytest.raises(ValueError):
        LabelEventScore(scores=dummy_scores, labels={"DUMMY_LABEL": [bad_bound]})
def test_whether_scores_are_within_score_bounds(get_dataframe):
    """
    Test whether the scores are within the bounds of maximum and minimum scores.
    """
    scores_df = get_dataframe(
        EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    )
    score_columns = scores_df[["score_hour", "score_dow"]]
    # Both score columns should lie within [-1, 1].
    assert all(score_columns.max() <= [1, 1])
    assert all(score_columns.min() >= [-1, -1])
def test_whether_passing_reserved_label_fails():
    """
    Test whether passing the reserved label 'unknown' fails.
    """
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    # 'unknown' is reserved, so labelling with it must raise.
    with pytest.raises(ValueError):
        LabelEventScore(scores, {"unknown": "(score_hour >= -1)"}, "location_type")
def test_whether_required_label_relabels(get_dataframe):
    """
    Test whether required label relabel the location of subscribers who did not
    originally have the required label.
    """
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    labelled = LabelEventScore(
        scores, {"daytime": "(score_hour >= -1)"}, "location_type", "evening"
    )
    label_values = get_dataframe(labelled)["label"].unique()
    # With 'evening' required, every row should end up labelled 'evening'.
    assert list(label_values) == ["evening"]
def test_locations_are_labelled_correctly(get_dataframe):
    """
    Test whether locations are labelled correctly.
    """
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    # The 'daytime' rule covers every score, so it should be the only label.
    labelled = LabelEventScore(
        scores, {"daytime": "(score_hour >= -1)"}, "location_type"
    )
    label_values = get_dataframe(labelled)["label"].unique()
    assert list(label_values) == ["daytime"]
def test_joined_hartigan_column_names(get_column_names_from_run):
    """Test that Hartigan has correct column_names property."""
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    joined = HartiganCluster(calldays=call_days, radius=50).join_to_cluster_components(
        scores
    )
    assert get_column_names_from_run(joined) == joined.column_names
def test_whether_score_that_do_not_cover_domain_return_null(get_dataframe):
    """
    Test whether scoring rules that do not cover the whole domain return null
    values.
    """
    scores_df = get_dataframe(
        EventScore(
            start="2016-01-01",
            stop="2016-01-05",
            score_hour={(7, 9): 0},
            score_dow={(1, 2): 0},
        )
    )
    # Every score must be either null (outside the covered range) or zero.
    score_columns = scores_df[["score_hour", "score_dow"]]
    null_or_zero = score_columns.apply(lambda col: col.isnull() | (col == 0))
    assert all(null_or_zero.all())
def test_joined_hartigan_cluster_bad_query_column_names_raises_error():
    """
    Test that joining a HartiganCluster to a query without 'site_id' and
    'version' columns raises an error.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    hartigan = HartiganCluster(calldays=call_days, radius=50)
    # A lon-lat EventScore lacks the site_id/version columns the join needs.
    lonlat_scores = EventScore(
        start="2016-01-01",
        stop="2016-01-04",
        spatial_unit=make_spatial_unit("lon-lat"),
    )
    with pytest.raises(ValueError):
        hartigan.join_to_cluster_components(lonlat_scores)