def test_joined_agg_date_mismatch():
    """
    Joining a metric whose date range differs from the locations' range warns.

    Two mismatches are exercised against locations spanning 2016-01-01 to
    2016-01-04: a metric with a different start date and a metric with a
    different stop date. Each join must emit a UserWarning.
    """
    locations = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    mismatched_ranges = (
        ("2016-01-02", "2016-01-04"),
        ("2016-01-01", "2016-01-05"),
    )
    for start, stop in mismatched_ranges:
        with pytest.warns(UserWarning):
            locations.join_aggregate(RadiusOfGyration(start, stop))
def test_dropna(get_length):
    """
    Check that rows with NA values are kept when dropna=False.

    A custom query limited to one row for a single msisdn is combined with a
    radius of gyration metric in a feature collection built with dropna=False.
    """
    start, stop = "2016-01-01", "2016-01-03"
    msisdn = "1vGR8kp342yxEpwY"
    sql_template = """ select msisdn as subscriber, 2 as val from events.calls where msisdn = '{}' limit 1 """
    sql = sql_template.format(msisdn)
    single_row_query = CustomQuery(sql, ["subscriber"])
    features = feature_collection(
        [single_row_query, RadiusOfGyration(start, stop)], dropna=False
    )
    # Usually, without dropna=False, this query would only return a single
    # row. We check that this is not the case.
    assert get_length(features) > 1
def test_joined_agg_hours_mismatch():
    """
    Test that join aggregate with mismatched hours doesn't warn.

    The locations start at "2016-01-01 10:00" while the metric starts at
    "2016-01-01"; joining them must not record any warning.
    """
    mfl = MostFrequentLocation("2016-01-01 10:00", "2016-01-04", level="admin3")
    with warnings.catch_warnings(record=True) as w:
        # catch_warnings inherits whatever filters are active when it is
        # entered. Force UserWarning to always be recorded so that an
        # inherited "ignore"/"once" filter cannot let this test pass vacuously.
        warnings.simplefilter("always", UserWarning)
        mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04"))
    assert not w
def test_histogram_param_value_errors(param_name, param_value, expected_exception):
    """
    An invalid HistogramAggregation argument raises a matching ValueError.

    Starts from the arguments bins=10 and a radius of gyration metric, sets
    the argument named by param_name to param_value, and expects construction
    to raise a ValueError whose message matches expected_exception.
    """
    metric = RadiusOfGyration("2016-01-01", "2016-01-02")
    args = dict(bins=10, metric=metric)
    args.update({param_name: param_value})
    with pytest.raises(ValueError, match=expected_exception):
        HistogramAggregation(**args)
def test_can_get_item_subscriber_metric(get_dataframe):
    """
    flowmachine.SubscriberFeature allows for getting items.

    Indexing a radius of gyration query with one subscriber must yield a
    result containing only that subscriber.
    """
    rog = RadiusOfGyration("2016-01-01", "2016-01-03")
    dl = daily_location("2016-01-03")
    # Pick the fourth subscriber among the first eight daily-location rows.
    first_eight = get_dataframe(dl).head(8)
    single_subscriber = list(first_eight.subscriber)[3]
    selected = get_dataframe(rog[single_subscriber])
    assert set(selected.subscriber) == {single_subscriber}
def setUp(self):
    """
    Prepare the metric, the subset bounds and both forms of the subset.

    ``rog_df`` is the metric's dataframe filtered in pandas to rows whose
    ``rog`` lies between ``low`` and ``high``; ``sub`` is the corresponding
    numeric subset query built from the same bounds.
    """
    self.rog = RadiusOfGyration("2016-01-01", "2016-01-02")
    self.low, self.high = 150, 155
    in_range = "{low} <= rog <= {high}".format(low=self.low, high=self.high)
    self.rog_df = self.rog.get_dataframe().query(in_range)
    self.sub = self.rog.numeric_subset(col="rog", low=self.low, high=self.high)
def test_call_with_str_raises_error():
    """
    Numeric subset rejects a string passed as either bound.

    A string ``low`` and a string ``high`` are each expected to raise a
    TypeError.
    """
    rog = RadiusOfGyration("2016-01-01", "2016-01-02")
    bad_bounds = (
        {"low": "foo", "high": 1},
        {"low": 1, "high": "bar"},
    )
    for bounds in bad_bounds:
        with pytest.raises(TypeError):
            rog.numeric_subset(col="value", **bounds)
def test_joined_aggregate(self):
    """
    Test join aggregate.

    Joins radius of gyration onto admin3 most frequent locations and checks
    the aggregated ``rog`` value of the row named "Rasuwa".
    """
    mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    joined = mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04"))
    # Use the label-based .loc indexer: .ix is deprecated and was removed
    # in pandas 1.0.
    by_name = joined.get_dataframe().set_index("name")
    self.assertAlmostEqual(by_name.loc["Rasuwa"].rog, 199.956021886114)
def test_joined_aggregate(get_dataframe):
    """
    Test join aggregate.

    Joins radius of gyration onto admin3 most frequent locations and checks
    the aggregated ``rog`` value of the row named "Rasuwa".
    """
    mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    joined = mfl.join_aggregate(RadiusOfGyration("2016-01-01", "2016-01-04"))
    # Use the label-based .loc indexer: .ix is deprecated and was removed
    # in pandas 1.0.
    by_name = get_dataframe(joined).set_index("name")
    assert pytest.approx(199.956021886114) == by_name.loc["Rasuwa"].rog
def test_joined_aggregate(get_dataframe):
    """
    Check the joined aggregate value for one admin3 location.

    Radius of gyration is joined onto most frequent locations, and the
    ``rog`` value of the row with pcod "524 2 05 29" is compared against the
    expected figure.
    """
    locations = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    metric = RadiusOfGyration("2016-01-01", "2016-01-04")
    joined = locations.join_aggregate(metric)
    expected = pytest.approx(203.12391560786)
    by_pcod = get_dataframe(joined).set_index("pcod")
    assert expected == by_pcod.loc["524 2 05 29"].rog
def test_can_numsubset_with_inf(get_dataframe):
    """
    flowmachine.RadiusOfGyration can be subset between -Inf and Inf.

    The numeric subset with infinite bounds must equal the metric's dataframe
    filtered in pandas with the same bounds.
    """
    rog = RadiusOfGyration("2016-01-01", "2016-01-02")
    low, high = float("-inf"), float("inf")
    unbounded = rog.numeric_subset(col="value", low=low, high=high)
    sub = get_dataframe(unbounded)
    in_range = "{low} <= value <= {high}".format(low=low, high=high)
    df = get_dataframe(rog).query(in_range)
    pd.testing.assert_frame_equal(sub, df)
def _flowmachine_query_obj(self):
    """
    Build the flowmachine RadiusOfGyration query for this object.

    The query is constructed from this object's ``start_date``, ``end_date``
    and ``subscriber_subset`` attributes.

    Returns
    -------
    Query
    """
    query_kwargs = dict(
        start=self.start_date,
        stop=self.end_date,
        subscriber_subset=self.subscriber_subset,
    )
    return RadiusOfGyration(**query_kwargs)
def test_joined_median_aggregate(self):
    """
    Test join with median aggregate.

    The median computed by the joined aggregate for the row named "Rasuwa"
    is compared with the median computed in pandas from the per-subscriber
    metric joined to the per-subscriber locations.
    """
    mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    rog = RadiusOfGyration("2016-01-01", "2016-01-04")
    joined = mfl.join_aggregate(rog, method="median")
    # Use the label-based .loc indexer throughout: .ix is deprecated and was
    # removed in pandas 1.0.
    rog_by_subscriber = rog.get_dataframe().set_index("subscriber")
    mfl_by_subscriber = mfl.get_dataframe().set_index("subscriber")
    rawus_avg = (
        rog_by_subscriber.join(mfl_by_subscriber)
        .set_index("name")
        .loc["Rasuwa"]
        .rog.median()
    )
    self.assertAlmostEqual(
        joined.get_dataframe().set_index("name").loc["Rasuwa"].rog, rawus_avg
    )
def test_can_numsubset_with_low_and_high(get_dataframe):
    """
    flowmachine.RadiusOfGyration can be subset within a range.

    The numeric subset between 150 and 155 must equal the metric's dataframe
    filtered in pandas with the same bounds, both indexed by subscriber.
    """
    rog = RadiusOfGyration("2016-01-01", "2016-01-02")
    low, high = 150, 155
    in_range = "{low} <= value <= {high}".format(low=low, high=high)
    rog_df = get_dataframe(rog).query(in_range).set_index("subscriber")
    subset_query = rog.numeric_subset(col="value", low=low, high=high)
    sub = get_dataframe(subset_query).set_index("subscriber")
    pd.testing.assert_frame_equal(sub, rog_df)
def test_create_histogram_using_int_bins_value(get_dataframe):
    """
    Create a histogram from a single integer bins value.

    The counts and bin edges are compared with numpy's histogram of the same
    metric values, and the counts must sum to the number of metric rows.
    """
    n_bins = 5
    radius_of_gyration = RadiusOfGyration("2016-01-01", "2016-01-02")
    agg = HistogramAggregation(metric=radius_of_gyration, bins=n_bins, censor=False)
    df = get_dataframe(agg)
    metric_values = get_dataframe(radius_of_gyration).value
    numpy_histogram, numpy_bins = np.histogram(metric_values, bins=n_bins)
    edges = numpy_bins.tolist()
    assert df.value.sum() == len(get_dataframe(radius_of_gyration))
    assert numpy_histogram.tolist() == df.value.tolist()
    assert edges[:-1] == pytest.approx(df.lower_edge.tolist())
    assert edges[1:] == pytest.approx(df.upper_edge.tolist())
def _unsampled_query_obj(self):
    """
    Build the flowmachine RadiusOfGyration query for this object.

    The query is constructed from this object's ``start_date``, ``end_date``,
    ``event_types``, ``subscriber_subset`` and ``hours`` attributes.

    Returns
    -------
    Query
    """
    query_kwargs = dict(
        start=self.start_date,
        stop=self.end_date,
        table=self.event_types,
        subscriber_subset=self.subscriber_subset,
        hours=self.hours,
    )
    return RadiusOfGyration(**query_kwargs)
def test_create_histogram_using_list_of_bins_values(get_dataframe):
    """
    Create a histogram from an explicit list of bin edges.

    The counts and bin edges are compared with numpy's histogram of the same
    metric values using the same edges.
    """
    bin_edges = (10, 20, 30, 40, 50, 60)
    radius_of_gyration = RadiusOfGyration("2016-01-01", "2016-01-02")
    # Each consumer receives its own fresh list of edges.
    agg = HistogramAggregation(
        metric=radius_of_gyration, bins=list(bin_edges), censor=False
    )
    df = get_dataframe(agg)
    metric_values = get_dataframe(radius_of_gyration).value
    numpy_histogram, numpy_bins = np.histogram(metric_values, bins=list(bin_edges))
    edges = numpy_bins.tolist()
    assert numpy_histogram.tolist() == df.value.tolist()
    assert edges[:-1] == pytest.approx(df.lower_edge.tolist())
    assert edges[1:] == pytest.approx(df.upper_edge.tolist())
def test_num_subset_can_be_stored(get_dataframe):
    """
    Test that flowmachine.NumericSubset can be stored.

    After storing, a freshly built subset with the same bounds must have the
    same number of rows as the metric's dataframe filtered in pandas.
    """
    rog = RadiusOfGyration("2016-01-01", "2016-01-02")
    low, high = 150, 155
    in_range = "{low} <= value <= {high}".format(low=low, high=high)
    rog_df = get_dataframe(rog).query(in_range)

    stored_sub = rog.numeric_subset(col="value", low=low, high=high)
    stored_sub.store().result()
    assert stored_sub.is_stored

    # Test that the store is of the right length
    fresh_sub = rog.numeric_subset(col="value", low=low, high=high)
    assert len(get_dataframe(fresh_sub)) == len(rog_df)
def test_all_above_threshold(get_dataframe):
    """
    Test that values are not returned where there are not enough people in
    the aggregate.

    The redacted joined spatial aggregate must return at least one row, and
    none of its pcods may appear in the daily location aggregate's numeric
    subset taken with low=0 and high=15.
    """
    joined = JoinedSpatialAggregate(
        locations=daily_location("2016-01-01"),
        metric=RadiusOfGyration("2016-01-01", "2016-01-02"),
    )
    redacted = RedactedJoinedSpatialAggregate(joined_spatial_aggregate=joined)
    in_agg = get_dataframe(redacted).pcod
    assert len(in_agg) > 0

    location_counts = daily_location("2016-01-01").aggregate()
    small_counts = location_counts.numeric_subset(col="value", low=0, high=15)
    under_15 = get_dataframe(small_counts).pcod
    assert set(under_15).isdisjoint(in_agg)
def test_create_histogram_using_bins_and_range_values(get_dataframe):
    """
    Create a histogram from an integer bins value and a range.

    The counts and bin edges are compared with numpy's histogram of the same
    metric values using the same bins and range.
    """
    n_bins = 5
    value_range = (130.00, 230.00)
    radius_of_gyration = RadiusOfGyration("2016-01-01", "2016-01-02")
    agg = HistogramAggregation(
        metric=radius_of_gyration, bins=n_bins, range=value_range, censor=False
    )
    df = get_dataframe(agg)
    metric_values = get_dataframe(radius_of_gyration).value
    numpy_histogram, numpy_bins = np.histogram(
        metric_values, bins=n_bins, range=value_range
    )
    edges = numpy_bins.tolist()
    assert numpy_histogram.tolist() == df.value.tolist()
    assert edges[:-1] == pytest.approx(df.lower_edge.tolist())
    assert edges[1:] == pytest.approx(df.upper_edge.tolist())
def test_joined_median_aggregate(get_dataframe):
    """
    Check the median joined aggregate against a pandas computation.

    For pcod "524 2 05 29", the value from the joined aggregate built with
    method="median" must approximately equal the median computed in pandas
    from the per-subscriber metric joined to the per-subscriber locations.
    """
    target_pcod = "524 2 05 29"
    mfl = MostFrequentLocation("2016-01-01", "2016-01-04", level="admin3")
    rog = RadiusOfGyration("2016-01-01", "2016-01-04")
    joined = mfl.join_aggregate(rog, method="median")

    rog_by_subscriber = get_dataframe(rog).set_index("subscriber")
    mfl_by_subscriber = get_dataframe(mfl).set_index("subscriber")
    per_location = rog_by_subscriber.join(mfl_by_subscriber).set_index("pcod")
    rawus_avg = per_location.loc[target_pcod].rog.median()

    joined_by_pcod = get_dataframe(joined).set_index("pcod")
    assert pytest.approx(rawus_avg) == joined_by_pcod.loc[target_pcod].rog, rawus_avg
def test_collects_metrics():
    """
    flowmachine.feature_collection can be built from a list of metric objects.

    The collection's column names must be "subscriber" followed by one value
    column per metric, in the order the metrics were given.
    """
    start, stop = "2016-01-01", "2016-01-03"
    metric_classes = (RadiusOfGyration, NocturnalEvents, SubscriberDegree)
    metrics = [metric_class(start, stop) for metric_class in metric_classes]
    expected_columns = [
        "subscriber",
        "value_radiusofgyration_0",
        "value_nocturnalevents_1",
        "value_subscriberdegree_2",
    ]
    collection = feature_collection(metrics)
    assert expected_columns == collection.column_names
def test_create_histogram_using_bins_list_and_range_values(get_dataframe):
    """
    Create a histogram from a list of bin edges together with a range
    (checking for consistency with numpy).
    """
    bin_edges = (10, 20, 30, 40, 50, 60)
    value_range = (130.00, 230.00)
    radius_of_gyration = RadiusOfGyration("2016-01-01", "2016-01-02")
    # Each consumer receives its own fresh list of edges.
    agg = HistogramAggregation(
        metric=radius_of_gyration,
        bins=list(bin_edges),
        range=value_range,
        censor=False,
    )
    df = get_dataframe(agg)
    metric_values = get_dataframe(radius_of_gyration).value
    numpy_histogram, numpy_bins = np.histogram(
        metric_values, bins=list(bin_edges), range=value_range
    )
    edges = numpy_bins.tolist()
    assert numpy_histogram.tolist() == df.value.tolist()
    assert edges[:-1] == pytest.approx(df.lower_edge.tolist())
    assert edges[1:] == pytest.approx(df.upper_edge.tolist())
def test_query_can_be_subscriber_set_restricted(
    subscriber_list_table, subscriber_list, get_dataframe
):
    """
    Queries can be limited to a subset of subscribers.

    A radius of gyration query and a modal location built from daily
    locations, all restricted to subscriber_list_table, must each return
    exactly the subscribers in subscriber_list.
    """
    rog = RadiusOfGyration(
        "2016-01-01", "2016-01-03", subscriber_subset=subscriber_list_table
    )
    daily_locations = [
        daily_location(day, subscriber_subset=subscriber_list_table)
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    hl = ModalLocation(*daily_locations)
    rog_df = get_dataframe(rog)
    hl_df = get_dataframe(hl)

    # Get the set of subscribers present in each dataframe; we need to handle
    # the logic of msisdn_from/msisdn_to.
    expected_subscribers = set(subscriber_list)
    assert set(rog_df.subscriber) == expected_subscribers
    assert set(hl_df.subscriber) == expected_subscribers
def test_query_can_be_subscriber_set_restricted(self):
    """
    Test that some queries can be limited to only a subset of subscribers.

    A temporary ``subscriber_list`` table is filled from
    ``self.subscriber_list`` and used as the subscriber subset for a radius
    of gyration query and a home location query. Both results must contain
    exactly the listed subscribers. The table is dropped even if building or
    running the queries fails.
    """
    # Create a temporary table in the DB
    con = Table.connection.engine
    drop_sql = "DROP TABLE IF EXISTS subscriber_list"
    con.execute(drop_sql)
    try:
        con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
        formatted_subscribers = ",".join(
            "('{}')".format(u) for u in self.subscriber_list
        )
        sql = """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
            formatted_subscribers
        )
        con.execute(sql)
        rog = RadiusOfGyration(
            "2016-01-01", "2016-01-03", subscriber_subset=Table("subscriber_list")
        )
        hl = HomeLocation(
            *[
                daily_location(d, subscriber_subset=Table("subscriber_list"))
                for d in list_of_dates("2016-01-01", "2016-01-03")
            ]
        )
        rog_df = rog.get_dataframe()
        hl_df = hl.get_dataframe()
    finally:
        # Always remove the temporary table so that a failure above does not
        # leave it behind in the database.
        con.execute(drop_sql)
    # Get the set of subscribers present in the dataframe, we need to handle
    # the logic of msisdn_from/msisdn_to
    calculated_subscriber_set = set(rog_df.subscriber)
    self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
    calculated_subscriber_set = set(hl_df.subscriber)
    self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
def setUp(self):
    """
    Prepare the queries and subscriber samples used by the tests.

    ``subscriber_list`` holds the subscribers from the first eight rows of
    the daily location query; ``single_subscriber`` is the fourth of them.
    """
    self.dl = daily_location("2016-01-03")
    self.rog = RadiusOfGyration("2016-01-01", "2016-01-03")
    first_eight = self.dl.head(8)
    self.subscriber_list = list(first_eight.subscriber)
    self.single_subscriber = self.subscriber_list[3]