def test_avoids_searching_extra_tables(self):
    """
    The plan reported by EventTableSubset().explain() for a two-day
    window must not mention the calls_20160103 partition.
    """
    query_plan = EventTableSubset("2016-01-01", "2016-01-02").explain()
    self.assertNotIn("calls_20160103", query_plan)
def test_cdrs_can_be_subset_by_table(self):
    """
    We can subset CDRs by a table in the database.

    A scratch ``subscriber_list`` table is created and filled from
    ``self.subscriber_list``, then used as the ``subscriber_subset`` of an
    EventTableSubset. The scratch table is dropped again even when the
    query raises, so a failure here cannot leak state into other tests.
    """
    con = Table.connection.engine
    drop_sql = "DROP TABLE IF EXISTS subscriber_list"

    # Start from a clean slate in case an earlier run left the table behind.
    con.execute(drop_sql)
    try:
        con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
        formatted_subscribers = ",".join(
            "('{}')".format(u) for u in self.subscriber_list
        )
        con.execute(
            """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
                formatted_subscribers
            )
        )
        su = EventTableSubset(
            "2016-01-01",
            "2016-01-03",
            subscriber_subset=Table("subscriber_list"),
        )
        df = su.get_dataframe()
    finally:
        # Always remove the scratch table, whether or not the query worked.
        con.execute(drop_sql)

    # Get the set of subscribers present in the dataframe, we need to handle
    # the logic of msisdn_from/msisdn_to
    calculated_subscriber_set = set(df.subscriber)
    self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
def test_stores_view(flowmachine_connect):
    """
    Calling to_sql() with ``as_view=True`` makes the query show up among
    the views of the ``tests`` schema.
    """
    EventTableSubset("2016-01-01", "2016-01-01 01:00:00").to_sql(
        schema="tests", name="test_view", as_view=True
    )
    view_names = flowmachine_connect.inspector.get_view_names(schema="tests")
    assert "test_view" in view_names
def test_events_table_subset_column_names(columns):
    """The column_names property agrees with the columns of head(0)."""
    subset = EventTableSubset(
        "2016-01-01",
        "2016-01-02",
        columns=columns,
        tables=["events.calls"],
    )
    returned_columns = subset.head(0).columns.tolist()
    assert returned_columns == subset.column_names
def test_avoids_searching_extra_tables(get_dataframe):
    """
    The plan reported by EventTableSubset().explain() for a two-day
    window must not mention the calls_20160103 partition.
    """
    query_plan = EventTableSubset("2016-01-01", "2016-01-02").explain()
    assert "calls_20160103" not in query_plan
def test_dataframe_has_column_names(self):
    """
    The dataframe of an EventTableSubset() carries the expected columns
    (compared in sorted order).
    """
    frame = EventTableSubset("2016-01-01", "2016-01-02").get_dataframe()
    actual_columns = sorted(frame.columns)
    self.assertEqual(actual_columns, self.expected_columns)
def test_head_has_column_names(self):
    """
    The head() of an EventTableSubset() carries the expected columns
    (compared in sorted order).
    """
    preview = EventTableSubset("2016-01-01", "2016-01-02").head()
    actual_columns = sorted(preview.columns)
    self.assertEqual(actual_columns, self.expected_columns)
def test_stores_table(flowmachine_connect):
    """
    Calling to_sql() without ``as_view`` makes the query show up among
    the tables of the ``tests`` schema.
    """
    EventTableSubset("2016-01-01", "2016-01-01 01:00:00").to_sql(
        schema="tests", name="test_table"
    )
    table_names = flowmachine_connect.inspector.get_table_names(schema="tests")
    assert "test_table" in table_names
def test_error_on_all_missing():
    """
    Constructing a subset whose whole date range is missing raises
    MissingDateError, with and without an explicit ``table`` argument.
    """
    for extra_kwargs in ({}, {"table": "events.topups"}):
        with pytest.raises(MissingDateError):
            EventTableSubset("2016-05-01", "2016-05-02", **extra_kwargs)
def test_error_on_all_missing(self):
    """
    Date subsetter should error when all dates are missing.

    Checked both for the default table selection and with an explicit
    ``table="events.topups"`` argument.
    """
    # The constructor is expected to raise, so its result is never bound.
    with self.assertRaises(MissingDateError):
        EventTableSubset("2016-05-01", "2016-05-02")
    with self.assertRaises(MissingDateError):
        EventTableSubset("2016-05-01", "2016-05-02", table="events.topups")
def test_explain(self):
    """
    EventTableSubset().explain() returns a string, both by default and
    with ``analyse=True``.
    """
    # Usually not a critical function, so simply check the return type.
    # assertIsInstance is the idiomatic type check and reports the actual
    # type on failure.
    sd = EventTableSubset("2016-01-01", "2016-01-02")
    self.assertIsInstance(sd.explain(), str)
    self.assertIsInstance(sd.explain(analyse=True), str)
def test_explain(get_dataframe):
    """
    EventTableSubset().explain() returns a string, both by default and
    with ``analyse=True``.
    """
    # Not a critical function, so only the return type is checked.
    subset = EventTableSubset("2016-01-01", "2016-01-02")
    plain_plan = subset.explain()
    assert isinstance(plain_plan, str)
    analysed_plan = subset.explain(analyse=True)
    assert isinstance(analysed_plan, str)
def test_events_table_subscriber_ident_substitutions(ident):
    """The requested subscriber identifier column is exposed as 'subscriber'."""
    subset = EventTableSubset(
        "2016-01-01",
        "2016-01-02",
        columns=[ident],
        tables=["events.calls"],
        subscriber_identifier=ident,
    )
    first_column = subset.head(0).columns[0]
    assert first_column == "subscriber"
    assert subset.column_names == ["subscriber"]
def test_can_subset_by_hour(self):
    """
    EventTableSubset() can subset by a range of hours.

    With ``hours=(12, 17)`` the returned events must span exactly four
    hours between the earliest and latest hour, and every requested day
    must still be represented.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(12, 17))
    df = sd.get_dataframe()
    hours = df.datetime.apply(lambda x: x.hour)
    hour_span = hours.max() - hours.min()
    self.assertEqual(hour_span, 4)

    # Also check that all the dates are still there. `in` on a pandas
    # Series tests the index labels, not the values, so compare against
    # the set of day values instead.
    days_present = set(df.datetime.apply(lambda x: x.day))
    for day in (1, 2, 3):
        self.assertIn(day, days_present)
def test_can_subset_by_sampler(self):
    """A random sample of another query can be used as the subscriber subset."""
    sample_query = UniqueSubscribers("2016-01-01", "2016-01-07").random_sample(
        size=10, method="system", seed=0.1
    )
    events = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=sample_query
    )
    event_subscribers = set(events.get_dataframe().subscriber)
    sampled_subscribers = set(sample_query.get_dataframe().subscriber)
    self.assertSetEqual(event_subscribers, sampled_subscribers)
    self.assertEqual(len(event_subscribers), 10)
def test_handles_backwards_dates(self):
    """
    If the subscriber passes hours that are 'backwards' this will be
    interpreted as spanning midnight.

    With ``hours=(20, 5)`` only hours 20-23 and 0-4 may appear, and every
    requested day must still be represented.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(20, 5))
    df = sd.get_dataframe()
    hours = df.datetime.apply(lambda x: x.hour)
    unique_hours = sorted(hours.unique())
    self.assertEqual([0, 1, 2, 3, 4, 20, 21, 22, 23], unique_hours)

    # Also check that all the dates are still there. `in` on a pandas
    # Series tests the index labels, not the values, so compare against
    # the set of day values instead.
    days_present = set(df.datetime.apply(lambda x: x.day))
    for day in (1, 2, 3):
        self.assertIn(day, days_present)
def test_cdrs_can_be_subset_by_list(self):
    """
    Passing a plain list as ``subscriber_subset`` restricts the events to
    exactly those subscribers.
    """
    events = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=self.subscriber_list
    )
    # The dataframe exposes a single `subscriber` column, so the
    # msisdn_from/msisdn_to logic does not have to be handled here.
    found_subscribers = set(events.get_dataframe().subscriber)
    expected_subscribers = set(self.subscriber_list)
    self.assertEqual(found_subscribers, expected_subscribers)
def test_warns_on_missing():
    """
    A date range that is only partly available triggers a UserWarning
    describing how many dates are missing.
    """
    expected_warning = (
        "115 of 122 calendar dates missing. "
        "Earliest date is 2016-01-01, latest is 2016-01-07"
    )
    with pytest.warns(UserWarning, match=expected_warning):
        EventTableSubset("2016-01-01", "2016-05-02")
def test_warns_on_missing(self):
    """
    A date range that is only partly available triggers a UserWarning
    describing how many dates are missing.
    """
    expected_warning = (
        "115 of 122 calendar dates missing. "
        "Earliest date is 2016-01-01, latest is 2016-01-07"
    )
    with self.assertWarnsRegex(UserWarning, expected_warning):
        EventTableSubset("2016-01-01", "2016-05-02")
def test_handles_dates(self):
    """
    Date-only bounds are honoured: every event lies strictly between
    midnight UTC on 2016-01-01 and midnight UTC on 2016-01-02.
    """
    utc = pytz.timezone("Etc/UTC")
    timestamps = EventTableSubset("2016-01-01", "2016-01-02").get_dataframe()[
        "datetime"
    ]
    earliest = timestamps.min().to_pydatetime()
    latest = timestamps.max().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1))
    upper_bound = utc.localize(datetime(2016, 1, 2))
    self.assertGreater(earliest.timestamp(), lower_bound.timestamp())
    self.assertLess(latest.timestamp(), upper_bound.timestamp())
def test_handles_mins(self):
    """
    Bounds that include a time of day are honoured: every event lies
    strictly between 2016-01-01 13:30:30 UTC and 2016-01-02 16:25:00 UTC.
    """
    utc = pytz.timezone("Etc/UTC")
    timestamps = EventTableSubset(
        "2016-01-01 13:30:30", "2016-01-02 16:25:00"
    ).get_dataframe()["datetime"]
    earliest = timestamps.min().to_pydatetime()
    latest = timestamps.max().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1, 13, 30, 30))
    upper_bound = utc.localize(datetime(2016, 1, 2, 16, 25, 0))
    self.assertGreater(earliest.timestamp(), lower_bound.timestamp())
    self.assertLess(latest.timestamp(), upper_bound.timestamp())
def test_subset_correct(self):
    """
    Test that pushed in subsetting matches .subset result.

    Checked for a plain EventTableSubset and for a HomeLocation built
    from daily locations. Both sides of each comparison use the same date
    range, so the only difference is where the subscriber subset is
    applied.
    """

    def _canonical(df):
        # Sort on every column and renumber so the comparison does not
        # depend on the order in which rows were returned.
        return df.sort_values(list(df.columns)).reset_index(drop=True)

    def _assert_same_rows(left, right):
        # `all(left == right)` would iterate the column labels of the
        # comparison frame and always be true; DataFrame.equals compares
        # the actual contents.
        self.assertTrue(_canonical(left).equals(_canonical(right)))

    su = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=self.subscriber_list
    )
    subsu = EventTableSubset("2016-01-01", "2016-01-03").subset(
        "subscriber", self.subscriber_list
    )
    _assert_same_rows(su.get_dataframe(), subsu.get_dataframe())

    # NOTE(review): the two sides originally used different end dates
    # (2016-01-07 vs 2016-01-03), which the vacuous assertion was hiding;
    # confirm that the seven-day range is the intended one.
    dates = list_of_dates("2016-01-01", "2016-01-07")
    su = HomeLocation(
        *[
            daily_location(d, subscriber_subset=self.subscriber_list)
            for d in dates
        ]
    )
    subsu = HomeLocation(*[daily_location(d) for d in dates]).subset(
        "subscriber", self.subscriber_list
    )
    _assert_same_rows(su.get_dataframe(), subsu.get_dataframe())
def test_ommitted_subscriber_column(self):
    """
    Omitting the subscriber column while subsetting by subscriber still
    returns a result, and raises a UserWarning.
    """
    date_range = ("2016-01-01", "2016-01-03")

    with self.assertWarns(UserWarning):
        without_subscriber = EventTableSubset(
            *date_range,
            subscriber_subset=self.subscriber_list,
            columns=["duration"],
        ).get_dataframe()

    with_subscriber = EventTableSubset(
        *date_range,
        subscriber_subset=self.subscriber_list,
        columns=["msisdn", "duration"],
    ).get_dataframe()

    omitted_durations = without_subscriber.duration.values.tolist()
    full_durations = with_subscriber.duration.values.tolist()
    self.assertListEqual(omitted_durations, full_durations)
    self.assertListEqual(without_subscriber.columns.tolist(), ["duration"])
def test_subset_correct(subscriber_list, get_dataframe):
    """
    Test that pushed in subsetting matches .subset result.

    Checked for a plain EventTableSubset and for a ModalLocation built
    from daily locations. Both sides of each comparison use the same date
    range, so the only difference is where the subscriber subset is
    applied.
    """

    def _canonical(df):
        # Sort on every column and renumber so the comparison does not
        # depend on the order in which rows were returned.
        return df.sort_values(list(df.columns)).reset_index(drop=True)

    def _same_rows(left_query, right_query):
        # `all(left == right)` would iterate the column labels of the
        # comparison frame and always be true; DataFrame.equals compares
        # the actual contents.
        left = _canonical(get_dataframe(left_query))
        right = _canonical(get_dataframe(right_query))
        return left.equals(right)

    su = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=subscriber_list
    )
    subsu = EventTableSubset("2016-01-01", "2016-01-03").subset(
        "subscriber", subscriber_list
    )
    assert _same_rows(su, subsu)

    # NOTE(review): the two sides originally used different end dates
    # (2016-01-07 vs 2016-01-03), which the vacuous assertion was hiding;
    # confirm that the seven-day range is the intended one.
    dates = list_of_dates("2016-01-01", "2016-01-07")
    su = ModalLocation(
        *[daily_location(d, subscriber_subset=subscriber_list) for d in dates]
    )
    subsu = ModalLocation(*[daily_location(d) for d in dates]).subset(
        "subscriber", subscriber_list
    )
    assert _same_rows(su, subsu)
def test_omitted_subscriber_column(get_dataframe, subscriber_list):
    """
    Omitting the subscriber column while subsetting by subscriber still
    returns a result, and raises a UserWarning.
    """
    date_range = ("2016-01-01", "2016-01-03")

    with pytest.warns(UserWarning):
        omitted_query = EventTableSubset(
            *date_range,
            subscriber_subset=subscriber_list,
            columns=["duration"],
        )
        without_subscriber = get_dataframe(omitted_query)

    full_query = EventTableSubset(
        *date_range,
        subscriber_subset=subscriber_list,
        columns=["msisdn", "duration"],
    )
    with_subscriber = get_dataframe(full_query)

    omitted_durations = without_subscriber.duration.values.tolist()
    full_durations = with_subscriber.duration.values.tolist()
    assert omitted_durations == full_durations
    assert without_subscriber.columns.tolist() == ["duration"]
def test_default_dates(get_dataframe):
    """
    Leaving out the start or the stop date still yields events inside the
    expected overall window (after 2016-01-01 and before 2016-01-08 UTC).
    """
    utc = pytz.timezone("Etc/UTC")

    # No start date given.
    open_start = get_dataframe(EventTableSubset(None, "2016-01-04"))
    earliest = open_start["datetime"].min().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1, 0, 0, 0))
    assert earliest.timestamp() > lower_bound.timestamp()

    # No stop date given, combined with an hour filter.
    open_stop = get_dataframe(EventTableSubset("2016-01-04", None, hours=(20, 5)))
    latest = open_stop["datetime"].max().to_pydatetime()
    upper_bound = utc.localize(datetime(2016, 1, 8, 0, 0, 0))
    assert latest.timestamp() < upper_bound.timestamp()
def test_can_subset_by_sampler(get_dataframe):
    """A random sample of another query can be used as the subscriber subset."""
    sample_query = UniqueSubscribers("2016-01-01", "2016-01-07").random_sample(
        size=10, method="system", seed=0.1
    )
    events = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=sample_query
    )
    event_subscribers = set(get_dataframe(events).subscriber)
    sampled_subscribers = set(get_dataframe(sample_query).subscriber)
    assert event_subscribers == sampled_subscribers
    assert len(event_subscribers) == 10
def test_can_subset_by_hour(get_dataframe):
    """
    EventTableSubset() can subset by a range of hours.

    With ``hours=(12, 17)`` the returned events must span exactly four
    hours between the earliest and latest hour, and every requested day
    must still be represented.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(12, 17))
    df = get_dataframe(sd)
    hours = df.datetime.apply(lambda x: x.hour)
    hour_span = hours.max() - hours.min()
    assert 4 == hour_span

    # Also check that all the dates are still there. `in` on a pandas
    # Series tests the index labels, not the values, so compare against
    # the set of day values instead.
    days_present = set(df.datetime.apply(lambda x: x.day))
    assert 3 in days_present
    assert 2 in days_present
    assert 1 in days_present
def test_handles_backwards_hours(get_dataframe):
    """
    If the subscriber passes hours that are 'backwards' this will be
    interpreted as spanning midnight.

    With ``hours=(20, 5)`` only hours 20-23 and 0-4 may appear, and every
    requested day must still be represented.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(20, 5))
    df = get_dataframe(sd)
    hours = df.datetime.apply(lambda x: x.hour)
    unique_hours = sorted(hours.unique())
    assert [0, 1, 2, 3, 4, 20, 21, 22, 23] == unique_hours

    # Also check that all the dates are still there. `in` on a pandas
    # Series tests the index labels, not the values, so compare against
    # the set of day values instead.
    days_present = set(df.datetime.apply(lambda x: x.day))
    assert 3 in days_present
    assert 2 in days_present
    assert 1 in days_present
def test_cdrs_can_be_subset_by_table(subscriber_list_table, get_dataframe, subscriber_list):
    """
    Passing a database table as ``subscriber_subset`` restricts the events
    to exactly the subscribers in ``subscriber_list``.
    """
    events = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=subscriber_list_table
    )
    # The dataframe exposes a single `subscriber` column, so the
    # msisdn_from/msisdn_to logic does not have to be handled here.
    found_subscribers = set(get_dataframe(events).subscriber)
    expected_subscribers = set(subscriber_list)
    assert found_subscribers == expected_subscribers