def test_events_table_subset_column_names(columns):
    """Check that ``column_names`` agrees with the columns actually returned.

    ``columns`` is supplied from outside this function (fixture or
    parametrization) and is passed straight through to ``EventTableSubset``.
    """
    subset_query = EventTableSubset(
        start="2016-01-01",
        stop="2016-01-02",
        columns=columns,
        table="events.calls",
    )
    # head(0) fetches no rows but still exposes the result's column labels.
    returned_columns = subset_query.head(0).columns.tolist()
    assert returned_columns == subset_query.column_names
def test_turn_on_caching():
    """
    The dataframe from *.get_dataframe() is retained once caching is turned back on.
    """
    query = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    query.get_dataframe()

    # Toggle caching off and on again before fetching a second time.
    query.turn_off_caching()
    query.turn_on_caching()
    query.get_dataframe()

    retained = query._df
    assert isinstance(retained, pd.DataFrame)
def test_events_table_subscriber_ident_substitutions(ident):
    """Check that the subscriber identifier column is exposed as ``subscriber``.

    ``ident`` is supplied from outside this function and is used both as the
    only requested column and as the ``subscriber_identifier``.
    """
    subset_query = EventTableSubset(
        start="2016-01-01",
        stop="2016-01-02",
        columns=[ident],
        table="events.calls",
        subscriber_identifier=ident,
    )
    # Both the returned frame and the declared column names should use the
    # substituted name.
    first_returned_column = subset_query.head(0).columns[0]
    assert "subscriber" == first_returned_column
    declared_columns = subset_query.column_names
    assert ["subscriber"] == declared_columns
def test_graph():
    """Check that the dependency graph runs and contains an expected node.

    The node looked for is named ``x`` followed by the ``md5`` of an
    ``EventTableSubset`` over 2016-01-01 to 2016-01-02.
    """
    graph = daily_location("2016-01-01").dependency_graph()
    expected_subset = EventTableSubset(
        "2016-01-01",
        "2016-01-02",
        columns=["msisdn", "datetime", "location_id"],
    )
    expected_node = "x{}".format(expected_subset.md5)
    assert expected_node in graph.nodes()
def test_cache_is_returned():
    """
    The cache property reflects whether caching is currently turned on.
    """
    query = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    query.get_dataframe()

    query.turn_on_caching()
    cache_while_on = query.cache
    assert cache_while_on

    query.turn_off_caching()
    cache_while_off = query.cache
    assert not cache_while_off
def test_subset_correct(subscriber_list, get_dataframe):
    """Test that pushed in subsetting matches .subset result.

    Two query types are checked: a plain ``EventTableSubset`` and a
    ``ModalLocation`` built from daily locations. For each, the result of
    passing ``subscriber_subset`` at construction is compared with the result
    of calling ``.subset("subscriber", ...)`` afterwards.

    Parameters
    ----------
    subscriber_list
        Subscribers to restrict both queries to (provided by a fixture).
    get_dataframe
        Callable turning a query into a ``pandas.DataFrame`` (provided by a
        fixture).
    """

    def assert_same_rows(pushed_in, subset_afterwards):
        """Assert both queries return the same rows, ignoring row order."""
        left = get_dataframe(pushed_in)
        right = get_dataframe(subset_afterwards)
        column_order = left.columns.tolist()
        assert column_order == right.columns.tolist()
        # NOTE: ``all(left == right)`` would iterate over the column *labels*
        # of the boolean frame, which are always truthy, so it can never fail
        # on differing values. Compare the actual cell values instead. Rows
        # are sorted first because neither query guarantees an ordering.
        pd.testing.assert_frame_equal(
            left.sort_values(column_order).reset_index(drop=True),
            right.sort_values(column_order).reset_index(drop=True),
        )

    assert_same_rows(
        EventTableSubset(
            start="2016-01-01",
            stop="2016-01-03",
            subscriber_subset=subscriber_list,
        ),
        EventTableSubset(start="2016-01-01", stop="2016-01-03").subset(
            "subscriber", subscriber_list
        ),
    )

    # Both modal locations must be built over the same dates, otherwise they
    # are different queries and are not expected to agree.
    dates = list_of_dates("2016-01-01", "2016-01-03")
    assert_same_rows(
        ModalLocation(
            *[daily_location(d, subscriber_subset=subscriber_list) for d in dates]
        ),
        ModalLocation(*[daily_location(d) for d in dates]).subset(
            "subscriber", subscriber_list
        ),
    )
def test_get_df_without_caching():
    """
    *.get_dataframe() still returns a dataframe after caching is turned off.
    """
    query = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    query.get_dataframe()
    query.turn_off_caching()

    # Fetch twice: repeated retrieval must keep working without the cache.
    for _ in range(2):
        fetched = query.get_dataframe()
        assert isinstance(fetched, pd.DataFrame)
def test_omitted_subscriber_column(get_dataframe, subscriber_list):
    """Check that omitting the subscriber column warns but still returns data.

    The query without a subscriber column must emit a ``UserWarning``, return
    only the requested column, and yield the same ``duration`` values as the
    equivalent query that does include ``msisdn``.
    """
    shared_args = dict(
        start="2016-01-01",
        stop="2016-01-03",
        subscriber_subset=subscriber_list,
    )

    # Only the query lacking a subscriber column is expected to warn.
    with pytest.warns(UserWarning):
        without_subscriber = get_dataframe(
            EventTableSubset(**shared_args, columns=["duration"])
        )

    with_subscriber = get_dataframe(
        EventTableSubset(**shared_args, columns=["msisdn", "duration"])
    )

    durations_without = without_subscriber.duration.values.tolist()
    durations_with = with_subscriber.duration.values.tolist()
    assert durations_without == durations_with
    assert without_subscriber.columns.tolist() == ["duration"]
def test_calculate_dependency_graph():
    """
    Check that calculate_dependency_graph() runs and that the returned graph
    holds the expected node together with its query object.
    """
    graph = calculate_dependency_graph(daily_location("2016-01-01"), analyse=True)
    sd = EventTableSubset(
        start="2016-01-01",
        stop="2016-01-02",
        columns=["msisdn", "datetime", "location_id"],
    )

    # Nodes are keyed by "x" followed by the query id.
    node_key = f"x{sd.query_id}"
    assert node_key in graph.nodes()
    stored_query = graph.nodes[node_key]["query_object"]
    assert stored_query.query_id == sd.query_id
def test_can_subset_by_sampler(get_dataframe):
    """Check that the output of another query can be used as the subscriber subset.

    A random sample of ten unique subscribers is used as ``subscriber_subset``;
    the subscribers in the resulting events must be exactly that sample.
    """
    all_subscribers = UniqueSubscribers("2016-01-01", "2016-01-07")
    sampled_subscribers = all_subscribers.random_sample(
        size=10, sampling_method="system", seed=0.1
    )
    subset_query = EventTableSubset(
        start="2016-01-01",
        stop="2016-01-03",
        subscriber_subset=sampled_subscribers,
    )

    subscribers_in_events = set(get_dataframe(subset_query).subscriber)
    subscribers_in_sample = set(get_dataframe(sampled_subscribers).subscriber)

    assert subscribers_in_events == subscribers_in_sample
    assert len(subscribers_in_events) == 10
def test_turn_off_caching():
    """
    *.turn_off_caching() drops the previously generated dataframe.
    """
    query = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    query.get_dataframe()
    query.turn_off_caching()

    # After turning caching off, the _df attribute should no longer exist.
    with pytest.raises(AttributeError):
        query._df
def test_cdrs_can_be_subset_by_list(get_dataframe, subscriber_list):
    """
    CDRs can be restricted to a list of subscribers.
    """
    subset_query = EventTableSubset(
        start="2016-01-01",
        stop="2016-01-03",
        subscriber_subset=subscriber_list,
    )
    events = get_dataframe(subset_query)

    # The distinct values of the subscriber column are the subscribers that
    # appear in the result; they must be exactly the requested ones.
    subscribers_present = set(events.subscriber)
    assert subscribers_present == set(subscriber_list)
def test_turn_off_caching_handles_error():
    """
    *.turn_off_caching() does not fail when the ._df attribute is missing.
    """
    query = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    query.get_dataframe()
    query.turn_off_caching()
    query.turn_on_caching()
    query.get_dataframe()

    # Remove the stored dataframe by hand, then make sure turning caching off
    # still completes without raising.
    del query._df
    query.turn_off_caching()