def test_approximate_dfeat_of_need_all_values(entityset):
    """Check approximated and exact values for features built on Percentile.

    Builds ``Sum(Percentile(log.value))`` on sessions (``agg_feat``), sums
    that on customers and pulls it back onto sessions with a
    ``DirectFeature`` (``dfeat``). Both are computed with
    ``approximate=Timedelta(10, 's')`` using a cutoff-time DataFrame, then
    compared (rounded to 3 decimals) against values recomputed by hand from
    the raw log dataframe:

    * ``agg_feat`` is expected to match the sum of percentiles of the
      instance's own session, ranked over log rows before the exact cutoff.
    * ``dfeat`` is expected to match the sum over sessions 0, 1 and 2,
      ranked over log rows before the ``approxes`` timestamps.
    """
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19),
             datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    # NOTE(review): these look like the cutoffs rounded down to the 10 second
    # approximation window -- confirm against the approximate implementation.
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # Take explicit copies: adding a column to a boolean-mask selection
        # is chained assignment and triggers SettingWithCopyWarning.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance,
                                       'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        # presumably sessions 0, 1 and 2 all belong to the customer of the
        # tested sessions; verify against the entityset fixture.
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]),
                                              'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_approximate_dfeat_of_need_all_values(entityset):
    """Check approximated and exact values for features built on Percentile.

    Builds ``Sum(Percentile(log.value))`` on sessions (``agg_feat``), sums
    that on customers and pulls it back onto sessions with a
    ``DirectFeature`` (``dfeat``). Both are computed for instances 0 and 2
    with ``approximate=Timedelta(10, 's')`` and a list of cutoff times, then
    compared (rounded to 3 decimals) against values recomputed by hand from
    the raw log dataframe:

    * ``agg_feat`` is expected to match the sum of percentiles of the
      instance's own session, ranked over log rows before the exact cutoff.
    * ``dfeat`` is expected to match the sum over sessions 0, 1 and 2,
      ranked over log rows before the ``approxes`` timestamps.
    """
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              instance_ids=[0, 2],
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                           datetime(2011, 4, 9, 11, 0, 0)])
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    # NOTE(review): these look like the cutoffs rounded down to the 10 second
    # approximation window -- confirm against the approximate implementation.
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # Take explicit copies: adding a column to a boolean-mask selection
        # is chained assignment and triggers SettingWithCopyWarning.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance,
                                       'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        # presumably sessions 0, 1 and 2 all belong to the customer of the
        # tested sessions; verify against the entityset fixture.
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]),
                                              'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_agg_same_method_name(es):
    """
    Pandas identifies aggregation functions by their function name, so two
    primitives whose functions share a name and are applied to the same
    column cannot be told apart by pandas on its own. The workaround relies
    on the primitive's name property, which is what this test exercises.
    """
    def check_columns(sum_func, max_func):
        # Wrap both callables as primitives named "sum" and "max", apply them
        # to the same column and make sure each gets its own output column.
        SumPrimitive = make_agg_primitive(sum_func,
                                          input_types=[Numeric],
                                          return_type=Numeric,
                                          name="sum")
        MaxPrimitive = make_agg_primitive(max_func,
                                          input_types=[Numeric],
                                          return_type=Numeric,
                                          name="max")
        sum_feature = SumPrimitive(es["log"]["value"], es["customers"])
        max_feature = MaxPrimitive(es["log"]["value"], es["customers"])

        matrix = ft.calculate_feature_matrix([sum_feature, max_feature],
                                             entityset=es)
        expected_columns = [sum_feature.get_name(), max_feature.get_name()]
        assert matrix.columns.tolist() == expected_columns

    # Case 1: normally defined functions that deliberately share one name.
    def custom_primitive(x):
        return x.sum()
    summing_func = custom_primitive

    def custom_primitive(x):  # noqa: F811 -- the name clash is the point
        return x.max()
    maximizing_func = custom_primitive

    check_columns(summing_func, maximizing_func)

    # Case 2: lambdas, which all share the same function name as well.
    check_columns(lambda x: x.sum(), lambda x: x.max())
def test_make_agg_feat_of_identity_variable(entityset, backend):
    """Sum the raw log ``value`` variable per session; instance 0 must be 50."""
    log_value = entityset['log']['value']
    value_sum = Sum(log_value, parent_entity=entityset['sessions'])

    calculator = backend([value_sum])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    summed_column = result[value_sum.get_name()]
    assert summed_column[0] == 50
def test_make_agg_feat_of_identity_variable(entityset, backend):
    """Aggregate an identity variable (log ``value``) onto sessions with Sum.

    The computed value for session instance 0 is expected to equal 50.
    """
    session_value_total = Sum(entityset['log']['value'],
                              parent_entity=entityset['sessions'])
    feature_name = session_value_total.get_name()

    computed = backend([session_value_total]).calculate_all_features(
        instance_ids=[0], time_last=None)

    first_value = computed[feature_name][0]
    assert first_value == 50
def test_agg_percentile(es):
    """Sum of a Percentile feature per session matches a manual computation.

    ``Sum(Percentile(log.value))`` is calculated on sessions 0 and 1 through
    ``PandasBackend`` and compared element-wise with the same quantity
    derived directly from the log dataframe (rank with ``pct=True``, then a
    group-by sum on ``session_id``). Two values match when they are equal or
    both null.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    # Copy the column selection before adding to it: assigning a new column
    # onto a selection of the entity's dataframe is chained assignment and
    # raises SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_make_agg_feat_of_agg_feat(entityset, backend):
    """Stack a Sum on top of a Count; customer instance 0 must total 10."""
    log_id = entityset['log']['id']
    logs_per_session = Count(log_id, parent_entity=entityset['sessions'])
    logs_per_customer = Sum(logs_per_session,
                            parent_entity=entityset['customers'])

    calculator = backend([logs_per_customer])
    result = calculator.calculate_all_features(instance_ids=[0],
                                               time_last=None)

    summed_column = result[logs_per_customer.get_name()]
    assert summed_column[0] == 10
def test_make_agg_feat_of_agg_feat(entityset, backend):
    """Aggregate an aggregation: Sum (on customers) of Count (on sessions).

    The computed value for customer instance 0 is expected to equal 10.
    """
    session_log_count = Count(entityset['log']['id'],
                              parent_entity=entityset['sessions'])
    customer_total = Sum(session_log_count,
                         parent_entity=entityset['customers'])
    feature_name = customer_total.get_name()

    computed = backend([customer_total]).calculate_all_features(
        instance_ids=[0], time_last=None)

    first_value = computed[feature_name][0]
    assert first_value == 10
def test_agg_percentile(es):
    """Sum of a Percentile feature per session matches a manual computation.

    ``Sum(Percentile(log.value))`` is calculated on sessions 0 and 1 through
    ``PandasBackend`` and compared element-wise with the same quantity
    derived directly from the log dataframe (rank with ``pct=True``, then a
    group-by sum on ``session_id``). Two values match when they are equal or
    both null.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    # Copy the column selection before adding to it: assigning a new column
    # onto a selection of the entity's dataframe is chained assignment and
    # raises SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_same_method_name(es):
    """
    Pandas uses the function name to tell aggregations apart. When two
    primitives wrap functions with the same name and target the same column,
    pandas cannot distinguish them. The workaround is based on the
    primitive's name property, and this test covers it.
    """
    # Normally defined functions that intentionally reuse a single name.
    def custom_primitive(x):
        return x.sum()
    named_sum = custom_primitive

    def custom_primitive(x):  # noqa: F811 -- intentional redefinition
        return x.max()
    named_max = custom_primitive

    function_pairs = [
        (named_sum, named_max),
        # Lambdas also share one function name.
        (lambda x: x.sum(), lambda x: x.max()),
    ]

    for sum_func, max_func in function_pairs:
        SumAgg = make_agg_primitive(sum_func,
                                    input_types=[Numeric],
                                    return_type=Numeric,
                                    name="sum")
        MaxAgg = make_agg_primitive(max_func,
                                    input_types=[Numeric],
                                    return_type=Numeric,
                                    name="max")

        features = [SumAgg(es["log"]["value"], es["customers"]),
                    MaxAgg(es["log"]["value"], es["customers"])]
        matrix = ft.calculate_feature_matrix(features, entityset=es)

        # Each primitive must end up with its own, correctly ordered column.
        assert matrix.columns.tolist() == [f.get_name() for f in features]