def test_isin_feat_other_syntax_int(es):
    """Check `Feature.isin` on the numeric `value` column of the `log` entity."""
    isin_feature = Feature(es['log']['value']).isin([5, 10])
    backend = PandasBackend(es, [isin_feature])
    feature_matrix = backend.calculate_all_features(range(8), None)
    # Of the eight requested rows, only positions 1 and 2 are expected to match.
    expected = [False, True, True, False, False, False, False, False]
    actual = feature_matrix[isin_feature.get_name()].values.tolist()
    assert expected == actual
def test_make_trans_feat(es):
    """Compute an `Hour` transform feature for a single `log` instance."""
    hour = Hour(es['log']['datetime'])
    backend = PandasBackend(es, [hour])
    feature_matrix = backend.calculate_all_features(instance_ids=[0],
                                                    time_last=None)
    hour_column = feature_matrix[hour.get_name()]
    assert hour_column[0] == 10
def test_isin_feat_other_syntax(es):
    """Check `Feature.isin` on the string-valued `product_id` column of `log`."""
    isin_feature = Feature(es['log']['product_id']).isin(
        ["toothpaste", "coke zero"])
    backend = PandasBackend(es, [isin_feature])
    feature_matrix = backend.calculate_all_features(range(8), None)
    expected = [True, True, True, False, False, True, True, True]
    actual = feature_matrix[isin_feature.get_name()].values.tolist()
    assert expected == actual
def test_percentile(es):
    """Compare `Percentile` output with pandas' percentile rank of the same column."""
    value = Feature(es['log']['value'])
    percentile = Percentile(value)
    backend = PandasBackend(es, [value, percentile])
    feature_matrix = backend.calculate_all_features(range(17), None)
    expected = feature_matrix[value.get_name()].rank(pct=True)
    for want, got in zip(expected.values,
                         feature_matrix[percentile.get_name()].values):
        # Two missing values count as a match; otherwise require equality.
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_direct_from_variable(es):
    """Build a `DirectFeature` straight from a parent-entity variable."""
    # Expected to behave the same as test_direct_from_identity.
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])
    backend = PandasBackend(es, [direct])
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 5],
                                                    time_last=None)
    device_types = feature_matrix[direct.get_name()].tolist()
    assert device_types == [0, 1]
def test_diff_single_value(es):
    """A `Diff` computed for one instance yields one row with no non-null value."""
    diff_feature = ft.Feature(
        [es['stores']['num_square_feet'], es['stores'][u'région_id']],
        primitive=Diff)
    backend = PandasBackend(es, [diff_feature])
    feature_matrix = backend.calculate_all_features(instance_ids=[5],
                                                    time_last=None)
    n_rows = feature_matrix.shape[0]
    assert n_rows == 1
    non_null = feature_matrix[diff_feature.get_name()].dropna()
    assert non_null.shape[0] == 0
def test_percentile(es):
    """Check `Percentile` on a subset of instances against a rank over all rows."""
    value = ft.Feature(es['log']['value'])
    percentile = ft.Feature(value, primitive=Percentile)
    backend = PandasBackend(es, [percentile])
    feature_matrix = backend.calculate_all_features(range(10, 17), None)
    # Rank over the entity's full dataframe first, then keep the requested rows.
    full_rank = es['log'].df[value.get_name()].rank(pct=True)
    expected = full_rank.loc[range(10, 17)]
    for want, got in zip(expected.values,
                         feature_matrix[percentile.get_name()].values):
        # Two missing values count as a match; otherwise require equality.
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def check(feature):
    """Assert instance 2 gets the same values whichever other ids are requested.

    `es` is not a parameter; it is read from the enclosing scope.
    """
    backend = PandasBackend(es, [feature])
    first_run = backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    second_run = backend.calculate_all_features(instance_ids=[2, 4],
                                                time_last=None)
    # check that the value for instance id 2 matches
    row_matches = second_run.loc[2] == first_run.loc[2]
    assert row_matches.all()
def test_compare_all_nans(es):
    """Comparing an all-null feature with a constant must never produce True."""
    mode_feature = Mode(es['log']['product_id'], es['sessions'])
    comparison = mode_feature == 'brown bag'
    # Cutoff chosen to fall before all data.
    cutoff = pd.Timestamp('1/1/1993')
    backend = PandasBackend(es, [mode_feature, comparison])
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 1, 2],
                                                    time_last=cutoff)
    mode_non_null = feature_matrix[mode_feature.get_name()].dropna()
    assert mode_non_null.shape[0] == 0
    assert not feature_matrix[comparison.get_name()].any()
def test_not_feature(es):
    """Check the `Not` primitive applied to the `loves_ice_cream` variable."""
    negated = Not(es['customers']['loves_ice_cream'])
    backend = PandasBackend(es, [negated])
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 1],
                                                    time_last=None)
    first, second = feature_matrix[negated.get_name()].values[:2]
    assert not first
    assert second
def test_isin_feat_custom(es):
    """Build an `is_in` primitive with `make_trans_primitive` and compare it
    with the `Feature.isin` syntax on the same data."""

    def pd_is_in(array, list_of_outputs=None):
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    def assert_isin_values(isin_feature, expected):
        # Compute the feature for the first eight log rows and compare.
        backend = PandasBackend(es, [isin_feature])
        feature_matrix = backend.calculate_all_features(range(8), None)
        actual = feature_matrix[isin_feature.get_name()].values.tolist()
        assert expected == actual

    # Custom primitive.
    assert_isin_values(
        IsIn(es['log']['product_id'],
             list_of_outputs=["toothpaste", "coke zero"]),
        [True, True, True, False, False, True, True, True])

    # `Feature.isin` on the same string column.
    assert_isin_values(
        Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"]),
        [True, True, True, False, False, True, True, True])

    # `Feature.isin` on a numeric column.
    assert_isin_values(
        Feature(es['log']['value']).isin([5, 10]),
        [False, True, True, False, False, False, False, False])
def test_cum_count(es):
    """Check `CumCount` of log ids grouped by `session_id`."""
    cum_count = CumCount(es['log']['id'], es['log']['session_id'])
    backend = PandasBackend(es, [cum_count])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_count.get_name()].values
    assert len(actual) == 15
    expected = [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 1, 2, 1, 2, 3]
    for want, got in zip(expected, actual):
        assert want == got
def test_cum_max(es):
    """Check `CumMax` of log values grouped by `session_id`."""
    cum_max = CumMax(es['log']['value'], es['log']['session_id'])
    backend = PandasBackend(es, [cum_max])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_max.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14]
    for want, got in zip(expected, actual):
        assert want == got
def test_direct_percentile(es):
    """Check a direct feature of a customer-level `Percentile` on `sessions`.

    The expected values are rebuilt with pandas: rank the customers' `age`
    column as a percentile and look up the row for customer 0 for each of
    the two requested sessions.
    """
    v = Feature(es['customers']['age'])
    p = Percentile(v)
    d = Feature(p, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)
    # Take an explicit copy before adding a column: assigning onto the bare
    # selection triggers pandas' SettingWithCopyWarning, and the copy makes it
    # unambiguous that the entity's own dataframe is never modified.
    cust_vals = es['customers'].df[[v.get_name()]].copy()
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    # Both requested sessions are compared against customer 0's percentile.
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        # Two missing values count as a match; otherwise require equality.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_percentile(es):
    """Check `Sum` of a log-level `Percentile`, aggregated per session.

    The expected values are rebuilt with pandas: rank the log `value` column
    as a percentile, then sum the ranks per `session_id`.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)
    # Take an explicit copy before adding a column: assigning onto the bare
    # selection triggers pandas' SettingWithCopyWarning, and the copy makes it
    # unambiguous that the entity's own dataframe is never modified.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        # Two missing values count as a match; otherwise require equality.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_cum_mean(es):
    """Check `CumMean` of log values grouped by `session_id`."""
    cum_mean = CumMean(es['log']['value'], es['log']['session_id'])
    backend = PandasBackend(es, [cum_mean])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_mean.get_name()].values
    assert len(actual) == 15
    expected = [0, 2.5, 5, 7.5, 10, 0, .5, 1, 1.5, 0, 0, 2.5, 0, 3.5, 7]
    for want, got in zip(expected, actual):
        assert want == got
def test_percentile_agg(es):
    """Check `Percentile` applied on top of a per-session `Sum` aggregation."""
    value = ft.Feature(es['log']['value'])
    session_sum = ft.Feature(value, parent_entity=es['sessions'], primitive=Sum)
    percentile_of_sum = ft.Feature(session_sum, primitive=Percentile)
    backend = PandasBackend(es, [percentile_of_sum])
    feature_matrix = backend.calculate_all_features([0, 1], None)

    # Rebuild the expectation with pandas: sum per session, rank, pick rows.
    log_frame = es['log'].df[[value.get_name(), 'session_id']]
    per_session = log_frame.groupby('session_id')[value.get_name()].sum()
    expected = per_session.fillna(0).rank(pct=True)[[0, 1]]

    for want, got in zip(expected.values,
                         feature_matrix[percentile_of_sum.get_name()].values):
        # Two missing values count as a match; otherwise require equality.
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_isnull_feat(es):
    """Check `IsNull` stacked on a per-session `Diff` of log values."""
    log_value = IdentityFeature(es['log']['value'])
    value_diff = Diff(log_value, es['log']['session_id'])
    is_null = IsNull(value_diff)
    backend = PandasBackend(es, [is_null])
    feature_matrix = backend.calculate_all_features(range(15), None)
    # Diff values these flags are based on (kept from the original note):
    # [np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7]
    expected = [
        True, False, False, False, False,
        True, False, False, False,
        True,
        True, False,
        True, False, False,
    ]
    actual = feature_matrix[is_null.get_name()].values.tolist()
    assert expected == actual
def test_override_cmp_from_variable(es):
    """Check the overloaded `>` operator on an identity feature."""
    greater_than_one = IdentityFeature(es['log']['value']) > 1
    expected_flags = [False, True, True]
    backend = PandasBackend(es, [greater_than_one])
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 1, 2],
                                                    time_last=None)
    actual = feature_matrix[greater_than_one.get_name()].values.tolist()
    for position, expected in enumerate(expected_flags):
        assert actual[position] == expected
def test_cum_sum_use_previous(es):
    """Check `CumSum` grouped by session with a 3-observation `use_previous`."""
    window = Timedelta(3, 'observations', entity=es['log'])
    cum_sum = CumSum(es['log']['value'], es['log']['session_id'],
                     use_previous=window)
    backend = PandasBackend(es, [cum_sum])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_sum.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21]
    for want, got in zip(expected, actual):
        assert want == got
def test_direct_agg_percentile(es):
    """Check a direct feature of a customer-level `Sum` of log `Percentile`s.

    The expected values are rebuilt with pandas: rank the log `value` column
    as a percentile, attach a hand-written customer id to each log row, sum
    the ranks per customer, and look up customer 0 for both requested
    sessions. Values are compared after rounding to three decimals.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)
    # Take an explicit copy before adding columns: assigning onto the bare
    # selection triggers pandas' SettingWithCopyWarning, and the copy makes it
    # unambiguous that the entity's own dataframe is never modified.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    # Hard-coded log-row -> customer mapping (17 rows in total).
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    # Both requested sessions are compared against customer 0's total.
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_cum_sum_where(es):
    """Check `CumSum` grouped by a direct customer id with a `where` filter."""
    log_value = es['log']['value']
    value_above_three = GreaterThan(log_value, 3)
    customer_of_log = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(log_value, customer_of_log, where=value_above_three)
    backend = PandasBackend(es, [cum_sum])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_sum.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, 0, 5, 5, 12, 26]
    for want, got in zip(expected, actual):
        if np.isnan(want):
            # A missing expectation must be matched by a missing result.
            assert (np.isnan(got))
        else:
            assert want == got
def test_haversine(es):
    """Check `Haversine` between the `latlong` and `latlong2` log variables.

    Each computed value must be within 1e-4 of the reference value.
    """
    log_latlong_feat = es['log']['latlong']
    log_latlong_feat2 = es['log']['latlong2']
    haversine = Haversine(log_latlong_feat, log_latlong_feat2)
    features = [haversine]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    values = df[haversine.get_name()].values
    real = [
        0., 524.15585776, 1043.00845747, 1551.12130243, 2042.79840241,
        0., 137.86000883, 275.59396684, 413.07563177,
        0.,
        0., 524.15585776,
        0., 739.93819145, 1464.27975511
    ]
    assert len(values) == 15
    for expected, actual in zip(real, values):
        # Compare the absolute difference: the signed form
        # `expected - actual < tol` would accept any result that is too large.
        assert abs(expected - actual) < .0001
def test_cum_sum_use_previous_and_where_absolute(es):
    """Check `CumSum` with a `where` filter and a 40-second `use_previous`."""
    log_value = es['log']['value']
    value_above_three = GreaterThan(log_value, 3)
    customer_of_log = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(log_value,
                     customer_of_log,
                     es["log"]["datetime"],
                     where=value_above_three,
                     use_previous=Timedelta(40, 'seconds'))
    backend = PandasBackend(es, [cum_sum])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    expected = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0, 0, 5, 0, 7, 21]
    actual = feature_matrix[cum_sum.get_name()].values
    assert len(actual) == 15
    for want, got in zip(expected, actual):
        assert want == got
def test_latlong(es):
    """Check the `Latitude` and `Longitude` primitives on the `latlong` variable."""
    latlong = es['log']['latlong']
    latitude = Latitude(latlong)
    longitude = Longitude(latlong)
    backend = PandasBackend(es, [latitude, longitude])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual_lats = feature_matrix[latitude.get_name()].values
    actual_lons = feature_matrix[longitude.get_name()].values
    assert len(actual_lats) == 15
    assert len(actual_lons) == 15
    expected_lats = [0, 5, 10, 15, 20, 0, 1, 2, 3, 0, 0, 5, 0, 7, 14]
    expected_lons = [0, 2, 4, 6, 8, 0, 1, 2, 3, 0, 0, 2, 0, 3, 6]
    for want, got in zip(expected_lats, actual_lats):
        assert want == got
    for want, got in zip(expected_lons, actual_lons):
        assert want == got
def test_compare_of_transform(es):
    """Check each comparison primitive applied to a `Day` feature and the constant 10."""
    day = Day(es['log']['datetime'])
    # (comparison primitive, expected result for instances 0 and 14)
    cases = [
        (Equals, [False, True]),
        (NotEquals, [True, False]),
        (LessThan, [True, False]),
        (LessThanEqualTo, [True, True]),
        (GreaterThan, [False, False]),
        (GreaterThanEqualTo, [False, True]),
    ]
    features = [primitive(day, 10) for primitive, _ in cases]
    backend = PandasBackend(es, features)
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 14],
                                                    time_last=None)
    for feature, (_, expected) in zip(features, cases):
        actual = feature_matrix[feature.get_name()].values.tolist()
        assert actual == expected
def test_override_boolean(es):
    """Check `OR`, `AND` and `~` on two comparisons of a session-level count."""
    count = Count(es['log']['value'], es['sessions'])
    above_one = GreaterThan(count, 1)
    below_ten = LessThan(count, 10)
    features = [
        above_one.OR(below_ten),
        above_one.AND(below_ten),
        ~(above_one.AND(below_ten)),
    ]
    # One expected column per feature, in the same order as `features`.
    expected_columns = [
        [True, True, True],
        [True, True, False],
        [False, False, True],
    ]
    backend = PandasBackend(es, features)
    feature_matrix = backend.calculate_all_features(instance_ids=[0, 1, 2],
                                                    time_last=None)
    for feature, expected in zip(features, expected_columns):
        actual = feature_matrix[feature.get_name()].values.tolist()
        assert actual == expected
def test_two_kinds_of_dependents(es):
    """Compute a transform, a percentile and a stacked aggregation that all
    depend on `where`-filtered sums, in a single backend run."""
    value = Feature(es['log']['value'])
    product = Feature(es['log']['product_id'])
    customer_sum = Sum(value, es['customers'], where=product == 'coke zero')
    percentile = Percentile(customer_sum)
    absolute = Absolute(customer_sum)
    session_sum = Sum(value, es['sessions'], where=product == 'coke zero')
    # Adding this feature in tests line 218 in pandas_backend
    # where we remove columns in result_frame that already exist
    # in the output entity_frames in preparation for pd.concat
    # In a prior version, this failed because we changed the result_frame
    # variable itself, rather than making a new variable _result_frame.
    # When len(output_frames) > 1, the second iteration won't have
    # all the necessary columns because they were removed in the first
    sum_of_session_sums = Sum(session_sum, es['customers'])
    backend = PandasBackend(es, [percentile, absolute, sum_of_session_sums])
    feature_matrix = backend.calculate_all_features([0, 1], None)
    assert feature_matrix[percentile.get_name()].tolist() == [0.5, 1.0]
    assert feature_matrix[absolute.get_name()].tolist() == [15, 26]
def test_arithmetic_of_agg(es):
    """Check arithmetic primitives combining two per-region `Count` aggregations."""
    count_customer = Count(es['customers']['id'], parent_entity=es['regions'])
    count_stores = Count(es['stores']['id'], parent_entity=es['regions'])
    # (arithmetic primitive, expected values for the two requested regions)
    cases = [
        (Add, [6, 2]),
        (Subtract, [0, -2]),
        (Multiply, [9, 0]),
        (Divide, [1, 0]),
    ]
    features = [primitive(count_customer, count_stores)
                for primitive, _ in cases]
    backend = PandasBackend(es, features)
    feature_matrix = backend.calculate_all_features(
        instance_ids=['United States', 'Mexico'], time_last=None)
    for feature, (_, expected) in zip(features, cases):
        actual = feature_matrix[feature.get_name()].values.tolist()
        assert actual == expected
def test_cum_sum_group_on_nan(es):
    """Check `CumSum` when the grouping column contains missing values.

    The `product_id` column of the `log` dataframe is overwritten in place
    before the feature is built.
    """
    log_value = es['log']['value']
    replacement_ids = (['coke zero'] * 3 + ['car'] * 2 +
                       ['toothpaste'] * 3 + ['brown bag'] * 2 +
                       ['shoes'] + [np.nan] * 4 + ['coke_zero'] * 2)
    es['log'].df['product_id'] = replacement_ids
    cum_sum = CumSum(log_value, es['log']['product_id'])
    backend = PandasBackend(es, [cum_sum])
    feature_matrix = backend.calculate_all_features(instance_ids=range(15),
                                                    time_last=None)
    actual = feature_matrix[cum_sum.get_name()].values
    assert len(actual) == 15
    expected = [
        0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0,
        np.nan, np.nan, np.nan, np.nan,
    ]
    for want, got in zip(expected, actual):
        if not np.isnan(want):
            assert want == got
        else:
            # A missing expectation must be matched by a missing result.
            assert (np.isnan(got))