def test_cum_sum_use_previous_group_on_nan(es):
    """CumSum grouped on a NaN-containing column with a 40-second
    use_previous window: rows whose group is NaN come back as NaN."""
    # TODO: Figure out how to test where `df`
    # in pd_rolling get_function() has multiindex
    value_feature = es['log']['value']
    es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 +
                                  ['toothpaste'] * 3 + ['brown bag'] * 2 +
                                  ['shoes'] + [np.nan] * 4 +
                                  ['coke_zero'] * 2)
    feature = CumSum(value_feature, es['log']['product_id'],
                     es["log"]["datetime"],
                     use_previous=Timedelta(40, 'seconds'))
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 15, 35, 0, 1, 3, 3, 0, 0,
                np.nan, np.nan, np.nan, np.nan]
    for got, want in zip(actual, expected):
        if np.isnan(want):
            assert np.isnan(got)
        else:
            assert got == want
def test_cum_sum_group_on_nan(es):
    """CumSum grouped on a column with NaN entries (no time window):
    rows belonging to a NaN group should produce NaN."""
    value_feature = es['log']['value']
    es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 +
                                  ['toothpaste'] * 3 + ['brown bag'] * 2 +
                                  ['shoes'] + [np.nan] * 4 +
                                  ['coke_zero'] * 2)
    feature = CumSum(value_feature, es['log']['product_id'])
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 15, 35, 0, 1, 3, 3, 3, 0,
                np.nan, np.nan, np.nan, np.nan]
    for got, want in zip(actual, expected):
        if np.isnan(want):
            assert np.isnan(got)
        else:
            assert got == want
def test_cum_sum_use_previous_group_on_nan(es):
    """CumSum over a NaN-containing group column with a 40s window.

    NOTE(review): this is an exact duplicate of an identically named test
    earlier in the file; pytest only collects the last definition --
    confirm whether one copy should be removed.
    """
    # TODO: Figure out how to test where `df`
    # in pd_rolling get_function() has multiindex
    base = es['log']['value']
    es['log'].df['product_id'] = (['coke zero'] * 3 + ['car'] * 2 +
                                  ['toothpaste'] * 3 + ['brown bag'] * 2 +
                                  ['shoes'] + [np.nan] * 4 +
                                  ['coke_zero'] * 2)
    cum_sum = CumSum(base, es['log']['product_id'], es["log"]["datetime"],
                     use_previous=Timedelta(40, 'seconds'))
    backend = PandasBackend(es, [cum_sum])
    frame = backend.calculate_all_features(instance_ids=range(15),
                                           time_last=None)
    computed = frame[cum_sum.get_name()].values
    assert len(computed) == 15
    expected = [
        0, 5, 15, 15, 35,
        0, 1, 3, 3, 0,
        0, np.nan, np.nan, np.nan, np.nan,
    ]
    for idx, want in enumerate(expected):
        if np.isnan(want):
            assert np.isnan(computed[idx])
        else:
            assert computed[idx] == want
def test_cum_sum(es):
    """Basic CumSum grouped by session_id over the first 15 log rows."""
    feature = CumSum(es['log']['value'], es['log']['session_id'])
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 50, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21]
    for got, want in zip(actual, expected):
        assert got == want
def test_cum_sum_where(es):
    """CumSum with a where clause: only rows with value > 3 contribute,
    grouped by the customer_id feature pulled down onto the log entity."""
    value_feature = es['log']['value']
    where_feature = GreaterThan(value_feature, 3)
    customer_feature = Feature(es['sessions']['customer_id'], es['log'])
    feature = CumSum(value_feature, customer_feature, where=where_feature)
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50, 0, 5, 5, 12, 26]
    # NaN branch kept for parity with the original, though `expected`
    # currently contains no NaNs.
    for got, want in zip(actual, expected):
        if np.isnan(want):
            assert np.isnan(got)
        else:
            assert got == want
def test_serialized_renamed_features(es):
    """A renamed feature keeps its new name(s) through a serialize /
    deserialize round trip, for every feature type."""

    def assert_rename_round_trips(feature):
        # Rename, verify names, round-trip through the serializer,
        # then verify the names again on the deserialized copy.
        new_name = 'MyFeature'
        original_names = feature.get_feature_names()
        if len(original_names) == 1:
            expected_names = [new_name]
        else:
            expected_names = [new_name + '[{}]'.format(i)
                              for i in range(len(original_names))]
        renamed = feature.rename(new_name)
        check_names(renamed, new_name, expected_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        check_names(round_tripped, new_name, expected_names)

    identity_original = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log'].ww['value'])
    agg_original = ft.AggregationFeature(value, 'customers',
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es['customers'].ww['age']), 'sessions')
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(es['log'].ww['product_id'],
                                      parent_dataframe_name='customers',
                                      primitive=NMostCommon(n=2))
    assert multioutput_original.get_name(
    ) == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name(
    ) == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    for feature in [identity_original, agg_original, direct_original,
                    transform_original, groupby_original,
                    multioutput_original, featureslice_original]:
        assert_rename_round_trips(feature)
def test_cum_sum_use_previous_and_where_absolute(es):
    """CumSum combining a where clause (value > 3) with a 40-second
    absolute use_previous window."""
    value_feature = es['log']['value']
    where_feature = GreaterThan(value_feature, 3)
    customer_feature = Feature(es['sessions']['customer_id'], es['log'])
    feature = CumSum(value_feature, customer_feature, es["log"]["datetime"],
                     where=where_feature,
                     use_previous=Timedelta(40, 'seconds'))
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    expected = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0, 0, 5, 0, 7, 21]
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    for got, want in zip(actual, expected):
        assert got == want
def test_cum_sum_use_previous_and_where(es):
    """CumSum combining a where clause with a 3-observation
    use_previous window."""
    value_feature = es['log']['value']
    where_feature = GreaterThan(value_feature, 3)
    # todo should this be cummean?
    customer_feature = Feature(es['sessions']['customer_id'], es['log'])
    feature = CumSum(value_feature, customer_feature,
                     where=where_feature,
                     use_previous=Timedelta(3, 'observations',
                                            entity=es['log']))
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    expected = [0, 5, 15, 30, 45, 45, 45, 45, 45, 45, 0, 5, 5, 12, 26]
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    for got, want in zip(actual, expected):
        assert got == want
def test_cum_sum_use_previous_integer_time(int_es):
    """With an integer time index, a real-time use_previous window is
    rejected with AssertionError, while an observation-count window
    computes per-session rolling sums normally."""
    es = int_es
    log_value_feat = es['log']['value']
    # A time-unit Timedelta cannot be applied against an integer time
    # index. NOTE(review): the original passed match='' to pytest.raises;
    # an empty pattern matches any message, so it asserted nothing about
    # the error text -- dropped the misleading no-op argument.
    with pytest.raises(AssertionError):
        CumSum(log_value_feat, es['log']['session_id'],
               use_previous=Timedelta(3, 'm'))
    # An observation-based window is valid regardless of time index type.
    cum_sum = CumSum(log_value_feat, es['log']['session_id'],
                     use_previous=Timedelta(3, 'observations',
                                            entity=es['log']))
    features = [cum_sum]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 15
    cum_sum_values = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21]
    for i, v in enumerate(cum_sum_values):
        assert v == cvalues[i]
def test_cum_sum_use_previous_integer_time(int_es):
    """Integer time index: a time-unit window must raise, an
    observation-count window must work.

    NOTE(review): this redefines an identically named test earlier in the
    file, so pytest only collects this one -- confirm which is intended.
    """
    es = int_es
    value_feature = es['log']['value']
    with pytest.raises(AssertionError):
        CumSum(value_feature, es['log']['session_id'],
               use_previous=Timedelta(3, 'm'))
    feature = CumSum(value_feature, es['log']['session_id'],
                     use_previous=Timedelta(3, 'observations',
                                            entity=es['log']))
    backend = PandasBackend(es, [feature])
    result = backend.calculate_all_features(instance_ids=range(15),
                                            time_last=None)
    actual = result[feature.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 45, 0, 1, 3, 6, 0, 0, 5, 0, 7, 21]
    for got, want in zip(actual, expected):
        assert got == want
def test_serialized_renamed_features(es):
    """A renamed feature keeps its new name through a serializer round
    trip, for each single-output feature type."""

    def assert_rename_round_trips(feature):
        renamed = feature.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'
        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        assert round_tripped.get_name() == 'MyFeature'

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])
    agg_original = ft.AggregationFeature(value, es['customers'],
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'],
                                       es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    for feature in [identity_original, agg_original, direct_original,
                    transform_original, groupby_original]:
        assert_rename_round_trips(feature)
def test_serialization(pd_es):
    """GroupByTransformFeature produces the expected argument dict and
    reconstructs an equal feature from it."""
    value = ft.IdentityFeature(pd_es["log"].ww["value"])
    zipcode = ft.IdentityFeature(pd_es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby = ft.feature_base.GroupByTransformFeature(value, primitive,
                                                      zipcode)

    expected = {
        "name": None,
        "base_features": [value.unique_name()],
        "primitive": serialize_primitive(primitive),
        "groupby": zipcode.unique_name(),
    }
    assert groupby.get_arguments() == expected

    dependencies = {feat.unique_name(): feat for feat in (value, zipcode)}
    rebuilt = ft.feature_base.GroupByTransformFeature.from_dictionary(
        expected, pd_es, dependencies, PrimitivesDeserializer())
    assert groupby == rebuilt
def test_serialization(es):
    """GroupByTransformFeature serializes to the expected argument dict
    and reconstructs an equal feature via from_dictionary."""
    value = ft.IdentityFeature(es['log']['value'])
    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    primitive = CumSum()
    groupby = ft.feature_base.GroupByTransformFeature(value, primitive,
                                                      zipcode)

    expected = {
        'base_features': [value.unique_name()],
        'primitive': serialize_primitive(primitive),
        'groupby': zipcode.unique_name(),
    }
    assert groupby.get_arguments() == expected

    dependencies = {feat.unique_name(): feat for feat in (value, zipcode)}
    rebuilt = ft.feature_base.GroupByTransformFeature.from_dictionary(
        expected, es, dependencies, PrimitivesDeserializer())
    assert groupby == rebuilt
def test_serialized_renamed_features(es):
    """A renamed feature keeps its new name(s) through a serialize /
    deserialize round trip, including multi-output and sliced features."""

    def assert_rename_round_trips(feature):
        # Multi-output features get indexed names: MyFeature[0], [1], ...
        new_name = "MyFeature"
        original_names = feature.get_feature_names()
        if len(original_names) == 1:
            expected_names = [new_name]
        else:
            expected_names = [
                new_name + "[{}]".format(i)
                for i in range(len(original_names))
            ]
        renamed = feature.rename(new_name)
        check_names(renamed, new_name, expected_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        round_tripped = FeaturesDeserializer(serialized).to_list()[0]
        check_names(round_tripped, new_name, expected_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])
    agg_original = ft.AggregationFeature(value, "customers",
                                         ft.primitives.Max())
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2)
    )
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0
    )
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    for feature in [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]:
        assert_rename_round_trips(feature)