def test_inplace_encodes_features(es):
    """Check that ``encode_features`` leaves the input matrix's shape alone unless ``inplace=True``."""
    product_feature = IdentityFeature(es["log"]["product_id"])
    feature_list = [product_feature]
    matrix = calculate_feature_matrix(
        feature_list, es, instance_ids=[0, 1, 2, 3, 4, 5]
    )
    original_shape = matrix.shape

    # Default call: the result has a different shape, the input keeps its shape.
    encoded_copy, _ = encode_features(matrix, feature_list)
    assert encoded_copy.shape != original_shape
    assert matrix.shape == original_shape

    # In-place call: the result and the input must have the same shape.
    encoded_inplace, _ = encode_features(matrix, feature_list, inplace=True)
    assert encoded_inplace.shape == matrix.shape
def test_to_encode_features(es):
    """Check that restricting ``to_encode`` changes the shape of the encoded matrix."""
    product_feature = IdentityFeature(es["log"]["product_id"])
    value_feature = IdentityFeature(es["log"]["value"])
    feature_list = [product_feature, value_feature]
    matrix = calculate_feature_matrix(
        feature_list, es, instance_ids=[0, 1, 2, 3, 4, 5]
    )

    # Baseline: encode with the default selection.
    fully_encoded, _ = encode_features(matrix, feature_list)
    baseline_shape = fully_encoded.shape

    # to_encode should keep product_id as a string, and not create
    # 3 additional columns -- both for an empty selection and for a
    # selection that names only 'value'.
    for selection in ([], ['value']):
        restricted, _ = encode_features(
            matrix, feature_list, to_encode=selection
        )
        assert baseline_shape != restricted.shape
def test_direct(es):
    """Verify the graph source generated for a DirectFeature stacked on a DirectFeature.

    Every expected name, edge string and table label must occur in the graph
    source, and each dataframe's table must contain exactly one row per
    expected entry.
    """
    inner = DirectFeature(
        IdentityFeature(es['customers'].ww['engagement_level']), 'sessions'
    )
    outer = DirectFeature(inner, 'log')
    graph_source = graph_feature(outer).source

    inner_name = inner.get_name()
    outer_name = outer.get_name()
    inner_join = '1_{}_join'.format(inner_name)
    outer_join = '0_{}_join'.format(outer_name)

    log_label = '\u2605 log (target)'
    sessions_label = 'sessions'
    customers_label = 'customers'

    expected_fragments = [
        inner_name,
        outer_name,
        inner_join,
        outer_join,
        log_label,
        sessions_label,
        customers_label,
        '"{}" -> sessions:customer_id'.format(inner_join),
        '"{}" -> log:session_id'.format(outer_join),
        'customers:engagement_level -> "{}"'.format(inner_join),
        'sessions:"{}" -> "{}"'.format(inner_name, outer_join),
        '"{}" -> sessions:"{}"'.format(inner_join, inner_name),
        '"{}" -> log:"{}"'.format(outer_join, outer_name),
    ]
    for fragment in expected_fragments:
        assert fragment in graph_source

    expected_tables = {
        'customers': [customers_label, 'engagement_level'],
        'sessions': [sessions_label, 'customer_id', inner_name],
        'log': [log_label, 'session_id', outer_name],
    }
    for table_name, expected_cells in expected_tables.items():
        table_pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(table_pattern, graph_source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(expected_cells)

        # Each row must contain one of the still-unclaimed expected entries;
        # a claimed entry is removed so it cannot satisfy a second row.
        for table_row in table_rows:
            claimed = next(
                (cell for cell in expected_cells if cell in table_row), None
            )
            assert claimed is not None
            expected_cells.remove(claimed)
def test_copy_features_does_not_copy_entityset(es):
    """Check that copying features less than doubles the measured size of the locals.

    Four Sum aggregations over ``log.value`` are built (plain, with a ``where``
    clause, with ``use_previous``, and with both). The size of ``locals()`` is
    measured with ``asizeof`` before and after copying every feature, and the
    second measurement must be less than twice the first.
    """
    agg = ft.Feature(es['log']['value'], parent_entity=es['sessions'], primitive=Sum)
    agg_where = ft.Feature(es['log']['value'], parent_entity=es['sessions'],
                           where=IdentityFeature(es['log']['value']) == 2, primitive=Sum)
    agg_use_previous = ft.Feature(es['log']['value'], parent_entity=es['sessions'],
                                  use_previous='4 days', primitive=Sum)
    agg_use_previous_where = ft.Feature(es['log']['value'], parent_entity=es['sessions'],
                                        where=IdentityFeature(es['log']['value']) == 2,
                                        use_previous='4 days', primitive=Sum)
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    # Baseline: size of everything reachable from the current locals.
    in_memory_size = asizeof(locals())
    # `copied` is never read directly; it has to stay a local so that the
    # second `asizeof(locals())` measurement includes the copies.
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    # NOTE(review): presumably this bound would fail if each copy carried its
    # own duplicate of the entityset -- confirm against Feature.copy().
    assert new_in_memory_size < 2 * in_memory_size
def test_aggregation(es):
    """Verify the graph source generated for a Count aggregation of ``log.id`` on ``sessions``.

    Every expected name, edge string and table label must occur in the graph
    source, and each dataframe's table must contain exactly one row per
    expected entry.
    """
    count_feature = AggregationFeature(
        IdentityFeature(es["log"].ww["id"]), "sessions", Count
    )
    graph_source = graph_feature(count_feature).source

    feature_name = count_feature.get_name()
    primitive_node = "0_{}_count".format(feature_name)
    groupby_node = "{}_groupby_log--session_id".format(feature_name)

    sessions_label = "\u2605 sessions (target)"
    log_label = "log"

    expected_fragments = [
        feature_name,
        primitive_node,
        groupby_node,
        sessions_label,
        log_label,
        'log:session_id -> "{}"'.format(groupby_node),
        'log:id -> "{}"'.format(groupby_node),
        '"{}" -> "{}"'.format(groupby_node, primitive_node),
        '"{}" -> sessions:"{}"'.format(primitive_node, feature_name),
    ]
    for fragment in expected_fragments:
        assert fragment in graph_source

    expected_tables = {
        "log": [log_label, "id", "session_id"],
        "sessions": [sessions_label, feature_name],
    }
    for table_name, expected_cells in expected_tables.items():
        table_pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(table_pattern, graph_source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(expected_cells)

        # Each row must contain one of the still-unclaimed expected entries;
        # a claimed entry is removed so it cannot satisfy a second row.
        for table_row in table_rows:
            claimed = next(
                (cell for cell in expected_cells if cell in table_row), None
            )
            assert claimed is not None
            expected_cells.remove(claimed)
def test_make_agg_feat_where_count(es):
    """Count log rows per session where ``product_id == 'coke zero'``; instance 0 must give 3."""
    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    coke_zero_count = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([coke_zero_count])
    )
    instance_ids = np.array([0])
    result = to_pandas(calculator.run(instance_ids))

    assert result[coke_zero_count.get_name()][0] == 3
def test_make_agg_feat_where_count(es):
    """Count log rows per session where ``product_id == "coke zero"``; instance 0 must give 3."""
    is_coke_zero = IdentityFeature(es["log"].ww["product_id"]) == "coke zero"
    coke_zero_count = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        where=is_coke_zero,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([coke_zero_count])
    )
    instance_ids = np.array([0])
    result = to_pandas(calculator.run(instance_ids))

    assert result[coke_zero_count.get_name()][0] == 3
def test_to_dictionary_direct(es):
    """Check the dictionary produced by ``to_dictionary`` for a direct feature from sessions to log."""
    customer_id = IdentityFeature(es["sessions"].ww["customer_id"])
    direct_feature = ft.Feature(customer_id, "log")
    actual = direct_feature.to_dictionary()

    relationship = {
        'parent_dataframe_name': 'sessions',
        'child_dataframe_name': 'log',
        'parent_column_name': 'id',
        'child_column_name': 'session_id',
    }
    arguments = {
        'name': None,
        'base_feature': 'sessions: customer_id',
        'relationship': relationship,
    }
    expected = {
        'type': 'DirectFeature',
        'dependencies': ['sessions: customer_id'],
        'arguments': arguments,
    }

    assert expected == actual
def test_direct_of_multi_output_transform_feat(es):
    """Compare a six-output custom datetime primitive with the single-output built-ins.

    Both are applied to ``customers.signup_date`` and read through direct
    features on ``sessions``; each output column of the custom primitive must
    equal the column of the corresponding Year/Month/Day/Hour/Minute/Second
    feature.
    """
    # TODO: Update to work with Dask and Spark
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Custom primitive is not compatible with Dask or Spark")

    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                units = ["year", "month", "day", "hour", "minute", "second"]
                # One output series per datetime component, in `units` order.
                return [
                    times.apply(lambda stamp: getattr(stamp, unit))
                    for unit in units
                ]

            return test_f

    component_primitives = [Year, Month, Day, Hour, Minute, Second]

    signup_date = IdentityFeature(es["customers"].ww["signup_date"])
    multi_output_feature = Feature(signup_date, primitive=TestTime)
    single_output_features = [
        Feature(signup_date, primitive=primitive)
        for primitive in component_primitives
    ]

    feature_matrix, _ = dfs(
        entityset=es,
        target_dataframe_name="sessions",
        trans_primitives=[TestTime] + component_primitives,
    )

    # Column names for the multi-output feature and for the single-output features.
    multi_names = DirectFeature(multi_output_feature, "sessions").get_feature_names()
    single_names = [
        DirectFeature(feature, "sessions").get_name()
        for feature in single_output_features
    ]

    # Values must agree pairwise between the two sets of columns.
    for multi_col, single_col in zip(multi_names, single_names):
        assert (feature_matrix[multi_col] == feature_matrix[single_col]).all()
def test_multioutput(es):
    """Verify the graph source generated for output slice 0 of an NMostCommon aggregation.

    Every expected name, edge string and table label must occur in the graph
    source, and each dataframe's table must contain exactly one row per
    expected entry.
    """
    most_common = AggregationFeature(
        IdentityFeature(es['log'].ww['zipcode']), 'sessions', NMostCommon
    )
    first_output = FeatureOutputSlice(most_common, 0)
    graph_source = graph_feature(first_output).source

    slice_name = first_output.get_name()
    # Node names are derived from the multi-output feature, not from the slice.
    primitive_node = '0_{}_n_most_common'.format(most_common.get_name())
    groupby_node = '{}_groupby_log--session_id'.format(most_common.get_name())

    sessions_label = '\u2605 sessions (target)'
    log_label = 'log'

    expected_fragments = [
        slice_name,
        primitive_node,
        groupby_node,
        sessions_label,
        log_label,
        'log:session_id -> "{}"'.format(groupby_node),
        'log:zipcode -> "{}"'.format(groupby_node),
        '"{}" -> "{}"'.format(groupby_node, primitive_node),
        '"{}" -> sessions:"{}"'.format(primitive_node, slice_name),
    ]
    for fragment in expected_fragments:
        assert fragment in graph_source

    expected_tables = {
        'log': [log_label, 'zipcode', 'session_id'],
        'sessions': [sessions_label, slice_name],
    }
    for table_name, expected_cells in expected_tables.items():
        table_pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(table_pattern, graph_source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(expected_cells)

        # Each row must contain one of the still-unclaimed expected entries;
        # a claimed entry is removed so it cannot satisfy a second row.
        for table_row in table_rows:
            claimed = next(
                (cell for cell in expected_cells if cell in table_row), None
            )
            assert claimed is not None
            expected_cells.remove(claimed)
def test_to_dictionary_direct(es):
    """Check the dictionary produced by ``to_dictionary`` for a direct feature from sessions to log."""
    customer_id = IdentityFeature(es["sessions"].ww["customer_id"])
    direct_feature = ft.Feature(customer_id, "log")
    actual = direct_feature.to_dictionary()

    relationship = {
        "parent_dataframe_name": "sessions",
        "child_dataframe_name": "log",
        "parent_column_name": "id",
        "child_column_name": "session_id",
    }
    arguments = {
        "name": "sessions.customer_id",
        "base_feature": "sessions: customer_id",
        "relationship": relationship,
    }
    expected = {
        "type": "DirectFeature",
        "dependencies": ["sessions: customer_id"],
        "arguments": arguments,
    }

    assert expected == actual
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    """Compute a Count and a Mode aggregation sharing one ``where`` clause in a single backend run."""
    is_coke_zero = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    count_feature = ft.Feature(
        entityset['log']['id'],
        parent_entity=entityset['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )
    mode_feature = ft.Feature(
        entityset['log']['product_id'],
        parent_entity=entityset['sessions'],
        where=is_coke_zero,
        primitive=Mode,
    )

    calculated = backend([count_feature, mode_feature]).calculate_all_features(
        instance_ids=[0], time_last=None
    )

    count_value = calculated[count_feature.get_name()][0]
    mode_value = calculated[mode_feature.get_name()][0]
    assert count_value == 3
    assert mode_value == 'coke zero'
def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1 OR the session's device_type equals 1
    """
    logs_per_session = ft.Feature(
        entityset['log']['id'],
        parent_entity=entityset['sessions'],
        primitive=Count,
    )

    has_multiple_logs = logs_per_session > 1
    is_device_type_one = IdentityFeature(entityset['sessions']['device_type']) == 1
    either_condition = has_multiple_logs.OR(is_device_type_one)

    session_count = ft.Feature(
        entityset['sessions']['id'],
        parent_entity=entityset['customers'],
        where=either_condition,
        primitive=Count,
    )

    calculated = backend([session_count]).calculate_all_features(
        instance_ids=[0], time_last=None
    )
    counts = calculated[session_count.get_name()]
    assert counts[0] == 3
def _add_identity_features(self, all_features, entity):
    """Register an identity feature for every variable of the given entity.

    After the identity features, any seed feature whose entity id matches
    ``entity.id`` is registered as well.

    Args:
        all_features (dict[Entity.id -> dict[str -> BaseFeature]]):
            Dict containing a dict for each entity. Each nested dict
            has features as values with their ids as keys.
        entity (Entity): Entity to calculate features for.
    """
    for variable in entity.variables:
        self._handle_new_feature(
            all_features=all_features,
            new_feature=IdentityFeature(variable=variable),
        )

    # add seed features, if any, for dfs to build on top of
    # if there are any multi output features, this will build on
    # top of each output of the feature.
    seeds_for_entity = (
        seed for seed in self.seed_features if seed.entity.id == entity.id
    )
    for seed in seeds_for_entity:
        self._handle_new_feature(all_features=all_features, new_feature=seed)
def test_make_agg_feat_multiple_dtypes(es):
    """Compute a Count and a Mode aggregation sharing one ``where`` clause in a single calculator run."""
    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    count_feature = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )
    mode_feature = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Mode,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feature, mode_feature]),
    )
    calculated = calculator.run(np.array([0]))

    count_value = calculated[count_feature.get_name()][0]
    mode_value = calculated[mode_feature.get_name()][0]
    assert count_value == 3
    assert mode_value == 'coke zero'
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_ids = IdentityFeature(entityset['stores']['id'])
    stores_per_region = ft.Feature(
        store_ids,
        parent_entity=entityset[u'régions'],
        primitive=Count,
    )
    customer_store_count = DirectFeature(
        stores_per_region, child_entity=entityset['customers']
    )

    calculated = backend([customer_store_count]).calculate_all_features(
        instance_ids=[0], time_last=None
    )
    assert calculated[customer_store_count.get_name()][0] == 3
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1 OR the session's device_type equals 1
    """
    logs_per_session = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )

    has_multiple_logs = logs_per_session > 1
    is_device_type_one = IdentityFeature(es['sessions']['device_type']) == 1
    either_condition = has_multiple_logs.OR(is_device_type_one)

    session_count = ft.Feature(
        es['sessions']['id'],
        parent_entity=es['customers'],
        where=either_condition,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es, time_last=None, feature_set=FeatureSet([session_count])
    )
    calculated = calculator.run(np.array([0]))
    counts = calculated[session_count.get_name()]
    assert counts[0] == 3
def _add_identity_features(self, all_features, dataframe):
    """Register an identity feature for every usable column of the given dataframe.

    Columns listed in ``self.ignore_columns`` for this dataframe, and the
    column named ``LTI_COLUMN_NAME``, are skipped. Afterwards any seed feature
    whose ``dataframe_name`` matches this dataframe is registered as well.

    Args:
        all_features (dict[dataframe name -> dict[str -> BaseFeature]]):
            Dict containing a dict for each dataframe. Each nested dict
            has features as values with their ids as keys.
        dataframe (DataFrame): DataFrame to calculate features for.
    """
    for column_name in dataframe.columns:
        # Ignore-list membership is tested first, as a short-circuit,
        # before the comparison against LTI_COLUMN_NAME.
        if (
            column_name not in self.ignore_columns[dataframe.ww.name]
            and not column_name == LTI_COLUMN_NAME
        ):
            identity = IdentityFeature(
                self.es[dataframe.ww.name].ww[column_name]
            )
            self._handle_new_feature(
                all_features=all_features, new_feature=identity
            )

    # add seed features, if any, for dfs to build on top of
    # if there are any multi output features, this will build on
    # top of each output of the feature.
    seeds_for_dataframe = (
        seed
        for seed in self.seed_features
        if seed.dataframe_name == dataframe.ww.name
    )
    for seed in seeds_for_dataframe:
        self._handle_new_feature(all_features=all_features, new_feature=seed)
def test_encode_features_handles_dictionary_input(es):
    """Exercise ``encode_features`` with ``top_n`` supplied as a dict keyed by feature name.

    Covers the default call, an empty dict, a dict limiting two of the three
    features, and the same dict combined with ``include_unknown=False``.
    """
    product_feat = IdentityFeature(es["log"]["product_id"])
    purchased_feat = IdentityFeature(es["log"]["purchased"])
    session_feat = IdentityFeature(es["log"]["session_id"])
    features = [product_feat, purchased_feat, session_feat]
    feature_matrix = calculate_feature_matrix(features, es, instance_ids=range(16))

    def check_encoding(encoded_matrix, encoded_features, expected_count, expected_columns):
        # The feature count is checked first, then the presence of every expected column.
        assert len(encoded_features) == expected_count
        for column in expected_columns:
            assert column in list(encoded_matrix.columns)

    all_columns = [
        'product_id = coke zero',
        'product_id = toothpaste',
        'product_id = car',
        'product_id = brown bag',
        'product_id = taco clock',
        'product_id = Haribo sugar-free gummy bears',
        'product_id is unknown',
        'purchased',
        'session_id = 0',
        'session_id = 1',
        'session_id = 4',
        'session_id = 3',
        'session_id = 5',
        'session_id = 2',
        'session_id is unknown',
    ]

    # No top_n at all.
    encoded_matrix, encoded_features = encode_features(feature_matrix, features)
    check_encoding(encoded_matrix, encoded_features, 15, all_columns)

    # An empty dict must give the same result as no top_n.
    encoded_matrix, encoded_features = encode_features(
        feature_matrix, features, top_n={}
    )
    check_encoding(encoded_matrix, encoded_features, 15, all_columns)

    # Per-feature limits for product_id and session_id.
    limits = {product_feat.get_name(): 4, session_feat.get_name(): 3}
    encoded_matrix, encoded_features = encode_features(
        feature_matrix, features, top_n=limits
    )
    limited_columns = [
        'product_id = coke zero',
        'product_id = toothpaste',
        'product_id = car',
        'product_id = brown bag',
        'product_id is unknown',
        'purchased',
        'session_id = 0',
        'session_id = 1',
        'session_id = 4',
        'session_id is unknown',
    ]
    check_encoding(encoded_matrix, encoded_features, 10, limited_columns)

    # Same limits, without the "is unknown" columns.
    encoded_matrix, encoded_features = encode_features(
        feature_matrix, features, top_n=limits, include_unknown=False
    )
    limited_known_columns = [
        'product_id = coke zero',
        'product_id = toothpaste',
        'product_id = car',
        'product_id = brown bag',
        'purchased',
        'session_id = 0',
        'session_id = 1',
        'session_id = 4',
    ]
    check_encoding(encoded_matrix, encoded_features, 8, limited_known_columns)
def test_variable_description(es):
    """A variable's ``description`` must be used, capitalised and with a trailing period."""
    text = 'the name of the device used for each session'
    es['sessions']['device_name'].description = text

    device_name_feature = IdentityFeature(es['sessions']['device_name'])

    # Upper-case only the first character; the rest of the text is untouched.
    expected = '{}{}.'.format(text[0].upper(), text[1:])
    assert describe_feature(device_name_feature) == expected
def test_return_type_inference_direct_feature(es):
    """A direct feature of a Mode aggregation must keep the base column's schema."""
    priority_mode = ft.Feature(
        es["log"].ww["priority_level"],
        parent_dataframe_name="customers",
        primitive=Mode,
    )
    priority_mode_on_sessions = ft.Feature(priority_mode, "sessions")

    inferred_schema = priority_mode_on_sessions.column_schema
    base_schema = IdentityFeature(es["log"].ww["priority_level"]).column_schema
    assert inferred_schema == base_schema
def test_aggregation_description_use_previous(es):
    """Check the description of a Mean aggregation built with ``use_previous='5d'``."""
    value = IdentityFeature(es['log'].ww['value'])
    windowed_mean = AggregationFeature(value, 'sessions', Mean, use_previous='5d')

    expected = (
        'The average of the "value" of the previous 5 days of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(windowed_mean) == expected
def test_groupby_transform_description(es):
    """Check the description of a CumMean of ``value`` grouped by ``session_id``."""
    value = IdentityFeature(es['log'].ww['value'])
    session_id = IdentityFeature(es['log'].ww['session_id'])
    grouped_cum_mean = GroupByTransformFeature(value, CumMean, session_id)

    expected = 'The cumulative mean of the "value" for each "session_id".'
    assert describe_feature(grouped_cum_mean) == expected
def test_transform_description(es):
    """Check the description of an Absolute transform of ``log.value``."""
    value = IdentityFeature(es['log'].ww['value'])
    absolute_value = TransformFeature(value, Absolute)

    expected = 'The absolute value of the "value".'
    assert describe_feature(absolute_value) == expected
def test_direct_with_single_possible_path(es):
    """Check the path name and feature name of a direct feature from customers to sessions."""
    age = IdentityFeature(es['customers'].ww['age'])
    age_on_sessions = DirectFeature(age, 'sessions')

    assert age_on_sessions.relationship_path_name() == 'customers'
    assert age_on_sessions.get_name() == 'customers.age'
def test_multioutput_description(es):
    """Check descriptions of multi-output features and of their individual output slices.

    Covers NMostCommon, a custom four-output primitive with no description
    template assigned, the same primitive with a two-entry template that uses
    ``{nth_slice}``, and finally a three-entry template, for which describing
    slice index 2 must raise ``IndexError``.
    """
    top_two = NMostCommon(2)
    top_two_feature = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", top_two
    )
    first_slice = top_two_feature[0]
    second_slice = top_two_feature[1]

    assert describe_feature(top_two_feature) == (
        'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    )
    assert describe_feature(first_slice) == (
        'The most common value of the "zipcode" of all instances of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(second_slice) == (
        'The 2nd most common value of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".'
    )

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={"category"})]
        return_type = ColumnSchema(semantic_tags={"category"})
        number_output_features = 4

    custom_feat = TransformFeature(
        IdentityFeature(es["log"].ww["zipcode"]), CustomMultiOutput
    )

    # No template assigned on the primitive.
    assert describe_feature(custom_feat) == (
        'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    )
    generic_slice_descriptions = [
        'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".',
        'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".',
    ]
    for position, expected in enumerate(generic_slice_descriptions):
        assert describe_feature(custom_feat[position]) == expected

    # Two-entry template: one entry for the feature, one shared by all slices.
    CustomMultiOutput.description_template = [
        "the multioutput of {}",
        "the {nth_slice} multioutput part of {}",
    ]
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    nth_slice_descriptions = [
        'The 1st multioutput part of the "zipcode".',
        'The 2nd multioutput part of the "zipcode".',
        'The 3rd multioutput part of the "zipcode".',
        'The 4th multioutput part of the "zipcode".',
    ]
    for position, expected in enumerate(nth_slice_descriptions):
        assert describe_feature(custom_feat[position]) == expected

    # Three-entry template: explicit text for slices 0 and 1 only.
    CustomMultiOutput.description_template = [
        "the multioutput of {}",
        "the primary multioutput part of {}",
        "the secondary multioutput part of {}",
    ]
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    explicit_slice_descriptions = [
        'The primary multioutput part of the "zipcode".',
        'The secondary multioutput part of the "zipcode".',
    ]
    for position, expected in enumerate(explicit_slice_descriptions):
        assert describe_feature(custom_feat[position]) == expected

    # Slice 2 has no template entry, so describing it must fail.
    with pytest.raises(IndexError, match="Slice out of range of template"):
        describe_feature(custom_feat[2])
def simple_feat(es):
    """Return an IdentityFeature built from ``es['log']['id']``."""
    log_id = es['log']['id']
    return IdentityFeature(log_id)
def test_column_description(es):
    """A column's ``description`` must be used, capitalised and with a trailing period."""
    text = 'the name of the device used for each session'
    es['sessions'].ww.columns['device_name'].description = text

    device_name_feature = IdentityFeature(es['sessions'].ww['device_name'])

    # Upper-case only the first character; the rest of the text is untouched.
    expected = '{}{}.'.format(text[0].upper(), text[1:])
    assert describe_feature(device_name_feature) == expected
def test_metadata(es, tmpdir):
    """Check ``describe_feature`` with custom descriptions and templates.

    The overrides are first passed directly (``feature_descriptions`` and
    ``primitive_templates``), then written to a JSON file and passed through
    ``metadata_file``; both routes must give the same descriptions.
    """
    column_descriptions = {
        "sessions: device_name": "the name of the device used for each session",
        "customers: id": "the customer's id",
    }

    # Aggregation described using the custom column descriptions.
    unique_devices = AggregationFeature(
        IdentityFeature(es["sessions"].ww["device_name"]), "customers", NumUnique
    )
    unique_devices_text = (
        "The number of unique elements in the name of the device used for each "
        'session of all instances of "sessions" for each customer\'s id.'
    )
    described = describe_feature(
        unique_devices, feature_descriptions=column_descriptions
    )
    assert described == unique_devices_text

    # Group-by transform described using a custom primitive template.
    running_mean = GroupByTransformFeature(
        IdentityFeature(es["log"].ww["value"]),
        CumMean,
        IdentityFeature(es["log"].ww["session_id"]),
    )
    running_mean_text = 'The running average of the "value" for each "session_id".'
    templates = {"cum_mean": "the running average of {}"}
    described = describe_feature(running_mean, primitive_templates=templates)
    assert described == running_mean_text

    # Aggregation whose whole description is replaced by a custom one.
    zipcode_mode = AggregationFeature(
        IdentityFeature(es["log"].ww["zipcode"]), "sessions", Mode
    )
    default_mode_text = 'The most frequently occurring value of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    override_text = "the most frequently used zipcode"
    overridden_mode_text = "{}{}.".format(override_text[0].upper(), override_text[1:])
    feature_overrides = {"sessions: MODE(log.zipcode)": override_text}

    assert describe_feature(zipcode_mode) == default_mode_text
    described = describe_feature(zipcode_mode, feature_descriptions=feature_overrides)
    assert described == overridden_mode_text

    # Same overrides, this time loaded from a metadata file.
    all_descriptions = dict(column_descriptions)
    all_descriptions.update(feature_overrides)
    metadata = {
        "feature_descriptions": all_descriptions,
        "primitive_templates": templates,
    }
    metadata_path = os.path.join(tmpdir, "description_metadata.json")
    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

    expected_from_file = [
        (unique_devices, unique_devices_text),
        (running_mean, running_mean_text),
        (zipcode_mode, overridden_mode_text),
    ]
    for feature, expected in expected_from_file:
        assert describe_feature(feature, metadata_file=metadata_path) == expected
def test_identity_description(es):
    """Check the description of an identity feature on ``log.session_id``."""
    session_id = IdentityFeature(es['log'].ww['session_id'])

    expected = 'The "session_id".'
    assert describe_feature(session_id) == expected