def test_generic_description(es):
    """Primitives without a description template fall back to the generic
    'The result of applying <NAME> ...' description; a primitive with no
    ``name`` uses its ``generate_name`` output instead.
    """
    class NoName(TransformPrimitive):
        input_types = [ColumnSchema(semantic_tags={'category'})]
        # FIX: was `output_type` — featuretools primitives declare their
        # output schema via `return_type` (see CustomMultiOutput in this
        # module), so the previous attribute was silently ignored.
        return_type = ColumnSchema(semantic_tags={'category'})

        def generate_name(self, base_feature_names):
            # No `name` attribute set: produce an explicit placeholder name
            return u"%s(%s%s)" % (
                'NO_NAME',
                u", ".join(base_feature_names),
                self.get_args_string(),
            )

    class CustomAgg(AggregationPrimitive):
        name = 'custom_aggregation'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})

    class CustomTrans(TransformPrimitive):
        name = 'custom_transform'
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})

    no_name = TransformFeature(IdentityFeature(es['log'].ww['zipcode']), NoName)
    no_name_description = 'The result of applying NoName to the "zipcode".'
    assert describe_feature(no_name) == no_name_description

    custom_agg = AggregationFeature(IdentityFeature(es['log'].ww['zipcode']),
                                    'customers', CustomAgg)
    custom_agg_description = ('The result of applying CUSTOM_AGGREGATION to the '
                              '"zipcode" of all instances of "log" for each "id" '
                              'in "customers".')
    assert describe_feature(custom_agg) == custom_agg_description

    custom_trans = TransformFeature(IdentityFeature(es['log'].ww['zipcode']),
                                    CustomTrans)
    custom_trans_description = 'The result of applying CUSTOM_TRANSFORM to the "zipcode".'
    assert describe_feature(custom_trans) == custom_trans_description
def test_aggregation_description_where(es):
    """An aggregation with a `where` clause mentions the filter condition."""
    us_only = TransformFeature(IdentityFeature(es['log'].ww['countrycode']),
                               EqualScalar('US'))
    mean_value = AggregationFeature(IdentityFeature(es['log'].ww['value']),
                                    'sessions', Mean, where=us_only)
    expected = ('The average of the "value" of all instances of "log" where the '
                '"countrycode" is US for each "id" in "sessions".')
    assert describe_feature(mean_value) == expected
def test_multioutput_description(es):
    """Multi-output features and their slices get sensible descriptions,
    both from the generic fallback and from custom description templates."""
    zipcode = IdentityFeature(es['log'].ww['zipcode'])
    agg_feat = AggregationFeature(zipcode, 'sessions', NMostCommon(2))

    # Built-in NMostCommon descriptions: whole feature, then each slice
    for feat, expected in [
        (agg_feat,
         'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'),
        (agg_feat[0],
         'The most common value of the "zipcode" of all instances of "log" for each "id" in "sessions".'),
        (agg_feat[1],
         'The 2nd most common value of the "zipcode" of all instances of "log" for each "id" in "sessions".'),
    ]:
        assert describe_feature(feat) == expected

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})
        number_output_features = 4

    custom_feat = TransformFeature(zipcode, CustomMultiOutput)

    # No template: generic "result of applying" / "Nth output" wording
    assert describe_feature(custom_feat) == 'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    assert describe_feature(custom_feat[0]) == 'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    assert describe_feature(custom_feat[1]) == 'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'

    # Two-entry template: second entry serves every slice via {nth_slice}
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the {nth_slice} multioutput part of {}']
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    slice_expectations = ['The 1st multioutput part of the "zipcode".',
                          'The 2nd multioutput part of the "zipcode".',
                          'The 3rd multioutput part of the "zipcode".',
                          'The 4th multioutput part of the "zipcode".']
    for idx, expected in enumerate(slice_expectations):
        assert describe_feature(custom_feat[idx]) == expected

    # Longer template: each listed slice has its own text; slices past the
    # end of the template raise IndexError
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the primary multioutput part of {}',
                                              'the secondary multioutput part of {}']
    assert describe_feature(custom_feat) == 'The multioutput of the "zipcode".'
    assert describe_feature(custom_feat[0]) == 'The primary multioutput part of the "zipcode".'
    assert describe_feature(custom_feat[1]) == 'The secondary multioutput part of the "zipcode".'
    with pytest.raises(IndexError, match='Slice out of range of template'):
        describe_feature(custom_feat[2])
def test_aggregation_description_where(es):
    """Describing a `where`-filtered aggregation includes the condition."""
    condition = TransformFeature(
        IdentityFeature(es["log"].ww["countrycode"]),
        EqualScalar("US"),
    )
    aggregated = AggregationFeature(
        IdentityFeature(es["log"].ww["value"]),
        "sessions",
        Mean,
        where=condition,
    )
    expected_text = 'The average of the "value" of all instances of "log" where the ' \
                    '"countrycode" is US for each "id" in "sessions".'
    assert describe_feature(aggregated) == expected_text
def test_stacked(es):
    """The graph of an aggregation stacked on a transform contains both
    primitive nodes, the groupby node, every connecting edge, and the
    correct 'Step N' labels on each primitive node."""
    year_feat = TransformFeature(es['customers']['cancel_date'], Year)
    stacked = AggregationFeature(year_feat, es['cohorts'], Mode)
    dot = graph_feature(stacked).source

    stacked_name = stacked.get_name()
    year_name = year_feat.get_name()
    agg_node = '0_{}_mode'.format(stacked_name)
    trans_node = '1_{}_year'.format(year_name)
    groupby_node = '{}_groupby_customers--cohort'.format(stacked_name)

    expected_parts = [
        stacked_name,
        year_name,
        agg_node,
        trans_node,
        groupby_node,
        'customers:cancel_date -> "{}"'.format(trans_node),
        '"{}" -> customers:"{}"'.format(trans_node, year_name),
        'customers:cohort -> "{}"'.format(groupby_node),
        'customers:"{}" -> "{}"'.format(year_name, groupby_node),
        '"{}" -> "{}"'.format(groupby_node, agg_node),
        '"{}" -> cohorts:"{}"'.format(agg_node, stacked_name),
    ]
    for part in expected_parts:
        assert part in dot

    # Parentheses in node names must be escaped before regex matching
    escaped_agg = agg_node.replace('(', '\\(').replace(')', '\\)')
    agg_labels = re.findall('"{}" \\[label.*'.format(escaped_agg), dot)
    assert len(agg_labels) == 1
    assert 'Step 2' in agg_labels[0]

    escaped_trans = trans_node.replace('(', '\\(').replace(')', '\\)')
    trans_labels = re.findall('"{}" \\[label.*'.format(escaped_trans), dot)
    assert len(trans_labels) == 1
    assert 'Step 1' in trans_labels[0]
def test_transform(es):
    """The graph of a plain transform feature shows the target table, the
    primitive node, both edges, and a three-row table for `customers`."""
    feat = TransformFeature(es['customers']['cancel_date'], Year)
    dot = graph_feature(feat).source
    output_name = feat.get_name()
    prim_node = '0_{}_year'.format(output_name)

    for part in [
        output_name,
        '\u2605 customers (target)',
        prim_node,
        'customers:cancel_date -> "{}"'.format(prim_node),
        '"{}" -> customers:"{}"'.format(prim_node, output_name),
    ]:
        assert part in dot

    tables = re.findall(r"customers \[label=<\n<TABLE.*?</TABLE>>", dot, re.DOTALL)
    assert len(tables) == 1
    rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
    assert len(rows) == 3
    # Rows appear in order: table title, input column, output feature
    for expected, row in zip(['customers', 'cancel_date', output_name], rows):
        assert expected in row
def _build_transform_features(self, all_features, entity, max_depth=0):
    """Build transform features for every variable in an entity.

    Args:
        all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
            Dict containing a dict for each entity. Each nested dict
            has features as values with their ids as keys

        entity (Entity): Entity to calculate features for.

        max_depth (int, optional): Remaining stacking depth; a negative
            value builds nothing. ``None`` means unlimited depth.
    """
    if max_depth is not None and max_depth < 0:
        return

    new_max_depth = max_depth - 1 if max_depth is not None else None
    self._add_identity_features(all_features, entity)
    for primitive in self.trans_primitives:
        # When a primitive accepts multiple signatures, DFS only
        # considers the first one
        accepted_types = primitive.input_types
        if type(accepted_types[0]) == list:
            accepted_types = accepted_types[0]
        candidate_features = self._features_by_type(
            all_features=all_features,
            entity=entity,
            variable_type=set(accepted_types),
            max_depth=new_max_depth)
        combinations = match(accepted_types, candidate_features,
                             commutative=primitive.commutative)
        for combination in combinations:
            # Multi-output features are never used as transform inputs
            if any(bf.number_output_features != 1 for bf in combination):
                continue
            feature = TransformFeature(combination, primitive=primitive)
            self._handle_new_feature(all_features=all_features,
                                     new_feature=feature)
def _build_transform_features(self, all_features, entity, max_depth=0,
                              require_direct_input=False):
    """Creates trans_features for all the variables in an entity

    Args:
        all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
            Dict containing a dict for each entity. Each nested dict
            has features as values with their ids as keys

        entity (Entity): Entity to calculate features for.

        max_depth (int, optional): Remaining stacking depth; inputs are
            searched at ``max_depth - 1``. ``None`` means unlimited.

        require_direct_input (bool): When True, only build groupby features
            whose inputs or groupby include a DirectFeature, and skip
            combinations that are all direct features with the same
            relationship path.
    """
    new_max_depth = None
    if max_depth is not None:
        new_max_depth = max_depth - 1
    for trans_prim in self.trans_primitives:
        # primitive_options may exclude this entity for this primitive
        current_options = self.primitive_options[trans_prim.name]
        if ignore_entity_for_primitive(current_options, entity):
            continue
        # if multiple input_types, only use first one for DFS
        input_types = trans_prim.input_types
        if type(input_types[0]) == list:
            input_types = input_types[0]
        matching_inputs = self._get_matching_inputs(
            all_features, entity, new_max_depth, input_types, trans_prim,
            current_options, require_direct_input=require_direct_input)
        for matching_input in matching_inputs:
            # multi-output features are never stacked on here
            if all(bf.number_output_features == 1 for bf in matching_input):
                new_f = TransformFeature(matching_input, primitive=trans_prim)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=new_f)
    for groupby_prim in self.groupby_trans_primitives:
        current_options = self.primitive_options[groupby_prim.name]
        if ignore_entity_for_primitive(current_options, entity, groupby=True):
            continue
        # work on a copy of the primitive's declared input types
        input_types = groupby_prim.input_types[:]
        # if multiple input_types, only use first one for DFS
        if type(input_types[0]) == list:
            input_types = input_types[0]
        matching_inputs = self._get_matching_inputs(
            all_features, entity, new_max_depth, input_types, groupby_prim,
            current_options)
        # get columns to use as groupbys, use IDs as default unless other groupbys specified
        if any(['include_groupby_variables' in option and
                entity.id in option['include_groupby_variables']
                for option in current_options]):
            # user listed explicit groupby variables: consider every type and
            # let filter_groupby_matches_by_options narrow the candidates
            default_type = variable_types.PandasTypes._all
        else:
            default_type = set([Id])
        groupby_matches = self._features_by_type(all_features=all_features,
                                                 entity=entity,
                                                 max_depth=new_max_depth,
                                                 variable_type=default_type)
        groupby_matches = filter_groupby_matches_by_options(groupby_matches,
                                                            current_options)
        # If require_direct_input, require a DirectFeature in input or as a
        # groupby, and don't create features of inputs/groupbys which are
        # all direct features with the same relationship path
        for matching_input in matching_inputs:
            if all(bf.number_output_features == 1 for bf in matching_input):
                for groupby in groupby_matches:
                    if require_direct_input and (
                            _all_direct_and_same_path(matching_input + (groupby, )) or
                            not any([isinstance(feature, DirectFeature)
                                     for feature in (matching_input + (groupby, ))])):
                        continue
                    # groupby match is indexed; element 0 is the groupby feature
                    new_f = GroupByTransformFeature(list(matching_input),
                                                    groupby=groupby[0],
                                                    primitive=groupby_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)
def test_transform_description(es):
    """Absolute produces a readable description of its input column."""
    value = IdentityFeature(es['log'].ww['value'])
    abs_feat = TransformFeature(value, Absolute)
    assert describe_feature(abs_feat) == 'The absolute value of the "value".'
def _build_transform_features(self, all_features, entity, max_depth=0):
    """Creates trans_features for all the variables in an entity

    Args:
        all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
            Dict containing a dict for each entity. Each nested dict
            has features as values with their ids as keys

        entity (Entity): Entity to calculate features for.

        max_depth (int, optional): Remaining stacking depth; a negative
            value builds nothing. ``None`` means unlimited depth.
    """
    if max_depth is not None and max_depth < 0:
        return

    new_max_depth = None
    if max_depth is not None:
        new_max_depth = max_depth - 1
    self._add_identity_features(all_features, entity)
    for trans_prim in self.trans_primitives:
        # if multiple input_types, only use first one for DFS
        input_types = trans_prim.input_types
        if type(input_types[0]) == list:
            input_types = input_types[0]
        features = self._features_by_type(all_features=all_features,
                                          entity=entity,
                                          max_depth=new_max_depth,
                                          variable_type=set(input_types))
        matching_inputs = match(input_types, features,
                                commutative=trans_prim.commutative)
        for matching_input in matching_inputs:
            # multi-output features are never used as transform inputs
            if all(bf.number_output_features == 1 for bf in matching_input):
                new_f = TransformFeature(matching_input, primitive=trans_prim)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=new_f)

    for groupby_prim in self.groupby_trans_primitives:
        # Normally input_types is a list of what inputs can be supplied to
        # the primitive function. Here we temporarily add `Id` as an extra
        # item in input_types so that the matching function will also look
        # for feature columns to group by.
        input_types = groupby_prim.input_types[:]
        # if multiple input_types, only use first one for DFS.
        # BUG FIX: copy the nested list before appending. Previously
        # `input_types = input_types[0]` aliased the primitive's own
        # class-level list, so `input_types.append(Id)` below permanently
        # mutated `groupby_prim.input_types`, adding one more `Id` on
        # every call to this method.
        if type(input_types[0]) == list:
            input_types = input_types[0][:]
        input_types.append(Id)
        features = self._features_by_type(all_features=all_features,
                                          entity=entity,
                                          max_depth=new_max_depth,
                                          variable_type=set(input_types))
        matching_inputs = match(input_types, features,
                                commutative=groupby_prim.commutative)
        for matching_input in matching_inputs:
            if all(bf.number_output_features == 1 for bf in matching_input):
                # The appended `Id` makes the last element of each match
                # the column to group by; the rest are primitive inputs.
                new_f = GroupByTransformFeature(list(matching_input[:-1]),
                                                groupby=matching_input[-1],
                                                primitive=groupby_prim)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=new_f)
def _build_transform_features(self, all_features, dataframe, max_depth=0,
                              require_direct_input=False):
    """Creates trans_features for all the columns in a dataframe

    Args:
        all_features (dict[dataframe name: dict->[str->:class:`BaseFeature`]]):
            Dict containing a dict for each dataframe. Each nested dict
            has features as values with their ids as keys

        dataframe (DataFrame): DataFrame to calculate features for.

        max_depth (int, optional): Remaining stacking depth; inputs are
            searched at ``max_depth - 1``. ``None`` means unlimited.

        require_direct_input (bool): When True, only build groupby features
            whose inputs or groupby include a DirectFeature.
    """
    new_max_depth = None
    if max_depth is not None:
        new_max_depth = max_depth - 1
    # Keep track of features to add until the end to avoid applying
    # transform primitives to features that were also built by transform primitives
    features_to_add = []
    for trans_prim in self.trans_primitives:
        # options may be keyed by primitive object or by its name
        current_options = self.primitive_options.get(
            trans_prim, self.primitive_options.get(trans_prim.name))
        if ignore_dataframe_for_primitive(current_options, dataframe):
            continue
        input_types = trans_prim.input_types
        matching_inputs = self._get_matching_inputs(
            all_features,
            dataframe,
            new_max_depth,
            input_types,
            trans_prim,
            current_options,
            require_direct_input=require_direct_input,
        )
        for matching_input in matching_inputs:
            # only single-output inputs that pass the stacking rules are used
            if all(bf.number_output_features == 1
                   for bf in matching_input) and check_transform_stacking(matching_input):
                new_f = TransformFeature(matching_input, primitive=trans_prim)
                features_to_add.append(new_f)
    for groupby_prim in self.groupby_trans_primitives:
        current_options = self.primitive_options.get(
            groupby_prim, self.primitive_options.get(groupby_prim.name))
        if ignore_dataframe_for_primitive(current_options, dataframe, groupby=True):
            continue
        # copy of the primitive's declared input types
        input_types = groupby_prim.input_types[:]
        matching_inputs = self._get_matching_inputs(
            all_features,
            dataframe,
            new_max_depth,
            input_types,
            groupby_prim,
            current_options,
        )
        # get columns to use as groupbys, use IDs as default unless other groupbys specified
        if any([
                "include_groupby_columns" in option and
                dataframe.ww.name in option["include_groupby_columns"]
                for option in current_options
        ]):
            # explicit groupby columns requested: consider all schemas and
            # let filter_groupby_matches_by_options narrow the candidates
            column_schemas = "all"
        else:
            column_schemas = [ColumnSchema(semantic_tags=["foreign_key"])]
        groupby_matches = self._features_by_type(
            all_features=all_features,
            dataframe=dataframe,
            max_depth=new_max_depth,
            column_schemas=column_schemas,
        )
        groupby_matches = filter_groupby_matches_by_options(
            groupby_matches, current_options)
        # If require_direct_input, require a DirectFeature in input or as a
        # groupby, and don't create features of inputs/groupbys which are
        # all direct features with the same relationship path
        for matching_input in matching_inputs:
            if all(bf.number_output_features == 1
                   for bf in matching_input) and check_transform_stacking(matching_input):
                for groupby in groupby_matches:
                    if require_direct_input and (
                            _all_direct_and_same_path(matching_input + (groupby, )) or
                            not any([
                                isinstance(feature, DirectFeature)
                                for feature in (matching_input + (groupby, ))
                            ])):
                        continue
                    # groupby match is indexed; element 0 is the groupby feature
                    new_f = GroupByTransformFeature(
                        list(matching_input),
                        groupby=groupby[0],
                        primitive=groupby_prim,
                    )
                    features_to_add.append(new_f)
    # register everything at once so newly built features are not fed back
    # into the primitive loops above
    for new_f in features_to_add:
        self._handle_new_feature(all_features=all_features, new_feature=new_f)
def _build_transform_features(self, all_features, entity, max_depth=0,
                              require_direct_input=False):
    """Build transform and groupby-transform features for an entity.

    Args:
        all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
            Dict containing a dict for each entity. Each nested dict
            has features as values with their ids as keys

        entity (Entity): Entity to calculate features for.

        max_depth (int, optional): Remaining stacking depth; inputs are
            searched at ``max_depth - 1``. ``None`` means unlimited.

        require_direct_input (bool): Passed through to input matching.
    """
    new_max_depth = max_depth - 1 if max_depth is not None else None

    for primitive in self.trans_primitives:
        # When several signatures are accepted, DFS only uses the first
        accepted_types = primitive.input_types
        if type(accepted_types[0]) == list:
            accepted_types = accepted_types[0]
        candidates = self._get_matching_inputs(
            all_features, entity, new_max_depth, accepted_types, primitive,
            require_direct_input=require_direct_input)
        for candidate in candidates:
            # Multi-output features cannot serve as transform inputs
            if any(bf.number_output_features != 1 for bf in candidate):
                continue
            feature = TransformFeature(candidate, primitive=primitive)
            self._handle_new_feature(all_features=all_features,
                                     new_feature=feature)

    for primitive in self.groupby_trans_primitives:
        accepted_types = primitive.input_types[:]
        # When several signatures are accepted, DFS only uses the first
        if type(accepted_types[0]) == list:
            accepted_types = accepted_types[0]
        candidates = self._get_matching_inputs(
            all_features, entity, new_max_depth, accepted_types, primitive,
            require_direct_input=require_direct_input)
        # Id-typed features are the candidate groupby columns
        id_features = self._features_by_type(all_features=all_features,
                                             entity=entity,
                                             max_depth=new_max_depth,
                                             variable_type=set([Id]))
        for candidate in candidates:
            if any(bf.number_output_features != 1 for bf in candidate):
                continue
            for id_groupby in id_features:
                feature = GroupByTransformFeature(list(candidate),
                                                  groupby=id_groupby,
                                                  primitive=primitive)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=feature)
def trans_feat(es):
    """Fixture: Year transform of the customers' cancel_date column."""
    cancel_date = IdentityFeature(es['customers'].ww['cancel_date'])
    return TransformFeature(cancel_date, Year)
def trans_feat(es):
    """Fixture: Year transform built on the customers cancel_date variable."""
    base = es['customers']['cancel_date']
    return TransformFeature(base, Year)
def trans_feat(es):
    """Fixture: Year of customers.cancel_date as an identity-based transform."""
    source_column = IdentityFeature(es["customers"].ww["cancel_date"])
    return TransformFeature(source_column, Year)