Exemplo n.º 1
0
def test_groupby_transform(es):
    """Verify the graph drawn for CumMax of customer age grouped by cohort.

    Checks that the graph source contains the expected nodes and edges, and
    that the customers table lists the target header first, both input
    columns (in either order) next, and the new feature as its last row.
    """
    feat = GroupByTransformFeature(es['customers']['age'], CumMax,
                                   es['customers']['cohort'])
    source = graph_feature(feat).source
    name = feat.get_name()

    primitive_node = "0_{}_cum_max".format(name)
    groupby_node = '{}_groupby_customers--cohort'.format(name)
    table_header = '\u2605 customers (target)'

    expected_fragments = (
        name,
        primitive_node,
        groupby_node,
        table_header,
        # groupby column -> groupby node
        'customers:cohort -> "{}"'.format(groupby_node),
        # input column -> groupby node
        'customers:age -> "{}"'.format(groupby_node),
        # groupby node -> primitive node
        '"{}" -> "{}"'.format(groupby_node, primitive_node),
        # primitive node -> output feature column
        '"{}" -> customers:"{}"'.format(primitive_node, name),
    )
    for fragment in expected_fragments:
        assert fragment in source

    # Exactly one HTML-like table is expected for the customers node.
    tables = re.findall(r"customers \[label=<\n<TABLE.*?</TABLE>>", source,
                        re.DOTALL)
    assert len(tables) == 1
    table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
    assert len(table_rows) == 4
    assert table_header in table_rows[0]
    assert name in table_rows[-1]

    # The two input columns may be listed in either order.
    age_then_cohort = 'age' in table_rows[1] and 'cohort' in table_rows[2]
    cohort_then_age = 'age' in table_rows[2] and 'cohort' in table_rows[1]
    assert age_then_cohort or cohort_then_age
def test_groupby_transform_direct_groupby(es):
    """Verify the graph when the groupby is a DirectFeature from cohorts.

    The graph source must contain the join, groupby and primitive nodes with
    their edges, and each dataframe table must contain exactly the expected
    rows (matched in any order, each expected label used once).
    """
    cohort_name = IdentityFeature(es['cohorts'].ww['cohort_name'])
    groupby = DirectFeature(cohort_name, 'customers')
    age = IdentityFeature(es['customers'].ww['age'])
    feat = GroupByTransformFeature(age, CumMax, groupby)
    source = graph_feature(feat).source

    groupby_name = groupby.get_name()
    feat_name = feat.get_name()

    # Node identifiers and table titles.
    join_node = '1_{}_join'.format(groupby_name)
    prim_node = "0_{}_cum_max".format(feat_name)
    groupby_node = '{}_groupby_customers--{}'.format(feat_name, groupby_name)
    customers_table = '\u2605 customers (target)'
    cohorts_table = 'cohorts'

    expected_fragments = (
        groupby_name,
        feat_name,
        join_node,
        prim_node,
        groupby_node,
        customers_table,
        cohorts_table,
        # edges around the join node
        '"{}" -> customers:cohort'.format(join_node),
        'cohorts:cohort_name -> "{}"'.format(join_node),
        '"{}" -> customers:"{}"'.format(join_node, groupby_name),
        # edges around the groupby node
        'customers:"{}" -> "{}"'.format(groupby_name, groupby_node),
        'customers:age -> "{}"'.format(groupby_node),
        # groupby node -> primitive node -> output column
        '"{}" -> "{}"'.format(groupby_node, prim_node),
        '"{}" -> customers:"{}"'.format(prim_node, feat_name),
    )
    for fragment in expected_fragments:
        assert fragment in source

    expected_rows = {
        'cohorts': [cohorts_table, 'cohort_name'],
        'customers':
        [customers_table, 'cohort', 'age', groupby_name, feat_name],
    }
    for table_name, remaining in expected_rows.items():
        pattern = r"{} \[label=<\n<TABLE.*?</TABLE>>".format(table_name)
        tables = re.findall(pattern, source, re.DOTALL)
        assert len(tables) == 1

        table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(table_rows) == len(remaining)

        for row in table_rows:
            # Consume the first still-unused label found in this row so that
            # every expected label accounts for exactly one row.
            hits = [label for label in remaining if label in row]
            if hits:
                remaining.remove(hits[0])
            assert hits
def test_metadata(es, tmpdir):
    """Check describe_feature with inline overrides and with a metadata file.

    Custom feature descriptions and primitive templates are first passed as
    keyword arguments, then written to a JSON file and passed through
    ``metadata_file``; both routes must produce the same descriptions.
    """
    identity_descriptions = {
        'sessions: device_name': 'the name of the device used for each session',
        'customers: id': "the customer's id",
    }
    device_name = IdentityFeature(es['sessions'].ww['device_name'])
    agg_feat = AggregationFeature(device_name, 'customers', NumUnique)
    expected_agg = ('The number of unique elements in the name of the device used for each '
                    'session of all instances of "sessions" for each customer\'s id.')
    actual_agg = describe_feature(agg_feat, feature_descriptions=identity_descriptions)
    assert actual_agg == expected_agg

    value = IdentityFeature(es['log'].ww['value'])
    session_id = IdentityFeature(es['log'].ww['session_id'])
    transform_feat = GroupByTransformFeature(value, CumMean, session_id)
    expected_transform = 'The running average of the "value" for each "session_id".'
    templates = {"cum_mean": "the running average of {}"}
    actual_transform = describe_feature(transform_feat, primitive_templates=templates)
    assert actual_transform == expected_transform

    zipcode = IdentityFeature(es['log'].ww['zipcode'])
    custom_agg = AggregationFeature(zipcode, 'sessions', Mode)
    expected_auto = 'The most frequently occurring value of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    custom_text = "the most frequently used zipcode"
    # A custom description is expected back capitalised and with a full stop.
    expected_custom = '{}{}.'.format(custom_text[0].upper(), custom_text[1:])
    custom_descriptions = {'sessions: MODE(log.zipcode)': custom_text}
    assert describe_feature(custom_agg) == expected_auto
    actual_custom = describe_feature(custom_agg, feature_descriptions=custom_descriptions)
    assert actual_custom == expected_custom

    # Persist the same overrides and read them back through metadata_file.
    all_descriptions = dict(identity_descriptions)
    all_descriptions.update(custom_descriptions)
    metadata = {
        'feature_descriptions': all_descriptions,
        'primitive_templates': templates,
    }
    metadata_path = os.path.join(tmpdir, 'description_metadata.json')
    with open(metadata_path, 'w') as handle:
        json.dump(metadata, handle)

    assert describe_feature(agg_feat, metadata_file=metadata_path) == expected_agg
    assert describe_feature(transform_feat, metadata_file=metadata_path) == expected_transform
    assert describe_feature(custom_agg, metadata_file=metadata_path) == expected_custom
Exemplo n.º 4
0
def test_groupby_transform_description(es):
    """Check the default description of CumMean of log value per session_id."""
    value = IdentityFeature(es["log"].ww["value"])
    session_id = IdentityFeature(es["log"].ww["session_id"])
    cum_mean_by_session = GroupByTransformFeature(value, CumMean, session_id)

    expected = 'The cumulative mean of the "value" for each "session_id".'
    assert describe_feature(cum_mean_by_session) == expected
Exemplo n.º 5
0
def test_groupby_transform(es):
    """Verify the graph drawn for CumMax of customer age grouped by cohort.

    Checks that the graph source contains the expected nodes and edges, and
    that the customers table lists the target header first, both input
    columns (in either order) next, and the new feature as its last row.
    """
    age = IdentityFeature(es["customers"].ww["age"])
    cohort = IdentityFeature(es["customers"].ww["cohort"])
    feat = GroupByTransformFeature(age, CumMax, cohort)
    source = graph_feature(feat).source
    name = feat.get_name()

    primitive_node = "0_{}_cum_max".format(name)
    groupby_node = "{}_groupby_customers--cohort".format(name)
    table_header = "\u2605 customers (target)"

    expected_fragments = (
        name,
        primitive_node,
        groupby_node,
        table_header,
        # groupby column -> groupby node
        'customers:cohort -> "{}"'.format(groupby_node),
        # input column -> groupby node
        'customers:age -> "{}"'.format(groupby_node),
        # groupby node -> primitive node
        '"{}" -> "{}"'.format(groupby_node, primitive_node),
        # primitive node -> output feature column
        '"{}" -> customers:"{}"'.format(primitive_node, name),
    )
    for fragment in expected_fragments:
        assert fragment in source

    # Exactly one HTML-like table is expected for the customers node.
    tables = re.findall(r"customers \[label=<\n<TABLE.*?</TABLE>>", source, re.DOTALL)
    assert len(tables) == 1
    table_rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
    assert len(table_rows) == 4
    assert table_header in table_rows[0]
    assert name in table_rows[-1]

    # The two input columns may be listed in either order.
    age_then_cohort = "age" in table_rows[1] and "cohort" in table_rows[2]
    cohort_then_age = "age" in table_rows[2] and "cohort" in table_rows[1]
    assert age_then_cohort or cohort_then_age
    def _build_transform_features(self,
                                  all_features,
                                  entity,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for one entity.

        Each created feature is handed to ``self._handle_new_feature``;
        nothing is returned.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int or None): Input features are looked up with
                ``max_depth - 1``, or with ``None`` when this is ``None``.

            require_direct_input (bool): Forwarded to
                ``self._get_matching_inputs`` for transform primitives. For
                groupby primitives, an input/groupby combination is skipped
                when ``_all_direct_and_same_path`` holds for it or when it
                contains no ``DirectFeature``.
        """
        child_depth = None if max_depth is None else max_depth - 1

        for primitive in self.trans_primitives:
            options = self.primitive_options[primitive.name]
            if ignore_entity_for_primitive(options, entity):
                continue

            # if multiple input_types, only use first one for DFS
            signature = primitive.input_types
            if type(signature[0]) == list:
                signature = signature[0]

            candidate_inputs = self._get_matching_inputs(
                all_features,
                entity,
                child_depth,
                signature,
                primitive,
                options,
                require_direct_input=require_direct_input)

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                transform = TransformFeature(inputs, primitive=primitive)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=transform)

        for primitive in self.groupby_trans_primitives:
            options = self.primitive_options[primitive.name]
            if ignore_entity_for_primitive(options, entity, groupby=True):
                continue

            # if multiple input_types, only use first one for DFS
            signature = primitive.input_types[:]
            if type(signature[0]) == list:
                signature = signature[0]
            candidate_inputs = self._get_matching_inputs(
                all_features, entity, child_depth, signature, primitive,
                options)

            # get columns to use as groupbys, use IDs as default unless other
            # groupbys specified
            has_explicit_groupbys = any([
                'include_groupby_variables' in option
                and entity.id in option['include_groupby_variables']
                for option in options
            ])
            groupby_type = (variable_types.PandasTypes._all
                            if has_explicit_groupbys else {Id})
            groupby_candidates = self._features_by_type(
                all_features=all_features,
                entity=entity,
                max_depth=child_depth,
                variable_type=groupby_type)
            groupby_candidates = filter_groupby_matches_by_options(
                groupby_candidates, options)

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                for groupby in groupby_candidates:
                    if require_direct_input:
                        # Require a DirectFeature in the inputs or as the
                        # groupby, and skip combinations that are all direct
                        # features with the same relationship path.
                        # NOTE(review): this inspects ``groupby`` itself,
                        # while the feature below is built from
                        # ``groupby[0]`` -- confirm that is intended.
                        combined = inputs + (groupby, )
                        if _all_direct_and_same_path(combined):
                            continue
                        has_direct = any([
                            isinstance(feature, DirectFeature)
                            for feature in combined
                        ])
                        if not has_direct:
                            continue
                    grouped = GroupByTransformFeature(list(inputs),
                                                      groupby=groupby[0],
                                                      primitive=primitive)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=grouped)
Exemplo n.º 7
0
def test_groupby_transform_description(es):
    """Check the default description of CumMean of log value per session_id."""
    cum_mean_by_session = GroupByTransformFeature(es['log']['value'], CumMean,
                                                  es['log']['session_id'])

    expected = 'The cumulative mean of the "value" for each "session_id".'
    assert describe_feature(cum_mean_by_session) == expected
Exemplo n.º 8
0
def test_metadata(es, tmpdir):
    """Check describe_feature with inline overrides and with a metadata file.

    Custom feature descriptions and primitive templates are first passed as
    keyword arguments, then written to a JSON file and passed through
    ``metadata_file``; both routes must produce the same descriptions.
    """
    identity_descriptions = {
        "sessions: device_name": "the name of the device used for each session",
        "customers: id": "the customer's id",
    }
    device_name = IdentityFeature(es["sessions"].ww["device_name"])
    agg_feat = AggregationFeature(device_name, "customers", NumUnique)
    expected_agg = (
        "The number of unique elements in the name of the device used for each "
        'session of all instances of "sessions" for each customer\'s id.'
    )
    actual_agg = describe_feature(
        agg_feat, feature_descriptions=identity_descriptions
    )
    assert actual_agg == expected_agg

    value = IdentityFeature(es["log"].ww["value"])
    session_id = IdentityFeature(es["log"].ww["session_id"])
    transform_feat = GroupByTransformFeature(value, CumMean, session_id)
    expected_transform = 'The running average of the "value" for each "session_id".'
    templates = {"cum_mean": "the running average of {}"}
    actual_transform = describe_feature(transform_feat, primitive_templates=templates)
    assert actual_transform == expected_transform

    zipcode = IdentityFeature(es["log"].ww["zipcode"])
    custom_agg = AggregationFeature(zipcode, "sessions", Mode)
    expected_auto = 'The most frequently occurring value of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    custom_text = "the most frequently used zipcode"
    # A custom description is expected back capitalised and with a full stop.
    expected_custom = "{}{}.".format(custom_text[0].upper(), custom_text[1:])
    custom_descriptions = {"sessions: MODE(log.zipcode)": custom_text}
    assert describe_feature(custom_agg) == expected_auto
    actual_custom = describe_feature(
        custom_agg, feature_descriptions=custom_descriptions
    )
    assert actual_custom == expected_custom

    # Persist the same overrides and read them back through metadata_file.
    all_descriptions = dict(identity_descriptions)
    all_descriptions.update(custom_descriptions)
    metadata = {
        "feature_descriptions": all_descriptions,
        "primitive_templates": templates,
    }
    metadata_path = os.path.join(tmpdir, "description_metadata.json")
    with open(metadata_path, "w") as handle:
        json.dump(metadata, handle)

    from_file_agg = describe_feature(agg_feat, metadata_file=metadata_path)
    assert from_file_agg == expected_agg
    from_file_transform = describe_feature(transform_feat, metadata_file=metadata_path)
    assert from_file_transform == expected_transform
    from_file_custom = describe_feature(custom_agg, metadata_file=metadata_path)
    assert from_file_custom == expected_custom
Exemplo n.º 9
0
    def _build_transform_features(self, all_features, entity, max_depth=0):
        """Creates trans_features for all the variables in an entity

        Identity features are added first via ``self._add_identity_features``;
        every transform and groupby-transform feature built afterwards is
        handed to ``self._handle_new_feature``. Nothing is returned.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys

            entity (Entity): Entity to calculate features for.

            max_depth (int or None): The method returns immediately when
                this is negative. Input features are looked up with
                ``max_depth - 1``, or with ``None`` when this is ``None``.
        """
        if max_depth is not None and max_depth < 0:
            return

        new_max_depth = None
        if max_depth is not None:
            new_max_depth = max_depth - 1

        self._add_identity_features(all_features, entity)

        for trans_prim in self.trans_primitives:
            # if multiple input_types, only use first one for DFS
            input_types = trans_prim.input_types
            if isinstance(input_types[0], list):
                input_types = input_types[0]

            features = self._features_by_type(all_features=all_features,
                                              entity=entity,
                                              max_depth=new_max_depth,
                                              variable_type=set(input_types))

            matching_inputs = match(input_types,
                                    features,
                                    commutative=trans_prim.commutative)

            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    new_f = TransformFeature(matching_input,
                                             primitive=trans_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)

        for groupby_prim in self.groupby_trans_primitives:
            # if multiple input_types, only use first one for DFS
            input_types = groupby_prim.input_types
            if isinstance(input_types[0], list):
                input_types = input_types[0]
            # Normally input_types is a list of what inputs can be supplied to
            # the primitive function.  Here we temporarily add `Id` as an extra
            # item so that the matching function will also look for feature
            # columns to group by.  A fresh list is built on purpose:
            # appending to the primitive's own (possibly nested) list would
            # permanently grow its declared signature on every call.
            input_types = list(input_types) + [Id]

            features = self._features_by_type(all_features=all_features,
                                              entity=entity,
                                              max_depth=new_max_depth,
                                              variable_type=set(input_types))
            matching_inputs = match(input_types,
                                    features,
                                    commutative=groupby_prim.commutative)
            for matching_input in matching_inputs:
                if all(bf.number_output_features == 1
                       for bf in matching_input):
                    # The last matched feature fills the extra `Id` slot and
                    # is the groupby; the rest are the primitive's inputs.
                    new_f = GroupByTransformFeature(list(matching_input[:-1]),
                                                    groupby=matching_input[-1],
                                                    primitive=groupby_prim)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=new_f)
Exemplo n.º 10
0
    def _build_transform_features(self,
                                  all_features,
                                  dataframe,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for one dataframe.

        New features are collected first and only handed to
        ``self._handle_new_feature`` once every primitive has been processed.
        Nothing is returned.

        Args:
            all_features (dict[dataframe name: dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each dataframe. Each nested dict
                has features as values with their ids as keys.

            dataframe (DataFrame): DataFrame to calculate features for.

            max_depth (int or None): Input features are looked up with
                ``max_depth - 1``, or with ``None`` when this is ``None``.

            require_direct_input (bool): Forwarded to
                ``self._get_matching_inputs`` for transform primitives. For
                groupby primitives, an input/groupby combination is skipped
                when ``_all_direct_and_same_path`` holds for it or when it
                contains no ``DirectFeature``.
        """
        child_depth = None if max_depth is None else max_depth - 1

        # Keep track of features to add until the end to avoid applying
        # transform primitives to features that were also built by transform
        # primitives
        pending = []

        for primitive in self.trans_primitives:
            # Options keyed by the primitive itself win over options keyed by
            # its name.
            options_by_name = self.primitive_options.get(primitive.name)
            options = self.primitive_options.get(primitive, options_by_name)
            if ignore_dataframe_for_primitive(options, dataframe):
                continue

            candidate_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                child_depth,
                primitive.input_types,
                primitive,
                options,
                require_direct_input=require_direct_input,
            )

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                if not check_transform_stacking(inputs):
                    continue
                pending.append(TransformFeature(inputs, primitive=primitive))

        for primitive in self.groupby_trans_primitives:
            options_by_name = self.primitive_options.get(primitive.name)
            options = self.primitive_options.get(primitive, options_by_name)
            if ignore_dataframe_for_primitive(options, dataframe,
                                              groupby=True):
                continue

            signature = primitive.input_types[:]
            candidate_inputs = self._get_matching_inputs(
                all_features,
                dataframe,
                child_depth,
                signature,
                primitive,
                options,
            )

            # get columns to use as groupbys, use IDs as default unless other
            # groupbys specified
            has_explicit_groupbys = any([
                "include_groupby_columns" in option
                and dataframe.ww.name in option["include_groupby_columns"]
                for option in options
            ])
            if has_explicit_groupbys:
                groupby_schemas = "all"
            else:
                groupby_schemas = [ColumnSchema(semantic_tags=["foreign_key"])]
            groupby_candidates = self._features_by_type(
                all_features=all_features,
                dataframe=dataframe,
                max_depth=child_depth,
                column_schemas=groupby_schemas,
            )
            groupby_candidates = filter_groupby_matches_by_options(
                groupby_candidates, options)

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                if not check_transform_stacking(inputs):
                    continue
                for groupby in groupby_candidates:
                    if require_direct_input:
                        # Require a DirectFeature in the inputs or as the
                        # groupby, and skip combinations that are all direct
                        # features with the same relationship path.
                        # NOTE(review): this inspects ``groupby`` itself,
                        # while the feature below is built from
                        # ``groupby[0]`` -- confirm that is intended.
                        combined = inputs + (groupby, )
                        if _all_direct_and_same_path(combined):
                            continue
                        has_direct = any([
                            isinstance(feature, DirectFeature)
                            for feature in combined
                        ])
                        if not has_direct:
                            continue
                    pending.append(
                        GroupByTransformFeature(
                            list(inputs),
                            groupby=groupby[0],
                            primitive=primitive,
                        ))

        for feature in pending:
            self._handle_new_feature(all_features=all_features,
                                     new_feature=feature)
    def _build_transform_features(self,
                                  all_features,
                                  entity,
                                  max_depth=0,
                                  require_direct_input=False):
        """Build transform and groupby-transform features for one entity.

        Each created feature is handed to ``self._handle_new_feature``;
        nothing is returned.

        Args:
            all_features (dict[:class:`.Entity`.id:dict->[str->:class:`BaseFeature`]]):
                Dict containing a dict for each entity. Each nested dict
                has features as values with their ids as keys.

            entity (Entity): Entity to calculate features for.

            max_depth (int or None): Input features are looked up with
                ``max_depth - 1``, or with ``None`` when this is ``None``.

            require_direct_input (bool): Forwarded to
                ``self._get_matching_inputs`` for both transform and groupby
                primitives.
        """
        child_depth = None if max_depth is None else max_depth - 1

        for primitive in self.trans_primitives:
            # if multiple input_types, only use first one for DFS
            signature = primitive.input_types
            if type(signature[0]) == list:
                signature = signature[0]

            candidate_inputs = self._get_matching_inputs(
                all_features,
                entity,
                child_depth,
                signature,
                primitive,
                require_direct_input=require_direct_input)

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                transform = TransformFeature(inputs, primitive=primitive)
                self._handle_new_feature(all_features=all_features,
                                         new_feature=transform)

        for primitive in self.groupby_trans_primitives:
            # if multiple input_types, only use first one for DFS
            signature = primitive.input_types[:]
            if type(signature[0]) == list:
                signature = signature[0]
            candidate_inputs = self._get_matching_inputs(
                all_features,
                entity,
                child_depth,
                signature,
                primitive,
                require_direct_input=require_direct_input)

            # get IDs to use as groupby
            id_features = self._features_by_type(all_features=all_features,
                                                 entity=entity,
                                                 max_depth=child_depth,
                                                 variable_type={Id})

            for inputs in candidate_inputs:
                # Only single-output features may be used as inputs.
                if not all(bf.number_output_features == 1 for bf in inputs):
                    continue
                # One groupby feature per (inputs, Id feature) pair.
                for id_feature in id_features:
                    grouped = GroupByTransformFeature(list(inputs),
                                                      groupby=id_feature,
                                                      primitive=primitive)
                    self._handle_new_feature(all_features=all_features,
                                             new_feature=grouped)