Пример #1
0
def test_invalid_init_args(diamond_es):
    """AggregationFeature must reject relationship paths that do not fit its arguments.

    Each path is built before entering ``pytest.raises`` so that only the
    ``AggregationFeature`` constructor can satisfy the expected
    ``AssertionError``; an assertion tripped while building a path fails the
    test instead of being mistaken for the error under test.
    """
    amount = diamond_es['transactions']['amount']

    # The path starts at 'stores' while 'customers' is given as the parent.
    path = backward_path(diamond_es, ['stores', 'transactions'])
    error_text = 'parent_entity must match first relationship in path'
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(amount,
                              diamond_es['customers'],
                              ft.primitives.Mean,
                              relationship_path=path)

    # The path ends at 'stores' while the base feature lives on 'transactions'.
    path = backward_path(diamond_es, ['regions', 'stores'])
    error_text = 'Base feature must be defined on the entity at the end of relationship_path'
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(amount,
                              diamond_es['regions'],
                              ft.primitives.Mean,
                              relationship_path=path)

    # Forward edges are prepended, so the path is no longer purely backward.
    backward = backward_path(diamond_es, ['customers', 'transactions'])
    forward = RelationshipPath([(True, r) for _, r in backward])
    path = RelationshipPath(list(forward) + list(backward))
    error_text = 'All relationships in path must be backward'
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(amount,
                              diamond_es['transactions'],
                              ft.primitives.Mean,
                              relationship_path=path)
Пример #2
0
def test_invalid_init_args(diamond_es):
    """AggregationFeature must reject relationship paths that do not fit its arguments.

    The base feature and each path are built before entering
    ``pytest.raises`` so that only the ``AggregationFeature`` constructor can
    satisfy the expected ``AssertionError``; an assertion tripped during
    setup fails the test instead of being mistaken for the error under test.
    """
    amount = ft.IdentityFeature(diamond_es["transactions"].ww["amount"])

    # The path starts at "stores" while "customers" is given as the parent.
    path = backward_path(diamond_es, ["stores", "transactions"])
    error_text = "parent_dataframe must match first relationship in path"
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(
            amount,
            "customers",
            ft.primitives.Mean,
            relationship_path=path,
        )

    # The path ends at "stores" while the base feature lives on "transactions".
    path = backward_path(diamond_es, ["regions", "stores"])
    error_text = (
        "Base feature must be defined on the dataframe at the end of relationship_path"
    )
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(
            amount,
            "regions",
            ft.primitives.Mean,
            relationship_path=path,
        )

    # Forward edges are prepended, so the path is no longer purely backward.
    backward = backward_path(diamond_es, ["customers", "transactions"])
    forward = RelationshipPath([(True, r) for _, r in backward])
    path = RelationshipPath(list(forward) + list(backward))
    error_text = "All relationships in path must be backward"
    with pytest.raises(AssertionError, match=error_text):
        ft.AggregationFeature(
            amount,
            "transactions",
            ft.primitives.Mean,
            relationship_path=path,
        )
Пример #3
0
def test_relationship_path_dataframes(es):
    """``RelationshipPath.dataframes()`` yields dataframe names in traversal order."""
    def visited(edges):
        # Materialize the names produced for a path built from ``edges``.
        return list(RelationshipPath(edges).dataframes())

    assert visited([]) == []

    log_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )

    # Two forward edges: log -> sessions -> customers.
    forward_edges = [(True, log_sessions), (True, sessions_customers)]
    assert visited(forward_edges) == ["log", "sessions", "customers"]

    # Two backward edges: customers -> sessions -> log.
    backward_edges = [(False, sessions_customers), (False, log_sessions)]
    assert visited(backward_edges) == ["customers", "sessions", "log"]

    # Forward then backward over the same relationship returns to "log".
    mixed_edges = [(True, log_sessions), (False, log_sessions)]
    assert visited(mixed_edges) == ["log", "sessions", "log"]
Пример #4
0
def test_two_relationships_to_single_entity(games_es):
    """Aggregate over each of the two relationships in ``games_es`` separately.

    A Mean aggregation is defined on 'teams' along each relationship, pulled
    back onto 'games' with a DirectFeature, and both columns are calculated
    for the first three instance ids.
    """
    home_rel, away_rel = games_es.relationships
    games = games_es['games']
    teams = games_es['teams']
    mean = ft.primitives.Mean

    home_score_mean = ft.AggregationFeature(
        games['home_team_score'],
        teams,
        relationship_path=RelationshipPath([(False, home_rel)]),
        primitive=mean)
    away_score_mean = ft.AggregationFeature(
        games['away_team_score'],
        teams,
        relationship_path=RelationshipPath([(False, away_rel)]),
        primitive=mean)

    # Bring each team-level mean back down to the games entity.
    home_direct = ft.DirectFeature(home_score_mean, games, relationship=home_rel)
    away_direct = ft.DirectFeature(away_score_mean, games, relationship=away_rel)

    calculator = FeatureSetCalculator(
        games_es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_direct, away_direct]))
    result = calculator.run(np.array(range(3)))
    result = to_pandas(result, index='id', sort_index=True)

    home_column = result[home_direct.get_name()]
    away_column = result[away_direct.get_name()]
    assert (home_column == [1.5, 1.5, 2.5]).all()
    assert (away_column == [1, 0.5, 2]).all()
Пример #5
0
def test_relationship_path_name(es):
    """``RelationshipPath.name`` is checked for empty, forward, backward and mixed paths."""
    def name_of(edges):
        # Name of the path built from the given (is_forward, relationship) pairs.
        return RelationshipPath(edges).name

    assert name_of([]) == ""

    log_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )

    forward_edges = [(True, log_sessions), (True, sessions_customers)]
    assert name_of(forward_edges) == "sessions.customers"

    backward_edges = [(False, sessions_customers), (False, log_sessions)]
    assert name_of(backward_edges) == "sessions.log"

    mixed_edges = [(True, log_sessions), (False, log_sessions)]
    assert name_of(mixed_edges) == "sessions.log"
Пример #6
0
def test_relationship_path_name(es):
    """``RelationshipPath.name`` is checked for empty, forward, backward and mixed paths."""
    def name_of(edges):
        # Name of the path built from the given (is_forward, relationship) pairs.
        return RelationshipPath(edges).name

    assert name_of([]) == ''

    log_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_customers = Relationship(
        es, 'customers', 'id', 'sessions', 'customer_id')

    forward_edges = [(True, log_sessions), (True, sessions_customers)]
    assert name_of(forward_edges) == 'sessions.customers'

    backward_edges = [(False, sessions_customers), (False, log_sessions)]
    assert name_of(backward_edges) == 'sessions.log'

    mixed_edges = [(True, log_sessions), (False, log_sessions)]
    assert name_of(mixed_edges) == 'sessions.log'
Пример #7
0
    def from_dictionary(cls, arguments, entityset, dependencies,
                        primitives_deserializer):
        """Rebuild a feature from its serialized ``arguments`` dictionary.

        Args:
            arguments (dict): Serialized feature; the keys 'base_features',
                'relationship_path', 'primitive', 'use_previous', 'where'
                and 'name' are read.
            entityset: Passed to ``Relationship.from_dictionary`` to rebuild
                each relationship.
            dependencies (dict): Maps feature names to already-built features;
                used to resolve the base features and the ``where`` feature.
            primitives_deserializer: Object whose ``deserialize_primitive``
                rebuilds the primitive.

        Returns:
            The instance produced by calling ``cls`` with the rebuilt pieces.
        """
        base_features = [
            dependencies[feature_name]
            for feature_name in arguments['base_features']
        ]

        relationships = [
            Relationship.from_dictionary(serialized, entityset)
            for serialized in arguments['relationship_path']
        ]
        # The parent entity is taken from the first relationship in the path.
        parent_entity = relationships[0].parent_entity
        # Every relationship is marked as a backward edge.
        backward_edges = [(False, relationship) for relationship in relationships]
        relationship_path = RelationshipPath(backward_edges)

        primitive = primitives_deserializer.deserialize_primitive(
            arguments['primitive'])

        # A falsy serialized value is passed through unchanged.
        use_previous = arguments['use_previous']
        if use_previous:
            use_previous = Timedelta.from_dictionary(use_previous)

        # A falsy name is passed through unchanged; otherwise resolve it.
        where = arguments['where']
        if where:
            where = dependencies[where]

        return cls(base_features=base_features,
                   parent_entity=parent_entity,
                   primitive=primitive,
                   relationship_path=relationship_path,
                   use_previous=use_previous,
                   where=where,
                   name=arguments['name'])
Пример #8
0
    def _handle_relationship_path(self, parent_entity, relationship_path):
        """Validate an explicit relationship path, or infer one when none is given.

        Args:
            parent_entity: Entity the path must start from; its ``id`` and
                ``entityset`` attributes are read.
            relationship_path: Sequence of ``(is_forward, relationship)``
                pairs, or a falsy value to have the path looked up.

        Returns:
            tuple: ``(relationship_path, path_is_unique)``. For an explicit
            path, ``path_is_unique`` is whatever
            ``entityset.has_unique_forward_path`` reports from the child
            entity to the parent entity; for an inferred path it is True.

        Raises:
            AssertionError: If an explicit path contains a forward edge, does
                not start at ``parent_entity``, or does not end at
                ``self.child_entity``.
            RuntimeError: If no path is given and either no backward path or
                more than one backward path is found.
        """
        if relationship_path:
            # An explicit path was supplied: check direction and both endpoints.
            assert all(not is_forward for is_forward, _r in relationship_path), \
                'All relationships in path must be backward'

            _is_forward, first_relationship = relationship_path[0]
            first_parent = first_relationship.parent_entity
            assert parent_entity.id == first_parent.id, \
                'parent_entity must match first relationship in path.'

            _is_forward, last_relationship = relationship_path[-1]
            assert self.child_entity.id == last_relationship.child_entity.id, \
                'Base feature must be defined on the entity at the end of relationship_path'

            path_is_unique = parent_entity.entityset \
                .has_unique_forward_path(self.child_entity.id, parent_entity.id)
        else:
            # No path supplied: look up backward paths from parent to child.
            paths = parent_entity.entityset \
                .find_backward_paths(parent_entity.id, self.child_entity.id)
            # Pull at most two results; the second pull only detects ambiguity.
            first_path = next(paths, None)

            if not first_path:
                raise RuntimeError('No backward path from "%s" to "%s" found.'
                                   % (parent_entity.id, self.child_entity.id))
            # Check for another path.
            elif next(paths, None):
                message = "There are multiple possible paths to the base entity. " \
                          "You must specify a relationship path."
                raise RuntimeError(message)

            # Wrap the single path found, marking every edge as backward.
            relationship_path = RelationshipPath([(False, r) for r in first_path])
            path_is_unique = True

        return relationship_path, path_is_unique
Пример #9
0
    def from_dictionary(cls, arguments, entityset, dependencies, primitive):
        """Rebuild a feature from its serialized ``arguments`` dictionary.

        Args:
            arguments (dict): Serialized feature; the keys "base_features",
                "relationship_path", "use_previous", "where" and "name" are
                read, and "feature_names" is read if present.
            entityset: Passed to ``Relationship.from_dictionary`` to rebuild
                each relationship.
            dependencies (dict): Maps feature names to already-built features;
                used to resolve the base features and the ``where`` feature.
            primitive: Primitive handed to the constructor unchanged.

        Returns:
            The instance produced by calling ``cls``, with ``_names`` set from
            ``arguments.get("feature_names")``.
        """
        base_features = [
            dependencies[feature_name] for feature_name in arguments["base_features"]
        ]

        relationships = [
            Relationship.from_dictionary(serialized, entityset)
            for serialized in arguments["relationship_path"]
        ]
        # The parent dataframe name is taken from the first relationship.
        parent_dataframe_name = relationships[0].parent_dataframe.ww.name
        # Every relationship is marked as a backward edge.
        backward_edges = [(False, relationship) for relationship in relationships]
        relationship_path = RelationshipPath(backward_edges)

        # A falsy serialized value is passed through unchanged.
        use_previous = arguments["use_previous"]
        if use_previous:
            use_previous = Timedelta.from_dictionary(use_previous)

        # A falsy name is passed through unchanged; otherwise resolve it.
        where = arguments["where"]
        if where:
            where = dependencies[where]

        rebuilt = cls(
            base_features=base_features,
            parent_dataframe_name=parent_dataframe_name,
            primitive=primitive,
            relationship_path=relationship_path,
            use_previous=use_previous,
            where=where,
            name=arguments["name"],
        )
        rebuilt._names = arguments.get("feature_names")
        return rebuilt
Пример #10
0
 def __init__(self, variable, name=None):
     """Create an identity feature for ``variable``.

     The stored ``self.variable`` is looked up in the metadata of the
     variable's entityset (by entity id and variable id) rather than being
     the object passed in. ``self.return_type`` is the class of ``variable``.

     Args:
         variable: Variable to wrap; its ``entity_id``, ``entityset``, ``id``
             and ``entity`` attributes are read.
         name (str, optional): Forwarded to the base-class initializer.
     """
     owning_entity_id = variable.entity_id
     entityset_metadata = variable.entityset.metadata
     self.variable = entityset_metadata[owning_entity_id][variable.id]
     self.return_type = type(variable)

     # An identity feature has no base features and an empty path.
     empty_path = RelationshipPath([])
     super(IdentityFeature, self).__init__(
         entity=variable.entity,
         base_features=[],
         relationship_path=empty_path,
         primitive=PrimitiveBase,
         name=name)
Пример #11
0
    def __init__(self, base_feature, child_entity, relationship=None, name=None):
        """Create a direct feature on ``child_entity`` from a parent feature.

        Args:
            base_feature: Feature (or value accepted by ``_check_feature``)
                defined on the parent entity.
            child_entity: Entity this feature is defined on.
            relationship (optional): Relationship to follow; resolved through
                ``self._handle_relationship``.
            name (str, optional): Forwarded to the base-class initializer.
        """
        parent_feature = _check_feature(base_feature)
        self.parent_entity = parent_feature.entity

        resolved_relationship = self._handle_relationship(child_entity,
                                                          relationship)
        # The path is a single forward edge over the resolved relationship.
        single_edge_path = RelationshipPath([(True, resolved_relationship)])

        super(DirectFeature, self).__init__(
            entity=child_entity,
            base_features=[parent_feature],
            relationship_path=single_edge_path,
            primitive=PrimitiveBase,
            name=name)
Пример #12
0
def test_relationship_path_dataframes(es):
    """``RelationshipPath.dataframes()`` yields dataframe names in traversal order."""
    def visited(edges):
        # Materialize the names produced for a path built from ``edges``.
        return list(RelationshipPath(edges).dataframes())

    assert visited([]) == []

    log_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_customers = Relationship(
        es, 'customers', 'id', 'sessions', 'customer_id')

    # Two forward edges: log -> sessions -> customers.
    forward_edges = [(True, log_sessions), (True, sessions_customers)]
    assert visited(forward_edges) == ['log', 'sessions', 'customers']

    # Two backward edges: customers -> sessions -> log.
    backward_edges = [(False, sessions_customers), (False, log_sessions)]
    assert visited(backward_edges) == ['customers', 'sessions', 'log']

    # Forward then backward over the same relationship returns to 'log'.
    mixed_edges = [(True, log_sessions), (False, log_sessions)]
    assert visited(mixed_edges) == ['log', 'sessions', 'log']
Пример #13
0
def test_relationship_path(es):
    """A RelationshipPath supports indexing and iteration over its edges."""
    log_sessions = Relationship(es, 'sessions', 'id', 'log', 'session_id')
    sessions_customers = Relationship(
        es, 'customers', 'id', 'sessions', 'customer_id')

    expected_edges = [
        (True, log_sessions),
        (True, sessions_customers),
        (False, sessions_customers),
    ]
    path = RelationshipPath(expected_edges)

    # Indexing returns the edge stored at each position.
    for position, expected in enumerate(expected_edges):
        assert path[position] == expected

    # Iterating yields the same edges in the same order.
    assert [item for item in path] == expected_edges
Пример #14
0
    def __init__(self, column, name=None):
        """Create an identity feature for a column.

        The owning entityset is found through ``_ES_REF`` using the
        'entityset_id' stored in the column schema's metadata, and the owning
        dataframe through the 'dataframe_name' stored there.

        Args:
            column: Column whose ``ww`` accessor provides ``name`` and
                ``schema``.
            name (str, optional): Forwarded to the base-class initializer.
        """
        self.column_name = column.ww.name
        self.return_type = column.ww.schema

        schema_metadata = column.ww.schema._metadata
        owning_es = _ES_REF[schema_metadata['entityset_id']]
        owning_dataframe = owning_es[schema_metadata['dataframe_name']]

        # An identity feature has no base features and an empty path.
        empty_path = RelationshipPath([])
        super(IdentityFeature, self).__init__(
            dataframe=owning_dataframe,
            base_features=[],
            relationship_path=empty_path,
            primitive=PrimitiveBase,
            name=name)
Пример #15
0
def test_relationship_path_entities(es):
    """``RelationshipPath.entities()`` yields entity ids in traversal order."""
    def visited(edges):
        # Materialize the ids produced for a path built from ``edges``.
        return list(RelationshipPath(edges).entities())

    assert visited([]) == []

    log_sessions = Relationship(es['sessions']['id'],
                                es['log']['session_id'])
    sessions_customers = Relationship(es['customers']['id'],
                                      es['sessions']['customer_id'])

    # Two forward edges: log -> sessions -> customers.
    forward_edges = [(True, log_sessions), (True, sessions_customers)]
    assert visited(forward_edges) == ['log', 'sessions', 'customers']

    # Two backward edges: customers -> sessions -> log.
    backward_edges = [(False, sessions_customers), (False, log_sessions)]
    assert visited(backward_edges) == ['customers', 'sessions', 'log']

    # Forward then backward over the same relationship returns to 'log'.
    mixed_edges = [(True, log_sessions), (False, log_sessions)]
    assert visited(mixed_edges) == ['log', 'sessions', 'log']
Пример #16
0
def test_copy(games_es):
    """``copy()`` of an AggregationFeature compares equal on its key attributes."""
    # Pick the relationship whose child variable is 'home_team_id'.
    home_games = next(
        relationship for relationship in games_es.relationships
        if relationship.child_variable.id == 'home_team_id'
    )
    original = ft.AggregationFeature(
        games_es['games']['home_team_score'],
        games_es['teams'],
        relationship_path=RelationshipPath([(False, home_games)]),
        primitive=ft.primitives.Mean)

    duplicate = original.copy()

    assert duplicate.entity == original.entity
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
    assert duplicate.primitive == original.primitive
Пример #17
0
    def __init__(self, base_features, primitive, name=None):
        """Create a transform feature over ``base_features``.

        Args:
            base_features: Input feature(s), normalized by
                ``_validate_base_features``.
            primitive: Primitive forwarded to the base-class initializer.
            name (str, optional): Forwarded to the base-class initializer.

        Raises:
            ValueError: If any base feature has more than one output feature.
        """
        features = _validate_base_features(base_features)

        if any(feature.number_output_features > 1 for feature in features):
            raise ValueError("Cannot stack on whole multi-output feature.")

        # The feature is defined on the dataframe of the first base feature.
        first = features[0]
        owning_dataframe = first.entityset[first.dataframe_name]
        empty_path = RelationshipPath([])
        super(TransformFeature, self).__init__(
            dataframe=owning_dataframe,
            base_features=features,
            relationship_path=empty_path,
            primitive=primitive,
            name=name,
        )
Пример #18
0
def test_copy(games_es):
    """``copy()`` of an AggregationFeature compares equal on its key attributes."""
    # Pick the relationship whose child column is 'home_team_id'.
    home_games = next(
        relationship for relationship in games_es.relationships
        if relationship._child_column_name == 'home_team_id'
    )
    home_score = ft.IdentityFeature(games_es['games'].ww['home_team_score'])
    original = ft.AggregationFeature(
        home_score,
        'teams',
        relationship_path=RelationshipPath([(False, home_games)]),
        primitive=ft.primitives.Mean)

    duplicate = original.copy()

    assert duplicate.dataframe_name == original.dataframe_name
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
    assert duplicate.primitive == original.primitive
Пример #19
0
def test_relationship_path(es):
    """A RelationshipPath supports indexing and iteration over its edges."""
    log_sessions = Relationship(es, "sessions", "id", "log", "session_id")
    sessions_customers = Relationship(
        es, "customers", "id", "sessions", "customer_id"
    )

    expected_edges = [
        (True, log_sessions),
        (True, sessions_customers),
        (False, sessions_customers),
    ]
    path = RelationshipPath(expected_edges)

    # Indexing returns the edge stored at each position.
    for position, expected in enumerate(expected_edges):
        assert path[position] == expected

    # Iterating yields the same edges in the same order.
    assert [item for item in path] == expected_edges
Пример #20
0
def backward_path(es, dataframe_ids):
    """
    Build a backward RelationshipPath that visits ``dataframe_ids`` in order.

    For each consecutive (parent, child) pair, the first forward relationship
    of the child whose parent dataframe name matches is used, so the helper
    assumes only one such path is possible.
    """
    relationships = []
    for parent, child in zip(dataframe_ids[:-1], dataframe_ids[1:]):
        matching = (
            candidate for candidate in es.get_forward_relationships(child)
            if candidate._parent_dataframe_name == parent
        )
        relationships.append(next(matching))

    backward_edges = [(False, relationship) for relationship in relationships]
    return RelationshipPath(backward_edges)
Пример #21
0
    def __init__(self, base_features, primitive, name=None):
        """Create a transform feature over ``base_features``.

        Args:
            base_features: Input feature(s), normalized by
                ``_validate_base_features``.
            primitive: Primitive forwarded to the base-class initializer.
            name (str, optional): Forwarded to the base-class initializer.

        Raises:
            ValueError: If any base feature has more than one output feature.
        """
        # Any edits made to this method should also be made to the
        # new_class_init method in make_trans_primitive
        features = _validate_base_features(base_features)

        if any(feature.number_output_features > 1 for feature in features):
            raise ValueError("Cannot stack on whole multi-output feature.")

        # The feature is defined on the dataframe of the first base feature.
        first = features[0]
        owning_dataframe = first.entityset[first.dataframe_name]
        empty_path = RelationshipPath([])
        super(TransformFeature, self).__init__(
            dataframe=owning_dataframe,
            base_features=features,
            relationship_path=empty_path,
            primitive=primitive,
            name=name)
Пример #22
0
def forward_path(es, entity_ids):
    """
    Build a forward RelationshipPath that visits ``entity_ids`` in order.

    For each consecutive (child, parent) pair, the first forward relationship
    of the child whose parent entity id matches is used, so the helper
    assumes only one such path is possible.
    """
    relationships = []
    for child, parent in zip(entity_ids[:-1], entity_ids[1:]):
        matching = (
            candidate for candidate in es.get_forward_relationships(child)
            if candidate.parent_entity.id == parent
        )
        relationships.append(next(matching))

    forward_edges = [(True, relationship) for relationship in relationships]
    return RelationshipPath(forward_edges)
Пример #23
0
    def _handle_relationship_path(
        self, entityset, parent_dataframe_name, relationship_path
    ):
        """Validate an explicit relationship path, or infer one when none is given.

        Args:
            entityset: Entityset used to look up the parent dataframe, the
                child dataframe (``self.child_dataframe_name``) and paths.
            parent_dataframe_name: Name of the dataframe the path must start
                from.
            relationship_path: Sequence of ``(is_forward, relationship)``
                pairs, or a falsy value to have the path looked up.

        Returns:
            tuple: ``(relationship_path, path_is_unique)``. For an explicit
            path, ``path_is_unique`` is whatever
            ``entityset.has_unique_forward_path`` reports from the child
            dataframe to the parent dataframe; for an inferred path it is
            True.

        Raises:
            AssertionError: If an explicit path contains a forward edge, does
                not start at the parent dataframe, or does not end at the
                child dataframe.
            RuntimeError: If no path is given and either no backward path or
                more than one backward path is found.
        """
        parent_dataframe = entityset[parent_dataframe_name]
        child_dataframe = entityset[self.child_dataframe_name]

        if relationship_path:
            # An explicit path was supplied: check direction and both endpoints.
            assert all(
                not is_forward for is_forward, _r in relationship_path
            ), "All relationships in path must be backward"

            _is_forward, first_relationship = relationship_path[0]
            first_parent = first_relationship.parent_dataframe
            assert (
                parent_dataframe.ww.name == first_parent.ww.name
            ), "parent_dataframe must match first relationship in path."

            _is_forward, last_relationship = relationship_path[-1]
            assert (
                child_dataframe.ww.name == last_relationship.child_dataframe.ww.name
            ), "Base feature must be defined on the dataframe at the end of relationship_path"

            path_is_unique = entityset.has_unique_forward_path(
                child_dataframe.ww.name, parent_dataframe.ww.name
            )
        else:
            # No path supplied: look up backward paths from parent to child.
            paths = entityset.find_backward_paths(
                parent_dataframe.ww.name, child_dataframe.ww.name
            )
            # Pull at most two results; the second pull only detects ambiguity.
            first_path = next(paths, None)

            if not first_path:
                raise RuntimeError(
                    'No backward path from "%s" to "%s" found.'
                    % (parent_dataframe.ww.name, child_dataframe.ww.name)
                )
            # Check for another path.
            elif next(paths, None):
                message = (
                    "There are multiple possible paths to the base dataframe. "
                    "You must specify a relationship path."
                )
                raise RuntimeError(message)

            # Wrap the single path found, marking every edge as backward.
            relationship_path = RelationshipPath([(False, r) for r in first_path])
            path_is_unique = True

        return relationship_path, path_is_unique
Пример #24
0
 def __init__(
     self, base_feature, child_dataframe_name, relationship=None, name=None
 ):
     """Create a direct feature on the child dataframe from a parent feature.

     Args:
         base_feature: Feature defined on the parent dataframe; the first
             item returned by ``_validate_base_features`` is used.
         child_dataframe_name: Name of the dataframe this feature is
             defined on, looked up in the base feature's entityset.
         relationship (optional): Relationship to follow; resolved through
             ``self._handle_relationship``.
         name (str, optional): Forwarded to the base-class initializer.
     """
     validated = _validate_base_features(base_feature)
     parent_feature = validated[0]
     self.parent_dataframe_name = parent_feature.dataframe_name

     resolved_relationship = self._handle_relationship(
         parent_feature.entityset, child_dataframe_name, relationship
     )
     owning_dataframe = parent_feature.entityset[child_dataframe_name]

     # The path is a single forward edge over the resolved relationship.
     single_edge_path = RelationshipPath([(True, resolved_relationship)])
     super(DirectFeature, self).__init__(
         dataframe=owning_dataframe,
         base_features=[parent_feature],
         relationship_path=single_edge_path,
         primitive=PrimitiveBase,
         name=name,
     )
Пример #25
0
    def __init__(self, base_features, primitive, name=None):
        """Create a transform feature over one or more base features.

        Args:
            base_features: A single feature or an iterable of features; each
                is passed through ``_check_feature``.
            primitive: Primitive forwarded to the base-class initializer.
            name (str, optional): Forwarded to the base-class initializer.

        Raises:
            AssertionError: If an iterable of base features does not share
                exactly one entity, or if any base feature does not have
                exactly one output feature.
        """
        # Any edits made to this method should also be made to the
        # new_class_init method in make_trans_primitive
        if not hasattr(base_features, '__iter__'):
            features = [_check_feature(base_features)]
        else:
            features = [_check_feature(feature) for feature in base_features]
            distinct_entities = {feature.entity for feature in features}
            assert len(distinct_entities) == 1, \
                "all base features must share the same entity"

        # R TODO handle stacking on sub-features
        assert all(feature.number_output_features == 1 for feature in features)

        # The feature is defined on the entity of the first base feature.
        owning_entity = features[0].entity
        empty_path = RelationshipPath([])
        super(TransformFeature, self).__init__(
            entity=owning_entity,
            base_features=features,
            relationship_path=empty_path,
            primitive=primitive,
            name=name)
Пример #26
0
    def __init__(self, base_features, primitive, name=None):
        """Create a transform feature over one or more base features.

        Args:
            base_features: A single feature or an iterable of features; each
                is passed through ``_check_feature``.
            primitive: Primitive forwarded to the base-class initializer.
            name (str, optional): Forwarded to the base-class initializer.

        Raises:
            AssertionError: If an iterable of base features does not share
                exactly one entity.
            ValueError: If any base feature has more than one output feature.
        """
        # Any edits made to this method should also be made to the
        # new_class_init method in make_trans_primitive
        if not hasattr(base_features, '__iter__'):
            features = [_check_feature(base_features)]
        else:
            features = [_check_feature(feature) for feature in base_features]
            distinct_entities = {feature.entity for feature in features}
            assert len(distinct_entities) == 1, \
                "all base features must share the same entity"

        if any(feature.number_output_features > 1 for feature in features):
            raise ValueError("Cannot stack on whole multi-output feature.")

        # The feature is defined on the entity of the first base feature.
        owning_entity = features[0].entity
        empty_path = RelationshipPath([])
        super(TransformFeature, self).__init__(
            entity=owning_entity,
            base_features=features,
            relationship_path=empty_path,
            primitive=primitive,
            name=name)
Пример #27
0
    def get_backward_entities(self, entity_id, deep=False):
        """
        Iterate over entities reachable through backward relationships.

        Args:
            entity_id (str): Id of the entity to search from.
            deep (bool): If True, also recurse into each child entity.

        Yields:
            tuple: (descendant_id, path from entity_id to that descendant).
            Each direct child is yielded before the entities found beneath it.
        """
        for relationship in self.get_backward_relationships(entity_id):
            child_id = relationship.child_entity.id
            edge_to_child = RelationshipPath([(False, relationship)])
            yield child_id, edge_to_child

            if not deep:
                continue

            # Prefix every deeper path with the edge leading to this child.
            deeper = self.get_backward_entities(child_id, deep=True)
            for descendant_id, sub_path in deeper:
                yield descendant_id, edge_to_child + sub_path
Пример #28
0
    def build_features(self, return_types=None, verbose=False):
        """Automatically builds feature definitions for target
            dataframe using Deep Feature Synthesis algorithm

        Args:
            return_types (list[woodwork.ColumnSchema] or str, optional):
                List of ColumnSchemas defining the types of
                columns to return. If None, defaults to returning all
                numeric, categorical and boolean types. If given as
                the string 'all', use all available return types.

            verbose (bool, optional): If True, print progress.

        Returns:
            list[BaseFeature]: Returns a list of
                features for target dataframe. The list is sorted by
                feature depth (shallow first) before
                ``self._filter_features`` is applied, and is truncated to
                ``self.max_features`` entries when that value is positive.

        Raises:
            AssertionError: If ``return_types`` is not None, not the string
                'all', and not a list.
        """
        target_name = self.target_dataframe_name
        all_features = {}

        self.where_clauses = defaultdict(set)

        if return_types is None:
            return_types = [
                ColumnSchema(semantic_tags=["numeric"]),
                ColumnSchema(semantic_tags=["category"]),
                ColumnSchema(logical_type=Boolean),
                ColumnSchema(logical_type=BooleanNullable),
            ]
        elif return_types != "all":
            msg = "return_types must be a list, or 'all'"
            assert isinstance(return_types, list), msg

        # _run_dfs fills all_features, keyed by dataframe name.
        self._run_dfs(
            self.es[target_name],
            RelationshipPath([]),
            all_features,
            max_depth=self.max_depth,
        )

        new_features = list(all_features[target_name].values())

        # filter out features with undesired return types
        if return_types != "all":
            new_features = [
                f for f in new_features if any(
                    is_valid_input(f.column_schema, schema)
                    for schema in return_types)
            ]

        # remove identity features of the ID field of the target dataframe;
        # the index name is looked up once rather than once per feature
        target_index = self.es[target_name].ww.index
        new_features = [
            f for f in new_features
            if not (isinstance(f, IdentityFeature)
                    and f.dataframe_name == target_name
                    and f.column_name == target_index)
        ]

        new_features.sort(key=lambda f: f.get_depth())

        new_features = self._filter_features(new_features)

        if self.max_features > 0:
            new_features = new_features[:self.max_features]

        if verbose:
            print("Built {} features".format(len(new_features)))
        return new_features
    def build_features(self, return_variable_types=None, verbose=False):
        """Automatically builds feature definitions for target
            entity using Deep Feature Synthesis algorithm

        Args:
            return_variable_types (list[Variable] or str, optional): Types of
                variables to return. If None, default to
                Numeric, Discrete, and Boolean. If given as
                the string 'all', use all available variable types.

            verbose (bool, optional): If True, print progress.

        Returns:
            list[BaseFeature]: Returns a list of
                features for target entity. The list is sorted by
                feature depth (shallow first) before
                ``self._filter_features`` is applied, and is truncated to
                ``self.max_features`` entries when that value is positive.

        Raises:
            AssertionError: If ``return_variable_types`` is not None, not the
                string 'all', and not a list.
        """
        target_id = self.target_entity_id
        all_features = {}

        self.where_clauses = defaultdict(set)

        if return_variable_types is None:
            return_variable_types = [Numeric, Discrete, Boolean]
        elif return_variable_types != 'all':
            msg = "return_variable_types must be a list, or 'all'"
            assert isinstance(return_variable_types, list), msg

        # _run_dfs fills all_features, keyed by entity id.
        self._run_dfs(self.es[target_id],
                      RelationshipPath([]),
                      all_features,
                      max_depth=self.max_depth)

        new_features = list(all_features[target_id].values())

        # filter out features with undesired return types
        if return_variable_types != 'all':
            new_features = [
                f for f in new_features if any(
                    issubclass(f.variable_type, vt)
                    for vt in return_variable_types)
            ]

        # remove identity features of the ID field of the target entity;
        # the index id is looked up once rather than once per feature
        target_index = self.es[target_id].index
        new_features = [
            f for f in new_features
            if not (isinstance(f, IdentityFeature)
                    and f.entity.id == target_id
                    and f.variable.id == target_index)
        ]

        new_features.sort(key=lambda f: f.get_depth())

        new_features = self._filter_features(new_features)

        if self.max_features > 0:
            new_features = new_features[:self.max_features]

        if verbose:
            print("Built {} features".format(len(new_features)))
        return new_features