def test_copy(games_es):
    """Check that ``AggregationFeature.copy()`` keeps the feature's attributes.

    A Mean aggregation of ``games.home_team_score`` onto ``teams`` is built
    along the relationship whose child variable is ``home_team_id``; the copy
    must compare equal to the original on entity, base features, relationship
    path and primitive.
    """
    # Select the relationship keyed on the home team id column.
    home_rel = next(rel for rel in games_es.relationships
                    if rel.child_variable.id == 'home_team_id')
    agg_path = RelationshipPath([(False, home_rel)])
    original = ft.AggregationFeature(games_es['games']['home_team_score'],
                                     games_es['teams'],
                                     primitive=ft.primitives.Mean,
                                     relationship_path=agg_path)
    duplicate = original.copy()

    for attr in ('entity', 'base_features', 'relationship_path', 'primitive'):
        assert getattr(duplicate, attr) == getattr(original, attr)
def test_diamond_entityset(diamond_es):
    """Sum transaction amounts per region along two different backward paths.

    One aggregation goes regions -> customers -> transactions, the other
    regions -> stores -> transactions. Both are calculated together for
    instance ids 0, 1 and 2 and compared with fixed expected sums.
    """
    amount = ft.IdentityFeature(diamond_es['transactions']['amount'])

    def sum_through(middle):
        # Build the SUM aggregation that reaches transactions via `middle`.
        route = backward_path(diamond_es, ['regions', middle, 'transactions'])
        return ft.AggregationFeature(amount,
                                     diamond_es['regions'],
                                     relationship_path=route,
                                     primitive=ft.primitives.Sum)

    through_customers = sum_through('customers')
    through_stores = sum_through('stores')

    calculator = FeatureSetCalculator(
        diamond_es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([through_customers, through_stores]))
    result = calculator.run(np.array([0, 1, 2]))

    expected_sums = {
        'SUM(stores.transactions.amount)': [94, 261, 128],
        'SUM(customers.transactions.amount)': [72, 411, 0],
    }
    for column, totals in expected_sums.items():
        assert (result[column] == totals).all()
def test_feature_use_previous_pd_dateoffset(es):
    """Serialize Count features whose ``use_previous`` is a ``pd.DateOffset``.

    Two offsets are exercised in turn: a single-unit one (3 months) and a
    multi-unit one (3 months, 2 days, 30 minutes). For each, the serializer
    output is compared with a dictionary assembled from the features' own
    ``to_dictionary()`` results.
    """
    offsets = (
        pd.DateOffset(months=3),
        pd.DateOffset(months=3, days=2, minutes=30),
    )

    for offset in offsets:
        value = ft.IdentityFeature(es['log']['id'])
        count_feature = ft.AggregationFeature(value,
                                              es['customers'],
                                              primitive=ft.primitives.Count,
                                              use_previous=offset)
        serializer = FeaturesSerializer([count_feature, value])

        expected = {
            'ft_version': ft.__version__,
            'schema_version': SCHEMA_VERSION,
            'entityset': es.to_dictionary(),
            'feature_list': [count_feature.unique_name(), value.unique_name()],
            'feature_definitions': {
                count_feature.unique_name(): count_feature.to_dictionary(),
                value.unique_name(): value.to_dictionary(),
            }
        }

        _compare_feature_dicts(expected, serializer.to_dict())
# Example #4
# 0
def test_name_with_multiple_possible_paths(diamond_es):
    """Check the generated name of an aggregation given an explicit path.

    The Mean of ``transactions.amount`` is aggregated onto ``regions`` through
    ``customers``; the resulting name must spell out that route.
    """
    customer_to_region = diamond_es.get_forward_relationships('customers')[0]
    transaction_to_customer = next(
        rel for rel in diamond_es.get_forward_relationships('transactions')
        if rel.parent_entity.id == 'customers')
    explicit_path = [customer_to_region, transaction_to_customer]

    # Does not raise if path specified.
    mean_amount = ft.AggregationFeature(
        diamond_es['transactions']['amount'],
        diamond_es['regions'],
        primitive=ft.primitives.Mean,
        relationship_path=explicit_path)

    assert mean_amount.get_name() == "MEAN(customers.transactions.amount)"
# Example #5
# 0
def test_copy(games_es):
    """Check that ``AggregationFeature.copy()`` keeps the feature's attributes.

    A Mean aggregation of ``games.home_team_score`` onto the ``teams``
    dataframe is built along the relationship whose child column is
    ``home_team_id``; the copy must compare equal to the original on dataframe
    name, base features, relationship path and primitive.
    """
    home_rel = next(rel for rel in games_es.relationships
                    if rel._child_column_name == 'home_team_id')
    agg_path = RelationshipPath([(False, home_rel)])
    score = ft.IdentityFeature(games_es['games'].ww['home_team_score'])
    original = ft.AggregationFeature(score,
                                     'teams',
                                     primitive=ft.primitives.Mean,
                                     relationship_path=agg_path)
    duplicate = original.copy()

    compared = ('dataframe_name', 'base_features', 'relationship_path',
                'primitive')
    for attr in compared:
        assert getattr(duplicate, attr) == getattr(original, attr)
# Example #6
# 0
def test_diamond_entityset(diamond_es):
    """Sum transaction amounts per region along two different backward paths.

    One aggregation goes regions -> customers -> transactions, the other
    regions -> stores -> transactions. Both are calculated together for
    instance ids 0, 1 and 2; the result is passed through ``to_pandas``
    (indexed on ``id`` and sorted) and compared with fixed expected sums.
    """
    amount = ft.IdentityFeature(diamond_es["transactions"].ww["amount"])

    def sum_through(middle):
        # Build the SUM aggregation that reaches transactions via `middle`.
        route = backward_path(diamond_es, ["regions", middle, "transactions"])
        return ft.AggregationFeature(amount,
                                     "regions",
                                     relationship_path=route,
                                     primitive=ft.primitives.Sum)

    through_customers = sum_through("customers")
    through_stores = sum_through("stores")

    calculator = FeatureSetCalculator(
        diamond_es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([through_customers, through_stores]))
    result = to_pandas(calculator.run(np.array([0, 1, 2])),
                       index="id",
                       sort_index=True)

    expected_sums = {
        "SUM(stores.transactions.amount)": [94, 261, 128],
        "SUM(customers.transactions.amount)": [72, 411, 0],
    }
    for column, totals in expected_sums.items():
        assert (result[column] == totals).all()
def test_base_features_in_list(es):
    """Deserialize a feature list that also contains the base feature.

    The serialized dictionary lists both a Max aggregation and the identity
    feature it is built on; ``FeaturesDeserializer.to_list()`` must return
    features equal to the originals, in the same order.
    """
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value,
                                     es['sessions'],
                                     primitive=ft.primitives.Max)
    max_name = max_feat.unique_name()
    value_name = value.unique_name()

    serialized = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_name, value_name],
        'feature_definitions': {
            max_name: max_feat.to_dictionary(),
            value_name: value.to_dictionary(),
        }
    }

    restored = FeaturesDeserializer(serialized).to_list()
    assert [max_feat, value] == restored
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its new name through serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output and output slice) is first checked for its
    default name, then renamed, serialized and deserialized; ``check_names``
    is applied to the feature both before and after the round trip.
    """
    def serialize_name_unchanged(original):
        new_name = 'MyFeature'
        # Count the outputs before renaming.
        output_count = len(original.get_feature_names())
        renamed = original.rename(new_name)
        if output_count == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [new_name + '[{}]'.format(index)
                         for index in range(output_count)]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])

    agg_original = ft.AggregationFeature(value, es['customers'],
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(es['log']['product_id'],
                                      parent_entity=es['customers'],
                                      primitive=NMostCommon(n=2))
    assert multioutput_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    all_kinds = (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)
def test_base_features_in_list(es):
    """Serialize a feature list that also contains the base feature.

    A Max aggregation and the identity feature it is built on are both passed
    to ``FeaturesSerializer``; its output is compared with a dictionary
    assembled from the features' own ``to_dictionary()`` results.
    """
    value = ft.IdentityFeature(es['log']['value'])
    max_feature = ft.AggregationFeature(value,
                                        es['sessions'],
                                        primitive=ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature, value])

    max_name = max_feature.unique_name()
    value_name = value.unique_name()
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_name, value_name],
        'feature_definitions': {
            max_name: max_feature.to_dictionary(),
            value_name: value.to_dictionary(),
        }
    }

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)
def test_feature_use_previous_pd_timedelta(es):
    """Deserialize a Count feature whose ``use_previous`` is a ``pd.Timedelta``.

    The serialized dictionary is assembled from the features' own
    ``to_dictionary()`` results; ``FeaturesDeserializer.to_list()`` must
    return features equal to the originals, in the same order.
    """
    value = ft.IdentityFeature(es['log']['id'])
    # A month ("M") is not a fixed-length duration: pandas deprecated it as a
    # Timedelta unit and newer releases reject it, so the window is expressed
    # in weeks instead.
    td = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(value, es['customers'], ft.primitives.Count, use_previous=td)
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    expected = [count_feature, value]
    assert expected == deserializer.to_list()
def test_base_features_in_list(es):
    """Deserialize a feature list that also contains the base feature.

    The serialized dictionary lists both a Max aggregation onto ``sessions``
    and the identity feature it is built on;
    ``FeaturesDeserializer.to_list()`` must return features equal to the
    originals, in the same order.
    """
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value,
                                     "sessions",
                                     primitive=ft.primitives.Max)
    max_name = max_feat.unique_name()
    value_name = value.unique_name()

    serialized = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_name, value_name],
        "feature_definitions": {
            max_name: max_feat.to_dictionary(),
            value_name: value.to_dictionary(),
        },
    }

    restored = FeaturesDeserializer(serialized).to_list()
    assert [max_feat, value] == restored
def test_base_features_in_list(es):
    """Serialize a feature list that also contains the base feature.

    A Max aggregation onto ``sessions`` and the identity feature it is built
    on are both passed to ``FeaturesSerializer``; its output is compared with
    a dictionary assembled from the features' own ``to_dictionary()`` results.
    """
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feature = ft.AggregationFeature(value,
                                        "sessions",
                                        primitive=ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature, value])

    max_name = max_feature.unique_name()
    value_name = value.unique_name()
    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_name, value_name],
        "feature_definitions": {
            max_name: max_feature.to_dictionary(),
            value_name: value.to_dictionary(),
        },
    }

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)
# Example #13
# 0
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its new name through serialization.

    One feature of each kind (identity, aggregation, direct, transform and
    groupby transform) is first checked for its default name, then renamed to
    ``MyFeature``, serialized and deserialized; the name must be ``MyFeature``
    both before and after the round trip.
    """
    def serialize_name_unchanged(original):
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'

        serialized = FeaturesSerializer([renamed]).to_dict()
        restored = FeaturesDeserializer(serialized).to_list()
        deserialized = restored[0]
        assert deserialized.get_name() == 'MyFeature'

    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])

    agg_original = ft.AggregationFeature(value, es['customers'],
                                         ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    all_kinds = (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)
def test_unknown_primitive_module(es):
    """Deserializing a primitive from an unknown module raises ``RuntimeError``.

    The serialized Max feature has its primitive's ``module`` entry
    overwritten with ``fake.module``; ``to_list()`` must raise with a message
    naming both the primitive and the module.
    """
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value,
                                     es['sessions'],
                                     primitive=ft.primitives.Max)

    # Corrupt the serialized primitive so it points at a missing module.
    tampered = max_feat.to_dictionary()
    tampered['arguments']['primitive']['module'] = 'fake.module'

    serialized = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): tampered,
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(serialized)

    with pytest.raises(RuntimeError) as raised:
        deserializer.to_list()

    expected_message = 'Primitive "Max" in module "fake.module" not found'
    assert expected_message == str(raised.value)
# Example #15
# 0
def test_base_features_not_in_list(es):
    """Serialize a feature whose base features are not in the feature list.

    Only the Max aggregation is handed to ``FeaturesSerializer``. The expected
    dictionary lists just that feature in ``feature_list`` but carries
    definitions for it, for the doubled value it aggregates, and for the
    underlying identity feature.
    """
    value = ft.IdentityFeature(es['log'].ww['value'])
    doubler = ft.primitives.MultiplyNumericScalar(value=2)
    value_x2 = ft.TransformFeature(value, doubler)
    max_feature = ft.AggregationFeature(value_x2,
                                        'sessions',
                                        primitive=ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature])

    # Definitions cover the whole dependency chain, not only listed features.
    definitions = {
        feature.unique_name(): feature.to_dictionary()
        for feature in (max_feature, value_x2, value)
    }
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': definitions,
    }

    _compare_feature_dicts(expected, serializer.to_dict())
def test_feature_use_previous_pd_timedelta(es):
    """Deserialize a Count feature whose ``use_previous`` is a ``pd.Timedelta``.

    The window is 12 weeks. The serialized dictionary is assembled from the
    features' own ``to_dictionary()`` results; ``to_list()`` must return
    features equal to the originals, in the same order.
    """
    value = ft.IdentityFeature(es["log"].ww["id"])
    window = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(
        value,
        "customers",
        primitive=ft.primitives.Count,
        use_previous=window,
    )
    count_name = count_feature.unique_name()
    value_name = value.unique_name()

    serialized = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [count_name, value_name],
        "feature_definitions": {
            count_name: count_feature.to_dictionary(),
            value_name: value.to_dictionary(),
        },
    }

    restored = FeaturesDeserializer(serialized).to_list()
    assert [count_feature, value] == restored
# Example #17
# 0
def test_where_feature_dependency(es):
    """Serialize an aggregation that has a ``where`` feature.

    Only the Max aggregation is handed to ``FeaturesSerializer``. The expected
    dictionary lists just that feature in ``feature_list`` but carries
    definitions for it, for its base feature, and for the ``purchased``
    feature used as the ``where`` clause.
    """
    value = ft.IdentityFeature(es['log'].ww['value'])
    is_purchased = ft.IdentityFeature(es['log'].ww['purchased'])
    max_feature = ft.AggregationFeature(value,
                                        'sessions',
                                        primitive=ft.primitives.Max,
                                        where=is_purchased)
    serializer = FeaturesSerializer([max_feature])

    # The where feature is expected among the definitions as a dependency.
    definitions = {
        feature.unique_name(): feature.to_dictionary()
        for feature in (max_feature, value, is_purchased)
    }
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': definitions,
    }

    _compare_feature_dicts(expected, serializer.to_dict())
def test_unknown_primitive_module(es):
    """Deserializing a primitive from an unknown module raises ``RuntimeError``.

    The serialized Max feature has its primitive's ``module`` entry
    overwritten with ``fake.module``; ``to_list()`` must raise with a message
    naming both the primitive and the module.
    """
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value,
                                     "sessions",
                                     primitive=ft.primitives.Max)

    # Corrupt the serialized primitive so it points at a missing module.
    tampered = max_feat.to_dictionary()
    tampered["arguments"]["primitive"]["module"] = "fake.module"

    serialized = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name(), value.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): tampered,
            value.unique_name(): value.to_dictionary(),
        },
    }
    deserializer = FeaturesDeserializer(serialized)

    with pytest.raises(RuntimeError) as raised:
        deserializer.to_list()

    expected_message = 'Primitive "Max" in module "fake.module" not found'
    assert expected_message == str(raised.value)
# Example #19
# 0
def test_feature_use_previous_pd_timedelta(es):
    """Serialize a Count feature whose ``use_previous`` is a ``pd.Timedelta``.

    The window is 12 weeks. The serializer output is compared with a
    dictionary assembled from the features' own ``to_dictionary()`` results.
    """
    value = ft.IdentityFeature(es['log'].ww['id'])
    window = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(
        value,
        'customers',
        primitive=ft.primitives.Count,
        use_previous=window,
    )
    serializer = FeaturesSerializer([count_feature, value])

    count_name = count_feature.unique_name()
    value_name = value.unique_name()
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_name, value_name],
        'feature_definitions': {
            count_name: count_feature.to_dictionary(),
            value_name: value.to_dictionary(),
        }
    }

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)
# Example #20
# 0
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """Check the node values of ``FeatureSet.feature_trie`` for a diamond schema.

    Four features are defined on ``regions``: a direct feature of the country
    name, two Mean aggregations of transaction amounts (one via customers, one
    via stores), and a Mean over customers of the negated per-customer mean.
    Every inspected trie node is expected to hold ``False``, an empty set, and
    the set of unique feature names listed for that path.
    """
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    def mean_over(base, target, route):
        # Mean aggregation of `base` onto dataframe `target` along `route`.
        return ft.AggregationFeature(
            base,
            target,
            primitive=ft.primitives.Mean,
            relationship_path=route,
        )

    path_through_customers = backward_path(
        es, ["regions", "customers", "transactions"])
    through_customers = mean_over(amount, "regions", path_through_customers)

    path_through_stores = backward_path(
        es, ["regions", "stores", "transactions"])
    through_stores = mean_over(amount, "regions", path_through_stores)

    customers_to_transactions = backward_path(
        es, ["customers", "transactions"])
    customers_mean = mean_over(amount, "customers", customers_to_transactions)

    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = mean_over(negation, "regions", regions_to_customers)

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    trie = FeatureSet(features).feature_trie

    # Root node: all four target features.
    root_names = {feature.unique_name() for feature in features}
    assert trie.value == (False, set(), root_names)

    # Remaining nodes, checked in order as (path, expected feature names).
    regions_to_stores = backward_path(es, ["regions", "stores"])
    expected_by_path = [
        (direct_name.relationship_path, {country_name.unique_name()}),
        (regions_to_customers,
         {negation.unique_name(), customers_mean.unique_name()}),
        (regions_to_stores, set()),
        (path_through_customers, {amount.unique_name()}),
        (path_through_stores, {amount.unique_name()}),
    ]
    for route, names in expected_by_path:
        assert trie.get_node(route).value == (False, set(), names)
# Example #21
# 0
def test_serialized_renamed_features(es):
    """Check that a renamed feature keeps its new name through serialization.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output and output slice) is first checked for its
    default name, then renamed, serialized and deserialized; ``check_names``
    is applied to the feature both before and after the round trip.
    """
    def serialize_name_unchanged(original):
        new_name = "MyFeature"
        # Count the outputs before renaming.
        output_count = len(original.get_feature_names())
        renamed = original.rename(new_name)
        if output_count == 1:
            new_names = [new_name]
        else:
            # Multi-output features get one indexed name per output.
            new_names = [
                new_name + "[{}]".format(index) for index in range(output_count)
            ]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        deserialized = FeaturesDeserializer(serialized).to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    agg_original = ft.AggregationFeature(value, "customers", ft.primitives.Max())
    assert agg_original.get_name() == "MAX(log.value)"

    age = ft.IdentityFeature(es["customers"].ww["age"])
    direct_original = ft.DirectFeature(age, "sessions")
    assert direct_original.get_name() == "customers.age"

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2)
    )
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    all_kinds = (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    )
    for feature in all_kinds:
        serialize_name_unchanged(feature)