def test_base_features_in_list(es):
    # NOTE(review): this test name is redefined later in this file, so pytest
    # only collects the last definition — consider renaming the duplicates.
    value = ft.IdentityFeature(es['log']['value'])
    max_feature = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature, value])

    # The serialized payload carries version metadata, the entityset, the
    # ordered feature-name list, and a name -> definition mapping.
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (max_feature, value)
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())
# Example #2
def test_base_features_in_list(es):
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    # Hand-built serialized payload mirroring the serializer's format.
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (max_feat, value)
        }
    }

    # Deserializing should reconstruct the original feature objects.
    assert FeaturesDeserializer(dictionary).to_list() == [max_feat, value]
def test_feature_use_previous_pd_timedelta(es):
    value = ft.IdentityFeature(es['log']['id'])
    td = pd.Timedelta(3, "M")
    count_feature = ft.AggregationFeature(value, es['customers'], ft.primitives.Count, use_previous=td)
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    expected = [count_feature, value]
    assert expected == deserializer.to_list()
def test_base_features_in_list(es):
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max)
    # Serialized payload: version metadata, the entityset, the ordered list
    # of feature names, and a name -> definition mapping.
    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name(), value.unique_name()],
        "feature_definitions": {
            feat.unique_name(): feat.to_dictionary()
            for feat in (max_feat, value)
        },
    }

    # Deserializing should rebuild the exact same feature objects.
    assert FeaturesDeserializer(dictionary).to_list() == [max_feat, value]
def test_base_features_in_list(es):
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feature = ft.AggregationFeature(value, "sessions", ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature, value])

    # Every feature in the list must also appear in the definitions map.
    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feature.unique_name(), value.unique_name()],
        "feature_definitions": {
            feat.unique_name(): feat.to_dictionary()
            for feat in (max_feature, value)
        },
    }

    _compare_feature_dicts(expected, serializer.to_dict())
# Example #6
def test_multi_output_features(es):
    """Serializing multi-output features stores each primitive once by id."""
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    threecommon = NMostCommon()
    num_unique = NumUnique()
    tc = ft.Feature(product_id, parent_dataframe_name="sessions", primitive=threecommon)

    # For each of the three NMostCommon output slices, add an aggregation on
    # the slice followed by the slice itself.
    features = [tc, product_id]
    for slice_idx in range(3):
        slice_feat = tc[slice_idx]
        agg = ft.Feature(
            slice_feat,
            parent_dataframe_name="customers",
            primitive=num_unique,
        )
        features.extend([agg, slice_feat])

    serializer = FeaturesSerializer(features)

    names = [feat.unique_name() for feat in features]
    definitions = {feat.unique_name(): feat.to_dictionary() for feat in features}

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(threecommon),
            "1": serialize_primitive(num_unique),
        },
    }

    # Each feature definition references its shared primitive by id:
    # index 0 is the NMostCommon feature, indices 2/4/6 use NumUnique.
    definitions[names[0]]["arguments"]["primitive"] = "0"
    for idx in (2, 4, 6):
        definitions[names[idx]]["arguments"]["primitive"] = "1"

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)
def test_multioutput_feature(es):
    """Deserializing multi-output features reproduces every feature name.

    Builds the serialized payload by hand (with primitives stored once and
    referenced by id) and compares the deserialized list name-by-name.
    """
    value = ft.IdentityFeature(es["log"].ww["product_id"])
    threecommon = NMostCommon()
    num_unique = NumUnique()
    tc = ft.Feature(value,
                    parent_dataframe_name="sessions",
                    primitive=threecommon)

    # For each NMostCommon output slice, add an aggregation over the slice
    # followed by the slice itself.
    features = [tc, value]
    for i in range(3):
        features.append(
            ft.Feature(
                tc[i],
                parent_dataframe_name="customers",
                primitive=num_unique,
            ))
        features.append(tc[i])

    flist = [feat.unique_name() for feat in features]
    fdict = {feat.unique_name(): feat.to_dictionary() for feat in features}

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": flist,
        "feature_definitions": fdict,
        # Primitives are serialized once and referenced by id from each
        # feature definition.
        "primitive_definitions": {
            "0": serialize_primitive(threecommon),
            "1": serialize_primitive(num_unique),
        },
    }

    # Index 0 is the NMostCommon feature; indices 2/4/6 use NumUnique.
    fdict[flist[0]]["arguments"]["primitive"] = "0"
    for idx in (2, 4, 6):
        fdict[flist[idx]]["arguments"]["primitive"] = "1"

    deserialized = FeaturesDeserializer(dictionary).to_list()

    # Compare pairwise (idiomatic zip instead of range(len(...))) and also
    # verify the lengths match — the original loop would silently pass if the
    # deserializer returned extra features.
    assert len(deserialized) == len(features)
    for original, restored in zip(features, deserialized):
        assert original.unique_name() == restored.unique_name()
def test_diamond_entityset(diamond_es):
    es = diamond_es

    amount = ft.IdentityFeature(es['transactions']['amount'])
    # Sum transaction amounts up to regions along two distinct backward paths
    # of the diamond-shaped entityset.
    customers_path = backward_path(es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(amount, es['regions'],
                                              primitive=ft.primitives.Sum,
                                              relationship_path=customers_path)
    stores_path = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(amount, es['regions'],
                                           primitive=ft.primitives.Sum,
                                           relationship_path=stores_path)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([through_customers, through_stores]),
    )
    df = calculator.run(np.array([0, 1, 2]))
    # Each path yields its own sums for the three region instances.
    assert (df['SUM(stores.transactions.amount)'] == [94, 261, 128]).all()
    assert (df['SUM(customers.transactions.amount)'] == [72, 411, 0]).all()
# Example #9
def test_to_dictionary_where(es):
    """An aggregation with a where clause serializes the clause by name."""
    where_condition = ft.IdentityFeature(es['log'].ww['value']) == 2
    feature = ft.Feature(es['log'].ww['value'], parent_dataframe_name='sessions',
                         where=where_condition, primitive=Sum)
    actual = feature.to_dictionary()

    # Build the expected dictionary from named pieces for readability.
    relationship = {'parent_dataframe_name': 'sessions',
                    'child_dataframe_name': 'log',
                    'parent_column_name': 'id',
                    'child_column_name': 'session_id'}
    primitive = {'type': 'Sum',
                 'module': 'featuretools.primitives.standard.aggregation_primitives',
                 'arguments': {}}
    expected = {
        'type': 'AggregationFeature',
        'dependencies': ['log: value', 'log: value = 2'],
        'arguments': {'name': None,
                      'base_features': ['log: value'],
                      'relationship_path': [relationship],
                      'primitive': primitive,
                      'where': 'log: value = 2',
                      'use_previous': None}
    }

    assert actual == expected
# Example #10
def test_base_features_not_in_list(es):
    # Only max_feature is handed to the serializer; its dependencies
    # (value_x2 and value) must still be serialized in feature_definitions
    # even though they are absent from feature_list.
    value = ft.IdentityFeature(es['log'].ww['value'])
    value_x2 = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    max_feature = ft.AggregationFeature(value_x2, 'sessions',
                                        ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature])

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (max_feature, value_x2, value)
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())
def test_unknown_primitive_module(es):
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    # Corrupt the serialized primitive's module so lookup has to fail.
    max_dict = max_feat.to_dictionary()
    max_dict['arguments']['primitive']['module'] = 'fake.module'
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): max_dict,
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    error_text = 'Primitive "Max" in module "fake.module" not found'
    with pytest.raises(RuntimeError) as excinfo:
        deserializer.to_list()

    assert str(excinfo.value) == error_text
def test_feature_use_previous_pd_timedelta(es):
    value = ft.IdentityFeature(es["log"].ww["id"])
    window = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(value,
                                          "customers",
                                          ft.primitives.Count,
                                          use_previous=window)
    # Hand-built serialized payload for the feature pair.
    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [count_feature.unique_name(), value.unique_name()],
        "feature_definitions": {
            feat.unique_name(): feat.to_dictionary()
            for feat in (count_feature, value)
        },
    }

    # The Timedelta-based use_previous must survive the round trip.
    assert FeaturesDeserializer(dictionary).to_list() == [count_feature, value]
# Example #13
def test_feature_use_previous_pd_timedelta(es):
    value = ft.IdentityFeature(es['log'].ww['id'])
    window = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(value,
                                          'customers',
                                          ft.primitives.Count,
                                          use_previous=window)
    serializer = FeaturesSerializer([count_feature, value])

    # Expected payload including the Timedelta-backed use_previous feature.
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            feat.unique_name(): feat.to_dictionary()
            for feat in (count_feature, value)
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())
def test_unknown_primitive_module(es):
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max)
    # Point the serialized primitive at a module that does not exist.
    max_dict = max_feat.to_dictionary()
    max_dict["arguments"]["primitive"]["module"] = "fake.module"
    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name(), value.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): max_dict,
            value.unique_name(): value.to_dictionary(),
        },
    }
    deserializer = FeaturesDeserializer(dictionary)

    error_text = 'Primitive "Max" in module "fake.module" not found'
    with pytest.raises(RuntimeError) as excinfo:
        deserializer.to_list()

    assert str(excinfo.value) == error_text
# Example #15
def test_diamond_entityset(diamond_es):
    es = diamond_es

    amount = ft.IdentityFeature(es["transactions"].ww["amount"])
    # Aggregate transaction amounts up to regions via two different backward
    # paths through the diamond-shaped entityset.
    customers_path = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(amount,
                                              "regions",
                                              primitive=ft.primitives.Sum,
                                              relationship_path=customers_path)
    stores_path = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(amount,
                                           "regions",
                                           primitive=ft.primitives.Sum,
                                           relationship_path=stores_path)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([through_customers, through_stores]),
    )
    df = calculator.run(np.array([0, 1, 2]))
    df = to_pandas(df, index="id", sort_index=True)

    # Each path produces its own per-region sums.
    assert (df["SUM(stores.transactions.amount)"] == [94, 261, 128]).all()
    assert (df["SUM(customers.transactions.amount)"] == [72, 411, 0]).all()
# Example #16
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """Check the feature trie for features that never need the full dataframe.

    Based on the assertions below, each trie node value appears to be a tuple
    of (needs_full_dataframe flag, full-dataframe feature names, regular
    feature names) — confirm against the FeatureSet implementation.
    """
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    # Mean of transaction amounts rolled up to regions through customers.
    path_through_customers = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )
    # Same aggregation, but routed through stores instead of customers.
    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )
    # Per-customer mean used as the base of a stacked aggregation below.
    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions,
    )

    # Stacked feature: negate the per-customer mean, then average per region.
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = ft.AggregationFeature(
        negation,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]

    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    # Root node holds all four requested features; nothing in this test uses
    # a primitive with uses_full_dataframe, so the flag is False everywhere.
    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    # The customers node carries both the stacked negation and its base mean.
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )
    # The intermediate stores node contributes no features of its own.
    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    assert trie.get_node(path_through_customers).value == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert trie.get_node(path_through_stores).value == (
        False,
        set(),
        {amount.unique_name()},
    )
def test_relationship_path(es):
    # An identity feature traverses no relationships, so its path is empty.
    identity = ft.IdentityFeature(es['log']['value'])
    assert identity.relationship_path == []
def test_relationship_path(es):
    # An identity feature traverses no relationships, so its path is empty.
    identity = ft.IdentityFeature(es["log"].ww["value"])
    assert len(identity.relationship_path) == 0
def test_cum_sum_numpy_group_on_nan(pd_es):
    # Grouped cumulative sum where some group keys are NaN: rows whose
    # product_id is NaN should produce NaN results.
    class CumSumNumpy(TransformPrimitive):
        """Returns the cumulative sum after grouping"""

        name = "cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        # The primitive needs the whole column, not per-row values.
        uses_full_dataframe = True

        def get_function(self):
            def cum_sum(values):
                # .values returns the underlying numpy array.
                return values.cumsum().values

            return cum_sum

    log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"])
    # Overwrite the grouping column in place; four rows get a NaN group key.
    pd_es["log"]["product_id"] = (
        ["coke zero"] * 3
        + ["car"] * 2
        + ["toothpaste"] * 3
        + ["brown bag"] * 2
        + ["shoes"]
        + [np.nan] * 4
        + ["coke_zero"] * 2
    )
    # NOTE(review): chained indexed assignment — may warn or not persist under
    # pandas copy-on-write; verify it actually mutates the entityset dataframe.
    pd_es["log"]["value"][16] = 10
    cum_sum = ft.Feature(
        log_value_feat,
        groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]),
        primitive=CumSumNumpy,
    )
    assert cum_sum.get_name() == "CUM_SUM(value) by product_id"
    features = [cum_sum]
    df = ft.calculate_feature_matrix(
        entityset=pd_es, features=features, instance_ids=range(17)
    )
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 17
    # Expected running sums per product_id group; NaN group keys yield NaN.
    cum_sum_values = [
        0,
        5,
        15,
        15,
        35,
        0,
        1,
        3,
        3,
        3,
        0,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        10,
    ]

    # Compare elementwise; NaN != NaN, so NaN entries need an isnan check.
    assert len(cvalues) == len(cum_sum_values)
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert np.isnan(cvalues[i])
        else:
            assert v == cvalues[i]
# Example #20
def test_serialized_renamed_features(es):
    """Renamed features of every type keep their new name across a
    serialize/deserialize round trip."""

    def serialize_name_unchanged(original):
        """Rename *original*, round-trip it, and check names both times."""
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        # Multi-output features get an indexed name per output slice.
        new_names = (
            [new_name]
            if len(original_names) == 1
            else [new_name + "[{}]".format(i) for i in range(len(original_names))]
        )
        check_names(renamed, new_name, new_names)

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        # The deserialized feature must carry the renamed name(s), too.
        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    # One representative feature per feature type, each with its default
    # name asserted before renaming.
    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, "customers", primitive)
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    # Multi-output feature and a single-output slice of it.
    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    feature_type_list = [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)
def test_relationship_path(es):
    # NOTE(review): duplicate of an earlier test of the same name; only the
    # last definition is collected by pytest.
    identity = ft.IdentityFeature(es['log'].ww['value'])
    assert len(identity.relationship_path) == 0