Exemplos de NMostCommon em Python, exemplos de featuretools.primitives.NMostCommon em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: test_encode_features.py Projeto: Anyz01/FeatureTools

def test_encode_features_topn(entityset):
    """Encode a DFS result and check the 3-most-common feature is still present.

    The feature is looked up by hash among the encoded definitions, and each
    of its output names must appear as a column of the encoded matrix.
    """
    expected_feature = Feature(
        entityset['log']['product_id'],
        parent_entity=entityset['customers'],
        primitive=NMostCommon(n=3),
    )
    matrix, definitions = dfs(
        entityset=entityset,
        instance_ids=[0, 1, 2],
        target_entity="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded_matrix, encoded_definitions = encode_features(
        matrix,
        definitions,
        include_unknown=True,
    )

    expected_hash = expected_feature.hash()
    encoded_hashes = [definition.hash() for definition in encoded_definitions]
    assert expected_hash in encoded_hashes

    for column_name in expected_feature.get_feature_names():
        assert column_name in encoded_matrix.columns

Exemplo n.º 2

0

Exibir arquivo

def test_topn(entityset, backend):
    """Compute a 2-most-common feature for three instances and compare value sets."""
    feature = NMostCommon(
        entityset['log']['product_id'],
        entityset['customers'],
        n=2,
    )
    calculator = backend([feature])
    result = calculator.calculate_all_features(
        instance_ids=[0, 1, 2],
        time_last=None,
    )

    expected_rows = [
        ['toothpaste', 'coke zero'],
        ['coke zero', 'Haribo sugar-free gummy bears'],
        ['taco clock'],
    ]
    assert feature.get_name() in result.columns

    # Only membership is compared; the order of values within a row is ignored.
    observed_rows = result[feature.get_name()].values
    for row_number, observed in enumerate(observed_rows):
        expected = expected_rows[row_number]
        assert set(expected) == set(observed)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: test_agg_feats.py Projeto: mikewcasale/featuretools

def test_stacking_multi(pd_es):
    """Stack NumUnique on every output slice of a 3-most-common feature.

    The resulting feature matrix columns are compared against two accepted
    sets of values.
    """
    base_feature = ft.Feature(
        pd_es['log']['product_id'],
        parent_entity=pd_es["sessions"],
        primitive=NMostCommon(3),
    )

    stacked = [
        ft.Feature(
            base_feature[slice_index],
            parent_entity=pd_es['customers'],
            primitive=NumUnique,
        )
        for slice_index in range(3)
    ]

    matrix = ft.calculate_feature_matrix(
        stacked,
        entityset=pd_es,
        instance_ids=[0, 1, 2],
    )

    # Either expectation can be correct: the outcome depends on how two tied
    # values are ordered by the initial n-most-common function, which changes
    # arbitrarily.
    accepted_primary = [[3, 2, 1], [2, 1, 0], [0, 0, 0]]
    accepted_alternate = [[3, 1, 1], [2, 1, 0], [0, 0, 0]]

    for slice_index in range(3):
        expected_name = (
            'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % slice_index
        )
        columns = matrix.columns
        assert expected_name in columns
        observed = matrix[columns[slice_index]].tolist()
        assert (observed == accepted_primary[slice_index]
                or observed == accepted_alternate[slice_index])

Exemplo n.º 4

0

Exibir arquivo

Arquivo: test_feature_base.py Projeto: john-rice/featuretools

def test_rename_multioutput(es):
    """Run ``check_rename`` on a two-output NMostCommon feature.

    The expected output names are the new name suffixed with ``[0]`` and ``[1]``.
    """
    multi_output = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    renamed_to = 'session_test'
    expected_outputs = ['session_test[0]', 'session_test[1]']
    check_rename(multi_output, renamed_to, expected_outputs)

Exemplo n.º 5

0

Exibir arquivo

def test_stacks_multioutput_features(es):
    """Run DFS (features only) with multi-output primitives and check stacked names.

    Uses ``NMostCommon(n=3)`` as a multi-output aggregation and a locally
    defined six-output transform primitive, then looks up the stacked
    ``NUM_UNIQUE`` and ``DIFF`` features by name.
    """
    class TestTime(TransformPrimitive):
        # Transform primitive declaring six outputs, one per datetime component.
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                stamps = pd.Series(x)
                components = []
                for unit in ("year", "month", "day", "hour", "minute", "second"):
                    components.append(
                        stamps.apply(lambda stamp, unit=unit: getattr(stamp, unit))
                    )
                return components
            return test_f

    feature_defs = ft.dfs(
        entityset=es,
        target_entity="customers",
        agg_primitives=[NumUnique, NMostCommon(n=3)],
        trans_primitives=[TestTime, Diff],
        max_depth=4,
        features_only=True,
    )

    for slice_index in range(3):
        expected_name = (
            'NUM_UNIQUE(sessions.N_MOST_COMMON(log.countrycode)[%d])' % slice_index
        )
        assert feature_with_name(feature_defs, expected_name)

    for slice_index in range(6):
        expected_name = 'DIFF(TEST_TIME(date_of_birth)[%d])' % slice_index
        assert feature_with_name(feature_defs, expected_name)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: test_pandas_backend.py Projeto: wuqixiaobai/featuretools

def test_topn(entityset, backend):
    """Check the 2-most-common product ids computed for three instances."""
    product_ids = entityset['log']['product_id']
    customers = entityset['customers']
    top_two = NMostCommon(product_ids, customers, n=2)

    frame = backend([top_two]).calculate_all_features(
        instance_ids=[0, 1, 2],
        time_last=None,
    )

    expected = [
        ['toothpaste', 'coke zero'],
        ['coke zero', 'Haribo sugar-free gummy bears'],
        ['taco clock'],
    ]
    assert top_two.get_name() in frame.columns

    # Values are compared as sets, so ordering inside each row does not matter.
    for position, actual_values in enumerate(frame[top_two.get_name()].values):
        assert set(expected[position]) == set(actual_values)

Exemplo n.º 7

0

Exibir arquivo

def test_topn(pd_es):
    """Calculate a 2-most-common feature and compare it row by row.

    The feature is computed with ``FeatureSetCalculator`` for instance ids
    0, 1 and 2. Every output name of the feature must be a column of the
    result, and each row must match the expected values (NaN matching NaN).
    """
    topn = ft.Feature(pd_es['log']['product_id'],
                      parent_entity=pd_es['customers'],
                      primitive=NMostCommon(n=2))
    feature_set = FeatureSet([topn])

    calculator = FeatureSetCalculator(pd_es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])
    # Asserting a bare list only checks that it is non-empty, which is always
    # true here; all() makes the assertion actually test every column name.
    assert all(name in df.columns for name in topn.get_feature_names())

    for i in range(df.shape[0]):
        true = true_results.loc[i]
        actual = df.loc[i]
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences,
            # so their order is not fixed; compare as sets.
            assert set(true.values) == set(actual.values)
        else:
            for i1, i2 in zip(true, actual):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)

Exemplo n.º 8

0

Exibir arquivo

def test_initialized_agg_prim(es):
    """Pass an already-instantiated NMostCommon primitive to DeepFeatureSynthesis."""
    three_most = NMostCommon(n=3)
    synthesis = DeepFeatureSynthesis(
        target_entity_id="sessions",
        entityset=es,
        agg_primitives=[three_most],
        trans_primitives=[],
    )
    built_features = synthesis.build_features()
    found = feature_with_name(built_features, "N_MOST_COMMON(log.product_id)")
    assert found

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_encode_features.py Projeto: john-rice/featuretools

def test_encode_features_topn(pd_es):
    """Encode a DFS result and check the 3-most-common feature survives once.

    The feature is matched by unique name among the encoded definitions, and
    each of its output names must appear exactly once in the encoded columns.
    """
    product_id = Feature(pd_es['log'].ww['product_id'])
    expected_feature = Feature(
        product_id,
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=3),
    )
    matrix, definitions = dfs(
        entityset=pd_es,
        instance_ids=[0, 1, 2],
        target_dataframe_name="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded_matrix, encoded_definitions = encode_features(
        matrix,
        definitions,
        include_unknown=True,
    )

    expected_unique_name = expected_feature.unique_name()
    encoded_unique_names = [
        definition.unique_name() for definition in encoded_definitions
    ]
    assert expected_unique_name in encoded_unique_names

    for column_name in expected_feature.get_feature_names():
        assert column_name in encoded_matrix.columns
        # Each output column must not be duplicated by the encoding step.
        occurrences = encoded_matrix.columns.tolist().count(column_name)
        assert occurrences == 1

Exemplo n.º 10

0

Exibir arquivo

Arquivo: test_feature_base.py Projeto: john-rice/featuretools

def test_rename_featureoutputslice(es):
    """Run ``check_rename`` on slice 0 of a two-output NMostCommon feature.

    A single slice is expected to carry exactly the new name, without an index.
    """
    parent = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    first_slice = ft.feature_base.FeatureOutputSlice(parent, 0)
    renamed_to = 'session_test'
    check_rename(first_slice, renamed_to, ['session_test'])

Exemplo n.º 11

0

Exibir arquivo

Arquivo: test_feature_base.py Projeto: yaokaifei/featuretools

def test_multi_output_base_error_agg(es):
    """Stacking an aggregation on a whole multi-output feature must raise ValueError."""
    whole_feature = ft.Feature(
        es['log']['product_id'],
        parent_entity=es["sessions"],
        primitive=NMostCommon(3),
    )
    expected_message = "Cannot stack on whole multi-output feature."
    with pytest.raises(ValueError, match=expected_message):
        ft.Feature(
            whole_feature,
            parent_entity=es['customers'],
            primitive=NumUnique,
        )

Exemplo n.º 12

0

Exibir arquivo

Arquivo: test_feature_descriptions.py Projeto: john-rice/featuretools

def test_multioutput_description(es):
    """Check ``describe_feature`` text for multi-output features and their slices.

    Covers an ``NMostCommon(2)`` aggregation and a locally defined four-output
    transform primitive, first without a ``description_template`` and then
    with two different templates assigned on the primitive class.
    """
    # --- Aggregation primitive: whole feature and its two slices -----------
    n_most_common = NMostCommon(2)
    n_most_common_feature = AggregationFeature(IdentityFeature(es['log'].ww['zipcode']), 'sessions', n_most_common)
    first_most_common_slice = n_most_common_feature[0]
    second_most_common_slice = n_most_common_feature[1]

    n_most_common_base = 'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    n_most_common_first = 'The most common value of the "zipcode" of all instances of "log" ' \
                          'for each "id" in "sessions".'
    n_most_common_second = 'The 2nd most common value of the "zipcode" of all instances of ' \
                           '"log" for each "id" in "sessions".'

    assert describe_feature(n_most_common_feature) == n_most_common_base
    assert describe_feature(first_most_common_slice) == n_most_common_first
    assert describe_feature(second_most_common_slice) == n_most_common_second

    # --- Custom transform primitive with no description template -----------
    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})

        number_output_features = 4

    custom_feat = TransformFeature(IdentityFeature(es['log'].ww['zipcode']), CustomMultiOutput)

    generic_base = 'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    generic_first = 'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'
    generic_second = 'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".'

    assert describe_feature(custom_feat) == generic_base
    assert describe_feature(custom_feat[0]) == generic_first
    assert describe_feature(custom_feat[1]) == generic_second

    # --- Two-entry template: one for the whole feature, one using {nth_slice}
    # The template is set on the class after custom_feat was created; the
    # assertions below expect describe_feature to pick it up.
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the {nth_slice} multioutput part of {}']
    template_base = 'The multioutput of the "zipcode".'
    template_first_slice = 'The 1st multioutput part of the "zipcode".'
    template_second_slice = 'The 2nd multioutput part of the "zipcode".'
    template_third_slice = 'The 3rd multioutput part of the "zipcode".'
    template_fourth_slice = 'The 4th multioutput part of the "zipcode".'
    assert describe_feature(custom_feat) == template_base
    assert describe_feature(custom_feat[0]) == template_first_slice
    assert describe_feature(custom_feat[1]) == template_second_slice
    assert describe_feature(custom_feat[2]) == template_third_slice
    assert describe_feature(custom_feat[3]) == template_fourth_slice

    # --- Per-slice templates covering only the first two of four slices ----
    CustomMultiOutput.description_template = ['the multioutput of {}',
                                              'the primary multioutput part of {}',
                                              'the secondary multioutput part of {}']
    custom_base = 'The multioutput of the "zipcode".'
    custom_first_slice = 'The primary multioutput part of the "zipcode".'
    custom_second_slice = 'The secondary multioutput part of the "zipcode".'
    bad_slice_error = 'Slice out of range of template'
    assert describe_feature(custom_feat) == custom_base
    assert describe_feature(custom_feat[0]) == custom_first_slice
    assert describe_feature(custom_feat[1]) == custom_second_slice
    # Slice 2 has no matching template entry, so an IndexError is expected.
    with pytest.raises(IndexError, match=bad_slice_error):
        describe_feature(custom_feat[2])

Exemplo n.º 13

0

Exibir arquivo

def test_serialized_renamed_features(es):
    """Rename several kinds of features and check the names after a round trip.

    Each feature is renamed, passed through ``FeaturesSerializer`` and
    ``FeaturesDeserializer``, and checked with ``check_names`` before and
    after serialization.
    """
    def serialize_name_unchanged(original):
        # Rename the feature, then verify the names before and after a
        # serialize/deserialize round trip.
        new_name = 'MyFeature'
        output_count = len(original.get_feature_names())
        renamed = original.rename(new_name)
        if output_count == 1:
            new_names = [new_name]
        else:
            new_names = [
                new_name + '[{}]'.format(index) for index in range(output_count)
            ]
        check_names(renamed, new_name, new_names)

        as_dict = FeaturesSerializer([renamed]).to_dict()
        restored = FeaturesDeserializer(as_dict).to_list()[0]
        check_names(restored, new_name, new_names)

    identity_feature = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_feature.get_name() == 'value'

    # Separate identity feature reused as the base of the derived features.
    value = ft.IdentityFeature(es['log'].ww['value'])

    agg_feature = ft.AggregationFeature(value, 'customers', ft.primitives.Max())
    assert agg_feature.get_name() == 'MAX(log.value)'

    customer_age = ft.IdentityFeature(es['customers'].ww['age'])
    direct_feature = ft.DirectFeature(customer_age, 'sessions')
    assert direct_feature.get_name() == 'customers.age'

    transform_feature = ft.TransformFeature(
        value,
        ft.primitives.MultiplyNumericScalar(value=2),
    )
    assert transform_feature.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    groupby_feature = ft.feature_base.GroupByTransformFeature(
        value,
        CumSum(),
        zipcode,
    )
    assert groupby_feature.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_feature = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    assert multioutput_feature.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    slice_feature = ft.feature_base.FeatureOutputSlice(multioutput_feature, 0)
    assert slice_feature.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    all_feature_kinds = (
        identity_feature,
        agg_feature,
        direct_feature,
        transform_feature,
        groupby_feature,
        multioutput_feature,
        slice_feature,
    )
    for candidate in all_feature_kinds:
        serialize_name_unchanged(candidate)

Exemplo n.º 14

0

Exibir arquivo

def test_rename_multioutput(es):
    """Run ``check_rename`` on a two-output feature with indexed expected names."""
    product_id = es["log"].ww["product_id"]
    two_output_feature = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    check_rename(
        two_output_feature,
        "session_test",
        ["session_test[0]", "session_test[1]"],
    )

Exemplo n.º 15

0

Exibir arquivo

def test_set_feature_names_aggregation_feature(es):
    """Names set with ``set_feature_names`` are returned by ``get_feature_names``."""
    product_id = es["log"].ww["product_id"]
    agg_feature = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    custom_names = ["agg_col_1", "second_agg_col"]
    agg_feature.set_feature_names(custom_names)
    reported_names = agg_feature.get_feature_names()
    assert reported_names == custom_names

Exemplo n.º 16

0

Exibir arquivo

def test_multi_output_base_error_agg(es):
    """Using an unsliced multi-output feature as an aggregation base raises ValueError."""
    product_id = es["log"].ww["product_id"]
    unsliced = ft.Feature(
        product_id,
        parent_dataframe_name="sessions",
        primitive=NMostCommon(3),
    )
    expected_message = "Cannot stack on whole multi-output feature."
    with pytest.raises(ValueError, match=expected_message):
        ft.Feature(
            unsliced,
            parent_dataframe_name="customers",
            primitive=NumUnique,
        )

Exemplo n.º 17

0

Exibir arquivo

def test_rename_featureoutputslice(es):
    """Run ``check_rename`` on the first output slice of a two-output feature."""
    product_id = es["log"].ww["product_id"]
    parent_feature = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    first_output = ft.feature_base.FeatureOutputSlice(parent_feature, 0)
    # A single slice has one output, so the expected names list has one entry.
    check_rename(first_output, "session_test", ["session_test"])

Exemplo n.º 18

0

Exibir arquivo

def test_set_feature_names_not_unique(es):
    """Duplicate output names passed to ``set_feature_names`` raise ValueError."""
    product_id = es["log"].ww["product_id"]
    two_output_feature = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    duplicated_names = ["col1", "col1"]
    expected_message = "Provided output feature names must be unique."
    with pytest.raises(ValueError, match=expected_message):
        two_output_feature.set_feature_names(duplicated_names)

Exemplo n.º 19

0

Exibir arquivo

Arquivo: test_direct_features.py Projeto: mikewcasale/featuretools

def test_direct_rename_multioutput(es):
    """Renaming a DirectFeature over a multi-output base changes only its own name.

    The renamed copy must differ in unique name and name, while the base
    feature's generated name and the entity stay equal.
    """
    base = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(base, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base_name = original.base_features[0].generate_name()
    renamed_base_name = renamed.base_features[0].generate_name()
    assert original_base_name == renamed_base_name

    assert original.entity == renamed.entity

Exemplo n.º 20

0

Exibir arquivo

Arquivo: test_direct_features.py Projeto: john-rice/featuretools

def test_direct_rename_multioutput(es):
    """Rename a DirectFeature built on a two-output feature and compare with the original.

    Unique name and name must change; the base feature's generated name and
    the dataframe name must not.
    """
    multi_output_base = Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    direct = DirectFeature(multi_output_base, 'sessions')
    direct_renamed = direct.rename("session_test")

    assert direct.unique_name() != direct_renamed.unique_name()
    assert direct.get_name() != direct_renamed.get_name()

    base_before = direct.base_features[0]
    base_after = direct_renamed.base_features[0]
    assert base_before.generate_name() == base_after.generate_name()

    assert direct.dataframe_name == direct_renamed.dataframe_name

Exemplo n.º 21

0

Exibir arquivo

def test_set_feature_names_wrong_number_of_names(es):
    """Passing one name to a two-output feature raises ValueError with a count message."""
    product_id = es["log"].ww["product_id"]
    two_output_feature = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    too_few_names = ["col1"]
    # The message contains regex metacharacters (parentheses), so escape it
    # before handing it to pytest's ``match``.
    raw_message = "Number of names provided must match the number of output features: 1 name(s) provided, 2 expected."
    expected_pattern = re.escape(raw_message)
    with pytest.raises(ValueError, match=expected_pattern):
        two_output_feature.set_feature_names(too_few_names)

Exemplo n.º 22

0

Exibir arquivo

def test_renaming_resets_feature_output_names_to_default(es):
    """After ``rename``, custom output names are replaced by indexed default names."""
    product_id = es["log"].ww["product_id"]
    original = ft.Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    custom_names = ["renamed1", "renamed2"]
    original.set_feature_names(custom_names)
    assert original.get_feature_names() == custom_names

    renamed = original.rename("new_feature_name")
    expected_defaults = ["new_feature_name[0]", "new_feature_name[1]"]
    assert renamed.get_feature_names() == expected_defaults

Exemplo n.º 23

0

Exibir arquivo

def test_direct_rename_multioutput(es):
    """Compare a DirectFeature over a two-output base with its renamed copy."""
    product_id = es["log"].ww["product_id"]
    two_output_base = Feature(
        product_id,
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    before = DirectFeature(two_output_base, "sessions")
    after = before.rename("session_test")

    # The rename must be visible on the direct feature itself...
    assert before.unique_name() != after.unique_name()
    assert before.get_name() != after.get_name()

    # ...but must not touch the base feature's generated name or the dataframe.
    name_of_base_before = before.base_features[0].generate_name()
    name_of_base_after = after.base_features[0].generate_name()
    assert name_of_base_before == name_of_base_after
    assert before.dataframe_name == after.dataframe_name

Exemplo n.º 24

0

Exibir arquivo

Arquivo: test_agg_primitives.py Projeto: RomaKoks/featuretools

def test_nmostcommon_categorical():
    """Call ``NMostCommon(3)`` on integer and categorical series with the same data."""
    primitive = NMostCommon(3)
    expected = pd.Series([1.0, 2.0, np.nan])
    raw_values = [1, 2, 1, 1]

    as_ints = pd.Series(raw_values).astype("int64")
    as_category = pd.Series(raw_values).astype("category")
    # Value counts include data for categories that are not present in the
    # data. Make sure these counts are not included in the most common outputs.
    with_unused_category = pd.Series(raw_values).astype(
        CategoricalDtype(categories=[1, 2, 3])
    )

    for series in (as_ints, as_category, with_unused_category):
        result = pd.Series(primitive(series))
        assert result.equals(expected)

Exemplo n.º 25

0

Exibir arquivo

def test_seed_multi_output_feature_stacking(es):
    """Seed DFS with a 3-most-common feature and look for NumUnique stacked on each slice."""
    seed = ft.Feature(
        es['log']['product_id'],
        parent_entity=es["sessions"],
        primitive=NMostCommon(3),
    )

    # Only the feature definitions are inspected; the matrix is not used.
    _matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity="customers",
        seed_features=[seed],
        agg_primitives=[NumUnique],
        trans_primitives=[],
        max_depth=4,
    )

    for slice_index in range(3):
        expected_name = (
            'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % slice_index
        )
        assert feature_with_name(feature_defs, expected_name)

Exemplo n.º 26

0

Exibir arquivo

Arquivo: test_feature_base.py Projeto: mikewcasale/featuretools

def test_to_dictionary_multi_slice(es):
    """Compare ``to_dictionary`` of output slice 0 with the expected dictionary."""
    parent = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    first_slice = parent[0]

    parent_key = 'customers: N_MOST_COMMON(log.product_id, n=2)'
    expected = {
        'type': 'FeatureOutputSlice',
        'dependencies': [parent_key],
        'arguments': {
            'name': None,
            'base_feature': parent_key,
            'n': 0,
        },
    }

    assert expected == first_slice.to_dictionary()

Exemplo n.º 27

0

Exibir arquivo

def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    """Save and load features with custom output names and check they are kept.

    Covers a multi-output transform feature, a groupby transform feature, a
    multi-output aggregation feature, and a feature stacked on one output slice.
    """
    class MultiCumulative(TransformPrimitive):
        # Transform primitive declaring three numeric outputs.
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    transform_feature = ft.Feature(
        pd_es["log"].ww["value"], primitive=MultiCumulative
    )
    groupby_feature = ft.GroupByTransformFeature(
        pd_es["log"].ww["value"],
        primitive=MultiCumulative,
        groupby=pd_es["log"].ww["product_id"],
    )
    aggregation_feature = ft.Feature(
        pd_es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    second_output = FeatureOutputSlice(transform_feature, 1)
    stacked_feature = ft.Feature(second_output, primitive=Negate)

    transform_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    transform_feature.set_feature_names(transform_names)
    groupby_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_feature.set_feature_names(groupby_names)
    aggregation_names = ["first_most_common", "second_most_common"]
    aggregation_feature.set_feature_names(aggregation_names)

    to_save = [
        transform_feature,
        aggregation_feature,
        groupby_feature,
        stacked_feature,
    ]
    features_path = os.path.join(tmpdir, "features.json")
    ft.save_features(to_save, features_path)
    loaded = ft.load_features(features_path)

    # Loaded features are unpacked in the same order they were saved.
    loaded_transform, loaded_aggregation, loaded_groupby, loaded_stacked = loaded
    assert loaded_transform.get_feature_names() == transform_names
    assert loaded_aggregation.get_feature_names() == aggregation_names
    assert loaded_groupby.get_feature_names() == groupby_names
    assert loaded_stacked.get_feature_names() == ["-(cumulative_max)"]

Exemplo n.º 28

0

Exibir arquivo

def test_to_dictionary_multi_slice(es):
    """Compare ``to_dictionary`` of output slice 0 with the expected dictionary.

    In this variant the expected ``name`` argument is the slice's indexed name.
    """
    whole_feature = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    first_slice = whole_feature[0]

    base_feature_key = "customers: N_MOST_COMMON(log.product_id, n=2)"
    expected_arguments = {
        "name": "N_MOST_COMMON(log.product_id, n=2)[0]",
        "base_feature": base_feature_key,
        "n": 0,
    }
    expected = {
        "type": "FeatureOutputSlice",
        "dependencies": [base_feature_key],
        "arguments": expected_arguments,
    }

    assert expected == first_slice.to_dictionary()

Exemplo n.º 29

0

Exibir arquivo

def test_multi_output_features(es):
    """Serialize multi-output features and compare with a hand-built dictionary.

    The feature list holds the whole NMostCommon feature, its identity base,
    and for each of three slices a NumUnique stacked on it plus the slice.
    """
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    most_common_primitive = NMostCommon()
    num_unique_primitive = NumUnique()
    whole_feature = ft.Feature(
        product_id,
        parent_dataframe_name="sessions",
        primitive=most_common_primitive,
    )

    features = [whole_feature, product_id]
    for slice_index in range(3):
        stacked = ft.Feature(
            whole_feature[slice_index],
            parent_dataframe_name="customers",
            primitive=num_unique_primitive,
        )
        features.extend([stacked, whole_feature[slice_index]])

    serializer = FeaturesSerializer(features)

    unique_names = [feature.unique_name() for feature in features]
    definitions = dict(
        zip(unique_names, [feature.to_dictionary() for feature in features])
    )

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": unique_names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(most_common_primitive),
            "1": serialize_primitive(num_unique_primitive),
        },
    }

    # Point the primitive-carrying features at their primitive-definition key:
    # position 0 is the NMostCommon feature, positions 2, 4 and 6 are the
    # stacked NumUnique features.
    for position, primitive_key in ((0, "0"), (2, "1"), (4, "1"), (6, "1")):
        definition = expected["feature_definitions"][unique_names[position]]
        definition["arguments"]["primitive"] = primitive_key

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: test_features_deserializer.py Projeto: alteryx/featuretools

def test_multioutput_feature(es):
    """Deserialize a hand-built dictionary of multi-output features.

    The deserialized features must have the same unique names, position by
    position, as the features the dictionary was built from.
    """
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    most_common_primitive = NMostCommon()
    num_unique_primitive = NumUnique()
    whole_feature = ft.Feature(
        product_id,
        parent_dataframe_name="sessions",
        primitive=most_common_primitive,
    )

    features = [whole_feature, product_id]
    for slice_index in range(3):
        stacked = ft.Feature(
            whole_feature[slice_index],
            parent_dataframe_name="customers",
            primitive=num_unique_primitive,
        )
        features.extend([stacked, whole_feature[slice_index]])

    unique_names = [feature.unique_name() for feature in features]
    definitions = dict(
        zip(unique_names, [feature.to_dictionary() for feature in features])
    )

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": unique_names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(most_common_primitive),
            "1": serialize_primitive(num_unique_primitive),
        },
    }

    # Position 0 is the NMostCommon feature; positions 2, 4 and 6 are the
    # stacked NumUnique features. Each refers to its primitive by key.
    for position, primitive_key in ((0, "0"), (2, "1"), (4, "1"), (6, "1")):
        definition = dictionary["feature_definitions"][unique_names[position]]
        definition["arguments"]["primitive"] = primitive_key

    deserialized = FeaturesDeserializer(dictionary).to_list()

    for position, original in enumerate(features):
        assert original.unique_name() == deserialized[position].unique_name()

Exemplo n.º 31

0

Exibir arquivo

def test_topn(es, backend):
    """Calculate a 2-most-common feature through ``backend`` and compare row by row.

    Every output name of the feature must be a column of the result, and each
    row must match the expected values (NaN matching NaN).
    """
    topn = ft.Feature(es['log']['product_id'],
                      parent_entity=es['customers'],
                      primitive=NMostCommon(n=2))
    pandas_backend = backend([topn])

    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)

    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])
    # Asserting a bare list only checks that it is non-empty, which is always
    # true here; all() makes the assertion actually test every column name.
    assert all(name in df.columns for name in topn.get_feature_names())
    for i in range(df.shape[0]):
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences,
            # so their order is not fixed; compare as sets.
            assert set(true_results.loc[i].values) == set(df.loc[i].values)
        else:
            for i1, i2 in zip(true_results.loc[i], df.iloc[i]):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)