示例#1
0
def test_direct_of_multi_output_transform_feat(es):
    """Direct features of a multi-output transform match the single-output ones.

    A custom six-output primitive splits a datetime into its components; each
    output column, once pulled down to "sessions", is compared against the
    column produced by the corresponding single-output primitive.
    """
    class TestTime(TransformPrimitive):
        # Emits one output per datetime component listed in test_f.
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                times = pd.Series(x)
                components = []
                for unit in ["year", "month", "day", "hour", "minute", "second"]:
                    components.append(times.apply(lambda ts: getattr(ts, unit)))
                return components
            return test_f

    signup_date = es["customers"]["signup_date"]
    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]

    join_time_split = Feature(signup_date, primitive=TestTime)
    alt_features = [Feature(signup_date, primitive=prim)
                    for prim in single_output_primitives]
    fm, _ = dfs(entityset=es,
                target_entity="sessions",
                trans_primitives=[TestTime] + single_output_primitives)

    # Column names for the multi-output feature and for the single-output features
    sessions = es["sessions"]
    subnames = DirectFeature(join_time_split, sessions).get_feature_names()
    altnames = [DirectFeature(single, sessions).get_name()
                for single in alt_features]

    # Each multi-output column must equal its single-output counterpart
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()
示例#2
0
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature of an aggregation that targets the feature's own child.

    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then pulled back
    down to customers; the value for instance 0 is expected to be 3.
    """
    count_of_customers = ft.Feature(
        es["customers"].ww["id"],
        parent_dataframe_name="régions",
        primitive=Count,
    )
    num_customers_feat = DirectFeature(count_of_customers,
                                       child_dataframe_name="customers")

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_customers_feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")
    first_value = result[num_customers_feat.get_name()].values[0]
    assert first_value == 3
示例#3
0
def test_make_dfeat_of_agg_feat_through_parent(es, backend):
    """
    Direct feature of an aggregation computed over a sibling entity.

    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions, then pulled down to
    customers as a DirectFeature; the value for instance 0 should be 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_entity=es['customers'])

    feature_matrix = backend([num_stores_feat]).calculate_all_features(
        instance_ids=[0], time_last=None)
    assert feature_matrix[num_stores_feat.get_name()][0] == 3
示例#4
0
def test_direct_description(es):
    """Check describe_feature output for simple, stacked and aggregated direct features."""
    # One hop: customers -> sessions
    one_hop = DirectFeature(
        IdentityFeature(es["customers"].ww["loves_ice_cream"]), "sessions"
    )
    expected_one_hop = (
        'The "loves_ice_cream" for the instance of "customers" associated '
        'with this instance of "sessions".'
    )
    assert describe_feature(one_hop) == expected_one_hop

    # Two hops: customers -> sessions -> log
    two_hop = DirectFeature(one_hop, "log")
    expected_two_hop = (
        'The "loves_ice_cream" for the instance of "customers" '
        'associated with the instance of "sessions" associated with '
        'this instance of "log".'
    )
    assert describe_feature(two_hop) == expected_two_hop

    # Aggregation of a direct feature of an aggregation
    purchased = IdentityFeature(es["log"].ww["purchased"])
    percent_true = AggregationFeature(purchased, "sessions", PercentTrue)
    direct_of_agg = DirectFeature(percent_true, "log")
    agg_on_direct = AggregationFeature(direct_of_agg, "products", Mean)

    expected_nested = (
        "The average of the percentage of true values in "
        'the "purchased" of all instances of "log" for each "id" in "sessions" for '
        'the instance of "sessions" associated with this instance of "log" of all '
        'instances of "log" for each "id" in "products".'
    )
    assert describe_feature(agg_on_direct) == expected_nested
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Direct feature of an aggregation that targets the feature's own child.

    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then pulled back
    down to customers; the value for instance 0 is expected to be 3.
    """
    count_of_customers = ft.Feature(es['customers']['id'],
                                    parent_entity=es[u'régions'],
                                    primitive=Count)
    num_customers_feat = DirectFeature(count_of_customers,
                                       child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_customers_feat]))
    feature_matrix = calculator.run(np.array([0]))
    assert feature_matrix[num_customers_feat.get_name()][0] == 3
示例#6
0
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Direct feature of an aggregation computed over a sibling entity.

    The graph looks like this:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions, then pulled down to
    customers as a DirectFeature; the value for instance 0 should be 3.
    """
    stores_per_region = ft.Feature(IdentityFeature(es['stores']['id']),
                                   parent_entity=es[u'régions'],
                                   primitive=Count)
    num_stores_feat = DirectFeature(stores_per_region,
                                    child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_stores_feat]))
    feature_matrix = to_pandas(calculator.run(np.array([0])), index='id')
    first_value = feature_matrix[num_stores_feat.get_name()].values[0]
    assert first_value == 3
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS with max_depth=1 builds one level on top of seed features, not two."""
    log = es['log']
    sessions = es['sessions']

    seed_feature_sessions = ft.Feature(log["id"],
                                       parent_entity=sessions,
                                       primitive=Count)
    seed_feature_log = ft.Feature(log['datetime'], primitive=Hour)

    # One step above the seed feature: expected to be built.
    session_agg = ft.Feature(seed_feature_log,
                             parent_entity=sessions,
                             primitive=Last)

    # Two more steps stacked on session_agg: deeper than max_depth allows,
    # so it is expected to be absent from the result.
    customer_mode = ft.Feature(session_agg,
                               parent_entity=es['customers'],
                               primitive=Mode)
    session_agg_trans = DirectFeature(customer_mode, es['sessions'])

    dfs_obj = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log])
    built_names = [built.get_name() for built in dfs_obj.build_features()]

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
示例#8
0
def test_make_dfeat(es, backend):
    """A customer's age pulled down to sessions is 33 for session 0."""
    age_on_sessions = DirectFeature(es['customers']['age'],
                                    child_entity=es['sessions'])

    feature_matrix = backend([age_on_sessions]).calculate_all_features(
        instance_ids=[0], time_last=None)
    assert feature_matrix[age_on_sessions.get_name()][0] == 33
def test_direct_from_identity(es):
    """The session device_type pulled down to log rows 0 and 5 is [0, 1]."""
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(es,
                                      feature_set=FeatureSet([device_on_log]),
                                      time_last=None)
    feature_matrix = calculator.run([0, 5])
    assert feature_matrix[device_on_log.get_name()].tolist() == [0, 1]
示例#10
0
def test_direct_from_identity(es):
    """The session device_type pulled down to log rows 0 and 5 is [0, 1]."""
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    feature_matrix = PandasBackend(es, [device_on_log]).calculate_all_features(
        instance_ids=[0, 5], time_last=None)
    assert feature_matrix[device_on_log.get_name()].tolist() == [0, 1]
示例#11
0
def test_direct_rename(es):
    """Renaming changes hash and name but keeps the base feature name and entity."""
    original = DirectFeature(base_feature=es['sessions']['device_type'],
                             child_entity=es['log'])
    renamed = original.rename("session_test")

    assert original.hash() != renamed.hash()
    assert original.get_name() != renamed.get_name()
    assert (original.base_features[0].generate_name() ==
            renamed.base_features[0].generate_name())
    assert original.entity == renamed.entity
def test_direct_copy(games_es):
    """copy() keeps the entity, base features and relationship path."""
    # Pick the relationship whose child variable is home_team_id.
    home_team = next(rel for rel in games_es.relationships
                     if rel.child_variable.id == 'home_team_id')
    original = DirectFeature(games_es['teams']['name'],
                             games_es['games'],
                             relationship=home_team)
    duplicate = original.copy()

    assert duplicate.entity == original.entity
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
def test_direct_copy(games_es):
    """copy() keeps the dataframe name, base features and relationship path."""
    # Pick the relationship whose child column is home_team_id.
    home_team = next(rel for rel in games_es.relationships
                     if rel._child_column_name == 'home_team_id')
    team_name = IdentityFeature(games_es['teams'].ww['name'])
    original = DirectFeature(team_name, 'games', relationship=home_team)
    duplicate = original.copy()

    assert duplicate.dataframe_name == original.dataframe_name
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
def test_direct_with_no_path(diamond_es):
    """DirectFeature raises RuntimeError when no relationship reaches the base dataframe."""
    # Both "regions" and "customers" itself are expected to have no
    # relationship leading to "customers".
    for child_name in ('regions', 'customers'):
        error_text = 'No relationship from "{}" to "customers" found.'.format(
            child_name)
        with pytest.raises(RuntimeError, match=error_text):
            DirectFeature(IdentityFeature(diamond_es['customers'].ww['name']),
                          child_name)
示例#15
0
def test_direct_copy(games_es):
    """A copied DirectFeature matches the source on name of dataframe, bases and path."""
    home_team = next(
        relationship
        for relationship in games_es.relationships
        if relationship._child_column_name == "home_team_id"
    )
    source_feature = DirectFeature(
        IdentityFeature(games_es["teams"].ww["name"]),
        "games",
        relationship=home_team,
    )
    copied = source_feature.copy()

    assert copied.dataframe_name == source_feature.dataframe_name
    assert copied.base_features == source_feature.base_features
    assert copied.relationship_path == source_feature.relationship_path
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation keeps its base and entity."""
    most_common_products = ft.Feature(es['log']['product_id'],
                                      parent_entity=es['customers'],
                                      primitive=NMostCommon(n=2))
    original = DirectFeature(most_common_products, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()
    assert (original.base_features[0].generate_name() ==
            renamed.base_features[0].generate_name())
    assert original.entity == renamed.entity
def test_make_dfeat(es):
    """A customer's age pulled down to sessions is 33 for session 0."""
    age_on_sessions = DirectFeature(es['customers']['age'],
                                    child_entity=es['sessions'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_sessions]))
    feature_matrix = calculator.run(np.array([0]))
    assert feature_matrix[age_on_sessions.get_name()][0] == 33
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation keeps its base and dataframe."""
    most_common_products = Feature(es['log'].ww['product_id'],
                                   parent_dataframe_name='customers',
                                   primitive=NMostCommon(n=2))
    original = DirectFeature(most_common_products, 'sessions')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()
    assert (original.base_features[0].generate_name() ==
            renamed.base_features[0].generate_name())
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_rename(es):
    """Renaming changes unique name and name but keeps the base feature name and dataframe."""
    device_type = IdentityFeature(es['sessions'].ww['device_type'])
    original = DirectFeature(base_feature=device_type,
                             child_dataframe_name='log')
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()
    assert (original.base_features[0].generate_name() ==
            renamed.base_features[0].generate_name())
    assert original.dataframe_name == renamed.dataframe_name
def test_make_dfeat(es):
    """A customer's age pulled down to sessions is 33 for session 0."""
    customer_age = ft.Feature(es['customers'].ww['age'])
    age_on_sessions = DirectFeature(customer_age,
                                    child_dataframe_name='sessions')

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_sessions]))
    feature_matrix = to_pandas(calculator.run(np.array([0])))

    assert feature_matrix[age_on_sessions.get_name()][0] == 33
示例#21
0
def test_direct_from_variable(es):
    """A DirectFeature built straight from a variable yields [0, 1] for log rows 0 and 5."""
    # Expected to behave the same as test_direct_from_identity.
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([device_on_log]),
        time_last=None)
    feature_matrix = calculator.run(np.array([0, 5]))
    assert feature_matrix[device_on_log.get_name()].tolist() == [0, 1]
def test_groupby_transform_direct_groupby(es):
    """Graph of a groupby transform whose groupby is a direct feature.

    Checks that the graph source contains every expected node and edge, and
    that the "cohorts" and "customers" tables each list exactly the expected
    rows.
    """
    groupby = DirectFeature(IdentityFeature(es['cohorts'].ww['cohort_name']),
                            'customers')
    feat = GroupByTransformFeature(IdentityFeature(es['customers'].ww['age']),
                                   CumMax, groupby)
    graph = graph_feature(feat).source

    groupby_name = groupby.get_name()
    feat_name = feat.get_name()

    # Node identifiers and table labels
    join_node = f'1_{groupby_name}_join'
    prim_node = f"0_{feat_name}_cum_max"
    groupby_node = f'{feat_name}_groupby_customers--{groupby_name}'
    customers_table = '\u2605 customers (target)'
    cohorts_table = 'cohorts'

    # Edges between tables and nodes
    join_groupby = f'"{join_node}" -> customers:cohort'
    join_input = f'cohorts:cohort_name -> "{join_node}"'
    join_out_edge = f'"{join_node}" -> customers:"{groupby_name}"'
    groupby_edge = f'customers:"{groupby_name}" -> "{groupby_node}"'
    groupby_input = f'customers:age -> "{groupby_node}"'
    prim_input = f'"{groupby_node}" -> "{prim_node}"'
    feat_edge = f'"{prim_node}" -> customers:"{feat_name}"'

    expected_fragments = [
        groupby_name, feat_name, join_node, prim_node, groupby_node,
        customers_table, cohorts_table, join_groupby, join_input,
        join_out_edge, groupby_edge, groupby_input, prim_input, feat_edge
    ]
    for fragment in expected_fragments:
        assert fragment in graph

    dataframes = {
        'cohorts': [cohorts_table, 'cohort_name'],
        'customers':
        [customers_table, 'cohort', 'age', groupby_name, feat_name]
    }
    for dataframe, remaining in dataframes.items():
        table_pattern = rf"{dataframe} \[label=<\n<TABLE.*?</TABLE>>"
        tables = re.findall(table_pattern, graph, re.DOTALL)
        assert len(tables) == 1

        rows = re.findall(r"<TR.*?</TR>", tables[0], re.DOTALL)
        assert len(rows) == len(remaining)

        # Each row must match one not-yet-consumed expected label; the label
        # is removed once matched so it cannot satisfy a second row.
        for row in rows:
            hit = next((label for label in remaining if label in row), None)
            assert hit is not None
            remaining.remove(hit)
示例#23
0
def test_direct_rename_multioutput(es):
    """Renaming a direct feature of a multi-output aggregation keeps its base and dataframe."""
    most_common_products = Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(most_common_products, "sessions")
    renamed = original.rename("session_test")

    # Identifiers change with the rename ...
    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()
    # ... while the underlying base feature and dataframe do not.
    original_base = original.base_features[0]
    renamed_base = renamed.base_features[0]
    assert original_base.generate_name() == renamed_base.generate_name()
    assert original.dataframe_name == renamed.dataframe_name
def test_direct_from_identity(es):
    """The session device_type pulled down to log rows 0 and 5 is [0, 1]."""
    device_on_log = DirectFeature(base_feature=es['sessions']['device_type'],
                                  child_entity=es['log'])

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([device_on_log]),
        time_last=None)
    feature_matrix = calculator.run(np.array([0, 5]))
    # A dask result is materialised and ordered by id before comparing values.
    if isinstance(feature_matrix, dd.DataFrame):
        feature_matrix = feature_matrix.compute().set_index('id').sort_index()
    assert feature_matrix[device_on_log.get_name()].tolist() == [0, 1]
def test_direct_with_multiple_possible_paths(games_es):
    """An ambiguous path raises RuntimeError; an explicit relationship resolves it."""
    error_text = "There are multiple relationships to the base dataframe. " \
                 "You must specify a relationship."
    with pytest.raises(RuntimeError, match=error_text):
        DirectFeature(IdentityFeature(games_es['teams'].ww['name']), 'games')

    # Does not raise if path specified.
    home_team = next(rel
                     for rel in games_es.get_forward_relationships('games')
                     if rel._child_column_name == 'home_team_id')
    team_name = IdentityFeature(games_es['teams'].ww['name'])
    explicit = DirectFeature(team_name, 'games', relationship=home_team)

    assert explicit.relationship_path_name() == 'teams[home_team_id]'
    assert explicit.get_name() == 'teams[home_team_id].name'
示例#26
0
def test_serialization(es):
    """get_arguments() and from_dictionary() round-trip a DirectFeature."""
    rating = ft.IdentityFeature(es["products"].ww["rating"])
    direct = DirectFeature(rating, "log")

    # The forward relationship from "log" whose parent dataframe is "products".
    log_to_products = next(
        rel for rel in es.get_forward_relationships("log")
        if rel.parent_dataframe.ww.name == "products"
    )
    expected_arguments = {
        "name": direct.get_name(),
        "base_feature": rating.unique_name(),
        "relationship": log_to_products.to_dictionary(),
    }
    assert expected_arguments == direct.get_arguments()

    dependencies = {rating.unique_name(): rating}
    rebuilt = DirectFeature.from_dictionary(
        expected_arguments, es, dependencies, PrimitivesDeserializer())
    assert direct == rebuilt
示例#27
0
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es):
    """
    Aggregate a direct feature that is itself built on an aggregation.

    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto products, pulled down to log as a
    DirectFeature, and then averaged onto customers with Mean.
    """
    logs_per_product = ft.Feature(es['log']['id'],
                                  parent_entity=es['products'],
                                  primitive=Count)
    product_purchases_feat = DirectFeature(logs_per_product,
                                           child_entity=es['log'])
    purchase_popularity = ft.Feature(product_purchases_feat,
                                     parent_entity=es['customers'],
                                     primitive=Mean)

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([purchase_popularity]))
    feature_matrix = to_pandas(calculator.run(np.array([0])), index='id')
    first_value = feature_matrix[purchase_popularity.get_name()].values[0]
    assert first_value == 38.0 / 10.0
示例#28
0
def test_make_compare_feat(es):
    """
    Compare a session's log count against its customer's mean log count.

    The feature is boolean per session: the Count of log rows in the session
    is greater than the Mean of that count over the customer's sessions
    (pulled back down to sessions as a DirectFeature).
    """
    log_count_feat = ft.Feature(es['log']['id'],
                                parent_entity=es['sessions'],
                                primitive=Count)
    customer_mean_count = ft.Feature(log_count_feat,
                                     parent_entity=es['customers'],
                                     primitive=Mean)
    mean_on_sessions = DirectFeature(customer_mean_count,
                                     child_entity=es['sessions'])

    above_mean = log_count_feat > mean_on_sessions

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([above_mean]))
    feature_matrix = to_pandas(calculator.run(np.array([0, 1, 2])),
                               index='id',
                               sort_index=True)

    first, second, third = feature_matrix[above_mean.get_name()][0:3]
    assert first
    assert second
    assert not third
    def _build_forward_features(self,
                                all_features,
                                parent_entity,
                                child_entity,
                                relationship,
                                max_depth=0):
        """Build DirectFeatures on ``child_entity`` from ``parent_entity`` features.

        Args:
            all_features: Feature collection passed through to
                ``_features_by_type`` and ``_handle_new_feature``.
            parent_entity: Entity whose Numeric, Categorical and Ordinal
                features are candidates for being pulled down.
            child_entity: Entity the new DirectFeatures are defined on.
            relationship: Relationship linking the two entities; features
                already in this relationship path are skipped.
            max_depth: Depth limit forwarded to ``_features_by_type``.
                Nothing is built when it is negative; ``None`` skips that
                check.

        Returns:
            None. New features are registered via ``_handle_new_feature``.
        """
        if max_depth is not None and max_depth < 0:
            return

        features = self._features_by_type(
            all_features=all_features,
            entity=parent_entity,
            variable_type=[Numeric, Categorical, Ordinal],
            max_depth=max_depth)

        for f in features:
            if self._feature_in_relationship_path([relationship], f):
                continue

            # Limits allowing direct features of agg_feats with where clauses.
            # The check must skip the *outer* iteration: a bare ``continue``
            # inside a nested loop over the dependencies would only advance
            # that nested loop and the feature would be built regardless.
            if isinstance(f, AggregationFeature):
                deep_base_features = [f] + f.get_dependencies(deep=True)
                if any(isinstance(feat, AggregationFeature) and
                       feat.where is not None
                       for feat in deep_base_features):
                    continue

            new_f = DirectFeature(f, child_entity)

            self._handle_new_feature(all_features=all_features,
                                     new_feature=new_f)
示例#30
0
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(entityset, backend):
    """
    Aggregate a direct feature that is itself built on an aggregation.

    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto products, pulled down to log as a
    DirectFeature, and then averaged onto customers with Mean.
    """
    logs_per_product = ft.Feature(entityset['log']['id'],
                                  parent_entity=entityset['products'],
                                  primitive=Count)
    product_purchases_feat = DirectFeature(logs_per_product,
                                           child_entity=entityset['log'])
    purchase_popularity = ft.Feature(product_purchases_feat,
                                     parent_entity=entityset['customers'],
                                     primitive=Mean)

    feature_matrix = backend([purchase_popularity]).calculate_all_features(
        instance_ids=[0], time_last=None)
    assert feature_matrix[purchase_popularity.get_name()][0] == 38.0 / 10.0