def test_training_window(entityset):
    """Training window limits how far back data is pulled per cutoff time."""
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # Include a direct feature to a higher-level agg so get_pandas_data_slice
    # sees multiple "filter eids" and the training_window pull loop runs
    # more than once.
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1)],
        'instance_id': [0, 1, 2],
    })

    # For now, this warns when the last_time_index is not present.
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    entityset.add_last_time_indexes()

    # An observation-based training window is rejected.
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            entityset,
            cutoff_time=cutoff_time,
            training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    expected_counts = [5, 5, 1]
    expected_dagg = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_approximate_dfeat_of_need_all_values(entityset):
    """An approximated dfeat built on a uses-full-entity feature (Percentile)
    matches values recomputed by hand at the approximate cutoff, while the
    non-approximated agg matches values recomputed at the true cutoff."""
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)

    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    # 10:31:19 rounds down to 10:31:10 with a 10s approximation window;
    # 11:00:00 is already on a window boundary.
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # .copy() so adding the 'percentile' column does not raise pandas'
        # SettingWithCopyWarning on a boolean-mask slice of log_df.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_approximate_dfeat_of_need_all_values(entityset):
    """Same check as the other test of this name but via the older
    instance_ids + list-of-datetimes cutoff API.

    NOTE(review): this redefines an earlier function of the same name, so
    only one of the two actually runs under pytest — confirm whether both
    variants are meant to be kept and rename one if so."""
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    feature_matrix = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])

    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # .copy() so adding the 'percentile' column does not raise pandas'
        # SettingWithCopyWarning on a boolean-mask slice of log_df.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

          R       C = Customers, the entity we're trying to predict on
         / \\      R = Regions, a parent of customers
        S   C     S = Stores, a child of regions
            |
           etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(entityset['stores']['id'])
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset['regions'])
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_stores_feat.get_name()][0] == 3
def test_cfm_no_cutoff_time_index(entityset):
    """With cutoff_time_in_index=False the matrix keeps a plain 'id' index."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])

    # Cutoffs far in the future: every row's data is fully available.
    cutoff_time = pd.DataFrame({
        'time': [datetime(2013, 4, 9, 10, 31, 19),
                 datetime(2013, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    # Cutoffs inside the data range: the approximated dfeat differs per row.
    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=cutoff_time)
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_arithmetic_of_direct(es):
    """Arithmetic primitives combine two direct features correctly."""
    rating = es['products']['rating']
    log_rating = DirectFeature(rating, child_entity=es['log'])
    customer_age = es['customers']['age']
    session_age = DirectFeature(customer_age, child_entity=es['sessions'])
    log_age = DirectFeature(session_age, child_entity=es['log'])

    # (primitive, expected values for instances [0, 3, 5, 7])
    cases = [(Add, [38, 37, 37.5, 37.5]),
             (Subtract, [28, 29, 28.5, 28.5]),
             (Multiply, [165, 132, 148.5, 148.5]),
             (Divide, [6.6, 8.25, 22. / 3, 22. / 3])]
    features = [prim(log_age, log_rating) for prim, _ in cases]

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 3, 5, 7],
                                               time_last=None)
    for feat, (_, expected) in zip(features, cases):
        assert df[feat.get_name()].values.tolist() == expected
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

          R       C = Customers, the entity we're trying to predict on
         / \\      R = Regions, a parent of customers
        S   C     S = Stores, a child of regions
            |
           etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(entityset['stores']['id'])
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset[u'régions'])
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_stores_feat.get_name()][0] == 3
def test_cfm_no_cutoff_time_index(entityset):
    """Same check via the older instance_ids + list-of-datetimes cutoff API."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])

    # Far-future cutoffs: all data visible for both rows.
    feature_matrix = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=False,
        approximate=Timedelta(12, 's'),
        cutoff_time=[datetime(2013, 4, 9, 10, 31, 19),
                     datetime(2013, 4, 9, 11, 0, 0)])
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    # Cutoffs inside the data range.
    feature_matrix_2 = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=False,
        approximate=Timedelta(10, 's'),
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_handles_training_window_correctly(entityset):
    """Exercises per-entity dict windows, string windows, and the rejection
    of observation-based windows."""
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset['regions'])

    cutoffs = [datetime(2011, 4, 9, 12, 31),
               datetime(2011, 4, 10, 11),
               datetime(2011, 4, 10, 13, 10, 1)]

    # Window keyed on the feature's target entity.
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window={'customers': '36 hours'})
    assert (feature_matrix[property_feature.get_name()] == [0, 5, 0]).values.all()

    # A direct feature to a higher-level agg gives multiple "filter eids"
    # in get_pandas_data_slice, so the training_window pull loop runs
    # more than once.
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window={'log': '2 hours'})
    assert (feature_matrix[property_feature.get_name()] == [5, 5, 1]).values.all()
    assert (feature_matrix[dagg.get_name()] == [3, 3, 3]).values.all()

    # Plain string window applied to everything.
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window='2 hours')
    assert (feature_matrix == [0, 0, 0]).values.all()

    # Observation-based windows are rejected.
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            instance_ids=[0, 1, 2],
            cutoff_time=list(cutoffs),
            training_window=Timedelta(2, 'observations', entity='log'))
def test_direct_rename(es):
    """Renaming a direct feature changes its name/hash but not its base
    feature or entity (same behavior as test_direct_from_identity)."""
    feat = DirectFeature(base_feature=es['sessions']['device_type'],
                         child_entity=es['log'])
    renamed = feat.rename("session_test")
    assert feat.hash() != renamed.hash()
    assert feat.get_name() != renamed.get_name()
    assert feat.base_features[0].generate_name() == renamed.base_features[0].generate_name()
    assert feat.entity == renamed.entity
def test_direct_from_variable(es):
    """A direct feature of a variable behaves like one of an identity feature."""
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])
    pandas_backend = PandasBackend(es, [direct])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 5],
                                               time_last=None)
    assert df[direct.get_name()].tolist() == [0, 1]
def test_make_dfeat(entityset, backend):
    """A direct feature pulls a parent (customer) value onto a child (session)."""
    age_on_session = DirectFeature(entityset['customers']['age'],
                                   child_entity=entityset['sessions'])
    pandas_backend = backend([age_on_session])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[age_on_session.get_name()][0] == 33
def test_direct_rename(es):
    """Renaming a direct feature changes its name/hash but not its base
    feature or entity (same behavior as test_direct_from_identity)."""
    feat = DirectFeature(base_feature=es['sessions']['device_type'],
                         child_entity=es['log'])
    renamed = feat.rename("session_test")
    assert feat.hash() != renamed.hash()
    assert feat.get_name() != renamed.get_name()
    assert feat.base_features[0]._get_name() == renamed.base_features[0]._get_name()
    assert feat.entity == renamed.entity
def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    """Approximation applies through a dfeat-of-agg chain down to 'log'."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
def test_uses_full_entity_feat_of_approximate(entityset):
    """Only features NOT feeding a uses-full-entity primitive get approximated.

    dfeat feeds Percentile (needs all values) so it must stay exact; dfeat2
    does not, so it may be approximated.
    """
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    cutoffs = [datetime(2011, 4, 9, 10, 31, 19),
               datetime(2011, 4, 9, 11, 0, 0)]

    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    # dfeat2 is approximated the same whether computed alone or together.
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    # The non-approximated features agree across all three runs.
    matrices = [feature_matrix_approx,
                feature_matrix_small_approx,
                feature_matrix_no_approx]
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations(matrices, 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_returns_correct_empty_default_values(entityset):
    """A cutoff before any data exists yields the agg's default (0), not NaN."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['customers'])
    dfeat = DirectFeature(agg_feat, es['sessions'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 11:00:00'),
                 pd.Timestamp('2011-04-09 11:00:00')],
        'instance_id': [0, 0],
    })
    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [0, 10]
def test_uses_full_entity_feat_of_approximate(entityset):
    """Same check as the other definition of this name, but without passing
    the entityset positionally to calculate_feature_matrix."""
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    cutoffs = [datetime(2011, 4, 9, 10, 31, 19),
               datetime(2011, 4, 9, 11, 0, 0)]

    # Only dfeat2 should be approximated, because Percentile needs all values.
    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    matrices = [feature_matrix_approx,
                feature_matrix_small_approx,
                feature_matrix_no_approx]
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations(matrices, 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_multiple_instances_per_cutoff_time(entityset):
    """With a one-week approximation window both cutoffs land in the same
    window; the approximated dfeat ends up all-NaN while the exact agg
    stays correct."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")
    assert feature_matrix.shape[0] == 2
    assert feature_matrix[dfeat.get_name()].dropna().shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(entityset, backend):
    """
    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Log
       \\ /     P = Products, a parent of Log which is not a descendent of customers
        L

    We're trying to calculate a DFeat from L to P on an agg_feat of P on L,
    and then aggregate it with another agg_feat of C on L.
    """
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['products'])
    product_purchases_feat = DirectFeature(log_count_feat,
                                           child_entity=entityset['log'])
    purchase_popularity = Mean(product_purchases_feat,
                               parent_entity=entityset['customers'])

    pandas_backend = backend([purchase_popularity])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[purchase_popularity.get_name()][0] == 38.0 / 10.0
def test_cutoff_time_naming(entityset):
    """Cutoff columns may be named (instance_id|id) and (time|cutoff_time);
    anything else raises."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 10:30:00'),
                 pd.Timestamp('2011-04-09 10:30:06')],
        'instance_id': [0, 0]
    })
    renamed_variants = [
        cutoff_df.rename(columns={"instance_id": "id"}),
        cutoff_df.rename(columns={"time": "cutoff_time"}),
        cutoff_df.rename(columns={"instance_id": "id",
                                  "time": "cutoff_time"}),
    ]
    cutoff_df_wrong_index_name = cutoff_df.rename(
        columns={"instance_id": "wrong_id"})

    fm1 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df)
    # All accepted namings produce the same matrix.
    for test_cutoff in renamed_variants:
        fm2 = calculate_feature_matrix([dfeat], cutoff_time=test_cutoff)
        assert all((fm1 == fm2.values).values)

    with pytest.raises(AttributeError):
        calculate_feature_matrix([dfeat],
                                 cutoff_time=cutoff_df_wrong_index_name)
def test_approximate_time_split_returns_the_same_result(entityset):
    """Calculating all cutoffs at once equals calculating them one at a time."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-09 10:07:30'),
                 pd.Timestamp('2011-04-09 10:07:40')],
        'instance_id': [0, 0]
    })
    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)

    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make the two frames' indexes differ. Note that this step is
    # unnecessary and is done to showcase the issue here.
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]

    divided_matrices = []
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)

    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index,
                      feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c],
                          feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_cutoff_time_extra_columns(entityset):
    """Extra cutoff columns (e.g. labels) are carried through, appended last,
    and sorted by time along with the rest of the matrix."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])

    fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df)
    # The extra column is appended at the end of the matrix…
    assert 'label' == fm.columns[-1]
    # …and is sorted by time like the rest of the feature matrix.
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    entityset,
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # Same carries through with approximation on.
    assert 'label' in fm_2.columns
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()
def test_make_compare_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is less than 3
    """
    Count.max_stack_depth = 2
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])
    mean_agg_feat = Mean(log_count_feat,
                         parent_entity=entityset['customers'])
    mean_feat = DirectFeature(mean_agg_feat,
                              child_entity=entityset['sessions'])
    feat = log_count_feat > mean_feat

    pandas_backend = backend([feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    v0, v1, v2 = df[feat.get_name()][0:3]
    assert v0
    assert v1
    assert not v2
def test_diff(es):
    """Diff grouped by session_id vs. by a direct customer_id feature."""
    value = IdentityFeature(es['log']['value'])
    customer_id_feat = DirectFeature(es['sessions']['customer_id'],
                                     child_entity=es['log'])
    diff1 = Diff(value, es['log']['session_id'])
    diff2 = Diff(value, customer_id_feat)

    pandas_backend = PandasBackend(es, [diff1, diff2])
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)

    val1 = df[diff1.get_name()].values.tolist()
    val2 = df[diff2.get_name()].values.tolist()
    correct_vals1 = [np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1,
                     np.nan, np.nan, 5, np.nan, 7, 7]
    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3,
                     np.nan, 5, -5, 7, 7]

    # Element-wise compare; NaN != NaN, so check NaN positions explicitly.
    for i in range(len(val1)):
        v1, v2 = val1[i], val2[i]
        if np.isnan(v1):
            assert np.isnan(correct_vals1[i])
        else:
            assert v1 == correct_vals1[i]
        if np.isnan(v2):
            assert np.isnan(correct_vals2[i])
        else:
            assert v2 == correct_vals2[i]
def test_empty_path_approximate_partial(entityset):
    """A session with a NaN customer_id yields NaN for the approximated
    dfeat but leaves the exact agg intact."""
    es = copy.deepcopy(entityset)
    # Break the path for session 2 by removing its customer link.
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[dfeat.get_name()].tolist()
    assert vals1[0] == 7
    assert np.isnan(vals1[1])
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_approximate_child_aggs_handled_correctly(entityset):
    """Adding an exact child agg alongside an approximated dfeat does not
    change either feature's values."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 10:30:00'),
                 pd.Timestamp('2011-04-09 10:30:06')],
        'instance_id': [0, 0]
    })
    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    fm_2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                    approximate=Timedelta(10, 's'),
                                    cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]
def test_approx_base_feature_is_also_first_class_feature(entityset):
    """A feature that is both requested directly and is a base of an
    approximated feature is still computed exactly."""
    es = entityset
    log_to_products = DirectFeature(es['products']['rating'], es['log'])
    # This one should still be computed properly.
    agg_feat = Min(log_to_products, es['sessions'])
    customer_agg_feat = Sum(agg_feat, es['customers'])
    # This one is to be approximated.
    sess_to_cust = DirectFeature(customer_agg_feat, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([sess_to_cust, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[sess_to_cust.get_name()].tolist() == [8.5, 7]
    assert feature_matrix[agg_feat.get_name()].tolist() == [4, 1.5]
def test_training_window_recent_time_index(entityset):
    """A customer with no sessions still gets sensible windowed values."""
    # Append a customer (id 3) with no sessions.
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cohort_name': ["Late Adopters"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    new_row = pd.DataFrame(row)
    new_row.index = range(3, 4)
    combined = entityset['customers'].df.append(new_row, sort=False)
    entityset['customers'].update_data(df=combined,
                                       recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1),
                 datetime(2011, 4, 10, 1, 59, 59)],
        'instance_id': [0, 1, 2, 3],
    })
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    feature_matrix.sort_index(inplace=True)

    expected_counts = [5, 5, 1, 0]
    expected_dagg = [3, 2, 1, 3]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS keeps seed features and one level on top of them, but not
    features two levels beyond a seed."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # This feature is depth 2 relative to the seed (session_agg), which
    # exceeds max_depth=1, so DFS must not build it.
    session_agg_trans = DirectFeature(Count(session_agg, es['customers']),
                                      es['sessions'])

    dfs_obj = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log])
    feature_names = [f.get_name() for f in dfs_obj.build_features()]

    assert seed_feature_sessions.get_name() in feature_names
    assert session_agg.get_name() in feature_names
    assert session_agg_trans.get_name() not in feature_names
def test_training_window_recent_time_index(entityset):
    """A customer with no sessions still gets sensible windowed values
    (variant with the 'cancel_reason' column in the appended row)."""
    # Append a customer (id 3) with no sessions.
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cancel_reason': ["I am finally awake!!"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    new_row = pd.DataFrame(row)
    new_row.index = range(3, 4)
    combined = entityset['customers'].df.append(new_row, sort=False)
    entityset['customers'].update_data(df=combined,
                                       recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1),
                 datetime(2011, 4, 10, 1, 59, 59)],
        'instance_id': [0, 1, 2, 3],
    })
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    feature_matrix.sort_index(inplace=True)

    expected_counts = [5, 5, 1, 0]
    expected_dagg = [3, 2, 1, 3]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_cfm_returns_original_time_indexes(entityset):
    """With cutoff_time_in_index=True the matrix index preserves the original
    (instance_id, time) pairs, sorted by time — with and without
    approximation, with and without unapproximated aggs."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0]})
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'],
                                      kind='mergesort')

    def _check_index(fm):
        # Level 0 is instance id, level 1 is the original cutoff time.
        instance_level_vals = fm.index.get_level_values(0).values
        time_level_vals = fm.index.get_level_values(1).values
        assert (instance_level_vals == sorted_df['instance_id'].values).all()
        assert (time_level_vals == sorted_df['time'].values).all()

    # No approximation.
    fm = calculate_feature_matrix([dfeat],
                                  entityset,
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    _check_index(fm)

    # Approximate, cutoffs fall in different windows, no unapproximated aggs.
    fm2 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    _check_index(fm2)

    # Approximate, different windows, with unapproximated aggs.
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    _check_index(fm2)

    # Approximate, all cutoffs in the same window, no unapproximated aggs.
    fm3 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    _check_index(fm3)

    # Approximate, same window, with unapproximated aggs.
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    _check_index(fm3)
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS keeps seed features and one level on top of them, but not
    features two levels beyond a seed."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # This feature is depth 2 relative to the seed (session_agg), which
    # exceeds max_depth=1, so DFS must not build it.
    session_agg_trans = DirectFeature(Count(session_agg, es['customers']),
                                      es['sessions'])

    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Count],
                                   trans_primitives=[],
                                   max_depth=1,
                                   seed_features=[seed_feature_sessions,
                                                  seed_feature_log])
    feature_names = [f.get_name() for f in dfs_obj.build_features()]

    assert seed_feature_sessions.get_name() in feature_names
    assert session_agg.get_name() in feature_names
    assert session_agg_trans.get_name() not in feature_names
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = Count(entityset['customers']['id'],
                                parent_entity=entityset[u'régions'])
    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=entityset['customers'])

    pandas_backend = backend([num_customers_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_customers_feat.get_name()][0] == 3
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = Count(entityset['customers']['id'],
                                parent_entity=entityset['regions'])
    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=entityset['customers'])

    pandas_backend = backend([num_customers_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_customers_feat.get_name()][0] == 3
def test_compare_of_direct(es):
    """Comparison primitives against a scalar work on direct features."""
    log_rating = DirectFeature(es['products']['rating'],
                               child_entity=es['log'])
    # (primitive, expected values for instances [0, 1, 2, 3] vs. 4.5)
    cases = [(Equals, [False, False, False, False]),
             (NotEquals, [True, True, True, True]),
             (LessThan, [False, False, False, True]),
             (LessThanEqualTo, [False, False, False, True]),
             (GreaterThan, [True, True, True, False]),
             (GreaterThanEqualTo, [True, True, True, False])]
    features = [prim(log_rating, 4.5) for prim, _ in cases]

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3],
                                               time_last=None)
    for feat, (_, expected) in zip(features, cases):
        assert df[feat.get_name()].values.tolist() == expected