def test_training_window(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # make sure we have a feature that is a direct feature of a higher level agg,
    # so we have multiple "filter eids" in get_pandas_data_slice,
    # and we go through the loop to pull data with a training_window param more than once
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    # for now, warns if last_time_index not present
    times = [datetime(2011, 4, 9, 12, 31),
             datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 1, 2]})
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    entityset.add_last_time_indexes()

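    # a training window given in 'observations' is relative rather than absolute,
    # which is not supported here, hence the AssertionError (an assumption based on the expected error below)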
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset,
                                                  cutoff_time=cutoff_time,
                                                  training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    prop_values = [5, 5, 1]
    dagg_values = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()
def test_saveprogress(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10
    save_progress = tempfile.mkdtemp()
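    # save_progress names a directory where each intermediate feature matrix chunk
    # is written out as a CSV (the files are read back and verified below)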
    fm_save = calculate_feature_matrix([property_feature],
                                       entityset,
                                       cutoff_time=cutoff_time,
                                       save_progress=save_progress)
    _, _, files = next(os.walk(save_progress))
    files = [os.path.join(save_progress, file) for file in files]
    # there are 17 cutoff times above, so 17 files are created
    assert len(files) == 17
    list_df = []
    for file_ in files:
        df = pd.read_csv(file_, index_col="id", header=0)
        list_df.append(df)
    merged_df = pd.concat(list_df)
    merged_df.set_index(pd.DatetimeIndex(times), inplace=True, append=True)
    fm_no_save = calculate_feature_matrix([property_feature],
                                          entityset,
                                          cutoff_time=cutoff_time)
    assert np.all((merged_df.sort_index().values) == (fm_save.sort_index().values))
    assert np.all((fm_no_save.sort_index().values) == (fm_save.sort_index().values))
    assert np.all((fm_no_save.sort_index().values) == (merged_df.sort_index().values))
    shutil.rmtree(save_progress)
def test_cutoff_time_extra_columns(entityset):
    es = entityset

    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])
    fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df)
    # check column was added to end of matrix
    assert 'label' == fm.columns[-1]
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    entityset,
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # check column was added to end of matrix
    assert 'label' in fm_2.columns
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()
def test_string_time_values_in_cutoff_time(entityset):
    times = ['2011-04-09 10:31:27', '2011-04-09 10:30:18']
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 0]})
    agg_feature = Sum(entityset['log']['value'], entityset['customers'])

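    # string time values in the cutoff time frame are not coerced to datetimes
    # in this version, hence the expected TypeError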
    with pytest.raises(TypeError):
        calculate_feature_matrix([agg_feature], entityset, cutoff_time=cutoff_time)
def test_cfm_no_cutoff_time_index(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])
    cutoff_time = pd.DataFrame({
        'time': [datetime(2013, 4, 9, 10, 31, 19), datetime(2013, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
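    # approximate batches nearby cutoff times into windows so parent-level
    # aggregations are computed once per window rather than once per cutoff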
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=cutoff_time)
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_dask_persisted_entityset(entityset, capsys):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
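        # a fractional chunk_size is treated as a share of the rows:
        # each chunk covers roughly 13% of the cutoff times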
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        assert (feature_matrix == labels).values.all()
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        captured = capsys.readouterr()
        assert "Using EntitySet persisted on the cluster as dataset " in captured[0]
        assert (feature_matrix == labels).values.all()
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df, index="id", time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = Count(es["child"]['id'], es["parent"])

    # create agg feature that requires multiple arguments
    trend = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"])

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = Count(es["child"]['id'], es["parent"], where=where)
    trend_where = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"], where=where)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es, features=[count, count_where, trend, trend_where], cutoff_time=pd.Timestamp("12/31/2017"))
    names = [count.get_name(), count_where.get_name(), trend.get_name(), trend_where.get_name()]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es, features=[count_where, trend_where], cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
def test_approximate_time_split_returns_the_same_result(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:07:30'),
                                       pd.Timestamp('2011-04-09 10:07:40')],
                              'instance_id': [0, 0]})

    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)
    divided_matrices = []
    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make sure indexes are different
    # Note that this step is unnecessary and is done to showcase the issue here
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)
    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index, feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c], feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_integer_time_index_datetime_cutoffs(int_es):
    times = [datetime.now()] * 17
    cutoff_df = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df,
                                 cutoff_time_in_index=True)
def test_cfm_returns_original_time_indexes(entityset):
    es = entityset

    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0]})
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort')

    # no approximate
    fm = calculate_feature_matrix([dfeat],
                                  entityset, cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    instance_level_vals = fm.index.get_level_values(0).values
    time_level_vals = fm.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in different windows, no unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    instance_level_vals = fm2.index.get_level_values(0).values
    time_level_vals = fm2.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in different windows, unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    instance_level_vals = fm2.index.get_level_values(0).values
    time_level_vals = fm2.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in same window, no unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    instance_level_vals = fm3.index.get_level_values(0).values
    time_level_vals = fm3.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in same window, unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2], entityset, cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    instance_level_vals = fm3.index.get_level_values(0).values
    time_level_vals = fm3.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()
def test_datetime_index_mixed_cutoff(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [17] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({'time': times,
                              'instance_id': instances,
                              'labels': labels})
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    times[9] = "foobar"
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    cutoff_df['time'].iloc[9] = '2018-04-02 18:50:45.453216'
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    times[9] = '17'
    cutoff_df['time'] = times
    with pytest.raises(ValueError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)
def test_integer_time_index_mixed_cutoff(int_es):
    times_dt = list(range(8, 17)) + [datetime(2011, 1, 1), 19, 20, 21, 22, 25, 24, 23]
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [False] * 2 + [True]
    instances = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15, 14]
    cutoff_df = pd.DataFrame({'time': times_dt,
                              'instance_id': instances,
                              'labels': labels})
    cutoff_df = cutoff_df[['time', 'instance_id', 'labels']]
    property_feature = IdentityFeature(int_es['log']['value']) > 10

    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    times_str = list(range(8, 17)) + ["foobar", 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    times_date_str = list(range(8, 17)) + ['2018-04-02', 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_date_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)

    times_int_str = list(range(8, 17)) + ['17', 19, 20, 21, 22, 25, 24, 23]
    cutoff_df['time'] = times_int_str
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 cutoff_time=cutoff_df)
def test_uses_full_entity_feat_of_approximate(entityset):
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    # only dfeat2 should be approximated
    # because Percentile needs all values

    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations([feature_matrix_approx,
                                      feature_matrix_small_approx,
                                      feature_matrix_no_approx], 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_dfeat_of_need_all_values(entityset):
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'), pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'), pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        log_data_cutoff = log_df[log_df['datetime'] < cutoff]
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx]
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))
    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_custom_primitive_time_as_arg(es):
    def time_since_last(values, time):
        time_since = time - values.iloc[0]
        return time_since.total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       uses_calc_time=True)
    assert TimeSinceLast.name == "time_since_last"
    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = ft.calculate_feature_matrix([f],
                                     entityset=es,
                                     instance_ids=[0, 1, 2],
                                     cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)

    error_text = "'time' is a restricted keyword.  Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_agg_primitive(time_since_last,
                           [DatetimeTimeIndex],
                           Numeric,
                           uses_calc_time=False)
def test_cutoff_time_correctly(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    feature_matrix = calculate_feature_matrix([property_feature], instance_ids=[0, 1, 2],
                                              cutoff_time=[datetime(2011, 4, 10), datetime(2011, 4, 11),
                                                           datetime(2011, 4, 7)])
    labels = [0, 10, 5]
    assert (feature_matrix == labels).values.all()
def test_time_since_last_custom(es):
    def time_since_last(values, time=None):
        time_since = time - values.iloc[0]
        return time_since.total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       name="time_since_last",
                                       uses_calc_time=True)
    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  entityset=es,
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)

    with pytest.raises(ValueError):
        TimeSinceLast = make_agg_primitive(time_since_last,
                                           [DatetimeTimeIndex],
                                           Numeric,
                                           uses_calc_time=False)
def test_encode_features_handles_pass_columns(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])
    f2 = IdentityFeature(entityset["log"]["value"])

    features = [f1, f2]
    cutoff_time = pd.DataFrame({'instance_id': range(6),
                                'time': entityset['log'].df['datetime'][0:6],
                                'label': [i % 2 for i in range(6)]},
                               columns=["instance_id", "time", "label"])
    feature_matrix = calculate_feature_matrix(features, entityset, cutoff_time)

    assert 'label' in feature_matrix.columns

    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    feature_matrix_encoded_shape = feature_matrix_encoded.shape

    # to_encode should keep product_id as a string, and not create 3 additional columns
    to_encode = []
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    to_encode = ['value']
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, to_encode=to_encode)
    assert feature_matrix_encoded_shape != feature_matrix_encoded.shape

    assert 'label' in feature_matrix_encoded.columns
def test_median(es):
    f = Median(es["log"]["value_many_nans"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))

    correct = [1, 3, np.nan]
    np.testing.assert_equal(fm[f.get_name()].values, correct)
def test_approximate_child_aggs_handled_correctly(entityset):
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 10:30:00'),
                                       pd.Timestamp('2011-04-09 10:30:06')],
                              'instance_id': [0, 0]})

    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    fm_2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                    approximate=Timedelta(10, 's'),
                                    cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]
def test_cfm_approximate_correct_ordering():
    trips = {
        'trip_id': list(range(1000)),
        'flight_time': [datetime(1998, 4, 2)] * 350 + [datetime(1997, 4, 3)] * 650,
        'flight_id': [randint(1, 25) for i in range(1000)],
        'trip_duration': [randint(1, 999) for i in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0],
                                  AggregationPrimitive)]
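    # keep only the direct features built on aggregations of the normalized flights entity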
    property_feature = IdentityFeature(es['trips']['trip_id'])
    # direct_agg_feat = DirectFeature(Sum(es['trips']['trip_duration'],
    #                                     es['flights']),
    #                                 es['trips'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])
    feature_matrix = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert(np.all(feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values == feature_matrix[['trip_id', 'flight_time']].values))
    feature_matrix_2 = calculate_feature_matrix(flight_features + [property_feature, time_feature],
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert(np.all(feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values == feature_matrix_2[['trip_id', 'flight_time']].values))
    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
def test_time_since_last(es):
    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)
def test_parallel_failure_raises_correct_error(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with pytest.raises(AssertionError):
        calculate_feature_matrix([property_feature],
                                 entityset=entityset,
                                 cutoff_time=cutoff_time,
                                 verbose=True,
                                 chunk_size=.13,
                                 n_jobs=0,
                                 approximate='1 hour')
def test_cutoff_time_correctly(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    times = [datetime(2011, 4, 10), datetime(2011, 4, 11), datetime(2011, 4, 7)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 1, 2]})
    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              cutoff_time=cutoff_time)
    labels = [0, 10, 5]
    assert (feature_matrix == labels).values.all()
    def head(self, n=10, cutoff_time=None):
        """See the first n values of this feature

        Args:
            n (int) : Number of instances to return.
            cutoff_time (optional) : Time to calculate the feature values at.

        Returns:
            :class:`pd.DataFrame` : Pandas DataFrame
        """
        from featuretools import calculate_feature_matrix
        cfm = calculate_feature_matrix([self], cutoff_time=cutoff_time).head(n)
        return cfm
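
# A minimal usage sketch for head() (hypothetical names; assumes a feature bound to an entityset):
#   feature = Count(es['log']['id'], es['customers'])
#   feature.head(3, cutoff_time=pd.Timestamp('2011-04-09 10:30:00'))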
def test_cutoff_time_naming(entityset):
    es = entityset

    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 10:30:00'),
                                       pd.Timestamp('2011-04-09 10:30:06')],
                              'instance_id': [0, 0]})
    cutoff_df_index_name = cutoff_df.rename(columns={"instance_id": "id"})
    cutoff_df_time_name = cutoff_df.rename(columns={"time": "cutoff_time"})
    cutoff_df_index_name_time_name = cutoff_df.rename(columns={"instance_id": "id", "time": "cutoff_time"})
    cutoff_df_wrong_index_name = cutoff_df.rename(columns={"instance_id": "wrong_id"})

    fm1 = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df)
    for test_cutoff in [cutoff_df_index_name, cutoff_df_time_name, cutoff_df_index_name_time_name]:
        fm2 = calculate_feature_matrix([dfeat], entityset, cutoff_time=test_cutoff)

        assert all((fm1 == fm2.values).values)

    with pytest.raises(AttributeError):
        calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df_wrong_index_name)
def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
def test_calc_feature_matrix(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2

    property_feature = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              instance_ids=range(17),
                                              cutoff_time=times,
                                              verbose=True)

    assert (feature_matrix == labels).values.all()

    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix('features', instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([], instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([1, 2, 3], instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 instance_ids=range(17),
                                 cutoff_time=17)
def test_approximate_returns_correct_empty_default_values(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['customers'])
    dfeat = DirectFeature(agg_feat, es['sessions'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 11:00:00'),
                                       pd.Timestamp('2011-04-09 11:00:00')],
                              'instance_id': [0, 0]})

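    # the first cutoff predates all the data, so the approximated direct feature
    # falls back to Count's default value of 0 (see the [0, 10] assertion below)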
    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [0, 10]
def test_inplace_encodes_features(entityset):
    f1 = IdentityFeature(entityset["log"]["product_id"])

    features = [f1]
    feature_matrix = calculate_feature_matrix(features, entityset, instance_ids=[0, 1, 2, 3, 4, 5])

    feature_matrix_shape = feature_matrix.shape
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features)
    assert feature_matrix_encoded.shape != feature_matrix_shape
    assert feature_matrix.shape == feature_matrix_shape

    # inplace they should be the same
    feature_matrix_encoded, features_encoded = encode_features(feature_matrix, features, inplace=True)
    assert feature_matrix_encoded.shape == feature_matrix.shape
    def calculate_feature_matrix(self, X, target_entity=None, entityset=None,
                                 entities=None, relationships=None):

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

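        # training windows rely on last time indexes to determine which rows
        # of each entity are still within the window at a given cutoff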
        if self.training_window is not None:
            entityset.add_last_time_indexes()

        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]
            cutoff_time = cutoff_time.rename(columns={self.time_index: 'time'})

        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        return X
def test_count_null_and_make_agg_primitive(es):
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0

        if count_null:
            values = values.fillna(0)

        return values.count()

    def count_generate_name(self):
        where_str = self._where_str()
        use_prev_str = self._use_prev_str()
        return u"COUNT(%s%s%s)" % (self.child_entity.name,
                                   where_str,
                                   use_prev_str)

    Count = make_agg_primitive(count_func, [[Index], [Variable]], Numeric,
                               name="count", stack_on_self=False,
                               cls_attributes={"generate_name": count_generate_name})
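    # count_null=True is forwarded to count_func, so NaN values are filled and included in the count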
    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    assert (values == feature_matrix[count_null.get_name()]).all()
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:

        if self._features is None:
            raise ValueError('Must call fit() before calling produce()')

        if not isinstance(inputs, Dataset):
            raise ValueError('Inputs to produce() must be a Dataset')

        features = self._features

        parsed = self._parse_inputs(
            inputs,
            entities_to_normalize=self._entities_normalized,
            # original_entityset=self._entityset,
            parse_target=False)

        entityset = parsed['entityset']
        target = self._target
        instance_ids = parsed['instance_ids']

        feature_matrix = ft.calculate_feature_matrix(
            features,
            entityset=entityset,
            instance_ids=instance_ids,
            cutoff_time_in_index=False)

        fm_with_metadata = self._format_fm_after_cfm(feature_matrix,
                                                     instance_ids, features,
                                                     target, entityset,
                                                     inputs.metadata)

        return CallResult(fm_with_metadata)
def test_arithmetic_of_identity(es):
    logs = es['log']

    to_test = [(AddNumeric, [0., 7., 14., 21.]),
               (SubtractNumeric, [0, 3, 6, 9]),
               (MultiplyNumeric, [0, 10, 40, 90]),
               (DivideNumeric, [np.nan, 2.5, 2.5, 2.5])]

    features = []
    for test in to_test:
        features.append(
            ft.Feature([logs['value'], logs['value_2']], primitive=test[0]))

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2, 3])

    for i, test in enumerate(to_test[:-1]):
        v = df[features[i].get_name()].values.tolist()
        assert v == test[1]
    i, test = 3, to_test[-1]
    v = df[features[i].get_name()].values.tolist()
    assert (np.isnan(v[0]))
    assert v[1:] == test[1][1:]
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        es = self._make_entityset(inputs.copy())

        fm = ft.calculate_feature_matrix(
            entityset=es,
            features=self.features,
            chunk_size=self.chunk_size
        )

        # make sure the feature matrix is ordered the same as the input
        fm = fm.reindex(es[self._target_resource_id].df.index)
        fm = fm.reset_index(drop=True)  # d3m wants index to increment by 1

        # treat inf as null like fit step
        fm = fm.replace([np.inf, -np.inf], np.nan)

        # TODO: revisit this metadata handling
        fm = add_metadata(fm, self.features)
        fm = self._add_labels(fm, inputs)

        return CallResult(fm)
def test_compare_of_agg(es):
    count_logs = ft.Feature(es['log']['id'],
                            parent_entity=es['sessions'],
                            primitive=Count)

    to_test = [(EqualScalar, [False, False, False, True]),
               (NotEqualScalar, [True, True, True, False]),
               (LessThanScalar, [False, False, True, False]),
               (LessThanEqualToScalar, [False, False, True, True]),
               (GreaterThanScalar, [True, True, False, False]),
               (GreaterThanEqualToScalar, [True, True, False, True])]

    features = []
    for test in to_test:
        features.append(ft.Feature(count_logs, primitive=test[0](2)))

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=[0, 1, 2, 3])
    df = to_pandas(df, index='id', sort_index=True)

    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test[1]
def test_arithmetic_of_val(es):
    to_test = [(AddNumericScalar, [2.0, 7.0, 12.0, 17.0]),
               (SubtractNumericScalar, [-2.0, 3.0, 8.0, 13.0]),
               (ScalarSubtractNumericFeature, [2.0, -3.0, -8.0, -13.0]),
               (MultiplyNumericScalar, [0, 10, 20, 30]),
               (DivideNumericScalar, [0, 2.5, 5, 7.5]),
               (DivideByFeature, [np.inf, 0.4, 0.2, 2 / 15.0])]

    features = []
    for test in to_test:
        features.append(ft.Feature(es['log']['value'], primitive=test[0](2)))

    features.append(ft.Feature(es['log']['value']) / 0)

    df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2, 3])

    for f, test in zip(features, to_test):
        v = df[f.get_name()].values.tolist()
        assert v == test[1]

    test = [np.nan, np.inf, np.inf, np.inf]
    v = df[features[-1].get_name()].values.tolist()
    assert (np.isnan(v[0]))
    assert v[1:] == test[1:]
def test_get_filepath(es):
    class Mod4(TransformPrimitive):
        '''Return base feature modulo 4'''
        name = "mod4"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            filepath = self.get_filepath("featuretools_unit_test_example.csv")
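            # get_filepath resolves the filename within the primitive data folder configured for featuretools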
            reference = pd.read_csv(filepath, header=None, squeeze=True)

            def map_to_word(x):
                def _map(x):
                    if pd.isnull(x):
                        return x
                    return reference[int(x) % 4]
                return pd.Series(x).apply(_map)
            return map_to_word

    feat = ft.Feature(es['log']['value'], primitive=Mod4)
    df = ft.calculate_feature_matrix(features=[feat],
                                     entityset=es,
                                     instance_ids=range(17))

    assert pd.isnull(df["MOD4(value)"][15])
    assert df["MOD4(value)"][0] == 0
    assert df["MOD4(value)"][14] == 2

    fm, fl = ft.dfs(entityset=es,
                    target_entity="log",
                    agg_primitives=[],
                    trans_primitives=[Mod4])

    assert fm["MOD4(value)"][0] == 0
    assert fm["MOD4(value)"][14] == 2
    assert pd.isnull(fm["MOD4(value)"][15])
def test_boolean_multiply(boolean_mult_es):
    es = boolean_mult_es
    to_test = [('numeric', 'numeric'), ('numeric', 'bool'),
               ('bool', 'numeric'), ('bool', 'bool')]
    features = []
    for row in to_test:
        features.append(
            ft.Feature(es["test"][row[0]]) * ft.Feature(es["test"][row[1]]))

    fm = ft.calculate_feature_matrix(entityset=es, features=features)

    if isinstance(fm, dd.DataFrame):
        fm = fm.compute()

    df = es['test'].df
    if isinstance(df, dd.DataFrame):
        df = df.compute()

    for row in to_test:
        col_name = '{} * {}'.format(row[0], row[1])
        if row[0] == 'bool' and row[1] == 'bool':
            assert fm[col_name].equals(df[row[0]] & df[row[1]])
        else:
            assert fm[col_name].equals(df[row[0]] * df[row[1]])
def test_binary_encoding():
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='binary')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    encoder = BinaryEnc(fitted_encoder=enc, category='product_id')
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    encoded_results = [[0, 0, 0, 0],
                       [1, 1, 0, 0],
                       [0, 1, 1, 1]]
    assert (encoded == encoded_results).all()

    product_feature = ft.Feature([f1], primitive=BinaryEnc(enc, 0))
    cc_feature = ft.Feature([f4], primitive=BinaryEnc(enc, 1))
    features = [product_feature, f2, f3, cc_feature]
    assert len(features) == len(enc.get_features())
    # __eq__ does not support multioutput columns yet
    for i in range(len(enc.get_features())):
        assert features[i].unique_name() == enc.get_features()[i].unique_name()

    features = enc.get_features()
    feature_matrix = ft.calculate_feature_matrix(features, es, instance_ids=ids)
    assert (fm_encoded == feature_matrix).all().all()
def test_text_primitives(es):
    words = ft.Feature(es['log']['comments'], primitive=NumWords)
    chars = ft.Feature(es['log']['comments'], primitive=NumCharacters)

    features = [words, chars]

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     instance_ids=range(15))

    word_counts = [
        514, 3, 3, 644, 1268, 1269, 177, 172, 79, 240, 1239, 3, 3, 3, 3
    ]
    char_counts = [
        3392, 10, 10, 4116, 7961, 7580, 992, 957, 437, 1325, 6322, 10, 10, 10,
        10
    ]
    word_values = df[words.get_name()].values
    char_values = df[chars.get_name()].values
    assert len(word_values) == 15
    for i, v in enumerate(word_values):
        assert v == word_counts[i]
    for i, v in enumerate(char_values):
        assert v == char_counts[i]
def test_calc_feature_matrix(entityset):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9)
                  for i in range(4)] + [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2

    property_feature = IdentityFeature(entityset['log']['value']) > 10

    feature_matrix = calculate_feature_matrix([property_feature],
                                              entityset,
                                              instance_ids=range(17),
                                              cutoff_time=times,
                                              verbose=True)

    assert (feature_matrix == labels).values.all()

    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix('features',
                                                  entityset,
                                                  instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([],
                                                  entityset,
                                                  instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([1, 2, 3],
                                                  entityset,
                                                  instance_ids=range(17),
                                                  cutoff_time=times)
    with pytest.raises(TypeError):
        calculate_feature_matrix([property_feature],
                                 entityset,
                                 instance_ids=range(17),
                                 cutoff_time=17)
def build_transaction_data():
    """ Builds a data set from raw card and transaction data
        using the featuretools package.

        The resulting data set will be strictly concerned
        with transactions shown in the historical transactions CSV,
        and linking them to the proper card.

        :return:    training, testing feature matrices
    """

    logger = logging.getLogger(__name__)
    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    customer_df.drop(columns='target', inplace=True)

    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)

    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)

    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)

    del customer_df
    gc.collect()

    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])

    es_train = es_train.add_relationship(relationship)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')

    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)

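    # round-trip the feature definitions through save/load so the identical
    # features can be calculated on the test entity set below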
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)

    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)

    es_test = es_test.add_relationship(relationship)

    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)

    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])

        if (old_min == old_max):
            logger.debug(f"Droping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)

            continue

        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))

        assert col in test_feature_matrix_enc.columns

        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))

    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
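
# Hedged sketch of the normalize_series helper referenced above; its real
# definition lives elsewhere in this project. It is assumed to be plain
# min-max scaling into [0, 1], reusing the *training* min/max so the train
# and test columns share the same transform.
def normalize_series(series, min_max):
    old_min, old_max = min_max
    return (series - old_min) / (old_max - old_min)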
Example #46
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    if not isinstance(parent_df, pd.DataFrame):
        parent_vtypes = {'id': variable_types.Index}
        child_vtypes = {
            'id': variable_types.Index,
            'parent_id': variable_types.Numeric,
            'time_index': variable_types.Datetime,
            'value': variable_types.Numeric,
            'cat': variable_types.Categorical
        }
    else:
        parent_vtypes = None
        child_vtypes = None
    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id",
                             variable_types=parent_vtypes)
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index",
                             variable_types=child_vtypes)
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"]['cat'],
                               parent_entity=es["parent"],
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"]['cat'],
                                     parent_entity=es["parent"],
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [
            count, count_where, trend, trend_where, n_most_common,
            n_most_common_where
        ]
        names = [
            count.get_name(),
            count_where.get_name(),
            trend.get_name(),
            trend_where.get_name(), *n_most_common.get_feature_names(),
            *n_most_common_where.get_feature_names()
        ]
        values = [
            0, 0, np.nan, np.nan,
            *np.full(n_most_common.number_output_features, np.nan),
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count, count_where]
        names = [count.get_name(), count_where.get_name()]
        values = [0, 0]

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    assert_array_equal(fm[names], [values])

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        names = [
            count_where.get_name(),
            trend_where.get_name(), *n_most_common_where.get_feature_names()
        ]
        values = [
            0, np.nan,
            *np.full(n_most_common_where.number_output_features, np.nan)
        ]
    else:
        features = [count_where]
        names = [count_where.get_name()]
        values = [0]

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    assert_array_equal(fm2[names], [values])
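
# Note on the expected values above: with an empty child frame, Count falls
# back to its default_value of 0, while Trend and NMostCommon have no default
# and yield NaN. A hedged one-line check (attribute path assumed):
# from featuretools.primitives import Count; assert Count.default_value == 0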
Example #47
def test_make_agg_feat_where_different_identity_feat(es):
    feats = []
    where_cmps = [
        LessThanScalar,
        GreaterThanScalar,
        LessThanEqualToScalar,
        GreaterThanEqualToScalar,
        EqualScalar,
        NotEqualScalar,
    ]
    for where_cmp in where_cmps:
        feats.append(
            ft.Feature(
                es["log"].ww["id"],
                parent_dataframe_name="sessions",
                where=ft.Feature(
                    es["log"].ww["datetime"],
                    primitive=where_cmp(datetime(2011, 4, 10, 10, 40, 1)),
                ),
                primitive=Count,
            ))

    df = ft.calculate_feature_matrix(entityset=es,
                                     features=feats,
                                     instance_ids=[0, 1, 2, 3])
    df = to_pandas(df, index="id", sort_index=True)

    expected = {
        LessThanScalar: [5, 4, 1, 1],
        GreaterThanScalar: [0, 0, 0, 0],
        LessThanEqualToScalar: [5, 4, 1, 2],
        GreaterThanEqualToScalar: [0, 0, 0, 1],
        EqualScalar: [0, 0, 0, 1],
        NotEqualScalar: [5, 4, 1, 1],
    }
    for i, where_cmp in enumerate(where_cmps):
        name = feats[i].get_name()
        assert df[name].iloc[0:4].tolist() == expected[where_cmp]
Example #48
def test_handles_primitive_function_name_uniqueness(entityset):
    class SumTimesN(AggregationPrimitive):
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self):
            def my_function(values):
                return values.sum() * self.n

            return my_function

        def generate_name(self, base_feature_names, child_entity_id,
                          parent_entity_id, where_str, use_prev_str):
            base_features_str = ", ".join(base_feature_names)
            return u"%s(%s.%s%s%s, n=%s)" % (self.name.upper(),
                                             child_entity_id,
                                             base_features_str,
                                             where_str, use_prev_str, self.n)

    # works as expected
    f1 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=entityset)
    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=entityset)
    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=entityset)
    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum)
    f4 = ft.Feature(entityset["log"]["purchased"],
                    parent_entity=entityset["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=entityset)
    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self):
            return np.sum

    f5 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(entityset["log"]["value"],
                    parent_entity=entityset["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=entityset)
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
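
# The point of generate_name in SumTimesN above: two instances of the same
# primitive with different constructor args must yield distinct column names.
# Following the format string in generate_name, the expected names would be
# (exact strings may vary by featuretools version):
#   f1.get_name()  ->  "SUM_TIMES_N(log.value, n=1)"
#   f2.get_name()  ->  "SUM_TIMES_N(log.value, n=2)"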
Example #49
def test_cum_sum_numpy_group_on_nan(pd_es):
    class CumSumNumpy(TransformPrimitive):
        """Returns the cumulative sum after grouping"""

        name = "cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        uses_full_dataframe = True

        def get_function(self):
            def cum_sum(values):
                return values.cumsum().values

            return cum_sum

    log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"])
    pd_es["log"]["product_id"] = (
        ["coke zero"] * 3
        + ["car"] * 2
        + ["toothpaste"] * 3
        + ["brown bag"] * 2
        + ["shoes"]
        + [np.nan] * 4
        + ["coke_zero"] * 2
    )
    pd_es["log"]["value"][16] = 10
    cum_sum = ft.Feature(
        log_value_feat,
        groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]),
        primitive=CumSumNumpy,
    )
    assert cum_sum.get_name() == "CUM_SUM(value) by product_id"
    features = [cum_sum]
    df = ft.calculate_feature_matrix(
        entityset=pd_es, features=features, instance_ids=range(17)
    )
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 17
    cum_sum_values = [
        0,
        5,
        15,
        15,
        35,
        0,
        1,
        3,
        3,
        3,
        0,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        10,
    ]

    assert len(cvalues) == len(cum_sum_values)
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert np.isnan(cvalues[i])
        else:
            assert v == cvalues[i]
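
# Note: pandas groupby skips rows whose group key is NaN, so the log rows with
# product_id = NaN never enter any group and their cumulative sums come out as
# NaN in the expected values above.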
Example #50
def test_uses_full_entity_feat_of_approximate(entityset):
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    # only dfeat2 should be approximated
    # because Percentile needs all values

    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    assert (feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() ==
            feature_matrix_approx[dfeat2.get_name()].tolist())

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=[
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0)
        ])
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations([
                feature_matrix_approx, feature_matrix_small_approx,
                feature_matrix_no_approx
        ], 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
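
# Note on the test above: Percentile is a uses_full_entity transform, so the
# feature it consumes (dfeat) must be computed at the exact cutoff times and
# cannot be approximated; dfeat2 has no such consumer, which is why it is the
# only feature whose values may shift with the approximate window.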
Example #51
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations. This means if a two
    primitives with the same function name are applied to the same column, pandas
    can't differentiate them. We have a work around to this based on the name property
    that we test here.
    """
    # TODO: Update to work with Dask and Spark
    if es.dataframe_type != Library.PANDAS.value:
        pytest.xfail("Need to update to work with Dask and Spark EntitySets")

    # test with normally defined functions
    class Sum(AggregationPrimitive):
        name = "sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self):
            def custom_primitive(x):
                return x.sum()

            return custom_primitive

    class Max(AggregationPrimitive):
        name = "max"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self):
            def custom_primitive(x):
                return x.max()

            return custom_primitive

    f_sum = ft.Feature(
        es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum
    )
    f_max = ft.Feature(
        es["log"].ww["value"], parent_dataframe_name="customers", primitive=Max
    )

    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

    # test with lambdas
    class Sum(AggregationPrimitive):
        name = "sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self):
            return lambda x: x.sum()

    class Max(AggregationPrimitive):
        name = "max"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self):
            return lambda x: x.max()

    f_sum = ft.Feature(
        es["log"].ww["value"], parent_dataframe_name="customers", primitive=Sum
    )
    f_max = ft.Feature(
        es["log"].ww["value"], parent_dataframe_name="customers", primitive=Max
    )
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]
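
# A minimal pandas sketch (assumed behavior, not part of the test) of the
# collision the docstring above describes: two different callables sharing the
# __name__ "custom_primitive" cannot be passed together to groupby().agg.
import pandas as pd


def make_agg(fn):
    def custom_primitive(x):
        return fn(x)
    return custom_primitive


df = pd.DataFrame({"g": [0, 0, 1], "v": [1.0, 2.0, 3.0]})
try:
    df.groupby("g")["v"].agg([make_agg(sum), make_agg(max)])
except pd.errors.SpecificationError as err:
    print(err)  # "Function names must be unique ..."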
Example #52
File: utils.py  Project: rkelly07/DSx
def compute_features(features, cutoff_time):
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d')
    return feature_matrix
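
# Hedged usage sketch for compute_features (demo data; everything below is
# illustrative, not from the project). `features` is a list of feature
# definitions, e.g. from ft.dfs(..., features_only=True), and `cutoff_time`
# pairs each instance with the time at which its features are computed;
# approximate='36d' buckets nearby cutoff times to speed up aggregations.
import pandas as pd
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
features = ft.dfs(entityset=es, target_entity='customers', features_only=True)
cutoffs = pd.DataFrame({'instance_id': [1, 2],
                        'time': pd.to_datetime(['2014-01-05', '2014-01-06'])})
fm = compute_features(features, cutoff_time=cutoffs)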
Example #53
def test_cfm_returns_original_time_indexes(entityset):
    es = entityset

    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])
    cutoff_df = pd.DataFrame({
        'time': [
            pd.Timestamp('2011-04-09 10:30:06'),
            pd.Timestamp('2011-04-09 10:30:03'),
            pd.Timestamp('2011-04-08 10:30:00')
        ],
        'instance_id': [0, 1, 0]
    })
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'],
                                      kind='mergesort')

    # no approximate
    fm = calculate_feature_matrix([dfeat],
                                  entityset,
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    instance_level_vals = fm.index.get_level_values(0).values
    time_level_vals = fm.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in different windows, no unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    instance_level_vals = fm2.index.get_level_values(0).values
    time_level_vals = fm2.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in different windows, unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    instance_level_vals = fm2.index.get_level_values(0).values
    time_level_vals = fm2.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in same window, no unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    instance_level_vals = fm3.index.get_level_values(0).values
    time_level_vals = fm3.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()

    # approximate, in same window, unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    instance_level_vals = fm3.index.get_level_values(0).values
    time_level_vals = fm3.index.get_level_values(1).values
    assert (instance_level_vals == sorted_df['instance_id'].values).all()
    assert (time_level_vals == sorted_df['time'].values).all()
Example #54
    def run_featuretools(self,
                         read_in_data_if_needed=True,
                         export_to_csv=False):

        # TODO: This should eventually be dynamic.
        dataset_filenames = ['POS_CASH_balance.csv', 'application_test.csv',
                             'application_train.csv', 'bureau.csv',
                             'bureau_balance.csv', 'credit_card_balance.csv',
                             'installments_payments.csv',
                             'previous_application.csv']

        if self.datasets == []:
            self.read_all_data(dataset_filenames=dataset_filenames)
        for data in self.datasets:
            if data.name == 'POS_CASH_balance':
                pos = data.data
            elif data.name == 'application_test':
                test = data.data
            elif data.name == 'application_train':
                train_full = data.data
            elif data.name == 'bureau':
                bureau = data.data
            elif data.name == 'bureau_balance':
                bureau_balance = data.data
            elif data.name == 'credit_card_balance':
                cc_bal = data.data
            elif data.name == 'installments_payments':
                inst = data.data
            elif data.name == 'previous_application':
                prev_app = data.data

        train = train_full.drop('TARGET', axis=1)
        train_y = train_full['TARGET']

        print('Creating entity set.')

        # Create new entityset
        es = ft.EntitySet(id='train')
        print('Creating train entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='train',
                                      dataframe=train,
                                      index='SK_ID_CURR')
        print('Creating bureau entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='bureau',
                                      dataframe=bureau,
                                      index='SK_ID_BUREAU')
        print('Creating bureau_bal entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='bureau_bal',
                                      dataframe=bureau_balance,
                                      make_index=True,
                                      index='bureau_bal_id')
        print('Creating pos entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='pos',
                                      dataframe=pos,
                                      make_index=True,
                                      index='pos_id')
        print('Creating cc_bal entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='cc_bal',
                                      dataframe=cc_bal,
                                      make_index=True,
                                      index='cc_bal_id')
        print('Creating inst entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='inst',
                                      dataframe=inst,
                                      make_index=True,
                                      index='inst_id')
        print('Creating prev_app entity.')
        print(str(pd.Timestamp.now()))
        es = es.entity_from_dataframe(entity_id='prev_app',
                                      dataframe=prev_app,
                                      index='SK_ID_PREV')

        print('Creating relationships.')
        print(str(pd.Timestamp.now()))

        # Create relationships
        print('Creating r_train_bureau.')
        print(str(pd.Timestamp.now()))
        r_train_bureau = ft.Relationship(es['train']['SK_ID_CURR'],
                                         es['bureau']['SK_ID_CURR'])
        es = es.add_relationship(r_train_bureau)

        print('Creating r_bureau_bureau_bal.')
        print(str(pd.Timestamp.now()))
        r_bureau_bureau_bal = ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                                              es['bureau_bal']['SK_ID_BUREAU'])
        es = es.add_relationship(r_bureau_bureau_bal)

        print('Creating r_train_pos.')
        print(str(pd.Timestamp.now()))
        r_train_pos = ft.Relationship(es['train']['SK_ID_CURR'],
                                      es['pos']['SK_ID_CURR'])
        es = es.add_relationship(r_train_pos)

        print('Creating r_train_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_train_cc_bal = ft.Relationship(es['train']['SK_ID_CURR'],
                                         es['cc_bal']['SK_ID_CURR'])
        es = es.add_relationship(r_train_cc_bal)

        print('Creating r_train_inst.')
        print(str(pd.Timestamp.now()))
        r_train_inst = ft.Relationship(es['train']['SK_ID_CURR'],
                                       es['inst']['SK_ID_CURR'])
        es = es.add_relationship(r_train_inst)

        print('Creating r_train_prev_app.')
        print(str(pd.Timestamp.now()))
        r_train_prev_app = ft.Relationship(es['train']['SK_ID_CURR'],
                                           es['prev_app']['SK_ID_CURR'])
        es = es.add_relationship(r_train_prev_app)

        print('Creating r_prev_app_pos.')
        print(str(pd.Timestamp.now()))
        r_prev_app_pos = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                         es['pos']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_pos)

        print('Creating r_prev_app_inst.')
        print(str(pd.Timestamp.now()))
        r_prev_app_inst = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                          es['inst']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_inst)

        print('Creating r_prev_app_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_prev_app_cc_bal = ft.Relationship(es['prev_app']['SK_ID_PREV'],
                                            es['cc_bal']['SK_ID_PREV'])
        es = es.add_relationship(r_prev_app_cc_bal)

        # Create new features using specified primitives
        # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html

        print('Creating actual features.')
        print(str(pd.Timestamp.now()))
        feature_matrix, feature_defs = ft.dfs(
            entityset=es,
            target_entity='train',
            agg_primitives=['mean', 'max', 'last'],
            # trans_primitives=['years', 'month', 'subtract', 'divide']
        )

        self.featuretools_feature_set = feature_matrix
        self.featuretools_feature_names = feature_defs

        # One hot encode categorical features
        feature_matrix_enc, feature_defs_enc = ft.encode_features(
            feature_matrix, feature_defs)

        # Create entity set for test
        print('Creating test entity set.')
        ts = ft.EntitySet(id='test')
        print('Creating test entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='test',
                                      dataframe=test,
                                      index='SK_ID_CURR')
        print('Creating bureau entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='bureau',
                                      dataframe=bureau,
                                      index='SK_ID_BUREAU')
        print('Creating bureau_bal entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='bureau_bal',
                                      dataframe=bureau_balance,
                                      make_index=True,
                                      index='bureau_bal_id')
        print('Creating pos entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='pos',
                                      dataframe=pos,
                                      make_index=True,
                                      index='pos_id')
        print('Creating cc_bal entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='cc_bal',
                                      dataframe=cc_bal,
                                      make_index=True,
                                      index='cc_bal_id')
        print('Creating inst entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='inst',
                                      dataframe=inst,
                                      make_index=True,
                                      index='inst_id')
        print('Creating prev_app entity.')
        print(str(pd.Timestamp.now()))
        ts = ts.entity_from_dataframe(entity_id='prev_app',
                                      dataframe=prev_app,
                                      index='SK_ID_PREV')

        print('Creating relationships.')
        print(str(pd.Timestamp.now()))

        # Create relationships
        print('Creating r_test_bureau.')
        print(str(pd.Timestamp.now()))
        r_test_bureau = ft.Relationship(ts['test']['SK_ID_CURR'],
                                        ts['bureau']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_bureau)

        print('Creating r_bureau_bureau_bal.')
        print(str(pd.Timestamp.now()))
        r_bureau_bureau_bal = ft.Relationship(ts['bureau']['SK_ID_BUREAU'],
                                              ts['bureau_bal']['SK_ID_BUREAU'])
        ts = ts.add_relationship(r_bureau_bureau_bal)

        print('Creating r_test_pos.')
        print(str(pd.Timestamp.now()))
        r_test_pos = ft.Relationship(ts['test']['SK_ID_CURR'],
                                     ts['pos']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_pos)

        print('Creating r_test_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_test_cc_bal = ft.Relationship(ts['test']['SK_ID_CURR'],
                                        ts['cc_bal']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_cc_bal)

        print('Creating r_test_inst.')
        print(str(pd.Timestamp.now()))
        r_test_inst = ft.Relationship(ts['test']['SK_ID_CURR'],
                                      ts['inst']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_inst)

        print('Creating r_test_prev_app.')
        print(str(pd.Timestamp.now()))
        r_test_prev_app = ft.Relationship(ts['test']['SK_ID_CURR'],
                                          ts['prev_app']['SK_ID_CURR'])
        ts = ts.add_relationship(r_test_prev_app)

        print('Creating r_prev_app_pos.')
        print(str(pd.Timestamp.now()))
        r_prev_app_pos = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                         ts['pos']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_pos)

        print('Creating r_prev_app_inst.')
        print(str(pd.Timestamp.now()))
        r_prev_app_inst = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                          ts['inst']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_inst)

        print('Creating r_prev_app_cc_bal.')
        print(str(pd.Timestamp.now()))
        r_prev_app_cc_bal = ft.Relationship(ts['prev_app']['SK_ID_PREV'],
                                            ts['cc_bal']['SK_ID_PREV'])
        ts = ts.add_relationship(r_prev_app_cc_bal)

        # Create new features using specified primitives
        # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html

        print('Creating actual features.')
        print(str(pd.Timestamp.now()))
        feature_matrix_test = ft.calculate_feature_matrix(
            features=feature_defs_enc, entityset=ts)

        # the feature definitions are already one hot encoded, so the
        # resulting matrix can be used directly
        feature_matrix_test_enc = feature_matrix_test

        print('Done running featuretools!')

        print('Exporting features to CSV.')

        if export_to_csv:
            pd.DataFrame(feature_matrix_enc).to_csv('featuretools_feature.csv')
            train_y.to_csv('train_y.csv')
            pd.DataFrame(feature_matrix_test_enc).to_csv(
                'featuretools_features_test.csv')
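
        # A leaner alternative to rebuilding the whole pipeline for the test
        # set is the save/load pattern from Example #45 above (hedged sketch):
        # ft.save_features(feature_defs_enc, 'feature_definitions')
        # saved = ft.load_features('feature_definitions')
        # feature_matrix_test = ft.calculate_feature_matrix(saved, entityset=ts)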
Example #55
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index":
        pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2]
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(
        entityset=es,
        features=[count, count_where, trend, trend_where],
        cutoff_time=pd.Timestamp("12/31/2017"))
    names = [
        count.get_name(),
        count_where.get_name(),
        trend.get_name(),
        trend_where.get_name()
    ]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
Example #56
def gen_feature_matrix(entityset,
                       features_only=False,
                       feature_matrix_encode=False,
                       saved_features=None):
    '''Compute and return (feature_matrix, feature_defs) from a featuretools EntitySet.

    entityset: the EntitySet to compute features from
    features_only: only return feature_defs, do not actually compute the feature_matrix
    feature_matrix_encode: whether to return the encoded feature_matrix (categorical variables one-hot encoded)
    saved_features: load a pre-defined feature file and compute the feature_matrix based on it
    '''

    if 'goldstandard' in entityset.entity_dict.keys():
        goldstandard_exist = True
        goldstandard_id = 'goldstandard'
    else:
        goldstandard_exist = False
        goldstandard_id = None
    ##FIX manual partition by person_id does NOT improve Dask computing performance
    # ignore 'partition' columns in every entity when building features
    # ignore_variables = dict()
    # for entity in entityset.entities:
    #     if 'partition' in [v.name for v in entity.variables]:
    #         ignore_variables[entity.id] = ['partition']

    ##CAUTION when the entityset is backed by Dask dataframes, only limited set of primitives are supported
    # agg_primitives_all=['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common',
    #             'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any',
    #             'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew']
    # agg_primitives_dask=['count', 'all', 'num_unique', #'n_most_common',
    #               'min', 'std', 'mean', 'percent_true', 'sum', 'any',
    #               'num_true', 'max']

    ## define features per entity(table)
    agg_primitives = [
        'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last'
    ]  # 'trend' # trend takes extremely long time to compute
    include_variables = {
        'measurement':
        ['measurement_datetime', 'value_as_number', 'measurement_concept_id'],
        'observation':
        ['observation_concept_id', 'observation_datetime', 'value_as_number']
    }
    agg_primitives_device_exposure = [
        'count', 'avg_time_between', 'time_since_first'
    ]
    include_entities_device_exposure = ['device_exposure']

    trans_primitives = ['age']
    groupby_trans_primitives = []
    include_entities = ['person']
    primitive_options = {
        tuple(trans_primitives): {
            'include_entities': include_entities
        },
        tuple(agg_primitives): {
            'include_variables': include_variables
        },
        tuple(agg_primitives_device_exposure): {
            'include_entities': include_entities_device_exposure
        },
    }
    ignore_entities = [
        goldstandard_id, 'condition_occurrence', 'drug_exposure',
        'observation_period', 'procedure_occurrence', 'visit_occurrence'
    ]
    ignore_variables = {}
    where_primitives = agg_primitives
    entityset['measurement'][
        'measurement_concept_id'].interesting_values = entityset[
            'measurement'].df['measurement_concept_id'].unique()
    entityset['observation'][
        'observation_concept_id'].interesting_values = entityset[
            'observation'].df['observation_concept_id'].unique()
    # if isinstance(entityset.entities[0].df, pandas.DataFrame):
    #     agg_primitives = agg_primitives_all
    # else:
    #     agg_primitives = agg_primitives_dask

    # build features
    if saved_features is None:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "No feature definitions file specified, calculating feature matrix from scratch ... "
            )
            feature_defs = ft.dfs(
                entityset=entityset,
                target_entity="person",
                features_only=True,
                agg_primitives=agg_primitives + agg_primitives_device_exposure,
                trans_primitives=trans_primitives,
                groupby_trans_primitives=groupby_trans_primitives,
                primitive_options=primitive_options,
                ignore_entities=ignore_entities,
                ignore_variables=ignore_variables,
                where_primitives=where_primitives,
                max_depth=2)
            spinner.write("> generated {} features".format(len(feature_defs)))
            if features_only:
                return feature_defs

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate completed in {toc - tic:0.4f} seconds"
            )
            if feature_matrix_encode:
                feature_matrix_enc, features_enc = ft.encode_features(
                    feature_matrix, feature_defs)
                spinner.write(
                    "> generated {} encoded features and the feature matrix".
                    format(len(features_enc)))
            spinner.ok("Done")
    else:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "Using saved features from {} ... ".format(saved_features))
            feature_defs = ft.load_features(saved_features)
            spinner.write("> {} features loaded from {}".format(
                len(feature_defs), saved_features))

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate complete in {toc - tic:0.4f} seconds"
            )
            spinner.ok("Done")

    if goldstandard_exist:
        if isinstance(entityset.entities[0].df, dd.DataFrame):
            goldstandard = entityset['goldstandard'].df.compute()
        else:
            goldstandard = entityset['goldstandard'].df
    if feature_matrix_encode:
        if saved_features is not None:
            # the saved-features branch above never encodes, so do it here
            feature_matrix_enc, _ = ft.encode_features(feature_matrix,
                                                       feature_defs)
        feature_matrix = feature_matrix_enc
    if goldstandard_exist:
        feature_matrix = feature_matrix.merge(goldstandard,
                                              on='person_id',
                                              how='right')

    return feature_matrix, feature_defs
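
# Hedged usage sketch for gen_feature_matrix (EntitySet construction is
# project-specific and omitted; `es` is assumed to contain 'person',
# 'measurement', 'observation', 'device_exposure' and optionally
# 'goldstandard' entities, as the function expects):
# fm, feature_defs = gen_feature_matrix(es, feature_matrix_encode=True)
# ft.save_features(feature_defs, 'features.json')  # reuse via saved_features=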
Example #57
    def dfsWindow(self,
                  target_entity,
                  time_scope=None,
                  training_window=None,
                  cutoff_times=None,
                  max_depth=1,
                  chunk_size=None,
                  n_jobs=1):
        '''Runs dfs on the target_entity and outputs a feature matrix with
        features based on the training_window and time_scope relative to cutoff
        times. If no training_window, time_scope, or cutoff_times are specified,
        regular dfs will run without using cutoff times.

        target_entity: str. Name of target_entity in entity set to run dfs on.
        The index of the target_entity must match the instance_id column in the
        cutoff_times table.

        time_scope: 'daily', 'weekly' or 'monthly'. Assumes 7 days in a week
        and 30 days in a month (matching the multipliers used below).

        training_window: list of integers that refer to the number of days,
        weeks, or months depending on the time_scope. Ex. [1, 2] for
        time_scope='monthly' returns features based on the last month and the
        last 2 months before the cutoff date.

        cutoff_times: Pandas dataframe with instance_id, cutoff_dates, and
        label (label is optional). Any columns after instance_id and
        cutoff_dates will not be used for feature synthesis. The instance_id
        column must match the index of the target entity.

        max_depth: integer, defines how many levels of dfs to run. For example,
        if max_depth = 2 on a transactions table, features returned include
        avg. transactions and avg. of avg. transactions.

        chunk_size: integer, float, None, or "cutoff time". Number of rows of
        the output feature matrix to calculate at a time. If passed an integer
        greater than 0, it will use that many rows per chunk. If passed a float
        between 0 and 1, sets the chunk size to that percentage of all
        instances. If passed "cutoff time", rows are split per cutoff time.

        n_jobs: integer. The number of parallel processes to use when creating
        the feature matrix.
        '''
        orig_window = training_window
        if time_scope is None or training_window is None or cutoff_times is None:
            self.df, feature_defs = ft.dfs(
                entityset=self.es,
                target_entity=target_entity,
                agg_primitives=self.agg_primitives,
                trans_primitives=self.trans_primitives,
                where_primitives=self.where_primitives,
                max_depth=max_depth,
                features_only=False,
                verbose=1,
                chunk_size=chunk_size,
                n_jobs=n_jobs)

        else:
            self.df, feature_defs = ft.dfs(
                entityset=self.es,
                target_entity=target_entity,
                cutoff_time=cutoff_times,
                agg_primitives=self.agg_primitives,
                trans_primitives=self.trans_primitives,
                where_primitives=self.where_primitives,
                max_depth=max_depth,
                features_only=False,
                verbose=1,
                chunk_size=chunk_size,
                n_jobs=n_jobs,
                cutoff_time_in_index=True)
            if time_scope == 'daily':
                training_window = [int(x) for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix = ft.calculate_feature_matrix(
                        entityset=self.es,
                        features=feature_defs,
                        cutoff_time=cutoff_times,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'day'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            elif time_scope == 'monthly':
                training_window = [x * 30 for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix = ft.calculate_feature_matrix(
                        entityset=self.es,
                        features=feature_defs,
                        cutoff_time=cutoff_times,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'mos'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            elif time_scope == 'weekly':
                training_window = [x * 7 for x in orig_window]
                for i in range(len(training_window)):
                    feature_matrix, feature_defs = ft.dfs(
                        entityset=self.es,
                        target_entity=target_entity,
                        cutoff_time=cutoff_times,
                        agg_primitives=self.agg_primitives,
                        trans_primitives=self.trans_primitives,
                        where_primitives=self.where_primitives,
                        max_depth=max_depth,
                        features_only=False,
                        verbose=1,
                        chunk_size=chunk_size,
                        cutoff_time_in_index=True,
                        n_jobs=n_jobs,
                        training_window=ft.Timedelta(training_window[i], "d"))

                    suffix = '_' + str(orig_window[i]) + 'wks'
                    feature_matrix = feature_matrix.add_suffix(suffix)
                    self.df = pd.concat([self.df, feature_matrix],
                                        axis=1,
                                        join='inner')

            else:
                print("ERROR: time_scope entered is not one of the options.")

        drop_duplicates = DropDuplicate()
        self.df = drop_duplicates.fit_transform(self.df)

        for i in self.df.columns:
            self.feature_defs.append(i)

        return self.df
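
        # Hedged usage sketch (names are illustrative): given an instance
        # `pipe` of this class with its entity set and primitives configured,
        # cutoffs = pd.DataFrame({'instance_id': ids, 'cutoff_dates': dates})
        # fm = pipe.dfsWindow('customers', time_scope='monthly',
        #                     training_window=[1, 2], cutoff_times=cutoffs)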
Example #58
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    child_ltypes = {
        'parent_id': Integer,
        'time_index': Datetime,
        'value': Double,
        'cat': Categorical
    }

    es = ft.EntitySet(id="blah")
    es.add_dataframe(dataframe_name="parent", dataframe=parent_df, index="id")
    es.add_dataframe(dataframe_name="child",
                     dataframe=child_df,
                     index="id",
                     time_index="time_index",
                     logical_types=child_ltypes)
    es.add_relationship("parent", "id", "child", "parent_id")

    # create regular agg
    count = ft.Feature(es["child"].ww["id"],
                       parent_dataframe_name="parent",
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([
        ft.Feature(es["child"].ww["value"]),
        ft.Feature(es["child"].ww['time_index'])
    ],
                       parent_dataframe_name="parent",
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"].ww["cat"],
                               parent_dataframe_name="parent",
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"].ww["value"]) == 1
    count_where = ft.Feature(es["child"].ww["id"],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([
        ft.Feature(es["child"].ww["value"]),
        ft.Feature(es["child"].ww["time_index"])
    ],
                             parent_dataframe_name="parent",
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"].ww["cat"],
                                     parent_dataframe_name="parent",
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [
            count, count_where, trend, trend_where, n_most_common,
            n_most_common_where
        ]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend.get_name(): pd.Series([np.nan], dtype="float"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count, count_where]
        data = {
            count.get_name(): pd.Series([0], dtype="Int64"),
            count_where.get_name(): pd.Series([0], dtype="Int64")
        }

    answer = pd.DataFrame(data)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)

    for column in data.keys():
        pd.testing.assert_series_equal(fm[column],
                                       answer[column],
                                       check_names=False,
                                       check_index=False)

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        data = {
            count_where.get_name(): pd.Series([0], dtype="Int64"),
            trend_where.get_name(): pd.Series([np.nan], dtype="float")
        }
        for name in n_most_common_where.get_feature_names():
            data[name] = pd.Series([np.nan], dtype="category")
    else:
        features = [count_where]
        data = {count_where.get_name(): pd.Series([0], dtype="Int64")}
    answer = pd.DataFrame(data)

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)

    for column in data.keys():
        pd.testing.assert_series_equal(fm2[column],
                                       answer[column],
                                       check_names=False,
                                       check_index=False)
Example #59
def test_handles_primitive_function_name_uniqueness(es):
    if not all(isinstance(entity.df, pd.DataFrame) for entity in es.entities):
        pytest.xfail(
            "Fails with Dask and Koalas due conflicting aggregation primitive names"
        )

    class SumTimesN(AggregationPrimitive):
        name = "sum_times_n"
        input_types = [Numeric]
        return_type = Numeric

        def __init__(self, n):
            self.n = n

        def get_function(self, agg_type='pandas'):
            def my_function(values):
                return values.sum() * self.n

            return my_function

    # works as expected
    f1 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=1))
    fm = ft.calculate_feature_matrix(features=[f1], entityset=es)

    value_sum = pd.Series([56, 26, 0])
    assert all(fm[f1.get_name()].sort_index() == value_sum)

    # works as expected
    f2 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=SumTimesN(n=2))
    fm = ft.calculate_feature_matrix(features=[f2], entityset=es)

    double_value_sum = pd.Series([112, 52, 0])
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # same primitive, same variable, different args
    fm = ft.calculate_feature_matrix(features=[f1, f2], entityset=es)

    assert all(fm[f1.get_name()].sort_index() == value_sum)
    assert all(fm[f2.get_name()].sort_index() == double_value_sum)

    # different primitives, same function returned by get_function,
    # different base features
    f3 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum)
    f4 = ft.Feature(es["log"]["purchased"],
                    parent_entity=es["customers"],
                    primitive=NumTrue)
    fm = ft.calculate_feature_matrix(features=[f3, f4], entityset=es)

    purchased_sum = pd.Series([10, 1, 1])
    assert all(fm[f3.get_name()].sort_index() == value_sum)
    assert all(fm[f4.get_name()].sort_index() == purchased_sum)

    # different primitives, same function returned by get_function,
    # same base feature
    class Sum1(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum1"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    class Sum2(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum2"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    class Sum3(AggregationPrimitive):
        """Sums elements of a numeric or boolean feature."""
        name = "sum3"
        input_types = [Numeric]
        return_type = Numeric
        stack_on_self = False
        stack_on_exclude = [Count]
        default_value = 0

        def get_function(self, agg_type='pandas'):
            return np.sum

    f5 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum1)
    f6 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum2)
    f7 = ft.Feature(es["log"]["value"],
                    parent_entity=es["customers"],
                    primitive=Sum3)
    fm = ft.calculate_feature_matrix(features=[f5, f6, f7], entityset=es)
    assert all(fm[f5.get_name()].sort_index() == value_sum)
    assert all(fm[f6.get_name()].sort_index() == value_sum)
    assert all(fm[f7.get_name()].sort_index() == value_sum)
Example #60
    def sample(self, n=10, cutoff_time=None):
        from featuretools import calculate_feature_matrix
        cfm = calculate_feature_matrix([self], cutoff_time=cutoff_time)
        return cfm.sample(n)