Пример #1
0
def test_make_agg_feat_using_prev_n_events(entityset, backend):
    """Check ``Min`` aggregations that use a previous-N-observations window.

    Two ``Min`` features over ``log.value`` (parent entity ``sessions``) are
    built with ``use_previous`` windows of 1 and 3 ``'observations'``.  The
    test verifies that the two features get distinct names and that, for
    instance 0, they yield the expected values at two ``time_last`` cutoffs.
    """
    log_entity = entityset['log']
    sessions_entity = entityset['sessions']

    min_prev_one = Min(
        log_entity['value'],
        parent_entity=sessions_entity,
        use_previous=Timedelta(1, 'observations', entity=log_entity))
    min_prev_three = Min(
        log_entity['value'],
        parent_entity=sessions_entity,
        use_previous=Timedelta(3, 'observations', entity=log_entity))

    name_one = min_prev_one.get_name()
    name_three = min_prev_three.get_name()
    assert name_one != name_three, \
        'Features should have different names based on use_previous'

    pandas_backend = backend([min_prev_one, min_prev_three])

    # Each row: (time_last cutoff, expected 1-observation min,
    #            expected 3-observation min).
    # time_last is included by default
    expectations = [
        (datetime(2011, 4, 9, 10, 30, 6), 5, 0),
        (datetime(2011, 4, 9, 10, 30, 30), 20, 10),
    ]
    for cutoff, expected_one, expected_three in expectations:
        df = pandas_backend.calculate_all_features(instance_ids=[0],
                                                   time_last=cutoff)
        # Pull both values out before asserting, as a missing column should
        # surface before any value comparison.
        v1 = df[name_one][0]
        v2 = df[name_three][0]
        assert v1 == expected_one
        assert v2 == expected_three
def test_make_agg_feat_using_prev_n_events(entityset, backend):
    """Verify ``Min`` features built with observation-count ``use_previous``.

    Builds ``Min(log.value)`` on ``sessions`` twice, with ``use_previous``
    set to ``Timedelta(1, 'observations')`` and ``Timedelta(3,
    'observations')``, then checks the feature names differ and that the
    values for instance 0 match the expected numbers at two cutoffs.
    """
    def min_with_window(n_observations):
        # Same base variable and parent entity each time; only the size of
        # the use_previous window varies.
        return Min(entityset['log']['value'],
                   parent_entity=entityset['sessions'],
                   use_previous=Timedelta(n_observations, 'observations',
                                          entity=entityset['log']))

    window_1_feat = min_with_window(1)
    window_3_feat = min_with_window(3)

    assert window_1_feat.get_name() != window_3_feat.get_name(), \
        'Features should have different names based on use_previous'

    calculator = backend([window_1_feat, window_3_feat])

    def values_at(time_last):
        # Return both feature values for instance 0 at the given cutoff.
        matrix = calculator.calculate_all_features(instance_ids=[0],
                                                   time_last=time_last)
        first = matrix[window_1_feat.get_name()][0]
        second = matrix[window_3_feat.get_name()][0]
        return first, second

    # time_last is included by default
    v1, v2 = values_at(datetime(2011, 4, 9, 10, 30, 6))
    assert v1 == 5
    assert v2 == 0

    v1, v2 = values_at(datetime(2011, 4, 9, 10, 30, 30))
    assert v1 == 20
    assert v2 == 10
def test_uses_full_entity_feat_of_approximate(entityset):
    """Compare feature matrices computed with and without ``approximate``.

    Features under test:
      * ``agg_feat``: ``Sum`` of ``log.value`` on ``sessions``
      * ``dfeat`` / ``dfeat2``: ``DirectFeature`` on ``sessions`` of the
        customer-level ``Sum`` / ``Min`` of ``agg_feat``
      * ``p``: ``Percentile`` of ``dfeat``

    The test checks that ``dfeat2`` gives the same result whether computed
    alone or together with the other features, and that ``p``, ``dfeat`` and
    ``agg_feat`` are identical across ``Timedelta(10, 's')``,
    ``Timedelta(10, 'ms')`` and no approximation at all.
    """
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    customer_sum = Sum(agg_feat, es['customers'])
    customer_min = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(customer_sum, es['sessions'])
    dfeat2 = DirectFeature(customer_min, es['sessions'])
    p = Percentile(dfeat)

    def compute(features, **extra_kwargs):
        # Every call in this test shares the same instances and cutoffs;
        # fresh list objects are built per call, as in separate call sites.
        return calculate_feature_matrix(
            features,
            instance_ids=[0, 2],
            cutoff_time_in_index=True,
            cutoff_time=[
                datetime(2011, 4, 9, 10, 31, 19),
                datetime(2011, 4, 9, 11, 0, 0),
            ],
            **extra_kwargs)

    # only dfeat2 should be approximated
    # because Percentile needs all values

    fm_dfeat2_alone = compute([dfeat2], approximate=Timedelta(10, 's'))
    assert fm_dfeat2_alone[dfeat2.get_name()].tolist() == [1, 0]

    fm_approx = compute([p, dfeat, dfeat2, agg_feat],
                        approximate=Timedelta(10, 's'))
    dfeat2_alone_values = fm_dfeat2_alone[dfeat2.get_name()].tolist()
    dfeat2_joint_values = fm_approx[dfeat2.get_name()].tolist()
    assert dfeat2_alone_values == dfeat2_joint_values

    fm_small_approx = compute([p, dfeat, dfeat2, agg_feat],
                              approximate=Timedelta(10, 'ms'))

    fm_exact = compute([p, dfeat, dfeat2, agg_feat])

    # The non-approximated features must agree pairwise across all three
    # matrices.
    for feature in [p, dfeat, agg_feat]:
        matrix_pairs = combinations(
            [fm_approx, fm_small_approx, fm_exact], 2)
        for left, right in matrix_pairs:
            left_values = left[feature.get_name()].tolist()
            right_values = right[feature.get_name()].tolist()
            assert left_values == right_values
def test_approx_base_feature_is_also_first_class_feature(entityset):
    """Check a feature requested directly and also used under approximation.

    ``agg_feat`` (``Min`` of the product rating on ``sessions``) is both
    requested as an output column and used as the base of a customer-level
    ``Sum`` that is pulled back onto ``sessions`` via ``DirectFeature``.
    With ``approximate=Timedelta(10, 's')`` both columns must match the
    expected values.
    """
    es = entityset
    rating_on_log = DirectFeature(es['products']['rating'], es['log'])
    # This should still be computed properly
    agg_feat = Min(rating_on_log, es['sessions'])
    customer_total = Sum(agg_feat, es['customers'])
    # This is to be approximated
    sess_to_cust = DirectFeature(customer_total, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [
            datetime(2011, 4, 9, 10, 31, 19),
            datetime(2011, 4, 9, 11, 0, 0),
        ],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix(
        [sess_to_cust, agg_feat],
        entityset,
        approximate=Timedelta(10, 's'),
        cutoff_time=cutoff_time)

    # Checked in order: the approximated direct feature first, then the
    # base aggregation.
    expected_columns = [
        (sess_to_cust, [8.5, 7]),
        (agg_feat, [4, 1.5]),
    ]
    for feature, expected in expected_columns:
        actual = feature_matrix[feature.get_name()].tolist()
        assert actual == expected
def test_approx_base_feature_is_also_first_class_feature(entityset):
    """Ensure a base feature of an approximated feature is still computed.

    Requests two columns from ``calculate_feature_matrix`` with
    ``approximate=Timedelta(10, 's')``: a ``DirectFeature`` on ``sessions``
    of a customer-level ``Sum``, and the session-level ``Min`` that the
    ``Sum`` is built on.  Both columns are compared to fixed expected lists.
    """
    products = entityset['products']
    log = entityset['log']
    sessions = entityset['sessions']
    customers = entityset['customers']

    product_rating = DirectFeature(products['rating'], log)
    # This should still be computed properly
    min_rating_per_session = Min(product_rating, sessions)
    sum_per_customer = Sum(min_rating_per_session, customers)
    # This is to be approximated
    customer_sum_on_session = DirectFeature(sum_per_customer, sessions)

    first_cutoff = datetime(2011, 4, 9, 10, 31, 19)
    second_cutoff = datetime(2011, 4, 9, 11, 0, 0)
    cutoff_time = pd.DataFrame({'time': [first_cutoff, second_cutoff],
                                'instance_id': [0, 2]})

    requested = [customer_sum_on_session, min_rating_per_session]
    feature_matrix = calculate_feature_matrix(requested,
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)

    approximated_column = feature_matrix[customer_sum_on_session.get_name()]
    assert approximated_column.tolist() == [8.5, 7]

    base_column = feature_matrix[min_rating_per_session.get_name()]
    assert base_column.tolist() == [4, 1.5]