Example #1
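These snippets come from the featuretools test suite, so they omit the shared module-level imports and the es pytest fixture, which builds the small test entityset in the suite's conftest. A minimal sketch of the imports they assume, written against the early (pre-0.2) featuretools API; the exact module paths of the getsize and save_obj_pickle helpers are assumptions:

import os

import numpy as np

import featuretools as ft
from featuretools.primitives import (IdentityFeature, Last, Mean, Sum,
                                     make_agg_primitive)
from featuretools.synthesis import DeepFeatureSynthesis
from featuretools.variable_types import Numeric
# Assumed helper locations; the real suite imports these from its own utils.
from featuretools.utils.gen_utils import getsize
from featuretools.utils.pickle_utils import save_obj_pickle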
def test_copy_features_does_not_copy_entityset(es):
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'],
                    es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'],
                           es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'],
                                 es['sessions'],
                                 where=IdentityFeature(
                                     es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    # Copies must share the original entityset rather than deep-copying it,
    # so total memory stays well under double after copying every feature.
    in_memory_size = getsize(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = getsize(locals())
    assert new_in_memory_size < 2 * in_memory_size

    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)
Example #2
def test_copy_features_does_not_copy_entityset(es):
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    in_memory_size = getsize(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = getsize(locals())
    assert new_in_memory_size < 2 * in_memory_size

    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)
Example #3
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(features_no_pickle[0].entityset, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath, es)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than the entityset in memory (feat_1 is still bound
    # to the last feature from the loop above)
    assert os.path.getsize(filepath) < getsize(feat_1.entityset)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
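Outside of pytest, the round-trip this test exercises is just a save followed by a load. A minimal sketch reusing the names from the import block above and an es entityset (the feature-file name is arbitrary; in this era of the API, load_features also accepted the entityset as a second argument):

# Build feature definitions with DFS, persist only the definitions
# (not the entityset), then rehydrate them against a live entityset.
feature_defs = DeepFeatureSynthesis(target_entity_id='sessions',
                                    entityset=es,
                                    agg_primitives=[Last, Mean],
                                    trans_primitives=[]).build_features()
ft.save_features(feature_defs, 'sessions_features')
same_defs = ft.load_features('sessions_features', es)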
Example #4
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    # DFS should have built at least one feature with the custom primitive
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #5
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)