示例#1
0
def test_robust_transformer_desugar():
    """Desugar a heterogeneous list of transformer specs into one pipeline.

    The list mixes None placeholders, transformer instances, callables,
    nested Features, and (input, transformer) tuples — all of which
    make_robust_transformer should accept.
    """
    sugared_specs = [
        None,
        IdentityTransformer(),
        lambda x: x,
        Feature('A', IdentityTransformer()),
        ('A', IdentityTransformer()),
        ('A', [None, IdentityTransformer()]),
    ]
    result = make_robust_transformer(sugared_specs)
    assert isinstance(result, TransformerPipeline)
示例#2
0
    def test_init(self):
        """GFSSFAccepter should construct successfully from valid arguments."""
        existing = Feature(input='A_0',
                           transformer=IdentityTransformer(),
                           source='1st Feature')
        candidate = Feature(input='Z_0',
                            transformer=IdentityTransformer(),
                            source='2nd Feature')

        accepter = GFSSFAccepter(self.X, self.y, [existing], candidate)

        self.assertIsNotNone(accepter)
示例#3
0
 def test_producing_missing_values_fails(self):
     """A feature that passes NaNs through must fail the missing-values check."""
     # precondition: the fixture data actually contains NaNs
     assert has_nans(self.X)
     candidate = Feature(input='size', transformer=IdentityTransformer())
     is_valid, failed_checks = check_from_class(
         FeatureApiCheck, candidate, self.X, self.y)
     self.assertFalse(is_valid)
     self.assertIn(NoMissingValuesCheck.__name__, failed_checks)
示例#4
0
def test_can_deepcopy():
    """Deep-copying a pipeline must preserve _ballet_features (see GH 90)."""
    pipeline = FeatureEngineeringPipeline(
        Feature('size', IdentityTransformer()))
    assert hasattr(pipeline, '_ballet_features')
    copied = deepcopy(pipeline)
    assert hasattr(copied, '_ballet_features')
示例#5
0
def get_target_encoder():
    """Get encoder for the prediction target.

    The target needs no transformation, so an identity encoder is used.

    Returns:
        transformer-like
    """
    encoder = IdentityTransformer()
    return encoder
示例#6
0
def test_gfssf_pruner_keep_relevant(sample_data):
    """GFSSF pruning must keep a feature that remains relevant.

    feature_1 operates on an independent column ('A_0') from the candidate
    ('Z_0'), so accepting the candidate should not make it redundant.
    """
    X_df, y_df, y = sample_data

    feature_1 = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='Z_0',
        transformer=IdentityTransformer(),
        source='2nd Feature')
    gfssf_pruner = GFSSFPruner(
        X_df, y_df, X_df, y, [feature_1], feature_2)

    redundant_features = gfssf_pruner.prune()
    # Fixed: the original message ('... should be pruned') contradicted the
    # assertion, which checks that the feature is NOT pruned. Also fixed the
    # 'redunant' typo in the local variable name.
    assert feature_1 not in redundant_features, \
        'Still relevant features should not be pruned'
示例#7
0
def test_gfssf_pruner_prune_exact_replicas(sample_data):
    """A candidate identical to an accepted feature makes the old one redundant."""
    X_df, y_df, y = sample_data

    existing = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    # same input and transformer — an exact replica of `existing`
    replica = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='2nd Feature')
    pruner = GFSSFPruner(X_df, y_df, X_df, y, [existing], replica)

    pruned = pruner.prune()
    assert existing in pruned, \
        'Exact replica features should be pruned'
示例#8
0
def test_mutual_information_accepter_nans(handle_nan_targets, expected):
    """Judgement of a feature with NaN targets depends on handle_nan_targets."""
    y = np.array([np.nan, 2, 3]).reshape(-1, 1)
    X_df = pd.DataFrame({'A': [1, 2, 3]})
    candidate = Feature(input='A', transformer=IdentityTransformer())
    accepter = MutualInformationAccepter(
        X_df, y, X_df, y, [], candidate,
        handle_nan_targets=handle_nan_targets)
    assert accepter.judge() == expected
示例#9
0
def test_gfssf_accepter_init(sample_data):
    """GFSSFAccepter should construct successfully from sample data."""
    X_df, y_df, y = sample_data

    existing = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    candidate = Feature(
        input='Z_0',
        transformer=IdentityTransformer(),
        source='2nd Feature')

    accepter = GFSSFAccepter(
        X_df, y_df, X_df, y, [existing], candidate)

    assert accepter is not None
示例#10
0
def test_producing_missing_values_fails(sample_data):
    """check_from_class must flag a feature that produces missing values."""
    # precondition: fixture data actually contains NaNs
    assert has_nans(sample_data.X)
    candidate = Feature(input='size', transformer=IdentityTransformer())
    valid, failures, advice = check_from_class(
        FeatureApiCheck, candidate, sample_data.X, sample_data.y)
    assert not valid
    assert NoMissingValuesCheck.__name__ in failures
示例#11
0
File: util.py  Project: ballet/ballet
    def __init__(self, nsteps, bad_input_checks, errors, shuffle=True, seed=1):
        """Build a pipeline of nsteps-1 identity steps plus one fragile step.

        Args:
            nsteps: total number of pipeline steps.
            bad_input_checks: predicates identifying inputs the fragile
                transformer rejects.
            errors: errors raised by the fragile transformer.
            shuffle: whether to shuffle the step order (deterministically).
            seed: seed for the shuffle RNG, so step order is reproducible.
        """
        fragile = FragileTransformer(bad_input_checks, errors)
        steps = [
            (f'IdentityTransformer{i:02d}', IdentityTransformer())
            for i in range(nsteps - 1)
        ]
        steps.append((repr(fragile), fragile))
        if shuffle:
            # seeded RNG keeps the "random" order reproducible across runs
            rng = random.Random(seed)
            rng.shuffle(steps)

        super().__init__(steps)
示例#12
0
def test_variance_threshold_accepter(mock_var, sample_data):
    """With variance mocked low, the accepter should reject the feature."""
    expected = False
    X_df, y_df, y = sample_data
    candidate = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = VarianceThresholdAccepter(X_df, y_df, X_df, y, [], candidate)

    actual = accepter.judge()
    assert actual == expected
示例#13
0
def test_mutual_information_accepter(_, sample_data):
    """The (mocked) mutual information accepter should accept the feature."""
    expected = True
    X_df, y_df, y = sample_data
    candidate = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = MutualInformationAccepter(X_df, y_df, X_df, y, [], candidate)

    actual = accepter.judge()
    assert actual == expected
示例#14
0
def test_variance_threshold_accepter_feature_group():
    """A multi-column feature group passes the variance threshold check."""
    expected = True
    # variance is 0.25 per column, > 0.05 threshold
    X = pd.DataFrame(np.eye(2))
    feature_group = Feature(
        input=[0, 1],
        transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = VarianceThresholdAccepter(X, None, X, None, [], feature_group)

    actual = accepter.judge()
    assert actual == expected
示例#15
0
def test_gfssf_pruner_prune_weak_replicas(sample_data):
    """A noisy copy of a column is pruned once a clean copy is accepted."""
    X_df, y_df, y = sample_data

    def _jitter(X):
        # same column plus gaussian noise -> strictly less informative
        arr = asarray2d(X)
        return arr + np.random.normal(0, 0.5, arr.shape)

    weak = Feature(
        input='A_0',
        transformer=SimpleFunctionTransformer(_jitter),
        source='1st Feature')
    strong = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='2nd Feature')
    pruner = GFSSFPruner(X_df, y_df, X_df, y, [weak], strong)

    pruned = pruner.prune()
    assert weak in pruned, \
        'Noisy features should be pruned'
示例#16
0
def test_compound_accepter(sample_data):
    """With agg='all', one rejecting sub-accepter (p=0) rejects the feature."""
    expected = False
    X_df, y_df, y = sample_data
    specs = [
        'ballet.validation.feature_acceptance.validator.AlwaysAccepter',
        {
            'name': 'ballet.validation.feature_acceptance.validator.RandomAccepter',  # noqa
            'params': {
                'p': 0.00,
            },
        },
    ]
    candidate = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    accepter = CompoundAccepter(
        X_df, y_df, X_df, y, [], candidate, agg='all', specs=specs)

    actual = accepter.judge()
    assert actual == expected
示例#17
0
def test_validation_end_to_end(quickstart):
    """End-to-end validation workflow on a quickstart project.

    Engineers features from a mocked feature collection, splices a
    regression-data loader into the project, then runs ``ballet validate -A``
    as if on Travis CI for a series of contributor branches: a valid
    feature (accepted and merged), a feature expected to fail (currently
    passing — see TODO), and a redundant duplicate feature (rejected).
    """
    project = quickstart.project
    slug = quickstart.package_slug
    base = project.path
    repo = quickstart.repo

    pkg = project.package
    assert isinstance(pkg, ModuleType)

    api = project.api
    assert isinstance(api, FeatureEngineeringProject)

    # no features at first
    features = api.features
    assert len(features) == 0

    # first providing a mock feature, call build
    mock_features = [Feature(input='A_1', transformer=IdentityTransformer())]
    with patch.object(api, 'collect', return_value=mock_features):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        result = api.engineer_features(X_df=X_df, y_df=[])
        # only the single mocked feature is applied -> one output column
        assert np.shape(result.X) == (5, 1)
        assert isinstance(result.pipeline, FeatureEngineeringPipeline)

    # splice in a new version of foo.load_data.load_data
    # 1. 'src' needs to be hardcoded
    # 2. really bad - set load_data = load_regression_data which does not
    #    have the same args
    new_load_data_str = get_source(load_regression_data)
    p = base.joinpath('src', slug, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)
        f.write('\n')
        f.write('load_data=load_regression_data\n')

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(ref=None):
        """Validate branch as if we were running on CI"""
        # simulate the Travis environment variables that ballet's CI
        # detection reads; ref=None mimics a push build, ref=... a PR build
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if ref is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha,
                repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            # TODO is this okay for testing?
            envvars['TRAVIS_PULL_REQUEST'] = str(1)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name,
                repo.commit(ref).hexsha)

        with patch.dict(os.environ, envvars):
            # raises CalledProcessError if validation fails
            check_call(shlex.split('ballet validate -A'),
                       cwd=base,
                       env=os.environ)

    call_validate_all()

    # branch and write a new feature
    contrib_dir = base.joinpath('src', slug, 'features', 'contrib')
    ref = 'bob/feature-a'
    logger.info(f'Switching to branch {ref}, User Bob, Feature A')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # call different validation routines
    logger.info('Validating User Bob, Feature A')
    call_validate_all(ref=ref)

    # merge branch with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge(ref, no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    ref = 'charlie/feature-z1'
    logger.info('Switching to branch ref, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # TODO we expect this feature to fail but it passes
    # `if False` deliberately disables the expected failure until fixed
    cm = pytest.raises(CalledProcessError) if False else nullcontext()
    with cm:
        logger.info('Validating User Charlie, Feature Z_1')
        call_validate_all(ref=ref)

    # write another new feature - redundancy
    ref = 'charlie/feature-a0'
    repo.git.checkout('master')
    switch_to_new_branch(repo, ref)
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # duplicate of the already-merged A_0 feature must be rejected
    with pytest.raises(CalledProcessError):
        call_validate_all(ref=ref)
示例#18
0
# Reusable parametrization over the supported forms of the `input` argument:
# a literal list of column names, or a callable computing one from a frame.
with_input = pytest.mark.parametrize(
    'input',
    [
        ['foo', 'bar'],
        lambda df: ['foo', 'bar'],
    ],
    ids=[
        'list of string',
        'callable to list of string'
    ]
)
# Reusable parametrization over the "sugared" transformer forms accepted:
# a single transformer, lists, None placeholders, callables, and Features
# nested either alone or inside a mixed list.
with_transformer = pytest.mark.parametrize(
    'transformer',
    [
        IdentityTransformer(),
        [IdentityTransformer()],
        [None, IdentityTransformer(), lambda x: x],
        Feature(['foo', 'bar'], IdentityTransformer()),
        [None, IdentityTransformer(), Feature(
            ['foo', 'bar'], IdentityTransformer())],
    ],
    ids=[
        'scalar',
        'list of transformer',
        'list of mixed',
        'nested feature',
        'list of mixed and nested features',
    ]
)
示例#19
0
def inputs(request):
    """Fixture: pair the parametrized input spec with an identity transformer."""
    return request.param, IdentityTransformer()
示例#20
0
@pytest.mark.parametrize('robust_maker', [
    DelegatingRobustTransformer,
    lambda x: make_robust_transformer([x]),
])
def test_robust_str_repr(robust_maker):
    """Both str() and repr() of a robust transformer are non-empty."""
    rt = robust_maker(IdentityTransformer())
    for render in (str, repr):
        text = render(rt)
        assert len(text) > 0


@pytest.mark.parametrize(
    'transformer,expected',
    [
        (IdentityTransformer(),
         ['IdentityTransformer']),
        ([IdentityTransformer(), IdentityTransformer()],
         ['IdentityTransformer', 'IdentityTransformer']),
    ])
def test_get_transformer_primitives(transformer, expected):
    """Primitive names reported for a robust transformer match its inputs."""
    robust = make_robust_transformer(transformer)
    assert get_transformer_primitives(robust) == expected


def test_robust_transformer_desugar():
    """Should be able to "desugar" multiple things into a valid transformer pipeline"""  # noqa
    transformer = [
示例#21
0
def test_robust_str_repr(robust_maker):
    """Both str() and repr() of a robust transformer are non-empty."""
    rt = robust_maker(IdentityTransformer())
    for render in (str, repr):
        text = render(rt)
        assert len(text) > 0
def test_validation_end_to_end(quickstart):
    """End-to-end validation workflow on a quickstart project (fake-PR flow).

    Imports the generated 'foo' package, builds features from a mocked
    collection, swaps in a regression-data loader, then runs
    ``ballet validate -A`` as if on Travis CI for a series of fake pull
    request branches (``pull/N``): a valid feature (accepted and merged),
    a failing feature, and a redundant duplicate feature.
    """
    project = quickstart.project
    modname = 'foo'
    base = project.path
    repo = project.repo

    def _import(modname):
        # import a project module by path so the test does not depend on
        # the quickstart package being installed / on sys.path
        relpath = modname_to_relpath(modname,
                                     project_root=base,
                                     add_init=False)
        abspath = base.joinpath(relpath)
        return import_module_at_path(modname, abspath)

    foo = _import('foo')
    assert isinstance(foo, ModuleType)

    foo_features = _import('foo.features')
    assert isinstance(foo_features, ModuleType)

    collect_contrib_features = foo_features.collect_contrib_features
    features = collect_contrib_features()
    # no features at first
    assert len(features) == 0

    # first providing a mock feature, call build
    with patch.object(
        foo_features, 'collect_contrib_features',
        return_value=[Feature(input='A_1', transformer=IdentityTransformer())]
    ):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        out = foo_features.build(X_df=X_df, y_df=[])
        # only the single mocked feature is applied -> one output column
        assert np.shape(out.X) == (5, 1)
        assert isinstance(out.mapper_X, FeatureEngineeringPipeline)

    # write a new version of foo.load_data.load_data
    new_load_data_str = get_source(load_regression_data)

    p = base.joinpath(modname, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(pr=None):
        # simulate the Travis environment variables that ballet's CI
        # detection reads; pr=None mimics a push build, pr=N a PR build
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if pr is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha, repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            envvars['TRAVIS_PULL_REQUEST'] = str(pr)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name,
                repo.commit('pull/{pr}'.format(pr=pr)).hexsha)

        with patch.dict(os.environ, envvars):
            # raises CalledProcessError if validation fails
            cmd = 'ballet validate -A'
            check_call(cmd, cwd=safepath(base), env=os.environ)

    call_validate_all()

    # branch to a fake PR and write a new feature
    contrib_dir = base.joinpath(modname, 'features', 'contrib')
    logger.info('Switching to pull request 1, User Bob, Feature A')
    switch_to_new_branch(repo, 'pull/1')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # call different validation routines
    logger.info('Validating pull request 1, User Bob, Feature A')
    call_validate_all(pr=1)

    # merge PR with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge('pull/1', no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    logger.info('Switching to pull request 2, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, 'pull/2')
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # if we expect this feature to fail
    with pytest.raises(CalledProcessError):
        logger.info('Validating pull request 2, User Charlie, Feature Z_1')
        call_validate_all(pr=2)

    # write another new feature - redudancy
    repo.git.checkout('master')
    switch_to_new_branch(repo, 'pull/3')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # duplicate of the already-merged A_0 feature must be rejected
    with pytest.raises(CalledProcessError):
        call_validate_all(pr=3)
示例#23
0
 def setUp(self):
     """Provide a simple column name and identity transformer for each test."""
     self.transformer = IdentityTransformer()
     self.input = 'foo'