示例#1
0
def test_feature_init_invalid_transformer_api(inputs):
    """Feature construction must raise ValueError for unusable transformers.

    Two bad candidates are tried in turn: a plain ``object()`` and the
    ``IdentityTransformer`` class itself (passed without instantiating it).
    """
    feature_input, _ = inputs

    for bad_transformer in (object(), IdentityTransformer):
        with pytest.raises(ValueError):
            Feature(feature_input, bad_transformer)
示例#2
0
def test_discover(sample_data, expensive_stats):
    """Exercise ``discover`` on two features: full run, filtered, and no data.

    Checks the set of summary columns and the row count of the returned
    frame, repeats the call with an ``input`` filter, and finally calls it
    with no data at all (after clearing the summary cache).
    """
    features = [
        Feature('size',
                NullFiller(0),
                source='foo.features.contrib.user_a.feature_1'),
        Feature('strength',
                NullFiller(100),
                source='foo.features.contrib.user_b.feature_1')
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; np.asarray with a float dtype is
    # the equivalent call and works on both old and new NumPy.
    y = np.asarray(y_df, dtype=float)

    df = discover(features, X_df, y_df, y, expensive_stats=expensive_stats)

    expected_cols = {
        'name',
        'description',
        'input',
        'transformer',
        'primitives',
        'output',
        'author',
        'source',
        'mutual_information',
        'conditional_mutual_information',
        'ninputs',
        'nvalues',
        'ncontinuous',
        'ndiscrete',
        'mean',
        'std',
        'variance',
        'min',
        'median',
        'max',
        'nunique',
    }
    actual_cols = df.columns
    # An empty symmetric difference means the two column sets are identical.
    assert not expected_cols.symmetric_difference(actual_cols)

    assert df.shape[0] == len(features)

    # test filter (local is named input_filter so the builtin ``input`` is
    # not shadowed; the keyword passed to discover is still ``input``)
    input_filter = 'size'
    discovery_df = discover(features, X_df, y_df, y, input=input_filter)
    assert discovery_df.shape[0] == len([
        feature for feature in features
        if feature.input == input_filter or input_filter in feature.input
    ])

    # test no data available
    # have to clear cache, as values on data already known
    ballet.discovery._summarize_feature.memory.clear()
    discovery_df = discover(features, None, None, None)
    assert discovery_df.shape[0] == len(features)
    actual_cols = discovery_df.columns
    assert not expected_cols.symmetric_difference(actual_cols)
    # Without data the 'mean' statistic is expected to be NaN.
    assert np.isnan(discovery_df['mean'].at[0])
示例#3
0
    def test_init(self):
        """A GFSSFAccepter can be built from one existing feature and a candidate."""  # noqa
        existing = Feature(
            input='A_0',
            transformer=IdentityTransformer(),
            source='1st Feature',
        )
        candidate = Feature(
            input='Z_0',
            transformer=IdentityTransformer(),
            source='2nd Feature',
        )

        accepter = GFSSFAccepter(self.X, self.y, [existing], candidate)

        self.assertIsNotNone(accepter)
示例#4
0
def test_can_deepcopy():
    """Deep-copying a pipeline keeps its ``_ballet_features`` attribute.

    Regression test; see GH 90.
    """
    original = FeatureEngineeringPipeline(
        Feature('size', IdentityTransformer()))
    assert hasattr(original, '_ballet_features')

    clone = deepcopy(original)
    assert hasattr(clone, '_ballet_features')
示例#5
0
 def test_producing_missing_values_fails(self):
     """An identity feature over data containing NaNs must fail the API check.

     The failure list is expected to name ``NoMissingValuesCheck``.
     """
     # Precondition: the fixture data really does contain missing values.
     assert has_nans(self.X)

     passthrough = Feature(input='size', transformer=IdentityTransformer())
     outcome = check_from_class(
         FeatureApiCheck, passthrough, self.X, self.y)
     valid, failures = outcome

     self.assertFalse(valid)
     self.assertIn(NoMissingValuesCheck.__name__, failures)
示例#6
0
def test_feature_pipeline(inputs):
    """``Feature.pipeline`` gives a FeatureEngineeringPipeline, reused on re-access."""  # noqa
    feature_input, feature_transformer = inputs
    feature = Feature(feature_input, feature_transformer)

    first = feature.pipeline
    assert isinstance(first, FeatureEngineeringPipeline)

    # A second attribute access must hand back the very same object.
    second = feature.pipeline
    assert first is second
示例#7
0
def test_transform(input, transformer):
    """Fit and transform a 5x2 frame; the result must keep shape (5, 2)."""
    pipeline = FeatureEngineeringPipeline(Feature(input, transformer))

    # NOTE(review): pd.util.testing is deprecated in newer pandas releases --
    # confirm the pinned pandas version still provides this helper.
    frame = pd.util.testing.makeCustomDataframe(5, 2)
    frame.columns = ['foo', 'bar']

    pipeline.fit(frame)
    transformed = pipeline.transform(frame)

    assert np.shape(transformed) == (5, 2)
示例#8
0
 def test_transform(self):
     """Fit and transform a 5x2 frame; the result must have shape (5, 1)."""
     pipeline = FeatureEngineeringPipeline(
         Feature(self.input, self.transformer))

     # NOTE(review): pd.util.testing is deprecated in newer pandas releases --
     # confirm the pinned pandas version still provides this helper.
     frame = pd.util.testing.makeCustomDataframe(5, 2)
     frame.columns = ['foo', 'bar']

     pipeline.fit(frame)
     transformed = pipeline.transform(frame)

     self.assertEqual(np.shape(transformed), (5, 1))
示例#9
0
def test_gfssf_pruner_keep_relevant(sample_data):
    """The pruner must not drop an accepted feature built on a different input.

    ``feature_1`` (input ``A_0``) is the already-accepted feature and
    ``feature_2`` (input ``Z_0``) is the newly added one; ``feature_1`` is
    expected to be absent from the pruner's result.
    """
    X_df, y_df, y = sample_data

    feature_1 = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature')
    feature_2 = Feature(
        input='Z_0',
        transformer=IdentityTransformer(),
        source='2nd Feature')
    gfssf_pruner = GFSSFPruner(
        X_df, y_df, X_df, y, [feature_1], feature_2)

    redundant_features = gfssf_pruner.prune()
    # The original failure message stated the opposite of what is asserted.
    assert feature_1 not in redundant_features, \
        'Still relevant features should not be pruned'
示例#10
0
def test_gfssf_pruner_prune_exact_replicas(sample_data):
    """An accepted feature duplicated exactly by a new one must be pruned.

    Both features use input ``A_0`` with an identity transformer and differ
    only in their ``source`` label.
    """
    X_df, y_df, y = sample_data

    accepted = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature',
    )
    replica = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='2nd Feature',
    )

    pruner = GFSSFPruner(X_df, y_df, X_df, y, [accepted], replica)
    pruned = pruner.prune()

    assert accepted in pruned, 'Exact replica features should be pruned'
示例#11
0
 def test_bad_feature_transform_errors(self):
     """A feature whose transformer throws errors must fail the API check.

     The failure list is expected to name ``CanTransformCheck``.
     """
     # presumably FragileTransformer raises one of `errors` whenever one of
     # `conditions` holds -- verify against its definition
     conditions = (lambda x: True, )
     errors = (RuntimeError, )
     fragile = FragileTransformer(conditions, errors)
     feature = Feature(input='size', transformer=fragile)

     valid, failures = check_from_class(
         FeatureApiCheck, feature, self.X, self.y)

     self.assertFalse(valid)
     self.assertIn(CanTransformCheck.__name__, failures)
示例#12
0
def test_mutual_information_accepter_nans(handle_nan_targets, expected):
    """Judge an identity feature when the first target value is NaN.

    The verdict must equal ``expected`` for the given ``handle_nan_targets``
    setting.
    """
    X_df = pd.DataFrame({'A': [1, 2, 3]})
    # Column-vector target whose first entry is missing.
    y = np.array([np.nan, 2, 3]).reshape(-1, 1)

    candidate = Feature(input='A', transformer=IdentityTransformer())
    accepter = MutualInformationAccepter(
        X_df, y, X_df, y, [], candidate,
        handle_nan_targets=handle_nan_targets,
    )

    verdict = accepter.judge()
    assert expected == verdict
示例#13
0
def test_gfssf_accepter_init(sample_data):
    """A GFSSFAccepter can be built from one existing feature and a candidate."""  # noqa
    X_df, y_df, y = sample_data

    existing = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature',
    )
    candidate = Feature(
        input='Z_0',
        transformer=IdentityTransformer(),
        source='2nd Feature',
    )

    accepter = GFSSFAccepter(X_df, y_df, X_df, y, [existing], candidate)

    assert accepter is not None
示例#14
0
def test_bad_feature_input(sample_data):
    """A feature whose ``input`` is the integer 3 must fail the API check.

    The failure list is expected to name ``HasCorrectInputTypeCheck``.
    """
    imputer = SimpleImputer()
    bad_feature = Feature(input=3, transformer=imputer)  # bad input

    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, bad_feature, sample_data.X, sample_data.y)

    assert not valid
    assert HasCorrectInputTypeCheck.__name__ in failed_checks
示例#15
0
def test_good_feature(sample_data):
    """An imputed ``size`` feature passes the API check with no failures."""
    imputer = SimpleImputer()
    good_feature = Feature(input='size', transformer=imputer)

    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, good_feature, sample_data.X, sample_data.y)

    assert valid
    assert len(failed_checks) == 0
示例#16
0
    def test_good_feature(self):
        """An imputed ``size`` feature passes the API check with no failures."""  # noqa
        imputer = SimpleImputer()
        good_feature = Feature(input='size', transformer=imputer)

        outcome = check_from_class(
            FeatureApiCheck, good_feature, self.X, self.y)
        valid, failed_checks = outcome

        self.assertTrue(valid)
        self.assertEqual(len(failed_checks), 0)
示例#17
0
def test_producing_missing_values_fails(sample_data):
    """An identity feature over data containing NaNs must fail the API check.

    The failure list is expected to name ``NoMissingValuesCheck``.
    """
    # Precondition: the fixture data really does contain missing values.
    assert has_nans(sample_data.X)

    passthrough = Feature(input='size', transformer=IdentityTransformer())
    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, passthrough, sample_data.X, sample_data.y)

    assert not valid
    assert NoMissingValuesCheck.__name__ in failed_checks
示例#18
0
 def test_bad_feature_input(self):
     """A feature whose ``input`` is the integer 3 must fail the API check.

     The failure list is expected to name ``HasCorrectInputTypeCheck``.
     """
     imputer = SimpleImputer()
     bad_feature = Feature(input=3, transformer=imputer)  # bad input

     outcome = check_from_class(
         FeatureApiCheck, bad_feature, self.X, self.y)
     valid, failed_checks = outcome

     self.assertFalse(valid)
     self.assertIn(HasCorrectInputTypeCheck.__name__, failed_checks)
示例#19
0
def test_bad_feature_transform_errors(sample_data):
    """A feature whose transformer throws errors must fail the API check.

    The failure list is expected to name ``CanTransformCheck``.
    """
    # presumably FragileTransformer raises one of `errors` whenever one of
    # `conditions` holds -- verify against its definition
    conditions = (lambda x: True, )
    errors = (RuntimeError, )
    fragile = FragileTransformer(conditions, errors)
    feature = Feature(input='size', transformer=fragile)

    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)

    assert not valid
    assert CanTransformCheck.__name__ in failed_checks
示例#20
0
def test_discover_feature_error(sample_data):
    """``discover`` still returns one row per feature for a fragile feature.

    The single feature wraps a default-constructed ``FragileTransformer``;
    the resulting row is expected to carry NaN in the ``mean`` column.
    """
    features = [
        Feature('size', FragileTransformer()),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; np.asarray with a float dtype is
    # the equivalent call and works on both old and new NumPy.
    y = np.asarray(y_df, dtype=float)

    discovery_df = discover(features, X_df, y_df, y)

    assert discovery_df.shape[0] == len(features)
    assert np.isnan(discovery_df['mean'].at[0])
示例#21
0
def test_robust_transformer_desugar():
    """Mixed transformer shorthands "desugar" into a TransformerPipeline.

    The list mixes ``None``, a transformer instance, a plain callable, a
    ``Feature`` and two ``(input, transformer)`` tuples, one of which holds a
    nested list.
    """
    identity = IdentityTransformer()
    callable_step = lambda x: x  # noqa: E731
    nested_feature = Feature('A', IdentityTransformer())
    simple_pair = ('A', IdentityTransformer())
    nested_pair = ('A', [None, IdentityTransformer()])

    steps = [
        None,
        identity,
        callable_step,
        nested_feature,
        simple_pair,
        nested_pair,
    ]

    desugared = make_robust_transformer(steps)
    assert isinstance(desugared, TransformerPipeline)
示例#22
0
def test_df_colnames(input, transformer, output):
    """All names in ``transformed_names_`` must be strings.

    They are applied as column labels to a DataFrame built from the
    fit-transformed matrix, and every label is then type-checked.
    """
    pipeline = FeatureEngineeringPipeline(
        Feature(input, transformer, output=output))

    # NOTE(review): pd.util.testing is deprecated in newer pandas releases --
    # confirm the pinned pandas version still provides this helper.
    entities = pd.util.testing.makeCustomDataframe(5, 2)
    entities.columns = ['foo', 'bar']

    matrix = pipeline.fit_transform(entities)
    labelled = pd.DataFrame(
        matrix,
        columns=pipeline.transformed_names_,
        index=entities.index,
    )

    is_string = fy.isa(str)
    assert fy.all(is_string, labelled.columns)
示例#23
0
def test_variance_threshold_accepter(mock_var, sample_data):
    """The variance-threshold accepter is expected to reject the ``A_0`` feature.

    ``mock_var`` is not used in the body; presumably it is injected by a
    patch decorator that is outside this view.
    """  # noqa
    X_df, y_df, y = sample_data

    candidate = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature',
    )
    accepter = VarianceThresholdAccepter(X_df, y_df, X_df, y, [], candidate)

    verdict = accepter.judge()
    expected = False

    assert expected == verdict
示例#24
0
 def test_bad_feature_deepcopy_fails(self):
     """A feature whose transformer cannot be deep-copied must fail the check.

     The failure list is expected to name ``CanDeepcopyCheck``.
     """
     class _CopyFailsTransformer(IdentityTransformer):
         """Identity transformer whose deepcopy hook always raises."""

         def __deepcopy__(self, memo):
             raise RuntimeError

     uncopyable = _CopyFailsTransformer()
     feature = Feature(input='size', transformer=uncopyable)

     outcome = check_from_class(
         FeatureApiCheck, feature, self.X, self.y)
     valid, failed_checks = outcome

     self.assertFalse(valid)
     self.assertIn(CanDeepcopyCheck.__name__, failed_checks)
示例#25
0
def test_bad_feature_deepcopy_fails(sample_data):
    """A feature whose transformer cannot be deep-copied must fail the check.

    The failure list is expected to name ``CanDeepcopyCheck``.
    """
    class _CopyFailsTransformer(IdentityTransformer):
        """Identity transformer whose deepcopy hook always raises."""

        def __deepcopy__(self, memo):
            raise RuntimeError

    uncopyable = _CopyFailsTransformer()
    feature = Feature(input='size', transformer=uncopyable)

    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)

    assert not valid
    assert CanDeepcopyCheck.__name__ in failed_checks
示例#26
0
def test_mutual_information_accepter(_, sample_data):
    """The mutual-information accepter is expected to accept the ``A_0`` feature.

    The first parameter is unused; presumably it receives a mock from a
    patch decorator that is outside this view.
    """  # noqa
    X_df, y_df, y = sample_data

    candidate = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='1st Feature',
    )
    accepter = MutualInformationAccepter(X_df, y_df, X_df, y, [], candidate)

    verdict = accepter.judge()
    expected = True

    assert expected == verdict
示例#27
0
def test_gfssf_pruner_prune_weak_replicas(sample_data):
    """A noisy copy of ``A_0`` must be pruned once the clean copy is added.

    The accepted feature adds Gaussian noise (mean 0, std 0.5) to ``A_0``;
    the new feature passes ``A_0`` through unchanged.
    """
    X_df, y_df, y = sample_data

    def add_noise(X):
        # NOTE(review): noise is drawn from the unseeded global NumPy RNG, so
        # the exact values differ between runs.
        values = asarray2d(X)
        noise = np.random.normal(0, 0.5, values.shape)
        return values + noise

    noisy = Feature(
        input='A_0',
        transformer=SimpleFunctionTransformer(add_noise),
        source='1st Feature',
    )
    clean = Feature(
        input='A_0',
        transformer=IdentityTransformer(),
        source='2nd Feature',
    )

    pruner = GFSSFPruner(X_df, y_df, X_df, y, [noisy], clean)
    pruned = pruner.prune()

    assert noisy in pruned, 'Noisy features should be pruned'
示例#28
0
def test_discover_target_nans(sample_data):
    """Target-dependent statistics are still computed when the target has a NaN.

    The first target value is overwritten with NaN before calling
    ``discover``; the ``mutual_information`` column must contain no NaN.
    """  # noqa
    features = [
        Feature('size', NullFiller(0)),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; np.asarray with a float dtype is
    # the equivalent call and works on both old and new NumPy.
    y = np.asarray(y_df, dtype=float)

    # introduce nan to target
    y[0] = np.nan

    discovery_df = discover(features, X_df, y_df, y)

    # stats with target should still be computed
    assert not np.isnan(discovery_df['mutual_information']).any()
示例#29
0
def test_variance_threshold_accepter_feature_group():
    """A two-column identity feature over ``np.eye(2)`` is expected to be accepted."""  # noqa
    # variance is 0.25 per column, > 0.05 threshold
    frame = pd.DataFrame(np.eye(2))
    target = None

    group_feature = Feature(
        input=[0, 1],
        transformer=IdentityTransformer(),
        source='1st Feature',
    )
    accepter = VarianceThresholdAccepter(
        frame, target, frame, target, [], group_feature)

    verdict = accepter.judge()
    expected = True

    assert expected == verdict
示例#30
0
def test_bad_feature_wrong_transform_length(sample_data):
    """A transformer returning one row too many must fail the API check.

    The failure list is expected to name ``HasCorrectOutputDimensionsCheck``.
    """
    class _WrongLengthTransformer(BaseTransformer):
        """Returns an integer ramp with one more row than its input."""

        def transform(self, X, **transform_kwargs):
            # Same trailing dimensions as X, but the first axis grows by one.
            grown_shape = (X.shape[0] + 1,) + tuple(X.shape[1:])
            n_values = np.prod(grown_shape)
            return np.arange(n_values).reshape(grown_shape)

    # doesn't return correct length
    misbehaving = _WrongLengthTransformer()
    feature = Feature(input='size', transformer=misbehaving)

    valid, failed_checks, _advice = check_from_class(
        FeatureApiCheck, feature, sample_data.X, sample_data.y)

    assert not valid
    assert HasCorrectOutputDimensionsCheck.__name__ in failed_checks