def test_preprocessing():
    """Checks that passing a vectorizer as `preprocess` matches vectorizing up front.

    A discrete feature that is perfectly correlated with the label is added to a copy of
    the mock problem's dataframe. The test then verifies that:

    * a SelectAndClassify without preprocessing raises ValueError on the unvectorized
      problem, and
    * its scores on a manually vectorized problem are close to the scores of a
      SelectAndClassify that was given a ProblemVectorizer as its preprocess step.
    """
    base_prob = mock_problem()
    base_prob.features.append('discrete_feat')

    # Derive a problem with a single discrete feature perfectly correlated with the label
    df = pd.DataFrame(base_prob.dataframe, copy=True)
    df['discrete_feat'] = 'negative'
    # Assign through .loc instead of `df[col].values[mask] = ...`: writing into the array
    # returned by `.values` is not guaranteed to reach the frame (it may be a copy, and it
    # is read-only under pandas Copy-on-Write).
    positive_mask = np.asarray(base_prob.y) == 1
    df.loc[positive_mask, 'discrete_feat'] = 'positive'

    # Verify that a manual upfront vectorize is equivalent to passing a vectorizer as the preprocess step
    # to SelectAndClassify
    prob = base_prob.set_data(df)
    vectorized_prob = ProblemVectorizer().fit_apply(prob)

    baseline_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), preprocess=None)
    preprocess_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                              preprocess=ProblemVectorizer())

    # First make sure that the baseline classifier cannot be fit on the unvectorized data
    nose.tools.assert_raises(ValueError, lambda: baseline_classifier.fit_apply(prob))

    baseline_scores = baseline_classifier.fit_apply(vectorized_prob)
    preprocess_scores = preprocess_classifier.fit_apply(prob)

    np.testing.assert_allclose(baseline_scores, preprocess_scores)
def test_no_column_overwrite():
    """Ensures discrete input columns containing NaNs survive vectorization unchanged.

    The problem is vectorized with keep_discrete_columns=True, so the output frame is
    expected to hold both the original discrete columns and their indicator columns.
    """
    source_df = pd.DataFrame({
        'A': ['a', 'aa', float('nan')],
        'B': ['b', 'bb', 'bbb'],
        'y': [0, 1, 1]
    })
    prob = Problem(source_df, ['A', 'B'], 'y', 1)
    vectorizer = ProblemVectorizer()

    vectorized = vectorizer.fit_apply(prob, keep_discrete_columns=True)
    result_df = vectorized.dataframe

    # Only the indicator columns are features; the NaN in 'A' gets no indicator of its own.
    nose.tools.assert_list_equal(sorted(vectorized.features),
                                 ['A=a', 'A=aa', 'B=b', 'B=bb', 'B=bbb'])

    expected_indicators = [
        ('A=a', [1, 0, 0]),
        ('A=aa', [0, 1, 0]),
        ('B=b', [1, 0, 0]),
        ('B=bb', [0, 1, 0]),
        ('B=bbb', [0, 0, 1]),
    ]
    for column, expected in expected_indicators:
        nose.tools.assert_list_equal(list(result_df[column]), expected)

    # Original input columns shouldn't have changed.
    #
    # In the initial implementation, this check failed for column 'A'. This happened
    # because scikit's vectorizer creates an all-zero column with the exact same name if the input is
    # discrete and contains NaNs, which causes the original values to be overwritten.
    for column in ('A', 'B'):
        nose.tools.assert_list_equal(list(result_df[column]), list(source_df[column]))

    all_expected_columns = ['A', 'A=a', 'A=aa', 'B', 'B=b', 'B=bb', 'B=bbb', 'y']
    nose.tools.assert_list_equal(sorted(result_df.columns), sorted(all_expected_columns))
def test_vectorize():
    """\
    Exercises ProblemVectorizer: numeric preprocessing, discrete preprocessing,
    and an end-to-end fit_apply on the bad-vector mock problem

    """
    def assert_equal_with_nans(lst_a, lst_b):
        """\
        Asserts that two lists have the same length and element-wise equal contents.
        A pair in which both elements are NaN according to isnan2 counts as equal.
        Pairs of floats are compared with assert_almost_equal, all other pairs with eq_.

        :param lst_a: List 1
        :param lst_b: List 2
        """
        nose.tools.eq_(len(lst_a), len(lst_b))
        for left, right in zip(lst_a, lst_b):
            # NaN never compares equal to itself, so matching NaNs are accepted explicitly
            if isnan2(left) and isnan2(right):
                continue
            both_floats = isinstance(left, float) and isinstance(right, float)
            compare = nose.tools.assert_almost_equal if both_floats else nose.tools.eq_
            compare(left, right)

    vec = ProblemVectorizer(['gene1'], ['gene2'])

    numeric_input = [1.0, 100, 2.3, 'missing', -8, 'nul']
    numeric_expected = [1.0, 100, 2.3, float('nan'), -8, float('nan')]
    assert_equal_with_nans(vec.preprocess_numeric(numeric_input), numeric_expected)

    discrete_input = ['gene1', 'disease1', 100, -2.1, 'missing', 'vector', 'attribute']
    discrete_expected = ['gene1', 'disease1', 'unknown=100', 'unknown=-2.1', 'missing',
                         'vector', 'attribute']
    assert_equal_with_nans(vec.preprocess_discrete(discrete_input, 'unknown'),
                           discrete_expected)

    # test vectorize end-to-end
    prob = mock_badvector_problem()
    vec_prob = vec.fit_apply(prob)
    result_df = vec_prob.data.dataframe

    expected_columns = [
        # the vectorizer pipeline converts NaN to column average: 1.4 / 3.0 in our case
        ('gene1', [0.0, 1.4 / 3.0, 1.4 / 3.0, 0.6, 0.8]),
        ('gene2=a', [1, 0, 0, 0, 0]),
        ('gene2=b', [0, 1, 0, 0, 0]),
        ('gene2=c', [0, 0, 1, 0, 0]),
        ('gene2=d', [0, 0, 0, 0, 1]),
        ('gene2=discrete=0.5', [0, 0, 0, 1, 0]),
    ]
    for column, expected in expected_columns:
        assert_equal_with_nans(list(result_df[column].values), expected)
 def checkme(permissive_or_not, fail_or_pass, expected_numeric,
             expected_discrete, df_columns):
     """Fits a ProblemVectorizer on a small synthetic problem and checks the outcome.

     Every column in df_columns holds the integers 0..9 and the label column 'y'
     alternates between 0 and 1.

     :param permissive_or_not: 'permissive' or 'strict'; sets the vectorizer's permissive flag
     :param fail_or_pass: 'pass' if fit_apply should succeed, 'fail' if it should raise ValueError
     :param expected_numeric: forwarded to ProblemVectorizer as expected_numeric
     :param expected_discrete: forwarded to ProblemVectorizer as expected_discrete
     :param df_columns: names of the feature columns to generate
     """
     assert permissive_or_not in {'permissive', 'strict'}
     assert fail_or_pass in {'fail', 'pass'}

     frame = pd.DataFrame({name: list(range(10)) for name in df_columns})
     frame['y'] = [0, 1] * 5
     problem = Problem(frame, df_columns, 'y', 1)

     is_permissive = permissive_or_not == 'permissive'
     vectorizer = ProblemVectorizer(expected_numeric=expected_numeric,
                                    expected_discrete=expected_discrete,
                                    permissive=is_permissive)

     if fail_or_pass != 'pass':
         nose.tools.assert_raises(ValueError, vectorizer.fit_apply, problem)
         return
     vectorizer.fit_apply(problem)
示例#5
0
 def make_learning_approaches(self):
     """Yields a SelectAndClassify for every pairing of self.n_features and self.classifiers.

     Each approach uses SelectKBest with the given k as selector, a new
     ProblemVectorizer as preprocess step, and a name built from k and the
     classifier's class name. All classifiers are visited for one k before moving
     on to the next k.
     """
     name_template = 'SelectKBest(k={}) -> {}'
     for n_selected in self.n_features:
         for classifier in self.classifiers:
             classifier_label = classifier.__class__.__name__
             approach = SelectAndClassify(
                 SelectKBest(k=n_selected),
                 classifier,
                 name=name_template.format(n_selected, classifier_label),
                 preprocess=ProblemVectorizer())
             yield approach
def mock_model():
    """Builds a ClassificationModel for tests.

    The model wraps a SelectAndClassify (no selector, LogisticRegression classifier,
    ProblemVectorizer preprocessing, named "test model") fitted on the mock problem.
    """
    problem = mock_problem()
    approach = SelectAndClassify(
        selector=None,
        classifier=LogisticRegression(),
        preprocess=ProblemVectorizer(),
        name="test model",
    )
    # Use whatever fit() returns, exactly as a chained `.fit(problem)` call would
    fitted = approach.fit(problem)
    return ClassificationModel(fitted, problem)
示例#7
0
    def checkme(working_dir, n_samples, n_features, k, make_classifier,
                test_vectorize):
        """Round-trips a fitted ClassificationModel through write/read and compares outputs.

        A SelectAndClassify is fitted on a mock problem, wrapped in a ClassificationModel,
        written to 'model.txt' in working_dir and read back. Both models are validated and
        their approaches must produce equal output on the problem.

        When test_vectorize is true, two discrete columns and one continuous column with
        missing values are added to the problem and a ProblemVectorizer is used for
        preprocessing. The round trip is then repeated ('model2.txt') with the same
        vectorizer wrapped in an ApproachPipeline, and the output is compared against the
        first model's to 14 decimals.

        :param working_dir: directory the model files are written to
        :param n_samples: number of mock samples; must be a multiple of 4
        :param n_features: number of mock features
        :param k: passed to SelectKBest
        :param make_classifier: zero-argument callable returning a new classifier
        :param test_vectorize: whether to add columns that require vectorization
        """
        assert n_samples % 4 == 0
        model_path = os.path.join(working_dir, 'model.txt')
        prob = mock_problem(n_samples=n_samples, n_features=n_features)

        preprocess = None
        if test_vectorize:
            # Columns are added directly to the mock problem's dataframe
            frame = prob.dataframe
            halves = int(n_samples / 2)
            quarters = int(n_samples / 4)
            frame['discrete_1'] = ['foo', 'bar'] * halves
            frame['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * quarters
            frame['continuous_with_missing'] = [0, 1, 2, float('nan')] * quarters
            extra_features = ['discrete_1', 'discrete_2', 'continuous_with_missing']
            prob = Problem(frame, prob.features + extra_features,
                           prob.outcome_column, prob.positive_outcome)
            preprocess = ProblemVectorizer()

        selector = SelectKBest(k=k)
        approach = SelectAndClassify(selector, make_classifier(),
                                     preprocess=preprocess).fit(prob)
        model = ClassificationModel(approach, prob)

        model.write(model_path)
        reconstituted_model = ClassificationModel.read(model_path)

        model.validate()
        reconstituted_model.validate()

        original_output = model.approach.apply(prob)
        restored_output = reconstituted_model.approach.apply(prob)
        np.testing.assert_array_equal(original_output, restored_output)

        if preprocess is None:
            return

        # test approach serialization with Pipeline from learners.py
        pipeline = ApproachPipeline([('preprocess', preprocess)])
        pipeline_approach = SelectAndClassify(SelectKBest(k=k),
                                              make_classifier(),
                                              preprocess=pipeline).fit(prob)
        pipeline_model = ClassificationModel(pipeline_approach, prob)
        pipeline_model_path = os.path.join(working_dir, 'model2.txt')
        pipeline_model.write(pipeline_model_path)
        restored_pipeline_model = ClassificationModel.read(pipeline_model_path)
        restored_pipeline_model.validate()
        np.testing.assert_array_almost_equal(
            model.approach.apply(prob),
            restored_pipeline_model.approach.apply(prob), 14)