def test_preprocessing():
    """Tests feature preprocessing"""
    base_prob = mock_problem()
    base_prob.features.append('discrete_feat')

    # Derive a problem with a single discrete feature perfectly correlated with the label
    df = pd.DataFrame(base_prob.dataframe, copy=True)
    df['discrete_feat'] = 'negative'
    df['discrete_feat'].values[base_prob.y == 1] = 'positive'

    # Verify that a manual upfront vectorize is equivalent to passing a vectorizer as the
    # preprocess step to SelectAndClassify
    prob = base_prob.set_data(df)
    vectorized_prob = ProblemVectorizer().fit_apply(prob)

    baseline_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                            preprocess=None)
    preprocess_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                              preprocess=ProblemVectorizer())

    # First make sure that the baseline classifier cannot be fit on the unvectorized data
    nose.tools.assert_raises(ValueError, lambda: baseline_classifier.fit_apply(prob))

    baseline_scores = baseline_classifier.fit_apply(vectorized_prob)
    preprocess_scores = preprocess_classifier.fit_apply(prob)
    np.testing.assert_allclose(baseline_scores, preprocess_scores)
def test_no_column_overwrite():
    """Validates that we don't overwrite input values if the input contains NaNs in discrete columns"""
    df = pd.DataFrame({
        'A': ['a', 'aa', float('nan')],
        'B': ['b', 'bb', 'bbb'],
        'y': [0, 1, 1]
    })
    prob = Problem(df, ['A', 'B'], 'y', 1)
    vec = ProblemVectorizer()
    vec_prob = vec.fit_apply(prob, keep_discrete_columns=True)
    vec_df = vec_prob.dataframe

    nose.tools.assert_list_equal(sorted(vec_prob.features),
                                 ['A=a', 'A=aa', 'B=b', 'B=bb', 'B=bbb'])
    nose.tools.assert_list_equal(list(vec_df['A=a']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['A=aa']), [0, 1, 0])
    nose.tools.assert_list_equal(list(vec_df['B=b']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bb']), [0, 1, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bbb']), [0, 0, 1])

    # Original input columns shouldn't have changed.
    #
    # In the initial implementation, this test failed for column 'A'. This happened
    # because scikit's vectorizer creates an all-zero column with the exact same name if the
    # input is discrete and contains NaNs, which causes the original values to be overwritten.
    nose.tools.assert_list_equal(list(vec_df['A']), list(df['A']))
    nose.tools.assert_list_equal(list(vec_df['B']), list(df['B']))
    nose.tools.assert_list_equal(
        sorted(vec_df.columns),
        sorted(['A', 'A=a', 'A=aa', 'B', 'B=b', 'B=bb', 'B=bbb', 'y']))
def test_vectorize():
    """Tests ProblemVectorizer in problem.py"""
    def assert_equal_with_nans(lst_a, lst_b):
        """Checks that lists a and b are equal with nose.tools.eq_

        Lists must be of equal length. NaNs are handled specially with isnan2.

        :param lst_a: List 1
        :param lst_b: List 2
        """
        nose.tools.eq_(len(lst_a), len(lst_b))
        for x, y in zip(lst_a, lst_b):
            # Special handling for NaNs
            if isnan2(x) and isnan2(y):
                continue
            if isinstance(x, float) and isinstance(y, float):
                nose.tools.assert_almost_equal(x, y)
            else:
                nose.tools.eq_(x, y)

    vec = ProblemVectorizer(['gene1'], ['gene2'])
    assert_equal_with_nans(
        vec.preprocess_numeric([1.0, 100, 2.3, 'missing', -8, 'nul']),
        [1.0, 100, 2.3, float('nan'), -8, float('nan')])
    assert_equal_with_nans(
        vec.preprocess_discrete(
            ['gene1', 'disease1', 100, -2.1, 'missing', 'vector', 'attribute'], 'unknown'),
        ['gene1', 'disease1', 'unknown=100', 'unknown=-2.1', 'missing', 'vector', 'attribute'])

    # Test vectorize end-to-end
    prob = mock_badvector_problem()
    vec_prob = vec.fit_apply(prob)

    # The vectorizer pipeline imputes NaNs with the column average of the observed values:
    # (0.0 + 0.6 + 0.8) / 3 = 1.4 / 3.0 in our case
    assert_equal_with_nans(list(vec_prob.data.dataframe['gene1'].values),
                           [0.0, 1.4 / 3.0, 1.4 / 3.0, 0.6, 0.8])
    assert_equal_with_nans(list(vec_prob.data.dataframe['gene2=a'].values), [1, 0, 0, 0, 0])
    assert_equal_with_nans(list(vec_prob.data.dataframe['gene2=b'].values), [0, 1, 0, 0, 0])
    assert_equal_with_nans(list(vec_prob.data.dataframe['gene2=c'].values), [0, 0, 1, 0, 0])
    assert_equal_with_nans(list(vec_prob.data.dataframe['gene2=d'].values), [0, 0, 0, 0, 1])
    assert_equal_with_nans(
        list(vec_prob.data.dataframe['gene2=discrete=0.5'].values), [0, 0, 0, 1, 0])
def checkme(permissive_or_not, fail_or_pass, expected_numeric, expected_discrete, df_columns):
    """Utility: fits a ProblemVectorizer with the given column expectations on a problem
    built from df_columns and asserts that it either passes or raises ValueError"""
    assert permissive_or_not in {'permissive', 'strict'}
    assert fail_or_pass in {'fail', 'pass'}

    df = pd.DataFrame({col: list(range(10)) for col in df_columns})
    df['y'] = [0, 1] * 5
    prob = Problem(df, df_columns, 'y', 1)
    vec = ProblemVectorizer(expected_numeric=expected_numeric,
                            expected_discrete=expected_discrete,
                            permissive=(permissive_or_not == 'permissive'))

    if fail_or_pass == 'pass':
        vec.fit_apply(prob)
    else:
        nose.tools.assert_raises(ValueError, lambda: vec.fit_apply(prob))
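# Illustrative (hypothetical) invocations of the utility above; the feature names are made up
# and the actual parametrized test cases are not part of this excerpt. The pairing assumes that
# a strict vectorizer raises ValueError for a column outside expected_numeric/expected_discrete,
# while a permissive one accepts it:
#
#     checkme('strict', 'fail', ['num_feat'], [], ['num_feat', 'surprise_feat'])
#     checkme('permissive', 'pass', ['num_feat'], [], ['num_feat', 'surprise_feat'])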
def make_learning_approaches(self):
    """Creates LearningApproaches from the learner options"""
    for k in self.n_features:
        for cls in self.classifiers:
            yield SelectAndClassify(SelectKBest(k=k), cls,
                                    name='SelectKBest(k={}) -> {}'.format(k, cls.__class__.__name__),
                                    preprocess=ProblemVectorizer())
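# Hypothetical sketch of how the generated approaches might be consumed; the learner_options
# object and the problem variable are assumptions, not part of this excerpt:
#
#     for approach in learner_options.make_learning_approaches():
#         scores = approach.fit_apply(problem)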
def mock_model():
    """Creates a simple mock model for testing"""
    prob = mock_problem()
    logit = SelectAndClassify(selector=None,
                              classifier=LogisticRegression(),
                              preprocess=ProblemVectorizer(),
                              name="test model").fit(prob)
    return ClassificationModel(logit, prob)
def checkme(working_dir, n_samples, n_features, k, make_classifier, test_vectorize):
    """Utility: fits a model, writes it to disk, reads it back, and checks that the
    reconstituted model validates and produces the same predictions"""
    assert n_samples % 4 == 0
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem(n_samples=n_samples, n_features=n_features)

    if test_vectorize:
        df = prob.dataframe
        df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
        df['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * int(n_samples / 4)
        df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(n_samples / 4)
        prob = Problem(df,
                       prob.features + ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                       prob.outcome_column, prob.positive_outcome)
        preprocess = ProblemVectorizer()
    else:
        preprocess = None

    approach = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                 preprocess=preprocess).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    reconstituted_model = ClassificationModel.read(model_path)

    model.validate()
    reconstituted_model.validate()

    np.testing.assert_array_equal(model.approach.apply(prob),
                                  reconstituted_model.approach.apply(prob))

    if preprocess is not None:
        approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
        approach_with_pipeline = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                                   preprocess=approach_pipeline).fit(prob)

        # Test approach serialization with Pipeline from learners.py
        model_with_pipeline = ClassificationModel(approach_with_pipeline, prob)
        model_path2 = os.path.join(working_dir, 'model2.txt')
        model_with_pipeline.write(model_path2)
        reconstituted_model2 = ClassificationModel.read(model_path2)
        reconstituted_model2.validate()
        np.testing.assert_array_almost_equal(model.approach.apply(prob),
                                             reconstituted_model2.approach.apply(prob), 14)
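# Hypothetical driver for the serialization round-trip check above; the temporary directory
# and the specific parameter values are illustrative only and not part of this excerpt:
#
#     with tempfile.TemporaryDirectory() as working_dir:
#         checkme(working_dir, n_samples=16, n_features=5, k=3,
#                 make_classifier=LogisticRegression, test_vectorize=True)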