def test_null_feature_selector():
    """Checks that SelectAndClassify can be fitted with ``None`` as its selector.

    Two approaches are fitted on the same mock problem: one with no selector and
    one with ``SelectKBest(k='all')``. The test then compares their pipeline
    lengths, the logged selected features and the scores they produce.
    """
    def seeded_state():
        """Returns a fresh RandomState that always starts from the same seed"""
        return np.random.RandomState(0xC0FFEE)

    def fit_with(selector, problem):
        """Builds and fits an approach that differs only in the given selector"""
        learner = LogisticRegression(random_state=seeded_state())
        approach = SelectAndClassify(selector, learner,
                                     classifier_grid={'C': [0.5, 1.0]},
                                     random_state=seeded_state())
        return approach.fit(problem)

    prob = mock_problem()

    # Passing no selector at all and keeping every feature via k='all' are
    # expected to give the same predictions
    without_selector = fit_with(None, prob)
    with_select_all = fit_with(SelectKBest(k='all'), prob)

    # The selector-free model should have exactly one pipeline step fewer
    n_steps_without = len(without_selector.model.steps)
    n_steps_with = len(with_select_all.model.steps)
    nose.tools.eq_(n_steps_without, n_steps_with - 1)

    # Every feature of the problem should still be reported as selected
    nose.tools.assert_list_equal(without_selector.selected_features, prob.features)

    # With k='all' nothing is filtered out, so both approaches must score alike
    scores_without = without_selector.apply(prob)
    scores_with = with_select_all.apply(prob)
    np.testing.assert_allclose(scores_without, scores_with)
# Example no. 2
# 0
def test_predict_tool(working_dir):
    """Runs the predict command line entry point against a saved model.

    A classifier is fitted on mocked input, written to disk, and then
    ``predict.main`` is invoked on the same input file. The scores written by
    the tool are compared with those returned by ``classifier.apply``.

    :param working_dir: directory in which the mock input and outputs are created
    """
    output_dir = os.path.join(working_dir, 'learn_output')
    model_file = os.path.join(output_dir, 'model.txt')
    predictions_file = os.path.join(output_dir, 'predictions.txt')

    # Create the mock input first, then the directory the outputs go into
    problem_file, problem = mock_input(working_dir)
    os.mkdir(output_dir)

    # Fit a classifier and persist it so that the tool can load it from disk
    fitted = SelectAndClassify(SelectKBest(k=5), GaussianNB(), name='test model').fit(problem)
    ClassificationModel(fitted, problem).write(model_file)

    # Invoke the tool on the training data itself, read back from its file
    cli_args = [model_file, problem_file, predictions_file, '--index_col', 'sample_id']
    predict.main(cli_args)

    expected = pd.DataFrame({
        'sample_id': problem.sample_ids,
        'score': fitted.apply(problem),
    })
    actual = pd.read_csv(predictions_file, sep='\t')

    # Only the 'score' column is compared between the two frames
    np.testing.assert_allclose(actual['score'].values, expected['score'].values)
def test_binary_report_with_score_vector():
    """Checks the binary score vector against the positive-outcome-only scores.

    For a two-class problem, column 1 of the result of ``apply(prob, False)``
    must be close to the result of the plain ``apply(prob)`` call.
    """
    labels = ['A', 'B']
    feature_cols = ['coord0', 'coord1']

    # Four mock groups, alternating between the two class labels
    rows = []
    for group_idx, label in enumerate(labels * 2):
        generated = mock_coords_data(rows,
                                     group_idx,
                                     label,
                                     data2=None,
                                     append_missed=False)
        rows = generated[0]

    frame = pd.DataFrame(rows, columns=feature_cols + ['class'])
    prob = Problem(frame, feature_cols, 'class', 'B')

    approach = SelectAndClassify(SelectKBest(k='all'),
                                 LogisticRegression(),
                                 name='test binary with score vector')
    classifier = approach.fit(prob)

    positive_scores = classifier.apply(prob)
    all_scores = classifier.apply(prob, False)
    scores_match = np.allclose(positive_scores, all_scores[:, 1])
    nose.tools.ok_(scores_match)
def test_feature_engineering():
    """Checks SelectAndClassify with and without a feature engineering step.

    With ``feature_engineering=PCA(n_components=2)`` the fitted model must
    contain a 'feature_engineering' step with fitted components, and 'PCA' must
    appear in the classifier's string form. Without it, neither may be present.
    """
    prob = mock_problem()

    # --- With a PCA feature engineering step ---
    with_pca = SelectAndClassify(SelectKBest(k='all'),
                                 LogisticRegression(),
                                 feature_engineering=PCA(n_components=2))
    pca_steps = dict(with_pca.fit(prob).model.steps)

    nose.tools.ok_('PCA' in str(with_pca))
    nose.tools.ok_('feature_engineering' in pca_steps)
    nose.tools.assert_is_not_none(pca_steps['feature_engineering'].components_)

    # apply() should yield one value per sample
    n_scores = len(with_pca.apply(prob))
    nose.tools.eq_(n_scores, prob.n_samples)

    # --- Without any feature engineering ---
    plain = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    plain_steps = dict(plain.fit(prob).model.steps)

    nose.tools.ok_('PCA' not in str(plain))
    nose.tools.ok_('feature_engineering' not in plain_steps)