Example No. 1
def test_predict_tool(working_dir):
    """Tests that the predict.py command line tool works as expected"""
    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')
    predictions_path = os.path.join(out_dir, 'predictions.txt')

    # Mock up some input data
    prob_path, prob = mock_input(working_dir)
    os.mkdir(out_dir)

    # Train a model and save it to a file
    classifier = SelectAndClassify(SelectKBest(k=5),
                                   GaussianNB(),
                                   name='test model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    # Run the predict tool on the saved model, with the training data loaded back from a file,
    # and validate that the predictions it writes match the classifier's in-memory output
    predict.main(
        [model_path, prob_path, predictions_path, '--index_col', 'sample_id'])

    expected_predictions = pd.DataFrame({
        'sample_id': prob.sample_ids,
        'score': classifier.apply(prob)
    })
    actual_predictions = pd.read_csv(predictions_path, sep='\t')

    np.testing.assert_allclose(actual_predictions['score'].values,
                               expected_predictions['score'].values)
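The mock_input helper used above is not part of this listing. A minimal sketch of what it might look like, assuming the same module-level imports (os, pandas) as the surrounding examples and that mock_problem() returns a Problem exposing dataframe and sample_ids as seen in the later examples; the helper body and file name here are assumptions, not the original implementation:

def mock_input(working_dir):
    """Hypothetical reconstruction: write a mock problem to a tab-separated file and
    return both the file path and the in-memory Problem."""
    prob = mock_problem()                                # assumed helper, as in the later examples
    prob_path = os.path.join(working_dir, 'input.txt')   # file name is illustrative
    df = prob.dataframe.copy()
    df['sample_id'] = prob.sample_ids                    # predict.main is invoked with --index_col sample_id
    df.to_csv(prob_path, sep='\t', index=False)
    return prob_path, prob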
Example No. 2
def run_learner(options):
    """Runs the learner tool with the given options"""
    prob = options.make_problem()
    results = {}
    for approach in options.make_learning_approaches():
        cv_gen = CVSplitGenerator(prob,
                                  n_folds=options.cv_k,
                                  n_repartitions=options.cv_repartitions)
        results[approach] = CrossValidatedAnalysis(prob,
                                                   approach,
                                                   cv_generator=cv_gen).run()
        report = ClassificationReport(renderer=ReportRenderer(
            os.path.join(options.output_dir, str(approach))))
        report.generate(results[approach])

        if options.save_models:
            # Retrain the approach on the full dataset
            trained_approach = approach.fit(prob)
            model = ClassificationModel(trained_approach, prob)
            model.write(
                os.path.join(options.output_dir, str(approach), 'model.txt'))

    comparative_report = ComparativeClassificationReport(
        renderer=ReportRenderer(options.output_dir))
    comparative_report.generate(results)
    return results
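run_learner only touches a handful of attributes and factory methods on the options object. A minimal sketch of that interface, under the assumption that options is an ordinary object; the class name and constructor arguments below are illustrative, not the tool's actual options parser:

class LearnerOptions:
    """Hypothetical options object exposing only what run_learner actually uses."""

    def __init__(self, prob, approaches, output_dir,
                 cv_k=10, cv_repartitions=10, save_models=False):
        self._prob = prob
        self._approaches = approaches
        self.output_dir = output_dir
        self.cv_k = cv_k
        self.cv_repartitions = cv_repartitions
        self.save_models = save_models

    def make_problem(self):
        # Problem instance used for cross-validation and final model training
        return self._prob

    def make_learning_approaches(self):
        # Iterable of approaches (e.g. SelectAndClassify instances) to compare
        return self._approaches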
def test_multiclass(working_dir):
    """ Tests machine learning classification workfloor with multiclass for iris dataset
        see http://scikit-learn.org/stable/modules/multiclass.html """

    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')

    iris = datasets.load_iris()

    df = iris_to_df(iris)

    features = [feat for feat in df.columns if feat not in ['Target']]

    prob = Problem(df, features, "Target", positive_outcome=None)
    rnd = np.random.RandomState(2016)
    approach = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                 RandomForestClassifier(random_state=rnd))

    learn_params = LearningParameters(
        metrics={
            'auc': roc_auc_score,
            'accuracy': accuracy_from_confusion_matrix
        },
        treat_as_binary=False)
    cvg = CVSplitGenerator(prob,
                           n_folds=10,
                           n_repartitions=10,
                           random_state=rnd)

    cv = CrossValidatedAnalysis(prob,
                                approach,
                                cv_generator=cvg,
                                runner=SerialRunner(),
                                params=learn_params)

    results = cv.run()
    renderer = ReportRenderer(out_dir)
    ClassificationReport(renderer, False, prob.label_list).generate(results)
    nose.tools.ok_(
        os.path.exists(os.path.join(out_dir, 'sample_confusion_matrix.txt')))
    average_accuracy = compute_average_accuracy(results)
    nose.tools.assert_almost_equal(0.95, average_accuracy, delta=0.01)

    classifier = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                   RandomForestClassifier(random_state=2016),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    read_model = ClassificationModel.read(model_path)

    auc_average = read_model.training_auc
    nose.tools.assert_almost_equal(1.0, auc_average, delta=1e-6)
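The iris_to_df helper referenced above is not included in this listing. A plausible sketch, assuming it simply wraps the scikit-learn bunch in a DataFrame with the class names in a 'Target' column (the body below is an assumption, not the original helper), which is consistent with how the test derives its feature list and outcome column:

def iris_to_df(iris):
    # Hypothetical reconstruction: feature matrix as named columns, class names in 'Target'.
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['Target'] = [iris.target_names[label] for label in iris.target]
    return df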
def mock_model():
    """Creates a simple mock model for testing"""
    prob = mock_problem()
    logit = SelectAndClassify(selector=None,
                              classifier=LogisticRegression(),
                              preprocess=ProblemVectorizer(),
                              name="test model").fit(prob)

    return ClassificationModel(logit, prob)
Example No. 5
def test_model_validation(working_dir):
    """Validates that we fail if a model has been corrupted or otherwise produces bad output"""
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem()
    approach = SelectAndClassify(SelectKBest(k=7),
                                 LogisticRegression()).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    # Change an expected score for a sample -- this should cause model loading to fail because actual
    # classifier output will no longer match the expected output
    with open(model_path, 'r') as f:
        model_string = f.read()
        nose.tools.ok_(str(model.expected_scores[17]) in model_string)
        bad_model_string = model_string.replace(
            str(model.expected_scores[17]),
            str(model.expected_scores[17] + 0.5))

    with open(model_path, 'w') as f:
        f.write(bad_model_string)

    nose.tools.assert_raises(ValueError,
                             lambda: ClassificationModel.read(model_path))
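The ValueError comes from the model's self-validation on load. A rough sketch of the kind of check ClassificationModel.read is expected to perform, using only attributes that appear elsewhere in these examples (expected_scores, approach.apply); this illustrates the idea and is not the library's actual code:

def _check_expected_scores(model, prob):
    # Recompute scores with the deserialized approach and compare them against the
    # scores stored in the model file; a mismatch indicates a corrupted or stale model.
    actual_scores = model.approach.apply(prob)
    if not np.allclose(actual_scores, model.expected_scores):
        raise ValueError('Stored expected scores do not match the classifier output')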
def test_multiclass_auc():
    """ Tests auc value for multiclass problem"""
    data = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, _ = mock_coords_data(data, index_class,
                                   class_values[index_class], None, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    classifier = SelectAndClassify(SelectKBest(k='all'),
                                   LogisticRegression(),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    auc_average = model.training_auc
    nose.tools.assert_almost_equal(0.853333333, auc_average, delta=1e-6)

    prob_binary = Problem(df, ['coord0', 'coord1'], 'class', 'A')
    classifier_binary = SelectAndClassify(SelectKBest(k='all'),
                                          LogisticRegression(),
                                          name='binary model').fit(prob_binary)
    model_binary = ClassificationModel(classifier_binary, prob_binary)
    auc_binary = model_binary.training_auc
    nose.tools.assert_almost_equal(auc_binary, auc_average, delta=1e-6)
Example No. 7
def main(args=None):
    """The main method"""
    global model  # pylint: disable=global-statement

    parser = ArgumentParser()
    parser.add_argument('model', help="Model to use")
    parser.add_argument('--port',
                        type=int,
                        default=5100,
                        help="Port on which to run the service")
    parser.add_argument('--debug',
                        action='store_true',
                        help="Whether to run in debug mode")
    parsed_args = parser.parse_args(args)

    model = ClassificationModel.read(parsed_args.model)
    app.run(port=parsed_args.port, debug=parsed_args.debug)
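Since main takes an explicit argument list, the service can also be started from another Python entry point rather than the command line; the model path below is illustrative:

# Hypothetical invocation; equivalent to running the script with the same CLI arguments.
# Note that app.run() blocks until the service is stopped.
main(['learn_output/model.txt', '--port', '5100', '--debug'])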
Example No. 8
    def checkme(working_dir, n_samples, n_features, k, make_classifier,
                test_vectorize):
        """Utility"""
        assert n_samples % 4 == 0
        model_path = os.path.join(working_dir, 'model.txt')
        prob = mock_problem(n_samples=n_samples, n_features=n_features)
        if test_vectorize:
            df = prob.dataframe
            df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
            df['discrete_2'] = ['foo', 'bar', 'baz',
                                float('nan')] * int(n_samples / 4)
            df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(
                n_samples / 4)
            prob = Problem(
                df, prob.features +
                ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                prob.outcome_column, prob.positive_outcome)
            preprocess = ProblemVectorizer()
        else:
            preprocess = None

        approach = SelectAndClassify(SelectKBest(k=k),
                                     make_classifier(),
                                     preprocess=preprocess).fit(prob)
        model = ClassificationModel(approach, prob)

        model.write(model_path)
        reconstituted_model = ClassificationModel.read(model_path)

        model.validate()
        reconstituted_model.validate()

        np.testing.assert_array_equal(model.approach.apply(prob),
                                      reconstituted_model.approach.apply(prob))

        if preprocess is not None:
            approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
            approach_with_pipeline = SelectAndClassify(
                SelectKBest(k=k),
                make_classifier(),
                preprocess=approach_pipeline).fit(prob)
            # test approach serialization with Pipeline from learners.py
            model_with_pipeline = ClassificationModel(approach_with_pipeline,
                                                      prob)
            model_path2 = os.path.join(working_dir, 'model2.txt')
            model_with_pipeline.write(model_path2)
            reconstituted_model2 = ClassificationModel.read(model_path2)
            reconstituted_model2.validate()
            np.testing.assert_array_almost_equal(
                model.approach.apply(prob),
                reconstituted_model2.approach.apply(prob), 14)
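checkme is a nested helper, so the enclosing test presumably calls it with concrete parameters. A hypothetical call at the enclosing nesting level (the values are illustrative; n_samples must be divisible by 4 because of the mocked discrete and missing-value columns built above):

    checkme(working_dir, n_samples=100, n_features=20, k=5,
            make_classifier=LogisticRegression, test_vectorize=True)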