示例#1
0
    def checkme(make_classifier):
        """
        Fit a classifier on a mock problem and compare it to its round-tripped copy.

        :param make_classifier: zero-argument callable returning an unfitted
            classifier exposing ``fit`` and ``predict_proba``
        :return: None

        """
        problem = mock_problem()
        classifier = make_classifier()
        fitted = classifier.fit(problem.X, problem.y)

        restored = serializer_roundtrip(Serializer(), fitted)

        # The restored classifier must reproduce the probabilities exactly.
        expected_proba = fitted.predict_proba(problem.X)
        restored_proba = restored.predict_proba(problem.X)
        np.testing.assert_array_equal(expected_proba, restored_proba)
示例#2
0
    def checkme(n_features, k, make_classifier):
        """
        Fit a select-then-classify Pipeline and compare it to its round-tripped copy.

        :param n_features: number of features requested from ``mock_problem``
        :param k: value passed to ``SelectKBest`` as ``k``
        :param make_classifier: zero-argument callable returning an unfitted
            classifier used as the final pipeline step
        :return: None

        """
        problem = mock_problem(n_features=n_features)

        steps = [
            ('select', SelectKBest(k=k)),
            ('classify', make_classifier()),
        ]
        fitted_pipeline = Pipeline(steps).fit(problem.X, problem.y)
        restored_pipeline = serializer_roundtrip(Serializer(), fitted_pipeline)

        # Probabilities from the restored pipeline must match exactly.
        expected_proba = fitted_pipeline.predict_proba(problem.X)
        restored_proba = restored_pipeline.predict_proba(problem.X)
        np.testing.assert_array_equal(expected_proba, restored_proba)
示例#3
0
    def checkme(params):
        """
        Check that a fitted StandardScaler transforms identically after a round trip.

        :param params: StandardScaler constructor params as a dictionary
        :return: None

        """
        problem = mock_problem(n_samples=100, n_features=4)

        unfitted = StandardScaler(**params)
        original = unfitted.fit(problem.X)
        restored = serializer_roundtrip(Serializer(), original)

        # Compared with a tolerance (assert_allclose), restored output first.
        restored_output = restored.transform(problem.X)
        original_output = original.transform(problem.X)
        np.testing.assert_allclose(restored_output, original_output)
示例#4
0
    def checkme(params):
        """
        Check that a fitted PCA transforms identically after a round trip.

        :param params: PCA constructor params as a dictionary
        :return: None

        """
        problem = mock_problem(n_samples=100, n_features=4)

        unfitted = PCA(**params)
        original = unfitted.fit(problem.X)
        restored = serializer_roundtrip(Serializer(), original)

        # Compared with a tolerance (assert_allclose), restored output first.
        restored_output = restored.transform(problem.X)
        original_output = original.transform(problem.X)
        np.testing.assert_allclose(restored_output, original_output)
示例#5
0
def mock_input(working_dir):
    """
    Write a mock problem to ``problem.txt`` inside the given directory.

    The problem's dataframe is saved tab-separated, with its index written
    under the ``sample_id`` label.

    :param working_dir: directory in which the problem file is created
    :return: tuple of (path to the problem file, the Problem instance)

    """
    problem_file = os.path.join(working_dir, 'problem.txt')
    problem = mock_problem(n_samples=100, n_features=10)

    csv_options = {
        'sep': '\t',
        'index': True,
        'index_label': 'sample_id',
    }
    problem.dataframe.to_csv(problem_file, **csv_options)

    return problem_file, problem
示例#6
0
    def checkme(working_dir, n_samples, n_features, k, make_classifier,
                test_vectorize):
        """
        Write a ClassificationModel to disk, read it back and compare outputs.

        When ``test_vectorize`` is true, the mock problem is extended with two
        discrete columns and one continuous column containing NaNs, a
        ProblemVectorizer is used as the preprocessing step, and a second
        model wrapping that step in an ApproachPipeline is round-tripped too.

        :param working_dir: directory in which model files are written
        :param n_samples: number of samples in the mock problem; must be
            divisible by 4
        :param n_features: number of features in the mock problem
        :param k: value passed to ``SelectKBest`` as ``k``
        :param make_classifier: zero-argument callable returning a classifier
        :param test_vectorize: whether to exercise the vectorizing preprocess
        :return: None

        """
        assert n_samples % 4 == 0
        first_model_path = os.path.join(working_dir, 'model.txt')
        problem = mock_problem(n_samples=n_samples, n_features=n_features)

        preprocess = None
        if test_vectorize:
            frame = problem.dataframe
            half = int(n_samples / 2)
            quarter = int(n_samples / 4)

            # Repeat short patterns so every new column has n_samples entries.
            frame['discrete_1'] = ['foo', 'bar'] * half
            frame['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * quarter
            frame['continuous_with_missing'] = [0, 1, 2,
                                                float('nan')] * quarter

            added_columns = [
                'discrete_1', 'discrete_2', 'continuous_with_missing'
            ]
            problem = Problem(frame, problem.features + added_columns,
                              problem.outcome_column,
                              problem.positive_outcome)
            preprocess = ProblemVectorizer()

        unfitted_approach = SelectAndClassify(SelectKBest(k=k),
                                              make_classifier(),
                                              preprocess=preprocess)
        approach = unfitted_approach.fit(problem)
        model = ClassificationModel(approach, problem)

        model.write(first_model_path)
        restored_model = ClassificationModel.read(first_model_path)

        model.validate()
        restored_model.validate()

        np.testing.assert_array_equal(model.approach.apply(problem),
                                      restored_model.approach.apply(problem))

        if preprocess is None:
            return

        # test approach serialization with Pipeline from learners.py
        wrapped_preprocess = ApproachPipeline([('preprocess', preprocess)])
        unfitted_wrapped = SelectAndClassify(SelectKBest(k=k),
                                             make_classifier(),
                                             preprocess=wrapped_preprocess)
        wrapped_approach = unfitted_wrapped.fit(problem)
        wrapped_model = ClassificationModel(wrapped_approach, problem)

        second_model_path = os.path.join(working_dir, 'model2.txt')
        wrapped_model.write(second_model_path)
        restored_wrapped_model = ClassificationModel.read(second_model_path)
        restored_wrapped_model.validate()

        # Compared to 14 decimal places rather than exactly.
        np.testing.assert_array_almost_equal(
            model.approach.apply(problem),
            restored_wrapped_model.approach.apply(problem),
            decimal=14)
示例#7
0
    def checkme(params):
        """
        Check that a fitted DecisionTreeClassifier predicts identically after a round trip.

        :param params: DecisionTreeClassifier constructor params as a dictionary
        :return: None

        """
        problem = mock_problem(n_samples=100, n_features=18)

        unfitted = DecisionTreeClassifier(**params)
        original = unfitted.fit(problem.X, problem.y)

        serializer = Serializer(debug_deserialize=False)
        restored = serializer_roundtrip(serializer, original)

        # Probabilities must match exactly, restored output first.
        restored_proba = restored.predict_proba(problem.X)
        original_proba = original.predict_proba(problem.X)
        np.testing.assert_array_equal(restored_proba, original_proba)
示例#8
0
    def checkme(params):
        """
        Check that a fitted RandomForestClassifier predicts identically after a round trip.

        :param params: RandomForestClassifier constructor params as a dictionary
        :return: None

        """
        problem = mock_problem(n_samples=100, n_features=4)

        original = RandomForestClassifier(**params)
        original.fit(problem.X, problem.y)
        restored = serializer_roundtrip(Serializer(), original)

        # Check hard predictions first, then class probabilities.
        for method_name in ('predict', 'predict_proba'):
            restored_output = getattr(restored, method_name)(problem.X)
            original_output = getattr(original, method_name)(problem.X)
            np.testing.assert_array_equal(restored_output, original_output)
示例#9
0
    def checkme(params, probability):
        """
        Check that a fitted SVC behaves identically after a round trip.

        :param params: SVC constructor params as a dictionary
        :param probability: whether to test the output of predict_proba
        :return: None

        """
        problem = mock_problem(n_samples=100, n_features=3)

        unfitted = SVC(**params)
        original = unfitted.fit(problem.X, problem.y)
        restored = serializer_roundtrip(Serializer(), original)

        restored_decision = restored.decision_function(problem.X)
        original_decision = original.decision_function(problem.X)
        np.testing.assert_array_equal(restored_decision, original_decision)

        if not probability:
            return

        restored_proba = restored.predict_proba(problem.X)
        original_proba = original.predict_proba(problem.X)
        np.testing.assert_array_equal(restored_proba, original_proba)
示例#10
0
def test_model_validation(working_dir):
    """
    Validates that we fail if a model has been corrupted or otherwise produces bad output.

    A model is written to ``model.txt`` in the working directory, one expected
    score in the file is altered, and reading the model back is expected to
    raise ``ValueError``.

    :param working_dir: directory in which the model file is written
    :return: None

    """
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem()
    approach = SelectAndClassify(SelectKBest(k=7),
                                 LogisticRegression()).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    # Read the file verbatim. readlines() keeps each line terminator, so
    # joining its result with '\n' would double every newline and corrupt the
    # file beyond the single score change this test intends to make.
    with open(model_path, 'r') as f:
        model_string = f.read()

    # Change an expected score for a sample -- this should cause model loading to fail because actual
    # classifier output will no longer match the expected output
    original_score = str(model.expected_scores[17])
    corrupted_score = str(model.expected_scores[17] + 0.5)
    nose.tools.ok_(original_score in model_string)
    bad_model_string = model_string.replace(original_score, corrupted_score)

    with open(model_path, 'w') as f:
        f.write(bad_model_string)

    nose.tools.assert_raises(ValueError, ClassificationModel.read, model_path)