Example No. 1
def test2():
    parameters = {'kernel': ['cat', ['rbf', 'poly']],
                  'd': ['int', [1, 3]],
                  'C': ['float', [1, 10]]}

    # Constant scoring function used as a stand-in estimator.
    def scoring_function(x):
        return 0.5

    search = GPSearchCV(parameters, estimator=scoring_function, n_iter=20)
    # `data` is assumed to be a dataset object loaded elsewhere (e.g. via load_iris()).
    search.fit(X=data.data, y=data.target)
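These examples all describe the search space with a dictionary mapping each parameter name to a [type, values] pair: 'cat' lists categorical choices, while 'int' and 'float' give [lower, upper] bounds. A minimal self-contained sketch of that convention, reusing the GPSearchCV interface shown above (the import path is an assumption and depends on where GPSearchCV lives in your project):

# from gp_search import GPSearchCV  # hypothetical import path, adjust to your install
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
X, y = iris.data, iris.target

# Each entry is [type, values]: 'cat' enumerates choices,
# 'int' and 'float' give [lower, upper] bounds.
parameters = {'n_estimators': ['int', [10, 50]],
              'max_features': ['float', [0.1, 1.0]],
              'criterion': ['cat', ['gini', 'entropy']]}

search = GPSearchCV(RandomForestClassifier(), parameters, n_iter=10)
search.fit(X, y)
print(search.best_params_)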
Example No. 2
def test_n_iter_smaller_n_iter():
    # Regression test for the failure that occurs when n_iter < n_init.
    iris = load_iris()
    X, y = iris.data, iris.target

    parameters = {"max_depth": ['int', [3, 3]],
                  "max_features": ['int', [1, 4]],
                  "min_samples_split": ['int', [1, 11]],
                  "min_samples_leaf": ['int', [1, 11]],
                  "bootstrap": ['cat', [True, False]],
                  "criterion": ['cat', ["gini", "entropy"]]}
    clf = RandomForestClassifier()
    grid_search = GPSearchCV(clf, parameters, n_iter=5, n_init=20)
    grid_search.fit(X, y)
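The test above documents a failure mode: the total budget n_iter is smaller than the number of purely random initialisation points n_init. A minimal sketch of the kind of guard a search loop could apply before drawing its initial points (hypothetical helper, not part of GPSearchCV):

def clip_n_init(n_iter, n_init):
    # Never draw more random initialisation points than the total budget allows.
    return min(n_init, n_iter)

assert clip_n_init(n_iter=5, n_init=20) == 5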
Example No. 3
def test1():
    digits = load_digits()
    X, y = digits.data, digits.target
    clf = RandomForestClassifier(n_estimators=20)

    # specify parameters and distributions to sample from
    parameters = {"max_depth": ['int', [3, 3]],
                  "max_features": ['int', [1, 11]],
                  "min_samples_split": ['int', [1, 11]],
                  "min_samples_leaf": ['int', [1, 11]],
                  "bootstrap": ['cat', [True, False]],
                  "criterion": ['cat', ["gini", "entropy"]]}

    search = GPSearchCV(clf, parameters, n_iter=20)
    search.fit(X, y)
    print(search)
    print(search.best_parameter_)
    print(search.best_estimator_)
Example No. 4
def test3():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # Uncomment the following to do the analysis on all the categories
    # categories = None

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    # define a pipeline combining a text feature extractor with a simple
    # classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': ['float', [1e-6, 1e-5]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']]
        # 'clf__n_iter': (10, 50, 80),
    }

    search = GPSearchCV(pipeline, parameters, n_iter=20)
    search.fit(X=data.data, y=data.target)
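Parameter names follow scikit-learn's step__parameter convention, so 'vect__max_df' tunes the max_df argument of the pipeline step named 'vect'. The tunable names can be listed directly from the pipeline (standard scikit-learn API):

# Every get_params() key containing '__' addresses a parameter of a named step,
# e.g. 'clf__alpha' or 'vect__ngram_range'.
print(sorted(k for k in pipeline.get_params() if '__' in k))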
Example No. 5
def test_grid_search_no_score():
    # Test grid-search on classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)
    gp_search = GPSearchCV(clf, {'C': Cs}, scoring='accuracy')
    gp_search.fit(X, y)

    grid_search_no_score = GPSearchCV(clf_no_score, {'C': Cs},
                                      scoring='accuracy')
    # smoketest grid search
    grid_search_no_score.fit(X, y)

    # check that best params are equal
    assert_equal(grid_search_no_score.best_params_, gp_search.best_params_)
    # check that we can call score and that it gives the correct result
    assert_equal(gp_search.score(X, y), grid_search_no_score.score(X, y))

    # giving no scoring function raises an error
    grid_search_no_score = GPSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]])
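The test relies on a LinearSVCNoScore helper that is not shown in the snippet. A plausible minimal version, following the pattern scikit-learn's own grid-search tests use for an estimator without a score method (a sketch; the real helper may differ):

from sklearn.svm import LinearSVC

class LinearSVCNoScore(LinearSVC):
    """A LinearSVC variant that hides the inherited score method."""
    @property
    def score(self):
        # Accessing .score fails, so the search has to fall back on the
        # scoring= argument passed to GPSearchCV.
        raise AttributeError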
Example No. 6
def test_gp_search():
    clf = MockClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['int', [1, 3]]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(gp_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(gp_search.scores_[i][0]
                    == {'foo_param': foo_i})
    # Smoke test the score etc:
    gp_search.score(X, y)
    gp_search.predict_proba(X)
    gp_search.decision_function(X)
    gp_search.transform(X)

    # Test exception handling on scoring
    gp_search.scoring = 'sklearn'
    assert_raises(ValueError, gp_search.fit, X, y)
Example No. 7
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {'gamma': ['float', [1e-4, 1e-3]],
                    'C': ['float', [1, 1000]]}

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GPSearchCV(SVC(C=1), tuned_parameters, cv=5, verbose=True)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
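The snippet ends before the report is actually printed. This code closely follows scikit-learn's classic grid-search example, whose continuation evaluates the refit model on the held-out half; a plausible completion (not part of the original snippet):

# (inside the `for score in scores:` loop)
from sklearn.metrics import classification_report

y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()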
Example No. 8
def test_gp_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockDiscreteClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['int', [1, 3]]})
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(gp_search.best_estimator_.foo_param, 2)

    clf = MockContinuousClassifier()
    gp_search = GPSearchCV(clf, {'foo_param': ['float', [-3, 3]]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    gp_search.fit(X, y)
    sys.stdout = old_stdout
    assert_almost_equal(gp_search.best_estimator_.foo_param, 0, decimal=1)

    # Smoke test the score etc:
    gp_search.score(X, y)
    gp_search.predict_proba(X)
    gp_search.decision_function(X)
    gp_search.transform(X)

    # Test exception handling on scoring
    gp_search.scoring = 'sklearn'
    assert_raises(ValueError, gp_search.fit, X, y)
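Examples No. 6 and No. 8 use mock estimators (MockClassifier, MockDiscreteClassifier, MockContinuousClassifier) defined elsewhere in the test module. A hypothetical sketch of such a mock, with its score peaked where the assertions above expect it (the real mocks may differ and also expose predict_proba, decision_function and transform):

from sklearn.base import BaseEstimator

class MockDiscreteClassifier(BaseEstimator):
    """Toy estimator whose cross-validation score depends only on foo_param."""
    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def score(self, X=None, y=None):
        # Tied best score for foo_param >= 2, so a search that breaks ties
        # towards the smallest value should settle on foo_param == 2.
        return 1.0 if self.foo_param >= 2 else 0.0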
Example No. 9
def test_no_refit():
    # Test that grid search can be used for model selection only
    clf = MockDiscreteClassifier()
    grid_search = GPSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "best_params_"))
Example No. 10
def gp_vs_random_search(test_name, n_tests, search_length, save_data=False):
    """
    Compare GP-based search vs a simple random one
    Choose test_name in {'iris', 'text'}
    """

    n_iter_search = search_length

    if test_name == 'iris':
        # Note: despite the name, this option loads the digits dataset.
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # specify parameters and distributions to sample from
        parameters = {"max_depth": ['int', [3, 3]],
                      "max_features": ['int', [1, 11]],
                      "min_samples_split": ['int', [1, 11]],
                      "min_samples_leaf": ['int', [1, 11]],
                      "bootstrap": ['cat', [True, False]],
                      "criterion": ['cat', ["gini", "entropy"]]}

    elif test_name == 'text':
        # Display progress logs on stdout
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

        # Load some categories from the training set
        categories = [
            'alt.atheism',
            'talk.religion.misc',
        ]
        # Uncomment the following to do the analysis on all the categories
        # categories = None
        print("Loading 20 newsgroups dataset for categories:")
        print(categories)

        data = fetch_20newsgroups(subset='train', categories=categories)
        print("%d documents" % len(data.filenames))
        print("%d categories" % len(data.target_names))

        X = data.data
        y = data.target

        # define a pipeline combining a text feature extractor with a simple
        # classifier
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
        ])

        # uncommenting more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': ['float', [0.5, 1.]],
            # 'vect__max_features': (None, 5000, 10000, 50000),
            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
            # 'tfidf__use_idf': (True, False),
            # 'tfidf__norm': ('l1', 'l2'),
            'clf__alpha': ['float', [1e-6, 1e-5]],
            'clf__penalty': ['cat', ['l2', 'elasticnet']]
            # 'clf__n_iter': (10, 50, 80),
        }

    else:
        print('Dataset not available for test')
        return

    # GP UCB search
    all_gp_ucb_results = []
    print('GP_ucb search')
    for i in range(n_tests):
        ucb_search = GPSearchCV(pipeline, parameters,
                                acquisition_function='UCB',
                                n_iter=n_iter_search, n_init=20, verbose=False)
        _, scores = ucb_search.fit(X=X, y=y)

        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')

        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j-1], scores[j]))
        all_gp_ucb_results.append(extend_result(n_iter_search, max_scores))
    all_gp_ucb_results = np.asarray(all_gp_ucb_results)
    print(all_gp_ucb_results.shape)
    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # # GP EI search
    # all_gp_ei_results = []
    # print('GP_ei search')
    # for i in range(n_tests):
    #   ei_search = GPSearchCV(parameters,estimator=pipeline,
    #                       acquisition_function='EI',
    #                       n_iter=n_iter_search, n_init=20, verbose=False)
    #   _,scores = ei_search.fit(X=data.data, y=data.target)

    #   max_scores = [scores[0]]
    #   print('Test',i,'-',len(scores),'parameters tested')

    #   for j in range(1,len(scores)):
    #       max_scores.append(max(max_scores[j-1],scores[j]))
    #   all_gp_ei_results.append(extend_result(n_iter_search,max_scores))
    # all_gp_ei_results = np.asarray(all_gp_ei_results)
    # print(all_gp_ei_results.shape)
    # if(save_data):
    #   np.savetxt('gp_ei_scores.csv',all_gp_ei_results,delimiter=',')

    # Randomized search
    print('Random search')
    all_random_results = []
    for i in range(n_tests):
        random_search = GPSearchCV(parameters, estimator=pipeline,
                                   n_iter=n_iter_search, n_init=n_iter_search,
                                   verbose=False)
        _, scores = random_search.fit(X=X, y=y)

        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')

        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j-1], scores[j]))
        all_random_results.append(extend_result(n_iter_search, max_scores))
    all_random_results = np.asarray(all_random_results)
    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    plt.figure()
    # plt.plot(range(n_iter_search),np.mean(all_gp_ei_results,axis=0),'r',label='GP-EI')
    plt.plot(range(n_iter_search), np.mean(all_gp_ucb_results, axis=0), 'b', label='GP-UCB')
    plt.plot(range(n_iter_search), np.mean(all_random_results, axis=0), 'g', label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on ' + str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()
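The benchmark calls an extend_result helper that is not shown. Judging from its use (padding each trial's running-maximum curve out to n_iter_search points so the trials can be averaged), a plausible implementation is the following (an assumption; the original helper may differ):

def extend_result(n_iter, max_scores):
    # Pad the running-max curve with its last value so every trial
    # contributes a vector of exactly n_iter points.
    padded = list(max_scores[:n_iter])
    while len(padded) < n_iter:
        padded.append(padded[-1])
    return padded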