Example No. 1
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2), 'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset the bigram representation, which is used in the last
    # of the grid_search candidates, is considered the best estimator since
    # they all converge to 100% accuracy models
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
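The examples on this page target the early scikits.learn API (best_score / best_estimator without trailing underscores, the analyzer__max_n parameter, 'l1'/'l2' loss names). For comparison, here is a minimal sketch of the same pipeline grid search written against the current scikit-learn API; the toy corpus, the ngram_range / hinge loss parameter names, and the underscore-suffixed attributes are assumptions about the modern library, not part of the original example.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# tiny stand-in corpus for JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
docs = ["the pizza burger beer", "the salad celery broccoli",
        "the pizza pizza beer", "the salad water apple",
        "the coke burger coke", "the salad broccoli pasta"]
labels = [-1, 1, -1, 1, -1, 1]

pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],    # unigrams vs. unigrams + bigrams
    'svc__loss': ['hinge', 'squared_hinge'],  # replaces the old 'l1'/'l2' names
}
grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=1)
grid_search.fit(docs, labels)
print(grid_search.best_score_)
print(grid_search.best_estimator_.named_steps['vect'].ngram_range)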
Example No. 2
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert cross_validation.grid_scores_[i][0] == {'foo_param': foo_i}
Example No. 3
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    # make sure it selects the smallest parameter in case of ties
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert cross_validation.grid_scores_[i][0] == {'foo_param': foo_i}
def do_grid_search(X, Y, gs_params):
    """ Given data (X, Y), perform a grid search over gs_params
        for a LogisticRegression pipeline step called logreg
        """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
def do_grid_search(X, Y, gs_params):
    """ Given data (X, Y), perform a grid search over gs_params
        for a LogisticRegression pipeline step called logreg
        """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
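Both helpers read the winning entry out of gs.grid_scores_, which the old API exposed as (parameters, score, ...) tuples. Under the current scikit-learn API the same information lives in best_params_, best_score_ and the cv_results_ dict. A hedged sketch of the equivalent lookup; the stand-in data and the logreg__C grid are illustrative assumptions, not taken from the original code.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# stand-in for the fitted search object built in do_grid_search
X, Y = make_classification(n_samples=100, n_features=10, random_state=0)
lrpipe = Pipeline([('logreg', LogisticRegression())])
gs = GridSearchCV(lrpipe, {'logreg__C': [0.1, 1.0, 10.0]}, n_jobs=-1).fit(X, Y)

# equivalent of: best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
best_parameters = gs.best_params_   # winning parameter setting as a dict
score = gs.best_score_              # its mean cross-validated score

# the full per-candidate grid is still available if needed
best_idx = int(np.argmax(gs.cv_results_['mean_test_score']))
assert gs.cv_results_['params'][best_idx] == best_parameters
print(best_parameters, score)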
Example No. 6
def test_grid_search_error():
    """Test that grid search raises an error when X and y have
    different lengths"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_[:180], y_)
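The same contract holds in current scikit-learn: fitting on X and y of mismatched lengths raises ValueError. A small pytest-style sketch of the check; the make_classification call and the pytest usage are assumptions, not part of the original test.

import pytest
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

def test_grid_search_length_mismatch():
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)
    cv = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    with pytest.raises(ValueError):
        cv.fit(X[:180], y)   # 180 samples vs. 200 labels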
Example No. 7
def train_svm_crossvalidated(X, y,
                             tuned_parameters={'kernel': ['rbf'],
                                               'gamma': 2.0**np.arange(-15, 3),
                                               'C': 2.0**np.arange(-5, 15)}):
    """
    Performs grid search with stratified K-fold cross-validation on observations X with
    true labels y and returns an optimal SVM, clf
    """

    k_fold = _size_dependent_k_split(np.size(X, 0))

    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=recall_score)
    clf.fit(X, y, cv=StratifiedKFold(y, k_fold))

    y_true, y_pred = y, clf.predict(X)

    #print "Classification report for the best estimator: "
    #print clf.best_estimator
    print "Tuned with optimal value: %0.3f" % recall_score(y_true, y_pred)
    
    return clf
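score_func=recall_score and passing cv to fit are both scikits.learn-era idioms; in current scikit-learn the metric is named via scoring= and the splitter via cv= in the constructor. A hedged sketch of the same recall-tuned RBF search as a function (assumes binary labels, since 'recall' scoring is binary by default; the modern attribute and keyword names are assumptions relative to the code above).

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import recall_score

def train_svm_crossvalidated_modern(X, y, n_splits=5):
    """Recall-tuned RBF grid search; sketch of the function above on the current API."""
    tuned_parameters = {'kernel': ['rbf'],
                        'gamma': 2.0 ** np.arange(-15, 3),
                        'C': 2.0 ** np.arange(-5, 15)}
    clf = GridSearchCV(SVC(), tuned_parameters, scoring='recall',
                       cv=StratifiedKFold(n_splits=n_splits))
    clf.fit(X, y)
    print("Tuned with optimal value: %0.3f" % recall_score(y, clf.predict(X)))
    return clf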
    def ParameterGridSearch(self, callback=None, nValidation=5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.
    
        As this process is time consuming and parallelizable, a number of
        threads equal to the number of cores in the computer is used for the
        calculations
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold
        #
        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #
        n_workers = 1
        #try:
        #from multiprocessing import cpu_count
        #n_workers = cpu_count()
        #except:
        #n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {
            'C': 2**np.arange(-5, 11, 2, dtype=float),
            'gamma': 2**np.arange(3, -11, -2, dtype=float)
        }
        clf = GridSearchCV(SVC(kernel='rbf'),
                           parameters,
                           n_jobs=n_workers,
                           score_func=precision_score)
        clf.fit(self.svm_train_values,
                self.svm_train_labels,
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma
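The ParameterGridSearch method above is written against scikits.learn (GridSearchCV in scikits.learn.grid_search, score_func=precision_score, cv passed to fit). A minimal sketch of the same log-spaced C/gamma search with the current package layout; the stand-in data, the scoring='precision' spelling, and best_params_/best_score_ are assumptions about the modern library.

import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=200, n_features=20, random_state=0)  # stand-in data

parameters = {'C': 2.0 ** np.arange(-5, 11, 2),
              'gamma': 2.0 ** np.arange(3, -11, -2)}
clf = GridSearchCV(SVC(kernel='rbf'), parameters,
                   scoring='precision', n_jobs=1,
                   cv=StratifiedKFold(n_splits=5))
clf.fit(X, y)
print('Optimal values: C=%s g=%s rate=%s'
      % (clf.best_params_['C'], clf.best_params_['gamma'], clf.best_score_))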
Example No. 9
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
def do_grid_search(X, Y, gs_params=None):
    """ Given data (X, Y), perform a grid search over gs_params
        for an SVC pipeline step called rbfsvm
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
Example No. 11
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.fit(X_[:180], y_[:180], refit=False)
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
def do_grid_search(X, Y, gs_params=None):
    """ Given data (X, Y), perform a grid search over gs_params
        for an SVC pipeline step called rbfsvm
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
    def ParameterGridSearch(self, callback = None, nValidation = 5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.
    
        As this process is time consuming and parallelizable, a number of
        threads equal to the number of cores in the computer is used for the
        calculations
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold
        # 
        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #
        n_workers = 1
        #try:
            #from multiprocessing import cpu_count
            #n_workers = cpu_count()
        #except:
            #n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {'C': 2**np.arange(-5, 11, 2, dtype=float),
                      'gamma': 2**np.arange(3, -11, -2, dtype=float)}
        clf = GridSearchCV(SVC(kernel='rbf'), parameters,
                           n_jobs=n_workers, score_func=precision_score)
        clf.fit(self.svm_train_values, self.svm_train_labels, 
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma
Example No. 14
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', DenseLinearSVC())])

    parameters = {
        'vect__analyzer': (WordNGramAnalyzer(min_n=1, max_n=1),
                           WordNGramAnalyzer(min_n=1, max_n=2)),
        'svc__loss': ('l1', 'l2')
    }


    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset the bigram representation, which is used in the last
    # of the grid_search candidates, is considered the best estimator since
    # they all converge to 100% accuracy models
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 2)
Example No. 15
def test_grid_search_sparse_score_func():
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
Example No. 16
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
Example No. 17
def test_grid_search_sparse_score_func():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.set_params(refit=False).fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator.C

    X_ = sp.csr_matrix(X_)
    clf = SparseLinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    cv.set_params(refit=False).fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
moto_vq_train, plane_vq_train = [np.load(file)
                                 for file
                                 in ['moto_vq_train.npy', 'plane_vq_train.npy']]
labels = [0] * moto_vq_train.shape[0] + [1] * plane_vq_train.shape[0]


###############################################################################
# Train SVM

param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }

clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})

#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')

clf.fit(np.vstack([moto_vq_train, plane_vq_train]),
        np.array(labels))

print "Best estimator found by grid search:"
#print clf.best_estimator

###############################################################################
# Evaluation 

moto_vq_eval, plane_vq_eval  = [np.load(file) 
                                for file 
Example No. 19
# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"
    
# Classification settings
pipeline = Pipeline([
    ('extr', InfinitivesExtractor()),
    ('svc', LinearSVC(multi_class=True))
])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
print clf
yp = clf.predict(X)
print classification_report(y, yp, targets, target_names)

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
Example No. 20
parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    #    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #    'clf__n_iter': (10, 50, 80),
}

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)

# cross-validation doesn't work if the length of the data is not known,
# hence use lists instead of iterators
text_docs = [file(f).read() for f in data.filenames]

print "Performing grid search..."
print "pipeline:", [name for name, _ in pipeline.steps]
print "parameters:"
pprint(parameters)
t0 = time()
grid_search.fit(text_docs, data.target)
print "done in %0.3fs" % (time() - t0)
print

print "Best score: %0.3f" % grid_search.best_score
Example No. 21
# split the dataset into two equal parts, respecting label proportions
train, test = iter(StratifiedKFold(y, 2)).next()

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
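Under the current API the score_func keyword is gone and the loop maps directly onto the scoring= parameter. A hedged sketch of the same tune-for-precision / tune-for-recall loop; the digits dataset is used purely as stand-in data (the original's X and y are not shown), and macro averaging is an assumption made so the metrics work on a multiclass stand-in.

from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    stratify=y, random_state=0)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# macro averaging stands in for the old multiclass behaviour of precision/recall_score
for score_name in ['precision_macro', 'recall_macro']:
    clf = GridSearchCV(SVC(), tuned_parameters, scoring=score_name,
                       cv=StratifiedKFold(n_splits=5))
    clf.fit(X_train, y_train)
    print("Tuned for %r; best CV score: %0.3f" % (score_name, clf.best_score_))
    print(classification_report(y_test, clf.predict(X_test)))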
Example No. 22
def test_GridSearch():
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param':[1, 2, 3]})
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)
print "Extracting the top %d eigenfaces" % n_components
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

eigenfaces = pca.components_.T.reshape((n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)


# Train a SVM classification model

print "Fitting the classifier to the training set"
param_grid = {"C": [1, 5, 10, 100], "gamma": [0.0001, 0.001, 0.01, 0.1]}
clf = GridSearchCV(SVC(kernel="rbf"), param_grid,
                   fit_params={"class_weight": "auto"}, n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator


# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target, target_names=target_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


# Qualitative evaluation of the predictions using matplotlib
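fit_params={'class_weight': 'auto'} relied on the old SVC.fit accepting class_weight at fit time; newer releases take it in the constructor and spell the heuristic 'balanced'. A hedged sketch of the eigenface-style grid search with that change; the variable names X_train_pca and y_train follow the example above and are assumptions here, so the fit call is left commented.

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 100],
              'gamma': [0.0001, 0.001, 0.01, 0.1]}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, n_jobs=-1)
# clf = clf.fit(X_train_pca, y_train)   # as in the example above
# print(clf.best_estimator_)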
Example No. 24
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings
pipeline = Pipeline([('extr', InfinitivesExtractor()),
                     ('svc', LinearSVC(multi_class=True))])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
print clf
yp = clf.predict(X)
print classification_report(y, yp, targets, target_names)

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = {'anova__percentile': [5, 10, 20]}
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
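The coefficient recovery above indexes pipeline steps through best_estimator.steps. With the current API the same thing reads more clearly through named_steps and the underscore-suffixed attributes. A sketch for the ANOVA branch; the make_regression stand-in data and the 2-D reshape before inverse_transform are assumptions, not part of the original script.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

X, y = make_regression(n_samples=100, n_features=100, random_state=0)  # stand-in data

clf = Pipeline([('anova', SelectPercentile(f_regression)),
                ('ridge', BayesianRidge())])
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=KFold(n_splits=2))
clf.fit(X, y)

# map the selected-feature coefficients back onto the original feature space
coef_ = clf.best_estimator_.named_steps['ridge'].coef_
coef_ = clf.best_estimator_.named_steps['anova'].inverse_transform(coef_.reshape(1, -1))
print(coef_.shape)   # (1, 100)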
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)

from scikits.learn.grid_search import GridSearchCV

n_components = [10, 15, 20, 30, 40, 50, 64]
Cs = np.logspace(-4, 4, 16)
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs),
                         n_jobs=-1)
estimator.fit(X_digits, y_digits)
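This snippet references pipe and pca defined earlier in the original script. A self-contained hedged reconstruction with the current package layout; PCA plus LogisticRegression is an assumption about how pipe was built, and the C grid is coarsened from the original 16 points to keep the sketch quick.

import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

X_digits, y_digits = datasets.load_digits(return_X_y=True)

pipe = Pipeline([('pca', PCA()), ('logistic', LogisticRegression(max_iter=1000))])

param_grid = {'pca__n_components': [10, 15, 20, 30, 40, 50, 64],
              'logistic__C': np.logspace(-4, 4, 4)}   # coarser than the original grid
estimator = GridSearchCV(pipe, param_grid, n_jobs=-1)
estimator.fit(X_digits, y_digits)
print(estimator.best_params_)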
Example No. 27
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{"kernel": ("rbf",), "gamma": [1e-3, 1e-4]}, {"kernel": ("linear",)}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    cv = StratifiedKFold(y[train], 5)
    y_pred.append(clf.fit(X[train], y[train], cv=cv).predict(X[test]))
    y_true.append(y[test])

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)
classif_rate = np.mean(y_pred == y_true) * 100
print "Classification rate : %f" % classif_rate
Example No. 28
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param':[1, 2, 3]})
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)
Example No. 29
from scikits.learn.grid_search import GridSearchCV
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ('rbf',), 'gamma': [1e-3, 1e-4]},
                    {'kernel': ('linear',)}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_pred = np.append(y_pred, clf.predict(X[test]))
    y_true = np.append(y_true, y[test])

classif_rate = np.mean(y_pred == y_true) * 100
print "Classification rate : %f" % classif_rate
Example No. 30
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test,
                                    y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm
Example No. 32
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator

################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
n_components = 150
print "Extracting the top %d eigenfaces" % n_components
pca_sl = RandomizedPCA(n_components=n_components, whiten=True)
pca_sl.fit(X_train)
#components, mean = pca.pca(X_train, n_components)

#print "PCA components shape", pca.components_.T.shape 
#eigenfaces = pca.components_.T.reshape((-1, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca_sl.transform(X_train)
#X_train_pca = pca.transform(X_train, mean, components)

################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})

clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator



Example No. 34
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
Example No. 35
moto_vq_train, plane_vq_train = [
    np.load(file) for file in ['moto_vq_train.npy', 'plane_vq_train.npy']
]
labels = [0] * moto_vq_train.shape[0] + [1] * plane_vq_train.shape[0]

###############################################################################
# Train SVM

param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})

#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')

clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels))

print "Best estimator found by grid search:"
#print clf.best_estimator

###############################################################################
# Evaluation

moto_vq_eval, plane_vq_eval = [
    np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy']
parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    #    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #    'clf__n_iter': (10, 50, 80),
}

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)

# cross-validation doesn't work if the length of the data is not known,
# hence use lists instead of iterators
text_docs = [file(f).read() for f in data.filenames]

print "Performing grid search..."
print "pipeline:", [name for name, _ in pipeline.steps]
print "parameters:"
pprint(parameters)
t0 = time()
grid_search.fit(text_docs, data.target)
print "done in %0.3fs" % (time() - t0)
print

print "Best score: %0.3f" % grid_search.best_score
Example No. 37
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    cross_validation = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
    assert_equal(cross_validation.fit(X, y).best_estimator.foo_param, 2)