def Train(self, colNames, nValidation, labels, values, fout=None, callback=None):
    """Fit the final SVM pipeline on the supplied training set.

    The labels and values are first passed through ``TranslateTrainingSet``.
    ``ParameterGridSearch`` then supplies C and gamma, and an ANOVA
    percentile selector followed by an RBF-kernel SVC is fitted on
    ``self.svm_train_values`` / ``self.svm_train_labels`` and stored in
    ``self.model``.

    ``colNames`` and ``fout`` are accepted but not used in this method.
    """
    # Bring the supplied problem into the format the SVM code works on.
    self.TranslateTrainingSet(labels, values)

    # nValidation is only forwarded when it exceeds 1; otherwise the grid
    # search is called with the callback alone.
    search_args = (callback, nValidation) if nValidation > 1 else (callback,)
    C, gamma = self.ParameterGridSearch(*search_args)

    # Assemble and fit the final classifier with the parameters found above.
    selector = feature_selection.SelectPercentile(
        feature_selection.f_classif, percentile=self.percentile)
    svc = SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1)
    self.model = Pipeline([('anova', selector), ('svc', svc)])
    self.model.fit(self.svm_train_values, self.svm_train_labels)
def train_lrpipe(trainX, trainY, params):
    """Fit a single-step pipeline holding an L1-penalised LogisticRegression.

    The step is registered under the name ``logreg`` and starts with C=1.
    ``params`` is expanded as keyword arguments to ``Pipeline.fit``, and
    whatever ``fit`` returns is handed back to the caller.
    """
    estimator = LogisticRegression(penalty="l1", C=1)
    pipeline = Pipeline([('logreg', estimator)])
    return pipeline.fit(trainX, trainY, **params)
def train_svpipe(trainX, trainY, params):
    """Fit a single-step pipeline holding a default-constructed SVC.

    The step is registered under the name ``rbfsvm``. ``params`` is
    expanded as keyword arguments to ``Pipeline.fit``, and whatever
    ``fit`` returns is handed back to the caller.
    """
    pipeline = Pipeline([('rbfsvm', SVC())])
    return pipeline.fit(trainX, trainY, **params)
def test_dense_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + LinearSVC pipeline on the toy corpus."""
    # raw documents
    documents = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # Hold out the first and last document; both splits are exposed as
    # iterators to simulate generic iterables.
    train_docs = iter(documents[1:-1])
    test_docs = iter([documents[0], documents[-1]])

    # junk food is labelled -1, everything else +1
    labels = np.ones(len(documents))
    labels[:6] = -1
    train_labels = labels[1:-1]
    test_labels = np.array([labels[0], labels[-1]])

    steps = [('vect', CountVectorizer()), ('svc', LinearSVC())]
    param_grid = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # Search over feature-extraction and classifier parameters together.
    search = GridSearchCV(Pipeline(steps), param_grid, n_jobs=1)

    # Cross-validation needs to know the data length, so the iterators are
    # materialised as lists right before use.
    fitted = search.fit(list(train_docs), train_labels)
    predictions = fitted.predict(list(test_docs))
    assert_array_equal(predictions, test_labels)

    # Every candidate is expected to reach 100% accuracy on this toy
    # dataset; the assertions pin a perfect best score and max_n == 1 on
    # the selected vectorizer.
    assert_equal(search.best_score, 1.0)
    chosen_vectorizer = search.best_estimator.named_steps['vect']
    assert_equal(chosen_vectorizer.analyzer.max_n, 1)
def train(cls, labeled_featuresets):
    """Build a classifier instance from ``(featureset, label)`` pairs.

    Labels are mapped to integer targets by their position in the sorted
    list of distinct labels. A BagOfWordsVectorizer + LinearSVC(C=1000)
    pipeline is fitted on the featuresets and those targets.

    Args:
        labeled_featuresets: iterable of ``(featureset, label)`` pairs;
            labels must be hashable and mutually orderable.

    Returns:
        ``cls(pipeline, target_names)`` where ``target_names`` is the
        sorted list of distinct labels.

    Raises:
        ValueError: if ``labeled_featuresets`` is empty.
    """
    pairs = list(labeled_featuresets)
    if not pairs:
        # zip(*[]) would fail with an opaque unpacking error of the same type.
        raise ValueError("labeled_featuresets must not be empty")

    featuresets, target_labels = zip(*pairs)
    target_names = sorted(set(target_labels))

    # One dict lookup per sample instead of a linear list.index() scan.
    index_of = dict((name, position)
                    for position, name in enumerate(target_names))
    targets = [index_of[label] for label in target_labels]

    pipeline = Pipeline([
        ('bow', BagOfWordsVectorizer()),
        ('clf', LinearSVC(C=1000)),
    ])
    pipeline.fit(featuresets, targets)
    return cls(pipeline, target_names)
def get_clf(n=3, binarize=True):
    """Assemble a character n-gram naive Bayes pipeline.

    The vectorizer counts character n-grams of length 1 to ``n``. With
    ``binarize`` true the counts go through an in-place Binarizer and a
    BernoulliNB classifier; otherwise they go straight into MultinomialNB.
    """
    analyzer = CharNGramAnalyzer(
        min_n=1, max_n=n, preprocessor=SimplePreprocessor())
    steps = [('vectorizer', CountVectorizer(analyzer))]

    if not binarize:
        steps.append(('clf', naive_bayes.MultinomialNB()))
        return Pipeline(steps)

    steps.extend([
        ('binarizer', Binarizer(copy=False)),
        ('clf', naive_bayes.BernoulliNB()),
    ])
    return Pipeline(steps)
def do_grid_search(X, Y, gs_params):
    """Grid-search ``gs_params`` over a LogisticRegression step named ``logreg``.

    The search runs with ``n_jobs=-1``. The entry of ``grid_scores_`` with
    the highest second element is logged and its parameters are returned.
    """
    search = GridSearchCV(
        Pipeline([('logreg', LogisticRegression())]), gs_params, n_jobs=-1)
    search = search.fit(X, Y)

    def _score_of(entry):
        # Entries are indexed as (parameters, score).
        return entry[1]

    best_parameters, score = max(search.grid_scores_, key=_score_of)

    report = (
        ("best_parameters: ", best_parameters),
        ("expected score: ", score),
    )
    for prefix, value in report:
        logger.info(prefix + str(value))
    return best_parameters
def do_grid_search(X, Y, gs_params=None):
    """Grid-search an SVC pipeline step named ``rbfsvm`` on data (X, Y).

    When ``gs_params`` is missing or empty, a built-in grid over
    ``rbfsvm__C`` and ``rbfsvm__gamma`` is used. The search runs with
    ``n_jobs=-1``; the entry of ``grid_scores_`` with the highest second
    element is logged and its parameters are returned.
    """
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }

    pipeline = Pipeline([('rbfsvm', SVC())])
    search = GridSearchCV(pipeline, gs_params, n_jobs=-1).fit(X, Y)

    def _second(entry):
        # Rank grid entries by their score field.
        return entry[1]

    winner = max(search.grid_scores_, key=_second)
    best_parameters, score = winner

    params_message = "best_parameters: " + str(best_parameters)
    score_message = "expected score: " + str(score)
    logger.info(params_message)
    logger.info(score_message)
    return best_parameters
from time import time

import numpy as np
import matplotlib.pyplot as pl

from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report

from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings: feature extraction followed by a linear SVM,
# with the extractor options and the SVM's C searched jointly.
pipeline = Pipeline([
    ('extr', InfinitivesExtractor()),
    ('svc', LinearSVC(multi_class=True)),
])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9),
}
grid_search = GridSearchCV(pipeline, parameters)

print("Loading data...")
X, y = load_data()

print("Searching for the best model...")
# time() comes from the stdlib import at the top; the timing calls below
# would otherwise fail with NameError.
t0 = time()
grid_search.fit(X, y)
print("Done in %0.3f" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score)

clf = grid_search.best_estimator
# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]
# Floor division keeps the split point an integer slice bound even when
# true division is in effect.
split = (n_samples_total * 3) // 4

# Read each document inside a context manager so the file handle is closed
# as soon as its content has been loaded.
docs_train = []
for filename in dataset.filenames[:split]:
    with open(filename) as handle:
        docs_train.append(handle.read())

docs_test = []
for filename in dataset.filenames[split:]:
    with open(filename) as handle:
        docs_test.append(handle.read())

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the first 200 training samples using grid search for
# the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)
def XValidate(self, nPermutations):
    """Estimate classification accuracy by repeated 5-group cross-validation.

    C and gamma are first obtained from ``ParameterGridSearch`` on the whole
    training set. Then, ``nPermutations`` times, the object indices are
    shuffled and cut into 5 groups; for every choice of 4 training groups an
    ANOVA + RBF-SVC pipeline is fitted and the remaining group is predicted.
    Progress is shown in wx progress dialogs.

    Args:
        nPermutations: number of random re-partitionings to evaluate.

    Returns:
        A list with one entry per training object; each entry holds the
        wrong labels predicted for that object. ``None`` is returned when
        ``self.classifier.UpdateTrainingSet()`` reports failure.

    Raises:
        StopCalculating: raised by the progress callback when Cancel is
            pressed in a progress dialog.
    """
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback shared by both phases. ``dlg`` is resolved at call
    # time, so it always refers to whichever dialog is currently assigned.
    def cb(frac):
        cont, _skip = dlg.Update(int(frac * 100.),
                                 '%d%% Complete' % (frac * 100.))
        if not cont:  # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dialogStyle = (wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME |
                   wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
    dlg = wx.ProgressDialog(
        'Performing grid search for optimal parameters...',
        '0% Complete', 100, self.classifier, dialogStyle)

    # Define cross validation parameters
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for the optimal
    # parameters C and gamma
    logging.info('Performing grid search for parameters C and gamma on '
                 'entire training set...')
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    # %g rather than %d: a fractional C would otherwise be logged truncated.
    logging.info('Grid search completed. Found optimal C=%g and gamma=%f.',
                 C, gamma)

    # Create the classifier and initialize misclassification storage.
    # ``tol`` matches the SVC keyword used by Train().
    classifier = Pipeline([
        ('anova', feature_selection.SelectPercentile(
            feature_selection.f_classif, percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1)),
    ])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    # np.ceil returns a float; slice bounds must be integers.
    subsetSize = int(np.ceil(nObjects / float(totalGroups)))
    lastGroupStart = (totalGroups - 1) * subsetSize
    indices = np.arange(nObjects)
    misclassifications = [[] for _ in range(nObjects)]

    # Every way of picking the training groups; the group left out of a
    # combination is the test group for that round.
    trainingTotalGroups = list(combinations(range(totalGroups),
                                            trainingGroups))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all combinations of the subsets train the classifier on the
    # training groups and classify the remaining group, for a number of
    # random partitionings
    trainingFraction = trainingGroups / float(totalGroups)
    logging.info('Calculating average classification accuracy %d times over a '
                 '%0.1f%%/%0.1f%% cross-validation process',
                 nPermutations, trainingFraction * 100,
                 (1 - trainingFraction) * 100)
    dlg = wx.ProgressDialog(
        'Calculating average cross-validation accuracy...',
        '0% Complete', 100, self.classifier, dialogStyle)

    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)
    for per in range(nPermutations):
        # Split the shuffled indices into equally sized leading groups plus
        # one trailing group holding the remainder.
        # NOTE(review): np.hsplit needs the leading slice to divide evenly,
        # which appears to fail for very small training sets — confirm.
        np.random.shuffle(indices)
        subsets = np.hsplit(indices[0:lastGroupStart], totalGroups - 1)
        subsets.append(indices[lastGroupStart:])

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack([subsets[i] for i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack([subsets[i] for i in range(totalGroups)
                                 if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications
            trueLabels = allLabels[testSet]
            for objectIndex, predicted, actual in zip(testSet, testLabels,
                                                      trueLabels):
                if predicted != actual:
                    misclassifications[objectIndex].append(predicted)

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy
    dlg.Destroy()
    nMisclassified = sum(len(wrong) for wrong in misclassifications)
    accuracy = (1 - nMisclassified / float(nObjects * nPermutations)) * 100
    logging.info('Average Classification Accuracy: %f%%', accuracy)

    return misclassifications
from scikits.learn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Load the digits data and flatten each sample to one row
digits = datasets.load_digits()
y = digits.target
n_samples = len(y)
X = digits.data.reshape((n_samples, -1))

# ---------------------------------------------------------------------------
# Feature-selection transform chained with an SVM into a single estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([transform], svm.SVC())

# ---------------------------------------------------------------------------
# Cross-validation score as a function of the percentile of features kept
score_means = []
score_stds = []
percentiles = tuple(range(10, 101, 10))

for percentile in percentiles:
    # The transform object is shared with the pipeline, so updating it here
    # changes what clf uses on the next scoring run.
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = load_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print("")

# ---------------------------------------------------------------------------
# Text feature extraction (counts, then tf-idf) feeding a simple classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Enabling more of the commented entries widens the search but multiplies
# the number of combinations to evaluate.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
#    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
#    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
#    'clf__n_iter': (10, 50, 80),
}
# Floor division keeps the midpoint an integer slice bound even when true
# division is in effect.
y_train = dataset.target[:n_samples_total // 2]
y_test = dataset.target[n_samples_total // 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names))

# Plot the confusion matrix
"""
==================
Pipeline Anova SVM
==================

Simple usage of Pipeline that runs successively a univariate
feature selection with anova and then a C-SVM of the selected features.
"""
print(__doc__)

from scikits.learn import svm
from scikits.learn.datasets import samples_generator
from scikits.learn.feature_selection import SelectKBest, f_classif, f_regression
from scikits.learn.pipeline import Pipeline

# import some data to play with
X, y = samples_generator.test_dataset_classif(k=5)

# ANOVA SVM-C
# 1) anova filter, take 5 best ranked features.
#    The targets are class labels, so the features are scored with the
#    classification F-test (f_classif) rather than the regression one.
anova_filter = SelectKBest(f_classif, k=5)
# 2) svm
clf = svm.SVC(kernel='linear')

anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)])
anova_svm.fit(X, y)
anova_svm.predict(X)
import numpy as np
import pylab as pl

from scikits.learn import linear_model, decomposition, datasets, cross_val
from scikits.learn.pipeline import Pipeline

# PCA followed by logistic regression, chained into one estimator
logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# ---------------------------------------------------------------------------
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

# ---------------------------------------------------------------------------
# Prediction: cross-validated scores of the full pipeline
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)
# Keep only the first 200 samples, to be in the curse-of-dimension setting
y = digits.target[:200]
n_samples = len(y)
X = digits.data[:200].reshape((n_samples, -1))

# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

# ---------------------------------------------------------------------------
# Feature-selection transform chained with an SVM into a single estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

# ---------------------------------------------------------------------------
# Cross-validation score as a function of the percentile of features kept
score_means = []
score_stds = []
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    # Compute the cross-validation score (run with n_jobs=1)
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
# Scale the noise from the norms of y and noise and the snr value, then
# add it to y in place.
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise

# ---------------------------------------------------------------------------
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# --- Ward agglomeration followed by BayesianRidge ---
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(
    n_clusters=10, connectivity=A, memory=mem, n_components=1)
parameters = {'ward__n_clusters': [10, 20, 30]}

# Select the optimal number of parcels with grid search
clf = GridSearchCV(
    Pipeline([('ward', ward), ('ridge', ridge)]), parameters, n_jobs=1)
clf.fit(X, y, cv=cv)

# Take the coefficients from the last step of the best pipeline and pass
# them through the first step's inverse_transform before reshaping.
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# --- Anova univariate feature selection followed by BayesianRidge ---
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
parameters = {'anova__percentile': [5, 10, 20]}

# Select the optimal percentage of features with grid search
clf = GridSearchCV(Pipeline([('anova', anova), ('ridge', ridge)]), parameters)