def Train(self,
              colNames,
              nValidation,
              labels,
              values,
              fout=None,
              callback=None):
        '''
        Train an SVM model using optimized C and gamma parameters and a training set.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        self.model.fit(self.svm_train_values, self.svm_train_labels)
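ParameterGridSearch itself is not part of this excerpt. Below is a minimal sketch of what such a search could look like, reusing the GridSearchCV pattern from the examples further down; the helper name, the fold handling, and the parameter ranges are assumptions, not the original code:

from scikits.learn.svm import SVC
from scikits.learn.grid_search import GridSearchCV

def parameter_grid_search_sketch(values, labels):
    # Exponentially spaced C and gamma grids are the usual starting point
    # for tuning an RBF C-SVM (ranges here are assumptions)
    param_grid = {
        'C': [2.0 ** i for i in range(-5, 16, 2)],
        'gamma': [2.0 ** i for i in range(-15, 4, 2)],
    }
    gs = GridSearchCV(SVC(kernel='rbf'), param_grid)
    gs = gs.fit(values, labels)
    # grid_scores_ pairs each parameter set with its validation score,
    # as in the do_grid_search helpers below
    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    return best_parameters['C'], best_parameters['gamma']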
def train_lrpipe(trainX, trainY, params):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    lrpipe = Pipeline([('logreg', LogisticRegression(penalty="l1", C=1))])
    lrpipe = lrpipe.fit(trainX, trainY, **params)
    return lrpipe
def train_svpipe(trainX, trainY, params):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    svpipe = svpipe.fit(trainX, trainY, **params)
    return svpipe
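For reference, a hypothetical call to these two helpers, passing pipeline-prefixed parameters through fit(**params) as the docstrings describe; the concrete values are assumptions:

lrpipe = train_lrpipe(trainX, trainY, {'logreg__C': 0.1})
svpipe = train_svpipe(trainX, trainY, {'rbfsvm__C': 10, 'rbfsvm__gamma': 0.1})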
Example #4
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2), 'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset all parameter combinations converge to 100%
    # accuracy models, so the grid search retains the unigram representation
    # checked below as the best estimator
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
Example #5
	def train(cls, labeled_featuresets):
		train, target_labels = zip(*labeled_featuresets)
		target_names = sorted(set(target_labels))
		targets = [target_names.index(l) for l in target_labels]
		
		pipeline = Pipeline([
			('bow', BagOfWordsVectorizer()),
			('clf', LinearSVC(C=1000)),
		])
		
		pipeline.fit(train, targets)
		return cls(pipeline, target_names)
Example #6
def get_clf(n=3, binarize=True):
    steps = [('vectorizer',
              CountVectorizer(
                  CharNGramAnalyzer(min_n=1,
                                    max_n=n,
                                    preprocessor=SimplePreprocessor())))]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
        steps.append(('clf', naive_bayes.BernoulliNB()))
    else:
        steps.append(('clf', naive_bayes.MultinomialNB()))

    return Pipeline(steps)
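A hypothetical call, assuming train_docs/test_docs are lists of raw strings and train_labels is a sequence of class labels:

clf = get_clf(n=3, binarize=True)
clf.fit(train_docs, train_labels)
predicted = clf.predict(test_docs)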
def do_grid_search(X, Y, gs_params):
    """ Given data (X,Y) will perform a grid search on g_params
        for a LogisticRegression called logreg
        """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
def do_grid_search(X, Y, gs_params=None):
    """ Given data (X,Y) will perform a grid search on g_params
        for a LogisticRegression called logreg
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
Example #9
import numpy as np
import matplotlib.pyplot as pl
from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report
from time import time
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings
pipeline = Pipeline([('extr', InfinitivesExtractor()),
                     ('svc', LinearSVC(multi_class=True))])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
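classification_report is imported above but unused in this excerpt; a plausible continuation in the same style (evaluating on the full X, y, since no held-out split appears in the excerpt, is an assumption):

y_pred = clf.predict(X)
print classification_report(y, y_pred, class_names=target_names)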
Example #10
# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

split = (n_samples_total * 3) / 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)
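The excerpt stops after the refit; a plausible evaluation step, mirroring the pattern of Example #14 below (the metrics import is an addition):

from scikits.learn import metrics

y_predicted = clf.predict(docs_test)
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)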
    def XValidate(self, nPermutations):
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Initialize process dialog
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        logging.info(
            'Grid search completed. Found optimal C=%d and gamma=%f.' %
            (C, gamma))

        # Create the classifier and initialize misclassification storage
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        subsetSize = int(np.ceil(nObjects / float(totalGroups)))
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i' * trainingGroups)
        trainingTotalGroups = list(
            np.fromiter(combinations(range(totalGroups), trainingGroups),
                        dtype=dt,
                        count=-1))
        #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the training set into subsets
            np.random.shuffle(indices)
            lastGroupStart = (totalGroups - 1) * subsetSize
            subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups - 1))
            subsets.append(indices[lastGroupStart:])

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                for i in range(len(testLabels)):
                    if testLabels[i] != allLabels[testSet][i]:
                        misclassifications[testSet[i]].append(testLabels[i])

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1-len([item for sublist in misclassifications for item in sublist]) /\
                     float(nObjects * nPermutations))*100))

        return misclassifications
Example #12
import numpy as np
import pylab as pl

from scikits.learn import cross_val, datasets, feature_selection, svm
from scikits.learn.pipeline import Pipeline

################################################################################
# Import some data to play with
digits = datasets.load_digits()
y = digits.target
n_samples = len(y)
X = digits.data.reshape((n_samples, -1))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100)

for percentile in percentiles:
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
Example #13
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = load_20newsgroups(subset='train', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

################################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    #    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #    'clf__n_iter': (10, 50, 80),
}
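The excerpt ends at the parameter grid; running the search would follow the same pattern as the other examples here (fitting on data.data and data.target is an assumption about this dataset object):

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(data.data, data.target)
print "Best score: %0.3f" % grid_search.best_score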
Example #14
y_train = dataset.target[:n_samples_total / 2]
y_test = dataset.target[n_samples_total / 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the previous preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test,
                                    y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
Example #15
"""
==================
Pipeline Anova SVM
==================

Simple usage of Pipeline that successively runs a univariate
feature selection with anova and then a C-SVM on the selected features.
"""
print __doc__

from scikits.learn import svm
from scikits.learn.datasets import samples_generator
from scikits.learn.feature_selection import SelectKBest, f_regression
from scikits.learn.pipeline import Pipeline

# import some data to play with
X, y = samples_generator.test_dataset_classif(k=5)

# ANOVA SVM-C
# 1) anova filter, take 5 best ranked features
anova_filter = SelectKBest(f_regression, k=5)
# 2) svm
clf = svm.SVC(kernel='linear')

anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)])
anova_svm.fit(X, y)
anova_svm.predict(X)

import numpy as np
import pylab as pl

from scikits.learn import linear_model, decomposition, datasets, cross_val

logistic = linear_model.LogisticRegression()

pca = decomposition.PCA()
from scikits.learn.pipeline import Pipeline
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)
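A natural next step is to tune the pipeline with the same GridSearchCV pattern used throughout; the grids over pca__n_components and logistic__C below are assumptions:

from scikits.learn.grid_search import GridSearchCV

estimator = GridSearchCV(pipe,
                         {'pca__n_components': [20, 40, 64],
                          'logistic__C': [0.1, 1.0, 10.0]})
estimator.fit(X_digits, y_digits)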
Example #17
y = digits.target
# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    # Compute cross-validation score using all CPUs
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
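The excerpt stops at the errorbar call; a title and axis labels would plausibly finish the figure (the exact strings are assumptions):

pl.title('Performance of the SVM-Anova varying the percentile of features selected')
pl.xlabel('Percentile')
pl.ylabel('Prediction rate')
pl.axis('tight')
pl.show()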
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = {'anova__percentile': [5, 10, 20]}
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
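The excerpt ends before this second search is run; by analogy with the Ward block above, the continuation would be (the coef_selection_ name and the inverse_transform on the anova step are assumptions):

clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_selection_ = coef_.reshape(size, size)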