예제 #1
0
 def fit(self, X, y):
     """Transform ``X`` with an L1-penalised ``LinearSVC``, then fit on it.

     A separate ``LinearSVC`` is created and stored as ``self.transformer_``.
     Its ``fit_transform(X, y)`` output replaces ``X`` before this estimator
     is fitted through ``LinearSVC.fit``; that call's return value is
     returned unchanged.
     """
     # C controls the regularization strength: the smaller C, the stronger
     # the regularization, and more regularization means more sparsity.
     selector = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3)
     self.transformer_ = selector
     X_transformed = selector.fit_transform(X, y)
     return LinearSVC.fit(self, X_transformed, y)
예제 #2
0
def test_dense_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + LinearSVC pipeline on toy documents.

    The first and last documents are held out for prediction; the remaining
    ones are used to fit the grid search.
    """
    # raw documents
    docs = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # wrap both splits in iterators to simulate generic iterables
    train_docs = iter(docs[1:-1])
    held_out_docs = iter([docs[0], docs[-1]])

    # label junk food as -1, the others as +1
    labels = np.ones(len(docs))
    labels[:6] = -1
    train_labels = labels[1:-1]
    held_out_labels = np.array([labels[0], labels[-1]])

    steps = [('vect', CountVectorizer()), ('svc', LinearSVC())]
    param_grid = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # search jointly over the feature extraction and classifier parameters
    search = GridSearchCV(Pipeline(steps), param_grid, n_jobs=1)

    # cross-validation needs to know the length of the data, so the
    # iterators are materialised into lists before being handed over
    fitted_search = search.fit(list(train_docs), train_labels)
    predictions = fitted_search.predict(list(held_out_docs))
    assert_array_equal(predictions, held_out_labels)

    # all candidates are expected to reach 100% accuracy on this toy dataset;
    # the retained vectorizer is expected to be the unigram one (max_n == 1)
    assert_equal(search.best_score, 1.0)
    best_vectorizer = search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
    def train(labeled_featuresets, C=1e5):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        feat = [featureset for featureset, label in labeled_featuresets]
        feature_vectorizer = MVectorizer.DictsVectorizer()
        X = feature_vectorizer.fit_transform(feat)
        X = Normalizer().fit_transform(X)
        label_set = set( [label for featureset, label in labeled_featuresets] )
        label_vectorizer = dict( [(label,num) for num,label in enumerate(label_set)] )
        y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
        print "Training on %d examples with %d features..."%(X.shape[0],X.shape[1]),
        classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-5, C=C, scale_C=True))
        classifier.fit(X,y)
        print "done"

        return scikit_classifier(feature_vectorizer,label_vectorizer,classifier)
예제 #4
0
    print
    return score, train_time, test_time


# Benchmark the ridge and k-nearest-neighbours classifiers.
for clf, name in [
        (RidgeClassifier(tol=1e-1), "Ridge Classifier"),
        (KNeighborsClassifier(n_neighbors=10), "kNN")]:
    print('=' * 80)
    print(name)
    results = benchmark(clf)

# Benchmark LinearSVC and SGDClassifier once per penalty, L2 first.
for penalty in ("l2", "l1"):
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Liblinear model
    liblinear_results = benchmark(
        LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False,
                  tol=1e-3))

    # SGD model
    sgd_results = benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))

# SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
sgd_results = benchmark(
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))

# Sparse Naive Bayes classifier
print('=' * 80)
print("Naive Bayes")
mnnb_results = benchmark(MultinomialNB(alpha=.01))
예제 #5
0
파일: classify.py 프로젝트: vene/misc-nlp
from preprocess import get_clf, load_data, preprocess_data
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold, LeaveOneOut
from sklearn.grid_search import GridSearchCV

if __name__ == '__main__':
    # For every combination of n-gram size, suffix marker and binarize flag,
    # preprocess the data, grid-search C for a LinearSVC with leave-one-out
    # cross-validation, and record the best score and the best C.
    filename = 'inf-all-labeled.txt'

    X, y = load_data(filename)
    n = len(X)

    # Result arrays are indexed as [ngram index, suffix index, binarize index].
    # The builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide; the resulting dtype is the same.
    scores = np.empty((5, 2, 2), dtype=float)
    best_C = np.empty((5, 2, 2), dtype=float)
    for i, ngrams in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print("ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix,
                                                             binarize))
                X_new = preprocess_data(X,
                                        n=ngrams,
                                        suffix=suffix,
                                        binarize=binarize)
                grid = GridSearchCV(
                    estimator=LinearSVC(),
                    n_jobs=4,
                    verbose=False,
                    param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
                    cv=LeaveOneOut(n, indices=True))
                grid.fit(X_new, y)
                scores[i, j, k] = grid.best_score
                best_C[i, j, k] = grid.best_estimator.C
예제 #6
0
print

# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1:      n_samples: %d, n_features: %d" % X_L1.shape
# print

# Fit the classifier on the L1 data (X_L1, y_L1) and report the training time.
print("Training L1 Classifier...")
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print(clf)
clf.fit(X_L1, y_L1)
train_time = time() - t0
print("Train time: %0.3fs" % train_time)
print("")

# Announce the L2 classifiers and restart the timer.
print("Training L2 Classifiers...")
t0 = time()

# The LinearSVC variants below are intentionally disabled (commented out):
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
예제 #7
0
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time

# Benchmark the ridge and k-nearest-neighbours classifiers.
for clf, name in [
        (RidgeClassifier(tol=1e-1), "Ridge Classifier"),
        (KNeighborsClassifier(n_neighbors=10), "kNN")]:
    print('=' * 80)
    print(name)
    results = benchmark(clf)

# Benchmark LinearSVC and SGDClassifier once per penalty, L2 first.
for penalty in ("l2", "l1"):
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Liblinear model
    liblinear_results = benchmark(
        LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False,
                  tol=1e-3))

    # SGD model
    sgd_results = benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))

# SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
sgd_results = benchmark(
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))

# Sparse Naive Bayes classifier
print('=' * 80)
print("Naive Bayes")
mnnb_results = benchmark(MultinomialNB(alpha=.01))
예제 #8
0
# Categories loaded from the 'coarse/' directory.
categories = ['HUM', 'LOC', 'NUM', 'ENTY', 'DESC', 'ABBR']

train = load_files('coarse/',
                   categories=categories,
                   shuffle=True,
                   random_state=42)

# Save the loaded training set. The ``with`` block closes the file even if
# pickling raises, which the explicit open()/close() pair did not guarantee.
with open('pickle_training_coarse.pkl', 'wb') as filehandler:
    pickle.dump(train, filehandler)

# Bag-of-words counts -> tf-idf weighting -> linear SVM.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

_ = text_clf.fit(train.data, train.target)

# Save the fitted pipeline, again closing the file on every exit path.
with open('pickle_clf_coarse.pkl', 'wb') as filehandler:
    pickle.dump(text_clf, filehandler)

#new = ['Where is the Amazon river located?',
#       'Where can I get a good sandwhich',
#       'In what state was Columbus born?',
#       'What is the best cheese?']

text = """
예제 #9
0
def find_best_lsvc(**params):
    """Return a ``GridSearchCV`` over ``C`` for a ``LinearSVC``.

    All keyword arguments are forwarded to the ``LinearSVC`` constructor.
    The search object is only constructed here; fitting is left to the
    caller.
    """
    base_estimator = LinearSVC(**params)
    c_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
    return GridSearchCV(base_estimator, c_grid)
예제 #10
0
    for i, n in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print "%d-%d-%d out of 411" % (i, j, k)
                X_sg_p, v_sg = preprocess.preprocess_data(X_sg,
                                                          suffix=suffix,
                                                          n=n,
                                                          return_vect=True,
                                                          binarize=binarize)
                X_pl_p, v_pl = preprocess.preprocess_data(X_pl,
                                                          suffix=suffix,
                                                          n=n,
                                                          return_vect=True,
                                                          binarize=binarize)

                grid1 = GridSearchCV(estimator=LinearSVC(),
                                     n_jobs=-1,
                                     verbose=True,
                                     param_grid={'C': np.logspace(-2, 2, 5)},
                                     cv=KFold(len(X_sg), k=10, indices=True))
                grid1.fit(X_sg_p, y_sg)
                scores_sg[i, j, k] = grid1.best_score
                best_C_sg = grid1.best_estimator.C
                clf = grid1.best_estimator

                X_sg_n_p = v_sg.transform(X_sg_n)
                y_sg_n = clf.predict(X_sg_n_p)
                predict_sg[i, j, k] = (y_sg_n == 0).mean()

                grid2 = GridSearchCV(estimator=LinearSVC(),
                                     n_jobs=-1,
예제 #11
0
        return unicode_content.lower()

    def __repr__(self):
        """Return the fixed string ``"LowerCasePreprocessor()"``."""
        return "LowerCasePreprocessor()"

# Character n-gram analyzer (1 to 3 characters) using the lower-casing
# preprocessor.
analyzer1 = CharNGramAnalyzer(min_n=1, max_n=3,
                              preprocessor=LowerCasePreprocessor())

# Vectorizer / classifier pipeline built on the analyzer above:
# n-gram counts -> term-frequency scaling (no idf) -> L1-penalised linear SVM.
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer1)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Train on the training set, then predict the outcome on the testing set.
clf.fit(twenty_train.data, twenty_train.target)
y_predicted = clf.predict(doc_test)


# Short new sentences to run the fitted pipeline on.
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]