예제 #1
0
the hyperplane is forced to run through the origin, thus,
the plotted HP is not the max margin HP.
"""
print __doc__

import numpy as np
import pylab as pl
from scikits.learn.sgd.sparse import SGD

# Toy data set: two clouds of 20 standard-normal points each, one shifted
# by (-2, -2) and the other by (+2, +2). The seed is fixed so that every
# run draws the same 40 points.
np.random.seed(0)
lower_cloud = np.random.randn(20, 2) - [2, 2]
upper_cloud = np.random.randn(20, 2) + [2, 2]
X = np.r_[lower_cloud, upper_cloud]
Y = 20 * [0] + 20 * [1]

# Train a linear classifier with hinge loss by stochastic gradient descent.
clf = SGD(loss="hinge", alpha=0.01, n_iter=50, fit_intercept=True)
clf.fit(X, Y)

# Evaluate the classifier's margin on a 10 x 10 grid covering
# [-5, 5] x [-5, 5]; Z holds one margin value per grid cell.
xx = np.linspace(-5, 5, 10)
yy = np.linspace(-5, 5, 10)
X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for cell, first_coord in np.ndenumerate(X1):
    # `cell` is the (row, column) tuple, usable directly as an index.
    margin = clf.predict_margin([first_coord, X2[cell]])
    Z[cell] = margin[0]

# Contour settings: the zero level is drawn solid, the -1 / +1 levels
# dashed, everything in black.
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'
예제 #2
0
pos = 0 # alt.atheism
neg = 1 # comp.graphics
pos_idx = np.where(target == pos)[0]
neg_idx = np.where(target == neg)[0]
idx = np.concatenate((pos_idx, neg_idx))
np.random.seed(13)
np.random.shuffle(idx)
data = news_train.data[idx]
target = news_train.target[idx]

print "num train docs: ", data.shape[0]
print ""
<<<<<<< HEAD
print "Training a linear SVM (hinge loss and L2 regularizer) using SGD:"

clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True)
print clf

t0 = time()
=======
print "Training a linear SVM (hinge loss and L2 regularizer) using SGD.\n"\
      "SGD(n_iter=50, alpha=0.00001, fit_intercept=True)"
t0 = time()
clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True)
#clf = LinearSVC(**parameters)
>>>>>>> remote
clf.fit(data, target)
print "done in %fs" % (time() - t0)
print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

print "Loading 20 newsgroups test set... "
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Training a linear SVM (hinge loss and L2 regularizer) using SGD.\n"\
      "SGD(n_iter=50, alpha=0.00001, fit_intercept=True)"
t0 = time()
clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True)
clf.fit(X_train, y_train)
print "done in %fs" % (time() - t0)
print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)


print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test',
                         categories=['alt.atheism', 'comp.graphics'])
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)
the hyperplane is forced to run through the origin, thus,
the plotted HP is not the max margin HP. 
"""
print __doc__

import numpy as np
import pylab as pl
from scikits.learn.sgd.sparse import SGD

# Draw 40 points from a fixed seed: the first 20 are standard-normal
# samples shifted by (-2, -2) and labelled 0, the last 20 are shifted by
# (+2, +2) and labelled 1.
np.random.seed(0)
X = np.r_[
    np.random.randn(20, 2) - [2, 2],
    np.random.randn(20, 2) + [2, 2],
]
Y = [0] * 20 + [1] * 20

# Fit a hinge-loss linear model with SGD.
clf = SGD(loss="hinge", alpha=0.01, n_iter=50, fit_intercept=True)
clf.fit(X, Y)

# Sample the model's margin over a regular 10 x 10 grid on
# [-5, 5] x [-5, 5] so that its level sets can be drawn.
xx = np.linspace(-5, 5, 10)
yy = np.linspace(-5, 5, 10)
X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for row, col in np.ndindex(*X1.shape):
    grid_point = [X1[row, col], X2[row, col]]
    Z[row, col] = clf.predict_margin(grid_point)[0]

# Levels to draw and how: solid line at margin 0, dashed lines at
# margins -1 and +1, all in black.
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'