def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Evaluate a group-lasso CDClassifier for one regularization strength.

    Bootstrap-resamples the labeled rows ``k`` times, trains on ~80% and
    tests on ~20% of each resample, and reports median accuracy and median
    fraction of selected (nonzero) features.

    Parameters
    ----------
    df : pandas.DataFrame whose index labels the samples.
    sets : per-class collections of row labels (forwarded to select_sets).
    motifs : pandas.DataFrame of features, indexed like ``df``.
    alpha : float, group-lasso (l1/l2) regularization strength.
    nsample : number of bootstrap draws per round.
    k : number of bootstrap rounds.
    cutoff : unused; kept for interface compatibility.

    Returns
    -------
    tuple of (alpha, median accuracy, median nonzero-feature percentage).
    """
    ret = select_sets(df, sets)

    # Build integer class labels: 0 marks "unselected", then shift back
    # to 0-based labels after dropping unselected rows.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=len(sets) > 2,
        max_iter=20,
        alpha=alpha,
        C=1.0 / motifs.shape[0],
        tol=1e-3,
    )

    # BUG FIX: the original sliced with a float (nsample * 0.8 + 1), which
    # raises TypeError on numpy arrays; use an integer 80/20 split point.
    n_train = int(nsample * 0.8) + 1

    accs = []
    fractions = []
    for _ in range(k):
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_train = y.iloc[idx[:n_train]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[n_train:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # Train the model and record test accuracy / feature sparsity.
        clf.fit(X_train, y_train)
        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Bootstrap evaluation of a group-lasso CDClassifier at a given alpha.

    For each of ``k`` rounds, draws ``nsample`` rows with replacement,
    fits on the first ~80% and scores on the remaining ~20%.

    Parameters
    ----------
    df : pandas.DataFrame indexed by sample labels.
    sets : per-class row-label collections (forwarded to select_sets).
    motifs : pandas.DataFrame of features sharing ``df``'s index.
    alpha : float, l1/l2 regularization strength.
    nsample : bootstrap sample size per round.
    k : number of bootstrap rounds.
    cutoff : unused; kept for interface compatibility.

    Returns
    -------
    (alpha, median accuracy, median percentage of nonzero features).
    """
    ret = select_sets(df, sets)

    # Label selected rows 1..n_classes, drop unselected rows, then shift
    # down to the 0-based labels the classifier expects.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=len(sets) > 2,
        max_iter=20,
        alpha=alpha,
        C=1.0 / motifs.shape[0],
        tol=1e-3,
    )

    # BUG FIX: slicing idx with the float nsample * 0.8 + 1 raises
    # TypeError; compute an integer train/test split point instead.
    split = int(nsample * 0.8) + 1

    accs = []
    fractions = []
    for _ in range(k):
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        clf.fit(X_train, y_train)
        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
# Fit the reference implementation, then the equivalent lightning
# (Cython) CDClassifier, and print both training accuracies for
# comparison.
# FIX: Python 2 print statements converted to print() calls (the output
# is unchanged); this matches the Python 3 style used elsewhere in the
# file.
one_over_n = 1. / float(n_samples)
ds = ColumnData(X)
coefs_ = np.zeros((n_features, n_classes))
# NOTE(review): fit/score are the project's reference routines — their
# exact contract is defined elsewhere in this repository.
fit(ds, y, one_over_n, n_samples, n_features, n_classes, coefs_, groups)
s = score(X, y, coefs_)
print("score = ", s)
print('======================================================')

clf_max_iter = 300
clf_tol = 1e-3
print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf_max_iter,
                         alpha=0.5,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf_tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc:", light_clf.score(X, y))
print(light_clf.coef_.T)
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized

from lightning.classification import CDClassifier

# Benchmark warm-starting along a regularization path: reuse the previous
# solution as the starting point for each new C versus refitting cold.
# FIX: Python 2 print statements converted to print() calls (identical
# output), consistent with the Python 3 example in this file.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1  # binarize the 20-class target into class 0 vs the rest

Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)
    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    print("Total time", time.time() - start)
    print("Average accuracy", np.mean(scores))
# Build model: cross-validate an L1-regularized logistic regression over
# a grid of C values and report accuracy plus feature sparsity.
# FIX: Python 2 print statements converted to print() calls (identical
# output), and the duplicated nonzero() computation is done once.
for C in [1, 0.1, 0.01, 0.001, 0.0001]:
    # C is the inverse of regularization strength; as in support vector
    # machines, smaller values mean stronger regularization.
    clf = LogisticRegression(C=C, penalty='l1', tol=0.001)
    clf.fit(X_train, y_train)

    # Number and fraction of features surviving the L1 penalty.
    nonzero_idx = clf.coef_[0].nonzero()[0]
    num = len(nonzero_idx)
    p = num * 1.0 / len(X_train.columns)

    print('%s = 0, %s = 1' % tuple(clf.classes_))
    print('C: ', C)
    print('Prediction accuracy: ', clf.score(X_test, y_test))
    print('Features left (# / %): ', num, '/', p)

    # Persist the C == 1 results only.
    if C == 1:
        writer.write('%s = 0, %s = 1 \n' % tuple(clf.classes_))
        writer.write('C: %s \n' % C)
        writer.write('Accuracy: %s \n' % clf.score(X_test, y_test))
        writer.write('Features left (#/%%): %s / %s \n' % (num, p))

    # List the selected features when fewer than half remain.
    if p < 0.5:
        idx = clf.coef_[0].nonzero()
        ws = clf.coef_[0][idx].round(3).astype(str)
        fs = X_train.columns[idx]
        tmp = fs + ' (' + ws + ')'
        print('Selected features: %s' % ', '.join(tmp))
# Source: http://contrib.scikit-learn.org/lightning/
from sklearn.datasets import fetch_20newsgroups_vectorized

from lightning.classification import CDClassifier

# Fetch the vectorized News20 corpus from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X, y = bunch.data, bunch.target

# Multiclass group-lasso (l1/l2) classifier trained by coordinate descent.
clf = CDClassifier(
    penalty="l1/l2",
    loss="squared_hinge",
    multiclass=True,
    max_iter=20,
    alpha=1e-4,
    C=1.0 / X.shape[0],
    tol=1e-3,
)

# Fit on the full data set.
clf.fit(X, y)

# Training accuracy.
print(clf.score(X, y))

# Percentage of features with a nonzero weight.
print(clf.n_nonzero(percentage=True))
# Compare lightning's CDClassifier against the project's
# LatentGroupClassifier on a tf-idf representation of `texts`.
# FIXES: Python 2 print statements converted to print() calls (identical
# output); the estimator instance no longer shadows the `mmclf` module
# name it is constructed from.
texts = [" ".join(text) for text in texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6,
                   verbose=5)
# BUG FIX: the original rebound the module name `mmclf` to this instance;
# use a distinct variable so the module remains accessible.
latent_clf = mmclf.LatentGroupClassifier(max_iter=15,
                                         C=1.0 / X_train.shape[0])

start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
print("CDClassifier time", elapsed)
print("CDClassifier score", clf.score(X_train, y_train))

start = time()
latent_clf.fit(X_train, y_train)
elapsed = time() - start
print("LatentGroupClassifier time", elapsed)
print("LatentGroupClassifier score", latent_clf.score(X_train, y_train))

print("CDClassifier weights\n", clf.coef_.T)
print("LatentGroupClassifier weights\n", latent_clf.coefs_.T)
print("features", vectorizer.vocabulary_)
top_words = 30

# For each topic column, collect dictionary words with a positive
# coefficient and print the lexicographically largest `top_words` of them,
# then fit the equivalent lightning CDClassifier for comparison.
# FIX: Python 2 print statements / xrange converted to print() / range
# (output unchanged; `print k,` becomes end=' ').
print("==== Keywords ==== ")
for m in range(clf.coefs_.shape[1]):
    t = []
    print('Topic', m)
    for row in range(clf.coefs_.shape[0]):
        if clf.coefs_[row, m] > 0:
            t.append(dict_text[row])
    for k in heapq.nlargest(top_words, t):
        print(k, end=' ')
    print()

print("==== Lightning Cython Implementation (Row-wise sparsity) =====")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("==========>> Accuracy :", light_clf.score(X, y))
print("Weight Matrix:")
print(light_clf.coef_.T)
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized

from lightning.classification import CDClassifier

# Benchmark the effect of shrinking (active-set pruning) on training time
# for an L1-penalized CDClassifier on binarized News20.
# FIX: Python 2 print statements converted to print() calls (identical
# output), consistent with the Python 3 example in this file.

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1  # binarize the 20-class target

for shrinking in (True, False):
    clf = CDClassifier(C=1.0,
                       loss="squared_hinge",
                       penalty="l1",
                       tol=1e-3,
                       max_iter=1000,
                       shrinking=shrinking,
                       random_state=0)
    start = time.time()
    clf.fit(X, y)
    print("Training time", time.time() - start)
    print("Accuracy", clf.score(X, y))