class Solver(BaseSolver):
    """Benchmark solver backed by lightning's ``CDClassifier``.

    The estimator is configured with the logistic loss and an L1 penalty
    whose strength is the ``lmbd`` value handed to ``set_objective``.
    """

    name = 'Lightning'

    install_cmd = 'conda'
    requirements = [
        'pip:git+https://github.com/scikit-learn-contrib/lightning.git'
    ]

    def set_objective(self, X, y, lmbd):
        """Remember the problem data and build a fresh estimator for it.

        Parameters
        ----------
        X, y : training data, passed unchanged to ``CDClassifier.fit``.
        lmbd : regularisation strength, forwarded as ``alpha``.
        """
        self.X = X
        self.y = y
        self.lmbd = lmbd

        # NOTE(review): tol=0 presumably disables tolerance-based early
        # stopping so that ``max_iter`` (set in ``run``) is the only budget
        # -- confirm against the lightning documentation.
        estimator_options = dict(
            loss='log',
            penalty='l1',
            C=1,
            alpha=self.lmbd,
            tol=0,
            permute=False,
            shrinking=False,
            warm_start=False,
        )
        self.clf = CDClassifier(**estimator_options)

    def run(self, n_iter):
        """Fit the stored estimator with ``max_iter`` set to ``n_iter``."""
        estimator = self.clf
        estimator.max_iter = n_iter
        estimator.fit(self.X, self.y)

    def get_result(self):
        """Return the fitted coefficients as a flat 1-D array."""
        coefficients = self.clf.coef_
        return coefficients.flatten()
def createLightningClassification(params = None):
    """Build a ``CDClassifier`` from a dictionary of user options.

    Parameters
    ----------
    params : dict or None
        User-supplied options. When ``None`` the estimator's own defaults
        (``CDClassifier().get_params()``) are used instead.

    Returns
    -------
    CDClassifier
        An unfitted estimator constructed from the resolved options.

    Notes
    -----
    Each option is resolved by ``getParams`` (defined elsewhere), which
    receives the option name, a type, an optional list of values, the user
    options and the defaults. Presumably it validates/coerces the value --
    confirm against its definition.
    """
    defaults = CDClassifier().get_params()
    if params is None:
        params = defaults

    # (option name, type, list of values or None), in resolution order.
    option_spec = (
        ('C', float, None),
        ('Cd', float, None),
        ('alpha', float, None),
        ('beta', float, None),
        ('loss', str, ['squared_hinge']),
        ('max_iter', int, None),
        ('max_steps', str, ['auto']),
        ('n_calls', int, None),
        ('n_jobs', int, None),
        ('penalty', str, ['l1', 'l2', 'l1/l2']),
        ('sigma', float, None),
        ('termination', str, ['violation_max', 'violation_sum']),
        ('tol', float, None),
    )

    resolved = {}
    for option, kind, values in option_spec:
        resolved[option] = getParams(option, kind, values, params, defaults)

    return CDClassifier(**resolved)


# Reference example (kept commented out):
#
# ## Load News20 dataset from scikit-learn.
# bunch = fetch_20newsgroups_vectorized(subset="all")
# X = bunch.data
# y = bunch.target
#
# ## Set classifier options.
# clf = CDClassifier(penalty="l1/l2",
#                    loss="squared_hinge",
#                    multiclass=True,
#                    max_iter=20,
#                    alpha=1e-4,
#                    C=1.0 / X.shape[0],
#                    tol=1e-3)
#
# ## Train the model.
# clf.fit(X, y)
#
# ## Accuracy
# print(clf.score(X, y))
#
# ## Percentage of selected features
# print(clf.n_nonzero(percentage=True))
def set_objective(self, X, y, lmbd):
    """Remember the problem data and build a fresh estimator for it.

    Parameters
    ----------
    X, y : training data, stored on the instance unchanged.
    lmbd : regularisation strength, forwarded to the estimator as ``alpha``.
    """
    self.X = X
    self.y = y
    self.lmbd = lmbd

    # L1-penalised logistic loss; every other switch is pinned explicitly.
    estimator_options = dict(
        loss='log',
        penalty='l1',
        C=1,
        alpha=self.lmbd,
        tol=0,
        permute=False,
        shrinking=False,
        warm_start=False,
    )
    self.clf = CDClassifier(**estimator_options)
def Light_lasso(X, y, alpha_):
    """Keep only the columns of ``X`` that get a non-zero coefficient.

    A ``CDClassifier`` with an l1/l2 penalty and squared hinge loss is
    fitted on ``(X, y)``; the column indices of its non-zero coefficients
    are used to slice ``X``.

    Parameters
    ----------
    X : 2-D array-like supporting ``X[:, idx]`` indexing
    y : target labels
    alpha_ : penalty strength passed to the classifier as ``alpha``

    Returns
    -------
    (X_selected, columns)
        ``X`` restricted to the selected columns, and the column indices.
    """
    n_samples = X.shape[0]
    selector = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        # multiclass=True is deliberately left disabled here.
        max_iter=50,
        alpha=alpha_,
        C=1.0 / n_samples,
        tol=1e-3,
    )
    selector.fit(X, y)

    # np.nonzero on a 2-D array yields (row indices, column indices); only
    # the column indices matter for feature selection.
    # NOTE(review): if coef_ has several rows, a column may appear more than
    # once in ``columns`` -- confirm whether callers expect unique indices.
    _, columns = np.nonzero(selector.coef_)
    return X[:, columns], columns
def fit_model(data):
    """Fit a ``CDClassifier`` described by a single work-item tuple.

    Parameters
    ----------
    data : tuple
        ``(X, y, multi, alpha, C)`` -- the training data, the ``multiclass``
        flag, and the ``alpha`` / ``C`` regularisation settings.

    Returns
    -------
    Whatever ``CDClassifier.fit`` returns.
    """
    X, y, multi, alpha, C = data

    # Fixed settings plus the three values carried by the work item.
    options = {
        "penalty": "l1/l2",
        "loss": "squared_hinge",
        "multiclass": multi,
        "max_iter": 20,
        "alpha": alpha,
        "C": C,
        "tol": 1e-3,
    }
    model = CDClassifier(**options)
    return model.fit(X, y)
def fit_model(data):
    """Train one ``CDClassifier`` from a packed argument tuple.

    Parameters
    ----------
    data : tuple
        ``(X, y, multi, alpha, C)``: training matrix, labels, the value for
        ``multiclass``, and the ``alpha`` and ``C`` hyper-parameters.

    Returns
    -------
    The result of calling ``fit(X, y)`` on the configured classifier.
    """
    features, labels, multi, alpha, C = data

    classifier = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=multi,
        max_iter=20,
        alpha=alpha,
        C=C,
        tol=1e-3,
    )

    fitted = classifier.fit(features, labels)
    return fitted
def __init__(self, scale=True, permute=False, ncpus=None):
    """Predict motif activities using lightning CDClassifier

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification

    permute : boolean, optional, default False
        Stored as ``self.permute``; it is not used by the constructor
        itself. Presumably it switches on the random-permutation test
        behind ``sig_`` -- TODO confirm against ``fit``.

    ncpus : int, optional
        Number of threads. When ``None`` the ``ncpus`` entry of the
        default ``MotifConfig`` parameters is used (2 if it is absent).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1% from random permutation
    """
    # Plain configuration first; none of it depends on the estimator.
    self.scale = scale
    self.permute = permute
    self.act_ = None
    self.sig_ = None
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
    self.ptype = "classification"
    self.act_description = ("activity values: coefficients from "
                            "fitted model")

    # Hyper-parameter grid searched by cross-validation.
    alpha_grid = [np.exp(-exponent) for exponent in np.arange(0, 10, 1 / 3.0)]
    self.cdc = CDClassifier()
    self.parameters = {
        "penalty": ["l1/l2"],
        "loss": ["squared_hinge"],
        "multiclass": [True],
        "max_iter": [20],
        "alpha": alpha_grid,
        "C": [0.001, 0.01, 0.1, 0.5, 1.0],
        "tol": [1e-3],
    }
    self.kfolds = 10

    if ncpus is None:
        defaults = MotifConfig().get_default_params()
        ncpus = int(defaults.get("ncpus", 2))

    self.clf = GridSearchCV(
        self.cdc,
        self.parameters,
        cv=self.kfolds,
        n_jobs=ncpus,
    )
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a ``CDClassifier`` for one ``alpha``.

    Rows of ``df`` are labelled according to ``select_sets(df, sets)``;
    unlabelled rows are dropped. For each of ``k`` rounds, ``nsample`` rows
    are drawn with replacement, the first ~80% are used for training and
    the rest for testing.

    Parameters
    ----------
    df : DataFrame whose index identifies the samples
    sets : passed to ``select_sets``; more than two sets enables
        ``multiclass``
    motifs : DataFrame of features, indexed like ``df``
    alpha : penalty strength for the classifier
    nsample : int, number of rows drawn per round
    k : int, number of rounds
    cutoff : accepted for interface compatibility; not used here

    Returns
    -------
    (alpha, median accuracy, median fraction of non-zero coefficients)
    """
    ret = select_sets(df, sets)

    # Label 0 means "in no set"; set i gets label i + 1, then shift back so
    # the kept labels start at 0.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    # Slice bounds must be integers: ``nsample * 0.8 + 1`` is a float and is
    # rejected as an index by current NumPy. Truncate it once up front.
    n_train = int(nsample * 0.8 + 1)

    accs = []
    fractions = []
    for _ in range(k):
        idx = np.random.choice(y.shape[0], nsample, replace=True)

        y_train = y.iloc[idx[:n_train]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[n_train:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_train, y_train)

        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
def __init__(self, scale=True):
    """Predict motif activities using lightning CDClassifier

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1% from random permutation
    """
    self.scale = scale
    self.act_ = None
    self.sig_ = None
    self.act_description = ("activity values: coefficients from "
                            "fitted model")

    # Hyper-parameter grid searched by 10-fold cross-validation.
    alpha_grid = [np.exp(-exponent) for exponent in np.arange(0, 10, 1 / 3.0)]
    self.cdc = CDClassifier()
    self.parameters = {
        "penalty": ["l1/l2"],
        "loss": ["squared_hinge"],
        "multiclass": [True],
        "max_iter": [20],
        "alpha": alpha_grid,
        "C": [0.001, 0.01, 0.1, 0.5, 1.0],
        "tol": [1e-3],
    }
    self.kfolds = 10

    # n_jobs=-1 is passed straight through to GridSearchCV.
    self.clf = GridSearchCV(
        self.cdc,
        self.parameters,
        cv=self.kfolds,
        n_jobs=-1,
    )
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a ``CDClassifier`` for one ``alpha``.

    Rows of ``df`` are labelled according to ``select_sets(df, sets)``;
    unlabelled rows are dropped. For each of ``k`` rounds, ``nsample`` rows
    are drawn with replacement, the first ~80% are used for training and
    the rest for testing.

    Parameters
    ----------
    df : DataFrame whose index identifies the samples
    sets : passed to ``select_sets``; more than two sets enables
        ``multiclass``
    motifs : DataFrame of features, indexed like ``df``
    alpha : penalty strength for the classifier
    nsample : int, number of rows drawn per round
    k : int, number of rounds
    cutoff : accepted for interface compatibility; not used here

    Returns
    -------
    (alpha, median accuracy, median fraction of non-zero coefficients)
    """
    ret = select_sets(df, sets)

    # Label 0 means "in no set"; set i gets label i + 1, then shift back so
    # the kept labels start at 0.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    # Slice bounds must be integers: ``nsample * 0.8 + 1`` is a float and is
    # rejected as an index by current NumPy. Truncate it once up front.
    n_train = int(nsample * 0.8 + 1)

    accs = []
    fractions = []
    for _ in range(k):
        idx = np.random.choice(y.shape[0], nsample, replace=True)

        y_train = y.iloc[idx[:n_train]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[n_train:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_train, y_train)

        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized

from lightning.classification import CDClassifier

# Compare total fitting time over a path of C values with and without
# warm-starting the coordinate-descent classifier.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
# Collapse the task to two labels: 0 versus everything else.
y[y >= 1] = 1

Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    # Single pre-formatted argument: prints the same text under Python 2
    # and Python 3 (the old ``print a, b`` statement is Python-2-only).
    print("Total time %s" % (time.time() - start,))
    print("Average accuracy %s" % (np.mean(scores),))
# Fit the column-wise baseline, then the equivalent lightning estimator.
one_over_n = 1. / float(n_samples)
ds = ColumnData(X)
coefs_ = np.zeros((n_features, n_classes))
fit(ds, y, one_over_n, n_samples, n_features, n_classes, coefs_, groups)
s = score(X, y, coefs_)
# Pre-formatted so Python 2 and 3 print the same text. The double space
# reproduces the separator the Python 2 print statement inserted after
# the literal's own trailing space.
print("score =  %s" % (s,))
print('======================================================')

clf_max_iter = 300
clf_tol = 1e-3
print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf_max_iter,
                         alpha=0.5,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf_tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc: %s" % (light_clf.score(X, y),))
print(light_clf.coef_.T)
from lightning.classification import CDClassifier from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics import f1_score import scattertext as st newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes')) vectorizer = TfidfVectorizer() tfidf_X = vectorizer.fit_transform(newsgroups_train.data) clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True, max_iter=20, alpha=1e-4, C=1.0 / tfidf_X.shape[0], tol=1e-3) clf.fit(tfidf_X, newsgroups_train.target) corpus = st.CorpusFromScikit(X=CountVectorizer( vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data), y=newsgroups_train.target, feature_vocabulary=vectorizer.vocabulary_, category_names=newsgroups_train.target_names, raw_texts=newsgroups_train.data).build() html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=clf.coef_[0],
# Load the vectorised 20 newsgroups data.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Select a subset of the classes for faster training.
ind = np.arange(X.shape[0])
subset = y < 5
X = X[ind[subset]]
y = y[subset]

# Train / test split.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=0,
)

# Three linear classifiers, each limited to 20 iterations.
clfs = (
    CDClassifier(loss="squared_hinge",
                 penalty="l2",
                 max_iter=20,
                 random_state=0),
    LinearSVC(max_iter=20, random_state=0),
    SGDClassifier(learning_rate="constant",
                  alpha=1e-3,
                  max_iter=20,
                  random_state=0),
)

# Report the class name followed by the held-out accuracy.
for clf in clfs:
    print(type(clf).__name__)
    clf.fit(X_tr, y_tr)
    print(clf.score(X_te, y_te))
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from lightning.classification import CDClassifier
from sklearn.preprocessing import LabelEncoder

# Encode the labels as int32 codes; the encoder is fitted on the training
# labels only and then reused for the validation and test labels.
le = LabelEncoder()
train_set_y = np.asarray(le.fit_transform(train_set_y), dtype='int32')
valid_set_y = np.asarray(le.transform(valid_set_y), dtype='int32')
test_set_y = np.asarray(le.transform(test_set_y), dtype='int32')

# NOTE(review): the first entry is the *string* "LogisticRegression", not an
# estimator -- presumably a marker handled by later code; confirm.
clfs = [
    #SGDClassifier(loss='hinge', penalty='l2'),
    "LogisticRegression",
    CDClassifier(penalty="l1/l2",
                 loss="squared_hinge",
                 multiclass=True,
                 max_iter=20,
                 alpha=1e-4,
                 C=1.0 / train_set_x.shape[0],
                 tol=1e-3),
    CDClassifier(penalty="l1/l2",
                 loss="log",
                 multiclass=True,
                 max_iter=20,
                 alpha=1e-4,
                 C=1.0 / train_set_x.shape[0],
                 tol=1e-3),
    #svm.LinearSVC(),
    svm.SVC(kernel='rbf', cache_size=8000, max_iter=20)
]

for clf in clfs:
    # Function-call form works on both Python 2 and Python 3.
    print(clf)
#normalization of features
# Fit each scaler on the training data only and *apply* it to the test
# data. Calling fit_transform on the test set would re-estimate the
# statistics from the test set itself, so train and test would end up on
# different scales.
scale = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

#scale = preprocessing.MinMaxScaler()
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.fit_transform(XtestPos)
#
# NOTE(review): the source formatting is collapsed; this Normalizer step is
# read as active code rather than part of the commented block -- confirm.
scale = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

#classification
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   C=1,
                   alpha=1e-4,
                   tol=1e-3)
#clf = LinearSVC(penalty="l2")
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

## Crossvalidation 5 times using different split
#scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization
#plt.hist(XtrainPos[:,0])
#plt.show()
alpha=alpha, eta=eta_svrg, n_inner=1.0, max_iter=100, random_state=0, tol=1e-24) clf2 = SDCAClassifier(loss="squared_hinge", alpha=alpha, max_iter=100, n_calls=X.shape[0] / 2, random_state=0, tol=tol) clf3 = CDClassifier(loss="squared_hinge", alpha=alpha, C=1.0 / X.shape[0], max_iter=50, n_calls=X.shape[1] / 3, random_state=0, tol=tol) clf4 = AdaGradClassifier(loss="squared_hinge", alpha=alpha, eta=eta_adagrad, n_iter=100, n_calls=X.shape[0] / 2, random_state=0) clf5 = SAGAClassifier(loss="squared_hinge", alpha=alpha, max_iter=100, random_state=0, tol=tol) clf6 = SAGClassifier(loss="squared_hinge",
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized

from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
# Collapse the task to two labels: 0 versus everything else.
y[y >= 1] = 1

# Time the same L1-penalised fit with and without shrinking.
for shrinking in (True, False):
    clf = CDClassifier(C=1.0, loss="squared_hinge", penalty="l1", tol=1e-3,
                       max_iter=1000, shrinking=shrinking, random_state=0)

    start = time.time()
    clf.fit(X, y)

    # Single pre-formatted argument: prints the same text under Python 2
    # and Python 3 (the old ``print a, b`` statement is Python-2-only).
    print("Training time %s" % (time.time() - start,))
    print("Accuracy %s" % (clf.score(X, y),))
# For every topic (column of clf.coefs_), list the words whose coefficient
# is positive, keeping at most ``top_words`` of them.
top_words = 30
print("==== Keywords ==== ")
for m in range(clf.coefs_.shape[1]):
    print("Topic %s" % (m,))
    t = [dict_text[row]
         for row in range(clf.coefs_.shape[0])
         if clf.coefs_[row, m] > 0]
    # heapq.nlargest orders the words themselves (not their weights).
    # Joining with spaces and printing once yields the same line the
    # Python 2 ``print k,`` loop plus a bare ``print`` produced.
    print(" ".join("%s" % (k,) for k in heapq.nlargest(top_words, t)))

print("==== Lightning Cython Implementation (Row-wise sparsity) =====")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("==========>> Accuracy : %s" % (light_clf.score(X, y),))
print("Weight Matrix:")
print(light_clf.coef_.T)
# remove words that appear only once
# Counting every token in a single pass replaces the previous
# ``sum(texts, [])`` + ``all_tokens.count(word)`` combination, which was
# quadratic in the corpus size.
from collections import Counter
from itertools import chain

all_tokens = list(chain.from_iterable(texts))
token_counts = Counter(all_tokens)
tokens_once = set(word for word, count in token_counts.items() if count == 1)
texts = [[word for word in text if word not in tokens_once]
         for text in texts]

# The vectoriser expects one string per document.
texts = [" ".join(text) for text in texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6,
                   verbose=5)

# NOTE: this rebinds the name ``mmclf`` from the module to the estimator
# instance; the module is no longer reachable under that name afterwards.
mmclf = mmclf.LatentGroupClassifier(max_iter=15, C=1.0 / X_train.shape[0])

start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
# Pre-formatted so the text is identical under Python 2 and Python 3.
print("CDClassifier time %s" % (elapsed,))
print("CDClassifier score %s" % (clf.score(X_train, y_train),))

start = time()
mmclf.fit(X_train, y_train)
elapsed = time() - start
print("LatentGroupClassifier time %s" % (elapsed,))
#Source
#http://contrib.scikit-learn.org/lightning/
from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Set classifier options: l1/l2 penalty with the squared hinge loss, C set
# to the reciprocal of the number of rows in X.
clf = CDClassifier(
    penalty="l1/l2",
    loss="squared_hinge",
    multiclass=True,
    max_iter=20,
    alpha=1e-4,
    C=1.0 / X.shape[0],
    tol=1e-3,
)

# Train the model.
clf.fit(X, y)

# Accuracy (measured on the training data itself).
print(clf.score(X, y))

# Percentage of selected features
print(clf.n_nonzero(percentage=True))
from lightning.classification import CDClassifier from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics import f1_score import scattertext as st newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes')) vectorizer = TfidfVectorizer() tfidf_X = vectorizer.fit_transform(newsgroups_train.data) clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True, max_iter=20, alpha=1e-4, C=1.0 / tfidf_X.shape[0], tol=1e-3) clf.fit(tfidf_X, newsgroups_train.target) corpus = st.CorpusFromScikit( X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data), y=newsgroups_train.target, feature_vocabulary=vectorizer.vocabulary_, category_names=newsgroups_train.target_names, raw_texts=newsgroups_train.data ).build() html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=clf.coef_[0],
# Every print below passes one pre-formatted argument so the output is the
# same under Python 2 and Python 3 (the original ``print a, b`` statements
# are Python-2-only syntax).
print("### BASELINE GROUP LASSO in pure python/numpy###")
X = X_train
y = y_train
clf = ogroup.BaselineGroupLasso(max_iter=30, alpha=.5, max_steps=30)
clf.fit(X, y, groups)
print("Acc: %s" % (clf.score(X, y),))
print(clf.coefs_)

print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc: %s" % (light_clf.score(X, y),))
print(light_clf.coef_.T)

# Re-run the baseline on the augmented matrix stored on disk.
import numpy as np
# NOTE(review): ``.item()`` suggests X / Xaug are stored as 0-d object
# arrays; recent NumPy releases may need ``allow_pickle=True`` to load
# them -- confirm against the installed version.
data = np.load('3ng_train.npz')
X = data['X'].item()
Xaug = data['Xaug'].item()
y = data['y']
groups = data['groups']
clf.fit(Xaug, y, groups)
print(clf.score(Xaug, y))
def main():
    """Train a multi-class model that separates clusters and pickle results.

    Paths to per-cluster feature matrices are read from STDIN (one per
    line) and merged with ``merge_scores``; each file is one class.
    Features whose frequency exceeds ``--maxfreq`` are dropped. The data is
    split into a training part (``--train``) and equally sized test and
    validation parts, the model is fitted on the training part, and the
    coefficients, feature names, accuracies and confusion matrices are
    pickled, in that order, to ``outfile``.

    Invalid option values or an empty STDIN terminate the program through
    ``parser.error``.
    """
    # argparse re-wraps description/help text, so the line breaks used in
    # these literals do not affect what the user sees.
    desc = '''
    Learns a multi-class classification model that discriminates across
    clusters. The path(s) to the feature matrices are read from STDIN.
    Each path should contain be an npz file containing the feature matrix
    for a different cluster. Each cluster will be considered as a separate
    class. A multi-class classification model will be trained on a fraction
    of the data (controlled by the --train parameter). The rest of the data
    will be split on a test and a validation set of equal sizes.'''

    parser = argparse.ArgumentParser(description = desc)
    parser.add_argument('outfile')
    parser.add_argument('--alpha', type = float, default = 0.01,
                        help = 'Coefficient of the penalty term.')
    parser.add_argument('--tol', type = float, default = 0.01,
                        help = 'Tolerance for the termination criterion.')
    parser.add_argument('--train', type = float, default = 0.8,
                        help = 'Fraction of examples used for training. [%(default)s]')
    parser.add_argument('--maxfreq', type = float, default = 0.3,
                        help = 'Maximum frequency for a feature to be considered. '
                               '[%(default)s]')
    parser.add_argument('--log', action = 'store_true', default = False,
                        help = 'Use logistic regression')
    args = parser.parse_args()

    # Validate explicitly: ``assert`` is stripped when Python runs with -O.
    alpha = args.alpha
    if not alpha >= 0:
        parser.error('--alpha must be >= 0')
    train_prc = args.train
    if not 0 < train_prc < 1:
        parser.error('--train must be strictly between 0 and 1')
    max_freq = args.maxfreq
    if not 0 <= max_freq <= 1:
        parser.error('--maxfreq must be between 0 and 1')

    # An empty file list makes fileinput read STDIN. Blank lines are skipped.
    files = [line.strip() for line in fileinput.input([]) if line.strip()]
    if not files:
        parser.error('no feature-matrix paths were supplied on STDIN')

    (scores, rule_names) = merge_scores(files, vertical = True)
    # One class label per input file. Floor division keeps the repeat count
    # an integer on Python 3 as well.
    y = np.repeat(np.arange(len(files)), scores.shape[0] // len(files))

    if args.log:
        model = LogisticRegression(penalty = 'l1', C = alpha, tol = args.tol,
                                   random_state = 1)
    else:
        model = CDClassifier(penalty = 'l1/l2',
                             loss = 'squared_hinge',
                             multiclass = True,
                             max_iter = 100,
                             alpha = alpha,
                             C = 1.0 / y.size,
                             shrinking = False, # weird behavior if this is set to True
                             tol = args.tol,
                             random_state = 1,
                             verbose = 2)

    # Shuffle examples with a fixed seed so runs are repeatable.
    np.random.seed(1)
    perm = np.random.permutation(len(y))
    y = y[perm]
    scores = scores[perm, :]

    # Per-feature frequency: column sum divided by the number of rows.
    hits = np.sum(scores, axis = 0) / float(scores.shape[0])
    sys.stderr.write('Max frequency %s\n' % (np.max(hits),))
    sys.stderr.write('Num features with freq > %s %s\n'
                     % (max_freq, np.sum(hits > max_freq)))
    sel_feat = np.argwhere(hits <= max_freq).flatten()
    rule_names = list(np.array(rule_names)[sel_feat])
    scores = scores[:, sel_feat]
    sys.stderr.write('Scores shape %s\n' % (scores.shape,))

    # Get balanced training, test, and validation sets.
    # NOTE(review): this call matches the legacy
    # sklearn.cross_validation.StratifiedShuffleSplit signature
    # (labels, n_iter, test_size, indices=...) -- confirm the installed
    # scikit-learn still provides it.
    cv = StratifiedShuffleSplit(y, 1, 1.0 - train_prc, indices = True)
    for train, test in cv:
        train_idx = train
        test_tmp = test
    # Now split the test set (which is balanced by design) into two balanced parts.
    cv = StratifiedShuffleSplit(y[test_tmp], 1, 0.5, indices = True)
    for train, test in cv:
        test_idx = test_tmp[train]
        val_idx = test_tmp[test]

    # Internal sanity checks: the three index sets must be disjoint.
    assert(len(set(train_idx).intersection(set(test_idx))) == 0)
    assert(len(set(val_idx).intersection(set(test_idx))) == 0)
    assert(len(set(train_idx).intersection(set(val_idx))) == 0)
    sys.stderr.write(
        'Will use %d examples for training, %d  for testing, and %d for validation\n'
        % (len(train_idx), len(test_idx), len(val_idx)))

    all_idx = [train_idx, val_idx, test_idx]
    #assert(np.sum([len(i) for i in all_idx]) == y.size)

    # ``float`` is what the removed ``np.float`` alias referred to.
    model.fit(sp.csr_matrix(scores[train_idx, :], dtype = float), y[train_idx])

    # Accuracy and confusion matrix for train, validation and test, in
    # that order.
    acc = []
    confusion = []
    for idx in all_idx:
        pred = model.predict(sp.csr_matrix(scores[idx, :], dtype = float))
        acc.append(accuracy_score(y[idx], pred))
        confusion.append(confusion_matrix(y[idx], pred))

    # Four consecutive pickles; readers must load them in the same order.
    with open(args.outfile, 'wb') as outfile:
        pickle.dump(model.coef_, outfile)
        pickle.dump(rule_names, outfile)
        pickle.dump(acc, outfile)
        pickle.dump(confusion, outfile)
# split data: hold out 10% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_type,
    test_size=0.1,
    random_state=42,
)

# build model
# prepare parameters: a single candidate value per hyper-parameter
params = {"alpha": [0.001], "C": [0.0001]}

# create a multiclass CDClassifier (log loss, l1/l2 penalty) and let the
# grid search fit it for every parameter combination
clf = CDClassifier(
    penalty="l1/l2",
    loss="log",
    multiclass=True,
    max_iter=20,
    alpha=1e-4,
    verbose=1,
    C=1.0,
    tol=1e-3,
)
grid = GridSearchCV(estimator=clf, param_grid=params)
grid.fit(X_train, y_train)
print(grid)

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_)

# The two results below are computed but not stored or printed.
bst = grid.best_estimator_
bst.predict(X_test)
bst.score(X_test, y_test)