def SGDGridSearch_OLD():
    """Grid-search the SGD ``alpha`` value and overlay the ROC curves.

    Fits an L1-penalised ``SGDClassifier`` for every ``alpha`` in
    ``10**-9 .. 10**8`` on the module-level ``f_train`` / ``y_train``,
    scores it on ``f_test`` / ``y_test``, draws each ROC curve through
    ``st.plotROC`` and finally refits and highlights the best setting.

    Returns:
        tuple: ``(clf, aucs)`` - the classifier refitted with the best
        ``alpha`` and the list of AUC values, one per grid point.
    """
    # Note kept from the original author: C=1 is best
    title = 'Grid Search - SGD Classifier ROC Curve'
    cs = 10.0 ** np.arange(-9, 9, 1)
    aucs = []
    for c in cs:
        clf = SGDClassifier(penalty='l1', alpha=c).fit(f_train, y_train)
        probs = clf.decision_function(f_test)
        fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs)
        roc_auc = auc(fpr, tpr)
        st.plotROC(fpr, tpr, roc_auc, figure=False, show=False,
                   returnplt=True, showlegend=False, title=title)
        aucs.append(roc_auc)

    # First index holding the highest AUC (ties resolve to the earliest).
    best = max(range(len(aucs)), key=aucs.__getitem__)
    c = cs[best]

    clf = SGDClassifier(penalty='l1', alpha=c).fit(f_train, y_train)
    probs = clf.decision_function(f_test)
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=probs)
    # Recompute the AUC for the refitted model; previously the value left
    # over from the last grid iteration was plotted next to the best curve.
    roc_auc = auc(fpr, tpr)
    myplt = st.plotROC(fpr, tpr, roc_auc, legendlabel='Best C = %0.2e' % c,
                       figure=False, show=False, returnplt=True,
                       showlegend=True, title=title)
    myplt.show()
    return clf, aucs
def train_kaggle(dataset, alg="rig", data="bow"):
    """Train one linear classifier, save its test predictions, return scores.

    Args:
        dataset: ``(train_x, train_y, test_x)`` triple.
        alg: classifier key - one of ``"svm"``, ``"svm_sq"``, ``"log"``,
            ``"per"``, ``"rig"`` or ``"pa"``.
        data: feature-set tag; only used to build the CSV file name.

    Returns:
        tuple: ``decision_function`` outputs for ``train_x`` and ``test_x``.

    Raises:
        NotImplementedError: if ``alg`` is not one of the known keys.
    """
    train_x, train_y, test_x = dataset
    print("shape for training data is", train_x.shape)

    # Lazy factories so only the requested estimator is constructed.
    # ``max_iter`` with ``tol=None`` replaces the removed ``n_iter`` argument
    # and keeps the fixed number of epochs.
    factories = {
        "svm": lambda: SGDClassifier(verbose=1, n_jobs=2, max_iter=20,
                                     tol=None),
        "svm_sq": lambda: SGDClassifier(verbose=1, n_jobs=2, max_iter=20,
                                        tol=None, loss="squared_hinge"),
        "log": lambda: LogisticRegression(verbose=1, n_jobs=2),
        "per": lambda: Perceptron(verbose=1, n_jobs=2, max_iter=25, tol=None),
        "rig": lambda: RidgeClassifier(),
        "pa": lambda: PassiveAggressiveClassifier(n_jobs=2, max_iter=25,
                                                  tol=None),
    }
    if alg not in factories:
        raise NotImplementedError
    clf = factories[alg]()

    print("training with %s..." % alg)
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    # The former ``alg == "nb"`` / predict_proba branch was unreachable
    # (``"nb"`` already raises above), so only decision scores are returned.
    return clf.decision_function(train_x), clf.decision_function(test_x)
class PlattScaledSVM(BaseEstimator, ClassifierMixin):
    """Hinge-loss SGD classifier whose scores are calibrated by a logistic fit.

    The SVM is trained first; a ``LogisticRegression`` is then fitted on the
    SVM's decision values so that ``predict_proba`` is available.
    """

    def __init__(self, **svm_kwargs):
        """Store ``svm_kwargs`` and build the two underlying estimators."""
        self.svm_kwargs = svm_kwargs
        self.svm = SGDClassifier(loss="hinge", **self.svm_kwargs)
        self.lr = LogisticRegression()

    def _distances(self, X):
        """Return the SVM decision values for ``X`` as a single column."""
        return self.svm.decision_function(X).reshape(-1, 1)

    def fit(self, X, y):
        """Fit the SVM on ``(X, y)``, then the calibrator on its scores."""
        self.svm.fit(X, y)
        self.lr.fit(self._distances(X), y)
        return self

    def predict(self, X, y=None):
        """Return the calibrator's class predictions for ``X``."""
        # The original computed the predictions but never returned them.
        return self.lr.predict(self._distances(X))

    def predict_proba(self, X, y=None):
        """Return the calibrator's class probabilities for ``X``."""
        return self.lr.predict_proba(self._distances(X))

    def get_params(self, deep=True):
        """Return a copy of the keyword arguments forwarded to the SVM."""
        return dict(self.svm_kwargs)

    def set_params(self, **parameters):
        """Apply ``parameters`` to the wrapped SVM and remember them.

        The original called ``self.setattr``, which does not exist and
        raised ``AttributeError`` on any non-empty call.
        """
        if parameters:
            # Validate on the SVM first so a bad name leaves state untouched.
            self.svm.set_params(**parameters)
            self.svm_kwargs.update(parameters)
        return self
class SVM(object):
    """Linear SGD model separating "match" from "mismatch" distance vectors.

    Distance vectors are produced by ``threadComputeMatrix``; matches are
    labelled 1 and mismatches 0.
    """

    def __init__(self, ground_truth):
        self.max_num = 3.0
        self.ground_truth = ground_truth
        # scale all features to be between 0-1
        self.scale = Scaler((0, 1))

    def scaleDataFit(self, data):
        """Fit the scaler on ``data`` and return the scaled data."""
        self.scale.fit(data)
        return self.scale.transform(data)

    def fit(self, X, Y, distance=chi2Distance):
        """Scale ``X``, build match/mismatch samples and train the classifier."""
        scaled = self.scaleDataFit(X)
        self.distance = distance

        # compute distances
        match, mis = threadComputeMatrix(scaled, Y, distance)

        # labels: 1 for every match row, 0 for every mismatch row
        labels = np.asarray([1] * len(match) + [0] * len(mis))

        # merge matches and mismatches into one training matrix
        samples = np.asarray(np.vstack((match, mis)))
        del match, mis

        # learn
        self.clf = SGDClassifier(loss="hinge", penalty="l2", n_jobs=8,
                                 shuffle=True)
        self.clf.fit(samples, labels)

    def transform(self, X, Y):
        """Return decision scores for match and mismatch pairs (evaluation)."""
        scaled = self.scale.transform(X)
        match, mis = threadComputeMatrix(scaled, Y, self.distance)
        score = self.clf.decision_function
        return (score(np.asarray(match)), score(np.asarray(mis)))

    def predict(self, X1, X2):
        """Return the decision score for the single pair ``(X1, X2)``."""
        pair = self.scale.transform(np.vstack((X1, X2)))
        dist = self.distance(pair[0, :], pair[1, :])
        return self.clf.decision_function(np.asarray(dist))
def select_threshold(X, Y, a):
    """Pick a decision threshold that maximises F1 under 3-fold CV.

    Uses the module-level ``loss``, ``penalty`` and ``tol`` settings.

    Args:
        X: feature matrix, indexable by integer index arrays.
        Y: binary labels aligned with ``X``.
        a: ``alpha`` passed to ``SGDClassifier``.

    Returns:
        tuple: ``(thld, mean_f1)`` - the per-fold best threshold and best F1,
        each averaged over the folds.
    """
    n_splits = 3
    skf = StratifiedKFold(n_splits=n_splits)
    model = SGDClassifier(loss=loss, alpha=a, class_weight='balanced',
                          penalty=penalty, n_jobs=-1, tol=tol)
    thld = 0
    mean_f1 = 0
    for train_index, test_index in skf.split(X, Y):
        model.fit(X[train_index], Y[train_index])
        y_fold = Y[test_index]
        scores = model.decision_function(X[test_index])
        _, _, thresholds = roc_curve(y_fold, scores, pos_label=1)

        # Recent scikit-learn releases put ``inf`` first in ``thresholds``;
        # feeding that to linspace yields NaN, so keep finite values only.
        finite = thresholds[np.isfinite(thresholds)]
        thld_range = np.linspace(finite[0], finite[-1], 50)

        f1 = [f1_score(y_fold, (scores > t).astype(int)) for t in thld_range]
        best_f1 = max(f1)
        best_t = thld_range[f1.index(best_f1)]  # first threshold at the max

        thld = thld + best_t / n_splits
        mean_f1 = mean_f1 + best_f1 / n_splits
    return thld, mean_f1
def get_CV_data(X, Y, Alpha, verbose=False):
    """Return the 3-fold mean ROC AUC for every ``alpha`` in ``Alpha``.

    Uses the module-level ``loss``, ``penalty`` and ``tol`` settings.

    Args:
        X: feature matrix, indexable by integer index arrays.
        Y: labels aligned with ``X``.
        Alpha: iterable of ``alpha`` values to evaluate.
        verbose: when true, print the score and elapsed time per ``alpha``.

    Returns:
        list: one mean AUC per entry of ``Alpha``, in order.
    """
    started = time.time()
    folds = StratifiedKFold(n_splits=3)
    auc_scores = []
    for a in Alpha:
        model = SGDClassifier(loss=loss, alpha=a, class_weight='balanced',
                              penalty=penalty, n_jobs=-1, tol=tol)
        mean_auc = 0
        for train_index, test_index in folds.split(X, Y):
            model.fit(X[train_index], Y[train_index])
            fold_scores = model.decision_function(X[test_index])
            # Each of the three folds contributes one third of the mean.
            mean_auc = mean_auc + roc_auc_score(Y[test_index], fold_scores) / 3
        auc_scores.append(mean_auc)
        if verbose:
            time_str = str(timedelta(seconds=time.time() - started))
            print(
                'CV for alpha = {}\n AUC score = {} Time since begining: {}\n'.
                format(a, mean_auc, time_str))
    return auc_scores
def plot_sgd_separator():
    """Draw the SGD hinge-loss separator and margins for two blobs."""
    # 50 points in two clusters
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the linear model
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # evaluate the decision function on a 10x10 grid over [-1, 5]^2
    ticks_x = np.linspace(-1, 5, 10)
    ticks_y = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(ticks_x, ticks_y)
    Z = np.empty(X1.shape)
    for idx in np.ndindex(*X1.shape):
        Z[idx] = clf.decision_function([[X1[idx], X2[idx]]])[0]

    # contour at the separator (0) and the two margins (-1, +1)
    ax = plt.axes()
    ax.contour(X1, X2, Z, [-1.0, 0.0, 1.0], colors='k',
               linestyles=['dashed', 'solid', 'dashed'])
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
class kernelsvm():
    """Approximate kernel SVM: Nystroem feature map followed by SGD."""

    def __init__(self, theta0, alpha, loss_metric):
        # theta0 is used as the RBF gamma, alpha and loss_metric go to SGD.
        self.theta0, self.alpha, self.loss_metric = theta0, alpha, loss_metric

    def fit(self, X, y, idx_SR):
        """Fit the feature map with ``len(idx_SR)`` components, then the SGD model."""
        self.feature_map_nystroem = General_Nystroem(
            kernel='rbf', gamma=self.theta0, n_components=len(idx_SR))
        X_features = self.feature_map_nystroem.fit_transform(X, idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric, alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")

    def predict(self, X):
        """Return ``(predictions, transformed_X)`` for raw input ``X``."""
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        predictions = self.clf.predict(X_transform)
        return predictions, X_transform

    def decision_function(self, X):
        """Return decision scores; ``X`` should be the transformed input!"""
        return self.clf.decision_function(X)

    def err_rate(self, y_true, y_pred):
        """Return the misclassification rate, i.e. ``1 - accuracy``."""
        return 1.0 - accuracy_score(y_true, y_pred)

    def get_params(self):
        """Return the parameters of the fitted SGD classifier."""
        return self.clf.get_params()
def linear_sgd(data_test, data_train, target_train, proba=False):
    """Fit a default ``SGDClassifier`` and predict ``data_test``.

    Fit and predict durations are logged at INFO level.

    :param data_test: samples to predict
    :param data_train: training samples
    :param target_train: training labels
    :param proba: when true, also return ``decision_function`` scores
        (raw decision values, not calibrated probabilities)
    :return: the predictions, or ``(predictions, scores)`` if ``proba``
    """
    logging.info('SGDClassifier')
    sgd = SGDClassifier()

    started = time.time()
    sgd.fit(data_train, target_train)
    duration = time.time() - started
    logging.info(f'duration fit: {duration}')

    started = time.time()
    result = sgd.predict(data_test)
    duration = time.time() - started
    logging.info(f'duration predict: {duration}')

    if not proba:
        return result
    return result, sgd.decision_function(data_test)
class RBFSamplerSGDClassifierEstimator(BaseEstimator, TransformerMixin):
    """``RBFSampler`` feature map followed by an ``SGDClassifier``.

    ``transform`` rescales the sampler output by ``sqrt(n_components / 2)``;
    the classifier is trained and queried on those rescaled features.
    """

    def __init__(self, gamma=1.0, n_components=100, random_state=None,
                 **kwargs):
        """Build both stages; extra ``kwargs`` go to ``SGDClassifier``."""
        # Both stages share the same seed.
        kwargs['random_state'] = random_state
        self.rbf_sampler = RBFSampler(gamma=gamma, n_components=n_components,
                                      random_state=random_state)
        self.sgdclassifier = SGDClassifier(**kwargs)

    def fit(self, X, y):
        """Fit the sampler on ``X`` and the classifier on ``transform(X)``."""
        self.rbf_sampler.fit(X)
        # Train on exactly the representation used by predict() and
        # decision_function(). The original trained on the unscaled sampler
        # output but predicted on the rescaled one, so the learned weights
        # and intercept were applied to differently scaled features.
        self.sgdclassifier.fit(self.transform(X), y)
        return self

    def transform(self, X, y=None):
        """Return the sampler features scaled by ``sqrt(n_components / 2)``."""
        return np.sqrt(self.rbf_sampler.n_components) / np.sqrt(
            2.) * self.rbf_sampler.transform(X)

    def predict(self, X):
        """Return class predictions for raw input ``X``."""
        return self.sgdclassifier.predict(self.transform(X))

    def decision_function(self, X):
        """Return decision scores for raw input ``X``."""
        return self.sgdclassifier.decision_function(self.transform(X))
def plot_sgd_classifier(num_samples, clt_std):
    """Plot a logistic-loss SGD decision boundary for two random blobs.

    Args:
        num_samples: number of points passed to ``make_blobs``.
        clt_std: cluster standard deviation passed to ``make_blobs``.
    """
    # generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    # fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    # 10x10 grid spanning the observed range of both features
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)
    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)
    for (i, j), x1 in np.ndenumerate(X_):
        x2 = Y_[i, j]
        # decision_function expects a 2-D (n_samples, n_features) input;
        # the original passed the bare 1-D pair, which scikit-learn rejects.
        conf_score = clf.decision_function([[x1, x2]])
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']
    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    # NOTE(review): ``labels`` does not look like a contour() keyword -
    # confirm it is not silently ignored or rejected by matplotlib.
    ax.contour(X_, Y_, Z, colors=colors, levels=levels,
               linestyles=linestyles, labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
def multiple_claasifier():
    """Fit a multi-class SGD model on the module-level training data and
    print the decision scores of the sample ``X[1]``."""
    classifier = SGDClassifier(random_state=42, max_iter=None, tol=None)
    classifier.fit(X_train, y_train)

    sample = [X[1]]
    # Prediction result is not used; the call is kept from the original.
    classifier.predict(sample)
    print(classifier.decision_function(sample))
def plot_sgd_separador():
    """Plot the SGD hinge-loss separating line and margins for two blobs."""
    # create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model; ``max_iter`` replaces the removed ``n_iter`` argument
    # (as in the sibling plotting functions of this file) and ``tol=None``
    # keeps the fixed 200 epochs instead of stopping early.
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200, tol=None,
                        fit_intercept=True)
    clf.fit(X, Y)

    # draw the line, the points, and the points closest to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), x1 in np.ndenumerate(X1):
        x2 = X2[i, j]
        p = clf.decision_function(np.array([[x1, x2]]))
        Z[i, j] = p[0]

    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def main(feature_pkl):
    """Train the tuned SGD model, write a test ranking, print top features.

    Args:
        feature_pkl: path to a joblib pickle holding ``(featureIndex,
            trainFeatures, trainTargets, trainItemIds, testFeatures,
            testItemIds)``. The ranking is written next to it as
            ``<stem>_testRanking.csv``.
    """
    print('Loading data...')
    (featureIndex, trainFeatures, trainTargets, trainItemIds,
     testFeatures, testItemIds) = joblib.load(feature_pkl)

    print('Normalizing data...')
    trainFeatures = sklearn.preprocessing.normalize(
        trainFeatures.tocsc(), norm='l2', axis=0)
    testFeatures = sklearn.preprocessing.normalize(
        testFeatures.tocsc(), norm='l2', axis=0)

    # Best estimator from grid search.
    # ``max_iter=5, tol=None`` replaces the removed ``n_iter=5`` (same fixed
    # epoch count). ``class_weight='balanced'`` is the documented successor
    # of the removed ``'auto'`` mode.
    # NOTE(review): 'balanced' scales the weights differently from 'auto',
    # so scores may shift slightly - confirm this is acceptable.
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='balanced',
                        epsilon=0.1, eta0=0.0, fit_intercept=True,
                        l1_ratio=0.15, learning_rate='optimal', loss='log',
                        max_iter=5, tol=None, n_jobs=1, penalty='elasticnet',
                        power_t=0.5, random_state=None, shuffle=False,
                        verbose=0, warm_start=False)

    print('Fitting model...')
    clf.fit(trainFeatures, trainTargets)

    # Use the decision function to rank test items, highest score first.
    predicted_scores = clf.decision_function(testFeatures)
    ranking_path = os.path.splitext(feature_pkl)[0] + '_testRanking.csv'
    with open(ranking_path, 'w') as f:
        f.write('id\n')
        ranked = sorted(zip(predicted_scores, testItemIds), reverse=True)
        for _, item_id in ranked:
            f.write('%d\n' % (item_id))

    # Turn estimator params into word clouds: features ordered by index.
    features, indices = zip(*sorted(featureIndex.items(),
                                    key=operator.itemgetter(1)))
    coef_sort = sorted(zip(clf.coef_[0], indices), reverse=True)
    print('Top 20 for illicit:')
    wordle_print(coef_sort[:20], features)
    print('Top 20 for licit:')
    wordle_print(coef_sort[-20:], features)
def plot_sgd_separator():
    """Plot the SGD hinge-loss separating line and margins for two blobs."""
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model; ``max_iter`` replaces the removed ``n_iter`` argument
    # and ``tol=None`` keeps the fixed 200 epochs instead of stopping early.
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200, tol=None,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    # e.g.
    # array([-1.        , -0.33333333,  0.33333333,  1.        ,  1.66666667,
    #         2.33333333,  3.        ,  3.66666667,  4.33333333,  5.        ])
    X1, X2 = np.meshgrid(xx, yy)  # all 2-D coordinate pairs of the grid
    Z = np.empty(X1.shape)
    for (i, j), x1 in np.ndenumerate(X1):
        x2 = X2[i, j]
        # one sample as a (1, 2) array, e.g. [[-1.0, -1.0]]
        decision_function_array = np.array([x1, x2]).reshape(1, -1)
        p = clf.decision_function(decision_function_array)
        # confidence score for the sample (signed distance to the hyperplane)
        Z[i, j] = p[0]

    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def plot_sgd_separator():
    """Plot the SGD hinge-loss separating line and margins for two blobs."""
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model; ``max_iter`` replaces the removed ``n_iter`` argument
    # and ``tol=None`` keeps the fixed 200 epochs instead of stopping early.
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200, tol=None,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), x1 in np.ndenumerate(X1):
        x2 = X2[i, j]
        # decision_function needs a 2-D (n_samples, n_features) input; the
        # original passed the bare 1-D pair, which scikit-learn rejects.
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]

    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def plot_sgd_separating_hyperplane():
    """
    =========================================
    SGD: Maximum margin separating hyperplane
    =========================================

    Plot the maximum margin separating hyperplane within a two-class
    separable dataset using a linear Support Vector Machines classifier
    trained using SGD.
    """
    # NOTE(review): inside a function ``__doc__`` resolves to the module
    # docstring, not the text above - confirm which one should be printed.
    print(__doc__)

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import SGDClassifier
    # ``sklearn.datasets.samples_generator`` was removed from scikit-learn;
    # the public ``sklearn.datasets`` path is the supported import.
    from sklearn.datasets import make_blobs

    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), x1 in np.ndenumerate(X1):
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    # Author's note (translated): this line would be simpler:
    # Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired,
                edgecolor='black', s=20)
    plt.axis('tight')
    plt.show()
def svm_cross_validate(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation of an SGD classifier; return the mean AUC.

    One model is trained on each half of the split and evaluated on the
    other half. Decision scores are squashed with ``sigmoid`` before the
    ROC curve is computed.

    Args:
        X: feature matrix, row-indexable by the split index arrays.
        y: labels aligned with ``X`` (positive class is 1).
        category: per-sample categories passed to
            ``generate_cv_indices_unbalanced``.
        C: value used as the ``alpha`` of ``SGDClassifier``.
        penalty: penalty name passed to ``SGDClassifier``.
        sample_weights: per-sample weights used when fitting.

    Returns:
        Mean of the two fold AUCs.
    """
    # NOTE(review): ``loss`` and ``N`` are not defined in this function and
    # are read from module scope - confirm both exist where this runs.
    cv_indices = generate_cv_indices_unbalanced(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]

    clf_svm_1 = SGDClassifier(loss=loss, penalty=penalty, alpha=C,
                              shuffle=True)
    clf_svm_2 = SGDClassifier(loss=loss, penalty=penalty, alpha=C,
                              shuffle=True)
    clf_svm_1.fit(X[train_ids, :], y[train_ids],
                  sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids],
                  sample_weight=sample_weights[test_ids])

    # Each model is scored on the half it was not trained on. The accuracy
    # values the original computed here were never used and are dropped.
    y_1 = sigmoid(clf_svm_1.decision_function(X[test_ids, :]))
    y_2 = sigmoid(clf_svm_2.decision_function(X[train_ids, :]))

    fold_aucs = np.zeros(2)
    held_out = ((y[test_ids], y_1), (y[train_ids], y_2))
    for k, (labels, scores) in enumerate(held_out):
        fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=1)
        fold_aucs[k] = metrics.auc(fpr, tpr)

    mean_auc = np.mean(fold_aucs, axis=0)
    print("Finished running standard cross validation")
    return mean_auc
def Get10SGDClassifiers(X_train, X_test, y_train, y_test):
    """Fit a multi-class SGD model and print its view of ``X_test[1]``.

    Prints the prediction, the true target, the known classes and the
    decision-score row for that single sample.
    """
    model = SGDClassifier(random_state=42, max_iter=5, tol=-np.inf)
    model.fit(X_train, y_train)

    sample = [X_test[1]]
    predict = model.predict(sample)
    array_score = model.decision_function(sample)

    print("każda cyfra ma swój klasyfikator")
    print("predykcja: ", predict)
    print("target: ", y_test[1])
    print("klasy: ", model.classes_)
    print("macierz punktów: ", array_score)
def svm_cross_validate_category(X, y, category, C, penalty, sample_weights):
    """Two-fold cross-validation reporting the AUC per category.

    One model is trained on each half of the split and evaluated on the
    other half; the AUC is computed separately for every distinct value of
    ``category`` that occurs in the evaluated half.

    Args:
        X: feature matrix, row-indexable by the boolean split masks.
        y: labels aligned with ``X`` (positive class is 1).
        category: per-sample category labels.
        C: value used as the ``alpha`` of ``SGDClassifier``.
        penalty: penalty name passed to ``SGDClassifier``.
        sample_weights: per-sample weights used when fitting.

    Returns:
        Array with one entry per unique category: the mean of the two fold
        AUCs (a fold where the category is absent contributes 0).
    """
    # NOTE(review): ``loss`` and ``N`` are not defined in this function and
    # are read from module scope - confirm both exist where this runs.
    cv_indices = generate_cv_indices(category)
    train_ids = cv_indices[0:N]
    test_ids = cv_indices[N:2 * N]

    clf_svm_1 = SGDClassifier(loss=loss, penalty=penalty, alpha=C,
                              shuffle=True)
    clf_svm_2 = SGDClassifier(loss=loss, penalty=penalty, alpha=C,
                              shuffle=True)
    clf_svm_1.fit(X[train_ids, :], y[train_ids],
                  sample_weight=sample_weights[train_ids])
    clf_svm_2.fit(X[test_ids, :], y[test_ids],
                  sample_weight=sample_weights[test_ids])

    # decision_function gives one score per sample for a binary problem.
    # The accuracy values the original computed here were never used.
    y_1 = sigmoid(clf_svm_1.decision_function(X[test_ids, :]))
    y_2 = sigmoid(clf_svm_2.decision_function(X[train_ids, :]))

    u, indices = np.unique(category, return_inverse=True)
    auc = np.zeros((2, len(u)))
    for i in range(len(u)):
        i_inds = indices == i
        # The scores are 1-D, so they are indexed by the mask only. The
        # original kept a ``[..., 1]`` column index left over from the
        # predict_proba variant, which fails on a 1-D array.
        if np.sum(test_ids & i_inds) != 0:
            fpr, tpr, _ = metrics.roc_curve(
                y[test_ids & i_inds], y_1[i_inds[test_ids]], pos_label=1)
            auc[0, i] = metrics.auc(fpr, tpr)
        if np.sum(train_ids & i_inds) != 0:
            fpr, tpr, _ = metrics.roc_curve(
                y[train_ids & i_inds], y_2[i_inds[train_ids]], pos_label=1)
            auc[1, i] = metrics.auc(fpr, tpr)

    mean_auc = np.mean(auc, axis=0)
    print("Finished running category cross-validation")
    return mean_auc
def number_classify_ova(X_train, y_train):
    """Fit a multi-class SGD classifier and print the scores of one sample.

    The sample inspected is ``X_train[100]``.

    Returns:
        The fitted ``SGDClassifier``.
    """
    # build and train the stochastic-gradient-descent multi-class model
    model = SGDClassifier(random_state=42)
    model.fit(X_train, y_train)

    # predict one sample and look up its score for every class
    sample = [X_train[100]]
    predicted = model.predict(sample)
    per_class_scores = model.decision_function(sample)
    print('OvA的随机梯度下降分类器预测结果为:', predicted,
          '该样本的各类得分:', per_class_scores)
    return model
def sgd_ova(digit):
    """Fit SGD on the module-level training set and report on ``X[digit]``.

    Prints, in order: 3-fold cross-validated accuracy on standardised
    features, the decision scores of the sample, and its predicted class.
    """
    sample = [X[digit]]
    model = SGDClassifier(random_state=34)
    model.fit(X_train, y_train)
    predicted = model.predict(sample)
    sample_scores = model.decision_function(sample)

    # cross-validation runs on standardised copies of the training features
    scaled_train = StandardScaler().fit_transform(X_train.astype(np.float64))
    accuracies = cross_val_score(model, scaled_train, y_train, cv=3,
                                 scoring="accuracy")

    print(accuracies)
    print(sample_scores)
    print(predicted)
def run():
    """Compare several models on a held-out split and print the winner.

    Candidates: a hinge/L2 ``SGDClassifier``, ``Lasso`` for four ``alpha``
    values, and a 1000-tree ``RandomForestClassifier``. Each is scored with
    the project's ``metric(Y_test, scores)``; the best score and a label
    describing the model are printed.
    """
    x_train, y_train, x_test = load_data()
    X_train, Y_train, X_test, Y_test = split_data(x_train, y_train)
    best_score_cv = 0
    best_algo = ''

    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_train, Y_train)
    score = metric(Y_test, clf.decision_function(X_test))
    if best_score_cv < score:
        best_score_cv = score
        best_algo = 'hinge + l2'

    for alpha in [0.0001, 0.001, 0.01, 0.1]:
        clf = Lasso(alpha=alpha)
        clf.fit(X_train, Y_train)
        # Lasso is a regressor: ``predict`` returns the same linear output
        # the removed ``decision_function`` used to give.
        score = metric(Y_test, clf.predict(X_test))
        if best_score_cv < score:
            best_score_cv = score
            best_algo = 'LASSO with alpha=' + str(alpha)

    # ``min_samples_split=2`` is the smallest valid value; a one-sample node
    # cannot be split, so this matches the former ``1``.
    clf = RandomForestClassifier(n_estimators=1000, max_depth=None,
                                 min_samples_split=2, random_state=0)
    clf.fit(X_train, Y_train)
    score = metric(Y_test, clf.predict_proba(X_test)[:, 1])
    if best_score_cv < score:
        best_score_cv = score
        # Label corrected to the actual forest size (was "100 trees").
        best_algo = 'randomforest with 1000 trees'

    print()
    print('Thank you for running ML21 futurist meta-algorithm')
    print()
    print('> the best algorithm is : ' + best_algo)
    print()
    print('> the best cross-validation score is : ' + str(best_score_cv))
    print()
    print('If you want, I can also do your breakfast.')
    print()
def sgd_classify(self):
    """Fit a default ``SGDClassifier`` on ``self.descr`` / ``self.target``
    and print diagnostics, including accuracy on the test split.

    Python 2 print statements were replaced with ``print()`` calls; the
    printed text is unchanged.
    """
    print("Stochastic Gradient Descent")
    clf = SGDClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    print("Mean : %3f" % mean)
    # The labels below are kept verbatim; the values printed are the model
    # coefficients and intercepts respectively.
    print("Probability ", clf.coef_)
    print("Mean of each feature per class ", clf.intercept_)
    print("Confidence Score ", clf.decision_function(self.descr))
    # NOTE(review): predict_proba is not offered for the default hinge loss,
    # and ``transform`` may not exist on current SGDClassifier - confirm
    # these two calls against the installed scikit-learn version.
    print("Predict Probability ", clf.predict_proba(self.descr))
    print("Transform ", clf.transform(self.descr))
def sgd_classify(self):
    """Fit a default ``SGDClassifier`` on ``self.descr`` / ``self.target``
    and print diagnostics, including accuracy on the test split.

    Python 2 print statements were replaced with ``print()`` calls; the
    printed text is unchanged.
    """
    print("Stochastic Gradient Descent")
    clf = SGDClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    print("Mean : %3f" % mean)
    # The labels below are kept verbatim; the values printed are the model
    # coefficients and intercepts respectively.
    print("Probability ", clf.coef_)
    print("Mean of each feature per class ", clf.intercept_)
    print("Confidence Score ", clf.decision_function(self.descr))
    # NOTE(review): predict_proba is not offered for the default hinge loss,
    # and ``transform`` may not exist on current SGDClassifier - confirm
    # these two calls against the installed scikit-learn version.
    print("Predict Probability ", clf.predict_proba(self.descr))
    print("Transform ", clf.transform(self.descr))
def evaluate(X_train, Y_train, X_test, Y_test, a, thld, plot=False,
             plot_path='../results/Test_ROC.png'):
    """Train an SGD model and report thresholded train/test performance.

    Uses the module-level ``loss``, ``penalty`` and ``tol`` settings.

    Args:
        X_train, Y_train: training features and labels.
        X_test, Y_test: test features and labels.
        a: ``alpha`` passed to ``SGDClassifier``.
        thld: decision-score threshold; scores above it are predicted 1.
        plot: when true, plot, save and show the test ROC curve.
        plot_path: where the ROC figure is saved.

    Returns:
        tuple: ``(train_report, test_report, ROC_data, model)`` where
        ``ROC_data`` is ``(fpr, tpr, thld)``.
    """
    model = SGDClassifier(loss=loss, alpha=a, class_weight='balanced',
                          penalty=penalty, n_jobs=-1, tol=tol)
    model.fit(X_train, Y_train)

    # training split
    train_pred = (model.decision_function(X_train) > thld).astype(int)
    train_report = classification_report(Y_train, train_pred, digits=3)
    print('Train report:\n{}'.format(train_report))

    # test split
    test_scores = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(Y_test, test_scores, pos_label=1)
    test_pred = (test_scores > thld).astype(int)
    test_report = classification_report(Y_test, test_pred, digits=3)
    print('test report:\n{}'.format(test_report))

    if plot:
        plt.plot(fpr, tpr)
        plt.plot(fpr, fpr, linestyle=':', color='k')  # chance diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.savefig(plot_path)
        plt.show()

    # NOTE(review): the third element is the input ``thld``, not the
    # threshold array from roc_curve - confirm that is intended.
    return train_report, test_report, (fpr, tpr, thld), model
class MySGDClassifier(BaseEstimator, TransformerMixin):
    """``SGDClassifier`` wrapper that predicts with a custom score threshold."""

    def __init__(self, threshold=0, random_state=42):
        self.threshold = threshold
        self.random_state = random_state
        self.classifier = SGDClassifier(random_state=random_state)

    def fit(self, X, y):
        """Fit the wrapped classifier and return ``self``."""
        self.classifier.fit(X, y)
        return self

    def predict(self, x):
        """Return a boolean array: decision score strictly above the threshold."""
        return self.classifier.decision_function(x) > self.threshold
def classify(x_train, y_train, x_test):
    """
    Trains a logistic-loss SGD classifier on the training set and returns
    its decision scores for the training and test points.

    Note that the values come from ``decision_function``: they are raw
    decision scores, not probabilities, despite the variable names.

    Args:
        x_train: features of training set
        y_train: labels of training set
        x_test: features of test set

    Returns:
        y_train_prob: decision scores of training set points
        y_test_prob: decision scores of testing set points
        lr: fitted classifier
    """
    # train classifier (five passes, no early stopping)
    lr = SGDClassifier(loss='log', penalty='l2', max_iter=5, tol=None)
    lr.fit(x_train, y_train)

    # score both splits with the fitted model
    y_train_prob, y_test_prob = (lr.decision_function(part)
                                 for part in (x_train, x_test))
    return y_train_prob, y_test_prob, lr
def SGDC(train_x, train_y, test_x, test_y, parameters=None):
    '''
    Creates and fits a default SGDClassifier and scores it on the test set.

    :param train_x: training features
    :param train_y: training labels
    :param test_x: test features
    :param test_y: test labels (positive class is 1.0)
    :param parameters: accepted but not used
    :return: fpr, tpr, auc_score (AUC rounded to 4 decimals)
    '''
    model = SGDClassifier(n_jobs=-1)
    model.fit(train_x, train_y)

    test_scores = model.decision_function(test_x)
    fpr, tpr, _ = roc_curve(test_y, test_scores, pos_label=1.0)
    return fpr, tpr, round(roc_auc_score(test_y, test_scores), 4)
def GetSGDClassifier2(X_train, X_test, y_train, y_test):
    """Train a binary "is it a 5?" SGD classifier and return CV scores.

    Computes cross-validated accuracy, confusion matrix, precision, recall
    and F1 for the 5-vs-rest task, plus cross-validated decision scores.

    Returns:
        tuple: ``(y_train_5, y_scores_plot)`` - the boolean 5-vs-rest
        training labels and the 3-fold cross-validated decision scores.
    """
    # Sample used for the single-prediction demonstration.
    n = 36000
    some_digit_image = X_train[n]
    some_digit_target = y_train[n]
    # Binary targets: True where the label equals 5.
    y_train_5 = (y_train == 5)
    y_test_5 = (y_test == 5)
    sgd_classificator = SGDClassifier(random_state=42, max_iter=5, tol=-np.inf)
    sgd_classificator.fit(X_train, y_train_5)
    score = cross_val_score(sgd_classificator, X_train, y_train_5, cv=3, scoring="accuracy")
    y_train_predict = cross_val_predict(sgd_classificator, X_train, y_train_5, cv=3)
    confusion_matrix_digits = confusion_matrix(y_train_5, y_train_predict)
    precision_s = precision_score(y_train_5, y_train_predict)
    pelnosc_recall = recall_score(y_train_5, y_train_predict)
    f1 = f1_score(y_train_5, y_train_predict)
    # Manual thresholding of the decision score of the single sample.
    y_scores = sgd_classificator.decision_function([some_digit_image])
    threshold_df = 200000
    y_some_digit_pred = (y_scores > threshold_df)
    # Out-of-fold decision scores for the whole training set.
    y_scores_plot = cross_val_predict(sgd_classificator, X_train, y_train_5, cv=3, method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores_plot)
    # EditData.plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # EditData.plot_precision_vs_recall(precisions, recalls)
    p = 70000
    y_train_pred_90 = (y_scores_plot > p)
    #EditData.plot_roc_curve(y_train_5, y_scores_plot)
    # NOTE(review): the source reached review with its indentation lost, so
    # the extent of this disabled block is an assumption - confirm whether
    # the "ROC" summary prints below belong inside ``if False:`` as well.
    if False:
        print("predykcja: ", sgd_classificator.predict([some_digit_image]))
        print("target: ", some_digit_target)
        print("Score: ", score)
        print("confusion matrix: [PN, FP]: ", confusion_matrix_digits[0], ", [FN, PP]: ", confusion_matrix_digits[1])
        print("Wynik F1: ", f1)
        print("predykcja z progiem ", threshold_df, ": ", y_some_digit_pred)
        print("Precyzja z progiem", p, precision_score(y_train_5, y_train_pred_90))
        print("Pełność z progiem", p, recall_score(y_train_5, y_train_pred_90))
    print("................................................................................")
    print("ROC")
    print("........SGD............")
    print("Precyzja SGD: ", np.round(precision_s, 4))
    print("Pełność SGD: ", np.round(pelnosc_recall, 4))
    print(".......................")
    return y_train_5, y_scores_plot
def main():
    """Split the data, fit a multi-class SGD model and print the decision
    scores of test sample 50."""
    train_x, test_x, train_y, test_y = split_data()
    print("train ", train_x.shape)
    print("test ", test_x.shape)

    # binary classification targets (5 vs. rest); the calls using them are
    # currently disabled
    train_y_5 = (train_y == 5)
    test_y_5 = (test_y == 5)
    # sgd_clf(train_x,train_y_5,test_x,test_y_5)
    # cross_val(train_x,train_y_5)

    # multi-class classification
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier()
    sgd_clf.fit(train_x, train_y)
    sample = test_x[50].reshape(1, -1)
    print(sgd_clf.decision_function(sample))
def train_custom_one_vs_all(X_train, X_test, Y_train, topk):
    """Train one binary SGD classifier per label and keep the top-k labels.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.
        Y_train: sparse label-indicator matrix, one column per class.
        topk: number of highest-scoring classes to keep per test example.

    Returns:
        list: for every test example, the class indices of its ``topk``
        largest decision values (in heap order, not sorted by score).
    """
    # convert to column-compressed form for efficient column slicing
    Y_train = Y_train.tocsc()
    numclasses = Y_train.shape[1]
    num_test_examples = X_test.shape[0]

    # one min-heap per test example holding its current top-k
    # (decision value, class index) pairs
    topk_class_distances = [[] for _ in range(num_test_examples)]

    for j in range(numclasses):
        # train on this class label for all the training examples
        y = numpy.ravel(Y_train.getcol(j).todense())
        # ``max_iter=10, tol=None`` replaces the removed ``n_iter=10`` and
        # keeps the fixed number of epochs.
        clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                            fit_intercept=True, max_iter=10, tol=None,
                            shuffle=True, n_jobs=4, learning_rate='optimal')
        clf.fit(X_train, y)
        print("Trained for class", j)

        # get the decision for all test examples
        decision = clf.decision_function(X_test)

        # push each example's score; once a heap holds ``topk`` entries the
        # smallest one is evicted, so the k largest scores survive
        for i in range(num_test_examples):
            h = topk_class_distances[i]
            if len(h) < topk:
                heapq.heappush(h, (decision[i], j))
            else:
                heapq.heappushpop(h, (decision[i], j))
        print("Predicted for class", j)

    # drop the decision values and keep only the class labels
    return [[label for _, label in heap] for heap in topk_class_distances]
def train(input_filename, num_train_examples, num_test_examples, block_size):
    """Train an averaged SGD model out-of-core on forest-leaf features.

    A random forest fitted on the initial block turns every sample into
    one-hot encoded leaf indices; an ``SGDClassifier`` is then trained with
    ``partial_fit`` on CSV blocks read from ``input_filename`` and scored on
    the held-out test set after every block.

    Args:
        input_filename: header-less CSV, label in column 0.
        num_train_examples: number of rows streamed per pass.
        num_test_examples: forwarded to ``loaddata``.
        block_size: number of rows per ``partial_fit`` block.

    Returns:
        pandas.DataFrame: column ``"AUC"`` indexed by ``"Iterations"``.
    """
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(
        input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))

    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}

    learner = SGDClassifier(loss="hinge", penalty="l2",
                            learning_rate="invscaling", alpha=0.0001,
                            average=10**4, eta0=1.0,
                            class_weight=class_weights)

    num_passes = 3
    classes = numpy.array([0, 1])
    aucs = []
    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None, skiprows=i,
                                 nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            # ``numpy.int`` was only an alias of the builtin ``int`` and has
            # been removed from NumPy; ``int`` gives the same dtype.
            y_train = numpy.array(df.values[:, 0], int)
            del df

            learner.partial_fit(X_train, y_train, classes=classes)
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
def bursi_get_extremes(num=200):
    """Return the highest- and lowest-scoring graphs of the bursi dataset.

    A logistic-loss SGD model is fitted on all positive and negative graphs
    and every graph is scored with its decision function.

    Args:
        num: number of graphs to return at each extreme.

    Returns:
        tuple: ``(top, bottom)`` - the ``num`` graphs with the largest scores
        and the ``num`` graphs with the smallest scores, each in ascending
        score order.
    """
    po = list(gspan.gspan_to_eden("bursi.pos.gspan"))
    ne = list(gspan.gspan_to_eden("bursi.neg.gspan"))
    X, y = graphs_to_Xy(po, ne)

    esti = SGDClassifier(average=True, class_weight='balanced', shuffle=True,
                         n_jobs=4, loss='log')
    esti.fit(X, y)

    # ascending by score, ties broken by position
    ranked = sorted(
        (score, idd) for idd, score in enumerate(esti.decision_function(X)))
    graphs = po + ne

    def pick(pairs):
        return [graphs[idd] for _, idd in pairs]

    return pick(ranked[-num:]), pick(ranked[:num])
def test_not_robust_classif(loss, weighting, multi_class):
    """With ``k=0`` and ``c=1e7`` the robust classifier should agree with a
    plain ``SGDClassifier`` on the sign of more than 80% of the scores."""
    robust = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    baseline = SGDClassifier(loss=loss, random_state=rng)

    # fit order is kept: both estimators draw from the same ``rng``
    robust.fit(X_c, y_c)
    baseline.fit(X_c, y_c)

    robust_positive = robust.base_estimator_.decision_function(X_c) > 0
    baseline_positive = baseline.decision_function(X_c) > 0
    assert np.mean(robust_positive == baseline_positive) > 0.8
class LinearClassifier(object):
    """Elastic-net SGD classifier over binarized graph feature vectors."""

    def __init__(self, decompose_func=None, preprocessor=None, nbits=15,
                 seed=1):
        self.decompose_func = decompose_func
        self.nbits = nbits
        self.feature_size, self.bitmask = set_feature_size(nbits=nbits)
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = SGDClassifier(penalty='elasticnet')

    def _binary_matrix(self, graphs):
        """Vectorize ``graphs`` and reduce every stored value to 0 or 1."""
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        return data_mtx

    def fit(self, graphs, targets):
        """Fit the classifier on the binarized graph features."""
        self.classifier.fit(self._binary_matrix(graphs), targets)
        return self

    def decision_function(self, graphs):
        """Return the classifier's decision scores for ``graphs``."""
        return self.classifier.decision_function(self._binary_matrix(graphs))

    def predict(self, graphs):
        """Return the predicted targets for ``graphs``."""
        return self.classifier.predict(self._binary_matrix(graphs))
def train(input_filename, num_train_examples, num_test_examples, block_size):
    """Train an averaged SGD model out-of-core on forest-leaf features.

    A random forest fitted on the initial block turns every sample into
    one-hot encoded leaf indices; an ``SGDClassifier`` is then trained with
    ``partial_fit`` on CSV blocks read from ``input_filename`` and scored on
    the held-out test set after every block.

    Args:
        input_filename: header-less CSV, label in column 0.
        num_train_examples: number of rows streamed per pass.
        num_test_examples: forwarded to ``loaddata``.
        block_size: number of rows per ``partial_fit`` block.

    Returns:
        pandas.DataFrame: column ``"AUC"`` indexed by ``"Iterations"``.
    """
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(
        input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))

    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}

    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.0001,
        average=10 ** 4,
        eta0=1.0,
        class_weight=class_weights,
    )

    num_passes = 3
    classes = numpy.array([0, 1])
    aucs = []
    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None, skiprows=i,
                                 nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            # ``numpy.int`` was only an alias of the builtin ``int`` and has
            # been removed from NumPy; ``int`` gives the same dtype.
            y_train = numpy.array(df.values[:, 0], int)
            del df

            learner.partial_fit(X_train, y_train, classes=classes)
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
def train_custom_one_vs_all(X_train,X_test,Y_train,topk): #convert matrix to row for efficient splicing Y_train = Y_train.tocsc() tag_classifiers = [] num_training,numclasses = Y_train.shape num_test_examples = X_test.shape[0] # hold a vector mxk, containing top k prediction classes for each example, maintain m heaps for that num_examples = X_test.shape[0] num_classes = len(tag_classifiers) topk_class_distances = [] for i in xrange(num_examples): heap = [] topk_class_distances += [heap] for j in xrange(numclasses): # train on each class label for all the training examples y = numpy.ravel(Y_train.getcol(j).todense()); clf = SGDClassifier(loss='hinge',penalty='l2',alpha=0.0001,fit_intercept=True,n_iter = 10,shuffle=True,n_jobs=4,learning_rate='optimal') clf.fit(X_train,y); print "Trained for class",j # get the decision for all test examples decision = clf.decision_function(X_test) # for each test example add its decision value to the heap of top k decision values for i in xrange(num_test_examples): h = topk_class_distances[i] if len(h) < topk: heapq.heappush(h,(decision[i],j)) else: heapq.heappushpop(h,(decision[i],j)) print "Predicted for class",j #clean the decision values and store the class labels class_label_indices = [] for i in xrange(num_examples): topk_labels = [label for dist,label in topk_class_distances[i]] class_label_indices += [topk_labels] return class_label_indices
def multi_class(X, y, output_loc, number=16000, rand_state=42, cv=3):
    """Fit and cross-validate three multi-class classifiers on ``X``/``y``.

    Trains a plain ``SGDClassifier``, the same classifier wrapped in
    ``OneVsOneClassifier``, and a ``RandomForestClassifier``, then scores
    each with ``cross_val_score`` (accuracy) and writes a confusion matrix
    plot for the SGD model through ``conf_mat``.

    :param X: feature matrix (indexed as ``X[number, :]``)
    :param y: target labels
    :param output_loc: directory, relative to the current working directory,
        that receives ``conf_mat_multi.jpg``
    :param number: row index of the sample whose decision scores are computed
    :param rand_state: random seed shared by all three models
    :param cv: number of cross-validation folds, used for the scores and for
        the confusion matrix
    :return: mean accuracy of the SGD, random forest and one-vs-one models,
        in that order
    """
    sgd_clf = SGDClassifier(random_state=rand_state)
    sgd_clf.fit(X, y)

    # Can train on the OvO strategy
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=rand_state))
    ovo_clf.fit(X, y)

    forest_clf = RandomForestClassifier(random_state=rand_state)
    forest_clf.fit(X, y)

    # Index of the highest score is the given class
    scores = sgd_clf.decision_function(X[number, :].reshape(1, -1))

    # Disabled sanity checks, kept for reference:
    # assert int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
    #     int(np.argmax(scores))
    # assert int(ovo_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
    #     int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,])
    # assert int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
    #     int(forest_clf.predict(X[number,:].reshape(1, -1))[0,])
    # assert int(np.argmax(
    #     forest_clf.predict_proba(
    #         X[number,:].reshape(1, -1))[0,]).flatten()) == \
    #     int(forest_clf.predict(X[number,:].reshape(1, -1))[0,])

    # 45 == 10 * 9 / 2 pairwise estimators, i.e. this expects 10 classes
    assert len(ovo_clf.estimators_) == 45

    sgd_score = cross_val_score(sgd_clf, X, y, cv=cv, scoring="accuracy")
    ovo_score = cross_val_score(ovo_clf, X, y, cv=cv, scoring="accuracy")
    rf_score = cross_val_score(forest_clf, X, y, cv=cv, scoring="accuracy")

    plot_save_loc = os.path.join(os.getcwd(), output_loc, "conf_mat_multi.jpg")
    # Honour the caller's ``cv`` here too; it used to be hard-coded to 3.
    conf_mat(sgd_clf, X, y, plot_save_loc, None, cv=cv)

    return np.mean(sgd_score), np.mean(rf_score), np.mean(ovo_score)
class SGDClassifierImpl:
    """Thin adapter that forwards every call to a wrapped ``Op`` model."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it was supplied."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        model = self._wrapped_model
        return model.decision_function(X)
class SGDC(object):
    """Text classifier: an NLP dictionary feeding a hinge-loss SGD model."""

    def __init__(self, texts, classes, nlpdict):
        # TODO: add list of smileys to texts/classes
        self.s = SGDClassifier(loss="hinge", penalty="l1", shuffle=True, class_weight="auto")
        # reuse the supplied dictionary when given, otherwise build one
        self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        """Fit the classifier on the feature vectors of ``texts``."""
        self.s.fit(self.dictionary.feature_vectors(texts), classes)

    def classify(self, texts):
        """Return decision values of ``texts`` squashed into [0, 1].

        The raw decision value ``d`` is mapped to ``d / 20 + 0.5`` and then
        clipped to the closed interval [0, 1].
        """
        raw = self.s.decision_function(self.dictionary.feature_vectors(texts))
        scaled = raw / 20 + 0.5
        scaled[scaled > 1] = 1
        scaled[scaled < 0] = 0
        return scaled
# NOTE(review): the next five statements index ``axes[i]`` and
# ``clf.coef_[i]``, so they presumably belong to a ``for i in ...`` loop whose
# header is not visible in this chunk - confirm their indentation against the
# full file.
axes[i].set_ylim(y_min, y_max)
pylab.sca(axes[i])
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap= plt.cm.prism)
# Boundary of binary classifier i: solve coef0 * x + coef1 * y + intercept = 0
# for y along the xs grid.
ys = (-clf.intercept_[i] - xs * clf.coef_[i,0])/ clf.coef_[i,1]
plt.plot(xs, ys, hold=True)
# Show the plot with the three binary classifiers
plt.show()
# Predict the class of the sample (4.7, 3.1) after scaling it with the same
# scaler; the classifier picks the class it is most confident about.
print clf.predict(scaler.transform([[4.7, 3.1]]))
# Print the decision value of every binary classifier for the point
# (4.7, 3.1)
print clf.decision_function(scaler.transform([[4.7, 3.1]]))
# Measure the accuracy on the training set (82 % reported by the author)
from sklearn import metrics
y_train_pred = clf.predict(x_train)
print metrics.accuracy_score(y_train, y_train_pred)
# Accuracy on the test set (68 % reported by the author)
y_test_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_test_pred)
# Print precision, recall, F1-score and support per class
print metrics.classification_report(y_test, y_test_pred, target_names= iris.target_names)
# Print the confusion matrix
print metrics.confusion_matrix(y_test, y_test_pred)
'''
# NOTE(review): the quotes above close a string that starts outside this
# chunk; nothing may be inserted before them.
# L1-penalised logistic-loss linear model trained on the training vectors.
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(trained_model.vecs, trained_model.emos)
''' emotion categories predicted for the test vectors '''
predicted_op = lr.predict(test_model.vecs)
''' decision_function provides the value by which the hyperplane is separated which is used in ROC curves '''
predicted_score = lr.decision_function(test_model.vecs)


def plot_confusion_matrix(cm, cmap=plt.cm.Greens):
    """Draw the confusion matrix ``cm`` as an annotated image.

    Tick labels on both axes are the keys of the module-level
    ``emotion_categories`` mapping; every cell is annotated with its value.

    :param cm: 2-D confusion matrix, indexed as ``cm[x][y]``
    :param cmap: matplotlib colormap used for the image
    """
    fig, ax = plt.subplots(figsize=(9,9))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    cb = plt.colorbar()
    cb.set_label("Predicted values")
    tick_marks = np.arange(len(emotion_categories))
    plt.xticks(tick_marks, emotion_categories.keys(), rotation=45)
    plt.yticks(tick_marks, emotion_categories.keys())
    width, height = np.shape(cm)
    # write each count into the centre of its cell (xy is column, row)
    for x in xrange(width):
        for y in xrange(height):
            ax.annotate(str(cm[x][y]), xy=(y, x), horizontalalignment='center', verticalalignment='center')
    # NOTE(review): placed after the loops; the collapsed source does not show
    # whether this call was inside them - confirm.
    ax.xaxis.tick_top()
class LearningModel(object):
    """
    Bundle of a feature scaler and a linear classifier that can be trained
    (in one go or incrementally) and then used to predict keywords for
    unseen data.
    """

    def __init__(self, global_index, word2vec_model):
        self.scaler = StandardScaler()
        self.classifier = SGDClassifier(n_jobs=-1)  # try loss log (logistic reg)
        self.global_index = global_index
        self.word2vec = word2vec_model

    def maybe_fit_and_scale(self, matrix):
        """
        Scale ``matrix``, fitting the scaler first if it has not been fitted.

        The scaler counts as unfitted while it lacks ``n_samples_seen_``; in
        that case at least 1000 rows are required to fit it.

        :param matrix: matrix to be transformed
        :return: scaled matrix
        :raises ValueError: when the scaler is unfitted and ``matrix`` has
            fewer than 1000 rows
        """
        scaler_is_fitted = hasattr(self.scaler, 'n_samples_seen_')
        if not scaler_is_fitted:
            if len(matrix) < 1000:
                raise ValueError("Please user bigger batch size. "
                                 "The feature matrix is too small "
                                 "to fit the scaler.")
            self.scaler.fit(matrix)
        return self.scaler.transform(matrix)

    def partial_fit_classifier(self, input_matrix, output_vector):
        """
        Incrementally train the classifier on one batch (online training).

        :param input_matrix: feature matrix
        :param output_vector: vector of the same length as input_matrix
        :return: None
        """
        # TODO Maybe initialize the classifier with this for balancing classes
        # weights = compute_class_weight('balanced', classes, output_vector)
        known_classes = np.array([0, 1], dtype=np.bool_)
        self.classifier = self.classifier.partial_fit(
            input_matrix,
            output_vector,
            classes=known_classes,
        )

    def fit_classifier(self, input_matrix, output_vector):
        """
        Train the classifier from scratch; any previous fit is discarded.

        :param input_matrix: feature matrix
        :param output_vector: vector of the same length as input_matrix
        :return: None
        """
        fitted = self.classifier.fit(input_matrix, output_vector)
        self.classifier = fitted

    def scale_and_predict(self, input_matrix):
        """
        Scale the samples and predict their output.

        :param input_matrix: a feature matrix
        :return: matrix with predictions for each sample
        """
        return self.classifier.predict(self.scaler.transform(input_matrix))

    def scale_and_predict_confidence(self, input_matrix):
        """
        Scale the samples and return the classifier's decision values.

        :param input_matrix: a feature matrix
        :return: matrix with confidence values for each sample
        """
        return self.classifier.decision_function(
            self.scaler.transform(input_matrix))

    def get_global_index(self):
        """ Return the stored GlobalFrequencyIndex. """
        return self.global_index
def test_sgd_proba(self):
    """Check SGD.predict_proba and SGD.predict_log_proba.

    Covers: no probability API for hinge loss, binary estimates for both
    the "log" and "modified_huber" losses, and multiclass estimates for
    each of the two losses.
    """
    # Hinge loss does not allow for conditional prob estimate.
    # We cannot use the factory here, because it defines predict_proba
    # anyway.
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y)
    assert_false(hasattr(clf, "predict_proba"))
    assert_false(hasattr(clf, "predict_log_proba"))

    # log and modified_huber losses can output probability estimates
    # binary case
    for loss in ["log", "modified_huber"]:
        # Pass the loop variable: it used to be ignored in favour of a
        # hard-coded "modified_huber", so "log" was never exercised here.
        clf = self.factory(loss=loss, alpha=0.01, n_iter=10)
        clf.fit(X, Y)
        p = clf.predict_proba([3, 2])
        assert_true(p[0, 1] > 0.5)
        p = clf.predict_proba([-1, -1])
        assert_true(p[0, 1] < 0.5)

        p = clf.predict_log_proba([3, 2])
        assert_true(p[0, 1] > p[0, 0])
        p = clf.predict_log_proba([-1, -1])
        assert_true(p[0, 1] < p[0, 0])

    # log loss multiclass probability estimates
    clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2)

    d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]])
    p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]])
    assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
    assert_almost_equal(p[0].sum(), 1)
    assert_true(np.all(p[0] >= 0))

    p = clf.predict_proba([-1, -1])
    d = clf.decision_function([-1, -1])
    assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))

    l = clf.predict_log_proba([3, 2])
    p = clf.predict_proba([3, 2])
    assert_array_almost_equal(np.log(p), l)

    l = clf.predict_log_proba([-1, -1])
    p = clf.predict_proba([-1, -1])
    assert_array_almost_equal(np.log(p), l)

    # Modified Huber multiclass probability estimates; requires a separate
    # test because the hard zero/one probabilities may destroy the
    # ordering present in decision_function output.
    clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
    clf.fit(X2, Y2)
    d = clf.decision_function([3, 2])
    p = clf.predict_proba([3, 2])
    if not isinstance(self, SparseSGDClassifierTestCase):
        assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
    else:  # XXX the sparse test gets a different X2 (?)
        assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))

    # the following sample produces decision_function values < -1,
    # which would cause naive normalization to fail (see comment
    # in SGDClassifier.predict_proba)
    x = X.mean(axis=0)
    d = clf.decision_function(x)
    if np.all(d < -1):  # XXX not true in sparse test case (why?)
        p = clf.predict_proba(x)
        assert_array_almost_equal(p[0], [1 / 3.0] * 3)
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs.

    Graphs are turned into feature vectors by a ``Vectorizer`` and classified
    by a linear model (``SGDClassifier``, or ``Perceptron`` when
    ``penalty == 'perceptron'``).
    """

    def __init__(self, r=3, d=8, nbits=16, discrete=True, balance=False,
                 subsample_size=200, ratio=2, normalization=False,
                 inner_normalization=False, penalty='elasticnet'):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size, ratio,
                        normalization, inner_normalization, penalty)

    def set_params(self, r=3, d=8, nbits=16, discrete=True, balance=False,
                   subsample_size=200, ratio=2, normalization=False,
                   inner_normalization=False, penalty='elasticnet'):
        """Store all hyper-parameters and rebuild model and vectorizer.

        NOTE(review): every parameter that is not passed is reset to its
        default, unlike the usual scikit-learn ``set_params`` contract -
        confirm callers rely on this before changing it.

        :return: ``self``
        """
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        # Keep the constructor argument as an attribute: BaseEstimator's
        # get_params()/clone() read one attribute per __init__ parameter and
        # ``penalty`` used to be missing.
        self.penalty = penalty
        if penalty == 'perceptron':
            self.model = Perceptron(max_iter=5, tol=None)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty, max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """Vectorize ``graphs`` with the configured vectorizer."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """Return the linear kernel matrix between all pairs of graphs."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """Fit the model, optionally on a balanced version of the data.

        With ``self.balance`` set, the data is balanced either without a
        model (``randomize=True``) or guided by a model pre-trained on a
        subsample; otherwise the model is fitted on all graphs.

        :return: ``self``
        """
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                # pre-train on a subsample so that balance() can use this
                # estimator to pick instances
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """Return the predicted target of each graph."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """Return the model's decision value for each graph."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets, scoring='roc_auc', cv=5):
        """Return the cross-validated scores of the model on the graphs."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """Return cross-validated decision values for every graph."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """Cluster the graphs with MiniBatchKMeans; return cluster ids."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets,
                        n_iter=30, subsample_size=None):
        """Randomized search over ``r`` and ``d``; return a new estimator.

        ``_eval`` is run ``n_iter`` times in a process pool; the parameters
        of the maximal result are used to build the returned estimator.
        The receiver itself is not modified.
        """
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)

        pool = mp.Pool()
        try:
            scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        finally:
            # release the worker processes even if an evaluation fails
            pool.close()
            pool.join()

        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        return EdenEstimator(**best_params)

    @timeit
    def learning_curve(self, graphs, targets,
                       cv=5, n_steps=10, start_fraction=0.1):
        """Return train sizes with ROC-AUC train/test scores per size."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets,
            cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    @timeit
    def bias_variance_decomposition(self, graphs, targets,
                                    cv=5, n_bootstraps=10):
        """Repeat cross-validation; return per-run mean and std of scores."""
        x = self.transform(graphs)
        score_list = []
        for _ in range(n_bootstraps):
            scores = cross_val_score(
                self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
# cv=3, # scoring="accuracy")) y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3) # print(y_train_pred) # print(y_train_5) # print(confusion_matrix(y_train_5, y_train_pred)) # print("precision:\n",precision_score(y_train_5, y_train_pred)) # print("recall:\n",recall_score(y_train_5, y_train_pred)) # print("f1:\n", f1_score(y_train_5, y_train_pred)) y_scores = sgd_clf.decision_function([some_digit]) # print(y_scores) threshold = 0 y_some_digit_pred = (y_scores > threshold) # print(y_some_digit_pred) threshold = 200000 y_some_digit_pred = (y_scores > threshold) # print(y_some_digit_pred) y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function") precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
# ``make_blobs`` is exported by the public ``sklearn.datasets`` package; the
# ``samples_generator`` submodule is private and was removed in later releases.
from sklearn.datasets import make_blobs

# we create 50 separable points
X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

# fit the model
# NOTE(review): ``n_iter`` only exists in older scikit-learn releases (later
# replaced by ``max_iter``) - confirm the version this script targets.
clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
clf.fit(X, Y)

# plot the line, the points, and the nearest vectors to the plane
xx = np.linspace(-1, 5, 10)
yy = np.linspace(-1, 5, 10)

X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):
    x1 = val
    x2 = X2[i, j]
    # decision_function expects a 2-D (n_samples, n_features) input, so the
    # single grid point is wrapped into a one-row matrix.
    p = clf.decision_function([[x1, x2]])
    Z[i, j] = p[0]
# level 0 is the separating hyperplane, +/-1 are the margins
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'
plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()
from sklearn.linear_model import SGDClassifier

# Elastic-net regularised logistic regression ...
enetloglike = SGDClassifier(loss="log", penalty="elasticnet", alpha=0.0001, l1_ratio=0.15, class_weight='balanced')
enetloglike.fit(X, y)

# ... and the elastic-net regularised hinge-loss counterpart.
enethinge = SGDClassifier(loss="hinge", penalty="elasticnet", alpha=0.0001, l1_ratio=0.15, class_weight='balanced')
# NOTE(review): enetloglike is fitted a second time here; this looks
# redundant - confirm before removing.
enetloglike.fit(X, y)
enethinge.fit(X, y)

# Correlation between the two weight vectors
print(np.corrcoef(enetloglike.coef_, enethinge.coef_))
# The weights vectors are highly correlated

# Correlation between the two decision functions on the training data
print(np.corrcoef(enetloglike.decision_function(X), enethinge.decision_function(X)))
# The decision function are highly correlated

# Scatter plot of one decision function against the other
plt.plot(enetloglike.decision_function(X), enethinge.decision_function(X), "o")

''' ## Exercise Compare predictions of Enet Logistic regression (LR) and Hinge Enet - Compute the correlation between pairs of weights vectors. - Compare the predictions of two classifiers using their decision function: * Compute the correlation decision function. * Plot the pairwise decision function of the classifiers.
def classify(X, y):
    """Normalize ``X`` in place and run nearest-neighbour experiments on it.

    Each column of ``X`` is centred on its mean and, when its standard
    deviation exceeds 1e-4, divided by it. ``classify_nn`` is then called
    five times for each k in (1, 5, 20).

    NOTE(review): the block structure below was reconstructed from a
    collapsed source line. In particular ``exit()`` is assumed to sit at
    function level, which makes everything after it unreachable - confirm
    against the original file.
    """
    print 'classify(X=%s,Y=%s)' % (X.shape, y.shape)

    # Normalize
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    if False:  # disabled debug output
        print ' X:', X.shape, X[0,:]
        print 'means:', means.shape, means
        print ' stds:', stds.shape, stds
    for i in range(X.shape[1]):
        X[:,i] = X[:,i] - means[i]
        # skip the division for (near-)constant columns
        if abs(stds[i]) > 1e-4:
            X[:,i] = X[:,i]/stds[i]
    if False:  # disabled debug output
        means = X.mean(axis=0)
        stds = X.std(axis=0)
        print 'After normalization'
        print ' X:', X.shape, X[0,:]
        print 'means:', means.shape, means
        print ' stds:', stds.shape, stds

    for k in [1,5,20]:
        for i in range(5):
            classify_nn(X,y,k)
        common.SUBHEADING()

    if False:  # disabled exhaustive sweep over k
        for k in range(1,200):
            for i in range(10):
                classify_nn(X,y,k)
            common.SUBHEADING()
    exit()

    # NOTE(review): ``Xa``, ``ya`` and ``Y`` are not defined in this function
    # (the parameter is the lower-case ``y``); the code below looks stale.
    if False:
        X = Xa.tolist()
        y = ya.tolist()
        print 'X: %dx%d' %(len(X),len(X[0]))
        print 'y: %d' %(len(y))

    if False:
        X2 = []
        Y2 = []
        for i in range(len(X)):
            if any(X[i]):
                print 'X[%d]:%s' %(i,X[i])
                print 'Y[%d]:%s' %(i,Y[i])
                X2.append(X[i])
                Y2.append(Y[i])
        X = X2
        Y = Y2

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha = 0.01, n_iter=50) #, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-5, 5, 10)
    yy = np.linspace(-5, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i,j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i,j]
        p = clf.decision_function([x1, x2])
        Z[i,j] = p[0]
    # level 0 is the decision boundary, +/-1 are the margins
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed','solid', 'dashed']
    colors = 'k'
    pl.set_cmap(pl.cm.Paired)
    pl.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    pl.scatter(X[:,0], X[:,1], c=Y)
class ExperimentalOneClassEstimator:
    '''
    One-class estimator built from a two-class linear model.

    A negative set is created by negating the data matrix, a classifier is
    fitted on positives vs. negatives, its bias is shifted so that a fraction
    ``nu`` of the data falls below the decision boundary, and optionally a
    fresh classifier is trained on the resulting labels.

    there might be a bug connected to nx.digraph..
    '''

    def __init__(self, nu=.5, cv=2, n_jobs=-1, move_bias_calibrate=True,
                 classifier=None):
        '''
        Parameters
        ----------
        nu: part of graphs that will be placed in the negative set (0~1)
        cv: number of cross-validation folds forwarded to the fitting helper
        n_jobs: jobs for fitting
        move_bias_calibrate: after moving the bias we can recalibrate
        classifier: classifier object; when None (the default) a new
            ``SGDClassifier(loss='log')`` is created for this instance, so
            that instances never share one default classifier object
        '''
        self.status = 'new'
        self.nu = nu
        self.cv = cv
        self.n_jobs = n_jobs
        self.move_bias_recalibrate = move_bias_calibrate
        if classifier is None:
            classifier = SGDClassifier(loss='log')
        self.classifier = classifier
        self.inverse_prediction = False
        # PROJECT PRETEND TO BE UNCALLIBRATED TO TRICK EDEN
        # (exposes an ``intercept_`` so eden treats this as a normal estimator)
        self.intercept_ = .5

    def decision_function(self, vector):
        '''Return the decision values of the final estimator for ``vector``.'''
        # PROJECT PRETEND TO BE UNCALLIBRATED TO TRICK EDEN
        return self.superesti.decision_function(vector)

    def fit(self, data_matrix, random_state=None):
        '''
        Fit on the (positive-only) ``data_matrix`` and return ``self``.

        ``random_state``, when given, seeds the ``random`` module and is
        forwarded to the estimator fitting helper.
        '''
        if random_state is not None:
            random.seed(random_state)
        # use eden to fit
        self.estimator = self.fit_estimator(data_matrix, n_jobs=self.n_jobs,
                                            cv=self.cv,
                                            random_state=random_state)
        # move bias to obtain oneclassestimator
        self.cal_estimator = self.move_bias(data_matrix,
                                            estimator=self.estimator,
                                            nu=self.nu, cv=self.cv)
        self.status = 'trained'
        return self

    '''
    disabled for now.. since the discsampler is not expected to work
    def fit_2(self, pos_iterator, neg_iterator, vectorizer=None, cv=2, n_jobs=-1):
        """
        This is used in the discsampler .,., i am not sure why i am not using eden directly.
        I will fix this when i look into the disk sampler next time.
        :param pos_iterator:
        :param neg_iterator:
        :param vectorizer:
        :param cv:
        :param n_jobs:
        :return:
        """
        self.vectorizer=vectorizer
        data_matrix = vectorizer.fit_transform(pos_iterator)
        neagtive_data_matrix = vectorizer.transform(neg_iterator)
        estimator = eden_fit_estimator(SGDClassifier(loss='log'),
                                       positive_data_matrix=data_matrix,
                                       negative_data_matrix=neagtive_data_matrix,
                                       cv=cv,
                                       n_jobs=n_jobs,
                                       n_iter_search=10)
        # esti= CalibratedClassifierCV(estimator,cv=cv,method='sigmoid')
        # esti.fit( vstack[ X,Y], numpy.asarray([1]*X.shape[0] + [0]*Y.shape[0]))
        return estimator
    '''

    def fit_estimator(self, data_matrix, n_jobs=-1, cv=2, random_state=42):
        '''
        Return an estimator fitted on ``data_matrix`` vs. its negation.

        The negative set is obtained by multiplying the data matrix by -1;
        fitting is delegated to eden's ``fit_estimator``.
        '''
        # create negative set:
        data_matrix_neg = data_matrix.multiply(-1)
        # i hope loss is log.. not 100% sure..
        # probably calibration will fix this
        return eden_fit_estimator(self.classifier,
                                  positive_data_matrix=data_matrix,
                                  negative_data_matrix=data_matrix_neg,
                                  cv=cv,
                                  n_jobs=n_jobs,
                                  n_iter_search=10,
                                  random_state=random_state)

    def move_bias(self, data_matrix, estimator=None, nu=.5, cv=2):
        '''
        Shift the estimator's bias until ``nu`` of ``data_matrix`` is negative.

        The intercept of ``estimator`` is modified in place. When
        recalibration is enabled, a new log-loss SGD classifier is trained on
        the labels induced by the shifted boundary and becomes the final
        estimator; otherwise the shifted ``estimator`` itself is used.

        Returns the final estimator (also stored as ``self.superesti``).
        '''
        scores = [estimator.decision_function(sparse_vector)[0]
                  for sparse_vector in data_matrix]
        scores_sorted = sorted(scores)
        # Use the ``nu`` argument (it used to be ignored in favour of
        # self.nu) and clamp the index so nu == 1.0 stays inside the list.
        pivot_index = min(int(len(scores_sorted) * nu),
                          len(scores_sorted) - 1)
        pivot = scores_sorted[pivot_index]
        estimator.intercept_ -= pivot

        # calibrate
        if self.move_bias_recalibrate:
            # labels are derived from the scores measured before the shift
            data_y = numpy.asarray(
                [1 if score >= pivot else -1 for score in scores])
            self.superesti = SGDClassifier(loss='log')
            self.superesti.fit(data_matrix, data_y)
            # estimator = CalibratedClassifierCV(estimator, cv=cv, method='sigmoid')
            # estimator.fit(data_matrix, data_y)
        else:
            # without recalibration the bias-shifted estimator is the result;
            # previously ``superesti`` was left undefined on this path
            self.superesti = estimator
        return self.superesti

    def predict_single(self, vectorized_graph):
        '''Return the decision value of one vectorized graph.'''
        return self.superesti.decision_function(vectorized_graph)[0]

    # probably broken ... you should use predict_single now
    def predict(self, things):
        '''Return the final estimator's class predictions for ``things``.'''
        return self.superesti.predict(things)
# Two-point toy training set: one sample per class.
X = [[0., 0.], [1., 1.]]
y = [0, 1]
sample = [[2., 2.]]

# Linear SVM (hinge loss) with L2 regularisation trained by SGD
clf = SGDClassifier(loss="hinge", penalty="l2")

# Model fitting
fitted = clf.fit(X, y)
print("Fitting: ", fitted)

# Class predicted for a new sample
prediction = clf.predict(sample)
print("Prediction: ", prediction)

# Learned weights
print("Model parameter: ", clf.coef_)

# Learned intercept (aka offset or bias)
print("Model Intercept: ", clf.intercept_)

# Decision value of the new sample (signed distance to the hyperplane)
distance = clf.decision_function(sample)
print("Hyperplane distance: ", distance)

# Same data with the logistic loss, which supports probability estimates
clf = SGDClassifier(loss="log").fit(X, y)
probabilities = clf.predict_proba([[1., 1.]])
print("Classifier with LR: ", probabilities)
print(clf)
def run(self, nFold=3, iter=10, verbose=1, loss='modified_huber', penalty='l2', shuffle=True):
    """
    Train and evaluate one SGD model per GO id and store the metrics.

    For every GO id a model is first fitted on all annotations (stored with
    fold id -1), then ``nFold`` further models are fitted with the labels of
    one held-out slice of proteins set to 0 and evaluated on that slice. The
    mean of the per-fold metrics is stored with fold id ``nFold``.

    CV: -1 => total model (no cv)
    CV: nFold => mean metric over cv
    """
    # (re)create the result views
    self.__database.createGOIDView(self.__goidtable, double=["AUROC", "AUPR", "Fmax"], drop=True)
    self.__database.createProteinView(self.__proteintable, \
        double=["ProteinID", "Label", "Score"], drop=True)
    # Get labels
    test = 0
    # one random protein order, shared by all GO ids and folds
    pp = permutation(self.__numproteins)
    resultid = 0
    for goid in self.__goid:
        print "____________ GOID= %d ____________" % goid
        # Get label for GOID
        goidindex = where(self.__goid==goid)
        goidindex = int(goidindex[0])
        annotations = self.selectAnnotatedProteinsMousefunc(goidindex)
        print "0s=", len([x for x in annotations if x == 0])
        print "1s=", len([x for x in annotations if x == 1])
        print "-1s=", len([x for x in annotations if x == -1])
        # flat float64 copy of the labels for fitting
        annotation = []
        for value in annotations:
            annotation.append(value)
        annotation = asarray(annotation).astype(float64)
        annotation = annotation.ravel()
        # total model: fit and score on the full network
        model = SGDClassifier(loss=loss, class_weight='auto', penalty=penalty, \
            n_iter=iter, shuffle=shuffle, verbose=verbose)
        model.fit(self.__network, annotation)
        scores = model.decision_function(self.__network)
        scores = self.convertScore(scores)
        per = Performance(annotations, scores)
        roc = per.AUROCGillis()
        print "AUROC= ", roc
        pr = per.AUPRGillis()
        print "AUPR= ", pr
        fmax = per.Fmax()
        print "Fmax= ", fmax
        # fold id -1 marks the total (no cross-validation) model
        self.__database.insertProteinView(self.__proteintable, resultid, goid[0], -1, \
            self.__proteins, annotations, scores)
        self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], -1, [roc, pr, fmax])
        resultid += 1
        del per
        labelIx = range(self.__numproteins)
        offset = 0
        fold = 0
        meanroc = []
        meanpr = []
        meanfmax = []
        while fold < nFold:
            print "____________ Fold= %d ____________" % fold
            lastelem = min(self.__numproteins, offset+floor(self.__numproteins/nFold))
            # held-out protein indices of this fold
            # NOTE(review): the slice starts at offset+1, so the permutation
            # entry at ``offset`` is never held out - confirm this is intended.
            ix = []
            for index in pp[offset+1:lastelem]:
                ix.append(labelIx[index])
            offset = lastelem
            # training labels: held-out proteins are reset to 0
            labeltmp = []
            for value in annotations:
                labeltmp.append(float(value))
            for index in ix:
                labeltmp[index] = 0
            print "0s=", len([x for x in labeltmp if x == 0])
            print "1s=", len([x for x in labeltmp if x == 1])
            print "-1s=", len([x for x in labeltmp if x == -1])
            model = SGDClassifier(loss=loss, class_weight='auto', penalty=penalty, \
                n_iter=iter, shuffle=shuffle, verbose=verbose)
            model.fit(self.__network, labeltmp)
            scores = model.decision_function(self.__network)
            scores = self.convertScore(scores)
            # evaluate on the held-out proteins only, against their true labels
            score = []
            annotation = []
            proteins = []
            for index in ix:
                score.append(float(scores[index]))
                annotation.append(annotations[index])
                proteins.append(self.__proteins[index])
            per = Performance(annotation, score)
            roc = per.AUROCGillis()
            print "AUROC= ", roc
            meanroc.append(roc)
            pr = per.AUPRGillis()
            print "AUPR= ", pr
            meanpr.append(pr)
            fmax = per.Fmax()
            print "Fmax= ", fmax
            meanfmax.append(fmax)
            self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], fold,\
                [roc, pr, fmax])
            self.__database.insertProteinView(self.__proteintable, resultid, goid[0],\
                fold, proteins, annotation, score)
            del proteins
            del annotation
            del score
            del per
            fold += 1
            resultid += 1
        # arithmetic mean of each metric over the folds
        roc_mean = reduce(lambda x, y: x + y / float(len(meanroc)), meanroc, 0)
        print "Mean AUROC= ", roc_mean
        pr_mean = reduce(lambda x, y: x + y / float(len(meanpr)), meanpr, 0)
        print "Mean AUPR= ", pr_mean
        fmax_mean = reduce(lambda x, y: x + y / float(len(meanfmax)), meanfmax, 0)
        print "Mean Fmax= ", fmax_mean
        # fold id nFold marks the row holding the cross-validation means
        self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], nFold, \
            [roc_mean, pr_mean, fmax_mean])
        resultid += 1
        test += 1
# NOTE(review): this handle is not closed anywhere in the visible chunk.
FN = open('fn.txt','wb')
# Document-level cross-validation: each split is a pair of document id lists
# that doc_to_sen() maps to sentence row indices.
for tr_doc,te_doc in kf:
    train_index = doc_to_sen(tr_doc,index_map)
    test_index = doc_to_sen(te_doc,index_map)
    train_data = features.tocsr()[train_index,:]
    train_label = all_labels[train_index]
    test_data = features.tocsr()[test_index,:]
    test_label = all_labels[test_index]
    #train_data = scaler1.fit_transform(train_data)
    clf.fit(train_data,train_label)
    sorted_index_train = [] # rank of every training sentence within its document
    for t in tr_doc:
        #current_scores = []
        sen_index = doc_to_sen([t],index_map)
        cur_train_score = clf.decision_function(features.tocsr()[sen_index,:])
        # sentence positions ordered by decreasing decision value
        temp = [i[0] for i in sorted(enumerate(list(cur_train_score)),key=lambda x:x[1],\
                reverse=True)]
        # invert the ordering: sorted_index[q] is the rank of sentence q
        sorted_index = np.zeros(len(temp))
        for i, q in enumerate(temp):
            sorted_index[q] = i
        sorted_index_train += list(sorted_index)
    # append the per-document rank as one extra feature column
    train_data = hstack([train_data,np.array(sorted_index_train).reshape(-1,1)])
    #train_data = scaler2.fit_transform(train_data)
    #clf1.fit(train_data,train_label)
    #test_data = scaler1.transform(test_data)
    test_score = clf.decision_function(test_data)
    sorted_index_test = []
    prediction = []
# Label-encode every string-typed column; the encoder is fitted on the union
# of train and test values so both frames share one mapping.
for f in train.columns:
    if train[f].dtype=='object':
        print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

# every column except the target is used as a feature
features = [s for s in train.columns if s != 'QuoteConversion_Flag']
print("Features: ", features)

print("Train a SGDClassifier model")
# hold out 1 % of the training rows for validation
X_train, X_valid = train_test_split(train, test_size=0.01)
y_train = X_train['QuoteConversion_Flag']
y_valid = X_valid['QuoteConversion_Flag']

clf = SGDClassifier(loss="hinge", penalty="l2", n_jobs=-1)
clf.fit(X_train[features].values, y_train.values)

print("## Validating Data")
# Score the same kind of input the model was fitted on (a plain ndarray);
# the validation call used to pass the DataFrame itself.
preds = clf.decision_function(X_valid[features].values)
auc_value = roc_auc_score(y_valid, preds)
print("ROC Score : " + str(auc_value))

print("## Predicting test data")
# the raw decision values are written out as the prediction column
preds = clf.decision_function(test[features].values)
test["QuoteConversion_Flag"] = preds
test[['QuoteNumber',"QuoteConversion_Flag"]].to_csv('test_predictions.csv', index=False)
# belang zijn deze parameter hoger in te stellen als de resultaten niet # consistent zijn. classifier = SGDClassifier(n_iter=50, loss=config.get("classifier", "loss"), shuffle=True, random_state=random_state) # We fitten (trainen) de classifier als volgt: classifier.fit(X_train, y_train) show_most_informative_features(vectorizer, classifier, n=config.getint("classifier", "top-features")) if config.get('documents', 'test') == 'no': # nu is alles klaar om de classifier te testen op onze test set preds = classifier.predict(X_test) # de decision_function methode geeft de daadwerkelijke getallen terug # op basis waarvan de classificatie wordt gemaakt Dat kan handig zijn # later om een drempelwaarde te bepalen decisions = classifier.decision_function(X_test) print classification_report(y_test, preds) print "Area Under the Precision Recall Curve:", average_precision_score(y_test, decisions) precision, recall, _ = precision_recall_curve(y_test, decisions) sb.plt.figure() sb.plt.plot(recall, precision) sb.plt.savefig("Precision-recall-curve.pdf") else: decisions = classifier.decision_function(X_test) preds = classifier.predict(X_test) for doc_id, decision, pred in sorted(zip(doc_ids, decisions, preds), key=lambda i: i[1]): print 'Document:', doc_id, "Score: %.4f, Prediction: %s" % ( decision, 'NL' if pred == 0 else 'B')
def decision_function(self, X, *args, **kw):
    """Convert ``X`` to a CSR matrix and delegate to ``SGDClassifier``."""
    return SGDClassifier.decision_function(self, sp.csr_matrix(X), *args, **kw)
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

# Two-point toy training set: one sample per class.
X = [[0., 0.], [1., 1.]]
y = [0, 1]

# Linear SVM (hinge loss) with L2 regularisation trained by SGD.
# Each print has a single parenthesised argument, so the output is the same
# under Python 2 and Python 3.
clf = SGDClassifier(loss="hinge", penalty="l2")
print(clf.fit(X, y))
print(clf.predict([[2., 2.]]))
print(clf.coef_)
print(clf.intercept_)
print(clf.decision_function([[2., 2.]]))

# The logistic loss supports probability estimates.
clf = SGDClassifier(loss='log').fit(X, y)
# was ``clf.predict_ probab(...)``, which is a syntax error
print(clf.predict_proba([[1., 1.]]))

# Standardise features: fit the scaler on the training data only and apply
# the same transformation to both splits.
# NOTE(review): X_train and X_test are not defined in this snippet and must
# come from elsewhere.
scaler = StandardScaler()
# was ``scaler.fot(X_train)``, a typo for ``fit``
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)