def learning_curve(classifier, y, train, cv, n=15):
    """Plot train and cv loss for increasing train sample sizes."""
    chunk = int(len(y)/n)
    n_samples = []
    train_losses = []
    cv_losses = []

    previous_cache_dir = classifier.cache_dir
    classifier.cache_dir = "diagnostics"

    for i in range(n):
        train_subset = train[:(i + 1)*chunk]
        preds_cv = classifier.fit_predict(y, train_subset, cv,
                                          show_steps=False)
        preds_train = classifier.fit_predict(y, train_subset, train_subset,
                                             show_steps=False)
        n_samples.append((i + 1)*chunk)
        cv_losses.append(hinge_loss(y[cv], preds_cv))
        train_losses.append(hinge_loss(y[train_subset], preds_train))

    classifier.cache_dir = previous_cache_dir

    plt.clf()
    plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--')
    plt.ylim([min(train_losses) - .01, max(cv_losses) + .01])
    plt.savefig('plots/learning_curve.png')
    plt.show()
def check_lambda(datanm, samples_per_class, Cs, num_classes, gamma,
                 num_iter=100, kernel='linear', strat='ovr'):
    data, labels = load_full(datanm, samples_per_class)

    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3,
                                 train_size=0.7, random_state=None)
    ans = np.zeros((len(Cs), len(gamma), 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index, :], labels[test_index]]
        for j, g in enumerate(gamma):
            for i, C in enumerate(Cs):
                clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=g,
                              coef0=0.0, shrinking=True, probability=False,
                              tol=0.001, cache_size=10000, class_weight=None,
                              verbose=False, max_iter=-1,
                              decision_function_shape=strat,
                              random_state=None)
                clf.fit(train_data[0], train_data[1])

                out_train = clf.decision_function(train_data[0])
                out_valid = clf.decision_function(valid_data[0])
                ans[i, j, 2] += hinge_loss(train_data[1], out_train,
                                           range(num_classes))
                ans[i, j, 3] += hinge_loss(valid_data[1], out_valid,
                                           range(num_classes))
                #ans[i, j, 0] += log_loss(train_data[1], clf.predict_proba(train_data[0]))
                #ans[i, j, 1] += log_loss(valid_data[1], clf.predict_proba(valid_data[0]))

    ans[:, :, :] /= num_iter

    np.savez("svm_lambda_" + kernel + '_' + strat, ans=ans, Cs=Cs,
             num_iter=num_iter, num_classes=num_classes,
             samples_per_class=samples_per_class)

    return ans
def svm(x, y, p):
    ## Default parameters, in order: tolerance, maximum iterations
    default_parameters = [0.001, 1000]

    ## Override defaults with any custom parameters supplied in p
    for i in range(len(p)):
        if p[i] != "":
            default_parameters[i] = p[i]

    ## Create model
    model = LinearSVC(tol=default_parameters[0],
                      max_iter=default_parameters[1])

    ## Train and test
    ## (kf is assumed to be a cross-validation splitter, e.g. KFold,
    ## defined at module scope)
    accuracy = [
        model.fit(x[train], y[train]).score(x[test], y[test])
        for train, test in kf.split(x)
    ]
    res = np.array(accuracy)
    print("\nSupport Vector Machine\n----------------------\nAccuracy: %.2f"
          % res.mean())
    print("Loss: %.2f" % hinge_loss(y, model.decision_function(x)))
    info = ['%.2f' % res.mean(),
            '%.2f' % hinge_loss(y, model.decision_function(x))]
    return model, info, default_parameters
def learning_curve(classifier, y, train, cv, n=15):
    """Plot train and cv loss for increasing train sample sizes."""
    chunk = int(len(y)/n)
    n_samples = []
    train_losses = []
    cv_losses = []

    previous_cache_dir = classifier.cache_dir
    classifier.cache_dir = "diagnostics"

    for i in range(n):
        train_subset = train[:(i + 1)*chunk]
        preds_cv = classifier.fit_predict(y, train_subset, cv,
                                          show_steps=False)
        preds_train = classifier.fit_predict(y, train_subset, train_subset,
                                             show_steps=False)
        n_samples.append((i + 1)*chunk)
        cv_losses.append(hinge_loss(y[cv], preds_cv, neg_label=0))
        train_losses.append(hinge_loss(y[train_subset], preds_train,
                                       neg_label=0))

    classifier.cache_dir = previous_cache_dir

    plt.clf()
    plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--')
    plt.ylim([min(train_losses) - .01, max(cv_losses) + .01])
    plt.savefig('plots/learning_curve.png')
    plt.show()
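# For classifiers that follow the standard scikit-learn estimator API (unlike
# the custom fit_predict/cache_dir interface above), a similar hinge-loss
# learning curve can be sketched with sklearn.model_selection.learning_curve.
# This is an illustrative alternative, not the original code; the dataset is
# synthetic, and make_scorer(..., needs_threshold=True) is the older spelling
# (newer scikit-learn uses response_method='decision_function').
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import hinge_loss, make_scorer
from sklearn.model_selection import learning_curve
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=600, random_state=0)

# greater_is_better=False negates the loss so "higher is better" holds;
# needs_threshold=True makes the scorer use decision_function output.
scorer = make_scorer(hinge_loss, greater_is_better=False, needs_threshold=True)
sizes, train_scores, cv_scores = learning_curve(
    LinearSVC(), X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1.0], cv=5,
    scoring=scorer)

# Undo the negation to plot losses directly.
plt.plot(sizes, -train_scores.mean(axis=1), 'r--', label='train')
plt.plot(sizes, -cv_scores.mean(axis=1), 'b--', label='cv')
plt.legend()
plt.show()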
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision,
                                     pos_label=2, neg_label=0))
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(hinge_loss(-y_true, pred_decision),
                     hinge_loss(y_true, pred_decision,
                                pos_label=-1, neg_label=1))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision,
                                         pos_label=2, neg_label=0))
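# The expected value 1.2 / 4 in the tests above can be checked by hand: the
# binary hinge loss is mean(max(0, 1 - y*d)) with y in {-1, +1} and d the
# decision value. A small verification sketch (new here, not from the tests):
import numpy as np
from sklearn.metrics import hinge_loss

y_true = np.array([-1, 1, 1, -1])
pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])

# Per-sample margins y*d: [8.5, 0.5, 1.5, 0.3]
# Per-sample losses max(0, 1 - y*d): [0, 0.5, 0, 0.7] -> mean 1.2/4 = 0.3
manual = np.maximum(0, 1 - y_true * pred_decision).mean()
assert np.isclose(manual, hinge_loss(y_true, pred_decision))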
def svm_classify(all_x, all_y, n_pixels):
    # read the original data
    accuracies = np.zeros([len(all_y), len(frames)])
    losses = np.zeros([len(all_y), len(frames)])
    coef = np.zeros([len(all_y), len(frames), n_pixels])
    for idx, validation_y in enumerate(all_y):
        train_indx = np.delete(np.arange(len(all_y)), idx)
        for f in range(n_frames):
            train_x = all_x[train_indx, f, :]
            validation_x = all_x[idx, f, :].reshape([1, -1])

            m = np.mean(train_x, axis=0)
            s = np.std(train_x, axis=0)
            train_x = (train_x - m) / s
            validation_x = (validation_x - m) / s

            train_y = all_y[train_indx]
            clf = svm.SVC(kernel='linear', class_weight='balanced')
            clf.fit(train_x, train_y)
            coef[idx, f, :] = clf.coef_

            pred = clf.predict(validation_x)
            acc = pred == validation_y
            accuracies[idx, f] = acc

            est = clf.decision_function(validation_x)
            loss = hinge_loss([validation_y], est)
            losses[idx, f] = loss
    return accuracies, losses, coef
def validation_metric_vw(self):
    y_pred_holdout = self.get_y_pred_holdout()

    if self.outer_loss_function == 'logistic':
        if self.labels_clf_count > 2:
            y_pred_holdout_proba = y_pred_holdout
        else:
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

    elif self.outer_loss_function == 'squared':
        loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'hinge':
        loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'pr-auc':
        loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'roc-auc':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
        loss = -auc(fpr, tpr)

    else:
        raise KeyError('Invalid outer loss function')

    self.logger.info('parameter suffix: %s' % self.param_suffix)
    self.logger.info('loss value: %.6f' % loss)
    return loss
def test_hinge_loss_multiclass_invariance_lists():
    # Currently, invariance of string and integer labels cannot be tested
    # in common invariance tests because invariance tests for multiclass
    # decision functions are not implemented yet.
    y_true = ['blue', 'green', 'red', 'green', 'white', 'red']
    pred_decision = [
        [0.36, -0.17, -0.58, -0.99],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17],
        [-0.55, -0.38, -0.48, -0.58],
        [-2.36, -0.79, -0.27, 0.24],
        [-1.45, -0.58, -0.38, -0.17]]
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][3] + pred_decision[4][2],
        1 - pred_decision[5][2] + pred_decision[5][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision), dummy_hinge_loss)
def obj(self, data, score, C1, C2):
    d, n = data.weight.shape[0], len(data.y)
    embed_loss = norm(self.X - data.weight.dot(self.X), 'fro')**2 / d \
        + self.delta * norm(self.X, 'fro')**2 / d
    obj = n*hinge_loss(y_true=data.y, pred_decision=score) \
        + 1./(2.*C1)*np.sum(self.beta**2) + 1./(2.*C2)*embed_loss
    return obj
def validation_metric_vw(self):
    v = open('%s' % self.holdout_pred, 'r')
    y_pred_holdout = []
    for line in v:
        y_pred_holdout.append(float(line.split()[0].strip()))

    if self.outer_loss_function == 'logistic':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

    elif self.outer_loss_function == 'squared':
        loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'hinge':
        loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'pr-auc':
        loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'roc-auc':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
        loss = -auc(fpr, tpr)

    self.logger.info('parameter suffix: %s' % self.param_suffix)
    self.logger.info('loss value: %.6f' % loss)
    return loss
def validation_metric_vw(self):
    v = open('%s' % self.holdout_pred, 'r')
    y_pred_holdout = []
    for line in v:
        y_pred_holdout.append(float(line.split()[0].strip()))

    if self.outer_loss_function == 'logistic':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

    elif self.outer_loss_function == 'squared':
        loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'hinge':
        loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'pr-auc':
        loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

    elif self.outer_loss_function == 'roc-auc':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
        fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
        loss = -auc(fpr, tpr)

    else:
        raise KeyError('Invalid outer loss function')

    self.logger.info('parameter suffix: %s' % self.param_suffix)
    self.logger.info('loss value: %.6f' % loss)
    return loss
def calculate_accuracy(train_x, train_y, test_x, test_y):
    loss_list = ['benign', 'dos', 'probe', 'u2r', 'r2l']
    model.fit(train_x, train_y)
    model_predicted = model.predict(test_x)
    model_predicted = pd.DataFrame(model_predicted)
    model_predicted.columns = test_y.columns

    from sklearn.metrics import precision_score, mean_squared_error, f1_score, hinge_loss
    precision_score = precision_score(test_y, model_predicted, average='micro')
    print('The precision is', precision_score)

    class_proba = model.predict_proba(test_x)
    class_proba_managed = pd.DataFrame()
    # temp = class_proba[0]
    # [row[1] for row in temp]
    class_proba_managed = class_proba[1]
    class_proba_managed = pd.DataFrame(class_proba_managed)
    class_proba_managed.columns = test_y.columns

    loss_function = []
    for columns in test_y.columns:
        loss_function.append(
            hinge_loss(test_y[columns], class_proba_managed[columns]))
    # Average over the per-column losses (the original divided by a
    # hard-coded 2, which only holds for exactly two label columns)
    average_loss = sum(loss_function) / len(loss_function)
    print("The average loss is", average_loss)
def h_loss(estimator, X_test, y_test):
    "hinge loss"
    # Negated so that higher is better, matching scikit-learn's
    # scorer convention
    y_predicted = estimator.predict(X_test)
    score = -hinge_loss(y_test, y_predicted)
    return score
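# Because h_loss follows scikit-learn's scorer(estimator, X, y) signature and
# negates the loss, it can be passed directly to model-selection utilities.
# A minimal usage sketch; the dataset and estimator here are made up for
# illustration and are not from the original code:
from sklearn.datasets import make_classification
from sklearn.metrics import hinge_loss
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)

def h_loss(estimator, X_test, y_test):
    "hinge loss"
    y_predicted = estimator.predict(X_test)
    return -hinge_loss(y_test, y_predicted)

# Each fold's score is the negated hinge loss on the held-out split.
scores = cross_val_score(LinearSVC(), X, y, scoring=h_loss, cv=5)
print(scores.mean())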
def print_metrics(y_true, y_preds):
    '''
    Description: print out accuracy, recall, precision, hinge loss, and f1-score of model
    '''
    print "Accuracy: %.4g" % metrics.accuracy_score(y_true, y_preds, normalize=True)
    print "Recall: %.4g" % metrics.recall_score(y_true, y_preds)
    print "Precision: %.4g" % metrics.precision_score(y_true, y_preds)
    print "Hinge loss: %.4g" % metrics.hinge_loss(y_true, y_preds)
    print "F1 score: %.4g" % metrics.f1_score(y_true, y_preds)
def svm_run(filters, c_range=1.0, kernel_type='rbf', gamma='auto',
            train_sizes=[15, 100, 300, 500, 800], table_folder="/",
            save_file=None, time_from=32, time_to=8,
            downsample_ratio=None, oversample=None):
    timesteps = time_from - time_to
    X_train, X_test, y_train, y_test, churn_number, total_number, feature_names = \
        import_and_preprocess_table(timesteps, time_from, time_to, filters,
                                    table_folder, downsample_ratio, oversample)
    X_train = list(map(lambda x: x.flatten(), X_train))
    X_test = list(map(lambda x: x.flatten(), X_test))

    clf = svm.SVC(kernel=kernel_type, gamma=gamma, C=c_range)
    # train_sizes, train_scores, validation_scores = learning_curve(
    #     clf, X_train, y_train, train_sizes=train_sizes, cv=5,
    #     shuffle=True, scoring='f1')
    train_sizes, train_scores, validation_scores = training_curve(
        clf, X_train, y_train, X_test, y_test, train_sizes=train_sizes,
        shuffle=True, scoring='precision', train_last=True)
    # print(train_scores, valid_scores)
    clf.fit(X_train, y_train)
    # cross_val_score(clf, X_train, y_train, scoring='recall_macro', cv=5)
    y_pred = clf.predict(X_test)

    if kernel_type == 'linear':
        feature_importances = clf.coef_.flatten()
    else:
        feature_importances = []

    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        hinge_loss(y_test, y_pred),
        f1_score(y_test, y_pred)
    ]
    # print_feature_importances(feature_importances, feature_names, all=True)
    print(y_pred)
    return [y_pred, y_test, feature_importances, scores, train_sizes,
            train_scores, validation_scores, churn_number, total_number,
            feature_names]
def class_cost(self, X, Yc):
    """Calculates the current cost of classification"""
    predictions = [x @ wc + rho
                   for x, wc, rho in zip(X, self.Wc, self.rho)]
    h_loss = sum([hinge_loss(y, pred)
                  for y, pred in zip(Yc, predictions)])
    return h_loss
def train(self, kernel_type):
    print("--- Training {} SVM with Gamma = {} ---".format(
        kernel_type, self.bestGamma))
    start_time = datetime.now()

    # Build the SVM with linear/RBF kernel
    clf = self.classifier(kernel_=kernel_type, gamma_=self.bestGamma,
                          verbose_=VERBOSE)
    clf.fit(self.trainX, self.trainY)
    time = datetime.now() - start_time
    print("Finish training in ", time)

    # Compute the loss of the SVM on the training set and test set
    pred_decision_train = clf.decision_function(self.trainX)
    loss_train = hinge_loss(self.trainY, pred_decision_train)
    pred_decision_test = clf.decision_function(self.testX)
    loss_test = hinge_loss(self.testY, pred_decision_test)
    print("=> Loss in training set: {:.4f}".format(loss_train))
    print("=> Loss in test set: {:.4f}".format(loss_test))

    # Compute the accuracy (the original recomputed predict(testX) three
    # times; one call suffices)
    predY_train = clf.predict(self.trainX)
    predY = clf.predict(self.testX)
    acc_train = accuracy_score(self.trainY, predY_train)
    acc_test = accuracy_score(self.testY, predY)
    print("=> Accuracy in training set: {:.4f}".format(acc_train))
    print("=> Accuracy in test set: {:.4f}".format(acc_test))

    # save the well trained classifier
    # plot AUC
    fpr, tpr, _ = roc_curve(self.testY, pred_decision_test)
    AUC = auc(fpr, tpr)
    #plot_auc_curve(fpr, tpr, AUC)

    cm = confusion_matrix(self.testY, predY)
    plot_confusion_matrix(cm, ["Class 0", "Class 1"], title=self.dataset_name)
    self.clf = clf
def check_vb(datanm, samples_per_class, Cs, num_classes, gamma,
             num_iter=100, kernel='linear', strat='ovr'):
    data, labels = load_full(datanm, samples_per_class)

    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.5,
                                 train_size=0.5, random_state=None)
    ans = np.zeros((len(Cs), len(gamma), samples_per_class/2, 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index, :], labels[test_index]]

        for l in xrange(samples_per_class/2):
            ind_train = []
            ind_valid = []
            for k in xrange(num_classes):
                ind_train = ind_train + np.where(train_data[1] == k)[0].tolist()[:l+1]
                ind_valid = ind_valid + np.where(valid_data[1] == k)[0].tolist()[:l+1]

            ctrain_data = [train_data[0][ind_train], train_data[1][ind_train]]
            cvalid_data = [valid_data[0][ind_valid], valid_data[1][ind_valid]]

            for i, C in enumerate(Cs):
                for j, g in enumerate(gamma):
                    clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=g,
                                  coef0=0.0, shrinking=True, probability=False,
                                  tol=0.001, cache_size=10000,
                                  class_weight=None, verbose=False,
                                  max_iter=-1, decision_function_shape=strat,
                                  random_state=None)
                    clf.fit(ctrain_data[0], ctrain_data[1])

                    #out_train = clf.predict_proba(ctrain_data[0])
                    #out_valid = clf.predict_proba(cvalid_data[0])
                    #ans[i, l, 0] += log_loss(ctrain_data[1], out_train)
                    #ans[i, l, 1] += log_loss(cvalid_data[1], out_valid)

                    out_train = clf.decision_function(train_data[0])
                    out_valid = clf.decision_function(valid_data[0])
                    ans[i, j, l, 2] += hinge_loss(train_data[1], out_train,
                                                  range(num_classes))
                    ans[i, j, l, 3] += hinge_loss(valid_data[1], out_valid,
                                                  range(num_classes))

    ans /= num_iter

    np.savez("svm_bv_" + kernel + '_' + strat, ans=ans, Cs=Cs,
             num_iter=num_iter, num_classes=num_classes,
             samples_per_class=samples_per_class)

    return ans
def get_soft_linear_svm_w_b(self, subset_c):
    x = self.X[subset_c]
    y = self.Y[subset_c]
    reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
    model = svm.SVC(kernel='linear', C=reg_par)
    model.fit(x, y)
    y_pred = model.decision_function(x)
    w = model.coef_
    reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]
    hinge_machine_loss = hinge_loss(y, y_pred)
    hinge_machine_loss *= y_pred.shape[0]
    return reg + hinge_machine_loss
def optimizelinearsvc(sigma, knn, penalty, tolerance):
    # using the package
    x1 = [[0] * D for k in range(knn)]
    lossfunction = [0] * knn
    totalloss = 0
    beta = [0] * knn
    beta0 = [0] * knn
    x_training = [[] for k in range(knn)]
    y_training = [[] for k in range(knn)]
    a_training = [0] * knn
    # print(np.array(x_training).shape)
    # print(np.array(y_training).shape)
    for k in range(knn):
        for i in range(N):
            if sigma[i][k] >= 0.5:
                x_training[k].append(x[i])
                y_training[k].append(y[i])
    # index = [[0]*J for k in range(knn)]
    # indicator = [0]*knn
    # for k in range(knn):
    #     for j in range(J):
    #         for i in range(N):
    #             if y_training[k][i] == j:
    #                 index[k][j] = 1
    #                 break
    for k in range(knn):
        lin_clf = LinearSVC(C=penalty, tol=tolerance)
        lin_clf.fit(x_training[k], y_training[k])
        beta[k] = lin_clf.coef_
        beta0[k] = lin_clf.intercept_
        a_training[k] = accuracy_score(y_training[k],
                                       lin_clf.predict(x_training[k]))
        b_training = lin_clf.decision_function(x_training[k])
        lossfunction[k] = hinge_loss(y_training[k], b_training)
    for k in range(knn):
        totalloss = totalloss + a_training[k] * sum(sigma[i][k] for i in range(N))
    # print('accuracy', accuracy_score(vay, lin_clf.predict(x_testing_extraction)))
    for k in range(knn):
        for d in range(D):
            x1[k][d] = sum(sigma[i][k] * x[i][d] for i in range(N)) \
                / sum(sigma[i][k] for i in range(N))
    return a_training, beta, beta0, x1, lossfunction, totalloss / N
def prediction(X_train, X_test, y_train, y_test):
    [classifier_names, classifiers] = build_classifiers()
    for cidx, clf_name in enumerate(classifier_names):
        clf = classifiers[cidx].fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if hasattr(clf, "decision_function"):
            pred_decision = clf.decision_function(X_test)
        else:
            pred_decision = clf.predict_proba(X_test)  # [:, 1]
        performances = [cohen_kappa_score(y_test, y_pred),
                        hinge_loss(y_test, pred_decision),
                        matthews_corrcoef(y_test, y_pred)]
        print("%s\t cohen_kappa_score: %.2f\t hinge_loss: %.2f\tmatthews_corrcoef:%.2f "
              % (clf_name, performances[0], performances[1], performances[2]))
        cm = confusion_matrix(y_test, y_pred)
    return ["%.2f" % item for item in performances], cm
def cross_validation(self, model):
    # kfold = cross_validation.KFold(self.train_x.shape[0], n_folds=5,
    #                                shuffle=True, random_state=self.random_state)
    kfold = cross_validation.StratifiedKFold(
        self.train_y, n_folds=self.k_fold_, shuffle=True,
        random_state=self.random_state)
    scores = {'auc': list(), 'hinge_loss': list(), 'log_loss': list(),
              'accuracy': list(), 'precision': list(), 'recall': list(),
              'f1_value': list()}
    #scores = list()
    preds = np.zeros(len(self.train_y))
    i = 0
    for train_idx, test_idx in kfold:
        print(' --------- fold {0} ---------- '.format(i))
        # Select index/columns explicitly by position; see
        # sinhrks.hatenablog.com/entry/2014/11/12/233216
        train_x = self.train_x.iloc[train_idx]
        train_y = self.train_y[train_idx]
        test_x = self.train_x.iloc[test_idx]
        test_y = self.train_y[test_idx]
        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        preds[test_idx] = pred
        score = metrics.roc_auc_score(test_y, pred)  # auc
        scores['auc'].append(score)
        score = metrics.hinge_loss(test_y, pred)  # hinge_loss
        scores['hinge_loss'].append(score)
        score = metrics.log_loss(test_y, pred)  # log_loss
        scores['log_loss'].append(score)
        #score = metrics.accuracy_score(test_y, pred)  # accuracy
        #scores['accuracy'].append(score)
        #score = metrics.precision_score(test_y, pred)  # precision
        #scores['precision'].append(score)
        #score = metrics.recall_score(test_y, pred)  # recall
        #scores['recall'].append(score)
        #score = metrics.f1_score(test_y, pred)  # f_value
        #scores['f1_value'].append(score)
        i += 1
    for key in scores.keys():
        scores[key] = np.asarray(scores[key], dtype=np.float32)
        #print scores.mean(), scores.std()
    return scores, preds
def compare(self, model, X, y):
    """Compares the score of a sample in two models.

    Returns a cross-validation of metrics, predictions and score.

    :param model: model
    :param X: data
    :type model: MultiModelClassifier
    :type X: ndarray or scipy.sparse matrix, (n_samples, n_features)
    """
    scores = {}
    y_pred = self.predict(X)
    y_pred_prob = self._predict_prob(X)
    other_y_pred = model.predict(X)
    other_y_pred_prob = model._predict_prob(X)
    self._guess_problem(y)
    if self._problem == 'binary':
        # Binary-only metrics
        scores['PreRec'] = (metrics.precision_recall_curve(y, y_pred_prob),
                            metrics.precision_recall_curve(y, other_y_pred_prob))
        scores['ROC'] = (metrics.roc_curve(y, y_pred_prob),
                         metrics.roc_curve(y, other_y_pred_prob))
        scores['Kappa'] = (metrics.cohen_kappa_score(y, y_pred),
                           metrics.cohen_kappa_score(y, other_y_pred))
        scores['Confusion'] = (metrics.confusion_matrix(y, y_pred),
                               metrics.confusion_matrix(y, other_y_pred))
        scores['HL'] = (metrics.hinge_loss(y, y_pred_prob),
                        metrics.hinge_loss(y, other_y_pred_prob))
        scores['MCC'] = (metrics.matthews_corrcoef(y, y_pred),
                         metrics.matthews_corrcoef(y, other_y_pred))
    return scores
def linear_test(iris_X, iris_y):
    optimal_c, iris_X_train, iris_y_train, iris_X_valid, iris_y_valid, \
        iris_X_test, iris_y_test = find_c(iris_X, iris_y)

    # Test loss and accuracy for optimal c
    svc = svm.SVC(kernel='linear', C=optimal_c)
    svc.fit(iris_X_train, iris_y_train)
    predictions = svc.predict(iris_X_test)
    test_score = accuracy_score(iris_y_test, predictions)
    prediction_dec = svc.decision_function(iris_X_test)
    h_loss_t = hinge_loss(iris_y_test, prediction_dec)

    print(" Linear>>>>")
    print("Testing Score and loss for Optimal C= {} is : {}, {} \n".format(
        optimal_c, test_score * 100.0, h_loss_t))
def apply_model_linear(iris_X_train, iris_y_train, iris_X_valid, iris_y_valid, c):
    svc = svm.SVC(kernel='linear', C=c)
    svc.fit(iris_X_train, iris_y_train)

    # Validation loss and accuracy
    predictions = svc.predict(iris_X_valid)
    valid_score = accuracy_score(iris_y_valid, predictions)
    prediction_dec = svc.decision_function(iris_X_valid)
    h_loss_v = hinge_loss(iris_y_valid, prediction_dec)
    # print("Validation Score and loss for C= {} is : {}, {}".format(
    #     c, valid_score * 100.0, h_loss_v))
    return h_loss_v
def feature_update(x, v, j, Z, phi_prototypes, prototypes, labels, bag_index):
    lamb = 1  # NOTE: where does this lambda come from?
    Z_copy = deepcopy(Z)
    # Restored from the commented-out original so the return value is defined:
    phi_prototypes_prim = deepcopy(phi_prototypes)
    phi_prototypes_prim[bag_index] = j
    #prototypes_prim[bag_index] = deepcopy(train_bags[bag_index][j])
    for i_index_bag in range(0, len(train_bags)):  # for each bag
        Z_copy[i_index_bag, j] = np.exp(
            -lamb * _min_hau_bag(train_bags[i_index_bag], [x]))
        pred_decision = lin_svc.decision_function(Z_copy)
        v_prim = hinge_loss(labels, pred_decision)
        if v_prim > v:
            # Candidate is already worse than the incumbent; stop early
            v_prim = np.inf
            break
    return v_prim, Z_copy, phi_prototypes_prim
def calculate_loss(y_true, y_score, outer_loss_function):
    if outer_loss_function == 'logistic':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_score]
        loss = log_loss(y_true, y_pred_holdout_proba)
    elif outer_loss_function == 'squared':
        loss = mean_squared_error(y_true, y_score)
    elif outer_loss_function == 'hinge':
        loss = hinge_loss(y_true, y_score)
    elif outer_loss_function == 'pr-auc':
        loss = -average_precision_score(y_true, y_score)
    elif outer_loss_function == 'roc-auc':
        fpr, tpr, _ = roc_curve(y_true, y_score)
        loss = 1 - auc(fpr, tpr)
    else:
        # Guard against returning an unbound loss for unknown names
        raise KeyError('Invalid outer loss function')
    return loss
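# A quick usage sketch of the hinge branch, reusing the margins from the
# binary tests earlier in this section (assumes calculate_loss and the
# sklearn.metrics imports are in scope):
#
# Per the hand computation above, this should print 1.2 / 4 = 0.3.
print(calculate_loss([-1, 1, 1, -1], [-8.5, 0.5, 1.5, -0.3], 'hinge'))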
def get_soft_kernel_svm_w_b(self, subset_c):
    x = self.X[subset_c]
    y = self.Y[subset_c]
    reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
    model = svm.SVC(C=reg_par, kernel='poly', degree=2, gamma='auto')
    model.fit(x, y)
    coef = model.dual_coef_
    sv = model.support_vectors_
    w = np.dot(coef, sv)
    reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]
    y_pred = model.decision_function(x)
    hinge_machine_loss = hinge_loss(y, y_pred)
    hinge_machine_loss *= y_pred.shape[0]
    return reg + hinge_machine_loss
def cross_validation(self, model):
    # kfold = cross_validation.KFold(self.train_x.shape[0], n_folds=5,
    #                                shuffle=True, random_state=self.random_state)
    kfold = cross_validation.StratifiedKFold(
        self.train_y, n_folds=self.k_fold_, shuffle=True,
        random_state=self.random_state)
    scores = {'auc': list(), 'hinge_loss': list(), 'log_loss': list(),
              'accuracy': list(), 'precision': list(), 'recall': list(),
              'f1_value': list()}
    #scores = list()
    preds = np.zeros(len(self.train_y))
    i = 0
    for train_idx, test_idx in kfold:
        print(' --------- fold {0} ---------- '.format(i))
        train_x = self.train_x.toarray()[train_idx]
        train_y = self.train_y[train_idx]
        test_x = self.train_x.toarray()[test_idx]
        test_y = self.train_y[test_idx]
        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        preds[test_idx] = pred
        score = metrics.roc_auc_score(test_y, pred)  # auc
        scores['auc'].append(score)
        score = metrics.hinge_loss(test_y, pred)  # hinge_loss
        scores['hinge_loss'].append(score)
        score = metrics.log_loss(test_y, pred)  # log_loss
        scores['log_loss'].append(score)
        #score = metrics.accuracy_score(test_y, pred)  # accuracy
        #scores['accuracy'].append(score)
        #score = metrics.precision_score(test_y, pred)  # precision
        #scores['precision'].append(score)
        #score = metrics.recall_score(test_y, pred)  # recall
        #scores['recall'].append(score)
        #score = metrics.f1_score(test_y, pred)  # f_value
        #scores['f1_value'].append(score)
        i += 1
    for key in scores.keys():
        scores[key] = np.asarray(scores[key], dtype=np.float32)
        #print key, scores[key].mean(), scores[key].std()
    return scores, preds
def learn_SVM(X_train, Y_train, X_test, Y_test, kernel='linear', C=1,
              gamma=None, print_result=False, print_all=True):
    """
    Train an SVM model from extracted characteristics of a signal
    INPUT : train/test lists
    OUTPUT : training score, confusion matrix
    """
    # initialization of the SVM model
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    if gamma is not None:
        SVM_model = svm.SVC(kernel=kernel, C=C, gamma=gamma,
                            class_weight='balanced')
    else:
        SVM_model = svm.SVC(kernel=kernel, C=C, class_weight='balanced')

    # training the model
    SVM_model.fit(X_train, Y_train)

    pred_decision = SVM_model.decision_function(X_test)
    loss = hinge_loss(y_true=Y_test, pred_decision=pred_decision)

    assert print_result in [True, False]
    # if True, display the score of the trained model on the test-set
    # and the confusion matrix
    if print_result:
        score, confusion_matrix = print_SVM_results(
            SVM_model, X_test, Y_test, kernel=kernel, C=C, gamma=gamma,
            print_all=print_all)
    else:
        # testing the model
        score = SVM_model.score(X_test, Y_test)
        confusion_matrix = None

    return SVM_model, loss, score, confusion_matrix
def train_svm_poly(X_trn, y_trn, l, P, g):
    splits = 5
    kf = KFold(n_splits=splits, shuffle=True)
    clf = svm.SVC(kernel='poly', C=1 / (2 * l), degree=P, coef0=g, gamma=1)
    sum_hinge_loss = 0
    for train_index, test_index in kf.split(X_trn):
        # Split train-test
        X_train, X_test = X_trn[train_index], X_trn[test_index]
        y_train, y_test = y_trn[train_index], y_trn[test_index]
        # Train the model
        clf.fit(X_train, y_train)
        predictions = clf.decision_function(X_test)
        sum_hinge_loss += hinge_loss(y_test, predictions)
    avg_hinge_loss = sum_hinge_loss / splits
    return avg_hinge_loss
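# The C = 1 / (2 * l) above, and the reg_par = 1 / (2 * lamb * n) used by the
# get_soft_*_svm helpers earlier, both come from equating scikit-learn's SVC
# objective (1/2)*||w||^2 + C * sum(hinge) with a regularized form
# lambda*||w||^2 + hinge term. This helper is an illustrative assumption,
# not part of the original code:
def lambda_to_C(lam, n_samples=None):
    # With a summed hinge term: C = 1 / (2 * lambda).
    # With the hinge term averaged over n samples: C = 1 / (2 * lambda * n).
    if n_samples is None:
        return 1.0 / (2.0 * lam)
    return 1.0 / (2.0 * lam * n_samples)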
def apply_model_rbf(iris_X_train, iris_y_train, iris_X_valid, iris_y_valid,
                    c, g, count, to_plot):
    svc = svm.SVC(kernel='rbf', gamma=g, C=c)
    svc.fit(iris_X_train, iris_y_train)

    # Validation loss and accuracy
    predictions = svc.predict(iris_X_valid)
    valid_score = accuracy_score(iris_y_valid, predictions)
    prediction_dec = svc.decision_function(iris_X_valid)
    h_loss_v = hinge_loss(iris_y_valid, prediction_dec)

    if to_plot:
        p = plot_helper.plot_helper(iris_X_train, iris_y_train, c, g, svc, count)
        p.plot()
    # print("Validation Score and loss for C= {} and gamma={} is : {}, {}".format(
    #     c, g, valid_score * 100.0, h_loss_v))
    return h_loss_v
def get_soft_linear_svm_w(self, subset_c):
    x = self.X[subset_c]
    y = self.Y[subset_c]
    reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
    model = svm.LinearSVC(fit_intercept=False, C=reg_par, loss='hinge')
    model.fit(x, y)
    y_pred = model.decision_function(x)
    w = model.coef_
    b = model.intercept_
    assert (b == 0)
    reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]
    hinge_machine_loss = hinge_loss(y, y_pred)
    hinge_machine_loss *= y_pred.shape[0]
    return reg + hinge_machine_loss
def cross_validation(self, model):
    kfold = cross_validation.StratifiedKFold(
        self.train_y, n_folds=self.k_fold_, shuffle=True,
        random_state=self.random_state)
    scores = {'auc': list(), 'hinge_loss': list(), 'log_loss': list(),
              'accuracy': list(), 'precision': list(), 'recall': list(),
              'f1_value': list()}
    #scores = list()
    preds = np.zeros(len(self.train_y))
    i = 0
    for train_idx, test_idx in kfold:
        print(' --------- fold {0} ---------- '.format(i))
        # Select index/columns explicitly by position; see
        # sinhrks.hatenablog.com/entry/2014/11/12/233216
        train_x = self.train_x.iloc[train_idx]
        train_y = self.train_y[train_idx]
        test_x = self.train_x.iloc[test_idx]
        test_y = self.train_y[test_idx]
        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        preds[test_idx] = pred
        score = metrics.roc_auc_score(test_y, pred)  # auc
        scores['auc'].append(score)
        score = metrics.hinge_loss(test_y, pred)  # hinge_loss
        scores['hinge_loss'].append(score)
        score = metrics.log_loss(test_y, pred)  # log_loss
        scores['log_loss'].append(score)
        #score = metrics.accuracy_score(test_y, pred)  # accuracy
        #scores['accuracy'].append(score)
        #score = metrics.precision_score(test_y, pred)  # precision
        #scores['precision'].append(score)
        #score = metrics.recall_score(test_y, pred)  # recall
        #scores['recall'].append(score)
        #score = metrics.f1_score(test_y, pred)  # f_value
        #scores['f1_value'].append(score)
        i += 1
    for key in scores.keys():
        scores[key] = np.asarray(scores[key], dtype=np.float32)
        #print scores.mean(), scores.std()
    return scores, preds
def getResult(self, predict, data_set):
    y_true, y_predict = control.calculate_entire_ds(predict, data_set)
    result = metrics.classification_report(y_true, y_predict)
    result += "\nAccuracy classification: %f\n" % metrics.accuracy_score(y_true, y_predict)
    result += "F1 score: %f\n" % metrics.f1_score(y_true, y_predict)
    result += "Fbeta score: %f\n" % metrics.fbeta_score(y_true, y_predict, beta=0.5)
    result += "Hamming loss: %f\n" % metrics.hamming_loss(y_true, y_predict)
    result += "Hinge loss: %f\n" % metrics.hinge_loss(y_true, y_predict)
    result += "Jaccard similarity: %f\n" % metrics.jaccard_similarity_score(y_true, y_predict)
    result += "Precision: %f\n" % metrics.precision_score(y_true, y_predict)
    result += "Recall: %f\n" % metrics.recall_score(y_true, y_predict)
    if self.is_binary():
        result += "Average precision: %f\n" % metrics.average_precision_score(y_true, y_predict)
        result += "Matthews correlation coefficient: %f\n" % metrics.matthews_corrcoef(y_true, y_predict)
        result += "Area Under the Curve: %f" % metrics.roc_auc_score(y_true, y_predict)
    return result
def run_question_17_svm(x_trn, y_trn, c=[2, 20, 200], gamma=[1, 0.01, 0.001],
                        kernel='rbf'):
    for penalty in c:
        for g in gamma:
            hinge_losses = []
            kf = KFold(n_splits=5, shuffle=True, random_state=3815)
            for train_index, test_index in kf.split(x_trn):
                x_trn_5, x_tst_5 = x_trn[train_index], x_trn[test_index]
                y_trn_5, y_tst_5 = y_trn[train_index], y_trn[test_index]
                clf = svm.SVC(kernel=kernel, C=1/penalty, gamma=g)
                clf.fit(x_trn_5, y_trn_5)
                y_pred = clf.decision_function(x_tst_5)
                hinge_losses.append(hinge_loss(y_tst_5, y_pred))
            mean_hinge_loss = sum(hinge_losses) / 5
            print("The mean hinge loss with 5-fold cross validation"
                  " with lambda = " + str(penalty) + " and gamma = "
                  + str(g) + " is: ", mean_hinge_loss)
def test_hinge_loss_multiclass_with_missing_labels():
    pred_decision = np.array([
        [0.36, -0.17, -0.58, -0.99],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17]
    ])
    y_true = np.array([0, 1, 2, 1, 2])
    labels = np.array([0, 1, 2, 3])
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][2] + pred_decision[4][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision, labels=labels),
                 dummy_hinge_loss)
def test_hinge_loss_multiclass():
    pred_decision = np.array([
        [0.36, -0.17, -0.58, -0.99],
        [-0.54, -0.37, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17],
        [-0.54, -0.38, -0.48, -0.58],
        [-2.36, -0.79, -0.27, 0.24],
        [-1.45, -0.58, -0.38, -0.17]
    ])
    y_true = np.array([0, 1, 2, 1, 3, 2])
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][3] + pred_decision[4][2],
        1 - pred_decision[5][2] + pred_decision[5][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision), dummy_hinge_loss)
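# The reference computations in these multiclass tests implement the
# Crammer-Singer margin: per sample, loss = max(0, 1 - (score of the true
# class - highest score among the other classes)). A short vectorized sketch
# (new here, not from the tests) that computes the same quantity, assuming
# y_true holds integer column indices into pred_decision:
import numpy as np

def multiclass_hinge(y_true, pred_decision):
    # Margin between the true-class score and the best competing score,
    # clipped at zero and averaged over samples.
    pred_decision = np.asarray(pred_decision, dtype=float)
    n = pred_decision.shape[0]
    true_scores = pred_decision[np.arange(n), y_true]
    masked = pred_decision.copy()
    masked[np.arange(n), y_true] = -np.inf
    best_other = masked.max(axis=1)
    return np.maximum(0.0, 1.0 - (true_scores - best_other)).mean()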
def evaluate(estimator, dev_X, dev_y):
    print('evaluating on development set', flush=True)
    guess_dev = estimator.predict(dev_X)

    score_roc_auc_dev = roc_auc_score(dev_y, guess_dev)
    print('{:.4f} -- roc auc'.format(score_roc_auc_dev))

    score_brier_loss_dev = brier_score_loss(dev_y, guess_dev)
    print('{:.4f} -- brier loss'.format(score_brier_loss_dev))

    score_log_loss_dev = log_loss(dev_y, estimator.predict_proba(dev_X))
    print('{:.4f} -- log loss'.format(score_log_loss_dev))

    # Map the 0 class to -1 so the hard predictions can stand in for margins
    guess_dev_negative_one = guess_dev.copy().astype('int8')
    guess_dev_negative_one[guess_dev_negative_one == 0] = -1
    # decision_function not implemented for this estimator:
    # score_hinge_loss_dev = hinge_loss(dev_y, estimator.decision_function(dev_X))
    score_hinge_loss_dev = hinge_loss(dev_y, guess_dev_negative_one)
    print('{:.4f} -- hinge loss'.format(score_hinge_loss_dev))

    score_matthews_corrcoef_dev = matthews_corrcoef(dev_y, guess_dev_negative_one)
    print('{:.4f} -- matthews_corrcoef'.format(score_matthews_corrcoef_dev))
    print(flush=True)

    return (score_roc_auc_dev, score_brier_loss_dev, score_log_loss_dev,
            score_hinge_loss_dev, score_matthews_corrcoef_dev)
param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8) Clf.fit(TrainFvs,TrainLabels) PredictedLabels = Clf.predict(TestFvs) print '*'*100 print 'classification report' print '-'*20 Accuracy = np.mean(PredictedLabels == TestLabels) print "Test Set Accuracy = ", Accuracy print(metrics.classification_report(TestLabels, PredictedLabels, target_names=['Neg', 'Pos'])) print "Accuracy classification score:", metrics.accuracy_score(TestLabels, PredictedLabels) print "Hamming loss:", metrics.hamming_loss(TestLabels, PredictedLabels) print "Average hinge loss:", metrics.hinge_loss(TestLabels, PredictedLabels) print "Log loss:", metrics.log_loss(TestLabels, PredictedLabels) print "F1 Score:", metrics.f1_score(TestLabels, PredictedLabels) print "Zero-one classification loss:", metrics.zero_one_loss(TestLabels, PredictedLabels) print '*'*100 print 'total vocab size: {} '.format(len(model.vocab.keys())) # for k,v in model.vocab.iteritems(): # print k # print v # raw_input()
def main_func(datanm, samples_per_class, C, num_classes, gamma,
              num_iter=100, kernel='linear', strat='ovr'):
    data, labels = load_full(datanm, samples_per_class)

    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3,
                                 train_size=0.7, random_state=None)

    recall = np.zeros((num_classes+1, 2))
    precision = np.zeros((num_classes+1, 2))
    f1 = np.zeros((num_classes+1, 2))
    accuracy = np.zeros((2))
    logloss = np.zeros((2))
    hingeloss = np.zeros((2))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index, :], labels[test_index]]

        clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=gamma, coef0=0.0,
                      shrinking=True, probability=False, tol=0.001,
                      cache_size=10000, class_weight=None, verbose=False,
                      max_iter=-1, decision_function_shape=strat,
                      random_state=None)
        clf.fit(train_data[0], train_data[1])

        #out_train = clf.predict_proba(train_data[0])
        #out_valid = clf.predict_proba(valid_data[0])
        #logloss[0] += log_loss(train_data[1], out_train)
        #logloss[1] += log_loss(valid_data[1], out_valid)

        out_train = clf.decision_function(train_data[0])
        out_valid = clf.decision_function(valid_data[0])
        hingeloss[0] += hinge_loss(train_data[1], out_train)
        hingeloss[1] += hinge_loss(valid_data[1], out_valid)

        out_train = clf.predict(train_data[0])
        out_valid = clf.predict(valid_data[0])

        accuracy[0] += accuracy_score(train_data[1], out_train)
        accuracy[1] += accuracy_score(valid_data[1], out_valid)

        precision[:-1, 0] += precision_score(train_data[1], out_train, average=None)
        precision[-1, 0] += precision_score(train_data[1], out_train, average='macro')
        precision[:-1, 1] += precision_score(valid_data[1], out_valid, average=None)
        precision[-1, 1] += precision_score(valid_data[1], out_valid, average='macro')

        recall[:-1, 0] += recall_score(train_data[1], out_train, average=None)
        recall[-1, 0] += recall_score(train_data[1], out_train, average='macro')
        recall[:-1, 1] += recall_score(valid_data[1], out_valid, average=None)
        recall[-1, 1] += recall_score(valid_data[1], out_valid, average='macro')

        f1[:-1, 0] += f1_score(train_data[1], out_train, average=None)
        f1[-1, 0] += f1_score(train_data[1], out_train, average='macro')
        f1[:-1, 1] += f1_score(valid_data[1], out_valid, average=None)
        f1[-1, 1] += f1_score(valid_data[1], out_valid, average='macro')

    f1 /= num_iter
    recall /= num_iter
    precision /= num_iter
    logloss /= num_iter
    accuracy /= num_iter
    # The original omitted this division, leaving hingeloss as a sum
    # rather than a per-iteration average like the other metrics
    hingeloss /= num_iter

    np.savez("svm_final_" + kernel + '_' + strat, accuracy=accuracy,
             recall=recall, f1=f1, precision=precision, logloss=logloss,
             C=C, num_iter=num_iter, num_classes=num_classes,
             samples_per_class=samples_per_class, hingeloss=hingeloss)

    return [accuracy, recall, f1, precision, logloss, hingeloss]
def train_svm(train_data, valid_data, test_data, model_dir, C=1.0,
              kernel='rbf', num_classes=10, tol=0.001, max_iterations=-1,
              verbose=False, random_state=12345678, **kwargs):
    """
    Train a Support Vector Machine model on the given data

    Args:
        train_data: Dict with training 'features' and 'labels' (Type: dict)
        valid_data: Dict with validation 'features' and 'labels' (Type: dict)
        test_data: Dict with test 'features', 'labels' and 'file_idxs' (Type: dict)
        model_dir: Directory where the fitted model is saved (Type: str)

    Keyword Args:
        C: SVM regularization hyperparameter (Type: float)
        verbose: If True, print verbose messages (Type: bool)

    Returns:
        clf: Classifier object (Type: sklearn.svm.SVC)
        train_metrics: Metrics on the training data (Type: dict)
        valid_metrics: Metrics on the validation data (Type: dict)
        test_metrics: Metrics on the test data (Type: dict)
    """
    np.random.seed(random_state)
    random.seed(random_state)

    X_train = train_data['features']
    y_train = train_data['labels']
    model_output_path = os.path.join(model_dir, "model.pkl")

    # Create classifier
    clf = SVC(C=C, probability=True, kernel=kernel, max_iter=max_iterations,
              tol=tol, random_state=random_state, verbose=verbose)

    # Fit data and get output for train and valid batches
    LOGGER.debug('Fitting model to data...')
    clf.fit(X_train, y_train)

    LOGGER.info('Saving model...')
    joblib.dump(clf, model_output_path)

    y_train_pred = clf.predict(X_train)
    # Compute new metrics
    classes = np.arange(num_classes)
    train_loss = hinge_loss(y_train, clf.decision_function(X_train),
                            labels=classes)
    train_metrics = compute_metrics(y_train, y_train_pred,
                                    num_classes=num_classes)
    train_metrics['loss'] = train_loss
    train_msg = 'Train - hinge loss: {}, acc: {}'
    LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy']))

    if valid_data:
        X_valid = valid_data['features']
        y_valid = valid_data['labels']
        y_valid_pred = clf.predict(X_valid)
        valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid),
                                labels=classes)
        valid_metrics = compute_metrics(y_valid, y_valid_pred,
                                        num_classes=num_classes)
        valid_metrics['loss'] = valid_loss
        valid_msg = 'Valid - hinge loss: {}, acc: {}'
        LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy']))
    else:
        valid_metrics = {}

    # Evaluate model on test data
    if test_data:
        X_test = test_data['features']
        y_test_pred_frame = clf.predict_proba(X_test)
        # Average frame-level probabilities within each file, then take
        # the argmax as the file-level prediction
        y_test_pred = []
        for start_idx, end_idx in test_data['file_idxs']:
            class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax()
            y_test_pred.append(class_pred)
        y_test_pred = np.array(y_test_pred)
        test_metrics = compute_metrics(test_data['labels'], y_test_pred,
                                       num_classes=num_classes)
    else:
        test_metrics = {}

    return clf, train_metrics, valid_metrics, test_metrics
def test_hinge_loss(self):
    result = self.df.metrics.hinge_loss()
    expected = metrics.hinge_loss(self.target, self.decision)
    self.assertEqual(result, expected)
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression(C=0.1, max_iter=100, n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1, n_iter=1, n_jobs=8, class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1, n_jobs=8, class_weight='balanced')

    # The last assignment wins; the grid-searched logistic regression is used
    Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    Clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=8, class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=8, class_weight='balanced'),
                       cv=5,
                       param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C=0.1, class_weight='balanced', max_iter=1000), cv=3,
    #                    param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)

    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2
    print 'Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram)

    PosSamples = [l.split('\t')[0].strip() for l in open(File).xreadlines()
                  if l.strip().endswith('1')]  # [:100]
    NegSamples = [l.split('\t')[0].strip() for l in open(File).xreadlines()
                  if l.strip().endswith('0')]  # [:100]
    print 'loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples))

    X = PosSamples + NegSamples
    y = [1 for _ in xrange(len(PosSamples))] + [-1 for _ in xrange(len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random.randint(0, 100))
    print '# TrainLabels', len(y_train)
    print '# TestLabels', len(y_test)

    print 'performing CVectorizer'
    CVectorizer = CountVectorizer(lowercase=True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer=SGTokenizer,
                                  tokenizer=Tokenizer,
                                  ngram_range=(1, 2),
                                  dtype=np.float64,
                                  decode_error='ignore',
                                  max_df=0.8)
    print 'performing TfidfTransformer and Normalizer'
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()

    print 'creating Train and Test FVs'
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print 'feat ext time', time() - T0

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print 'Train/test split'
    print TrainFVs.shape
    print TestFVs.shape
    # raw_input('hit any key...')

    print 'training classifier with train samples shape:', TrainFVs.shape
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit(TrainFVs, y_train)  # re-train on current training set (daily)
    print 'batch fitted'
    print 'training time', time() - T0
    # memory_dump('after_train_mem.txt')

    print 'testing classifier with test samples shape:', TestFVs.shape
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print 'testing time', time() - T0
    # memory_dump('after_test_mem.txt')

    print '*'*100
    print 'classification report'
    print '-'*20
    Accuracy = np.mean(PredictedLabels == y_test)
    print "Test Set Accuracy = ", Accuracy
    print(metrics.classification_report(y_test, PredictedLabels,
                                        target_names=['Neg', 'Pos']))
    print "Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels)
    print "Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels)
    print "Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels)
    print "Log loss:", metrics.log_loss(y_test, PredictedLabels)
    print "F1 Score:", metrics.f1_score(y_test, PredictedLabels)
    print "Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels)
    print '*'*100

    Vocab = CVectorizer.get_feature_names()
    # print Vocab[:100]
    # raw_input()

    try:
        FeatureImportances = Clf.coef_[0]
    except:
        FeatureImportances = Clf.best_estimator_.coef_[0]
    print FeatureImportances.shape
    raw_input()

    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
        print Vocab[PosFIndex], '+-', Vocab[NegFIndex]

    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1], TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs * FeatureImportancesSparseArray
    print AllFVsTimesW.shape

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print 'Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind])
        # print TestFV
        # print TestFV.shape
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print CurTestFV.shape
        # raw_input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps = CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp
        # print NegTopFeatureIndices, NegFeatImps
        if y_test[Ind] == 1:
            print 'top positive feats:', colored(
                ', '.join(['[' + Vocab[PosFIndex] + ']'
                           for PosFIndex in PosTopFeatureIndices]), 'green')
        else:
            print 'top negative feats:', colored(
                ', '.join(['[' + Vocab[NegFIndex] + ']'
                           for NegFIndex in NegTopFeatureIndices]), 'red')
        Ind += 1
        raw_input()