def compute_ranking(learnFullModel=False): path='/home/arya/PubMed/GEO/Datasets/' modelpath=path+'libsvm/model/' if not os.path.exists(modelpath): os.makedirs(modelpath) outpath='{}libsvm/out/'.format(path) sys.stdout=open('{}SVM.log'.format('/home/arya/PubMed/GEO/Log/'),'w') sys.stderr=open('{}SVM.err'.format('/home/arya/PubMed/GEO/Log/'),'w') if not os.path.exists(outpath): os.makedirs(outpath) X, Y = load_svmlight_file(path+'Corpus.libsvm',multilabel=True) Y=np.array(Y) if learnFullModel: model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y) joblib.dump(model, modelpath+'Model.libsvm') print 'The Full Model is Saved!' Folds=pd.read_pickle(path+'Folds.df') for fold in range(Folds.shape[1]): start=time() Xtr,Ytr=X[Folds[fold].values,:],Y[Folds[fold].values] print 'learning on fold...',Xtr.shape,fold, sys.stdout.flush() model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(Xtr, Ytr) Xte=X[~Folds[fold].values,:] labels=model.classes_ # Yte=remove_unknown_classes(Yte, labels) # idx=np.array(map(lambda x: len(x)>0,Yte)) # Yte=np.array(Yte)[idx] # Xte=Xte[idx] print 'predicting...',Xte.shape, sys.stdout.flush() pd.DataFrame(columns=labels,data=model.decision_function(Xte)).to_pickle('{}deci.{}.df'.format(outpath,fold)) # (pd.DataFrame(columns=labels,data=MultiLabelBinarizer().fit_transform(list(Yte)+[labels]))).iloc[:-1].to_pickle('{}labels.{}.df'.format(outpath,fold)) # ranking.to_pickle('{}ranking.{}.df'.format(outpath,fold)) print 'Done in {:.0f} minutes'.format((time()-start)/60.0)
def setUp(self):
    """Build the iris fixtures shared by the tests of this case.

    Stores the raw data, the binarized targets, a ``ModelFrame`` fitted with a
    one-vs-rest SVC, and the predictions, probabilities and decision values
    of an identically configured plain scikit-learn estimator.
    """
    from sklearn.multiclass import OneVsRestClassifier
    import sklearn.preprocessing as pp
    import sklearn.svm as svm

    iris = datasets.load_iris()
    self.data = iris.data
    # One indicator column per iris class.
    self.target = pp.LabelBinarizer().fit_transform(iris.target)

    self.df = pdml.ModelFrame(self.data, target=self.target)
    self.assertEqual(self.df.shape, (150, 7))

    # Estimator driven through the ModelFrame API.
    frame_estimator = OneVsRestClassifier(
        svm.SVC(probability=True, random_state=self.random_state))
    self.df.fit(frame_estimator)
    self.df.predict(frame_estimator)
    self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

    # Reference estimator driven directly through scikit-learn.
    reference = OneVsRestClassifier(
        svm.SVC(probability=True, random_state=self.random_state))
    reference.fit(self.data, self.target)
    self.pred = reference.predict(self.data)
    self.proba = reference.predict_proba(self.data)
    self.decision = reference.decision_function(self.data)

    # Label order passed to the classification-report helpers.
    self.labels = np.array([2, 1, 0])
class ClassDistanceMapper(TransformerMixin):
    """Map samples to their per-class decision values.

    A one-vs-rest classifier is fitted with one binary problem per class, and
    ``transform`` returns the classifier's decision-function output, i.e. one
    column per class instead of one column per input feature.
    """

    def __init__(self):
        """Create the one-vs-rest classifier around a LogisticRegression.

        The base estimator is a free choice; this class only relies on
        ``fit`` and ``decision_function`` of the wrapper.
        """
        base_estimator = LogisticRegression()
        self.clf = OneVsRestClassifier(base_estimator)

    def fit(self, X, y):
        """Fit the wrapped one-vs-rest classifier and return ``self``."""
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """Return the decision-function values of every sample in ``X``."""
        distances = self.clf.decision_function(X)
        return distances
def benchmark(clf_current):
    """Time and score ``clf_current`` wrapped in a one-vs-rest classifier.

    Reads ``X_train``, ``Y_train``, ``X_test`` and ``Y_test`` from the
    enclosing scope.  Predictions are derived with ``df_to_preds`` when the
    base estimator has a ``decision_function``, else with ``probs_to_preds``
    from the predicted probabilities (both called with ``k = 5``).

    Returns a tuple ``(description, f1_score, train_time, test_time)``.
    """
    separator = '_' * 80
    print(separator)
    print("Test performance for: ")
    clf_descr = str(clf_current).partition('(')[0]
    print(clf_descr)

    fit_started = time()
    wrapped = OneVsRestClassifier(clf_current)
    wrapped.fit(X_train, Y_train.toarray())
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    if not hasattr(clf_current, "decision_function"):
        raw_output = wrapped.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(),
                                 probs_to_preds(raw_output, k=5))
    else:
        raw_output = wrapped.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(),
                                 df_to_preds(raw_output, k=5))
    test_time = time() - predict_started

    print("f1-score: %0.7f" % score)
    print("test time: %0.3fs" % test_time)
    print(separator)
    return clf_descr, score, train_time, test_time
def test_ovr_always_present():
    """Test that ovr works with classes that are always present or absent."""
    # Note: this exercises the case where _ConstantPredictor is utilised.
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Label 0 varies with X; labels 1 and 2 are present on every sample.
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    # Fitting a constant label column is expected to emit a UserWarning.
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert_equal(np.unique(y_pred[:, -2:]), 1)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
def test_ovr_fit_predict_sparse():
    """Check one-vs-rest with sparse label matrices of every scipy format.

    For each sparse container the predictions must be sparse, must equal the
    predictions obtained from dense labels, and must agree with thresholding
    ``predict_proba`` at 0.5 and ``decision_function`` at 0.
    """
    sparse_formats = (sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                      sp.dok_matrix, sp.lil_matrix)
    for sparse in sparse_formats:
        base_clf = MultinomialNB(alpha=1)
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=True,
            random_state=0,
        )
        X_train, X_test = X[:80], X[80:]
        Y_train = Y[:80]

        # Dense labels give the reference predictions.
        dense_ovr = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        dense_pred = dense_ovr.predict(X_test)

        sparse_ovr = OneVsRestClassifier(base_clf).fit(X_train,
                                                       sparse(Y_train))
        sparse_pred = sparse_ovr.predict(X_test)

        assert_true(dense_ovr.multilabel_)
        assert_true(sp.issparse(sparse_pred))
        assert_array_equal(sparse_pred.toarray(), dense_pred)

        # predict assigns a label when its probability exceeds 0.5.
        proba = sparse_ovr.predict_proba(X_test)
        assert_array_equal(proba > 0.5, sparse_pred.toarray())

        # Same consistency check for a decision_function based estimator.
        sparse_ovr = OneVsRestClassifier(svm.SVC()).fit(X_train,
                                                        sparse(Y_train))
        from_decision = (sparse_ovr.decision_function(X_test) > 0).astype(int)
        assert_array_equal(from_decision,
                           sparse_ovr.predict(X_test).toarray())
def test_ovr_multilabel_decision_function():
    """Multilabel predictions must equal the sign of the decision function."""
    features, targets = datasets.make_multilabel_classification(
        n_samples=100,
        n_features=20,
        n_classes=5,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    train_features, train_targets = features[:80], targets[:80]
    held_out = features[80:]

    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(train_features, train_targets)

    thresholded = (ovr.decision_function(held_out) > 0).astype(int)
    assert_array_equal(thresholded, ovr.predict(held_out))
def test_ovr_single_label_decision_function():
    """Single-label predictions must equal the sign of the decision function."""
    features, targets = datasets.make_classification(n_samples=100,
                                                     n_features=20,
                                                     random_state=0)
    train_features, train_targets = features[:80], targets[:80]
    held_out = features[80:]

    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(train_features, train_targets)

    positive = ovr.decision_function(held_out).ravel() > 0
    assert_array_equal(positive, ovr.predict(held_out))
def main():
    """Train a one-vs-rest LinearSVC tagger on tf-idf features and report on it.

    Selects train/test document sets by tag, vectorizes the training files,
    fits the classifier on binarized tags, prints a classification report for
    the test set, derives a per-class decision-value threshold at the top 30%
    of test documents, and hands everything to ``print_predictions``.
    """
    sets = select_sets_by_tag(20, 4, tag_names)
    train_ids, test_ids = sets["train"], sets["test"]

    train_tags = fetch_tags(train_ids)
    train_files = id_to_filename(train_ids)

    # Bag-of-words counts read straight from the files, then tf-idf weighting.
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16",
                                 input="filename")
    train_counts = count_vect.fit_transform(train_files)
    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)

    # Tag lists -> binary indicator matrix.
    mlb = MultiLabelBinarizer()
    train_targets = mlb.fit_transform(train_tags)

    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(train_tfidf, train_targets)
    print("classes:{}".format(clf.classes_))

    # Push the test documents through the already fitted transformers.
    test_files = id_to_filename(test_ids)
    test_counts = count_vect.transform(test_files)
    test_tfidf = tfidf_transformer.transform(test_counts)

    predicted_tags = clf.predict(test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(test_ids)
    predicted_probs = clf.decision_function(test_tfidf)

    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),
                                           predicted_tags,
                                           target_names=class_list)
    print(report)

    # Per class: the decision value found at the top-30% position.
    top_percentage = 30
    n_test = len(test_ids)
    threshold_index = int(n_test * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    for column, tag in enumerate(class_list):
        ranked = sorted((predicted_probs[row, column]
                         for row in range(n_test)), reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[tag] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(test_ids, predicted_tags_readable, class_list,
                      class_probablities=predicted_probs,
                      threshold_vals=threshold_vals)
def gensim_classifier():
    """Classify tweets with averaged word2vec features and a one-vs-rest LinearSVC.

    Trains a Word2Vec model on all labelled tweets, averages the word vectors
    per tweet, fits the classifier on the training part, writes the test
    predictions to ``data/w2v_linsvc.csv``, stores the fitted classifier in
    ``model/w2v_linsvc.pkl`` and calls ``evaluate`` on the binarized results.

    Relies on the module-level ``class_list`` and on the project helpers
    ``get_labels``, ``get_labelled_tweets``, ``train_test_split``,
    ``getAvgFeatureVecs``, ``create_directory`` and ``evaluate``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # Split every tweet into its list of words.
    sentences = [tweet.split() for tweet in tweet_list]

    # Parameters for the word2vec model.
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model.
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)

    # Fit the imputer on the training data only and reuse it for the test
    # data, so both parts are filled with the same statistics and keep the
    # same columns.
    imputer = Imputer()
    train_vector = imputer.fit_transform(train_vector)
    test_vector = imputer.transform(test_vector)

    # Train model and predict.
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(
        train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # Output result to csv.
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # Persist the *fitted* one-vs-rest classifier.  OneVsRestClassifier fits
    # clones of ``model``, so ``model`` itself stays unfitted and dumping it
    # would store an estimator that cannot predict.
    create_directory('model')
    joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

    # Evaluation.
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score,
             'w2v_linsvc')
def PR_multi_class(data_train, data_test, data_test_vectors):
    """Plot per-class precision-recall curves of a one-vs-rest linear SVC.

    The training targets are binarized for classes 0, 1 and 2, half of the
    training vectors is used to fit the classifier, and its decision values
    on ``data_test_vectors`` are compared against the binarized test targets.

    NOTE(review): the training features come from ``data_train_vectors``,
    which is not a parameter of this function and must exist in an enclosing
    scope - confirm this is intended.

    Always returns 0.
    """
    class_ids = [0, 1, 2]
    y_train_label = label_binarize(data_train.target, classes=class_ids)
    n_classes = y_train_label.shape[1]

    # One RandomState drives both the split and the SVC, in that order.
    random_state = np.random.RandomState(0)
    X_train, X_test, y_train, y_test = train_test_split(
        data_train_vectors, y_train_label,
        test_size=.5, random_state=random_state)

    # One binary linear SVC per class.
    classifier = OneVsRestClassifier(
        svm.SVC(kernel='linear', probability=True,
                random_state=random_state))
    classifier.fit(X_train, y_train)
    y_pred_score = classifier.decision_function(data_test_vectors)
    y_test_label = label_binarize(data_test.target, classes=class_ids)

    precision, recall, average_precision = {}, {}, {}
    for class_idx in range(n_classes):
        truth = y_test_label[:, class_idx]
        scores = y_pred_score[:, class_idx]
        precision[class_idx], recall[class_idx], _ = \
            precision_recall_curve(truth, scores)
        average_precision[class_idx] = average_precision_score(truth, scores)

    # Micro-averaged precision-recall over all (sample, class) pairs.
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_test_label.ravel(), y_pred_score.ravel())
    average_precision["micro"] = average_precision_score(
        y_test_label, y_pred_score, average="micro")

    # Only the per-class curves are drawn.
    plt.clf()
    for class_idx in range(n_classes):
        curve_label = 'PR curve of class {0} (area = {1:0.2f})'.format(
            class_idx, average_precision[class_idx])
        plt.plot(recall[class_idx], precision[class_idx], label=curve_label)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve of multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return 0
def conduct_test(base_clf, test_predict_proba=False):
    """Run the shared one-vs-rest checks for ``base_clf``.

    Uses ``X``, ``y``, ``Y`` and ``classes`` from the enclosing scope.  When
    ``test_predict_proba`` is true the probability output is checked as well.
    """
    ovr = OneVsRestClassifier(base_clf).fit(X, y)
    assert_equal(set(ovr.classes_), classes)

    first_prediction = ovr.predict(np.array([[0, 0, 4]]))[0]
    assert_equal(set(first_prediction), set("eggs"))

    if hasattr(base_clf, 'decision_function'):
        decisions = ovr.decision_function(X)
        assert_equal(decisions.shape, (5,))

    if test_predict_proba:
        sample = np.array([[0, 0, 4]])
        probabilities = ovr.predict_proba(sample)
        assert_equal(2, len(probabilities[0]))
        most_likely = ovr.classes_[np.argmax(probabilities, axis=1)]
        assert_equal(most_likely, ovr.predict(sample))

    # Same estimator, now fitted on a label indicator matrix.
    ovr = OneVsRestClassifier(base_clf).fit(X, Y)
    first_prediction = ovr.predict([[3, 0, 0]])[0]
    assert_equal(first_prediction, 1)
def lin_svc():
    """Train and evaluate a one-vs-rest LinearSVC on tf-idf tweet features.

    Vectorizes all labelled tweets with word uni- and bigrams, splits them
    with ``test_size=0.8``, writes the test labels and predictions to the
    ``data`` directory, saves the classifier and the fitted vectorizer, and
    calls ``evaluate`` on the binarized predictions.  Relies on the
    module-level ``class_list`` and the project save/evaluate helpers.
    """
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # Tf-idf configuration, passed unchanged to the vectorizer.
    tfidf_options = dict(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
    )
    fitted_vectoriser = TfidfVectorizer(**tfidf_options).fit(tweet_list)
    tweet_matrix = fitted_vectoriser.transform(tweet_list)

    train_vector, test_vector, train_labels, test_labels = train_test_split(
        tweet_matrix, label_list, test_size=0.8, random_state=42)

    # Train the model and predict the held-out part.
    ovr_classifier = OneVsRestClassifier(LinearSVC())
    ovr_classifier.fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # Persist labels, predictions, model and vectorizer.
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')
    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # Evaluation on binarized predictions and labels.
    label_score = ovr_classifier.decision_function(test_vector)
    predicted_indicator = label_binarize(result, classes=class_list)
    actual_indicator = label_binarize(test_labels, classes=class_list)
    evaluate(predicted_indicator, actual_indicator, label_score,
             'tfidf_linsvc')
class SVM(ContinuousModel):

    """C-Support Vector Machine Classifier

    When decision_function_shape == 'ovr', we use OneVsRestClassifier(SVC)
    from sklearn.multiclass instead of the output from SVC directly, since
    that is not exactly an implementation of One Vs Rest.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    """

    def __init__(self, *args, **kwargs):
        """Build the underlying SVC; all arguments are forwarded to it."""
        self.model = sklearn.svm.SVC(*args, **kwargs)
        # Always remember the requested shape: predict_real reads this
        # attribute for every multiclass model, not only for 'ovr'.
        self.decision_function_shape = self.model.decision_function_shape
        if self.decision_function_shape == 'ovr':
            # sklearn's ovr isn't real ovr
            self.model = OneVsRestClassifier(self.model)

    def train(self, dataset, *args, **kwargs):
        """Fit the model on ``dataset.format_sklearn()`` plus extra arguments."""
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        """Return the model's predictions for ``feature``."""
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        """Return the model's score on ``testing_dataset.format_sklearn()``."""
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        """Return real-valued decision values, one column per class.

        A one-dimensional decision function (two classes) is expanded to two
        columns ``(-d, d)``.  Otherwise the decision values are returned as
        they are, with a warning when the model was not built with 'ovr'.
        """
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        if self.decision_function_shape != 'ovr':
            # NOTE(review): assumes LOGGER is a standard logging.Logger.
            LOGGER.warning("SVM model support only 'ovr' for multiclass "
                           "predict_real.")
        return dvalue
def svm_training_1(combined_data):
    """Train and report a one-vs-rest RBF SVC on the 'label1' targets.

    ``combined_data`` is read through ``.get('data')`` (features) and
    ``.get('label1')`` (targets).  20% of the samples are held out; runtime,
    decision values, accuracy, the confusion matrix and a classification
    report for the held-out part are printed.

    TODO:
        BALANCE DATA SETS SO EACH GRADE HAS SIMILAR NUMBER OF VOXELS
        OPTIMIZE KERNELS TO SEE WHICH ONE FITS OUR DATA THE BEST

    NOTE(review): the function currently terminates the process with
    ``sys.exit()`` right after the diagnostic prints, so the ROC plotting
    code and the ``return`` below it are not reached.
    """
    print('svm_training_1')
    start = time.time()
    training_data, test_data, training_target, test_target = \
        ms.train_test_split(combined_data.get('data'),
                            combined_data.get('label1'),
                            test_size=0.2)
    svm_plain_classifier = OneVsRestClassifier(
        svm.SVC(C=1000.0, cache_size=200, class_weight='balanced',
                decision_function_shape=None, gamma=0.1, kernel='rbf',
                max_iter=-1, probability=False, random_state=None,
                shrinking=True, tol=0.001, verbose=False))
    svm_plain_classifier.fit(training_data, training_target)
    score = svm_plain_classifier.decision_function(test_data)

    false_pos = dict()
    true_pos = dict()
    roc_auc = dict()

    svm_predict = svm_plain_classifier.predict(test_data)
    accuracy = svm_plain_classifier.score(test_data, test_target)
    end = time.time()
    runtime = end - start

    print('runtime: ' + str(runtime))
    print('score: ')
    print(score)
    print("benign vs pca accuracy: " + str(accuracy))
    print(mt.confusion_matrix(test_target, svm_predict))
    classes = np.unique(combined_data.get('label1'))
    # A list, not a one-shot ``map`` iterator, so the names can be indexed
    # and measured by the report.
    print(mt.classification_report(test_target, svm_predict,
                                   target_names=[str(c) for c in classes]))

    # Diagnostics on the shapes handed to the ROC code below.
    print(type(test_target))
    print(test_target.shape)
    print(type(score))
    print(score.shape)
    # NOTE(review): debugging stop left in place on purpose; the ROC code
    # below indexes ``test_target`` and ``score`` as 2-D arrays, which has
    # to be confirmed before this exit is removed.
    sys.exit()

    for i in range(len(classes)):
        false_pos[i], true_pos[i], _ = mt.roc_curve(test_target[:, i],
                                                    score[:, i])
        # Store the area (the original subtracted instead of assigning).
        roc_auc[i] = mt.auc(false_pos[i], true_pos[i])

    false_pos["micro"], true_pos["micro"], _ = mt.roc_curve(
        test_target.ravel(), score.ravel())
    roc_auc["micro"] = mt.auc(false_pos["micro"], true_pos["micro"])

    plt.figure()
    lw = 2
    plt.plot(false_pos[2], true_pos[2], color='darkorange', lw=lw,
             label='ROC curve(area = %0.2f)' % roc_auc[2])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characterisitc Example')
    plt.legend(loc='lower right')
    plt.show()
    return [[test_data, test_target, svm_predict]]
# In[46]: pca = PCA(n_components=n_components,whiten=True) pca.fit(X_train_multiclass) X_train_multiclass_pca = pca.transform(X_train_multiclass) X_test_multiclass_pca = pca.transform(X_test_multiclass) # In[48]: oneRestClassifier=OneVsRestClassifier(lr) oneRestClassifier.fit(X_train_multiclass_pca, y_train_multiclass) y_score=oneRestClassifier.decision_function(X_test_multiclass_pca) # In[49]: # for each class precision = dict() recall = dict() average_precision = dict() for i in range(n_classes): precision[i],recall[i], _ = metrics.precision_recall_curve(y_test_multiclass[:,i],y_score[:,i]) precision['micro'], recall['micro'], _ = metrics.precision_recall_curve(y_test_multiclass.ravel(),y_score.ravel()) average_precision['micro'] = metrics.average_precision_score(y_test_multiclass,y_score,average='micro') print('Average precision score, micro-averaged over all classes: {0:0.2f}'
# Train or load the classifier for the normal data.
# NOTE: pickle.load can execute arbitrary code; only load classifier files
# that come from a trusted source.
if args.load_n_classifier is None:
    n_estimator = OneVsRestClassifier(
        LinearSVC(random_state=0, C=100, loss='l1', penalty='l2'))
    n_estimator.fit(X_n_train_PCA, Y_n_train)
    if args.save_n_classifier is not None:
        with open(args.save_n_classifier, 'wb') as model_file:
            pickle.dump(n_estimator, model_file)
else:
    with open(args.load_n_classifier, 'rb') as model_file:
        n_estimator = pickle.load(model_file)

# Train or load the classifier for the shuffled data.
if args.load_s_classifier is None:
    s_estimator = OneVsRestClassifier(
        LinearSVC(random_state=0, C=100, loss='l1', penalty='l2'))
    s_estimator.fit(X_s_train_PCA, Y_s_train)
    if args.save_s_classifier is not None:
        with open(args.save_s_classifier, 'wb') as model_file:
            pickle.dump(s_estimator, model_file)
else:
    with open(args.load_s_classifier, 'rb') as model_file:
        s_estimator = pickle.load(model_file)

# NOTE(review): both score sets come from ``n_estimator``; ``s_estimator``
# is not used in this section - confirm that scoring the shuffled test data
# with the normal classifier is intended.
test_normal_scores = n_estimator.decision_function(X_n_test_PCA)
test_shuffled_scores = n_estimator.decision_function(X_s_test_PCA)

# Row-wise softmax of the decision values.
test_n_sm = [softmax(line) for line in test_normal_scores]
test_s_sm = [softmax(line) for line in test_shuffled_scores]

print('normal score:', test_normal_scores[0])
print('shuffled score:', test_shuffled_scores[0])
print('normal softmax:', test_n_sm[0])
print('shuffled softmax:', test_s_sm[0])

# Root-mean-squared error between the two softmax vectors of each sample.
root_mse = [np.sqrt(mean_squared_error(test_n_sm[i], test_s_sm[i]))
            for i in range(len(test_n_sm))]
print(root_mse[:5])
print(np.mean(root_mse[:5]))
# dist = numpy.linalg.norm(a-b)
root_mse_mean = np.mean(root_mse)
paperClassifier.y_train) # Find the best Hyper Parametets for the estimator print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(paperClassifier.parameters_grid.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) paperClassifier.text_clf = LogisticRegression( penalty='l2', tol=best_parameters['classifier__tol']) # Fit the model OneVsRestClassifier paper_clf = OneVsRestClassifier(paperClassifier.pipeline).fit( paperClassifier.x_train, paperClassifier.y_train) y_train_test_score = paper_clf.decision_function( paperClassifier.x_train_test) paperClassifier.plot_roc_curves(y_train_test_score) # Get test IDs too test_ids = list() with open('./data/test.csv', 'r') as f: next(f) for line in f: test_ids.append(line[:-2]) y_pred = paper_clf.predict_proba(paperClassifier.x_test) # Write predictions to a file with open('sample_submission.csv', 'w') as csvfile: writer = csv.writer(csvfile, delimiter=',')
# Read T training items: one line of space-separated integer labels
# followed by one line of text.
for i in range(T):
    # Materialize the labels; a bare ``map`` object can be traversed only once.
    labels = list(map(int, input().split(' ')))
    RawData.append(input())
    Labels.append(labels)

# Read E query texts and vectorize them together with the training texts.
Queries = [input() for _ in range(E)]
RawData.extend(Queries)

X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X

MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)

# The first T rows are the training items, the remaining rows the queries.
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]

Clf = OneVsRestClassifier(LinearSVC(loss='l1', class_weight={
    1: 100,
    0: 1
})).fit(XtfTrain, Yt)
Classes = list(MLB.classes_)

# For every query print the ten labels with the highest decision values,
# best first.
for xTest in XtfTest:
    y = Clf.decision_function(xTest)
    ranked = sorted(zip(y[0], Classes), reverse=True)[:10]
    lbls = [label for (score, label) in ranked]
    print(' '.join([str(label) for label in lbls]))
# Columns 0-12 are the input variables, column 13 is the output variable.
X = dataset[:, 0:13]
Y = dataset[:, 13]

# Evaluation subset: rows 0-90 followed by rows 181-270.
XTest1 = numpy.concatenate([dataset[0:91, 0:13], dataset[181:271, 0:13]])
YTest1 = numpy.concatenate([dataset[0:91, 13], dataset[181:271, 13]])

# Validation subset: rows 91-180.
XTest1Valid = dataset[91:181, 0:13]
YTest1Valid = dataset[91:181, 13]

# Rows whose individual predictions are printed at the end.
XPredict = dataset[250:271, 0:13]
YPredict = dataset[250:271, 13]

# One-vs-rest linear SVC fitted on the full dataset.
ovr = OneVsRestClassifier(svm.SVC(kernel='linear', C=2))
ovr.fit(X, Y)

cross = cross_val_score(ovr, X, Y, cv=3)
print("Cross-Validation")
print(cross)

scores = ovr.score(XTest1, YTest1)
print("Evalutation: %0.2f%%" % (scores.mean()*100))

scores = ovr.score(XTest1Valid, YTest1Valid)
print("Validation: %0.2f%%" % (scores.mean()*100))

# True values next to the predictions and decision values.
print("Vraies valeurs = ")
print(YPredict)
print("Prédictions = ")
print(ovr.predict(XPredict))
print(ovr.decision_function(XPredict))
class PPB2(BaseEstimator, ClassifierMixin):
    """PPB2 model.

    A classifier over molecular fingerprints computed from SMILES strings.
    The ``model`` string has the form ``"<fingerprint>-<classifier>"``; the
    special classifier name ``"nn+nb"`` fits a separate Bernoulli naive Bayes
    model on the ``k`` nearest training samples of every query.
    """

    def __init__(self, model="morg2-nn+nb", n_proc=8, k=200):
        """Parse ``model`` and build the underlying estimator.

        :param model: ``"<fingerprint>-<classifier>"`` specification.
        :param n_proc: number of workers used for fingerprinting, fitting and
            prediction.
        :param k: number of neighbours for the ``nn`` and ``nn+nb`` models.
        """
        model = model.split("-")
        assert len(model) == 2
        self.fp = model[0]
        assert self.fp in {
            "rdk", "morg2", "morg3", "rdk_maccs",
            "circular", "maccs", "all"
        }
        self.model_name = model[1]
        assert self.model_name in {
            "dum", "nn", "nb", "nn+nb", "bag", "lr", "svc", "etc",
            "ridge", "ada", "gb", "lda", "xgc"
        }
        self.n_proc = n_proc
        self.k = k

        model_name = self.model_name
        if model_name == "dum":
            self.model = DummyClassifier(strategy="stratified")
        elif model_name == "nn":
            self.model = KNeighborsClassifier(
                n_neighbors=self.k,
                metric="jaccard",
                algorithm="brute",
                n_jobs=self.n_proc)
        elif model_name == "nb":
            self.model = BernoulliNB(alpha=1.)
        elif model_name == "nn+nb":
            # Local models are fitted per query at prediction time.
            self.model = None
        elif model_name == "svc":
            self.model = SVC(probability=True)
        elif model_name == "bag":
            self.model = BaggingClassifier(
                n_jobs=None,
                verbose=True)
        elif model_name == "lr":
            self.model = LogisticRegressionCV(
                max_iter=1000,
                n_jobs=None,
            )
        elif model_name == "ada":
            self.model = AdaBoostClassifier()
        elif model_name == "gb":
            self.model = GradientBoostingClassifier()
        elif model_name == "lda":
            self.model = LinearDiscriminantAnalysis()
        elif model_name == "etc":
            # capable of multilabel classification out of the box
            self.model = ExtraTreesClassifier(
                n_estimators=500,
                bootstrap=True,
                max_features="log2",
                min_samples_split=10,
                max_depth=5,
                min_samples_leaf=3,
                verbose=True,
                n_jobs=n_proc)
        elif model_name == "ridge":
            self.model = RidgeClassifierCV()
        elif model_name == "xgc":
            self.model = XGBClassifier(
                n_jobs=None,
                num_parallel_tree=None,
                verbosity=1)
        else:
            raise Exception(
                "unknown model name: {}".format(model_name))

    def fit(self, X, y):
        """Fit the model to SMILES ``X`` (a ``pd.Series``) and targets ``y``.

        A one-dimensional ``y`` selects the single-target setting; otherwise
        the multi-target setting is used and estimators that are not listed
        in ``support_multi_label`` are wrapped in a ``OneVsRestClassifier``.
        For ``nn+nb`` only the fingerprints and targets are stored.

        Returns ``self``.
        """
        assert isinstance(X, pd.Series)
        assert X.shape[0] == y.shape[0]
        print("fitting PPB2 model",
              "({}-{})".format(self.fp, self.model_name),
              "to", X.shape[0], "SMILES")

        if len(y.shape) == 1:
            print("fitting in the single-target setting")
            self.multi_label = False
            n_targets = 1
        else:
            print("fitting in the multi-target setting")
            print("number of targets:", y.shape[1])
            self.multi_label = True
            n_targets = y.shape[1]

        needs_wrapper = (
            self.multi_label
            and self.model_name not in support_multi_label.union({"nn+nb"}))
        # Wrap only once, so that calling fit again does not nest wrappers.
        if needs_wrapper and not isinstance(self.model, OneVsRestClassifier):
            # wrap classifier in OneVsRestClassifier for multi-label case
            self.model = OneVsRestClassifier(
                self.model,
                n_jobs=self.n_proc)

        # convert X to fingerprints
        X = compute_fp(smiles=X, all_fp=self.fp, n_proc=self.n_proc)

        if self.model_name in dense_input:
            # cannot handle sparse input
            X = X.A

        if self.model_name == "nn+nb":
            # keep training data references for local NB fitting
            self.X = X
            self.y = y

        assert X.shape[0] == y.shape[0]

        if self.model is not None:
            # ``n_targets`` instead of ``y.shape[1]``: a one-dimensional y
            # has no second axis.
            print("fitting", self.model_name, "model to",
                  X.shape[0], "'", self.fp, "' fingerprints",
                  "of shape", X.shape,
                  "for", n_targets, "targets",
                  "using", self.n_proc, "core(s)")
            with parallel_backend('loky', n_jobs=self.n_proc):
                self.model.fit(X, y)
        return self

    def _fit_local_nb(self, query, mode="predict", alpha=1.):
        """Fit a naive Bayes model on the ``k`` nearest neighbours of ``query``.

        ``query`` must be a single boolean CSR row.  Targets that are
        positive for all neighbours are predicted as 1, targets that are
        negative for all neighbours as 0; a BernoulliNB (wrapped in
        one-vs-rest when more than one target remains) is fitted for the
        rest.  ``mode`` selects ``predict`` or ``predict_proba`` output.
        """
        if len(query.shape) == 1:
            query = query[None, :]
        X = self.X
        y = self.y
        assert isinstance(query, sp.csr_matrix)
        assert query.dtype == bool
        assert isinstance(X, sp.csr_matrix)
        assert X.dtype == bool

        # jaccard distance is computed on dense copies
        assert query.shape[1] == X.shape[1]
        dists = pairwise_distances(query.A, X.A,
                                   metric="jaccard", n_jobs=1)
        idx = dists.argsort()[0, :self.k]
        assert query.shape[0] == 1

        X = X[idx]
        y = y[idx]
        n_targets = y.shape[-1]
        pred = np.zeros(n_targets)

        ones_idx = y.all(axis=0)
        zeros_idx = (1 - y).all(axis=0)

        # set prediction for classes where only positive class is seen
        pred[ones_idx] = 1

        # only fit on targets with pos and neg examples
        idx = ~np.logical_or(ones_idx, zeros_idx)
        if idx.any():
            nb = BernoulliNB(alpha=alpha)
            if idx.sum() > 1:
                nb = OneVsRestClassifier(nb, n_jobs=1)
            y_ = y[:, idx]
            if idx.sum() == 1:
                y_ = y_.flatten()
            nb.fit(X, y_)
            pred_ = (nb.predict(query)[0]
                     if mode == "predict"
                     else nb.predict_proba(query)[0])
            if idx.sum() == 1 and mode != "predict":
                # plain BernoulliNB returns one column per class; keep the
                # column of the positive class
                assert pred_.shape[0] == 2
                assert nb.classes_.any()
                pred_ = pred_[nb.classes_ == 1]
            pred[idx] = pred_
        return pred

    def _local_nb_prediction(self, queries, mode="predict"):
        """Run ``_fit_local_nb`` for every query in a process pool."""
        print("fitting unique NB models for each query",
              "in mode", mode)
        n_queries = queries.shape[0]
        with mp.Pool(processes=self.n_proc) as p:
            predictions = p.map(
                functools.partial(self._fit_local_nb, mode=mode),
                (query for query in queries))
        predictions = np.array(predictions)
        assert predictions.shape[0] == n_queries
        if self.multi_label:
            assert predictions.shape[1] == self.y.shape[-1]
        return predictions

    def _stacked_positive_probs(self, X):
        """Stack the per-target ``predict_proba`` outputs column-wise.

        Used for estimators listed in ``support_multi_label``.
        NOTE(review): presumably ``predict_proba`` returns one array per
        target and ``classes_`` one boolean class array per target, so that
        indexing with it keeps the positive-class column - confirm.
        """
        with parallel_backend('threading', n_jobs=self.n_proc):
            probs = self.model.predict_proba(X)
        classes = self.model.classes_
        # handle missing classes: when no positive class was seen for a
        # target, use the complement of the returned probabilities
        return np.hstack([
            target_probs[:, class_idx] if class_idx.any()
            else 1 - target_probs
            for target_probs, class_idx in zip(probs, classes)
        ])

    def predict(self, X):
        """Predict labels for the SMILES in ``X``."""
        print("predicting for", X.shape[0], "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("performing prediction",
              "using", self.n_proc, "processes")
        if self.model_name == "nn+nb":
            return self._local_nb_prediction(X, mode="predict")

        if self.model_name in dense_input \
                and not isinstance(X, np.ndarray):
            X = X.A
        assert hasattr(self.model, "predict")
        with parallel_backend('threading', n_jobs=self.n_proc):
            return self.model.predict(X)

    def predict_proba(self, X):
        """Predict probabilities for the SMILES in ``X``.

        Falls back to the decision function when the wrapped estimator has
        no ``predict_proba``.
        NOTE(review): the last branch asserts a ``OneVsRestClassifier``,
        which ``fit`` only creates in the multi-target setting - confirm the
        intended single-target behaviour.
        """
        print("predicting probabilities for", X.shape[0],
              "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("performing probability prediction",
              "using", self.n_proc, "processes")

        if self.model_name == "nn+nb":
            return self._local_nb_prediction(X, mode="predict_proba")

        if self.model_name in dense_input \
                and not isinstance(X, np.ndarray):
            X = X.A

        if self.model_name in support_multi_label:
            return self._stacked_positive_probs(X)

        assert isinstance(self.model, OneVsRestClassifier)
        if hasattr(self.model, "predict_proba"):
            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.model.predict_proba(X)
        elif hasattr(self.model, "decision_function"):
            print("predicting with decision function")
            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.model.decision_function(X)
        else:
            raise Exception(
                "model has neither predict_proba nor decision_function")

    def decision_function(self, X):
        """Return decision values for the SMILES in ``X``.

        ``nn+nb`` and the estimators in ``support_multi_label`` return
        probabilities instead; other estimators fall back to
        ``predict_proba`` when they have no ``decision_function``.
        """
        print("predicting probabilities for", X.shape[0],
              "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("determining decision function",
              "using", self.n_proc, "processes")

        if self.model_name == "nn+nb":
            # NB does not have a decision function
            return self._local_nb_prediction(X, mode="predict_proba")

        if self.model_name in dense_input \
                and not isinstance(X, np.ndarray):
            X = X.A

        if self.model_name in support_multi_label:
            # k neighbours has no decision function
            return self._stacked_positive_probs(X)

        assert isinstance(self.model, OneVsRestClassifier)
        if hasattr(self.model, "decision_function"):
            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.model.decision_function(X)
        elif hasattr(self.model, "predict_proba"):
            print("predicting using probability")
            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.model.predict_proba(X)
        else:
            raise Exception(
                "model has neither decision_function nor predict_proba")

    def check_is_fitted(self):
        """Return whether the underlying model is fitted.

        Always true when there is no underlying model (``nn+nb``).
        """
        if self.model is None:
            return True
        try:
            # resolves to the module-level function, not to this method
            check_is_fitted(self.model)
            return True
        except NotFittedError:
            return False

    def __str__(self):
        return "PPB2({}-{})".format(self.fp, self.model_name)

    def set_n_proc(self, n_proc):
        """Set the worker count here and on the underlying model."""
        self.n_proc = n_proc
        if self.model is not None:
            self.model.n_jobs = n_proc

    def set_k(self, k):
        """Set ``k`` here and on a k-nearest-neighbours model."""
        self.k = k
        if isinstance(self.model, KNeighborsClassifier):
            self.model.n_neighbors = k
n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # shuffle and split training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) # Learn to predict each class against the other classifier = OneVsRestClassifier( svm.SVC(kernel='linear', probability=True, random_state=random_state)) classifier = classifier.fit(X_train, y_train) y_pred = classifier.predict(X_test) y_score = classifier.decision_function(X_test) feature_list = range(4) target_names = ['setosa', 'versicolor', 'virginica'] # Create a trained model instance ce = ClassifierEvaluator(classifier, y_test, y_pred, y_score, feature_list, target_names, estimator_name='super awesome SVC') template = ''' # Report
class MySVM:
    """One-vs-rest linear SVM over TF-IDF uni/bi-gram features.

    The class reads a text corpus and its label files, vectorises the text,
    keeps one stratified fold as train/test split, fits
    ``OneVsRestClassifier(LinearSVC())`` on the binarised ``sede12`` label and
    prints an evaluation table.  All file locations are class-level constants.
    """

    # Full task list, kept for reference:
    # ['sede1', 'sede2', 'sede12', 'morfo1', 'morfo2', 'morfo12']
    _tasks = ['sede1', 'sede12', 'morfo1', 'morfo2']

    _filesFolder = "./filesFolds-SVMbigramsPROOFRAND"
    _memmapFolder = "./memmapFolds-SVMbigramsPROOFRAND"
    _corpusFolder = "./corpusLSTM_ICDO3"

    # Per-task paths of the pickled label binarizers.
    _fileLb = {
        'sede1': _memmapFolder + "/binarizers/lbSede1.p",
        'sede2': _memmapFolder + "/binarizers/lbSede2.p",
        'sede12': _memmapFolder + "/binarizers/lbSede12.p",
        'morfo1': _memmapFolder + "/binarizers/lbMorfo1.p",
        'morfo2': _memmapFolder + "/binarizers/lbMorfo2.p",
        'morfo12': _memmapFolder + "/binarizers/lbMorfo12.p",
    }

    _fileEvaluation = _filesFolder + "/outputSVM/evaluation.txt"

    # Per-task paths of the saved models.
    _fileModel = {
        'sede1': _filesFolder + "/modelsSVM/modelCatSede1.h5",
        'sede2': _filesFolder + "/modelsSVM/modelCatSede2.h5",
        'sede12': _filesFolder + "/modelsSVM/modelCatSede12.h5",
        'morfo1': _filesFolder + "/modelsSVM/modelCatMorfo1.h5",
        'morfo2': _filesFolder + "/modelsSVM/modelCatMorfo2.h5",
        'morfo12': _filesFolder + "/modelsSVM/modelCatMorfo12.h5",
    }

    # Corpus inputs.
    _textFile = _corpusFolder + "/text.txt"
    _fileSedeClean = _corpusFolder + "/sedeClean.txt"
    _fileMorfoClean = _corpusFolder + "/morfoClean.txt"
    _fileVectors = _corpusFolder + "/vectors.txt"

    # Scratch memmaps for the whole corpus (raw and binarised labels).
    _fileMemmapX = "./tmp/X.dat"
    _fileMemmapYUn = {
        'sede1': "./tmp/yUnSede1.dat",
        'sede2': "./tmp/yUnSede2.dat",
        'sede12': "./tmp/yUnSede12.dat",
        'morfo1': "./tmp/yUnMorfo1.dat",
        'morfo2': "./tmp/yUnMorfo2.dat",
        'morfo12': "./tmp/yUnMorfo12.dat",
    }
    _fileMemmapY = {
        'sede1': "./tmp/ySede1.dat",
        'sede2': "./tmp/ySede2.dat",
        'sede12': "./tmp/ySede12.dat",
        'morfo1': "./tmp/yMorfo1.dat",
        'morfo2': "./tmp/yMorfo2.dat",
        'morfo12': "./tmp/yMorfo12.dat",
    }

    _fileShapes = _memmapFolder + "/shapes.p"
    _fileIndexes = _memmapFolder + "/indexes.p"

    # Memmaps holding the selected train / test fold.
    _fileMemmapXTrain = _memmapFolder + "/XTrain.dat"
    _fileMemmapYTrain = {
        'sede1': _memmapFolder + "/ySede1Train.dat",
        'sede2': _memmapFolder + "/ySede2Train.dat",
        'sede12': _memmapFolder + "/ySede12Train.dat",
        'morfo1': _memmapFolder + "/yMorfo1Train.dat",
        'morfo2': _memmapFolder + "/yMorfo2Train.dat",
        'morfo12': _memmapFolder + "/yMorfo12Train.dat",
    }
    _fileMemmapXTest = _memmapFolder + "/XTest.dat"
    _fileMemmapYTest = {
        'sede1': _memmapFolder + "/ySede1Test.dat",
        'sede2': _memmapFolder + "/ySede2Test.dat",
        'sede12': _memmapFolder + "/ySede12Test.dat",
        'morfo1': _memmapFolder + "/yMorfo1Test.dat",
        'morfo2': _memmapFolder + "/yMorfo2Test.dat",
        'morfo12': _memmapFolder + "/yMorfo12Test.dat",
    }

    def extractData(self):
        """Load the corpus, vectorise it and materialise one train/test fold.

        Sets ``self.X`` (TF-IDF matrix), ``self.lb`` (``LabelBinarizer`` fitted
        on the combined ``sede12`` code), ``self.y`` and the fold-specific
        ``XTrain``/``XTest``/``yTrain``/``yTest``.  Label arrays are written
        to the memmap files named by the class constants.
        """
        self._phraseLen = 100
        self.stratifications = 10

        with open(self._textFile) as fid:
            text = fid.readlines()
        with open(self._fileSedeClean) as fid:
            sedeClean = fid.readlines()
        with open(self._fileMorfoClean) as fid:
            morfoClean = fid.readlines()

        vectorizer = TfidfVectorizer(min_df=3, max_df=0.3,
                                     strip_accents='unicode',
                                     ngram_range=(1, 2))
        vectorizer.fit(text)
        # The fitted vocabulary has one entry per feature; unlike
        # get_feature_names() it exists on every scikit-learn release.
        self._vecLen = len(vectorizer.vocabulary_)
        self.X = vectorizer.transform(text)
        del text

        # Raw integer labels.  `int` is what the removed `np.int` alias
        # pointed to, so the on-disk dtype is unchanged.
        yUn = {}
        for task in ('sede1', 'sede2', 'sede12'):
            yUn[task] = np.memmap(self._fileMemmapYUn[task], mode='w+',
                                  shape=(len(sedeClean)), dtype=int)
        for i, c in enumerate(sedeClean):
            yUn['sede1'][i], yUn['sede2'][i] = c.split()
            # Combined code: first value as tens, second as units.
            yUn['sede12'][i] = yUn['sede1'][i] * 10 + yUn['sede2'][i]

        for task in ('morfo1', 'morfo2', 'morfo12'):
            yUn[task] = np.memmap(self._fileMemmapYUn[task], mode='w+',
                                  shape=(len(morfoClean)), dtype=int)
        for i, c in enumerate(morfoClean):
            yUn['morfo1'][i], yUn['morfo2'][i] = c.split()
            yUn['morfo12'][i] = yUn['morfo1'][i] * 10 + yUn['morfo2'][i]

        self.lb = LabelBinarizer()
        self.lb.fit(yUn['sede12'])
        n_classes = len(self.lb.classes_)
        self.y = np.memmap(self._fileMemmapY['sede12'], mode='w+',
                           shape=(len(sedeClean), n_classes), dtype=int)
        self.y[:] = self.lb.transform(yUn['sede12'])

        print("Splitting data")
        skf = StratifiedKFold(n_splits=self.stratifications)
        self.trainIndexes = []
        self.testIndexes = []
        # Only the labels drive the stratification; X is a placeholder.
        for train, test in skf.split(np.zeros(len(yUn['sede12'])),
                                     yUn['sede12']):
            self.trainIndexes.append(train)
            self.testIndexes.append(test)

        # A fixed fold keeps runs comparable.
        self.fold = 1
        train_idx = self.trainIndexes[self.fold]
        test_idx = self.testIndexes[self.fold]

        self.XTrain = self.X[train_idx]
        self.XTest = self.X[test_idx]
        self.yTrain = np.memmap(self._fileMemmapYTrain['sede12'], mode='w+',
                                shape=(len(train_idx), n_classes), dtype=int)
        self.yTest = np.memmap(self._fileMemmapYTest['sede12'], mode='w+',
                               shape=(len(test_idx), n_classes), dtype=int)
        self.yTrain[:] = self.y[train_idx]
        self.yTest[:] = self.y[test_idx]
        self.yTrain.flush()
        self.yTest.flush()

    def createModels(self):
        """Fit the one-vs-rest linear SVM on the training fold."""
        print("Creating models")
        self.model = OneVsRestClassifier(LinearSVC())
        self.model.fit(self.XTrain, self.yTrain)

    def evaluate(self):
        """Print the evaluation table for the held-out test fold."""
        print("Evaluating Test")
        self._evaluate(self.XTest, self.yTest)

    def _evaluate(self, X, y):
        """Score ``self.model`` on ``(X, y)`` and print a metrics table.

        ``y`` is the binarised label matrix.  The predicted class of each
        sample is the arg-max of its decision-function row.
        """
        table = [[
            "task", "average", "MAPs", "MAPc", "accur.", "kappa", "prec.",
            "recall", "f1score"
        ]]
        na = ' '
        # Spacer row, one cell per header column.
        table.append([na] * len(table[0]))

        yp = self.model.decision_function(X)
        yt = y
        ytn = self.lb.inverse_transform(yt)

        # One-hot encode the top-scoring class of every sample.
        yc = np.zeros(yt.shape, int)
        for i, p in enumerate(yp):
            yc[i][np.argmax(p)] = 1
        ycn = self.lb.inverse_transform(yc)

        metrics = {}
        metrics['MAPs'] = MAPScorer().samplesScore(yt, yp)
        metrics['MAPc'] = MAPScorer().classesScore(yt, yp)
        metrics['accuracy'] = accuracy_score(yt, yc)
        metrics['kappa'] = cohen_kappa_score(ytn, ycn)
        metrics['precision'] = {}
        metrics['recall'] = {}
        metrics['f1score'] = {}

        table.append([
            'sede12', na,
            "{:.3f}".format(metrics['MAPs']),
            "{:.3f}".format(metrics['MAPc']),
            "{:.3f}".format(metrics['accuracy']),
            "{:.3f}".format(metrics['kappa']),
            na, na, na
        ])

        for avg in ['micro', 'macro', 'weighted']:
            (metrics['precision'][avg], metrics['recall'][avg],
             metrics['f1score'][avg], _) = precision_recall_fscore_support(
                 yt, yc, average=avg)
            table.append([
                'sede12', avg, na, na, na, na,
                "{:.3f}".format(metrics['precision'][avg]),
                "{:.3f}".format(metrics['recall'][avg]),
                "{:.3f}".format(metrics['f1score'][avg])
            ])

        print(tabulate(table))
def run_prototype(snow_tweets_folder, prototype_output_folder, restart_probability, number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
           - Reads the user graph and the user-label matrix from a local folder.
           - Extracts graph-based features using the ARCTE algorithm.
           - For each labelling percentage (1..10) runs ``trial_num`` train/test trials
             with a one-vs-rest linear SVM and collects precision/recall/F1.
           - Writes mean and standard deviation of every measure to
             ``<prototype_output_folder>/F1_average_scores.txt``.

    Inputs:  - snow_tweets_folder: Folder containing ``graph.tsv`` and ``user_label_matrix.tsv``.
             - prototype_output_folder: Folder where the score file is written.
             - restart_probability: Passed through to ``arcte``.
             - number_of_threads: Worker count; ``None`` means ``get_threads_number()``.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path, separator='\t')

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path, '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    features = arcte(adjacency_matrix, restart_probability, 0.00001, number_of_threads=number_of_threads)
    features = normalize_columns(features)

    # Builtin int/float replace the np.int/np.float aliases removed from NumPy.
    percentages = np.arange(1, 11, dtype=int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    # Scalar measures, in the order evaluation.calculate_measures returns them
    # (positions 0..5); position 6 is the per-category F1 vector.
    metric_names = ("macro_recall", "micro_recall",
                    "macro_precision", "micro_precision",
                    "macro_F1", "micro_F1")

    mean_scores = {name: np.zeros(percentages.size, dtype=float) for name in metric_names}
    std_scores = {name: np.zeros(percentages.size, dtype=float) for name in metric_names}
    F1 = np.zeros((percentages.size, number_of_categories), dtype=float)

    for p, percentage in enumerate(percentages):
        # Per-trial storage, reset for every percentage.
        trial_scores = {name: np.zeros(trial_num, dtype=float) for name in metric_names}
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=float)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)
        for trial in range(trial_num):
            train, test = next(folds)

            # Separate train and test sets.
            X_train, X_test = features[train, :], features[test, :]
            y_train, y_test = user_label_matrix[train, :], user_label_matrix[test, :]

            # Re-weight the community features using the training labels only.
            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            # Train classifier.
            model = OneVsRestClassifier(svm.LinearSVC(C=1, random_state=None, dual=False, fit_intercept=True),
                                        n_jobs=number_of_threads)
            model.fit(X_train, y_train)

            # Make predictions.
            y_pred = model.decision_function(X_test)
            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            # Calculate measures.
            measures = evaluation.calculate_measures(y_pred, y_test)
            for name, value in zip(metric_names, measures):
                trial_scores[name][trial] = value
            trial_F1[trial, :] = measures[6]

        for name in metric_names:
            mean_scores[name][p] = np.mean(trial_scores[name])
            std_scores[name][p] = np.std(trial_scores[name])
        F1[p, :] = np.mean(trial_F1, axis=0)

    # The report lists precision first, then recall, then F1.
    report_order = ("macro_precision", "micro_precision",
                    "macro_recall", "micro_recall",
                    "macro_F1", "micro_F1")
    measure_list = [(mean_scores[name], std_scores[name]) for name in report_order]
    measure_list.append(F1)

    write_results(measure_list, os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))
def ROC_multi_class(data_train, data_test, data_test_vectors):
    """Plot one-vs-rest ROC curves for a three-class problem.

    A linear ``SVC`` wrapped in ``OneVsRestClassifier`` is fitted on half of
    the training vectors and scored on ``data_test_vectors``.  Per-class,
    micro- and macro-averaged ROC/AUC are computed; only the per-class curves
    are drawn.

    :param data_train: object whose ``target`` holds training labels in {0, 1, 2}.
    :param data_test: object whose ``target`` holds test labels in {0, 1, 2}.
    :param data_test_vectors: feature matrix of the test documents.
    :return: always ``0``.
    """
    class_ids = [0, 1, 2]

    # Binarize the output
    y_train_label = label_binarize(data_train.target, classes=class_ids)
    n_classes = y_train_label.shape[1]
    random_state = np.random.RandomState(1)

    # shuffle and split training and test sets
    # NOTE(review): `data_train_vectors` is not a parameter of this function,
    # so it must exist at module level -- confirm, or pass it in explicitly.
    X_train, X_test, y_train, y_test = train_test_split(
        data_train_vectors, y_train_label, test_size=.5, random_state=0)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(
        svm.SVC(kernel='linear', probability=True, random_state=random_state))
    classifier.fit(X_train, y_train)

    y_pred_score = classifier.decision_function(data_test_vectors)
    y_test_label = label_binarize(data_test.target, classes=class_ids)

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_label[:, i], y_pred_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_label.ravel(), y_pred_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points.  np.interp is the
    # function the deprecated scipy `interp` alias forwarded to.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    # plt.plot(fpr["micro"], tpr["micro"],
    #          label='micro-average ROC curve (area = {0:0.2f})'
    #                ''.format(roc_auc["micro"]),
    #          linewidth=2)
    #
    # plt.plot(fpr["macro"], tpr["macro"],
    #          label='macro-average ROC curve (area = {0:0.2f})'
    #                ''.format(roc_auc["macro"]),
    #          linewidth=2)
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic of multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return 0
solver='adam', verbose=0, random_state=21) # warm_start=warm_start_set) sys.exit(0) print("Strat training...") classifier.fit(X_train, y_train) #sys.exit(0) #---------------------use X_test for evaluation------------------ if (classifier_type == 'svm'): #note that in svm predict_proba is inconsistent with predict function #use decision_function-->consistent y_pred_proba = classifier.decision_function( X_test) #return inverse of distance if (classifier_type == 'mlp'): y_pred_proba = classifier.predict_proba(X_test) all_labels = classifier.classes_ #----------------------get top-k results------------------------- #print("Training finished(test on original dataset):\ncomponent type: {} \nemddeing: {} \nclassifier: {}\n" \ # .format(cur_exp_param,cur_sent_embd_type,classifier_type)) y_top_K = [] # --pick out the max probability labels(by sorting predict_proba or decision_function) #--note this may be different in rnn if (classifier_type == 'mlp' or classifier_type == 'svm'):
class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.

    KOMD is a kernel method for classification and ranking. Read more in
    http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.

    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.

    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed' or a
        callable. ``None`` selects 'rbf'.
        If a callable is given it is called as ``kernel(X, Y)`` to compute
        the kernel matrix between two data matrices.

    rbf_gamma : float, optional (default=0.1)
        Coefficient for 'rbf' and 'poly' kernels.
        Ignored by all other kernels.

    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel.
        Ignored by all other kernels.

    coef0 : float, optional (default=0.0)
        Specifies the coef0 in a polynomial kernel.
        Ignored by all other kernels.

    max_iter : int, optional (default=100)
        Hard limit on iterations within solver, it can't be negative.

    verbose : bool, (default=False)
        Enable verbose output during fit.

    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used in case of multiclass.
        'ova' for one_vs_all pattern (also called one_vs_rest),
        'ovo' for one_vs_one pattern.
        With other unexpected string, 'ova' pattern is used.

    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        probability-like vector that defines the distance vector
        over the two classes.

    classes_ : array-like, shape = [n_classes]
        Vector that contains all possible labels

    multiclass_ : boolean,
        True if the number of classes > 2

    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1,2,i] for i in range(5)])
    >>> Y = np.array([1,1,1,-1,-1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X,Y)
    >>> print(cls.predict([[1,1,5]]))
    [1]

    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """

    def __init__(self, lam = 0.1, kernel = 'rbf', rbf_gamma = 0.1, degree = 2.0, coef0 = 0.0, max_iter = 100, verbose = False, multiclass_strategy = 'ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel == 'precomputed'

    def __kernel_definition__(self):
        """Select the kernel function

        Returns
        -------
        kernel : a callable relative to selected kernel

        Raises
        ------
        ValueError
            If ``self.kernel`` is neither a callable nor a known kernel name.
        """
        if hasattr(self.kernel, '__call__'):
            return self.kernel
        if self.kernel is None or self.kernel == 'rbf':
            return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X, Y: polynomial_kernel(X, Y, degree=self.degree, gamma=self.rbf_gamma, coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X, Y: linear_kernel(X, Y)
        if self.kernel == 'precomputed':
            # The caller already passes a kernel matrix as X.
            return lambda X, Y: X
        # Fail here with a clear message instead of returning None and
        # crashing later when the result is called.
        raise ValueError("Unknown kernel: {!r}".format(self.kernel))

    def fit(self, X, Y):
        """Fit the model according to the given training data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_feature is the number of features

        Y : array-like, shape = [n_samples]
            array of the labels relative to X

        Returns
        -------
        self : object
            Returns self

        Raises
        ------
        ValueError
            If fewer than two distinct classes are present in ``Y``.
        """
        X, Y = validation.check_X_y(X, Y, dtype=np.float64, order='C', accept_sparse='csr')
        check_classification_targets(Y)

        self.classes_ = np.unique(Y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("The number of classes has to be at least 2; got {}".format(n_classes))

        self.multiclass_ = n_classes > 2
        if not self.multiclass_:
            return self._fit(X, Y)
        if self.multiclass_strategy == 'ovo':
            return self._one_vs_one(X, Y)
        # Any other strategy string falls back to one-vs-rest.
        return self._one_vs_rest(X, Y)

    def _one_vs_one(self, X, Y):
        """Delegate the multiclass problem to a one-vs-one wrapper of binary KOMDs."""
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self

    def _one_vs_rest(self, X, Y):
        """Delegate the multiclass problem to a one-vs-rest wrapper of binary KOMDs."""
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self

    def _fit(self, X, Y):
        """Solve the binary KOMD quadratic program with cvxopt.

        The larger of the two label values is mapped to +1, the other to -1.
        Stores ``gamma`` (solution vector), ``bias``, ``ker_matrix`` and the
        training data on the instance.
        """
        self.X = X
        values = np.unique(Y)
        Y = [1 if l == values[1] else -1 for l in Y]
        self.Y = Y
        npos = Y.count(1)
        nneg = Y.count(-1)

        YY = matrix(np.diag(list(matrix(Y))))
        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X, X).astype(np.double))

        KLL = (1.0-self.lam)*YY*ker_matrix*YY
        # NOTE(review): npos * nneg / (npos+nneg) is integer division on
        # Python 2 and true division on Python 3 -- confirm which is intended.
        LID = matrix(np.diag([self.lam * (npos * nneg / (npos+nneg))]*len(Y)))
        Q = 2*(KLL+LID)
        p = matrix([0.0]*len(Y))
        # G, h: every component of the solution must be non-negative.
        G = -matrix(np.diag([1.0]*len(Y)))
        h = matrix([0.0]*len(Y), (len(Y), 1))
        # A, b: the weights of each class must sum to one.
        A = matrix([[1.0 if lab==+1 else 0 for lab in Y],[1.0 if lab2==-1 else 0 for lab2 in Y]]).T
        b = matrix([[1.0],[1.0]], (2, 1))

        solvers.options['show_progress'] = False
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q, p, G, h, A, b)
        self.gamma = sol['x']

        if self.verbose:
            # Single pre-formatted strings print identically on Python 2 and 3.
            print('[KOMD]')
            print('optimization finished, #iter =  {}'.format(sol['iterations']))
            print('status of the solution:  {}'.format(sol['status']))
            print('objval:  {}'.format(sol['primal objective']))

        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self

    def predict(self, X):
        """Perform classification on samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples

        Returns
        -------
        y_pred : array, shape = [n_samples]
            The value of prediction for each sample

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

        if self.multiclass_:
            return self.cls.predict(X)

        # A non-negative score means the larger of the two label values.
        return np.array([self.classes_[1] if p >= 0 else self.classes_[0] for p in self.decision_function(X)])

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is ignored)."""
        return {"lam": self.lam, "kernel": self.kernel, "rbf_gamma": self.rbf_gamma,
                "degree": self.degree, "coef0": self.coef0, "max_iter": self.max_iter,
                "verbose": self.verbose, "multiclass_strategy": self.multiclass_strategy}

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        Z : array-like, shape = [n_samples, 1]
            Returns the decision function of the samples.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

        if self.multiclass_:
            return self.cls.decision_function(X)

        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        # Kernel between the new samples and the stored training samples.
        ker_matrix = matrix(Kf(X, self.X).astype(np.double))
        z = ker_matrix*YY*self.gamma
        z = z-self.bias
        return np.array(list(z))
def _predict(self, train_index, test_index):
    """
    Predict the hypo/normal/hyper label of every test row with one SVM per meal type.

    :param train_index: list with the index of the models data used in the algorithm of SVM.
    :param test_index: list with the index to predict.
    :return: tuple with (label_prediction, label_score, word_prediction, word_score, bin_predictions)
    of test_index made by:
        - predict Label (hiper/hipo/normal, encoded 1/-1/0) with a one-vs-rest SVM
          fitted on the train rows of the same meal type
        - word prediction is currently disabled, so ``word_prediction`` and
          ``word_score`` are always empty lists

    Rows are processed meal type by meal type, so the outputs follow that
    grouping and not the order of ``test_index``.
    """
    classes = [-1, 0, 1]

    # models to predict (first and last columns are dropped)
    data_pred = self.loader.get_models(test_index, {'weight': self.weights})
    data_pred = data_pred.iloc[:, 1:-1]

    # Get models with words
    wdata = self.loader.get_models(train_index, {'weight': self.weights})

    # Get possible meal labels: those seen in training first, then any that
    # only occur in the rows to predict.
    meal_types = list(wdata.iloc[:, 3].unique())
    for d in data_pred.iloc[:, 2].unique():
        if d not in meal_types:
            meal_types.append(d)

    bin_chunks = []
    score_chunks = []

    # split models by type of meal
    for etiqueta_apat in meal_types:
        # Boolean-mask selection with .loc; DataFrame.ix no longer exists in pandas.
        mdata_pred = data_pred.loc[data_pred.iloc[:, 2] == etiqueta_apat]
        if mdata_pred.empty:
            # Nothing to predict for this meal type, so there is no need to fit.
            continue
        mwdata = wdata.loc[wdata.iloc[:, 3] == etiqueta_apat]

        # Get models hipo/hiper/norm labels
        ldata_labels = self.loader.get_labels_of_words(mwdata.iloc[:, -1].tolist())

        # Predict label
        clf = OneVsRestClassifier(SVC(kernel=self._kernel, C=self._C, gamma=self._gamma))
        y = label_binarize(ldata_labels, classes=classes)
        clf.fit(mwdata.iloc[:, 1:-1], y)

        bin_chunks.append(clf.predict(mdata_pred))
        score_chunks.append(clf.decision_function(mdata_pred))

    if bin_chunks:
        bin_labels = np.concatenate(bin_chunks, axis=0)
        score_label = np.concatenate(score_chunks, axis=0)
    else:
        bin_labels = []
        score_label = []

    res_word = []
    res_label = []
    score_word = []

    for row_bin, row_score in zip(bin_labels, score_label):
        # First class whose binary classifier fired.
        label = None
        for cls, flag in zip(classes, row_bin):
            if flag == 1:
                label = cls
                break

        # Compare against None: 0 ("normal") is a valid label and must not
        # trigger the fallback.
        if label is None:
            # No classifier fired: take the highest score (ties -> last class).
            maxscore = float('-inf')
            for cls, score in zip(classes, row_score):
                if score >= maxscore:
                    label = cls
                    maxscore = score

        res_label.append(label)

    return (res_label, score_label, res_word, score_word, bin_labels)
n_classes = Y.shape[1] # Split into training and test X_train, X_test, Y_train, Y_test = model_selection.train_test_split( doc_vec, Y, test_size=.3, random_state=10) # We use OneVsRestClassifier for multi-label prediction from sklearn.multiclass import OneVsRestClassifier # Run classifier classifier = OneVsRestClassifier(svm.SVC(kernel='linear', random_state=12)) #classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=25, random_state=1)) classifier.fit(Train_X, Y_train) #y_score = classifier.predict_proba(Test_X) y_score = classifier.decision_function(Test_X) # For each class precision = dict() recall = dict() average_precision = dict() for i in range(n_classes): precision[i], recall[i], _ = precision_recall_curve( Y_test[:, i], y_score[:, i]) average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i]) # A "micro-average": quantifying score on all classes jointly precision["micro"], recall["micro"], _ = precision_recall_curve( Y_test.ravel(), y_score.ravel()) average_precision["micro"] = average_precision_score(Y_test, y_score,
# In[41]:

# Per-class reports for the two previously fitted classifiers.
print(classification_report(y_test, mn_y_pred))
print(classification_report(y_test, svc_y_pred))


# In[38]:

from sklearn.metrics import average_precision_score, precision_recall_curve

# Refit the SVC one-vs-rest to get a score per class.
clf = OneVsRestClassifier(svc)
clf.fit(X_train, y_train)
y_score = clf.decision_function(X_test)

# Per-class curves and average precision, keyed by class index
# (plus "micro" below).
precision = {}
recall = {}
average_precision = {}
n_classes = y_bin.shape[1]

for i in range(n_classes):
    # Thresholds (third return value) are not needed.
    precision[i], recall[i] = precision_recall_curve(y_test[:, i], y_score[:, i])[:2]
    average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])

# Micro-average: every (sample, class) pair is one binary decision.
precision["micro"], recall["micro"] = precision_recall_curve(
    y_test.ravel(), y_score.ravel())[:2]
average_precision["micro"] = average_precision_score(
    y_test, y_score, average="micro")

print('Average precision score, micro-averaged over all classes: {0:0.2f}'.format(average_precision["micro"]))
def cross_validation(X, y, n_trials=5, trial_splits=None, fname=None):
    """Tune an RBF-kernel one-vs-rest SVM by nested validation and score it.

    Samples whose label row sums to zero are dropped first.  For each of the
    ``n_trials`` train/test splits, ``C`` and ``gamma`` are chosen by the
    median micro-AUPR over the inner splits returned by ``ml_split``; the
    model is then refit on the whole training part and evaluated on the test
    part with ``evaluate_performance``.

    Parameters
    ----------
    X : array
        Feature matrix (rows are samples).
    y : array
        Binary label matrix (rows are samples, columns are labels).
    n_trials : int
        Number of outer train/test splits to evaluate.
    trial_splits : sequence of (train_idx, test_idx), optional
        Pre-computed outer splits, indexed into the data *after* the
        unannotated samples were removed.  When omitted, ``ShuffleSplit``
        with a 20% test share generates them.
    fname : str, optional
        When given, the per-trial scores are written to this file.

    Returns
    -------
    dict
        Mean and ``std(...)`` of micro-AUPR, macro-AUPR, F1 and accuracy
        under the keys ``'m-aupr_avg'``, ``'m-aupr_std'``, ``'M-aupr_avg'``,
        ``'M-aupr_std'``, ``'F1_avg'``, ``'F1_std'``, ``'acc_avg'``,
        ``'acc_std'``.
    """
    # filter samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # The kernel matrix only depends on gamma, so compute it once per gamma
    # on the full data and slice it for every split below.
    print("### Pregenerating kernels...")
    K_rbf = {}
    for gamma in gamma_range:
        K_rbf[gamma] = rbf_kernel(X, gamma=gamma)
    print("### Done.")

    # performance measures
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    if trial_splits is None:
        # shuffle and split training and test sets
        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=None)
        trial_splits = list(trials.split(X))

    for jj in range(0, n_trials):
        train_idx = trial_splits[jj][0]
        test_idx = trial_splits[jj][1]
        y_train = y[train_idx]
        y_test = y[test_idx]
        print("### [Trial %d] Perfom cross validation...." % (jj + 1))
        print("Train samples=%d; #Test samples=%d" % (y_train.shape[0],
                                                      y_test.shape[0]))
        # setup for nested cross-validation
        splits = ml_split(y_train)

        # parameter fitting
        C_opt = None
        gamma_opt = None
        max_aupr = 0
        for C in C_range:
            for gamma in gamma_range:
                # Multi-label classification
                cv_results = []
                for train, valid in splits:
                    clf = OneVsRestClassifier(
                        svm.SVC(C=C, kernel='precomputed', probability=False),
                        n_jobs=-1)
                    K_train = K_rbf[gamma][train_idx[train], :][:, train_idx[train]]
                    K_valid = K_rbf[gamma][train_idx[valid], :][:, train_idx[train]]
                    y_train_t = y_train[train]
                    y_train_v = y_train[valid]
                    y_score_valid = np.zeros(y_train_v.shape, dtype=float)
                    y_pred_valid = np.zeros_like(y_train_v)
                    # Only labels with at least one positive training sample
                    # can be fitted; the remaining columns stay at zero.
                    idx = np.where(y_train_t.sum(axis=0) > 0)[0]
                    clf.fit(K_train, y_train_t[:, idx])
                    y_score_valid[:, idx] = clf.decision_function(K_valid)
                    y_pred_valid[:, idx] = clf.predict(K_valid)
                    perf_cv = evaluate_performance(y_train_v,
                                                   y_score_valid,
                                                   y_pred_valid)
                    cv_results.append(perf_cv['m-aupr'])
                cv_aupr = np.median(cv_results)
                print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" % (gamma, C, cv_aupr))
                # Always accept the first candidate so that C_opt/gamma_opt
                # are set even when no setting scores above zero.
                if C_opt is None or cv_aupr > max_aupr:
                    C_opt = C
                    gamma_opt = gamma
                    max_aupr = cv_aupr
        print("### Optimal parameters: ")
        print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
        print("### Train dataset: AUPR = %0.3f" % (max_aupr))
        print("### Using full training data...")
        clf = OneVsRestClassifier(
            svm.SVC(C=C_opt, kernel='precomputed', probability=False),
            n_jobs=-1)
        y_score = np.zeros(y_test.shape, dtype=float)
        y_pred = np.zeros_like(y_test)
        idx = np.where(y_train.sum(axis=0) > 0)[0]
        clf.fit(K_rbf[gamma_opt][train_idx, :][:, train_idx], y_train[:, idx])

        # Compute performance on test set
        K_test = K_rbf[gamma_opt][test_idx, :][:, train_idx]
        y_score[:, idx] = clf.decision_function(K_test)
        y_pred[:, idx] = clf.predict(K_test)
        perf_trial = evaluate_performance(y_test, y_score, y_pred)
        pr_micro.append(perf_trial['m-aupr'])
        pr_macro.append(perf_trial['M-aupr'])
        fmax.append(perf_trial['F1'])
        acc.append(perf_trial['acc'])
        print("### Test dataset: AUPR['micro'] = %0.3f, AUPR['macro'] = %0.3f, F1 = %0.3f, Acc = %0.3f" % (perf_trial['m-aupr'], perf_trial['M-aupr'], perf_trial['F1'], perf_trial['acc']))

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = std(acc)

    if fname is not None:
        # file.write() takes exactly one string, so format each row first.
        with open(fname, 'w') as fout:
            fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
            for row in zip(pr_micro, pr_macro, fmax, acc):
                fout.write("%0.5f %0.5f %0.5f %0.5f\n" % row)

    return perf
# K-fold evaluation fragment.  For every (train_index, test_index) pair from
# kf.split(X) it builds the train/test features and labels with
# fromIndexToFeatures / fromIndexToLabels, fits clf, stores the
# decision_function rows in test22 and the predictions in total_y_pred, and
# finally prints a confusion matrix and a classification report of
# total_y_test against total_y_pred.
# NOTE(review): total_y_test is extended here but is not initialised in this
# fragment - presumably it is created just above; confirm.
# NOTE(review): test11 first collects the per-fold test labels, but is then
# overwritten by label_binarize(y, classes=[0..7]); its rows therefore follow
# the order of y, not the fold order of test22 - confirm this is intended.
total_y_pred = [] test11 = [] test22 = [] for train_index, test_index in kf.split(X): # print(train_index,test_index) # print("_") train_X = fromIndexToFeatures(X, train_index) train_y = fromIndexToLabels(y, train_index) test_X = fromIndexToFeatures(X, test_index) test_y = fromIndexToLabels(y, test_index) test11.extend(test_y) clf.fit(train_X, train_y) score = clf.decision_function(test_X) for i in score: test22.append(i) y_pred = clf.predict(test_X) total_y_test.extend(test_y) total_y_pred.extend(y_pred) print('done') # print(train_X) # print() test11 = np.asarray(test11) test22 = np.asarray(test22) test11 = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7]) print(confusion_matrix(total_y_test, total_y_pred)) print(classification_report(total_y_test, total_y_pred))
def temporal_holdout(X, y, indx, bootstrap, fname, goterms=None, go_fname=None):
    """Tune and evaluate an RBF-kernel one-vs-rest SVM on fixed splits.

    ``C`` and ``gamma`` are chosen by micro-AUPR on the validation split;
    the selected model is refit on the training split, applied to the test
    split, and scored once per bootstrap index set.

    Parameters
    ----------
    X : array
        Feature matrix, indexed by the entries of ``indx``.
    y : mapping
        Holds the label matrices under ``'train'``, ``'test'`` and
        ``'valid'`` (each is unwrapped with ``.tolist()``).
    indx : mapping
        Holds the row indices of the three splits under the same keys.
    bootstrap : sequence
        Each element indexes rows of the test split for one bootstrap round.
    fname : str
        File receiving one line of scores per bootstrap round.
    goterms : mapping, optional
        ``goterms['terms']`` lists the label names.  When omitted, no
        per-term results are collected.
    go_fname : str, optional
        File receiving the per-term results.

    Returns
    -------
    dict
        Mean and ``std(...)`` of micro-AUPR, macro-AUPR, F1 and accuracy
        over the bootstrap rounds.
    """
    X_train = X[indx['train'].tolist()]
    X_test = X[indx['test'].tolist()]
    X_valid = X[indx['valid'].tolist()]
    y_train = y['train'].tolist()
    y_test = y['test'].tolist()
    y_valid = y['valid'].tolist()
    # An absent term list means "no per-term bookkeeping" instead of a crash
    # on len(None) further down.
    goterms = goterms['terms'].tolist() if goterms is not None else []

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels (test/valid kernels are taken against the
    # training samples, as required by kernel='precomputed')
    print("### Pregenerating kernels...")
    K_rbf_train = {}
    K_rbf_test = {}
    K_rbf_valid = {}
    for gamma in gamma_range:
        K_rbf_train[gamma] = rbf_kernel(X_train, gamma=gamma)
        K_rbf_test[gamma] = rbf_kernel(X_test, X_train, gamma=gamma)
        K_rbf_valid[gamma] = rbf_kernel(X_valid, X_train, gamma=gamma)
    print("### Done.")
    print("Train samples=%d; #Test samples=%d" % (y_train.shape[0],
                                                  y_test.shape[0]))

    # parameter fitting
    C_opt = None
    gamma_opt = None
    max_aupr = 0
    for C in C_range:
        for gamma in gamma_range:
            # Multi-label classification
            clf = OneVsRestClassifier(
                svm.SVC(C=C, kernel='precomputed', probability=False),
                n_jobs=-1)
            clf.fit(K_rbf_train[gamma], y_train)
            y_score_valid = clf.decision_function(K_rbf_valid[gamma])
            y_pred_valid = clf.predict(K_rbf_valid[gamma])
            perf = evaluate_performance(y_valid, y_score_valid, y_pred_valid)
            micro_aupr = perf['m-aupr']
            print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" % (gamma, C, micro_aupr))
            # Always accept the first candidate so that C_opt/gamma_opt are
            # set even when no setting scores above zero.
            if C_opt is None or micro_aupr > max_aupr:
                C_opt = C
                gamma_opt = gamma
                max_aupr = micro_aupr
    print("### Optimal parameters: ")
    print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
    print("### Train dataset: AUPR = %0.3f" % (max_aupr))
    print("### Computing performance on test dataset...")
    clf = OneVsRestClassifier(
        svm.SVC(C=C_opt, kernel='precomputed', probability=False),
        n_jobs=-1)
    clf.fit(K_rbf_train[gamma_opt], y_train)

    # Compute performance on test set
    y_score = clf.decision_function(K_rbf_test[gamma_opt])
    y_pred = clf.predict(K_rbf_test[gamma_opt])

    # performance measures for bootstrapping
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    # individual goterms
    pr_goterms = {}
    for term in goterms:
        pr_goterms[term] = []

    for ind in bootstrap:
        perf_ind = evaluate_performance(y_test[ind], y_score[ind], y_pred[ind])
        pr_micro.append(perf_ind['m-aupr'])
        pr_macro.append(perf_ind['M-aupr'])
        fmax.append(perf_ind['F1'])
        acc.append(perf_ind['acc'])
        # evaluate_performance is read by integer label index here.
        for i, term in enumerate(goterms):
            pr_goterms[term].append(perf_ind[i])

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = std(acc)

    # trials: file.write() takes exactly one string, so format each row.
    with open(fname, 'w') as fout:
        fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
        for row in zip(pr_micro, pr_macro, fmax, acc):
            fout.write("%0.5f %0.5f %0.5f %0.5f\n" % row)

    # write performance on individual GO terms: one line per term holding
    # the term, its share of positive training samples, and its score in
    # every bootstrap round, separated by single spaces.
    if go_fname is not None:
        with open(go_fname, 'w') as fout:
            fout.write("GO_id, AUPRs\n")
            for i, term in enumerate(goterms):
                positive_share = sum(y_train[:, i]) / float(y_train.shape[0])
                fields = [term, positive_share] + pr_goterms[term]
                fout.write(" ".join(str(field) for field in fields) + "\n")

    return perf
class TextClassifier:
    """Multi-label text classifier: TF-IDF features + one-vs-rest LinearSVC.

    Decision values are mapped to pseudo-probabilities with a sigmoid
    ``1 / (1 + exp(a * d + b))`` whose parameters ``a``/``b`` (``platt_a``,
    ``platt_b``) are stored with the model.
    """

    def __init__(self):
        self.vectorizer = None
        self.clf = None
        self.doc_ids = None
        self.label2id = None
        self.id2label = None
        self.platt_a = None
        self.platt_b = None
        self.dist_max = None
        self.dist_min = None

    def _get_label_dicts(self, labels):
        """
        Create dictionaries mapping labels to integers 0 to n-1, in which n
        is the number of unique (whitespace-stripped) labels encountered in
        the given list of label lists.
        :param labels: (list)
        """
        # Sort so that the label -> id assignment is reproducible between
        # runs instead of depending on set iteration order.
        unique_labels = sorted({l.strip() for ls in labels for l in ls})
        self.label2id = {l: i for i, l in enumerate(unique_labels)}
        self.id2label = {i: l for l, i in self.label2id.items()}

    def _file_save(self, path, filename, platt_a, platt_b, dist_max, dist_min):
        """
        Write the vectorizer, the classifier and a JSON metadata file.
        :param path: (string) prefix the file names are appended to
        :param filename: (str)
        :param platt_a: (float)
        :param platt_b: (float)
        :param dist_max: (float)
        :param dist_min: (float)
        """
        with open(path + '{0}_vec.pkl'.format(filename), 'wb') as f:
            dill.dump(self.vectorizer, f)
        with open(path + '{0}_clf.pkl'.format(filename), 'wb') as f:
            dill.dump(self.clf, f)
        with open(path + '{0}.json'.format(filename), 'w') as f:
            d = {
                'classifier_name': '{0}_clf.pkl'.format(filename),
                'vectorizer_name': '{0}_vec.pkl'.format(filename),
                'save_datetime': str(datetime.now()),
                'parameters': {
                    'PlattA': str(platt_a),
                    'PlattB': str(platt_b),
                    # tobytes() yields the same bytes as the deprecated
                    # tostring() alias used previously.
                    'DistMaximum': str(dist_max.tobytes()),
                    'DistMinimum': str(dist_min.tobytes()),
                    'DocumentIDs': self.doc_ids,
                    'Labels2IDs': self.label2id
                }
            }
            # The metadata is deliberately stored as a JSON-encoded string
            # (double encoding); _load_from_file decodes it twice, and files
            # written earlier use the same layout.
            json.dump(json.dumps(d), f, indent=4)

    def _load_from_file(self, path, filename):
        """
        Restore the state written by ``_file_save``.
        :param path: (string)
        :param filename: (string)
        """
        import ast

        with open(path + '{0}.json'.format(filename), 'r') as f:
            metadata = json.loads(json.load(f))
        # NOTE: dill.load executes arbitrary code from the file; only load
        # model files from trusted sources.
        with open(path + '{0}'.format(metadata['classifier_name']), 'rb') as f:
            self.clf = dill.load(f)
        with open(path + '{0}'.format(metadata['vectorizer_name']), 'rb') as f:
            self.vectorizer = dill.load(f)
        parameters = metadata['parameters']
        self.platt_a = float(parameters['PlattA'])
        self.platt_b = float(parameters['PlattB'])
        # The byte strings were stored via str(bytes); literal_eval parses
        # that literal back without evaluating arbitrary expressions.
        self.dist_max = np.frombuffer(
            ast.literal_eval(parameters['DistMaximum'])).copy()
        self.dist_min = np.frombuffer(
            ast.literal_eval(parameters['DistMinimum'])).copy()
        self.doc_ids = parameters['DocumentIDs']
        self.label2id = parameters['Labels2IDs']
        self.id2label = {i: l.strip() for l, i in self.label2id.items()}

    def _predict_multi(self, documents, output_positive_score=False):
        """
        Returns label guesses (with sigmoid-scaled scores) for each document.
        :param documents: (list)
        :param output_positive_score: (bool, False by default) when True,
            only labels whose decision value for that document is positive
            are returned
        :return: one list per document of (label, score) tuples, sorted by
            descending score
        """
        doc_vectors = self.vectorizer.transform(documents)
        decisions = self.clf.decision_function(doc_vectors)
        # Fall back to fixed sigmoid parameters when none were fitted/loaded.
        a = self.platt_a if self.platt_a is not None else -5.
        b = self.platt_b if self.platt_b is not None else 1.
        pdf = 1. / (1. + np.exp(a * decisions + b))
        assert isinstance(pdf, np.ndarray)
        classes = [str(self.id2label[i]) for i in range(pdf.shape[1])]
        class_array = np.array(classes)
        predictions = []
        for doc_decisions, ps in zip(decisions, pdf):
            if output_positive_score:
                # Use this document's own decision values for the mask
                # (previously the first document's mask was applied to all).
                positive = doc_decisions > 0
                zp = zip(class_array[positive].tolist(),
                         [float(x) for x in ps[positive]])
            else:
                zp = zip(classes, [float(x) for x in ps])
            predictions.append(sorted(zp, reverse=True, key=lambda x: x[1]))
        return predictions

    def label_vectorizer(self, labels):
        """
        Turn a list of label lists into an equivalent binarized array.
        :param labels: (list)
        :return: (ndarray)
        """
        label_ids = [[self.label2id[l.strip()] for l in ls] for ls in labels]
        return MultiLabelBinarizer(
            classes=range(len(self.label2id))).fit_transform(label_ids)

    def train(self, documents, labels, identifiers):
        """
        Fits vectorizer and classifier
        :param documents: (list)
        :param labels: (list)
        :param identifiers: (list)
        """
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1,
                                          tokenizer=lemma_tokenizer)
        self.clf = OneVsRestClassifier(LinearSVC(random_state=0))
        self._get_label_dicts(labels)
        self.doc_ids = identifiers
        x = self.vectorizer.fit_transform(documents)
        y = self.label_vectorizer(labels)
        self.clf.fit(x, y)

    def predict(self, documents):
        """
        Returns the top-ranked prediction for each document.
        :param documents: (list)
        :return: (ndarray)
        """
        prediction = self._predict_multi(documents)
        return np.array(prediction)[:, 0]

    def grid_predict(self, documents, platt_a, platt_b, low_memory=False):
        """
        Returns label guesses (with sigmoid-scaled scores) for each document.

        This function is only executed when grid searches for the parameters
        of Platt's posterior probability bootstrapping algorithm are being
        performed. Otherwise predict_multi is run.
        :param documents: (list)
        :param platt_a: Platt parameter A (float)
        :param platt_b: Platt parameter B (float)
        :param low_memory: (bool)
        :return: ndarray if low_memory is True, list if low_memory is False
        """
        decisions = self.decision_function(documents)
        if low_memory:
            # Half precision and in-place update keep the score matrix small.
            pdf = np.exp(platt_a * decisions + platt_b).astype(np.float16)
            pdf += 1.
            return 1. / pdf
        pdf = 1. / (1. + np.exp(platt_a * decisions + platt_b))
        assert isinstance(pdf, np.ndarray)
        classes = [self.id2label[i] for i in range(pdf.shape[1])]
        predictions_bulk = []
        for ps in pdf:
            prediction = zip(classes, ps)
            prediction = sorted(prediction, reverse=True, key=lambda s: s[1])
            predictions_bulk.append(prediction)
        return predictions_bulk

    def decision_function(self, documents):
        """
        Returns the decision function values
        :param documents: (list)
        :return: (ndarray)
        """
        doc_vectors = self.vectorizer.transform(documents)
        return self.clf.decision_function(doc_vectors)

    def save(self, path, name, platt_a, platt_b, dist_max, dist_min,
             in_db=False):
        """
        Save the model under ``<name>_<uuid4>`` and return that file name.
        :param path: (string)
        :param name: (str)
        :param platt_a: (float)
        :param platt_b: (float)
        :param dist_max: (float)
        :param dist_min: (float)
        :param in_db: (bool) database storage is not implemented
        """
        file_name = '{0}_{1}'.format(name, uuid4())
        if not in_db:
            self._file_save(path, file_name, platt_a, platt_b, dist_max,
                            dist_min)
            return file_name
        else:
            raise NotImplementedError

    def load(self, path, name, in_db=False):
        """
        :param path: (string)
        :param name: (str)
        :param in_db: (bool) database storage is not implemented
        """
        if not in_db:
            self._load_from_file(path, name)
        else:
            raise NotImplementedError
modelsvm.fit(X_train, y_train)

# Report the hyper-parameter combination the search settled on.
print("Best Hyper Parameters:\n", modelsvm.best_params_)

y_pred = modelsvm.predict(X_test)
y_pred_train = modelsvm.predict(X_train)

# Binarize true and predicted labels over the five classes 0..4.
y_train1, y_pred_train1, y_pred1, y_test1 = [
    label_binarize(values, classes=[0, 1, 2, 3, 4])
    for values in (y_train, y_pred_train, y_pred, y_test)
]

auc_r2_rmse(y_train1, y_pred_train1, y_test1, y_pred1, "svm")

# Refit the best estimator one-vs-rest on the binarized labels to obtain
# per-class decision values for the ROC plots.
classifier_svm = OneVsRestClassifier(modelsvm.best_estimator_)
classifier_svm.fit(X_train, y_train1)
y_score = classifier_svm.decision_function(X_test)
y_score_train = classifier_svm.decision_function(X_train)

plot_roc_auc(y_score, y_test1, 'svm_auc_roc.png', 'SVM (test)')
plot_roc_auc(y_score_train, y_train1, 'svm_train_auc_roc.png', 'SVM (train)')

#############################################################################################
############# NN - 1 hidden layer ####################################################################
#############################################################################################

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
# First input line: number of labelled documents T and number of queries E.
T, E = map(int, input().split(' '))

# Each training record is two lines: the label ids, then the document text.
RawData = []
Labels = []
for _ in range(T):
    Labels.append(map(int, input().split(' ')))
    RawData.append(input())

# The E query documents follow, one per line.
Queries = [input() for _ in range(E)]

# Vectorize training and query text together so both share one vocabulary.
RawData.extend(Queries)
X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X

MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)

# The first T rows are the training documents, the rest are the queries.
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]
Clf = OneVsRestClassifier(LinearSVC(loss='l1', class_weight={1:100,0:1})).fit(XtfTrain, Yt)
Classes = list(MLB.classes_)

for xTest in XtfTest:
    scores = list(Clf.decision_function(xTest)[0])
    # Ascending sort by (score, class); the last ten entries are the best.
    ranked = sorted(zip(scores, Classes))
    lbls = [cls for (_, cls) in ranked[-10:]]
    lbls.reverse()
    print (' '.join([str(cls) for cls in lbls]))
# Python 2 fragment (print statements); it uses i, TIMES and l, so it
# presumably starts inside a resampling loop whose header is not part of
# this fragment - confirm.
# It builds OneVsRestClassifier(svm.SVC(kernel=multichannel_wrapper(2,
# chi_square_kernel), probability=True)), splits (x, y, tag) with
# train_test_split, fits on the training part and appends to l the percentage
# of test rows whose argmax of y_test equals the argmax of the decision
# values.  It then prints the formatted entries of l with their mean, scores
# all of x, and prints the same argmax-agreement percentage for the full data.
# The trailing `if True:` section imports matplotlib and the PR-curve metrics
# and fills precision[i] / recall[i] from precision_recall_curve per class.
# NOTE(review): n_classes is not defined in this fragment.
print '\rFitting %d/%d ' % (i, TIMES), sys.stdout.flush() # resampling classifier = OneVsRestClassifier( svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel), probability=True)) X_train, X_test, y_train, y_test = train_test_split(x, y, tag) y_score = classifier.fit(X_train, y_train).decision_function(X_test) l.append( float((y_test.argmax(1) == y_score.argmax(1)).sum()) / y_score.shape[0] * 100) print map(lambda x: '%.3f%%' % x, l), '=', np.mean(l) y_score = classifier.decision_function(x) print 'Test all = %.3f%%' % (float( (y.argmax(1) == y_score.argmax(1)).sum()) / y_score.shape[0] * 100) if True: import matplotlib.pyplot as plt from sklearn.metrics import precision_recall_curve from sklearn.metrics import average_precision_score # Compute Precision-Recall and plot curve precision = dict() recall = dict() average_precision = dict() for i in range(n_classes): precision[i], recall[i], _ = precision_recall_curve( y[:, i], y_score[:, i])
# Densifies X_train / X_test with toarray(), fits
# OneVsRestClassifier(LinearSVC(random_state=0)) on (X1, Y1) and computes the
# decision values for X2.  With 1-D scores the positive entries are turned
# into 0/1 indices; otherwise, per row, the classes whose score is > 0 are
# collected in buf, and when buf is empty the argmax class is used instead.
# NOTE(review): range(9) hard-codes the number of score columns - presumably
# the number of classes in class_set; confirm.
# NOTE(review): np.int is a removed alias in current NumPy releases (plain
# int is the replacement); Y3 and buf are not consumed within this fragment.
X1 = X_train.toarray() X2 = X_test.toarray() # X1 = X_train # X2 = X_test # clf = GaussianNB() # clf=SGDClassifier() clf=LinearSVC(random_state=0) # clf=RandomForestClassifier(n_estimators = 100) # clf=MultinomialNB() classif = OneVsRestClassifier(clf).fit(X1, Y1) class_set=classif.classes_ scores=classif.decision_function(X2) Y3=[] # predict=classif.predict(X2) if len(scores.shape) == 1: indices = (scores > 0).astype(np.int) else: for score in scores: buf=[] for i in range(9): if score[i]>0: buf.append(class_set[i]) if not buf: indices = np.argmax(score) buf.append(class_set[indices])
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import font_manager, rc

# Use the Malgun Gothic font file so the Korean plot titles render, and keep
# the minus sign displayable with that font.
font_props = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf")
font_name = font_props.get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

iris = load_iris()

# Compared with OneVsOneClassifier, this is faster but less accurate.
model_ovr = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
model_ovr.fit(iris.data, iris.target)

# Top panel: one decision-function curve per class.
ax1 = plt.subplot(211)
decision_frame = pd.DataFrame(model_ovr.decision_function(iris.data))
decision_frame.plot(ax=ax1, legend=True)
plt.title("판별함수")

# Bottom panel: the predicted class of every sample as unconnected markers.
ax2 = plt.subplot(212)
prediction_frame = pd.DataFrame(model_ovr.predict(iris.data),
                                columns=["prediction"])
prediction_frame.plot(marker='o', ls='', ax=ax2)
plt.title('클래스 판별')

plt.tight_layout()
plt.show()
# Python 2 fragment (print statements).
# When FOLD_CV is set: runs cv.cross_val_score(clf, X, y, cv=5,
# scoring="roc_auc") and prints the number of folds and the mean score.
# When PLOT_RESULTS is set: holds out 30% of the data (random_state=0), fits
# clf on the rest and passes the decision values of the held-out part to
# plot_roc, writing to figures/roc-curve-tis-<tissues joined by "-"><is_extra>.png.
# NOTE(review): is_extra is a bool, so the file name ends in "True.png" or
# "False.png" - confirm that this is the intended naming.
# NOTE(review): time.clock() no longer exists in Python 3.8+; relevant if this
# script is ever ported.
if FOLD_CV: print "Performing 5-fold cv" scores = cv.cross_val_score( clf, X, y, cv=5, scoring="roc_auc" ) print "%d-fold cv, average auRoc %f" % (len(scores), scores.mean()) if PLOT_RESULTS: X_train, X_test, y_train, y_test = cv.train_test_split( X, y, test_size=0.3, random_state=0 ) clf.fit(X_train, y_train) print "Plotting results" y_scores = clf.decision_function(X_test) tname = "-".join(tissues) is_extra = brain_feats is not None or limb_feats is not None or heart_feats is not None plot_roc( y_test, y_scores, "ROC Tissue", out="figures/roc-curve-tis-%s%s.png" % (tname, is_extra) ) # plot_precision_recall(y_true, y_scores) # plot_2d_results(X_test, y_test, clf.predict(X_test)) print "Done plotting" end = time.clock()
class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.

    KOMD is a kernel method for classification and ranking. Read more in
    http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.

    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.

    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed' or a
        callable. If None is given, 'rbf' will be used. If a callable is
        given it is used to compute the kernel matrix from data matrices.
        With 'precomputed', the matrices passed to `fit`, `predict` and
        `decision_function` are used as kernel matrices directly.

    rbf_gamma : float, optional (default=0.1)
        Coefficient for 'rbf' and 'poly' kernels. Ignored by all other
        kernels.

    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel. Ignored by all other
        kernels.

    coef0 : float, optional (default=0.0)
        Specifies the coef0 in a polynomial kernel. Ignored by all other
        kernels.

    max_iter : int, optional (default=100)
        Hard limit on iterations within solver, it can't be negative.

    verbose : bool, (default=False)
        Enable verbose output during fit.

    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used in case of multiclass.
        'ova' for one_vs_all pattern (also called one_vs_rest),
        'ovo' for one_vs_one pattern.
        With other unexpected string, 'ova' pattern is used.

    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        probability-like vector that defines the distance vector over the
        two classes.

    classes_ : array-like, shape = [n_classes]
        Vector that contains all possible labels.

    multiclass_ : boolean,
        True if the number of classes > 2

    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1,2,i] for i in range(5)])
    >>> Y = np.array([1,1,1,-1,-1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X,Y)
    >>> pred = cls.predict([[1,1,5]])

    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """

    def __init__(self, lam = 0.1, kernel = 'rbf', rbf_gamma = 0.1, degree = 2.0, coef0 = 0.0, max_iter = 100, verbose = False, multiclass_strategy = 'ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel=='precomputed'

    def __kernel_definition__(self):
        """Select the kernel function

        Returns
        -------
        kernel : a callable relative to selected kernel

        Raises
        ------
        ValueError
            If `kernel` is neither callable nor one of the supported names.
        """
        if hasattr(self.kernel, '__call__'):
            return self.kernel
        if self.kernel == 'rbf' or self.kernel is None:
            return lambda X,Y : rbf_kernel(X,Y,self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X,Y : polynomial_kernel(X, Y, degree=self.degree, gamma=self.rbf_gamma, coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X,Y : linear_kernel(X,Y)
        if self.kernel == 'precomputed':
            # The data passed in already is the kernel matrix.
            return lambda X,Y : X
        # Fail here with a clear message instead of returning None and
        # crashing later when the missing callable is invoked.
        raise ValueError("Unsupported kernel: %r" % (self.kernel,))

    def fit(self, X, Y):
        """Fit the model according to the given training data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_feature is the number of features

        Y : array-like, shape = [n_samples]
            array of the labels relative to X

        Returns
        -------
        self : object
            Returns self

        Raises
        ------
        ValueError
            If Y contains fewer than two distinct classes.
        """
        X,Y = validation.check_X_y(X, Y, dtype=np.float64, order='C', accept_sparse='csr')
        check_classification_targets(Y)

        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError("The number of classes has to be at least 2; got %d" % len(self.classes_))

        if len(self.classes_) == 2:
            self.multiclass_ = False
            return self._fit(X,Y)

        # More than two classes: delegate to a meta-estimator built from
        # binary KOMD instances with the same parameters.
        self.multiclass_ = True
        if self.multiclass_strategy == 'ovo':
            return self._one_vs_one(X,Y)
        return self._one_vs_rest(X,Y)

    def _one_vs_one(self,X,Y):
        """Fit one binary KOMD per pair of classes."""
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self

    def _one_vs_rest(self,X,Y):
        """Fit one binary KOMD per class against all the others."""
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self

    def _fit(self,X,Y):
        """Solve the binary KOMD quadratic program with cvxopt."""
        self.X = X
        values = np.unique(Y)
        # Map the larger class value to +1 and the other one to -1.
        Y = [1 if l==values[1] else -1 for l in Y]
        self.Y = Y
        npos = len([1.0 for l in Y if l == 1])
        nneg = len([1.0 for l in Y if l == -1])
        YY = matrix(np.diag(list(matrix(Y))))

        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X,X).astype(np.double))
        KLL = (1.0-self.lam)*YY*ker_matrix*YY
        LID = matrix(np.diag([self.lam * (npos * nneg / (npos+nneg))]*len(Y)))
        Q = 2*(KLL+LID)
        p = matrix([0.0]*len(Y))
        # G x <= h encodes gamma >= 0.
        G = -matrix(np.diag([1.0]*len(Y)))
        h = matrix([0.0]*len(Y),(len(Y),1))
        # A x = b forces the weights of each class to sum to one.
        A = matrix([[1.0 if lab==+1 else 0 for lab in Y],[1.0 if lab2==-1 else 0 for lab2 in Y]]).T
        b = matrix([[1.0],[1.0]],(2,1))

        # NOTE: solvers.options is shared, process-wide cvxopt state.
        solvers.options['show_progress'] = False
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q,p,G,h,A,b)
        self.gamma = sol['x']
        if self.verbose:
            print ('[KOMD]')
            print ('optimization finished, #iter = %d' % sol['iterations'])
            print ('status of the solution: %s' % sol['status'])
            print ('objval: %.5f' % sol['primal objective'])

        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self

    def predict(self, X):
        """Perform classification on samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples

        Returns
        -------
        y_pred : array, shape = [n_samples]
            The value of prediction for each sample

        Raises
        ------
        NotFittedError
            If `fit` has not been called yet.
        """
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        if self.multiclass_:
            return self.cls.predict(X)
        # Non-negative decision values map to the class encoded as +1.
        return np.array([self.classes_[1] if p >=0 else self.classes_[0] for p in self.decision_function(X)])

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"lam": self.lam, "kernel": self.kernel, "rbf_gamma":self.rbf_gamma,
                "degree":self.degree, "coef0":self.coef0, "max_iter":self.max_iter,
                "verbose":self.verbose, "multiclass_strategy":self.multiclass_strategy}

    def set_params(self, **parameters):
        """Set the given parameters as attributes and return self."""
        for parameter, value in parameters.items():
            setattr(self,parameter,value)
        return self

    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        Z : array-like, shape = [n_samples, 1]
            Returns the decision function of the samples.

        Raises
        ------
        NotFittedError
            If `fit` has not been called yet.
        """
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

        if self.multiclass_:
            return self.cls.decision_function(X)

        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        ker_matrix = matrix(Kf(X,self.X).astype(np.double))
        z = ker_matrix*YY*self.gamma
        z = z-self.bias
        return np.array(list(z))
# run_experiment: node-classification experiment over a labelled graph.
#
# Steps visible in the code:
#   1. Load adjacency matrix, node-label matrix, labelled node indices and
#      category count with the reader selected by dataset_name
#      ("snow2014", "flickr", "youtube", "politicsuk"); any other name prints
#      a message and raises RuntimeError.
#   2. Run feature_extraction(...) and print its elapsed time.
#   3. For "chi2" / "ivf" community weighting, apply normalize_columns to the
#      features; None leaves them as they are; any other value prints a
#      message and raises RuntimeError.
#   4. For each entry of percentages, draw trial_num train/test folds from
#      generate_folds; per trial, fit
#      OneVsRestClassifier(LinearSVC(C=C, dual=False,
#      fit_intercept=fit_intercept), n_jobs=thread_num), score the test rows
#      with decision_function, convert the scores with
#      evaluation.form_node_label_prediction_matrix and store
#      measures[4] / measures[5] of evaluation.calculate_measures as the
#      macro / micro F1 of that trial.
#   5. Print the percentage and the mean and standard deviation of both F1
#      arrays.
#
# Parameters:
#   dataset_name, dataset_folder    - dataset selector and its location.
#   feature_extraction_method_name  - forwarded to feature_extraction.
#   percentages                     - array (its .size is used) of values
#                                     forwarded to generate_folds.
#   trial_num, thread_num           - number of trials; worker count for
#                                     feature extraction and the classifier.
#   feature_extraction_parameters   - dict; "community_weighting" is read here.
#   classifier_parameters           - dict; "C" and "fit_intercept" are read.
# Returns nothing; results are printed.
#
# NOTE(review): np.float is a removed alias in current NumPy releases; the
# np.zeros(..., dtype=np.float) calls need dtype=float there.
# NOTE(review): indentation was lost in this copy, so it is not certain
# whether the `else:` that L2-normalizes X_train / X_test pairs with the
# "chi2" test or with `if issparse(feature_matrix)` - confirm against the
# original file.
def run_experiment(dataset_name, dataset_folder, feature_extraction_method_name, percentages, trial_num, thread_num, feature_extraction_parameters, classifier_parameters): if dataset_name == "snow2014": adjacency_matrix,\ node_label_matrix,\ labelled_node_indices,\ number_of_categories = read_snow2014graph_data(dataset_folder) elif dataset_name == "flickr": adjacency_matrix,\ node_label_matrix,\ labelled_node_indices,\ number_of_categories = read_asu_data(dataset_folder) elif dataset_name == "youtube": adjacency_matrix,\ node_label_matrix,\ labelled_node_indices,\ number_of_categories = read_asu_data(dataset_folder) elif dataset_name == "politicsuk": adjacency_matrix,\ node_label_matrix,\ labelled_node_indices,\ number_of_categories = read_insight_data(dataset_folder) else: print("Invalid dataset name.") raise RuntimeError print("Graphs and labels read.") feature_matrix,\ feature_extraction_elapsed_time = feature_extraction(adjacency_matrix, feature_extraction_method_name, thread_num, feature_extraction_parameters) print("Feature extraction elapsed time: ", feature_extraction_elapsed_time) if feature_extraction_parameters["community_weighting"] is None: pass elif feature_extraction_parameters["community_weighting"] == "chi2": feature_matrix = normalize_columns(feature_matrix) elif feature_extraction_parameters["community_weighting"] == "ivf": feature_matrix = normalize_columns(feature_matrix) else: print("Invalid community weighting selection.") raise RuntimeError C = classifier_parameters["C"] fit_intercept = classifier_parameters["fit_intercept"] for p in np.arange(percentages.size): percentage = percentages[p] # Initialize the metric storage arrays to zero macro_F1 = np.zeros(trial_num, dtype=np.float) micro_F1 = np.zeros(trial_num, dtype=np.float) folds = generate_folds(node_label_matrix, labelled_node_indices, number_of_categories, percentage, trial_num) for trial in np.arange(trial_num): train, test = next(folds) 
######################################################################################################## # Separate train and test sets ######################################################################################################## X_train, X_test, y_train, y_test = feature_matrix[train, :],\ feature_matrix[test, :],\ node_label_matrix[train, :],\ node_label_matrix[test, :] if issparse(feature_matrix): if feature_extraction_parameters["community_weighting"] == "chi2": contingency_matrix = chi2_contingency_matrix(X_train, y_train) community_weights = peak_snr_weight_aggregation(contingency_matrix) X_train, X_test = community_weighting(X_train, X_test, community_weights) else: X_train = normalize(X_train, norm="l2") X_test = normalize(X_test, norm="l2") ############################################################################################################ # Train model ############################################################################################################ # Train classifier. 
start_time = time.time() model = OneVsRestClassifier(svm.LinearSVC(C=C, random_state=None, dual=False, fit_intercept=fit_intercept), n_jobs=thread_num) model.fit(X_train, y_train) hypothesis_training_time = time.time() - start_time print('Model fitting time: ', hypothesis_training_time) ############################################################################################################ # Make predictions ############################################################################################################ start_time = time.time() y_pred = model.decision_function(X_test) prediction_time = time.time() - start_time print('Prediction time: ', prediction_time) ############################################################################################################ # Calculate measures ############################################################################################################ y_pred = evaluation.form_node_label_prediction_matrix(y_pred, y_test) measures = evaluation.calculate_measures(y_pred, y_test) macro_F1[trial] = measures[4] micro_F1[trial] = measures[5] # print('Trial ', trial+1, ':') # print(' Macro-F1: ', macro_F1[trial]) # print(' Micro-F1: ', micro_F1[trial]) # print('\n') ################################################################################################################ # Experiment results ################################################################################################################ print(percentage) print('\n') print('Macro F1 average: ', np.mean(macro_F1)) print('Micro F1 average: ', np.mean(micro_F1)) print('Macro F1 std: ', np.std(macro_F1)) print('Micro F1 std: ', np.std(micro_F1))
def _predict(self, train_index, test_index):
    """
    Predict a label and a word for every model in ``test_index``.

    :param train_index: list with the index of the models data used in the
        algorithm of SVM.
    :param test_index: list with the index to predict.
    :return: tuple with (label_prediction, label_score, word_prediction,
        word_score, bin_predictions) of test_index made by:
        - predict Label (hiper/hipo/normal) with SVM (using all
          trainindex)
        - predict word with KNN using only hiper/hipo/normal trainindex
          depending on the label predicted
    """
    label_values = (-1, 0, 1)

    # Models to predict; the first and last columns are not features.
    data_pred = self.loader.get_models(test_index, {'weight': self.weights})
    data_pred = data_pred.iloc[:, 1:-1]
    # Training models; their last column is what get_labels_of_words maps
    # to a hipo/hiper/norm label.
    wdata = self.loader.get_models(train_index, {'weight': self.weights})
    ldata_labels = self.loader.get_labels_of_words(
        wdata.iloc[:, -1].tolist())

    # Predict label
    svm = OneVsRestClassifier(
        SVC(kernel=self._kernel, C=self._C, gamma=self._gamma))
    y = label_binarize(ldata_labels, classes=list(label_values))
    svm.fit(wdata.iloc[:, 1:-1], y)
    bin_labels = svm.predict(data_pred)
    score_label = svm.decision_function(data_pred)

    # Predict word using only vocabulary of label predicted
    res_word = []
    res_label = []
    score_word = []
    predicters = {}  # one fitted KNN per label, built on first use
    for bin_row, score_row in zip(bin_labels, score_label):
        # Take the first label whose binary indicator is set.
        label = None
        for column, candidate in enumerate(label_values):
            if bin_row[column] == 1:
                label = candidate
                break
        # Compare against None explicitly: 0 is a valid label and must not
        # trigger the fallback.
        if label is None:
            # No indicator set: fall back to the highest decision score
            # (">=" keeps the last label on ties).
            maxscore = float('-inf')
            for column, candidate in enumerate(label_values):
                if score_row[column] >= maxscore:
                    label = candidate
                    maxscore = score_row[column]
        res_label.append(label)

        if label not in predicters:
            # Work only with sessions of the label
            sessions = [
                session_id
                for session_id, session_label in zip(wdata.iloc[:, 0],
                                                     ldata_labels)
                if session_label == label
            ]
            models = wdata[wdata.id.isin(sessions)]
            knn = KNN(n_neighbors=5)
            knn.fit(models.iloc[:, 1:-1], models.iloc[:, -1])
            predicters[label] = knn
        knn = predicters[label]

        # NOTE(review): the KNN is evaluated on every test row, not only on
        # the row being processed, so each entry holds predictions for the
        # whole test set -- confirm this is intended.
        res_word.append(knn.predict(data_pred))
        score_word.append(knn.predict_proba(data_pred))
    return (res_label, score_label, res_word, score_word, bin_labels)
# Binarize the labels so the task looks like a multi-label problem.
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Hold out half of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=random_state)

# OneVsRestClassifier fits one binary classifier per label column.
from sklearn.multiclass import OneVsRestClassifier

# Fit on the training half and score the test half.
base_estimator = svm.LinearSVC(random_state=random_state)
classifier = OneVsRestClassifier(base_estimator)
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)

###############################################################################
# The average precision score in multi-label settings
# ....................................................
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Per-class curves, keyed by class index.
precision = {}
recall = {}
average_precision = {}
for i in range(n_classes):
    class_truth = Y_test[:, i]
    class_score = y_score[:, i]
    precision[i], recall[i], _ = precision_recall_curve(class_truth,
                                                        class_score)
#saving the above data into a npz file, as a temporary storage so that we don't have to run the entire parsing #over and over again. outfile = TemporaryFile() #np.savez(outfile, X = X, Y=Y, X_t = X_t, Y_t = Y_t) outfile.seek(0) npzfile = np.load(outfile) X = npzfile['X'] Y = npzfile['Y'] X_t = npzfile['X_t'] Y_t = npzfile['Y_t']""" #model = OneVsRestClassifier(svm.SVC(kernel='linear',probability=True,random_state=0)).fit(X,Y) model2 = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X,Y) Y_score = model2.decision_function(X_t) #Y_pred = model2.predict(X_t) # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(Y_t[:, i], Y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) #plot ROC plt.figure() for i in range(n_classes): plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})' ''.format(i, roc_auc[i]))
y_val = np.array(encoded_labels_df_val)

# Define model
linsvm = LinearSVC(loss='hinge')
# multi_class='ovr',
# verbose=True,
# max_iter=1000)
model = OneVsRestClassifier(linsvm, n_jobs=-1)

# Use a wall-clock timer: process_time() only counts CPU time of this
# process, which under-reports work done by the n_jobs=-1 workers.
start = time.perf_counter()
model.fit(X_train, Y_train)
elapsed_fit = time.perf_counter() - start
print("Time to fit model (min):", elapsed_fit / 60)

start_predict = time.perf_counter()
y_pred = model.decision_function(x_val)
elapsed_predict = time.perf_counter() - start_predict
print("Time to predict (min):", elapsed_predict / 60)

# Evaluate: label ranking average precision of the decision scores
# against the validation labels.
y_true = y_val
LRAP = label_ranking_average_precision_score(y_true, y_pred)
print("LRAP:", LRAP)
print(y_pred[0:3])
# Binarize the 'Label' column and prepare ten stratified 90/10 splits.
Y_train_bin = lb.fit_transform(test_features['Label'])
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
numSplits = sss.get_n_splits(X_train, Y_train_bin)

# Per-split result containers.
precisionList = {}
recallList = {}
APList = {}
classifyReports = {}
preRecFSupports = {}

for j, (train_index, test_index) in enumerate(sss.split(X_train, Y_train_bin)):
    X_train_fold = X_train.iloc[train_index]
    X_test_fold = X_train.iloc[test_index]
    y_train_fold = Y_train_bin[train_index]
    y_test_fold = Y_train_bin[test_index]

    svc.fit(X_train_fold, y_train_fold)
    for num in range(3):
        estimator = svc.estimators_[num]
        print(f'Number of support vectors in the {num} class is {estimator.n_support_}')

    y_predict = svc.predict(X_test_fold)
    y_score = svc.decision_function(X_test_fold)
    # print(y_test_fold)
    # average precision score do not support multiple class.
    # average_precision = average_precision_score(y_test_fold, y_score)
    # print('Average precision-recall score: {0:0.2f}'.format(average_precision))

    # For each class
    precision = {}
    recall = {}
    average_precision = {}
    # we can get U,D,S information from here.
    for i in range(3):
        truth_i = y_test_fold[:, i]
        score_i = y_score[:, i]
        precision[i], recall[i], _ = precision_recall_curve(truth_i, score_i)
        average_precision[i] = average_precision_score(truth_i, score_i)

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_test_fold.ravel(), y_score.ravel())
print("Training SVM")
TIMES = 10
l = []  # test accuracy (percent) of every repetition
for i in range(TIMES):
    # Progress indicator rewritten in place; sys.stdout.write replaces the
    # Python 2-only "print ...," statement.
    sys.stdout.write('\rFitting %d/%d ' % (i, TIMES))
    sys.stdout.flush()
    # resampling
    classifier = OneVsRestClassifier(
        svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel),
                probability=True))
    X_train, X_test, y_train, y_test = train_test_split(x, y, tag)
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)
    hits = (y_test.argmax(1) == y_score.argmax(1)).sum()
    l.append(float(hits) / y_score.shape[0] * 100)
# Single-argument print() calls behave the same on Python 2 and Python 3.
print('%s = %s' % (['%.3f%%' % acc for acc in l], np.mean(l)))

# Score the complete data set with the classifier of the last repetition.
y_score = classifier.decision_function(x)
print('Test all = %.3f%%' % (
    float((y.argmax(1) == y_score.argmax(1)).sum()) / y_score.shape[0] * 100))

if True:
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import average_precision_score

    # Compute Precision-Recall and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y[:, i],
                                                            y_score[:, i])
        average_precision[i] = average_precision_score(y[:, i], y_score[:, i])
0, support_index[0]:support_index[1]] = clf.estimators_[0].dual_coef_ supports = clf.estimators_[0].support_vectors_ # create the alpha vector and support vector list by iterating through the estimators for i in range(1, classes): alpha_vector[ i, support_index[i]:support_index[i + 1]] = clf.estimators_[i].dual_coef_ supports = np.concatenate( (supports, clf.estimators_[i].support_vectors_)) num_classifiers = classes # this is the raw votes, test for equality here decision1 = clf.decision_function(X_test) # one vs one classification creates pairwise combinations of all classes as classifier # we need to create a classifier map for these pairwise combinations # we will also arrange the alphas accordingly, since the SVC function from sklearn is too optimized... elif class_type == 'ovo': # extract parameters from the SVC function alphas = clf.dual_coef_ supports = clf.support_vectors_ intercept = clf.intercept_ # create the indices for the alpha vector expansion support_index = np.concatenate(([0], np.cumsum(clf.n_support_))) # generate the combination maps for the alphas combo_map = np.zeros((classes, classes - 1, 2))
def _build_rbf_classifier(gamma, c):
    """Return an unfitted one-vs-rest RBF-kernel SVC for the given gamma and C."""
    return OneVsRestClassifier(
        SVC(kernel='rbf',
            gamma=gamma,
            C=c,
            degree=1,
            cache_size=5000,
            probability=True,
            class_weight='balanced'))


def _evaluate_split(clf, x, y_true, n_classes):
    """Return (accuracy, weighted F1, micro-averaged AUC) of ``clf`` on one split.

    The AUC compares the decision-function scores with the one-hot encoding
    of ``y_true`` over the classes ``0 .. n_classes - 1``.
    """
    y_pred = clf.predict(x)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    y_binary = label_binarize(y_true, classes=list(range(n_classes)))
    micro_auc = roc_auc_score(y_binary, clf.decision_function(x),
                              average='micro')
    return accuracy, f1, micro_auc


def _format_elapsed(seconds):
    """Format a duration in seconds as ``MM:SS:microseconds``.

    The conversion is pinned to UTC so the minutes field is not shifted by
    local time zones with a non-whole-hour offset.
    """
    stamp = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
    return stamp.strftime("%M:%S:%f")


def svm(i):
    """Tune and test a one-vs-rest RBF SVM on cross-validation split ``i``.

    Gamma is selected first (with C fixed to 1) and C second (with the
    selected gamma), both by accuracy on the validation split.  The final
    model is fitted on the training split and scored on the test split; its
    class probabilities and the encoded test labels are written as CSV files
    under ``./Prediction_202106_Ratio631/``.

    :param i: number of the cross-validation split, used in the input and
        output file names.
    :return: list ``[test_accuracy, test_f1, test_auc]``, each rounded to
        four decimals.
    """
    feature_dir = './CV_Features_631/ClassificationFeatures'
    # Read every split once; column 4 is used as the label and the columns
    # from index 9 onwards as the features.
    train_df = pd.read_csv(f'{feature_dir}/Train_CV_{i}.csv')
    validation_df = pd.read_csv(f'{feature_dir}/Validation_CV_{i}.csv')
    test_df = pd.read_csv(f'{feature_dir}/Test_CV_{i}.csv')
    train_x, train_y = train_df.iloc[:, 9:], train_df.iloc[:, 4]
    validation_x, validation_y = (validation_df.iloc[:, 9:],
                                  validation_df.iloc[:, 4])
    test_x, test_y = test_df.iloc[:, 9:], test_df.iloc[:, 4]

    # Fit the LabelEncoder on the training labels only and use it to encode
    # all three splits as integers 0 .. n_classes - 1.
    encoder = LabelEncoder().fit(train_y)
    n_classes = len(encoder.classes_)
    y_train = encoder.transform(train_y)
    y_valid = encoder.transform(validation_y)
    y_test = encoder.transform(test_y)

    # Standardize X: statistics are learned on the training split and then
    # only applied to the validation and test splits.
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(train_x)
    x_valid_std = scaler.transform(validation_x)
    x_test_std = scaler.transform(test_x)

    # ------------
    # Gamma
    # ------------
    accuracy_list_valid, f1_list_valid, auc_list_valid = [], [], []
    gamma_range = np.logspace(-10, 1, 10, base=2)
    logger.info(gamma_range)
    for idx, gamma in enumerate(tqdm(gamma_range)):
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Start Trainng {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Start Training {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        clf = _build_rbf_classifier(gamma, 1)
        clf.fit(x_train_std, y_train)

        # Validation: fine-tuning on the validation dataset
        accuracy_valid, f1_valid, auc_valid = _evaluate_split(
            clf, x_valid_std, y_valid, n_classes)
        accuracy_list_valid.append(accuracy_valid)
        f1_list_valid.append(f1_valid)
        auc_list_valid.append(auc_valid)

        message = f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        logger.info(message)
        print(message)
        print(_format_elapsed(time() - time0))

    # First gamma reaching the highest validation accuracy.
    best_idx = accuracy_list_valid.index(max(accuracy_list_valid))
    best_gamma = gamma_range[best_idx]
    best_acc = accuracy_list_valid[best_idx]
    best_f1 = f1_list_valid[best_idx]
    best_auc = auc_list_valid[best_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # C
    # ------------
    C = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    accuracy_list_C_valid, f1_list_C_valid, auc_list_C_valid = [], [], []
    for idx, c in enumerate(tqdm(C)):
        time0 = time()
        logger.info(
            f">>>>>>>CV = {i}/10, Fine-Tuining C, Start Trainng {idx + 1}/{len(C)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Fine-Tuining C, Start Training {idx + 1}/{len(C)}>>>>>>>"
        )
        clf = _build_rbf_classifier(best_gamma, c)
        clf.fit(x_train_std, y_train)

        # Validation: fine-tuning on the validation dataset
        accuracy_valid, f1_valid, auc_valid = _evaluate_split(
            clf, x_valid_std, y_valid, n_classes)
        accuracy_list_C_valid.append(accuracy_valid)
        f1_list_C_valid.append(f1_valid)
        auc_list_C_valid.append(auc_valid)

        message = f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        logger.info(message)
        print(message)
        print(_format_elapsed(time() - time0))

    # Report the scores of the selected C itself, not those of the earlier
    # gamma search.
    best_idx = accuracy_list_C_valid.index(max(accuracy_list_C_valid))
    best_c = C[best_idx]
    best_acc = accuracy_list_C_valid[best_idx]
    best_f1 = f1_list_C_valid[best_idx]
    best_auc = auc_list_C_valid[best_idx]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # Test: test dataset with the best gamma and C
    # ------------
    clf_best_test = _build_rbf_classifier(best_gamma, best_c)
    clf_best_test.fit(x_train_std, y_train)

    # accuracy & F1 & AUC on Test dataset
    test_accuracy, test_f1, test_auc = (
        round(score, 4)
        for score in _evaluate_split(clf_best_test, x_test_std, y_test,
                                     n_classes))
    message = f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    print(message)
    logger.info(message)

    # save the predicted class probabilities and the encoded ground truth
    result_test = clf_best_test.predict_proba(x_test_std)
    df = pd.DataFrame(result_test)
    df.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv"
    )
    df2 = pd.DataFrame(y_test)
    df2.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv"
    )

    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")
    return [test_accuracy, test_f1, test_auc]
def runsvm(set):
    """Train SVMs on the MNIST CSV data and return their metrics as JSON.

    The hyper-parameters ``kernel``, ``gamma``, ``penalty`` (used as C) and
    ``degree`` are read from the JSON body of the current request.  A
    multi-class ``SVC`` provides the predictions and summary scores; a
    one-vs-rest wrapper of the same estimator provides the per-class
    precision/recall curves.

    :param set: ``"test"`` to evaluate on the test files; any other value
        evaluates on the training files.
    :return: JSON string with the keys ``result``, ``f1_score``,
        ``confusion``, ``accuracy_score``, ``precision_score``,
        ``recall_score``, ``precision_curve`` and ``recall_curve``.
    """
    print("DataSet: " + set)
    print("parameters for SVM: " + json.dumps(request.json))
    _kernel = request.json['kernel']
    _gamma = request.json['gamma']
    _C = request.json['penalty']
    _degree = request.json['degree']

    nClasses = 10
    classes = np.array(range(nClasses))

    def load(name):
        # Header-less CSV files below the application's data directory.
        return pd.read_csv(os.path.join(root_dir(), 'data/' + name),
                           sep=',',
                           header=None)

    trainData = load('mnist_train.csv')
    # Flatten the single label column so the estimators get a 1-D target.
    trainLabel = load('mnist_train_label.csv').values.ravel()
    trainLabelBinary = label_binarize(trainLabel, classes=classes)

    svc_params = {'kernel': _kernel, 'C': _C, 'degree': _degree}
    # Forward the requested gamma to the estimator; an empty value keeps
    # the estimator's own default.
    if _gamma is not None and _gamma != "":
        svc_params['gamma'] = _gamma
    clt = svm.SVC(**svc_params)
    classifier = OneVsRestClassifier(clt)
    clt.fit(trainData, trainLabel)
    classifier.fit(trainData, trainLabelBinary)

    # Select the evaluation split; the test files are only read when needed.
    if set == "test":
        data = load('mnist_test.csv')
        labels = load('mnist_test_label.csv').values.ravel()
        labelsBinary = label_binarize(labels, classes=classes)
    else:
        data, labels, labelsBinary = trainData, trainLabel, trainLabelBinary

    pred = clt.predict(data)
    f1Score = f1_score(labels, pred, average=None)
    cm = confusion_matrix(labels, pred)
    accuracy = accuracy_score(labels, pred)
    precisionScore = precision_score(labels, pred, average=None)
    recallScore = recall_score(labels, pred, average=None)

    # One precision/recall curve per class from the one-vs-rest scores.
    dec = classifier.decision_function(data)
    precisionCurves = []
    recallCurves = []
    for i in range(nClasses):
        precision, recall, _ = precision_recall_curve(labelsBinary[:, i],
                                                      dec[:, i])
        precisionCurves.append(np.round(precision, 3).tolist())
        recallCurves.append(np.round(recall, 3).tolist())

    return json.dumps({
        "result": pred.tolist(),
        "f1_score": np.round(f1Score, 3).tolist(),
        "confusion": cm.tolist(),
        "accuracy_score": np.round(accuracy, 3).tolist(),
        "precision_score": np.round(precisionScore, 3).tolist(),
        "recall_score": np.round(recallScore, 3).tolist(),
        "precision_curve": precisionCurves,
        "recall_curve": recallCurves,
    })
n_estimators = 3
print("Developing SVM models....")
# Each bagged LinearSVC is trained on a 1/n_estimators sample of the data.
linear_svc = LinearSVC(class_weight='balanced', max_iter=100000)
bagged_svc = BaggingClassifier(linear_svc,
                               max_samples=1.0 / n_estimators,
                               n_estimators=n_estimators)
model3 = OneVsRestClassifier(bagged_svc)
print("Fitting SVM models....")
model3.fit(X_train, y_train)
dump(model3, "svm_model.joblib")
print("SVM - Saved!")
print()

# Predict probabilities, keeping the positive-outcome column for NB and RF;
# the SVM ensemble is scored through its decision function instead.
nb_probs = model1.predict_proba(X_test)[:, 1]
rf_probs = model2.predict_proba(X_test)[:, 1]
svm_probs = model3.decision_function(X_test)

# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
nb_auc = roc_auc_score(y_test, nb_probs)
rf_auc = roc_auc_score(y_test, rf_probs)
svm_auc = roc_auc_score(y_test, svm_probs)

nb_accuracy = accuracy_score(y_test, model1.predict(X_test))
print("NB - Accuracy: %f" % nb_accuracy)
print("NB - AUC score: %f" % nb_auc)
def getTrainAndTest(self):
    """Fit the decision-tree, KNN and SVM classifiers on the review data.

    Reads ``Review_Testing_Format.txt``, replaces ``'?'`` entries with
    ``-99999``, drops the ``id`` column and uses ``class`` as the target.
    Each classifier is fitted on a random 90% of the rows, stored on the
    instance (``clf_DTree``, ``clf_KNN``, ``clf_SVM``) and its accuracy on
    the remaining 10% is printed.  A one-vs-rest linear SVM is then fitted
    on a 50/50 split of the binarized labels.
    """
    # Local import: this block needs the KNN estimator in scope.
    from sklearn.neighbors import KNeighborsClassifier

    df = pd.read_csv('Review_Testing_Format.txt')
    df.replace('?', -99999, inplace=True)
    # Name the axis explicitly; pandas 2 no longer accepts it positionally.
    df.drop(['id'], axis=1, inplace=True)

    X = np.array(df.drop(['class'], axis=1))
    y = np.array(df['class'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10)

    # Built-In Decision Tree
    self.clf_DTree = tree.DecisionTreeClassifier()
    self.clf_DTree.fit(X_train, y_train)
    accuracy = self.clf_DTree.score(X_test, y_test)
    print("Accuracy in Decision Tree: %s" % accuracy)

    # Built-In K-Nearest Neighbour
    self.clf_KNN = KNeighborsClassifier()
    self.clf_KNN.fit(X_train, y_train)
    accuracy = self.clf_KNN.score(X_test, y_test)
    print("Accuracy in KNN: %s" % accuracy)

    # Built-In Support Vector Machine
    self.clf_SVM = svm.SVC()
    self.clf_SVM.fit(X_train, y_train)
    accuracy = self.clf_SVM.score(X_test, y_test)
    print("Accuracy in SVM: %s" % accuracy)

    # One-vs-rest scores on the binarized labels.
    Y = label_binarize(y, classes=['A', 'B', 'C'])
    n_classes = Y.shape[1]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=.5,
    )

    classifier = OneVsRestClassifier(svm.LinearSVC(random_state=None))
    classifier.fit(X_train, Y_train)
    y_score = classifier.decision_function(X_test)

    # For each class (only referenced by the disabled code below)
    precision = dict()
    recall = dict()
    average_precision = dict()
    '''
    for i in range(n_classes):
        average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

    average_precision["micro"] = average_precision_score(Y_test, y_score, average="micro")
    average_precision["macro"] = average_precision_score(Y_test, y_score, average="macro")
    average_precision["weighted"] = average_precision_score(Y_test, y_score, average="weighted")
    print('Average precision score, micro-averaged over all classes: {0:0.2f}'
          .format(average_precision["micro"]))

    recall["micro"] = recall_score(Y_test, y_score,average="micro")
    print('Recall score, micro over all classes: {0:0.2f}'
          .format(recall["micro"]))
    '''
'''
# The model is evaluated on its own training data: the TEST_* names are
# aliases of the TRAIN_* arrays.
# TEST_FEATURES = numpy.array(TEST_FEATURES)
TEST_FEATURES = TRAIN_FEATURES
TRAIN_LABELS = numpy.array(TRAIN_LABELS)
# TEST_LABELS = numpy.array(TEST_LABELS)
TEST_LABELS = TRAIN_LABELS
TRAIN_ATTRIBUTE = numpy.array(TRAIN_ATTRIBUTE)
# TEST_ATTRIBUTE = numpy.array(TEST_ATTRIBUTE)
TEST_ATTRIBUTE = TRAIN_ATTRIBUTE

classifier = OneVsRestClassifier(LinearSVC(C=2.0, random_state=0))
classifier.fit(TRAIN_FEATURES, TRAIN_ATTRIBUTE)
decision = classifier.decision_function(TEST_FEATURES)
prediction = classifier.predict(TEST_FEATURES)

# Count matching (pc) and mismatching (nc) entries over the first 22
# attribute columns of every sample.
pc = 0
nc = 0
for predicted_row, expected_row in zip(prediction, TEST_ATTRIBUTE):
    for j in range(22):
        if predicted_row[j] == expected_row[j]:
            pc += 1
        else:
            nc += 1

# Single-argument print() calls behave the same on Python 2 and Python 3.
print('%s %s' % (pc, nc))
print(classifier.score(TEST_FEATURES, TEST_ATTRIBUTE))

# Reset the accumulators.
TRAIN_LABELS = []
TRAIN_FEATURES = []
TRAIN_ATTRIBUTE = []
def oneVsAll(self, clf, idindiv=0, nbrot=5, test_size=0.33):
    """Evaluate ``clf`` one-vs-rest over repeated random train/test splits.

    The rows ``idindiv * kmByIndiv`` to ``(idindiv + 1) * kmByIndiv`` of
    ``self.countMat`` are transposed and normalized to form the samples;
    ``self.classname`` provides the labels.  Every rotation prints its
    micro-averaged average precision.  The curves of the last rotation are
    saved as ``average_precision_score.png`` and
    ``Precision-Recall_curve.png`` in ``self.savepath``, and the mean and
    standard deviation over all rotations are printed.

    :param clf: estimator wrapped in a ``OneVsRestClassifier``.
    :param idindiv: index of the individual whose block of rows is used.
    :param nbrot: number of random train/test rotations.
    :param test_size: test share handed to ``train_test_split``.
    """
    from itertools import cycle
    from sklearn.preprocessing import label_binarize

    first_row = idindiv * self.kmByIndiv
    data = self.countMat[first_row:first_row + self.kmByIndiv, :].T
    data = normalize(data, axis=1, copy=False)

    uniqClasname = np.unique(self.classname)
    Y = label_binarize(self.classname, classes=uniqClasname)
    n_classes = Y.shape[1]

    result = []
    precision, recall, average_precision = {}, {}, {}
    for _rotation in range(nbrot):
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
            data, Y, test_size=test_size)  # ,random_state=seed)
        classifier = OneVsRestClassifier(clf)
        classifier.fit(X_train, Y_train)
        y_score = classifier.decision_function(X_test)

        # For each class
        precision, recall, average_precision = {}, {}, {}
        for k in range(n_classes):
            # With a single label column the scores are used as returned,
            # without column indexing.
            class_score = y_score[:, k] if n_classes > 1 else y_score
            precision[k], recall[k], _ = precision_recall_curve(
                Y_test[:, k], class_score)
            average_precision[k] = average_precision_score(
                Y_test[:, k], class_score)

        # A "micro-average": quantifying score on all classes jointly
        precision["micro"], recall["micro"], _ = precision_recall_curve(
            Y_test.ravel(), y_score.ravel())
        average_precision["micro"] = average_precision_score(
            Y_test, y_score, average="micro")
        print(
            'Average precision score, micro-averaged over all classes: {0:0.2f}'
            .format(average_precision["micro"]))
        result.append(average_precision["micro"])

    # The figure file names do not depend on the rotation, so the figures
    # are drawn once, from the last rotation, and skipped when none ran.
    if result:
        plt.figure()
        plt.step(recall['micro'],
                 precision['micro'],
                 color='b',
                 alpha=0.2,
                 where='post')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title(
            'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
            .format(average_precision["micro"]))
        plt.savefig(
            os.path.join(self.savepath, "average_precision_score.png"))
        plt.close()

        # Plot Precision-Recall curve for each class and iso-f1 curves
        colors = cycle(
            ['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
        plt.figure(figsize=(7, 8))
        lines = []
        labels = []
        line = None
        for f_score in np.linspace(0.2, 0.8, num=4):
            x = np.linspace(0.01, 1)
            y = f_score * x / (2 * x - f_score)
            line, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
            plt.annotate('f1={0:0.1f}'.format(f_score),
                         xy=(0.9, y[45] + 0.02))
        # One legend entry stands for all iso-f1 curves.
        lines.append(line)
        labels.append('iso-f1 curves')

        line, = plt.plot(recall["micro"],
                         precision["micro"],
                         color='gold',
                         lw=2)
        lines.append(line)
        labels.append('micro-average Precision-recall (area = {0:0.2f})'
                      ''.format(average_precision["micro"]))

        for k, color in zip(range(n_classes), colors):
            line, = plt.plot(recall[k], precision[k], color=color, lw=2)
            lines.append(line)
            labels.append(
                'Precision-recall for class {0} (area = {1:0.2f})'
                ''.format(uniqClasname[k], average_precision[k]))

        fig = plt.gcf()
        fig.subplots_adjust(bottom=0.25)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Extension of Precision-Recall curve to multi-class')
        plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14))
        plt.savefig(
            os.path.join(self.savepath, "Precision-Recall_curve.png"))
        plt.close()

    print(
        "oneVsAll crossvalidation Average precision score, micro-averaged over all classes:"
    )
    # print(result)
    print("mean = ", np.mean(result), ", std = ", np.std(result))