def calculate_roc(truth, predictions):
    """Compute per-class, micro-averaged and macro-averaged ROC curves.

    Labels are taken from the last column of ``truth`` and ``predictions``
    (both indexed with ``.iloc``), cast to ``int`` and binarized against
    ``np.arange(n_classes)``. ``n_classes`` is read from module scope.

    Args:
        truth: table whose last column holds the true class ids.
        predictions: table whose last column holds the predicted class ids.

    Returns:
        tuple: ``(fpr, tpr, roc_auc)`` dictionaries keyed by class column
        index plus the string keys ``"micro"`` and ``"macro"``.
    """
    classes = np.arange(n_classes)
    # ``classes`` is passed by keyword; current scikit-learn releases reject
    # it as a positional argument.
    lb_truth = label_binarize(truth.iloc[:, -1].astype(int), classes=classes)
    lb_prediction = label_binarize(predictions.iloc[:, -1].astype(int), classes=classes)

    # Drive every loop from the binarized matrix itself so the per-class and
    # the macro-average passes always visit exactly the same columns.
    class_columns = list(range(lb_truth.shape[1]))

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in class_columns:
        fpr[i], tpr[i], _ = roc_curve(lb_truth[:, i], lb_prediction[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(lb_truth.ravel(), lb_prediction.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in class_columns]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in class_columns:
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= len(class_columns)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return fpr, tpr, roc_auc
def PersonWorker(person):
    """Fit a linear SVM for one person and report train/test AUC.

    The data comes from ``DL.loadPersonEpochDimRedu`` using the module-level
    ``featureFunc``. Predictions are binarized over classes 0-3 before being
    handed to ``UT.auc``.

    Returns:
        list: ``[train_auc, test_auc]``.
    """
    print('starting on person: ', str(person))

    # data = 40 videos x 32 alpha (csp channel)
    X_train, y_train, X_test, y_test = DL.loadPersonEpochDimRedu(
        person=person,
        featureFunc=featureFunc,
    )

    # http://stackoverflow.com/questions/26963454/lda-ignoring-n-components => only 1 feature :(
    print(np.shape(X_train))

    classifier = LinearSVC()
    classifier.fit(X_train, y_train)

    train_pred = label_binarize(classifier.predict(X_train), classes=[0, 1, 2, 3])
    train_auc = UT.auc(train_pred, y_train)

    test_pred = label_binarize(classifier.predict(X_test), classes=[0, 1, 2, 3])
    test_auc = UT.auc(test_pred, y_test)

    print('person: ', person, ' - train auc: ', str(train_auc), ' - test auc: ', str(test_auc))
    return [train_auc, test_auc]
def fit(self, X, y):
    """Train the network with the swarm optimisation loop.

    The data is split into training and validation parts, the targets are
    binarized, and the swarms are updated generation after generation until
    the slope of the recent best validation scores becomes non-negative or
    the best score drops below ``1e-3``.

    Sets ``self.paths``, ``self.W_swarms``, ``self.num_evals``, ``self.W``
    and ``self.num_generations``.

    Returns:
        self
    """
    self.init_params(X, y)
    self.paths = self.construct_paths()
    num = len(self.paths[0])
    swarm_paths = [
        sorted(list(set([s[i] for s in self.paths if s[i] is not None])))
        for i in range(num)
    ]
    W = self.init_network()
    self.W_swarms = [
        [[s for s in self.swarms if s.path[j] == i] for i in swarm_paths[j]]
        for j in range(num)
    ]

    X_train, X_valid, y_train, y_valid = cv.train_test_split(
        X, y, test_size=self.validation_size, random_state=self.random_state)

    # Binarize true values. ``classes`` is passed by keyword because current
    # scikit-learn releases reject it as a positional argument.
    if len(self.classes_) > 2:
        y_train = label_binarize(y_train, classes=self.classes_)
        y_valid = label_binarize(y_valid, classes=self.classes_)
    else:
        y_train = self.mlb.fit_transform(label_binarize(y_train, classes=self.classes_))
        y_valid = self.mlb.fit_transform(label_binarize(y_valid, classes=self.classes_))

    j = 0
    # Seed the score history with a decreasing ramp so the first regression
    # slopes are negative and the loop does not stop immediately.
    tmp = [1e3 - float(x * 1e3) / self.window for x in range(self.window)]
    window = deque(tmp, maxlen=(self.window * 5))
    self.num_evals = 0
    best_score = np.inf

    if self.verbose:
        print("Fitting network {0}-{1}-{2} with {3} paths".format(
            self.n_in, self.n_hidden, self.n_out, len(self.swarms)))

    while True:
        j += 1
        for s in self.swarms:
            for p_index in range(self.num_particles):
                self.num_evals += 1
                # evaluate each swarm
                score = s.evaluate(W, X_train, y_train, p_index)
                # reconstruct gvn
                Wn = self.reconstruct_gvn(W)
                # update
                s.update(self.w, self.c1, self.c2, p_index)
                # evaluate gvn
                y_pred = self.forward(Wn, X_valid)
                score = self.cost(y_valid, y_pred)
                if score < best_score:
                    W = Wn[:]
                    best_score = score

        window.append(best_score)
        # Slope of the last ``self.window`` best scores.
        r = linregress(range(self.window), list(window)[-self.window:])

        if self.verbose:
            print("%s %s" % (j, best_score))

        if r[0] >= 0 or best_score < 1e-3:
            self.W = W
            self.num_generations = j
            return self
def test_sensitivity_specificity_error_multilabels():
    """``sensitivity_score`` must raise ``ValueError`` on indicator-matrix input."""
    all_classes = np.arange(5)
    y_true_bin = label_binarize([1, 3, 3, 2], classes=all_classes)
    y_pred_bin = label_binarize([1, 1, 3, 2], classes=all_classes)

    with pytest.raises(ValueError):
        sensitivity_score(y_true_bin, y_pred_bin)
def __init__(self, file_path, number_features):
    """Load the dataset, split it 60/40 and store the binarized targets.

    The first ``number_features + 1`` columns of the loaded array are used
    as inputs and the following column as the class label (0, 1 or 2). The
    binarized targets are stored transposed, one row per class.
    """
    dataset = self.load_dataset(file_path, number_features)

    n_input_columns = number_features + 1
    inputs = dataset[:, :n_input_columns]
    targets = dataset[:, n_input_columns]

    split = train_test_split(inputs, targets, train_size=0.6)
    self.xs, self.xs_test, train_targets, test_targets = split

    self.ys = label_binarize(train_targets, classes=[0, 1, 2]).T
    self.ys_test = label_binarize(test_targets, classes=[0, 1, 2]).T

    self.m = self.xs.shape[0]
    self.test_set_size = self.xs_test.shape[0]
def getROCScore(X_train, y_train, X_test, y_test, classifierName, depth=None, Cvalue=1,alphaValue=0.0):
    """Train a one-vs-rest classifier and return its micro/macro ROC AUC.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        classifierName: one of ``'DecisionTree'``, ``'LogisticRegression'``,
            ``'LinearSVC'``, ``'NaiveBayes'`` or ``'Bagging'``.
        depth: ``max_depth`` for the decision tree.
        Cvalue: ``C`` for logistic regression / linear SVC.
        alphaValue: ``alpha`` for multinomial naive Bayes.

    Returns:
        tuple: ``(micro_auc, macro_auc, fitted_classifier)``.

    Raises:
        ValueError: if ``classifierName`` is not one of the supported names
            (previously this surfaced as an unbound-variable error).
    """
    # Single definition of the label set so train and test targets are
    # always binarized against the same columns.
    class_labels = [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                    38, 39, 40, 41, 42, 43, 44, 999]

    # Binarize the output
    y_train = label_binarize(y_train, classes=class_labels)
    n_classes = y_train.shape[1]
    y_test = label_binarize(y_test, classes=class_labels)

    # Learn to predict each class against the other
    if classifierName == 'DecisionTree':
        base = tree.DecisionTreeClassifier(max_depth=depth)
    elif classifierName == 'LogisticRegression':
        base = linear_model.LogisticRegression(C=Cvalue)
    elif classifierName == 'LinearSVC':
        base = LinearSVC(C=Cvalue)
    elif classifierName == 'NaiveBayes':
        base = MultinomialNB(alpha=alphaValue)
    elif classifierName == 'Bagging':
        # Passed positionally: the keyword was renamed from
        # ``base_estimator`` to ``estimator`` in newer scikit-learn.
        base = BaggingClassifier(tree.DecisionTreeClassifier())
    else:
        raise ValueError("Unknown classifierName: %r" % (classifierName,))
    classifier = OneVsRestClassifier(base)

    # NOTE(review): hard 0/1 predictions are used as scores here, so each
    # ROC curve has very few distinct thresholds.
    y_score = classifier.fit(X_train, y_train).predict(X_test)

    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    return (roc_auc["micro"], roc_auc["macro"], classifier)
def xval(clf, x, y, train_index, test_index):
    """Fit ``clf`` on one fold and score it on the held-out part.

    Args:
        clf: classifier exposing ``fit``, ``predict_proba``, ``classes_``
            and ``get_num_evals``.
        x, y: full feature and label arrays.
        train_index, test_index: index arrays selecting the fold.

    Returns:
        tuple: ``(mse, acc, evals)`` - mean squared error between the
        binarized labels and the predicted probabilities, accuracy of the
        arg-max prediction, and ``clf.get_num_evals()``.
    """
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)

    # ``classes`` is passed by keyword; current scikit-learn releases reject
    # it as a positional argument.
    y_true_bin = label_binarize(y_test, classes=clf.classes_)
    if len(clf.classes_) <= 2:
        # label_binarize yields a single column for two classes; expand it
        # so it lines up with the probability columns.
        y_true_bin = MultiLabelBinarizer().fit_transform(y_true_bin)
    mse = mean_squared_error(y_true_bin, y_pred)

    # The probability columns are compared against ``clf.classes_`` above,
    # so the arg-max column index must be mapped back through
    # ``clf.classes_`` before comparing with the raw labels.
    predicted_labels = [clf.classes_[k] for k in y_pred.argmax(axis=1)]
    acc = accuracy_score(y_test, predicted_labels)

    evals = clf.get_num_evals()
    return mse, acc, evals
def PR_multi_class(data_train, data_test, data_test_vectors):
    """Plot one precision-recall curve per class for a linear one-vs-rest SVC.

    Half of the training vectors are used to fit the classifier; the curves
    are computed on ``data_test_vectors`` against ``data_test.target``.

    NOTE(review): the training vectors come from ``data_train_vectors``,
    which is not a parameter of this function - presumably a module-level
    variable; confirm against the caller.

    Returns:
        int: always ``0``.
    """
    class_ids = [0, 1, 2]

    # Binarize the output
    y_train_label = label_binarize(data_train.target, classes=class_ids)
    n_classes = y_train_label.shape[1]

    rng = np.random.RandomState(0)

    # Shuffle and split; only the training half is used below.
    X_train, X_test, y_train, y_test = train_test_split(
        data_train_vectors, y_train_label, test_size=.5, random_state=rng)

    # Learn to predict each class against the other
    base_svc = svm.SVC(kernel='linear', probability=True, random_state=rng)
    classifier = OneVsRestClassifier(base_svc)
    classifier.fit(X_train, y_train)

    y_pred_score = classifier.decision_function(data_test_vectors)
    y_test_label = label_binarize(data_test.target, classes=class_ids)

    # Compute precision-recall curve and average precision for each class
    precision = {}
    recall = {}
    average_precision = {}
    for cls in range(n_classes):
        truth_col = y_test_label[:, cls]
        score_col = y_pred_score[:, cls]
        precision[cls], recall[cls], _ = precision_recall_curve(truth_col, score_col)
        average_precision[cls] = average_precision_score(truth_col, score_col)

    # Compute micro-average precision-recall curve and average precision
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_test_label.ravel(), y_pred_score.ravel())
    average_precision["micro"] = average_precision_score(
        y_test_label, y_pred_score, average="micro")

    # Plot Precision-Recall curve for each class (the micro-average curve is
    # computed above but intentionally not drawn).
    plt.clf()
    for cls in range(n_classes):
        curve_label = 'PR curve of class {0} (area = {1:0.2f})'.format(
            cls, average_precision[cls])
        plt.plot(recall[cls], precision[cls], label=curve_label)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve of multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return 0
def gensim_classifier():
    """Train a Word2Vec + one-vs-rest LinearSVC tweet classifier and evaluate it.

    Side effects: configures logging, writes the predictions to
    ``data/w2v_linsvc.csv``, stores the fitted classifier in
    ``model/w2v_linsvc.pkl`` and calls ``evaluate``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = [tweet.split() for tweet in tweet_list]

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    # NOTE(review): this train_test_split takes (ratio, data) and returns
    # the split index first, so it is presumably a project helper rather
    # than scikit-learn's function - confirm.
    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)

    # Fit the imputer on the training vectors only and reuse it for the
    # test vectors, so both sets are filled with the same statistics and
    # keep the same columns.
    imputer = Imputer()
    train_vector = imputer.fit_transform(train_vector)
    test_vector = imputer.transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # Store the model to mmap-able files. The fitted wrapper is persisted:
    # ``model`` itself is only a template that OneVsRestClassifier clones,
    # so dumping it would save an untrained estimator.
    create_directory('model')
    joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)

    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
def roc(features_trunc, labels, categories, classifier):
    """
    Compute and plot the micro-averaged ROC curve for the given classifier.

    features_trunc - features matrix truncated to the k best features
    labels - the classes of the data
    categories - different possible categories (66 for subcategories or 14 for categories)
    classifier - MultinomialNB or lda
    """
    # hold out 10% of the data for testing
    split = train_test_split(features_trunc, labels, test_size=.1, random_state=0)
    features_train, features_test, categoryids_train, categoryids_test = split

    # wrap the given classifier (LDA or Naive Bayes) one-vs-rest, train it
    # and score the test samples
    one_vs_rest = OneVsRestClassifier(classifier)
    fitted = one_vs_rest.fit(features_train, categoryids_train)
    labels_score = fitted.predict_proba(features_test)

    # the ROC computation needs an indicator matrix for the true labels
    test_indicator = label_binarize(categoryids_test, classes=categories)

    # flatten both matrices to get the micro-averaged curve and its area
    fpr, tpr, thresholds = metrics.roc_curve(test_indicator.ravel(), labels_score.ravel())
    roc_auc = metrics.auc(fpr, tpr)

    # draw the curve together with the chance diagonal
    curve_label = 'micro-average ROC curve (area = {0:0.2f})'.format(roc_auc)
    pl.clf()
    pl.plot(fpr, tpr, 'r', label=curve_label, linewidth=2)
    pl.plot([0, 1], [0, 1], 'k--', linewidth=2)
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.05])
    pl.xlabel('false positive rate')
    pl.ylabel('true positive rate')
    pl.title('Receiver operating characteristic for micro-averaged classification scores')
    pl.legend(loc="lower right")
    pl.show()
def model(train_data, train_label, test_data, test_label, n_classes):
    """Fit a one-vs-rest Gaussian naive Bayes model and score the test data.

    Both label arrays are binarized against the classes ``0..n_classes-1``.

    Returns:
        tuple: ``(test_score, test_label)`` - the predicted class
        probabilities for ``test_data`` and the binarized test labels.
    """
    # Binarize the output
    class_ids = list(np.arange(n_classes))
    train_indicator = label_binarize(train_label, classes=class_ids)
    test_indicator = label_binarize(test_label, classes=class_ids)

    # Basic classifier; LogisticRegression(C=1.0), SVC() and
    # KNeighborsClassifier() were tried here as alternatives.
    one_vs_rest = OneVsRestClassifier(GaussianNB())
    one_vs_rest.fit(train_data, train_indicator)

    # predict_proba is used rather than decision_function.
    return one_vs_rest.predict_proba(test_data), test_indicator
def prepare_features(df):
    """Fill missing values and encode the categorical columns of ``df``.

    ``Age`` and ``Fare`` gaps are replaced by the column mean, ``Sex``
    becomes 1 for ``'female'`` and 0 otherwise, and ``Embarked`` is expanded
    into the indicator columns ``Embarked_C``, ``Embarked_Q`` and
    ``Embarked_S``.

    Args:
        df: DataFrame with ``Age``, ``Fare``, ``Sex`` and ``Embarked``
            columns. Its columns are modified in place.

    Returns:
        DataFrame: ``df`` with the three indicator columns appended.
    """
    # Assign the filled column back instead of ``fillna(inplace=True)`` on a
    # column selection, which newer pandas warns about and which does not
    # update ``df`` under copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # 'male' was misspelled 'males'. The positive (last) class is 'female',
    # so the encoded values are unchanged. ravel() turns the (n, 1) result
    # into a plain column.
    df['Sex'] = label_binarize(df['Sex'], classes=['male', 'female']).ravel()

    # A log transform of 'Fare' was tried here and disabled.

    # Build the indicator frame on df's own index so that concat lines the
    # rows up even when df does not have a 0..n-1 index.
    df_embarked = pd.DataFrame(
        label_binarize(df['Embarked'], classes=['C', 'Q', 'S']),
        columns=['Embarked_C', 'Embarked_Q', 'Embarked_S'],
        index=df.index,
    )
    df = pd.concat([df, df_embarked], axis=1)
    return df
def multiclass_AUC(clf, X, Y):
    """Print and return the micro-averaged ROC AUC of ``clf`` used one-vs-rest.

    The data is split 50/50 with a fixed random state; the model is trained
    on the first half and its predictions on the second half are scored.

    Args:
        clf: estimator to wrap in ``OneVsRestClassifier``.
        X: features (converted with ``np.array``).
        Y: class labels (converted with ``np.array``).

    Returns:
        The micro-averaged AUC that is also printed.
    """
    # Binarize the output
    X, Y = np.array(X), np.array(Y)
    # np.unique gives a sorted, reproducible class order; list(set(Y)) had
    # no defined order.
    Y = label_binarize(Y, classes=np.unique(Y))
    n_classes = Y.shape[1]

    # shuffle and split training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5, random_state=0)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(clf)
    Y_score = classifier.fit(X_train, Y_train).predict(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Single-argument call form: same output on Python 2 and valid on Python 3.
    print("AUC for multiclass {}: {}".format(clf.__class__.__name__, roc_auc["micro"]))
    return roc_auc["micro"]
def evaluateOneEpoch(inputCoor, inputGraph, inputLabel, para, sess, trainOperaion):
    """Run the network over every full test batch and average loss/accuracy.

    Args:
        inputCoor, inputGraph, inputLabel: parallel sequences; element ``i``
            of each describes one chunk of test data.
        para: object providing ``outputClassN`` and ``testBatchSize``.
        sess: session whose ``run`` method evaluates the graph.
        trainOperaion: dict of placeholders and tensors
            (``'inputPC'``, ``'inputGraph'``, ``'outputLabel'``,
            ``'weights'``, ``'keep_prob_1'``, ``'keep_prob_2'``,
            ``'predictLabels'``, ``'loss'``, ``'acc'``).

    Returns:
        tuple: ``(mean_loss, mean_accuracy, list_of_batch_predictions)``.
    """
    test_loss = []
    test_acc = []
    test_predict = []
    class_ids = list(range(para.outputClassN))
    test_batch_size = para.testBatchSize

    for i in range(len(inputCoor)):
        xTest, graphTest, labelTest = inputCoor[i], inputGraph[i], inputLabel[i]
        graphTest = graphTest.tocsr()
        labelBinarize = label_binarize(labelTest, classes=class_ids)

        # Floor division: only complete batches are evaluated, and ``range``
        # needs an int on Python 3 (plain ``/`` yields a float there).
        for testBatchID in range(len(labelTest) // test_batch_size):
            start = testBatchID * test_batch_size
            end = start + test_batch_size
            batchCoor, batchGraph, batchLabel = get_mini_batch(xTest, graphTest, labelBinarize, start, end)
            batchWeight = uniform_weight(batchLabel)
            batchGraph = batchGraph.todense()

            # Both keep probabilities are 1.0, i.e. nothing is dropped.
            feed_dict = {trainOperaion['inputPC']: batchCoor,
                         trainOperaion['inputGraph']: batchGraph,
                         trainOperaion['outputLabel']: batchLabel,
                         trainOperaion['weights']: batchWeight,
                         trainOperaion['keep_prob_1']: 1.0,
                         trainOperaion['keep_prob_2']: 1.0}

            predict, loss_test, acc_test = sess.run(
                [trainOperaion['predictLabels'], trainOperaion['loss'], trainOperaion['acc']],
                feed_dict=feed_dict)
            test_loss.append(loss_test)
            test_acc.append(acc_test)
            test_predict.append(predict)

    test_average_loss = np.mean(test_loss)
    test_average_acc = np.mean(test_acc)
    return test_average_loss, test_average_acc, test_predict
def compute_rocauc(self):
    """Compute per-class and micro-averaged ROC data and store it in the report.

    The result is written to ``self.report["roc_auc"]`` as a dict with the
    keys ``fpr``, ``tpr`` and ``roc_auc``; each maps the stringified class
    index (or ``"micro"``) to plain Python lists/numbers.

    :return: None
    """
    def _jsonable(mapping):
        # Convert keys to str and numpy values to built-in containers.
        return {str(key): value.tolist() for key, value in mapping.items()}

    # Binarize the output
    y_true = label_binarize(self.y_test, classes=list(range(self.n_classes)))
    y_score = self.clf.predict_proba(self.X_test)

    fpr, tpr, roc_auc = {}, {}, {}

    # One curve per class column
    for cls in range(self.n_classes):
        fpr[cls], tpr[cls], _ = roc_curve(y_true[:, cls], y_score[:, cls])
        roc_auc[cls] = auc(fpr[cls], tpr[cls])

    # Micro-average: pool all (label, score) pairs
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    self.report["roc_auc"] = dict(
        fpr=_jsonable(fpr),
        tpr=_jsonable(tpr),
        roc_auc=_jsonable(roc_auc),
    )
def trainModel(data):
    """Build and train a three-layer dense softmax classifier.

    Args:
        data: 2-D array whose first column holds the class id (0-38) and
            whose remaining columns are the input features.

    Returns:
        The fitted ``Sequential`` model.
    """
    n_outputs = 39  # number of target classes / softmax units

    model = Sequential()
    model.add(Dense(400, input_dim=(data.shape[1] - 1), init="uniform"))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(500, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(n_outputs, init="uniform"))
    model.add(Activation("softmax"))

    cb = EarlyStopping(monitor="val_loss", patience=3, verbose=0, mode="auto")

    # One-hot targets. ``classes`` is passed by keyword because current
    # scikit-learn releases reject it as a positional argument.
    output = label_binarize(data[0:, 0], classes=list(range(n_outputs)))
    print (output.shape)

    # Earlier experiments used lr=0.1 (beta_2=0.7, validation_split=0.5) and
    # lr=0.01 (beta_2=0.8, validation_split=0.3).
    # ``beta_1`` was misspelled ``beta_l`` (letter l), so the value never
    # reached the optimizer's first-moment decay argument.
    optim = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(loss="categorical_crossentropy", optimizer=optim)
    model.fit(
        data[0:, 1:].astype(np.float64),
        output,
        nb_epoch=30,
        batch_size=16,
        show_accuracy=True,
        validation_split=0.1,
        callbacks=[cb],
    )
    return model
def set_shared_variables(self, dataset, index,enable_time):
    """Load mini-batch ``index`` of ``dataset`` into the shared variables.

    Fills the context indices, question indices, binarized answers and the
    two position-encoding tensors for one batch, then pushes them into
    ``c_shared``, ``q_shared``, ``a_shared``, ``c_pe_shared`` and
    ``q_pe_shared``.

    Args:
        dataset: mapping with the array entries ``'C'``, ``'Q'`` and ``'Y'``.
        index: zero-based batch number.
        enable_time: not used by this method.
    """
    c = np.zeros((self.batch_size, self.max_seqlen), dtype=np.int32)
    q = np.zeros((self.batch_size, ), dtype=np.int32)
    y = np.zeros((self.batch_size, self.num_classes), dtype=np.int32)
    c_pe = np.zeros((self.batch_size, self.max_seqlen, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)
    q_pe = np.zeros((self.batch_size, 1, self.max_sentlen, self.embedding_size), dtype=theano.config.floatX)

    indices = range(index*self.batch_size, (index+1)*self.batch_size)
    for i, row in enumerate(dataset['C'][indices]):
        row = row[:self.max_seqlen]
        c[i, :len(row)] = row

    # list made up of the row numbers of the questions
    q[:len(indices)] = dataset['Q'][indices]

    # The whole loop below builds the per-batch adjustment (position
    # encoding) tensors.
    # NOTE(review): the formula relies on true division - confirm the module
    # has ``from __future__ import division`` if it runs on Python 2.
    dim_ratio = (np.arange(self.embedding_size)+1)/self.embedding_size  # loop-invariant
    for key, mask in [('C', c_pe), ('Q', q_pe)]:
        for i, row in enumerate(dataset[key][indices]):
            # turn each sentence from its line index into its actual word
            # ids, zero-padded to max_sentlen
            sentences = self.S[row].reshape((-1, self.max_sentlen))
            for ii, word_idxs in enumerate(sentences):
                J = np.count_nonzero(word_idxs)
                for j in np.arange(J):
                    mask[i, ii, j, :] = (1 - (j+1)/J) - dim_ratio*(1 - 2*(j+1)/J)

    # Turn y into binarized one-hot vectors, one column per vocabulary
    # entry. ``classes`` is passed by keyword because current scikit-learn
    # releases reject it as a positional argument.
    y[:len(indices), 1:self.num_classes] = label_binarize(dataset['Y'][indices], classes=self.vocab)

    self.c_shared.set_value(c)
    self.q_shared.set_value(q)
    self.a_shared.set_value(y)
    self.c_pe_shared.set_value(c_pe)
    self.q_pe_shared.set_value(q_pe)
def transform(self, X, y=None):
    """Map every element of ``X`` through ``self._replace_label``.

    Args:
        X: sequence of labels.
        y: ignored.

    Returns:
        A column array with ``len(X)`` rows, or its binarized form over
        ``self.labels`` when ``self.binarize`` is true.
    """
    replace_all = np.vectorize(self._replace_label)
    column = replace_all(X).reshape(len(X), 1)
    if not self.binarize:
        return column
    return label_binarize(column, classes=self.labels)
def cross_validate(nb_class, X, y, nb_epoch, task, labels, avg_scores, n_folds=10, evaluate=False, max_words=100, stateful=False, convolutional=True, pretrained=None):
    """Train and score a fresh CNN-LSTM on each stratified fold.

    When ``evaluate`` is ``False`` the held-out fold is predicted, the
    confusion matrix and the per-class precision/recall/F-score/support are
    printed, accumulated into ``avg_scores`` and pickled under
    ``data/scores/``.

    Args:
        nb_class: number of output classes for the network.
        X, y: samples and labels, indexable by fold index arrays.
        nb_epoch: training epochs per fold.
        task: name used in the pickle file names.
        labels: ordered class labels (strings).
        avg_scores: 4 x n_labels accumulator, updated in place.
        n_folds: number of folds.
        evaluate: forwarded to ``models.train_model``.
        max_words, stateful, convolutional, pretrained: forwarded to the
            model construction / padding helpers.
    """
    skf = StratifiedKFold(y, n_folds=n_folds, shuffle=False, random_state=None)  # already shuffled
    for fold, (train_ix, test_ix) in enumerate(skf):
        print ("Cross-validation fold: %d/%d" % (fold + 1, n_folds))
        model = None  # Clearing the NN.
        model = models.construct_cnn_lstm(stateful=stateful, convolutional=convolutional,
                                          nb_class=nb_class, max_words=max_words,
                                          pretrained_embedding=copy.deepcopy(pretrained))
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        models.train_model(model, X_train,
                           preprocessing.label_binarize(y_train, classes=labels),
                           nb_epoch=nb_epoch, evaluate=evaluate, max_words=max_words)

        if evaluate is False:
            X_test = models.pad(X_test, max_words=max_words)
            y_test_pred = model.predict_classes(X_test)
            # A distinct variable name keeps the fold index intact: on
            # Python 2 a list comprehension leaks its loop variable into the
            # enclosing scope.
            y_test_pred = [labels[class_ix] for class_ix in y_test_pred]

            cm = confusion_matrix(y_test, y_test_pred, labels=labels)
            print(", ".join(labels))
            print("confusion matrix:")
            print(cm)

            scores = precision_recall_fscore_support(y_test, y_test_pred, average=None, labels=labels)
            print("precision, recall, fscore and support values for each class:")
            print(", ".join(labels))
            for x, label in enumerate(["precision", "recall", "fscore", "support"]):
                print(label, scores[x])
                for j, k in enumerate(scores[x]):
                    avg_scores[x][j] += k
            print(", ".join(labels))

            # Binary mode plus ``with`` so pickles are written correctly and
            # the files are closed.
            with open("data/scores/%s_cm_cross_%d.pkl" % (task, fold), "wb") as handle:
                cPickle.dump(cm, handle)
            # The second file holds the score arrays; previously the
            # confusion matrix was dumped here a second time.
            with open("data/scores/%s_precision_recall_fscore_support_pos_cross_%d.pkl" % (task, fold), "wb") as handle:
                cPickle.dump(scores, handle)
def WeekDaysBinarization(column):
    """Encode day names as one-hot columns plus the numeric code and a 0/6 flag.

    Each value is looked up in the module-level ``weekDays`` mapping; values
    that are not found get the code 7.

    NOTE(review): assumes ``weekDays`` maps to the codes 0-6 - confirm
    against its definition.

    Args:
        column: iterable of day names.

    Returns:
        Array with one row per input value and 9 columns: seven indicator
        columns for the codes 0-6, the numeric code, and a flag that is 1
        when the code is 0 or 6.
    """
    codes = [weekDays.get(value, 7) for value in column]

    # Binarize against the fixed code set 0..7. Deriving the classes from
    # the data made the columns shift (or collapse to a single column)
    # whenever a day was missing from the input.
    one_hot = label_binarize(codes, classes=list(range(8)))[:, 0:7]

    edge_day_flag = [1 if code == 0 or code == 6 else 0 for code in codes]
    return np.column_stack((one_hot, codes, edge_day_flag))
def fit(self, X, y):
    """Draw a random hidden layer and solve the output weights by pseudo-inverse.

    Sets ``classes_``, the hidden-layer parameters (``weights_`` or
    ``weights_u_``/``weights_v_``, and ``biases_``) and ``beta_``.

    Args:
        X: 2-D feature array.
        y: class labels.

    Returns:
        self
    """
    if self.activation is None:
        # Useful to quantify the impact of the non-linearity
        self._activate = lambda x: x
    else:
        self._activate = self.activations[self.activation]

    rng = check_random_state(self.random_state)

    # One-of-K coding for output values. ``classes`` is passed by keyword
    # because current scikit-learn releases reject it as a positional
    # argument.
    self.classes_ = unique_labels(y)
    Y = label_binarize(y, classes=self.classes_)

    # set hidden layer parameters randomly
    n_features = X.shape[1]
    if self.rank is None:
        if self.density == 1:
            self.weights_ = rng.randn(n_features, self.n_hidden)
        else:
            self.weights_ = sparse_random_matrix(
                self.n_hidden, n_features, density=self.density,
                random_state=rng).T
    else:
        # Low rank weight matrix
        self.weights_u_ = rng.randn(n_features, self.rank)
        self.weights_v_ = rng.randn(self.rank, self.n_hidden)
    self.biases_ = rng.randn(self.n_hidden)

    # map the input data through the hidden layer
    H = self.transform(X)

    # fit the linear model on the hidden layer activation
    self.beta_ = np.dot(pinv2(H), Y)
    return self
def fit(self, X, y):
    '''
    Trains the model.

    Arguments:
        X is a n-by-d numpy array
        y is an n-dimensional numpy array
    '''
    _, d = X.shape

    # One indicator column per distinct value in y.
    distinct_labels = np.unique(y)
    binary_y = label_binarize(y, classes=distinct_labels)

    # Layer sizes: input width, the configured hidden layers, output width.
    hidden_and_input = np.append(d, self.layers)
    self.all_layers_info = np.append(hidden_and_input, len(distinct_labels))
    self.L = len(self.all_layers_info)

    np.random.seed(28)

    # Initialise theta: one matrix per pair of consecutive layers, with an
    # extra column on the input side.
    layer_pairs = zip(self.all_layers_info[:-1], self.all_layers_info[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        self.theta[layer] = np.random.uniform(
            low=-self.epsilon, high=self.epsilon, size=(n_out, n_in + 1))

    # Loop through the epochs.
    for _ in range(self.numEpochs):
        self._forwardPropagation_(X)
        self._backPropagation_(binary_y)
def logfit(self, X, y, C=1e5, tol=1e-1):
    """
    Method : logfit(X, y, C=1e5, tol=1e-1)

    Input : X: array of the shape [n_samples, nrow*ncol], it contains
               features of every sample.
            y: array of the shape [n_samples], it contains targets (keys)
               of every sample.
            C: inverse of regularization strength, must be a positive
               float; the default used here is 1e5.
            tol: tolerance for stopping criteria, must be a positive float
               too.

    Output : the fitted one-vs-rest logistic regression classifier.
    """
    from sklearn import preprocessing
    from sklearn import multiclass as mc
    from sklearn import linear_model as lm

    # Single-argument call form: same output on Python 2 and valid on Python 3.
    print("Start training the Logistic Regression model ...")

    # Binarize the output against the ten classes 0-9
    y = preprocessing.label_binarize(y, classes=list(range(10)))

    classifier = mc.OneVsRestClassifier(lm.LogisticRegression(C=C, tol=tol))
    model = classifier.fit(X, y)

    print("Model training is done!")
    return model
def k_fold_model_select(features, labels, raw_classifiers, n_folds=10, weigh_samples_fn=None):
    """Pick the best classifier by stratified k-fold accuracy and test it.

    30% of the data is held out for testing. On every fold of the remaining
    data each candidate is cloned, fitted and scored; the single fitted
    clone with the highest validation accuracy is kept.

    Args:
        features, labels: full dataset.
        raw_classifiers: iterable of unfitted estimators; the winner must
            also provide ``predict_proba``.
        n_folds: number of stratified folds.
        weigh_samples_fn: optional ``fn(y_true, y_pred)`` returning per-sample
            weights for the accuracy and the classification report.

    Returns:
        tuple: ``(test_score, report, conf_mat, roc_info, best_classifier)``
        where ``roc_info`` is ``(class_name, (fpr, tpr))``.

    Raises:
        ValueError: if no classifier could be fitted (no candidates or no
            folds).
    """
    # split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, stratify=labels, random_state=0)

    # use stratified k-fold cross validation to select the model
    skf = StratifiedKFold(y_train, n_folds=n_folds)
    best_classifier = None
    best_score = float('-inf')
    for train_index, validation_index in skf:
        X_val, y_val = X_train[validation_index], y_train[validation_index]
        for raw_classifier in raw_classifiers:
            classifier = skl_clone(raw_classifier)
            classifier = classifier.fit(X_train[train_index], y_train[train_index])

            # Predict once and reuse the result for weighting and scoring.
            val_pred = classifier.predict(X_val)
            if weigh_samples_fn is not None:
                sample_weight = weigh_samples_fn(y_val, val_pred)
            else:
                sample_weight = None
            score = accuracy_score(y_val, val_pred, sample_weight=sample_weight)

            if score > best_score:
                best_classifier = classifier
                best_score = score

    if best_classifier is None:
        raise ValueError("no classifier was fitted: raw_classifiers or the folds are empty")

    # compute the confusion matrix
    y_pred = best_classifier.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)

    # now compute the score for the test data of the best found classifier
    if weigh_samples_fn is not None:
        sample_weight = weigh_samples_fn(y_test, y_pred)
    else:
        sample_weight = None
    test_score = accuracy_score(y_test, y_pred, sample_weight=sample_weight)

    # obtain the classification report
    report = classification_report(y_test, y_pred, target_names=['cat', 'dog'],
                                   sample_weight=sample_weight)

    # obtain ROC curve from the probability column of class 1
    y_test_bin = label_binarize(y_test, classes=[0, 1])
    y_prob = best_classifier.predict_proba(X_test)
    fpr, tpr, _ = roc_curve(y_test_bin, y_prob[:, 1])
    roc_info = (best_classifier.__class__.__name__, (fpr, tpr))

    return (test_score, report, conf_mat, roc_info, best_classifier)
def action_to_vector(x, n_classes,p=0):
    """One-hot encode action ids, keeping each step with probability ``p``.

    Args:
        x: 2-D array of shape (batch_size, path_length) holding action ids.
        n_classes: length of every one-hot vector.
        p: probability that a step keeps its label; otherwise its vector
            stays all-zero. With the default ``0`` nothing is encoded.

    Returns:
        int32 array of shape (batch_size, path_length, n_classes). The last
        step of every path is always left all-zero, and ids outside
        ``0..n_classes-1`` also yield an all-zero vector.
    """
    batch_size, path_length = x.shape[0], x.shape[1]
    result = np.zeros([batch_size, path_length, n_classes])
    last_step = path_length - 1

    for i in range(batch_size):
        for j in range(path_length):
            # One random draw per cell, taken before the position check so
            # the random stream is consumed exactly as before.
            keep = np.random.rand() < p
            if keep and j != last_step:
                action = int(x[i, j])
                # Direct indexing replaces a per-cell label_binarize call,
                # which returned a single column for n_classes == 2 and so
                # encoded binary actions incorrectly.
                if 0 <= action < n_classes:
                    result[i, j, action] = 1
    return np.int32(result)
def plot_roc_curve(X_test_label,test_predicted,class_names):
    """Plot the ROC curve of hard predictions against the true labels.

    Both label arrays are binarized over ``class_names``. The indicator
    matrices are flattened before scoring, so with more than two classes
    the curve is the micro-average over all classes; with two classes the
    result is the ordinary binary ROC curve.

    Args:
        X_test_label: true labels of the test samples.
        test_predicted: predicted labels for the same samples.
        class_names: ordered list of all class labels.
    """
    X_test_label_binary = label_binarize(X_test_label, classes=class_names)
    test_predicted_binary = label_binarize(test_predicted, classes=class_names)

    # roc_curve only accepts 1-D input; ravel() leaves the two-class case
    # unchanged and makes the multi-class case work instead of raising.
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        X_test_label_binary.ravel(), test_predicted_binary.ravel())
    roc_auc = auc(false_positive_rate, true_positive_rate)

    plt.figure()
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='ROC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def questions_to_y(qs, topic_tags, parents=False):
    """Return the binarized topic-index targets for ``qs``.

    Args:
        qs: questions to encode.
        topic_tags: known topic tags; their count defines the classes when
            ``parents`` is false.
        parents: when true, the number of classes is
            ``len(unique_parents(qs))`` instead.

    Returns:
        The ``label_binarize`` output for the topic indices over the
        classes ``0..n_classes-1``.
    """
    n_classes = len(unique_parents(qs)) if parents else len(topic_tags)
    topic_indices = questions_to_topic_index(qs, topic_tags, parents)
    # ``classes`` is passed by keyword; current scikit-learn releases reject
    # it as a positional argument.
    return label_binarize(topic_indices, classes=list(range(n_classes)))
def roc(y_true, y_score, ax=None):
    """
    Plot ROC curve.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Correct target values (ground truth).
    y_score : array-like, shape = [n_samples] or [n_samples, 2] for binary
              classification or [n_samples, n_classes] for multiclass
        Target scores (estimator predictions).
    ax: matplotlib Axes
        Axes object to draw the plot onto, otherwise uses current Axes

    Notes
    -----
    It is assumed that the y_score parameter columns are in order. For example,
    if ``y_true = [2, 2, 1, 0, 0, 1, 2]``, then the first column in y_score
    must contain the scores for class 0, second column for class 1 and so on.

    Returns
    -------
    ax: matplotlib Axes
        Axes containing the plot

    Examples
    --------
    .. plot:: ../../examples/roc.py

    """
    if ax is None:
        ax = plt.gca()

    # A single vector of scores means binary classification; otherwise the
    # number of columns gives the number of classes.
    y_score_is_vector = is_column_vector(y_score) or is_row_vector(y_score)
    n_classes = 2 if y_score_is_vector else y_score.shape[1]

    # Binary case: plot the vector itself or the second score column.
    if n_classes <= 2:
        positive_scores = y_score if y_score_is_vector else y_score[:, 1]
        _roc(y_true, positive_scores, ax)
        return ax

    # Multiclass case: convert y_true to indicator format, then draw the
    # combined curve followed by one curve per class.
    y_true_bin = label_binarize(y_true, classes=np.unique(y_true))
    _roc_multi(y_true_bin, y_score, ax=ax)
    for class_column in range(n_classes):
        _roc(y_true_bin[:, class_column], y_score[:, class_column], ax=ax)
    return ax
def plotroc(traindata, trainlabel, testdata, testlabel, labels, rocfilename, cmfilename):
    """Fit a random forest, report its quality and plot/save its ROC curves.

    Prints the confusion matrix and classification report, plots the
    confusion matrix via ``plotconfusionmatrix``, then draws one ROC curve
    per class plus the micro-average, saves the figure to ``rocfilename``
    and shows it.

    Args:
        traindata, trainlabel: training features and labels.
        testdata, testlabel: test features and labels (classes 1-12).
        labels: display names of the classes, in column order.
        rocfilename: output path of the ROC figure.
        cmfilename: output path passed to ``plotconfusionmatrix``.
    """
    print('# plot ROC curve')
    print('## train data shape: %s' % (traindata.shape,))
    clf = RandomForestClassifier(10, oob_score=True, n_jobs=-1)
    clf.fit(traindata, trainlabel)

    print('## test data shape: %s' % (testdata.shape,))
    predlabel = clf.predict(testdata)
    predprob = clf.predict_proba(testdata)

    cm = confusion_matrix(testlabel, predlabel)
    print(cm)
    plotconfusionmatrix(cm, labels, cmfilename)
    print(classification_report(testlabel, predlabel, target_names=labels))

    class_ids = list(range(1, 13))
    testlabel = label_binarize(testlabel, classes=class_ids)
    predlabel = label_binarize(predlabel, classes=class_ids)
    nclasses = predlabel.shape[1]

    fpr = {}
    tpr = {}
    rocauc = {}
    for i in range(nclasses):
        fpr[i], tpr[i], _ = roc_curve(testlabel[:, i], predprob[:, i])
        rocauc[i] = auc(fpr[i], tpr[i])
    fpr["micro"], tpr["micro"], _ = roc_curve(testlabel.ravel(), predprob.ravel())
    rocauc["micro"] = auc(fpr["micro"], tpr["micro"])

    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(rocauc["micro"]))
    for i in range(nclasses):
        plt.plot(fpr[i], tpr[i],
                 label='{0} (area = {1:0.2f})'.format(labels[i], rocauc[i]))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")

    # Save before showing: once show() returns, the figure may already be
    # closed, in which case savefig would write an empty image.
    plt.savefig(rocfilename)
    plt.show()
def analyze_pipeline(model, X, y, folds=3) :
    """Cross-validate ``model`` manually and collect scores and error indices.

    Args:
        model: estimator with ``fit``, ``predict`` and ``predict_proba``.
        X: indexable feature array.
        y: iterable of objects exposing a ``score`` attribute.
        folds: number of folds.

    Returns:
        tuple: ``(scores, conf_mat, errors)`` - the per-fold ROC AUC values,
        the confusion matrix averaged over the folds, and a dict with the
        sorted false-positive (``'fp'``) and false-negative (``'fn'``)
        sample indices.
    """
    y = np.array([item.score for item in y])
    print(y.shape)
    y = label_binarize(y, classes=[0, 1, 2, 3, 4])
    print(y.shape)

    # NOTE(review): the first KFold argument is the number of samples to
    # split; the literal 5 restricts the folds to the first five samples -
    # presumably len(y) was intended; confirm.
    cv_skf = KFold(5, n_folds=folds, shuffle=True, random_state=123)
    print(cv_skf)

    scores = []                  # Actual scores
    # NOTE(review): the 2x2 matrix and the 0/1 comparisons below assume a
    # binary target, while y is binarized over five classes above - confirm.
    conf_mat = np.zeros((2, 2))  # Binary classification, confusion matrix
    false_pos = set()            # False positive set
    false_neg = set()            # False negative set

    for train_i, val_i in cv_skf:
        X_train, X_val = X[train_i], X[val_i]
        y_train, y_val = y[train_i], y[val_i]

        print("Fitting fold...")
        model.fit(X_train, y_train)

        print("Predicting fold...")
        y_pprobs = model.predict_proba(X_val)        # Predicted probabilities
        y_plabs = np.squeeze(model.predict(X_val))   # Predicted class labels

        print(y_val)
        scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
        confusion = confusion_matrix(y_val, y_plabs)
        conf_mat += confusion

        # Collect indices of false positives and negatives
        fp_i = np.where((y_plabs == 1) & (y_val == 0))[0]
        fn_i = np.where((y_plabs == 0) & (y_val == 1))[0]
        false_pos.update(val_i[fp_i])
        false_neg.update(val_i[fn_i])

        # The messages are pre-formatted so the printed text matches the
        # former Python 2 print statements.
        print("Fold score:  %s" % (scores[-1],))
        print("Fold CM: \n%s" % (confusion,))

    print("\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))
    conf_mat /= folds
    print("Mean CM: \n%s" % (conf_mat,))
    print("\nMean classification measures: \n")
    pprint(class_report(conf_mat))
    return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}
def classwise_reliability_diagram(probs, labels, class_idx, bins=15):
    """Draw the reliability diagram of one class and annotate its calibration error.

    Args:
        probs: 2-D tensor of predicted scores, one column per class.
        labels: class ids, one per row of ``probs``.
        class_idx: column/class to plot.
        bins: number of equal-width score bins on [0, 1].

    Returns:
        The matplotlib figure.
    """
    assert labels.shape[0] == probs.shape[0], 'Label/probs shape mismatch'
    batch_size, num_classes = probs.shape

    indicator = label_binarize(labels, classes=np.arange(num_classes))
    onehot_labels = torch.from_numpy(indicator).float()

    # Predicted probabilities / one-hot labels for the given class
    class_probs = probs[:, class_idx]
    class_labels = onehot_labels[:, class_idx]

    counts, bin_edges = np.histogram(class_probs, bins=bins, range=[0., 1.])
    indices = np.digitize(class_probs, bin_edges, right=True)

    def _mean_per_bin(values):
        # Mean of ``values`` over the samples falling into each bin 1..bins.
        return np.array([
            torch.mean(values[indices == bin_no]).item()
            for bin_no in range(1, bins + 1)
        ])

    bin_probs = _mean_per_bin(class_probs)
    bin_proportions = _mean_per_bin(class_labels)

    # Count-weighted gap between mean score and label proportion, skipping
    # empty bins.
    weighted_gaps = [
        count * np.abs(mean_score - proportion)
        for count, mean_score, proportion in zip(counts, bin_probs, bin_proportions)
        if count > 0
    ]
    this_class_ece = (1. / batch_size) * np.sum(weighted_gaps)

    # ---- Setting up figure
    plt.rcParams.update({'font.size': 14})
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_xlabel('Class Score')
    ax.set_ylabel('Accuracy')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ticks = np.linspace(0, 1, 6)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')

    # ----- Plotting data: one filled rectangle per bin
    for left, right, height in zip(bin_edges[:-1], bin_edges[1:], bin_proportions):
        ax.fill([left, left, right, right], [0, height, height, 0],
                'b', alpha=0.6, edgecolor='black')

    ax.text(0.01, .875,
            'Class {} CE: {:.3f}'.format(class_idx, this_class_ece),
            size=15)

    # Score distribution on a secondary axis
    score_hist_ax = ax.twinx()
    score_hist_ax.hist(class_probs, density=True, label='Score distr.',
                       color='orange', alpha=0.7, bins=20)
    score_hist_ax.set_ylim(0, 35)
    score_hist_ax.legend(loc='upper left')
    score_hist_ax.set_yticks([])

    return fig
def test(test_loader, featureExtractor, model, epoch, device, cv):
    """Evaluate ``model`` on ``test_loader`` and dump the predictions to CSV.

    Every sample is cut into 16-frame windows taken every 8 frames; each
    window goes through ``featureExtractor`` and ``model``, and the window
    outputs are averaged into one prediction per sample.

    Parameters
    ----------
    test_loader : iterable of dicts with the keys ``'label'`` and
        ``'videoFrames'``.
        NOTE(review): ``label.item()`` implies one sample per batch -- confirm.
    featureExtractor : callable applied to a window of frames.
    model : network called on the extracted features; returns
        ``(output, hidden)``.
    epoch, cv : integers used only in log messages and output file names.
    device : torch device the inputs are moved to.

    Returns
    -------
    list
        ``[accuracy, f1, auc]`` computed with scikit-learn (weighted F1,
        micro-averaged ROC AUC).
    """
    num_classes = 6   # length of the probability vector written per sample
    clip_len = 16     # frames per window
    stride = 8        # offset between consecutive windows

    model.eval()  # switch to evaluation mode
    groundTruth = []
    prediction_max = []
    prediction_prob = []
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for videoFrames in tqdm(test_loader):
            label = videoFrames['label'].to(device)
            videoFrames = torch.squeeze(videoFrames['videoFrames']).to(device)
            length = videoFrames.shape[0]
            if length < clip_len:
                # Pad short videos by repeating the last frame.
                lack = clip_len - length
                repeat_frames = videoFrames[-1:, ...].repeat(lack, 1, 1, 1)
                videoFrames = torch.cat((videoFrames, repeat_frames), 0)
                # Refresh the length, otherwise the padded video yields no
                # window at all and the padding is pointless.
                length = videoFrames.shape[0]

            circle = int(length / stride) - 1
            Outputs = []
            for k in range(circle):
                start = stride * k
                end = clip_len + stride * k
                features = featureExtractor(videoFrames[start:end, ...].float())
                output, _ = model(features.unsqueeze(0))
                output_mean = torch.mean(output, dim=0)  # one window of frames
                Outputs.append(output_mean.data.cpu().numpy().tolist())
            Outputs = torch.Tensor(Outputs)  # all windows of this sample

            # Average over the windows. This also covers the single-window
            # case, which previously left ``outputs_average`` stale/undefined.
            outputs_average = torch.mean(Outputs, dim=0).unsqueeze(0)

            groundTruth.append(label.item())
            _, predicted = torch.max(outputs_average.data, 1)
            prediction_max.append(predicted.item())
            prediction_prob_b = F.softmax(outputs_average.data, dim=1)
            prediction_prob.append(
                prediction_prob_b.cpu().numpy().reshape(num_classes).tolist())
            test_total += label.size(0)
            test_correct += (predicted == label.data.cpu()).sum().item()

    # scikit-learn metrics expect (y_true, y_pred); weighted F1 is not
    # symmetric in its arguments.
    accuracy = accuracy_score(groundTruth, prediction_max)
    f1 = f1_score(groundTruth, prediction_max, average="weighted")
    label = label_binarize(groundTruth, classes=list(range(num_classes)))
    auc = roc_auc_score(label, prediction_prob, average='micro')
    print(
        f"CV {cv}/10, Epoch {epoch}/100, accuracy = {accuracy}, F1-Score = {f1}, AUC = {auc}",
    )
    test_accuracy = 100 * test_correct / test_total
    print(
        'CV = %d, Epoch %d, Accuracy of the network on the Test images: %d' %
        (cv, epoch, test_accuracy))

    # Raw predictions next to the ground truth
    df = pd.DataFrame(data={
        "pnn_prediction": prediction_max,
        "pnn_groundtruth": groundTruth
    })
    df.to_csv(
        f"./Prediction_202106/CV_{cv}_Epoch_{epoch}_ACC_{test_accuracy}_eval_pnn_2.csv"
    )
    # Per-class probabilities, one row per sample
    pro = np.array(prediction_prob)
    df2 = pd.DataFrame(pro)
    df2.to_csv(
        f"./Prediction_202106/CV_{cv}_Epoch_{epoch}_ACC_{test_accuracy}_Categorical_lstm_6pnn_202106_2.csv"
    )
    print(f"save cv {cv}")
    return [accuracy, f1, auc]
clf = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3), n_estimators=10, max_samples=0.5, max_features=0.5) elif j == 3: clf = BaggingClassifier(base_estimator=MLPClassifier(hidden_layer_sizes=(100), activation='relu', solver='adam', batch_size=128, alpha=1e-4, learning_rate_init=1e-3, learning_rate='adaptive', tol=1e-4, max_iter=200), n_estimators=10, max_samples=0.5, max_features=0.5) elif j == 4: clf = BaggingClassifier(base_estimator=LinearSVC(penalty='l2', random_state=0, tol=1e-4), n_estimators=10, max_samples=0.5, max_features=0.5) skf = StratifiedKFold(n_splits=10) skf_accuracy = [] for train, test in skf.split(X, y): clf.fit(X[train], y[train]) if n_classes.size < 3: skf_accuracy.append(roc_auc_score(y[test], clf.predict_proba(X[test])[:, 1] if j != 4 else clf.decision_function(X[test]), average='micro')) else: ytest_one_hot = label_binarize(y[test], n_classes) skf_accuracy.append(roc_auc_score(ytest_one_hot, clf.predict_proba(X[test]) if j != 4 else clf.decision_function(X[test]), average='micro')) accuracy = np.mean(skf_accuracy) of.write(f'{accuracy:.6f}|') print(f'{time.time() - start_time:.3f}s') of.write('\n')
def plot_roc(y_true, y_score, text='', linestyle='-', classes=None,
             detail=False):
    """
    plot roc, support for multi-class
    detail for : http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    Draws the macro-averaged ROC curve on the current matplotlib axes without
    calling ``plt.show()``. With ``detail=True`` the per-class curves are added
    and the figure is shown.

    :param y_true: shape=[n_samples]
    :param y_score: shape=[n_samples, n_classes]
    :param text: prefix of the legend entry of the macro curve
    :param linestyle: matplotlib format string for the macro curve
    :param classes: list; defaults to the sorted unique values of ``y_true``
    :param detail: also plot one curve per class and show the figure
    :return: None
    """
    # An ordered, indexable class list is required: label_binarize derives
    # the column order from it and ``classes[i]`` labels the detail curves.
    if classes is None or len(classes) == 0:
        classes = sorted(set(y_true))
    else:
        classes = list(classes)

    # Binarize the output
    # NOTE(review): with exactly two classes label_binarize yields a single
    # column, so only ``y_score[:, 0]`` is compared -- confirm this is wanted.
    y_true = label_binarize(y_true, classes=classes)
    n_classes = y_true.shape[1]

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Macro average: collect every false positive rate, interpolate each
    # class curve on that common grid, then average the curves.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    plt.plot(fpr["macro"], tpr["macro"], linestyle,
             label=text + ' (area = {0:0.8f})'
                          ''.format(roc_auc["macro"]),
             linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    # No plt.show() here so that several calls can share one figure.

    if detail:
        for i in range(n_classes):
            plt.plot(fpr[i], tpr[i],
                     label='ROC curve of class {0} (area = {1:0.8f})'
                           ''.format(classes[i], roc_auc[i]))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(
            'Some extension of Receiver operating characteristic to multi-class'
        )
        plt.legend(loc="lower right")
        plt.show()
def plot_model(X_train_scaled, y_train, X_test_scaled, y_test, clf):
    """Print test metrics for ``clf`` and plot its confusion matrix and ROC curves.

    Parameters
    ----------
    X_train_scaled, y_train : training data; kept for interface
        compatibility, not used for the report.
    X_test_scaled, y_test : test features and labels; labels are binarized
        against the classes 1, 2, 10 and 15.
    clf : fitted classifier providing ``predict`` and ``predict_proba``.
        NOTE(review): column ``i`` of ``predict_proba`` is assumed to belong
        to the i-th entry of the class list -- confirm against ``clf.classes_``.

    Returns
    -------
    None. The report is printed and the plotly figure is rendered via ``iplot``.
    """
    unique_classes = [1, 2, 10, 15]
    axis_labels = [
        "No Disease", 'Disease Class 1', 'Disease Class 2', 'Disease Class 3'
    ]

    y_predicted = clf.predict(X_test_scaled)
    probabilities = clf.predict_proba(X_test_scaled)

    # Binarize the output
    y_test_binarized = label_binarize(y_test, classes=unique_classes)
    n_classes = y_test_binarized.shape[1]

    print(clf)
    print("\n Classification report : \n",
          classification_report(y_test, y_predicted))
    print("Test Accuracy Score : {:.4f}".format(
        accuracy_score(y_test, y_predicted)))

    # Confusion matrix; labels are pinned so that rows and columns always
    # line up with the four heatmap axis names, even if a class is missing
    # from the test split.
    conf_matrix = confusion_matrix(y_test, y_predicted, labels=unique_classes)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    thresholds = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test_binarized[:, i],
                                                  probabilities[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Multi-class (macro) AUC: interpolate every class curve on the union of
    # all false positive rates and average the result.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    print("Multi-Class Area Under the Curve: {:.4f}".format(roc_auc['macro']))
    print("\n")
    print('Classwise Area Under the Curves')
    for i in range(len(unique_classes)):
        if i == 0:
            auc_name = 'Normal- '
        else:
            auc_name = 'Disease Type ' + str(i) + '- '
        temp_auc = round(roc_auc[i], 2)
        print(auc_name + str(temp_auc))
    print('\n')

    # plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=axis_labels,
                        y=list(axis_labels),
                        showscale=False,
                        colorscale="Picnic",
                        name="matrix")

    # subplots: confusion matrix on the first row, ROC curves on a 2x2 grid
    fig = tls.make_subplots(rows=3,
                            cols=2,
                            specs=[[{}, None], [{}, {}], [{}, {}]],
                            subplot_titles=('Confusion Matrix', 'ROC 1',
                                            'ROC 2', 'ROC 3', 'ROC 4'))
    fig.append_trace(trace1, 1, 1)

    for i in range(n_classes):
        trace2_temp = go.Scatter(x=fpr[i],
                                 y=tpr[i],
                                 name="Roc : " + str(roc_auc[i]),
                                 mode='lines+text',
                                 text=['AUC: ' + str(round(roc_auc[i], 2))],
                                 textposition='top right',
                                 textfont=dict(family="sans serif",
                                               size=18,
                                               color="DarkSeaGreen"),
                                 line=dict(color=('rgb(22, 96, 167)'),
                                           width=2))
        trace3_temp = go.Scatter(x=[0, 1],
                                 y=[0, 1],
                                 line=dict(color=('rgb(205, 12, 24)'),
                                           width=2,
                                           dash='dot'))
        # Classes 0..3 fill rows 2-3 of the grid, left to right.
        grid_row, grid_col = divmod(i, 2)
        fig.append_trace(trace2_temp, 2 + grid_row, 1 + grid_col)
        fig.append_trace(trace3_temp, 2 + grid_row, 1 + grid_col)

    fig['layout'].update(showlegend=False,
                         title="Model performance",
                         autosize=False,
                         height=900,
                         width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    # Axes 2..5 belong to the four ROC subplots.
    for i in [2, 3, 4, 5]:
        fig["layout"]["xaxis" + str(i)].update(
            dict(title="false positive rate"))
        fig["layout"]["yaxis" + str(i)].update(
            dict(title="true positive rate"))
    iplot(fig)
# In[28]: from sklearn.datasets import load_digits from sklearn.metrics import roc_curve, auc import numpy as np from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import train_test_split from sklearn import svm from sklearn.grid_search import GridSearchCV digits = load_digits() x = digits.data y = digits.target y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) n_classes = y.shape[1] for i in range(0, 5): print() print("round:", i) X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=0) classifier = svm.SVC(probability=True) from sklearn.grid_search import GridSearchCV parameters = { 'kernel': ('rbf', 'linear', 'poly', 'sigmoid'), 'C': [1, 10, 100, 1000], 'degree': np.arange(2, 11), 'gamma': np.arange(1e-4, 1e-2),
from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier from scipy import interp from sklearn.metrics import roc_auc_score # import some data to play with iris = datasets.load_iris() X = iris.data y = iris.target # binarize the output y = label_binarize(y, classes=[0, 1, 2]) n_classes = y.shape[1] # Add noisy features to make the problem harder random_state = np.random.RandomState(0) n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # shuffle and split training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) # learn to predict each class against the other classifier = OneVsRestClassifier(
from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

# Load the digits dataset and show the raw integer targets.
digits = load_digits()
x, y = digits.data, digits.target
print(y)
# Turn the targets into a 10-column indicator matrix (classes 0..9).
y = label_binarize(y, classes=list(range(10)))
print('------------------------------')
print(y)
# No random_state is passed, so the split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y)
# One linear SVM per indicator column.
model = OneVsRestClassifier(svm.SVC(kernel='linear'))
clf = model.fit(x_train, y_train)
# The score is computed on the training split; x_test / y_test are unused here.
print(clf.score(x_train, y_train))
y_true = []
y_pred = []
#----------- create y_true -------------
# One label per file: files below allow_path/<name> get the position of
# <name> in anchor_name.
for allow in os.listdir(allow_path):  # allow
    for i in os.listdir(allow_path + '/' + allow):
        y_true.append(anchor_name.index(allow))
# Every file below reject_path/<name> gets the fixed label 5.
# NOTE(review): 5 is presumably the "reject" class id -- confirm it matches
# nb_classes and anchor_name.
for reject in os.listdir(reject_path):  # reject
    for i in os.listdir(reject_path + '/' + reject):
        y_true.append(5)
y_true = np.array(y_true)
# Indicator matrix with one column per class id 0..nb_classes-1.
Y_true = label_binarize(y_true, classes=[i for i in range(nb_classes)])
print('k=', y_true.shape)
#----------- create y_pred -------------
ori_img_array = []
origin_model = load_model(
    './model_with5/SiameseResnet_mc2_model/SiameseResnet_mc2_stable_.h5')
# Build the feature-capture model and copy the weights of the loaded model
# into it.
fix_model = create_CaptureFeature_model((1, 32, 32))
fix_model.set_weights(origin_model.get_weights())  #.layers[3]
relation_model = load_model('./model_with5/test_by_test.h5')
#--- fix_feature ---
def transformer_binarize(y_true):
    """Binarize ``y_true`` against the module-level ``classes``.

    :param y_true: labels accepted by ``label_binarize``
    :return: whatever ``label_binarize`` returns for these labels
    """
    binarized = label_binarize(y_true, classes=classes)
    return binarized
k_scores2, label='gini', color='cornflowerblue', linestyle=':', linewidth=4) # plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlabel('number of estimators') plt.ylabel('Cross-Validated F1') plt.title('Random Forest') plt.legend(loc="lower right") plt.show() X = datasets.load_iris().data y = datasets.load_iris().target y = label_binarize(datasets.load_iris().target, classes=[0, 1, 2]) n_classes = y.shape[1] random_state = np.random.RandomState(0) n_samples, n_features = X.shape # shuffle and split training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) maxtrix = np.array([[635, 35], [17, 324]]) # Learn to predict each class against the other classifier = OneVsRestClassifier( svm.SVC(kernel='linear', probability=True, random_state=random_state)) y_score = classifier.fit(X_train, y_train).decision_function(X_test)
# Features are the columns named '0' .. '4095'; the target is the 'label'
# column cast to int32.
X = train[[str(i) for i in range(4096)]].values
Y = np.array(train['label'].values, dtype=np.int32)
test = pd.read_csv('ts2d3dnew.csv')
#test = pd.read_csv('tsIITD.csv')
X_test = test[[str(i) for i in range(4096)]].values
Y_test = np.array(test['label'].values, dtype=np.int32)
print(Y_test)
# Alternative setting with 230 classes, kept for reference:
#Y_test = label_binarize(Y_test, classes=[i for i in range(230)])
#Y = label_binarize(Y, classes=[i for i in range(230)])
#n_classes = 230
# One indicator column per class id 0..176.
Y_test = label_binarize(Y_test, classes=[i for i in range(177)])
Y = label_binarize(Y, classes=[i for i in range(177)])
n_classes = 177
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
# One polynomial-kernel SVM per class; the decision function values on the
# test set are kept as scores.
classifier = OneVsRestClassifier(
    svm.SVC(kernel='poly', probability=True, random_state=random_state))
y_score = classifier.fit(X, Y).decision_function(X_test)
# Containers for the ROC results, filled later in the script.
fpr = dict()
tpr = dict()
roc_auc = dict()
def run_prob_based_train_test_kfold_roc_curve_plot(classifier,
                                                   x,
                                                   y,
                                                   is_plot_enabled=True,
                                                   discard_low_pred=False):
    """Run a stratified 10-fold evaluation and optionally plot the ROC curves.

    For every fold the classifier is refitted, the probability of class 1 is
    taken as score, evaluation statistics are printed through the module's
    helper functions and the fold ROC curve is recorded. Afterwards the mean
    ROC curve, its AUC and the average weighted F1 score are logged.

    Parameters
    ----------
    classifier : estimator providing ``fit`` and ``predict_proba``.
    x : samples (anything ``np.array`` accepts).
    y : binary labels taking the values 0 and 1.
    is_plot_enabled : draw the per-fold and mean ROC curves and show them.
    discard_low_pred : drop test predictions through
        ``discard_low_pred_prob_prediction_couple`` with the bounds 0.2 and
        0.8 before they are evaluated.

    Returns
    -------
    None. Any exception raised during the evaluation is logged, not re-raised.
    """
    min_discard_prob = 0.2
    max_discard_prob = 0.8
    n_splits = 10

    # Binarize once; a second pass over the already binarized column returns
    # the same data.
    y = label_binarize(y, classes=[0, 1])
    x, y = shuffle(x, y)
    cv = StratifiedKFold(n_splits=n_splits)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    logger.info("###" + str(n_splits) + "-fold started ###")
    cum_f1_score = 0
    try:
        # Convert once instead of on every fold.
        x = np.array(x)
        y = np.array(y)
        for i, (train, test) in enumerate(cv.split(x, y)):
            logger.info("## fold: " + str(i + 1) + "started")
            X_train = x[train]
            y_train = y[train]
            X_test = x[test]
            y_test = y[test]

            classifier.fit(X_train, y_train)
            probas_ = classifier.predict_proba(X_test)
            # Score of the positive class
            y_pred = probas_[:, 1]
            if discard_low_pred:
                y_test, y_pred = discard_low_pred_prob_prediction_couple(
                    y_test, y_pred, min_discard_prob, max_discard_prob)
            print_false_predicted_entries(X_test, y_pred, y_test, True)
            cum_f1_score += print_evaluation_stats(y_test, y_pred, True)

            # ROC curve of this fold, resampled on the common FPR grid
            fpr, tpr, thresholds = roc_curve(y_test, y_pred)
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            if is_plot_enabled:
                plt.plot(fpr,
                         tpr,
                         lw=1,
                         alpha=0.3,
                         label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
            # Report the same fold number that was announced at the start
            # (it used to be one too high).
            logger.info("## fold: " + str(i + 1) + "completed")

        logger.info("Average weighted F1-score: " +
                    str(cum_f1_score / n_splits))
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        logger.info("Mean AUC: " + str(mean_auc))

        if is_plot_enabled:
            plt.plot([0, 1], [0, 1],
                     linestyle='--',
                     lw=2,
                     color='r',
                     label='Luck',
                     alpha=.8)
            plt.plot(mean_fpr,
                     mean_tpr,
                     color='b',
                     label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
                     (mean_auc, std_auc),
                     lw=2,
                     alpha=.8)
            # Shade one standard deviation around the mean curve.
            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr,
                             tprs_lower,
                             tprs_upper,
                             color='grey',
                             alpha=.2,
                             label=r'$\pm$ 1 std. dev.')
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Stratified k-fold with k=' + str(n_splits))
            plt.legend(loc="lower right")
            plt.show()
    except Exception as e:
        # Best effort: a failed evaluation is reported but must not abort
        # the caller.
        logger.error(e)
from kernels import chi_square_kernel, histogram_intersection_kernel, zero_kernel, multichannel_wrapper if __name__ == '__main__': bowFilename = 'lab4/bow.pkl' if not os.path.exists(bowFilename): raise IOError("No such file '%s'." % bowFilename) x, y, tag = cPickle.load(open(bowFilename, 'rb')) x = np.array(x, dtype=float) y = np.array(y, dtype=int) - 1 y = label_binarize(y, classes=range(7)) n_classes = 7 X_train, X_test, y_train, y_test = train_test_split(x, y, tag) print("Training SVM") TIMES = 10 l = [] for i in range(TIMES): print '\rFitting %d/%d ' % (i, TIMES), sys.stdout.flush() # resampling classifier = OneVsRestClassifier( svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel), probability=True)) X_train, X_test, y_train, y_test = train_test_split(x, y, tag)
print(i)
# Keep only the column named by i together with the label column.
data = data_r[[i, 'label']]
# Boolean masks selecting the rows of each group.
MU = data['label'] == "group1"
WT = data['label'] == "group2"
#ADDITIONAL CODE
X_MU = data[MU].drop('label', axis=1)
X_WT = data[WT].drop('label', axis=1)
group1SampleSize = len(X_MU)
group2SampleSize = len(X_WT)
y_MU = data[MU].label
y_WT = data[WT].label
# Binarize each group's labels against the two group names.
y_MU = label_binarize(y_MU, classes=["group1", "group2"])
y_WT = label_binarize(y_WT, classes=["group1", "group2"])
#ADD NOISY FEATURES TO MAKE THE PROBLEM HARDER
# As many Gaussian noise columns as there are feature columns are appended.
random_state = np.random.RandomState(0)
n_samples, n_features = X_MU.shape
X_MU = np.c_[X_MU, random_state.randn(n_samples, 1 * n_features)]
n_samples, n_features = X_WT.shape
X_WT = np.c_[X_WT, random_state.randn(n_samples, 1 * n_features)]
#SMOTE PARAMETERIZATION
# Each group is split 70/30 on its own; no random_state is passed, so the
# split is not reproducible.
X_MU_train, X_MU_test, y_MU_train, y_MU_test = train_test_split(
    X_MU, y_MU, test_size=0.3)
X_WT_train, X_WT_test, y_WT_train, y_WT_test = train_test_split(
    X_WT, y_WT, test_size=0.3)
# Test-set score of the previously fitted model `algo`.
r1 = algo.score(x_test, y_test)
# Look up the 3 nearest training neighbours of every training sample.
neighbors = nbs.NearestNeighbors(3)
neighbors.fit(x_train, y_train)
myneighbors = neighbors.kneighbors(x_train, 3, return_distance=True)
print('*' * 100)
print(myneighbors)
#Logistic Regression
logistic = LogisticRegression(penalty='l2', fit_intercept=True, max_iter=100)
logistic.fit(x_train, y_train)
r2 = logistic.score(x_test, y_test)
print('Logistics训练结果:%f' % r2)
predit2 = logistic.predict(x_test)
# Indicator matrix of the test labels for the classes 1, 2 and 3.
y_label = label_binarize(y_test, classes=[1, 2, 3])
print(y_label)
# ROC over the flattened indicator matrix and the flattened class
# probabilities of `algo`.
fpr, tpr, _ = roc_curve(y_label.ravel(), algo.predict_proba(x_test).ravel())
aucValue = auc(fpr, tpr)
print(aucValue)
# Same computation for the logistic regression model.
fpr_log, tpr_log, _ = roc_curve(y_label.ravel(),
                                logistic.predict_proba(x_test).ravel())
auc_log = auc(fpr_log, tpr_log)
x_test_len = np.arange(len(x_test))
# plt.plot(x_test_len,y_test,'ro',markersize=7,label='true value')
# plt.plot(x_test_len,predit,'bo',markersize=5,label='KNN prediction')
# plt.plot(x_test_len,predit2,'ko',markersize=3,label='Logistics prediction')
# plt.title('Iris classification prediction, accuracy: KNN=%f Logis=%f'% (r1,r2))
# plt.legend(loc='lower right')
def validate(self, verbose=True, roc=False):
    """Evaluate the network on the validation set.

    The validation loader is created on first use and cached on the
    instance. Loss, accuracy and per-class precision/recall/F1 are
    accumulated over all batches; optionally one ROC curve per class is
    plotted.

    :param verbose: print the loss, accuracy and per-class measures
    :param roc: when equal to 1/True, plot and show the per-class ROC curves
    :return: accuracy in percent
    """
    self.network.eval()

    if self._test_loader is None:
        with torch.no_grad():
            self._test_loader = self._patch_loader(
                self.args.dataset_path + VALIDATION_PATH, False)

    val_loss = 0
    correct = 0
    classes = len(LABELS)

    tp = [0] * classes
    tpfp = [0] * classes
    tpfn = [0] * classes
    precision = [0] * classes
    recall = [0] * classes
    f1 = [0] * classes

    if verbose:
        print('\nEvaluating....')

    labels_true = []
    # One score column per class (derived from LABELS instead of a fixed 4).
    labels_pred = np.empty((0, classes))

    for images, labels in self._test_loader:
        if self.args.cuda:
            images, labels = images.cuda(), labels.cuda()

        with torch.no_grad():
            output = self.network(Variable(images))

        val_loss += F.nll_loss(output, Variable(labels),
                               size_average=False).data.item()
        _, predicted = torch.max(output.data, 1)
        correct += torch.sum(predicted == labels)

        # Move to host memory first: NumPy cannot read CUDA tensors.
        labels_true = np.append(labels_true, labels.cpu().numpy())
        # exp() is applied to the network output before it is stored as score.
        labels_pred = np.append(labels_pred,
                                torch.exp(output.data).cpu().numpy(),
                                axis=0)

        for label in range(classes):
            t_labels = labels == label
            p_labels = predicted == label
            # p*2-1 maps {0, 1} to {-1, 1}; it equals t (0/1) only where
            # both t and p are 1, i.e. for true positives.
            tp[label] += torch.sum(t_labels == (p_labels * 2 - 1))
            tpfp[label] += torch.sum(p_labels)
            tpfn[label] += torch.sum(t_labels)

    for label in range(classes):
        # The small epsilon avoids a division by zero for absent classes.
        precision[label] += (tp[label] / (tpfp[label] + 1e-8))
        recall[label] += (tp[label] / (tpfn[label] + 1e-8))
        f1[label] = 2 * precision[label] * recall[label] / (
            precision[label] + recall[label] + 1e-8)

    val_loss /= len(self._test_loader.dataset)
    acc = 100. * correct / len(self._test_loader.dataset)

    if roc == 1:
        labels_true = label_binarize(labels_true, classes=range(classes))
        for lbl in range(classes):
            fpr, tpr, _ = roc_curve(labels_true[:, lbl], labels_pred[:, lbl])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     lw=2,
                     label='{} (AUC: {:.1f})'.format(LABELS[lbl],
                                                     roc_auc * 100))
        plt.xlim([0, 1])
        plt.ylim([0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.title('Receiver Operating Characteristic')
        plt.show()

    if verbose:
        print('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
            val_loss, correct, len(self._test_loader.dataset), acc))
        for label in range(classes):
            print(
                '{}: \t Precision: {:.2f}, Recall: {:.2f}, F1: {:.2f}'.
                format(LABELS[label], precision[label], recall[label],
                       f1[label]))
        print('')

    return acc
def permission_roc():
    """Train a one-vs-rest multinomial naive Bayes model and plot per-class ROC curves.

    The data comes from ``db_tool.get_new_train_data()``; 30% of it is held
    out for testing, the 15 best features according to chi2 are selected on
    the training part and the targets are binarized against the four
    application categories.

    :return: None; the figure is shown with ``plt.show()``
    """
    class_names = ['music-audio', 'personalization', 'social', 'communication']
    n_classes = len(class_names)

    # get data
    train_data, _ = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['permission-data'],
        train_data['target'],
        test_size=0.3,
        random_state=1)

    # Fit the feature selection on the training part only.
    selector = SelectKBest(chi2, k=15)
    x_train = selector.fit_transform(x_train, y_train)
    x_test = selector.transform(x_test)

    y_train = label_binarize(y_train, classes=class_names)
    y_test = label_binarize(y_test, classes=class_names)

    model = OneVsRestClassifier(MultinomialNB()).fit(x_train, y_train)

    # validate the model
    y_score = model.predict_proba(x_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro average over all (sample, class) pairs; computed for
    # completeness, only the per-class curves are drawn below.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class'
    )
    plt.legend(loc="lower right")
    plt.show()
def average_precision(prob_np, target_np):
    """Return the average precision of every class.

    :param prob_np: 2-D array of scores, one column per class
    :param target_np: integer class labels in ``0 .. prob_np.shape[1] - 1``
    :return: result of ``average_precision_score`` with ``average=None``,
        i.e. one value per class rather than an aggregate
    """
    num_class = prob_np.shape[1]
    label = label_binarize(target_np, classes=list(range(num_class)))
    # Classes without positive samples trigger 0/0 inside the metric; the
    # resulting warnings are silenced on purpose.
    with np.errstate(divide='ignore', invalid='ignore'):
        # ``average`` must be passed by keyword: it is keyword-only in
        # current scikit-learn releases.
        return average_precision_score(label, prob_np, average=None)
def main():
    """Evaluate the stored ARXIV naive Bayes model on the test split.

    Loads the data and the model from pickle files, plots one ROC curve per
    label, stores the predictions and the micro-averaged ROC data as pickle
    files and prints accuracy, precision, recall and F1 (macro and micro).
    """
    # NOTE: pickle executes arbitrary code while loading; only use these
    # files if they come from a trusted source.
    with open("../../../Data/XY_ARXIV.p", "rb") as handle:
        mainData = pickle.load(handle)
    X_test = mainData[2]
    Y_test = mainData[3]
    del mainData

    with open("../../../Data/nbARXIVModel.p", "rb") as handle:
        nb = pickle.load(handle)

    # Make prediction
    print("MAKING PREDICTIONS")
    Y_pred = nb.predict(X_test)
    y_score = nb.predict_proba(X_test)

    # Compute ROC curve and ROC area for each class
    # NOTE(review): column i of y_score is compared against pos_label=i,
    # which presumes the labels are the integers 0..len(LABELS)-1 -- confirm.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(LABELS)):
        fpr[i], tpr[i], _ = metrics.roc_curve(Y_test,
                                              y_score[:, i],
                                              pos_label=i)
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'ARXIV: Naive Bayes Model Receiver operating characteristic curve')

    # Plot of a ROC curve for a specific class
    label_names = list(LABELS.keys())
    for i in range(len(LABELS)):
        plt.plot(fpr[i],
                 tpr[i],
                 label='ROC curve for label ' + str(i + 1) + " " +
                 label_names[i] + ' (area = %0.2f)' % roc_auc[i])
    plt.legend(loc="lower right")
    plt.show()

    with open("../../../Data/nbARXIVPredicted.p", "wb") as handle:
        pickle.dump(Y_pred, handle)

    with open("../../../Data/ROC_Curves/NB ARXIV.p", "wb") as handle:
        class_ids = list(LABELS.values())
        truth_bin = label_binarize(Y_test, classes=class_ids)
        curve = metrics.roc_curve(truth_bin.ravel(), y_score.ravel())
        # The AUC is computed from the hard predictions, not the scores.
        auc = metrics.roc_auc_score(truth_bin,
                                    label_binarize(Y_pred, classes=class_ids),
                                    average="micro")
        pickle.dump((curve, auc), handle)

    # Calculate accuracy, precision, and recall
    print("PRINTING STATISTICS")
    acc = accuracy_score(y_true=Y_test, y_pred=Y_pred)
    print("accuracy = " + str(acc))

    print("Macro Averging")
    prec = precision_score(y_true=Y_test, y_pred=Y_pred, average="macro")
    recall = recall_score(y_true=Y_test, y_pred=Y_pred, average="macro")
    print("F1 score = " +
          str(metrics.f1_score(Y_test, Y_pred, average="macro")))
    print("precision = " + str(prec))
    print("recall = " + str(recall))

    print("Micro Averging")
    prec = precision_score(y_true=Y_test, y_pred=Y_pred, average="micro")
    recall = recall_score(y_true=Y_test, y_pred=Y_pred, average="micro")
    print("F1 score = " +
          str(metrics.f1_score(Y_test, Y_pred, average="micro")))
    print("precision = " + str(prec))
    print("recall = " + str(recall))
def main(args):
    """Train or apply an SVM classifier on top of face embeddings.

    The embeddings of all images below ``args.data_dir`` are computed with
    the model loaded from ``args.model``. In mode ``'TRAIN'`` a linear SVM is
    fitted on them and pickled, together with the class names, to
    ``args.classifier_filename``. In mode ``'CLASSIFY'`` that pickle is
    loaded, every image is classified, the accuracy is printed and a
    micro-averaged precision-recall curve is drawn.

    :param args: namespace providing ``seed``, ``use_split_dataset``,
        ``data_dir``, ``min_nrof_images_per_class``,
        ``nrof_train_images_per_class``, ``mode``, ``model``, ``batch_size``,
        ``image_size`` and ``classifier_filename``
    :raises ValueError: if a class of the dataset contains no image
    """
    with tf.Graph().as_default():
        with tf.Session() as sess:
            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(
                    dataset_tmp, args.min_nrof_images_per_class,
                    args.nrof_train_images_per_class)
                if (args.mode == 'TRAIN'):
                    dataset = train_set
                elif (args.mode == 'CLASSIFY'):
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there are at least one training image per class.
            # (The former ``assert (cond, msg)`` tested a non-empty tuple and
            # therefore could never fail.)
            for cls in dataset:
                if len(cls.image_paths) == 0:
                    raise ValueError(
                        'There must be at least one image for each class in the dataset'
                    )

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            graph = tf.get_default_graph()
            images_placeholder = graph.get_tensor_by_name("input:0")
            embeddings = graph.get_tensor_by_name("embeddings:0")
            phase_train_placeholder = graph.get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(
                math.ceil(1.0 * nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i * args.batch_size
                end_index = min((i + 1) * args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           args.image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(
                args.classifier_filename)

            if (args.mode == 'TRAIN'):
                # Train classifier
                print('Training classifier')
                model = SVC(kernel='linear', probability=True)
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' %
                      classifier_filename_exp)

            elif (args.mode == 'CLASSIFY'):
                # Classify images
                print('Testing classifier')
                # NOTE: pickle executes arbitrary code while loading; only
                # load classifier files from a trusted source.
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' %
                      classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d  %s: %.3f' %
                          (i, class_names[best_class_indices[i]],
                           best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)

                # Micro-averaged precision/recall on the hard predictions.
                # NOTE(review): the class range 1..20 is hard-coded while
                # ``best_class_indices`` comes from argmax and starts at 0,
                # so index 0 maps to an all-zero row -- confirm the intended
                # class ids.
                labels = label_binarize(np.array(labels), classes=range(1, 21))
                best_class_indices = label_binarize(
                    np.array(best_class_indices), classes=range(1, 21))
                precision, recall, _ = precision_recall_curve(
                    labels.ravel(), best_class_indices.ravel())
                average_precision = average_precision_score(
                    labels, best_class_indices, average="micro")
                print(
                    'Average precision score, micro-averaged over all classes: {0:0.2f}'
                    .format(average_precision))

                plt.figure()
                plt.step(recall, precision, color='b', alpha=0.2, where='post')
                plt.fill_between(recall,
                                 precision,
                                 step='post',
                                 alpha=0.2,
                                 color='b')
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.ylim([0.0, 1.05])
                plt.xlim([0.0, 1.0])
                plt.title(
                    'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
                    .format(average_precision))
import matplotlib.pyplot as plt from itertools import cycle from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier #from sklearn.multiclass import OneVsOneClassifier from scipy import interp import os x_train = np.loadtxt('D:/SJTU Lessons/X_train.txt', delimiter=' ') x_test = np.loadtxt('D:/SJTU Lessons/X_test.txt', delimiter=' ') # 将标签二值化 Y_train = np.loadtxt('D:/SJTU Lessons/Y_train.txt', delimiter=' ') Y_train = label_binarize(Y_train, classes=[0, 1, 2, 3]) Y_test = np.loadtxt('D:/SJTU Lessons/Y_test.txt', delimiter=' ') Y_test = label_binarize(Y_test, classes=[0, 1, 2, 3]) # 设置种类 n_classes = 4 # 训练模型并预测 #seed random_state = np.random.RandomState(0) #dim n_samples = 3532 n_features = 641 # Learn to predict each class against the other #'linear’, ‘poly’, ‘rbf' classifier = OneVsRestClassifier(
def plot_precision_recall(y_truth, y_score, labels=None, pos_label=None):
    """
    Plot precision recall curve

    With at most two distinct values in ``y_truth`` a single filled step
    curve is drawn. Otherwise one curve per class plus a dashed
    micro-averaged curve is drawn and a legend is added.

    Parameters
    -------------------------------------
    y_truth: array
        True labels for the belonging class. If labels are not
        {0, 1, ..., N}, then pos_label should be explicitly given.
    y_score: array
        Estimated probabilities or decision function. In the multi-class
        case it is indexed as ``y_score[:, k]``; column ``k`` is expected to
        belong to the k-th smallest distinct value of ``y_truth``.
    labels: sequence of str, optional
        Legend names for the per-class curves (multi-class only). When
        missing, or when its length differs from the number of classes,
        ``class0``, ``class1``, ... are used instead.
    pos_label : int or str
        The label of the positive class. When pos_label=None,
        if y_true is in {0, 1, ..., N}, pos_label is set to 1,
        otherwise an error will be raised. In the multi-class case it is
        forwarded to curves computed on 0/1 indicator columns, so it should
        normally be left as None there.

    Returns
    -------------------------------------
    out: matplotlib.figure.Figure
        Plot containing the precision recall curves
    """
    # Sorted distinct class values that actually occur in the ground truth.
    classes = np.unique(y_truth)
    n_classes = len(classes)

    res = plt.figure()
    if n_classes <= 2:
        precision, recall, _ = precision_recall_curve(
            y_truth, y_score, pos_label=pos_label)
        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
        # Score the same positive class the curve was drawn for; without an
        # explicit pos_label the scorer's own default is kept.
        ap_kwargs = {} if pos_label is None else {'pos_label': pos_label}
        average_precision = average_precision_score(
            y_truth, y_score, **ap_kwargs)
        plt.title(
            f'2-class Precision-Recall curve: AP={average_precision:0.2f}')
    else:
        # `is None` / len() instead of truthiness so ndarray labels work too.
        if labels is None or len(labels) != n_classes:
            labels = [f'class{i_class}' for i_class in range(n_classes)]

        # pyplot.get_cmap is the accessor that survived the removal of
        # matplotlib.cm.get_cmap.
        cmap = plt.get_cmap('tab10')
        # convert multi-class labels to multi-labels to obtain a curve for
        # each class; binarising against the observed class values keeps
        # the columns aligned even when labels are not exactly 0..N-1
        y_truth_multi = label_binarize(y_truth, classes=classes)
        for clas, lab in enumerate(labels):
            class_precision, class_recall, _ = precision_recall_curve(
                y_truth_multi[:, clas], y_score[:, clas], pos_label=pos_label)
            plt.step(class_recall, class_precision, color=cmap(clas), lw=1,
                     where='post', label=lab)
        # compute also micro average (all class columns pooled together)
        micro_precision, micro_recall, _ = precision_recall_curve(
            y_truth_multi.ravel(), y_score.ravel())
        plt.step(micro_recall, micro_precision, color='black', where='post',
                 linestyle='--', lw=1, label='average')
        average_precision = average_precision_score(
            y_truth_multi, y_score, average='micro')
        plt.title(
            f'Average precision score, micro-averaged over all classes: {average_precision:0.2f}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    if n_classes > 2:
        plt.legend(loc='lower left')
    plt.grid()
    return res
print('auc: ', score) print("Features importance...") gain = model.feature_importance('gain') feat_imp = pd.DataFrame({'feature': model.feature_name(), 'split': model.feature_importance('split'), 'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False) print('Top 50 features:\n', feat_imp.head(50)) del x_train, x_val, y_train, y_val, train_set, val_set gc.collect() return y_pred, oof_pred y_pred, oof_pred = run_lgb(train, test, use_features) y_one_hot = label_binarize(train['label'], np.arange(4)) oof_one_hot = label_binarize(oof_pred.argmax(axis=1), np.arange(4)) score = roc_auc_score(y_one_hot, oof_one_hot) print('auc: ', score) submission = pd.read_csv(DATA_DIR+'sample_submission.csv') submission.label = y_pred.argmax(axis=1) submission.label = submission.label.map(mapping_dict_inv) submission.head() submission.to_csv('submission_lgb.csv', index=False) np.save('y_pred_lgb', y_pred) np.save('oof_pred_lgb', oof_pred)
# One iteration per cross-validation fold: fit a random forest on the
# training part and collect per-class and micro-averaged ROC data on the
# held-out part.
# NOTE(review): `k_fold`, `train`, `target`, `experiment`, `cpu_count` and
# `output_classes` are defined outside this chunk, and the loop body
# continues past the last visible line.
for train_index, test_index in k_fold.split(train):
    #Get the fold train, train target, test, test target
    fold_train = train[train_index]
    fold_test = train[test_index]
    fold_target_train = target[train_index]
    fold_target_test = target[test_index]
    #Create the classifier model; experiment[0] is used as the number of trees and experiment[1] as max_features
    random_forest_classifier = RandomForestClassifier(n_estimators = experiment[0], max_features = experiment[1], n_jobs = cpu_count)
    #Fit the classifier model on the train data
    random_forest_classifier.fit(fold_train, fold_target_train)
    #Predict the results for test data
    predictions = random_forest_classifier.predict(fold_test)
    #Get the probability estimates used for AUROC
    scores = random_forest_classifier.predict_proba(fold_test)
    #Binarize the output target, since this is a multi-class model and we get probability estimates for each class available
    binarized_outputs = label_binarize(fold_target_test, classes = output_classes)
    #Calculate the false positive rate and the true positive rate for each of the labels
    #(keys are the class column index, plus "micro" for the pooled curve)
    false_postive_rate = dict()
    true_postive_rate = dict()
    roc_auc = dict()
    for i in range(len(output_classes)):
        false_postive_rate[i], true_postive_rate[i], _ = roc_curve(binarized_outputs[:, i], scores[:, i])
        roc_auc[i] = auc(false_postive_rate[i], true_postive_rate[i])
    #Calculate the micro rates by pooling every (sample, class) pair into one curve
    false_postive_rate["micro"], true_postive_rate["micro"], _ = roc_curve(binarized_outputs.ravel(), scores.ravel())
    roc_auc["micro"] = auc(false_postive_rate["micro"], true_postive_rate["micro"])
    #Union of all per-class false positive rates, sorted and de-duplicated
    all_false_postive_rate = numpy.unique(numpy.concatenate([false_postive_rate[i] for i in range(len(output_classes))]))
    #Interpolate all ROC curves of the different labels at these points
    mean_true_postive_rate = numpy.zeros_like(all_false_postive_rate)
# NOTE(review): the accumulation on `count` suggests the next statements run
# once per batch inside a loop whose header is outside this chunk -- confirm
# against the surrounding code.
# Run inference and stack the raw outputs and labels of every batch.
model.forward(is_train=False)
raw_output = model.outputs[0].asnumpy()
# NOTE(review): `pred` is not used in the visible code; the raw outputs,
# not the softmax values, are what gets accumulated below.
pred = Softmax(raw_output)
if(count==1):
    # first batch: start the accumulators
    ypred=raw_output
    ytrue=label.asnumpy()
else:
    # later batches: append rows of scores and extend the label vector
    ypred=np.vstack((ypred,raw_output))
    ytrue=np.hstack((ytrue,label.asnumpy()))
count=count+1
# NOTE(review): `colors` is not used in the visible code.
colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
lw = 2
# One-hot encode the true labels over the ten classes 0..9.
ytrue=label_binarize(ytrue, classes=[0, 1, 2,3,4,5,6,7,8,9])
precision = dict()
recall = dict()
average_precision = dict()
# Micro-average: pool every (sample, class) pair into a single curve.
precision["micro"], recall["micro"], _ = precision_recall_curve(ytrue.ravel(),ypred.ravel())
average_precision["micro"] = average_precision_score(ytrue, ypred,average="micro")
# Clear the current figure and draw the micro-averaged curve.
plt.clf()
plt.plot(recall["micro"], precision["micro"], color='gold', lw=lw, label='Normal Training (area = {0:0.2f})'.format(average_precision["micro"]))
# # Dropout Training
# In[ ]:
# Evaluate every configuration with leave-one-patient-out (LOPO)
# cross-validation.
# NOTE(review): `config`, `id_patient_list`, `data` and `label` are defined
# outside this chunk, and the inner loop body continues past the last
# visible line.
result_config = []
for c in config:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('The following configuration will be used: {}'.format(c))
    result_cv = []
    # Go for LOPO cross-validation
    for idx_lopo_cv in range(len(id_patient_list)):
        # Display some information about the LOPO-CV
        print('Round #{} of the LOPO-CV'.format(idx_lopo_cv + 1))

        # Get the testing data: the held-out patient as a column matrix,
        # with NaNs replaced by np.nan_to_num
        testing_data = np.atleast_2d(data[idx_lopo_cv]).T
        testing_data = np.nan_to_num(testing_data)
        # `classes` is passed by keyword: accepted by old scikit-learn
        # releases and required by current ones.
        testing_label = label_binarize(label[idx_lopo_cv], classes=[0, 255])
        print('Create the testing set ...')

        # Create the training data and label from every other patient
        training_data = [
            arr for idx_arr, arr in enumerate(data)
            if idx_arr != idx_lopo_cv
        ]
        training_label = [
            arr for idx_arr, arr in enumerate(label)
            if idx_arr != idx_lopo_cv
        ]
        # Concatenate the data
        training_data = np.atleast_2d(np.hstack(training_data)).T
        training_data = np.nan_to_num(training_data)
        training_label = label_binarize(
            np.hstack(training_label).astype(int), classes=[0, 255])
        print('Create the training set ...')
# Seed the mixture parameters from the labelled samples: for each of the ten
# classes take the maximum-likelihood mean, the inverse covariance and the
# class prior (the original comments quote 128-dim means and 128x128
# matrices).
class_means, class_precisions, class_weights = [], [], []
for y in range(10):
    mask = y_train_lbl == y
    class_rows = X_train_lbl[mask]
    # mean vector of class y
    class_means.append(np.mean(class_rows, axis=0))
    # inverse of the covariance matrix of class y
    class_precisions.append(np.linalg.inv(np.cov(class_rows, rowvar=False)))
    # prior of class y: its labelled count over all labelled rows
    class_weights.append(counts[y] / labeled_rows)
# (mean, inverse covariance, weight) lists, indexed by class
thetas = [class_means, class_precisions, class_weights]

# Responsibilities for every row; the labelled rows get their one-hot class.
# TODO: empty? / sparse_output=True?
total_rows = labeled_rows + unlabeled_rows
gammas = np.zeros((total_rows, 10), dtype='float64')
gammas[:labeled_rows] = preprocessing.label_binarize(
    y_train_lbl, classes=range(10))

# Hard assignments; only the labelled prefix is initialised here.
predicted_class = np.empty(total_rows)
predicted_class[:labeled_rows] = y_train_lbl

# EM settings, see https://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html
tol = 0.001
max_iter = 100

# n = all_rows = X_train_all.shape[0]
# Mixture model used for P(x), started from the estimates above.
gm = GaussianMixture(
    n_components=10,
    max_iter=1,
    weights_init=thetas[2],
    means_init=thetas[0],
    precisions_init=thetas[1],
)
gm.fit(X_train_lbl)  # needed before predict can be called
print(gm.get_params()['means_init'] == thetas[0])
def plot_roc(model, x_train, y_train):
    """Plot per-class, micro- and macro-averaged ROC curves for ``model``.

    The data is split 80/20 with a fixed seed. ``model.predict`` is called
    on the 20% hold-out, and the curves are computed from those outputs and
    shown with ``plt.show()``.

    Parameters
    ----------
    model : object
        Fitted model whose ``predict(x)`` output is indexed as
        ``predictions[:, k]``, i.e. one score column per class.
    x_train : array-like
        Feature rows; converted with ``np.array(..., ndmin=2)``.
    y_train : array-like
        Class labels; assumed to be non-negative integers ``0..K-1``
        (integer-valued floats are accepted). Transposed when that makes
        its first dimension match ``x_train``.

    Raises
    ------
    ValueError
        If the features and labels disagree in length even after
        transposing the labels.

    Notes
    -----
    NOTE(review): with only two classes the binarised labels would have a
    single column, so the per-class loop presumably needs K >= 3 -- confirm.
    """
    features = np.array(x_train, ndmin=2)
    targets = np.array(y_train, ndmin=2)
    # Labels passed as a flat sequence arrive as a single row; turn them
    # into a column so they line up with the feature rows.
    if features.shape[0] != targets.shape[0]:
        targets = targets.T
    if features.shape[0] != targets.shape[0]:
        # Fail here with the actual shapes instead of printing and letting
        # the split below raise a less specific ValueError.
        raise ValueError(
            "x_train and y_train do not match in length: {} vs {}".format(
                features.shape, targets.shape))

    # Take the class count from ALL labels, so a class that only lands in
    # the hold-out split is still binarised; int() also covers float labels.
    n_classes = int(np.max(targets)) + 1
    classes = list(range(n_classes))

    _, x_test, _, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=123)

    # binarize classes
    y_test_bin = label_binarize(y_test, classes=classes)
    predictions = model.predict(x_test)

    # Compute ROC curve and ROC area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], predictions[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(),
                                              predictions.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points (np.interp is the
    # function the deprecated scipy.interp alias pointed to)
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # Three colours, reused cyclically when there are more classes.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(i, roc_auc[i]))

    # Chance diagonal and axis cosmetics.
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
def _macro_average_auc(y_binarized, probabilities):
    """Return the macro-averaged one-vs-rest ROC AUC.

    One ROC curve is computed per column of ``y_binarized`` against the same
    column of ``probabilities``. The curves are interpolated onto the union
    of their false-positive rates, averaged, and the area under the mean
    curve is returned.
    """
    n_classes = y_binarized.shape[1]
    curves = []
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_binarized[:, i], probabilities[:, i])
        curves.append((fpr, tpr))

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr for fpr, _ in curves]))
    # Then interpolate all ROC curves at these points (np.interp is the
    # function the deprecated scipy.interp alias pointed to)
    mean_tpr = np.zeros_like(all_fpr)
    for fpr, tpr in curves:
        mean_tpr += np.interp(all_fpr, fpr, tpr)
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    return auc(all_fpr, mean_tpr)


def print_model_scores(model, y_test, X_test_scaled, X_train_scaled, y_train):
    """Print and return accuracy and macro-averaged ROC AUC for ``model``.

    Prints train/test accuracy, both confusion matrices and both macro AUC
    values, followed by a blank line.

    Parameters
    ----------
    model : object
        Fitted classifier providing ``predict``, ``predict_proba`` and
        ``score``.
    y_test, y_train : array-like
        True labels; binarised against the fixed class list
        ``[1, 2, 10, 15]``.
    X_test_scaled, X_train_scaled : array-like
        Feature matrices passed to the model unchanged.

    Returns
    -------
    dict
        Keys ``'Test_Accuracy'``, ``'Train_Accuracy'``, ``'Train AUC'`` and
        ``'Test AUC'``, each rounded to four decimals.

    Notes
    -----
    NOTE(review): the ``predict_proba`` columns are assumed to follow the
    order ``[1, 2, 10, 15]`` -- confirm against ``model.classes_``.
    """
    from sklearn.metrics import confusion_matrix

    # Single source for the class list used by both binarisations.
    unique_classes = [1, 2, 10, 15]

    y_predicted_test = model.predict(X_test_scaled)
    y_predicted_train = model.predict(X_train_scaled)

    # Score once and reuse for both the printout and the returned dict.
    train_accuracy = model.score(X_train_scaled, y_train)
    test_accuracy = model.score(X_test_scaled, y_test)
    print("Train Accuracy : %.4f " % train_accuracy)
    print("Test Accuracy : %.4f " % test_accuracy)

    print("Confusion matrix Train: ")
    print(confusion_matrix(y_train, y_predicted_train))
    print("Confusion matrix Test: ")
    print(confusion_matrix(y_test, y_predicted_test))

    probabilities_test = model.predict_proba(X_test_scaled)
    probabilities_train = model.predict_proba(X_train_scaled)

    # Binarize the output
    y_test_binarized = label_binarize(y_test, classes=unique_classes)
    y_train_binarized = label_binarize(y_train, classes=unique_classes)

    ## Calculate MultiClass AUC
    train_auc = _macro_average_auc(y_train_binarized, probabilities_train)
    print("AUC Train: {:.4f}".format(train_auc))
    test_auc = _macro_average_auc(y_test_binarized, probabilities_test)
    print("AUC Test: {:.4f}".format(test_auc))
    print("\n")

    return {
        'Test_Accuracy': round(test_accuracy, 4),
        'Train_Accuracy': round(train_accuracy, 4),
        'Train AUC': round(train_auc, 4),
        'Test AUC': round(test_auc, 4),
    }