def pmid_26033813_analysis(drug: str):
    """Score samples with the PMID26033813 tree and write ROC / PR results.

    Loads the pickled labels for ``drug`` from the newest feature/label
    directory, keeps the samples present in both the labels and the
    module-level ``expr`` table, fits the tree on them via ``fit_tree`` and
    scores each of those samples with ``predict_sample``.

    Side effects: pickles the ROC data under ``data_path`` and writes the ROC
    and precision-recall figures under ``output_path``.
    """
    tree = build_tree()

    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Restrict labels and expression data to the samples they share.
    samples = sorted_intersection(all_labels.index, expr.index)
    labels = all_labels.loc[samples]
    sample_expr = expr.loc[samples, :]

    fit_tree(sample_expr, labels, tree)
    scores = [predict_sample(name, sample_expr, tree) for name in samples]
    predictions = pd.Series(scores, index=samples)

    drug_title = drug.title()

    roc = RocData.calculate(labels, predictions)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(roc, f'PMID26033813 ROC: {drug_title}',
             output_path / f'{drug}_roc.pdf')

    precision_recall = PrData.calculate(labels, predictions)
    plot_pr(precision_recall,
            f'PMID26033813 Precision-Recall: {drug_title}',
            output_path / f'{drug}_pr.pdf')
def roc_pr(self):
    """Compute, print and plot the ROC and precision-recall curves.

    Reads ``self.labels`` and ``self.predicts`` and treats label ``1`` as the
    positive class. Stores ``fpr``, ``tpr``, ``ths``, ``auc``, ``precision``
    and ``recall`` on the instance, and asks ``utils`` to save the figures to
    ``output/roc.png`` and ``output/pr.png``.
    """
    # ROC curve
    self.fpr, self.tpr, self.ths = metrics.roc_curve(
        y_true=self.labels, y_score=self.predicts, pos_label=1)
    print('fpr', self.fpr)
    print('tpr', self.tpr)
    print('ths', self.ths)

    self.auc = metrics.auc(self.fpr, self.tpr)
    print('AUC', self.auc)
    utils.plot_roc(self.fpr, self.tpr, self.ths, self.auc,
                   save_path='output/roc.png')

    # PR curve. Labels and scores are passed positionally because the keyword
    # for the score argument was renamed (``probas_pred`` -> ``y_score``) in
    # newer scikit-learn releases; positional arguments work with both.
    self.precision, self.recall, thresholds = metrics.precision_recall_curve(
        self.labels, self.predicts, pos_label=1)
    print("precision len", len(self.precision))
    print("recall len", len(self.recall))
    print("thresholds len", len(thresholds))
    utils.plot_pr(self.precision, self.recall, thresholds,
                  save_path='output/pr.png')
def test(model, dataloader, epoch, is_graph=False):
    """Evaluate ``model`` on pairs from ``dataloader`` and return the EER.

    Every batch yields two inputs and a target; both inputs are embedded with
    ``model`` and compared with ``FullPairComparer``. The collected distances
    and labels are passed to ``evaluate``, and the equal error rate is taken
    as half the mean of ``fpr_optimum + fnr_optimum``.

    Updates the module-level ``best_test`` with the lowest EER seen so far.
    When this run is at least as good as the previous best and ``is_graph``
    is true, the ROC, DET and density figures and a CSV of the raw outputs
    are written to ``args.outdir``; unless ``args.evaluate`` is set, the best
    model checkpoint is also copied.
    """
    global best_test
    label_batches, distance_batches = [], []

    with torch.set_grad_enabled(False):
        comparer = FullPairComparer().cuda()
        model.eval()
        for batch_idx, (data1, data2, target) in enumerate(dataloader):
            target = target.cuda(non_blocking=True)
            embedding1 = model(data1, False)
            embedding2 = model(data2, False)
            pair_dist = comparer(embedding1, embedding2)
            # TODO: sign - torch.sign()
            # pair_dist = comparer(torch.sign(F.relu(embedding1)),
            #                      torch.sign(F.relu(embedding2)))
            distance_batches.append(pair_dist.data.cpu().numpy())
            label_batches.append(target.data.cpu().numpy())
            if batch_idx % 50 == 0:
                print('Batch-Index -{}'.format(str(batch_idx)))

    # Flatten the per-batch arrays by one level.
    labels = np.array([item for batch in label_batches for item in batch])
    distances = np.array(
        [item for batch in distance_batches for item in batch])

    (tpr, fpr, fnr, fpr_optimum, fnr_optimum,
     accuracy, threshold) = evaluate(distances, labels)

    EER = np.mean(fpr_optimum + fnr_optimum) / 2
    print('TEST - Accuracy = {:.12f}'.format(accuracy))
    print('TEST - EER = {:.12f}'.format(EER))

    is_best = EER <= best_test
    best_test = min(EER, best_test)

    if is_best and is_graph:
        plot_roc(fpr, tpr,
                 figure_name=args.outdir + '/Test_ROC-{}.png'.format(epoch))
        plot_DET_with_EER(
            fpr, fnr, fpr_optimum, fnr_optimum,
            figure_name=args.outdir + '/Test_DET-{}.png'.format(epoch))
        plot_density(
            distances, labels,
            figure_name=args.outdir + '/Test_DENSITY-{}.png'.format(epoch))

        results = pd.DataFrame({
            'distances': distances.transpose(),
            'labels': labels.transpose(),
        })
        results.to_csv(args.outdir + "/test_outputs.csv", index=False)

        if args.evaluate is False:
            shutil.copyfile(args.outdir + '/model_best.pth.tar',
                            args.outdir + '/test_model_best.pth.tar')

    return EER
def ki67_analysis(drug: str):
    """Use the expression of the module-level ``gene`` as a score for ``drug``.

    Loads the pickled labels for ``drug`` from the newest feature/label
    directory, keeps the samples that also appear in ``expr``, and evaluates
    the ``gene`` expression column directly against the labels.

    Side effects: pickles the ROC data under ``data_path`` and writes the ROC
    and precision-recall figures under ``output_path``.
    """
    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Restrict to samples present in both the labels and the expression data.
    samples = sorted_intersection(all_labels.index, expr.index)
    gene_expr = expr.loc[samples, gene]
    labels = all_labels.loc[samples]

    drug_title = drug.title()

    roc = RocData.calculate(labels, gene_expr)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(roc, f'Ki67 ROC: {drug_title}', output_path / f'{drug}_roc.pdf')

    precision_recall = PrData.calculate(labels, gene_expr)
    plot_pr(precision_recall, f'Ki67 Precision-Recall: {drug_title}',
            output_path / f'{drug}_pr.pdf')
def pmid_26892682_analysis(drug: str):
    """Apply the PMID26892682 logistic model to ``drug`` and write ROC / PR output.

    Loads the pickled labels for ``drug`` from the newest feature/label
    directory, keeps the samples that also appear in ``expr``, and computes a
    probability per sample as ``expit(expression @ coefs)`` over the
    module-level ``selected_genes`` columns.

    Side effects: pickles the ROC data under ``data_path`` and writes the ROC
    and precision-recall figures under ``output_path``.
    """
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    # Restrict to samples present in both the labels and the expression data.
    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0;
    # ``.values`` is available on old and new pandas alike.
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
def validate(model, epoch):
    """Evaluate ``model`` on the module-level ``test_loader``.

    Each batch provides two inputs and a label; both inputs are embedded and
    compared with ``l2_dist``. The distances and labels of all batches are
    passed to ``evaluate``; the mean accuracy is printed, logged through
    ``logger`` and returned, and a ROC figure is written to ``args.log_dir``.

    Args:
        model: network called as ``model(batch, None, None)``.
        epoch: epoch number, used in the progress text and the figure name.

    Returns:
        Mean of the accuracy values returned by ``evaluate``.
    """
    model.eval()
    labels, distances = [], []

    pbar = tqdm(enumerate(test_loader))
    # Inference only. ``Variable(..., volatile=True)`` no longer disables
    # autograd on PyTorch >= 0.4, so gradients are switched off explicitly.
    with torch.no_grad():
        for batch_idx, (data_a, data_p, label) in pbar:
            # label is 0 or 1
            if args.cuda:
                data_a, data_p = data_a.cuda(), data_p.cuda()

            out_a = model(data_a, None, None)
            out_p = model(data_p, None, None)

            # Distances for one batch (euclidean distance).
            dists = l2_dist.forward(out_a, out_p)
            distances.append(dists.data.cpu().numpy())
            labels.append(label.data.cpu().numpy())

            if batch_idx % args.log_interval == 0:
                pbar.set_description('Test Epoch: {} [{}/{} ({:.0f}%)]'.format(
                    epoch, batch_idx * len(data_a), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader)))

    # Flatten the per-batch arrays by one level.
    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array([subdist for dist in distances for subdist in dist])

    tpr, fpr, accuracy, best_threshold = evaluate(distances, labels)
    mean_accuracy = np.mean(accuracy)
    print('\n\33[91mTest set: Accuracy: {:.8f} best_threshold: {:.2f}\33[0m'.
          format(mean_accuracy, best_threshold))
    logger.log_value('Test Accuracy', mean_accuracy)

    plot_roc(fpr, tpr, args.log_dir,
             figure_name="roc_test_epoch_{}.png".format(epoch))
    return mean_accuracy
def train(self):
    """Run the train / validate loop for ``self.hparams.epoch_num`` epochs.

    Within ``self.graph`` the string handles of the training and validation
    iterators are fetched once. Every epoch re-initialises both iterators,
    runs ``_train_epoch`` on the training handle, runs ``_evaluate`` on the
    validation handle, and plots a ROC curve from the concatenated results.
    """
    with self.graph.as_default():
        # TODO: add resume option
        # TODO: add exception handler
        train_handle = self.sess.run(
            self.data.train_iterator.string_handle())
        valid_handle = self.sess.run(
            self.data.valid_iterator.string_handle())
        for epoch in range(self.hparams.epoch_num):
            # Both iterators are re-initialised at the start of every epoch.
            self.sess.run(self.data.train_iterator.initializer)
            self.sess.run(self.data.valid_iterator.initializer)
            log.infov('Epoch %i' % epoch)
            # Train
            self._train_epoch('epoch %i (training)' % epoch, train_handle)
            # Validate
            y_score, y = self._evaluate('epoch %i (evaluating)' % epoch,
                                        valid_handle)
            # Join the sequences returned by _evaluate along axis 0, then
            # plot the ROC curve.
            # NOTE(review): presumably one entry per batch - confirm.
            y_score = np.concatenate(y_score, axis=0)
            y = np.concatenate(y, axis=0)
            plot_roc(y_score, y)
def test_detector(self, test_path, saved_model):
    """Evaluate a saved model on the dataset stored at ``test_path``.

    Loads the data with ``self.load_dataset`` and the model with
    ``load_model``, takes the arg-max of the model output as the predicted
    class, prints the accuracy and the confusion matrix, and writes the
    confusion-matrix and ROC figures under ``metrics/``.
    """
    test_data, test_labels = self.load_dataset(test_path)
    model = load_model(saved_model)

    class_scores = model.predict(test_data)
    pred = np.argmax(class_scores, axis=1)

    accuracy = metrics.accuracy_score(test_labels, pred)
    print("Final accuracy: {}".format(accuracy))

    # Compute confusion matrix
    conf_matrix = confusion_matrix(test_labels, pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(conf_matrix)

    plt.figure()
    utils.plot_confusion_matrix(conf_matrix, names=['M', 'B'],
                                plot_name='metrics/conf_matrix.png')

    # Plot ROC curve.
    # NOTE(review): the curve is drawn from the arg-max class labels, not
    # from continuous scores - confirm this is what utils.plot_roc expects.
    utils.plot_roc(pred, test_labels, plot_name='metrics/roc_curve.png')
# 5 fold cross validation skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) prediction_scores = np.empty(y.shape[0], dtype='object') for train_idx, val_idx in tqdm(skf.split(X, y)): X_train, X_val = X[train_idx], X[val_idx] y_train = y[train_idx] clf = clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_val)[:, 1] # Save the predictions for this fold prediction_scores[val_idx] = y_pred plt.title('SVM 5-fold cross validation ROC AUC') plot_roc(y, prediction_scores) plt.savefig('report/figures/svm_roc.png', dpi=300) plot_prediction_samples(imgs, y, prediction_scores, 'SVM Prediction Samples') plt.savefig('report/figures/svm_confmat.png', dpi=300) # %% # load and preprocess test data then create submission X_test, test_ids = get_data(test=True) X_test = np.stack([get_HOG(img, **hog_params) for img in X_test]) clf = clf.fit(X, y) test_predictions = clf.predict_proba(X_test)[:, 1] make_submission(test_ids, test_predictions, fname='submissions/svc_10_hog_16_4_fulltrain.csv')
def train_model(clf_factory, X, Y, name, plot=False):
    """Fit a classifier on a shuffled 70/30 split and report its scores.

    For the split produced by ``ShuffleSplit`` a fresh classifier from
    ``clf_factory`` is fitted, its train/test scores and confusion matrix are
    recorded, and one-vs-rest precision-recall and ROC curves are computed
    for every label in ``Y``.

    Args:
        clf_factory: zero-argument callable returning an unfitted classifier
            that provides ``fit``, ``score``, ``predict`` and
            ``predict_proba``.
        X: feature array, indexable by the index arrays of the split.
        Y: label array. Each label is also used as a column index into
            ``predict_proba`` and as an index into ``genre_list``.
        name: description prefix for the plots.
        plot: when true, plot the PR and ROC curves of the median split for
            every label.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(
        n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls = defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # The probabilities do not depend on the label: compute them once.
        proba = clf.predict_proba(X_test)
        for label in labels:
            # One-vs-rest: 1 where the true class is ``label``, else 0.
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba_label = proba[:, label]

            precision, recall, _ = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)

            fpr, tpr, _ = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            # Floor division: a float index raises TypeError on Python 3.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    # Materialise the dict view first: on Python 3, np.asarray on the bare
    # view yields a 0-d object array that np.mean / np.std cannot reduce.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
rd.fpr, rd.tpr, lw=1, label= f'Permutation {i} (area = {rd.auc:.{SIGNIFICANT_DIGITS}f})') plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') plt.xlim([-0.01, 1.01]) plt.ylim([-0.01, 1.01]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.axes().set_aspect('equal', 'datalim') plt.legend(loc='lower right') plt.title(f'SNF Clustering: {drug.title()}') figure_path = output_path / f'roc_comparison_{drug}.pdf' print('Saving ROC plot to', figure_path) plt.savefig(str(figure_path), bbox_inches='tight') best_permutation = aucs.idxmax() rd = roc_data[best_permutation] plot_roc( roc_data[best_permutation], f'SNF Clustering ROC: {drug.title()}', output_path / f'roc_best_{drug}.pdf', ) rd.save(data_path / f'roc_data_{drug}.pickle')
def train_model(
        clf_factory, X, Y, name, plot=False,
        model_path=r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl'):
    """Train a logistic-regression model on a 70/30 split and save it to disk.

    Args:
        clf_factory: unused; a ``LogisticRegression`` is always trained. The
            parameter is kept so existing callers keep working.
        X: feature array, indexable by the index arrays of the split.
        Y: label array.
        name: description prefix for the plots.
        plot: when true, compute one-vs-rest PR / ROC curves per label and
            plot those of the median split. Labels are then also used as
            column indices into ``predict_proba`` and as indices into
            ``genre_list``.
        model_path: destination of the ``joblib`` dump of the trained model.
            Defaults to the path that was previously hard-coded.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``.

    Side effects: rebinds the module-level ``clf`` to the trained model.
    """
    # The trained model is published as a module-level global.
    global clf

    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    # Per-label curve data. These must be real defaultdicts:
    # ``list(defaultdict(list))`` is just an empty list, which made the
    # plotting branch fail as soon as it was indexed by label.
    pr_scores = defaultdict(list)
    precisions, recalls = defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs, fprs = defaultdict(list), defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # Curve data is only needed for plotting, so the default path
        # (plot=False) does no extra work.
        if plot:
            proba = clf.predict_proba(X_test)
            for label in labels:
                # One-vs-rest: 1 where the true class is ``label``, else 0.
                y_label_test = np.asarray(y_test == label, dtype=int)
                proba_label = proba[:, label]

                precision, recall, _ = precision_recall_curve(
                    y_label_test, proba_label)
                pr_scores[label].append(auc(recall, precision))
                precisions[label].append(precision)
                recalls[label].append(recall)

                fpr, tpr, _ = roc_curve(y_label_test, proba_label)
                roc_scores[label].append(auc(fpr, tpr))
                tprs[label].append(tpr)
                fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    summary = (np.mean(scores), np.std(scores))
    print(summary)

    # Save the trained model to disk.
    joblib.dump(clf, model_path)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# Patient x gene-set indicator matrix: entry (p, i) is 1 when patient p has a
# truthy value in any column of gene set i in ``muts``, else 0.
patient_gene_set_muts = pd.DataFrame(0, index=muts.index, columns=range(len(entrez_gene_sets)))
for i, gene_set in enumerate(entrez_gene_sets):
    patient_gene_set_muts.loc[:, i] = muts.loc[:, gene_set].any(axis=1).astype(int)
# Number of gene sets flagged per patient.
pathway_mut_counts = patient_gene_set_muts.sum(axis=1)

gene_set_mut_matrix_path = data_path / 'gene_set_mut_matrix.pickle'
print('Saving gene set mutation matrix to', gene_set_mut_matrix_path)
patient_gene_set_muts.to_pickle(gene_set_mut_matrix_path)

pathway_mut_count_path = data_path / 'pathway_mut_counts.pickle'
print('Saving pathway mutation counts to', pathway_mut_count_path)
pathway_mut_counts.to_pickle(pathway_mut_count_path)

drugs = ['ai_all', 'arimidex']
feature_label_path = find_newest_data_path(f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
# For each drug, use the per-patient count directly as the score and
# evaluate it against that drug's labels.
for drug in drugs:
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')
    # Restrict to patients present in both the labels and the counts.
    selected_samples = sorted_intersection(labels_all.index, pathway_mut_counts.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_counts = pathway_mut_counts.loc[selected_samples]
    rd = RocData.calculate(selected_labels, selected_counts)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'WExT Pathway Mutation Count ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')
def train_model(clf_factory, X, Y, name, plot=False):
    """Fit a classifier on a shuffled 70/30 split and summarise its scores.

    A fresh classifier from ``clf_factory`` is fitted on the training part of
    the split; train/test scores, the confusion matrix and one-vs-rest
    precision-recall / ROC curves for every label are recorded.

    Args:
        clf_factory: zero-argument callable returning an unfitted classifier
            that provides ``fit``, ``score``, ``predict`` and
            ``predict_proba``.
        X: feature array, indexable by the index arrays of the split.
        Y: label array. Each label is also used as a column index into
            ``predict_proba`` and as an index into ``genre_list``.
        name: description prefix for the plots.
        plot: when true, plot the PR and ROC curves of the median split for
            every label.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, indices=True,
                      random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls = defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # Loop-invariant: the probabilities are the same for every label.
        proba = clf.predict_proba(X_test)
        for label in labels:
            # One-vs-rest: 1 where the true class is ``label``, else 0.
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba_label = proba[:, label]

            precision, recall, _ = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)

            fpr, tpr, _ = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            # Single %-formatted argument so the output is the same whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            # Floor division: a float index raises TypeError on Python 3.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    # Materialise the dict view first: on Python 3, np.asarray on the bare
    # view yields a 0-d object array that np.mean / np.std cannot reduce.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def test_task(dl_model, processed_train_outputs, best_checkpoint, version,
              batch_size, device='0'):
    """Restore a checkpoint, score the test split and compute its uplift.

    The test-split arrays are taken from ``processed_train_outputs`` and fed
    through ``dl_model`` in batches of ``batch_size`` inside a TensorFlow
    session on ``device``, after restoring the checkpoint
    ``models/dl_model_<version>/model_<version>-<best_checkpoint>``.

    Returns:
        Dict with the squeezed predictions under ``"prediction"`` and the
        result of ``U.get_uplift`` under ``"uplift"``.
    """
    # Only the test-split entries are used below; the full destructuring is
    # kept so that an unexpectedly shaped input still fails here.
    (_, _, numeric_input_test,
     _, _, numeric_input_test_bureau,
     _, _, numeric_input_test_prev_app
     ) = processed_train_outputs["normalized_data"]
    (_, _, categorical_input_test,
     _, _, categorical_input_test_bureau,
     _, _, categorical_input_test_prev_app
     ) = processed_train_outputs["categorical_inputs"]
    _, _, _ = processed_train_outputs["data_normalizers"]
    _, _, target_test = processed_train_outputs["targets"]
    _, _, _ = processed_train_outputs["predict_data"]

    # Instantiating Batcher object
    predict_batcher = U.Batcher(
        [
            numeric_input_test, categorical_input_test,
            numeric_input_test_bureau, categorical_input_test_bureau,
            numeric_input_test_prev_app, categorical_input_test_prev_app,
        ],
        batch_size,
        shuffle_on_reset=False)

    batch_predictions = []
    placeholders = dl_model.placeholders

    with M.start_tensorflow_session(device=device) as sess:
        dl_model.model_saver.restore(
            sess,
            "models/dl_model_" + str(version) + "/model_" + str(version) +
            "-" + str(best_checkpoint),
        )

        for _ in range(predict_batcher.n_batches):
            # Get next batch
            (num_app, cat_app, num_bureau, cat_bureau,
             num_prev_app, cat_prev_app) = predict_batcher.next()

            # Creating feed_dict
            feed = {
                placeholders.numeric_input: num_app,
                placeholders.numeric_input_bureau: num_bureau,
                placeholders.numeric_input_prev_app: num_prev_app,
            }
            # One placeholder per categorical column of each table.
            for col, col_name in enumerate(
                    C.col_classes["application_train"]["categorical"]):
                feed[placeholders.embedding[col_name]] = (
                    cat_app[:, col].reshape([-1, 1]))
            for col, col_name in enumerate(
                    C.col_classes["bureau"]["categorical"]):
                feed[placeholders.embedding_bureau[col_name]] = (
                    np.expand_dims(cat_bureau[:, :, col], axis=2))
            for col, col_name in enumerate(
                    C.col_classes["previous_application"]["categorical"]):
                feed[placeholders.embedding_prev_app[col_name]] = (
                    np.expand_dims(cat_prev_app[:, :, col], axis=2))

            # Run forward prop
            batch_predictions.append(
                sess.run(dl_model.forward.pred, feed_dict=feed))

    # Flatten the per-batch outputs by one level, then drop size-1 axes.
    final_prediction_test = np.squeeze(np.array(
        [elem for batch in batch_predictions for elem in batch]))

    U.plot_roc(target_test, final_prediction_test)
    U.print_distribution(final_prediction_test)
    uplift = U.get_uplift(target_test, final_prediction_test, N=100,
                          plot="uplift_acum")

    return {"prediction": final_prediction_test, "uplift": uplift}
def train_using_pretrained_model(images, labels, path, net, epochs=10,
                                 learning_rate=0.0001, batch_size=32):
    """Fine-tune ``net`` with BCE loss and checkpoint its best epoch.

    The data is split 80/20 into train and test parts. Every epoch trains on
    the training part, evaluates on the test part and saves
    ``net.state_dict()`` to ``path`` whenever the test accuracy improves.
    Loss, accuracy and ROC-AUC figures are written under ``./results``.

    Args:
        images: input samples; each batch is reshaped to
            ``(batch, 1, 224, 224)`` before being passed to ``net``.
        labels: targets, compared element-wise with the rounded outputs.
        path: file that receives the best ``state_dict``.
        net: network to fine-tune (modified in place).
        epochs: number of epochs.
        learning_rate: Adam learning rate.
        batch_size: batch size of both data loaders.
    """
    best_accuracy = 0.0
    train_loss, test_loss = [], []
    train_acc, test_acc = [], []
    roc = []
    roc_score = []
    roc_true = []

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Training data
    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=0.2, random_state=6)
    train_data = Dataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
        train_data, shuffle=True, batch_size=batch_size)

    # Testing data
    test_data = Dataset(X_test, y_test)
    test_loader = torch.utils.data.DataLoader(
        test_data, shuffle=True, batch_size=batch_size)

    for epoch in range(epochs):
        train_loss_it, train_acc_it = [], []
        test_loss_it, test_acc_it = [], []
        roc_score_it, roc_true_it = [], []

        total_step = len(train_loader)
        for i, (batch_images, batch_labels) in enumerate(train_loader):
            batch_images = batch_images.reshape(
                len(batch_images), 1, 224, 224)

            optimizer.zero_grad()
            outputs = net(batch_images)
            loss = criterion(outputs.double(), batch_labels)
            loss.backward()
            optimizer.step()

            # Accuracy
            predicted = torch.round(outputs.data)
            total = batch_labels.size(0) * batch_labels.size(1)
            correct = (predicted == batch_labels).sum().item()
            accuracy = 100 * correct / total
            print('Epoch [{}/{}], Step [{}/{}], Train-Loss: {:.4f}, Train-Acc: {:.2f} %'
                  .format(epoch + 1, epochs, i + 1, total_step, loss.item(),
                          accuracy))
            train_acc_it.append(accuracy)
            train_loss_it.append(loss.item())

        train_acc.append(np.mean(np.array(train_acc_it)))
        train_loss.append(np.mean(np.array(train_loss_it)))

        total = 0.0
        correct = 0.0
        # Evaluate in inference mode with autograd off, so test batches do
        # not go through training-mode layers or build autograd graphs. The
        # previous mode is restored afterwards so the caller's choice of
        # training mode is untouched.
        was_training = net.training
        net.eval()
        with torch.no_grad():
            for batch_images, batch_labels in test_loader:
                batch_images = batch_images.reshape(
                    len(batch_images), 1, 224, 224)
                outputs = net(batch_images)
                loss = criterion(outputs.double(), batch_labels)

                predicted = torch.round(outputs.data)
                total += batch_labels.size(0) * batch_labels.size(1)
                correct += (predicted == batch_labels).sum().item()
                # Running accuracy over the batches seen so far.
                accuracy = 100 * correct / total

                true = np.array(batch_labels).reshape(-1)
                score = np.array(outputs.data).reshape(-1)
                roc.append(roc_auc_score(true, score))
                roc_score_it.extend(score)
                roc_true_it.extend(true)
                test_acc_it.append(accuracy)
                test_loss_it.append(loss.item())
        net.train(was_training)

        test_accuracy = 100 * correct / total
        test_acc.append(np.mean(np.array(test_acc_it)))
        test_loss.append(np.mean(np.array(test_loss_it)))
        print('[Test] Epoch [{}/{}], Acc: {:.2f}'.format(
            epoch + 1, epochs, test_accuracy))

        if test_accuracy > best_accuracy:
            torch.save(net.state_dict(), path)
            best_accuracy = test_accuracy

        # Keep the raw scores of every 10th epoch for the ROC curves.
        if (epoch + 1) % 10 == 0:
            roc_score.append(roc_score_it)
            roc_true.append(roc_true_it)

    # ROC
    if epochs > 9:
        true = np.array(roc_true)
        score = np.array(roc_score)
        plot_roc_binary(true, score, './results/transfer_bin_roc.pdf',
                        'Transfer Binary Classifier COVID')
    plot_roc(roc, './results/transfer_bin_roc_auc.pdf',
             'Transfer Binary Classifier COVID')
    plot_loss(train_loss, test_loss, './results/transfer_bin_loss.pdf',
              'Transfer Binary Classifier COVID')
    plot_acc(train_acc, test_acc, './results/transfer_bin_acc.pdf',
             'Transfer Binary Classifier COVID')
def train_model(images, labels, path, epochs=10, learning_rate=0.0001,
                batch_size=32):
    """Train a fresh ``Net`` as a binary classifier and return it.

    The data is split 80/20 into train and test parts. Every epoch trains on
    the training part and evaluates on the test part with autograd disabled.
    After the last epoch the ``state_dict`` is saved to ``path`` and the
    loss, accuracy and ROC figures are written under ``./results``.

    Returns:
        The trained network.
    """
    net = Net()

    train_loss, test_loss = [], []
    train_acc, test_acc = [], []
    roc_score, roc_true, roc = [], [], []

    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Generate dataset
    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=0.2, random_state=6)
    train_loader = torch.utils.data.DataLoader(
        Dataset(X_train, y_train), shuffle=True, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(
        Dataset(X_test, y_test), shuffle=True, batch_size=batch_size)

    for epoch in range(epochs):
        train_loss_it, train_acc_it = [], []
        test_loss_it, test_acc_it = [], []
        roc_score_it, roc_true_it = [], []

        net.train()
        total_step = len(train_loader)
        for step, (batch_x, batch_y) in enumerate(train_loader):
            batch_x = batch_x.reshape(len(batch_x), 1, 224, 224)

            # Forward pass
            outputs = net(batch_x)
            loss = criterion(outputs, batch_y)

            # Accuracy
            predicted = torch.round(outputs.data).reshape(len(batch_y))
            total = batch_y.size(0)
            correct = (predicted == batch_y).sum().item()
            accuracy = 100 * correct / total

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print('[Train] Epoch [{}/{}], Step [{}/{}], Train-Loss: {:.4f}, Train-Acc: {:.2f} %'
                  .format(epoch + 1, epochs, step + 1, total_step,
                          loss.item(), accuracy))
            train_acc_it.append(accuracy)
            train_loss_it.append(loss.item())

        train_acc.append(np.mean(np.array(train_acc_it)))
        train_loss.append(np.mean(np.array(train_loss_it)))

        net.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            total_step = len(test_loader)
            for step, (batch_x, batch_y) in enumerate(test_loader):
                batch_x = batch_x.reshape(len(batch_x), 1, 224, 224)
                outputs = net(batch_x)
                loss = criterion(outputs, batch_y)

                predicted = torch.round(outputs.data).reshape(len(batch_y))
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
                # Running accuracy over the batches seen so far.
                accuracy = 100 * correct / total

                roc_score_it.extend(np.array(outputs.data).reshape(-1))
                roc_true_it.extend(np.array(batch_y).reshape(-1))
                test_acc_it.append(accuracy)
                test_loss_it.append(loss.item())

                # ROC-AUC of this batch alone.
                true = np.array(batch_y).reshape(-1)
                score = np.array(outputs.data).reshape(-1)
                roc.append(roc_auc_score(true, score))

                print('[Test] Epoch [{}/{}], Step [{}/{}], Test-Loss: {:.4f}, Test-Acc: {:.2f}%'
                      .format(epoch + 1, epochs, step + 1, total_step,
                              loss.item(), accuracy))

        # Keep the raw scores of every 10th epoch for the ROC curves.
        if (epoch + 1) % 10 == 0:
            roc_score.append(roc_score_it)
            roc_true.append(roc_true_it)

        test_acc.append(np.mean(np.array(test_acc_it)))
        test_loss.append(np.mean(np.array(test_loss_it)))

    # Save the model checkpoint
    torch.save(net.state_dict(), path)

    # ROC
    if epochs > 9:
        true = np.array(roc_true)
        score = np.array(roc_score)
        plot_roc_binary(true, score, './results/bin_roc.pdf',
                        'Binary Classifier COVID')

    plot_loss(train_loss, test_loss, './results/bin_loss.pdf',
              'Binary Classifier COVID')
    plot_acc(train_acc, test_acc, './results/bin_acc.pdf',
             'Binary Classifier COVID')
    plot_roc(roc, './results/bin_roc_auc.pdf',
             'Simple Binary Classifier COVID')

    return net
# Collect the ground-truth entries of the training set in loader order
# (shuffle=False); the third item of every batch is used as the truth.
grdtruth = []
with torch.no_grad():
    dataloader = DataLoader(train_dataset, batch_size=1000, shuffle=False)
    for data in dataloader:
        truth = data[2]
        # First batch: nothing to concatenate with yet.
        if len(grdtruth) <= 0:
            grdtruth = truth
            continue
        grdtruth = torch.cat((grdtruth, truth))
grdtruth = grdtruth.numpy()

# Fit the classifier on the features and score it on the same data.
prediction = classifier.fit(features, grdtruth)
precision_clf = classifier.score(features, grdtruth)
print('accuracy: ', precision_clf)

def print_acc(model, dataset, print_note=''):
    """Print the accuracy of ``model`` on ``dataset``, prefixed by ``print_note``."""
    acc = models.cal_accuracy(dataset, model)
    print(print_note, 'accuracy: ', acc)

print('Network')
print_acc(model, train_dataset, print_note='train')
print_acc(model, dev_dataset, print_note='validat')

# plot roc: train and dev results are passed together as parallel lists
fpr_train, tpr_train, rocauc_train = models.get_roc(train_dataset, model)
fpr_dev, tpr_dev, rocauc_dev = models.get_roc(dev_dataset, model)
plot_roc([fpr_train, fpr_dev], [tpr_train, tpr_dev], [rocauc_train, rocauc_dev])

# save model
torch.save(model.state_dict(), f'{model_path}/ae_on_{data_class}.pth')
def train_model(clf_factory, X, Y, name, plot=False):
    """Fit a classifier on a shuffled 70/30 split and summarise its scores.

    A fresh classifier from ``clf_factory`` is fitted on the training part of
    the split; train/test scores, the confusion matrix and one-vs-rest
    precision-recall / ROC curves for every label are recorded.

    Args:
        clf_factory: zero-argument callable returning an unfitted classifier
            that provides ``fit``, ``score``, ``predict`` and
            ``predict_proba``.
        X: feature array, indexable by the index arrays of the split.
        Y: label array. Each label is also used as a column index into
            ``predict_proba`` and as an index into ``genre_list``.
        name: description prefix for the plots.
        plot: when true, plot the ROC curve of the median split for every
            label.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``.
    """
    labels = np.unique(Y)  # distinct class labels

    # Randomly hold out 30% of X as the test set, once (the author's data
    # set has 600 samples).
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, indices=True,
                      random_state=0)

    train_errors = []
    test_errors = []
    scores = []  # test-set accuracy of every split
    pr_scores = defaultdict(list)
    precisions, recalls = defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)  # true positive rates, per label
    fprs = defaultdict(list)  # false positive rates, per label
    cms = []

    for train, test in cv:
        # ``train`` / ``test`` are disjoint index arrays (420 / 180 entries
        # for the author's 600 samples of 13-dimensional features).
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)  # train the model on the training set

        train_score = clf.score(X_train, y_train)  # accuracy on training set
        test_score = clf.score(X_test, y_test)  # accuracy on test set
        scores.append(test_score)
        train_errors.append(1 - train_score)  # training error rate
        test_errors.append(1 - test_score)  # test error rate

        y_pred = clf.predict(X_test)  # predicted class of every test sample
        cms.append(confusion_matrix(y_test, y_pred))

        # One row per test sample, one column per class. Computed once,
        # since it does not depend on the label.
        proba = clf.predict_proba(X_test)
        for label in labels:
            # 0/1 vector: whether each test sample truly belongs to ``label``.
            y_label_test = np.asarray(y_test == label, dtype=int)
            # Predicted probability of ``label`` for each test sample.
            proba_label = proba[:, label]

            precision, recall, _ = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)

            fpr, tpr, _ = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            # Single %-formatted argument: identical output with the print
            # statement (Python 2) and the print function (Python 3).
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            # Floor division: a float index raises TypeError on Python 3.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    # Materialise the dict view first: on Python 3, np.asarray on the bare
    # view yields a 0-d object array that np.mean / np.std cannot reduce.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def plot_roc(self, X, y, x_size=12, y_size=12):
    """Plot the ROC curve of ``self.clf`` on ``X`` and ``y``.

    Thin wrapper: forwards ``self.clf`` and all arguments unchanged to the
    module-level ``plot_roc`` function. ``x_size`` / ``y_size`` are
    presumably the figure dimensions - confirm against that function.
    """
    plot_roc(self.clf, X, y, x_size, y_size)
def measure(clf_class, parameters, name, qa_X, qa_Y, data_size=None,
            plot=False):
    """Cross-validate a classifier and report accuracy, ROC and PR figures.

    A new ``clf_class(**parameters)`` is fitted on each of 10 folds.  The
    fold with the median precision/recall AUC is used for the optional
    plots and for the printed operating point at precision >= 0.8.

    Args:
        clf_class: Classifier class; instances must provide ``fit``,
            ``score`` and ``predict_proba``.
        parameters: Keyword arguments passed to ``clf_class``.
        name: Label used in the plots and in the summary tuple.
        qa_X: Feature array.
        qa_Y: Label array aligned with ``qa_X``.
        data_size: When given, only the first ``data_size`` rows are used.
        plot: When true, draw ROC and precision/recall plots, plus a
            feature-importance plot if the classifier exposes ``coef_``.

    Returns:
        A tuple ``(mean_train_error, mean_test_error, avg_scores_summary)``
        where ``avg_scores_summary`` is a one-element list holding the
        tuple ``(name, mean/std accuracy, mean/std ROC AUC, mean/std PR AUC,
        elapsed seconds)``.
    """
    feature_names = np.array((
        'NumTextTokens',
        'NumCodeLines',
        'LinkCount',
        'AvgSentLen',
        'AvgWordLen',
        'NumAllCaps',
        'NumExclams',
        'NumImages'
    ))
    classifying_answer = "good"
    avg_scores_summary = []
    start_time_clf = time.time()

    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    # NOTE(review): ``n``/``n_folds``/``indices`` belong to the legacy
    # ``sklearn.cross_validation`` API -- confirm which import is in scope.
    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    # Column of ``predict_proba`` that is scored as the positive class.
    label_idx = 1

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        positive_scores = clf.predict_proba(X_test)[:, label_idx]

        fpr, tpr, roc_thresholds = roc_curve(y_test, positive_scores)
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, positive_scores)

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    # Pick the fold with the median PR AUC (roc_scores would also work).
    scores_to_sort = pr_scores
    # Floor division: the position must be an integer index.
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium],
                recalls[medium], classifying_answer + " answers")
        # ``clf`` is the classifier fitted on the last fold.
        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    # Prepend a 0 so the thresholds can be indexed with the precision mask.
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    # Report the first curve point whose precision is at least 0.8.
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    mean_train = np.mean(train_errors)
    mean_test = np.mean(test_errors)
    return mean_train, mean_test, avg_scores_summary
import time

from sklearn.metrics import roc_curve

from logistic_example import run_logistic_model
from deep_wide import run_deep_wide_model
from deepFM import run_deepfm_model
from fm2 import run_fm_model
from xdeepfm import run_xdeepfm_model
from utils import plot_roc


def main():
    """Run every model, time it, and plot all ROC curves together.

    Each runner is called without arguments and must return
    ``(pred_ans, y_test, auc, model_name)``.  The CPU time spent in each
    runner is measured with ``time.process_time`` and printed at the end.
    """
    fpr_list, tpr_list, auc_list, name_list, timing_list = [], [], [], [], []
    func_list = [
        run_logistic_model,
        run_deep_wide_model,
        run_deepfm_model,
        run_xdeepfm_model,
        run_fm_model,
    ]

    for func in func_list:
        print('Running', func.__name__)
        started = time.process_time()
        pred_ans, y_test, auc, model_name = func()
        elapsed_time = time.process_time() - started
        timing_list.append((func.__name__, elapsed_time))

        # NOTE(review): ``pos_label=0`` treats label 0 as the positive class,
        # and ``plot_roc`` below receives the TPR list before the FPR list.
        # Confirm both against the models' score convention and the
        # ``plot_roc`` signature before changing either one.
        fpr, tpr, thresholds = roc_curve(y_test, pred_ans, pos_label=0)
        fpr_list.append(fpr)
        tpr_list.append(tpr)
        auc_list.append(auc)
        name_list.append(model_name)

    # A plain loop: the prints are side effects, no list is needed.
    for func_name, seconds in timing_list:
        print(func_name, ': ', seconds, 's')

    plot_roc(tpr_list, fpr_list, auc_list, name_list)


if __name__ == '__main__':
    main()
def _save_triplet_checkpoint(model, epoch, accuracy):
    """Save ``model`` into ``args.log_dir`` and print the checkpoint name."""
    # Build the name once so the printed name always matches the file.
    filename = 'triplet_loss_checkpoint_{}_epoch{}_lfwAcc{:.4f}.pth'.format(
        get_time(), epoch, accuracy)
    torch.save({
        'epoch': epoch + 1,
        'state_dict': model.state_dict()
    }, '{}/{}'.format(args.log_dir, filename))
    print('=>saving model:{}'.format(filename))


def train(train_loader, model, optimizer, epoch):
    """Run one epoch of triplet-loss training with hard-triplet selection.

    For every batch the anchor/positive/negative embeddings are computed and
    only the triplets with ``d(a, n) - d(a, p) < args.margin`` are used for
    the loss.  Batches without such triplets are skipped entirely, which
    includes their validation and checkpoint steps.  Validation runs every
    ``args.val_interval`` batches and a checkpoint is written every
    ``args.save_interval`` batches and once more at the end of the epoch.

    Args:
        train_loader: Iterable yielding
            ``(data_a, data_p, data_n, label_p, label_n)`` batches; must
            expose ``dataset`` and ``__len__`` for the progress message.
        model: Embedding network being trained.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Current epoch number, used in log messages and file names.

    Returns:
        None.
    """
    model.train()
    pbar = tqdm(enumerate(train_loader))
    labels, distances = [], []

    # Validation may not have run before the first checkpoint (for example
    # when batch 0 is skipped), so start from a defined placeholder instead
    # of failing with an unbound local when the file name is formatted.
    testaccuracy = float('nan')

    criterion = TripletMarginLoss(args.margin)

    for batch_idx, (data_a, data_p, data_n, label_p, label_n) in pbar:
        if args.cuda:
            data_a, data_p, data_n = (data_a.cuda(), data_p.cuda(),
                                      data_n.cuda())

        # The loss needs all three embeddings, so it cannot be computed
        # inside the model's forward pass.
        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)

        # Choose the hard negatives: triplets that still violate the margin.
        d_p = l2_dist.forward(out_a, out_p)
        d_n = l2_dist.forward(out_a, out_n)
        margin_violated = (d_n - d_p <
                           args.margin).cpu().data.numpy().flatten()
        hard_triplets = np.where(margin_violated)
        if len(hard_triplets[0]) == 0:
            continue

        out_selected_a = out_a[hard_triplets]
        out_selected_p = out_p[hard_triplets]
        out_selected_n = out_n[hard_triplets]

        # Only the triplet loss is used here; it is not combined with a
        # softmax classification loss.
        triplet_loss = criterion.forward(out_selected_a, out_selected_p,
                                         out_selected_n)

        # Compute gradient and update weights.
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

        logger.log_value('triplet_loss', triplet_loss.item()).step()

        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \t # of Selected Triplets: {}'
                .format(epoch, batch_idx * len(data_a),
                        len(train_loader.dataset),
                        100. * batch_idx / len(train_loader),
                        triplet_loss.item(), len(hard_triplets[0])))

        # Anchor-negative distances are recorded with label 0 ...
        dists = l2_dist.forward(out_selected_a, out_selected_n)
        distances.append(dists.data.cpu().numpy())
        labels.append(np.zeros(dists.size(0)))

        # ... and anchor-positive distances with label 1.
        dists = l2_dist.forward(out_selected_a, out_selected_p)
        distances.append(dists.data.cpu().numpy())
        labels.append(np.ones(dists.size(0)))

        # Validate once every ``val_interval`` batches.
        if batch_idx % args.val_interval == 0:
            testaccuracy = validate(model, epoch)
            model.train()  # switch back to training mode after validating

        # Write a checkpoint once every ``save_interval`` batches.
        if batch_idx % args.save_interval == 0:
            _save_triplet_checkpoint(model, epoch, testaccuracy)

    if distances:
        labels = np.array(
            [sublabel for label in labels for sublabel in label])
        # Each distance row is indexed with [0] to obtain its scalar value.
        distances = np.array(
            [subdist[0] for dist in distances for subdist in dist])

        tpr, fpr, accuracy, val, val_std, far = evaluate(distances, labels)
        print('\n\33[91mTrain set: Accuracy: {:.8f}\33[0m'.format(
            np.mean(accuracy)))
        logger.log_value('Train Accuracy', np.mean(accuracy))

        plot_roc(fpr, tpr,
                 figure_name="roc_train_epoch_{}.png".format(epoch))
    else:
        # No batch produced a hard triplet, so there is nothing to score;
        # ``evaluate`` is not called with empty arrays.
        print('\n\33[91mTrain set: no hard triplets selected in epoch {}'
              '\33[0m'.format(epoch))

    # Do checkpointing at the end of the epoch.
    _save_triplet_checkpoint(model, epoch, testaccuracy)