def ada_boost_testing(x_train, y_train, x_test, y_test, l=10):
    """Run a hand-rolled AdaBoost loop and report accuracy and F1.

    Fits ``l`` weak learners through ``AdaBoostClassifier.fit`` on
    re-weighted training samples, combines their leaf values weighted by the
    per-round ``alpha`` and prints/returns the train and test metrics.

    Args:
        x_train, y_train: training samples and labels, indexable by row.
        x_test, y_test: held-out samples and labels, indexable by row.
        l: number of boosting rounds.

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print('Adaboost\n\n')
    adbt = AdaBoostClassifier()
    rounds = l
    # Sample counts come from the data instead of the former literals
    # 2098 / 700, so the routine works for any split size.
    n_train = len(x_train)
    n_test = len(x_test)

    stumps = []
    err = np.zeros(rounds)
    alpha = np.zeros(rounds)
    weights = np.full((rounds, n_train), 1 / n_train)

    for t in range(rounds):
        stumps.append(adbt.fit(x_train, y_train, weights[t]))
        # Evaluate the freshly fitted learner once per sample and reuse the
        # mask for both the weighted error and the weight update.
        missed = np.array(
            [bool(adbt._predict(x_train[i]) != y_train[i])
             for i in range(n_train)],
            dtype=bool)
        err[t] = weights[t][missed].sum()
        alpha[t] = 1 / 2 * np.log((1 - err[t]) / err[t])
        if t < rounds - 1:
            # Up-weight misclassified samples, down-weight the rest, then
            # renormalise so the weights form a distribution again.
            weights[t + 1] = weights[t] * np.exp(
                np.where(missed, alpha[t], -alpha[t]))
            weights[t + 1] = weights[t + 1] / weights[t + 1].sum()

    def _ensemble_predict(samples, n_rows):
        # Weighted vote of all stumps for the first n_rows samples.
        preds = np.zeros(n_rows)
        for i in range(n_rows):
            score = 0.0
            for t in range(rounds):
                stump = stumps[t]
                if samples[i][stump.feature] >= stump.split:
                    score += alpha[t] * stump.right_tree
                else:
                    score += alpha[t] * stump.left_tree
            # NOTE(review): a positive score maps to -1, exactly as in the
            # original code; confirm this matches the leaf-value encoding.
            preds[i] = -1 if score > 0 else 1
        return preds

    preds_train = _ensemble_predict(x_train, n_train)
    preds_test = _ensemble_predict(x_test, n_test)

    train_accuracy = (preds_train == y_train).sum() / len(y_train)
    test_accuracy = (preds_test == y_test).sum() / len(y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}'.format(f1_test))
    # The original returned the train F1 twice; the last slot is the test F1.
    return train_accuracy, test_accuracy, f1_train, f1_test
def ft_random_forest_testing(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a random forest and plot the metrics.

    For each candidate feature count a 50-tree forest of depth 7 is fitted;
    train accuracy, test accuracy and test F1 are printed, collected and
    finally plotted to ``RandomForestFeatures.png``.
    """
    print('Random Forest Feature Loop\n\n')
    # Single source of truth for the sweep values (used for the loop and
    # the x axis; the original repeated the literal).
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    train_list = []
    test_list = []
    F1_list = []
    for max_features in feature_counts:
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=max_features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        # Reuse the test predictions instead of predicting a second time,
        # and compute F1 once.
        test_f1 = f1(y_test, preds_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(test_f1))
        # Grab the useful number per cycle
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)
        F1_list.append(test_f1)

    plt.rcParams['font.family'] = ['serif']
    ax = plt.subplot(111)
    ax.plot(feature_counts, train_list, label='training')
    ax.plot(feature_counts, test_list, label='testing')
    ax.plot(feature_counts, F1_list, label='F1')
    plt.xlabel("max_features")
    plt.xticks(feature_counts)
    plt.ylabel("Accuracies")
    ax.legend()
    plt.savefig("RandomForestFeatures.png")
    plt.clf()
def do_eval(model, train, dev, input_model=None):
    """Restore the latest checkpoint and log F1 / EM on train and dev sets.

    Args:
        model: QA model that has an instance variable 'answer' that returns
            answer span and takes placeholders question, question_length,
            paragraph, paragraph_length
        train: Training set
        dev: Development set
        input_model: Forwarded unchanged to ``multibatch_prediction_truth``.
    """
    ckpt_folder = os.path.join(FLAGS.train_dir, FLAGS.model_name)
    parameter_space_size()
    saver = tf.train.Saver()
    # TODO add loop to run over all checkpoints in folder,

    with tf.Session() as session:

        def _score(dataset):
            # F1 and exact match over FLAGS.eval_batches batches of dataset.
            predicted, gold = multibatch_prediction_truth(
                session, model, dataset, FLAGS.eval_batches,
                input_model=input_model)
            return f1(predicted, gold), exact_match(predicted, gold)

        latest = tf.train.latest_checkpoint(ckpt_folder)
        saver.restore(session, latest)
        print('Evaluation in progress.', flush=True)

        # Train/Dev Evaluation
        started = timer()
        train_f1, train_em = _score(train)
        dev_f1, dev_em = _score(dev)

        logging.info(f'Train/Dev F1: {train_f1:.3f}/{dev_f1:.3f}')
        logging.info(f'Train/Dev EM: {train_em:.3f}/{dev_em:.3f}')
        logging.info(f'Time to evaluate: {timer() - started:.1f} sec')
def evaluate(result, summary=False):
    """Print per-label and aggregate precision/recall/F1 for ``result``.

    ``result`` is iterated as triples whose second and third items are
    parallel sequences of actual and predicted labels. Per-label details are
    printed unless ``summary`` is true; macro and micro figures are always
    printed. Nothing is returned.
    """
    hits = defaultdict(int)  # true positives
    gold_total = defaultdict(int)  # true positives + false negatives
    pred_total = defaultdict(int)  # true positives + false positives

    for _, gold_seq, pred_seq in result:
        for gold, pred in zip(gold_seq, pred_seq):
            hits[gold] += gold == pred
            gold_total[gold] += 1
            pred_total[pred] += 1

    macro_pr = 0.0
    macro_rc = 0.0
    for label in sorted(gold_total):
        # Guard both ratios against a zero denominator.
        pr = hits[label] / pred_total[label] if pred_total[label] else 0
        rc = hits[label] / gold_total[label] if gold_total[label] else 0
        macro_pr += pr
        macro_rc += rc
        if summary:
            continue
        print()
        print("label = %s" % label)
        print("precision = {:f} ({:d}/{:d})".format(
            pr, hits[label], pred_total[label]))
        print("recall = {:f} ({:d}/{:d})".format(
            rc, hits[label], gold_total[label]))
        print("f1 = {:f}".format(f1(pr, rc)))

    n_labels = len(gold_total)
    macro_pr /= n_labels
    macro_rc /= n_labels
    micro = sum(hits.values()) / sum(pred_total.values())

    print()
    print("macro precision = %f" % macro_pr)
    print("macro recall = %f" % macro_rc)
    print("macro f1 = %f" % f1(macro_pr, macro_rc))
    print("micro f1 = %f" % micro)
def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    """Sweep decision-tree depth 1..25, tabulate, plot and pick the best.

    For every depth a tree is fitted and train accuracy, test accuracy and
    test F1 are recorded. The results are printed as a table, plotted to
    ``q1.png``, and the depth with the highest test F1 is refitted to report
    its root split.

    Returns:
        The ``feature`` attribute of the root of the tree refitted at the
        best depth.
    """
    print('Decision Tree with depths 1-25 (inclusive)\n')
    depths = list(range(1, 26))

    # these will keep our points
    graphTrain = []
    graphTest = []
    graphF1 = []

    # perform decision tree testing for each depth
    for layer in depths:
        print('Current depth: ', layer)
        clf = DecisionTreeClassifier(max_depth=layer)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        # Each metric is computed once and the test predictions are reused
        # (the original predicted the test set twice).
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)
        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    table = pd.DataFrame({
        "Max Depth": depths,
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    # Title fixed: this sweep is over depth, not over the number of trees.
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')

    # get best depth in terms of test F1 (first depth reaching the maximum)
    topAccuracy = max(graphF1)
    best_depth = depths[graphF1.index(topAccuracy)]
    print("The depth that gives the best validation accuracy is: ",
          best_depth, "which has an F1 accuracy of ", topAccuracy)

    # get the most important feature for making a prediction
    clfMVP = DecisionTreeClassifier(max_depth=best_depth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ",
          clfMVP.root.feature)
    print("The threshold to split on for this feature is: ",
          clfMVP.root.split)

    # return the most important feature for use in main
    return clfMVP.root.feature
def random_forest_various_features(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a 50-tree, depth-7 random forest.

    Records train accuracy, test accuracy and test F1 per setting, prints a
    table, plots the curves on figure 3 (saved as ``q2pd.png``) and returns
    the first ``max_features`` value that reaches the highest test F1.
    """
    # keep our values to use for max_features
    useFeatures = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    graphTrain2 = []
    graphTest2 = []
    graphF12 = []

    # let the user know which test this is
    print("== Beginning test for various max_features.\n")
    for features in useFeatures:
        print("max_features: ", features)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        # Metrics are computed once each; the test predictions are reused
        # rather than running a second predict pass.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)
        graphTrain2.append(train_accuracy)
        graphTest2.append(test_accuracy)
        graphF12.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    # print lengths for debugging
    print("== Length of Train", len(graphTrain2))
    print("== Length of Test", len(graphTest2))
    print("== Length of F1", len(graphF12))

    # table for easily reading data
    table2 = pd.DataFrame({
        "max_features": list(useFeatures),
        "Train Accuracy": graphTrain2,
        "Test Accuracy": graphTest2,
        "F1 Accuracy": graphF12
    })
    print(table2)

    # plot our graph and output to a file
    plt.figure(3)
    plt.xlabel('Max Features')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Features')
    plt.plot('max_features', 'Train Accuracy', data=table2, color='blue')
    plt.plot('max_features', 'Test Accuracy', data=table2, color='green')
    plt.plot('max_features', 'F1 Accuracy', data=table2, color='red')
    plt.legend()
    plt.savefig('q2pd.png')

    # return best value for max_features to use in main
    return useFeatures[graphF12.index(max(graphF12))]
def decision_tree_testing(x_train, y_train, x_test, y_test, depth):
    """Fit one decision tree of the given depth and report its metrics.

    Args:
        x_train, y_train: training samples and labels.
        x_test, y_test: held-out samples and labels.
        depth: value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse preds_test (the original predicted the test set a second time)
    # and compute the test F1 once for both the print and the return.
    f1_test = f1(y_test, preds_test)
    print('F1 Test {}'.format(f1_test))
    return train_accuracy, test_accuracy, f1(y_train, preds_train), f1_test
def adaboost_testing(x_train, y_train, x_test, y_test, M):
    """Fit an AdaBoost ensemble of depth-1 trees and report its metrics.

    Args:
        x_train, y_train: training samples and labels.
        x_test, y_test: held-out samples and labels.
        M: third argument forwarded to ``AdaBoostClassifier.fit``
            (presumably the number of boosting rounds; confirm against the
            classifier).

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print("Adaboost Tree\n\n")
    aclf = AdaBoostClassifier(max_depth=1)
    aclf.fit(x_train, y_train, M)
    preds_train = aclf.predict(x_train)
    preds_test = aclf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # The predictions above are reused; the original ran predict a second
    # time on both splits and evaluated the test F1 twice.
    f1_test = f1(y_test, preds_test)
    print('F1 Test {}'.format(f1_test))
    return train_accuracy, test_accuracy, f1(y_train, preds_train), f1_test
def random_forest_testing(x_train, y_train, x_test, y_test, feat, tree):
    """Fit one depth-7 random forest and report its metrics.

    Args:
        x_train, y_train: training samples and labels.
        x_test, y_test: held-out samples and labels.
        feat: value passed as ``max_features``.
        tree: value passed as ``n_trees``.

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print('Random Forest\n\n')
    rclf = RandomForestClassifier(max_depth=7,
                                  max_features=feat,
                                  n_trees=tree)
    rclf.fit(x_train, y_train)
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the existing predictions instead of predicting both splits a
    # second time; compute the test F1 once.
    f1_test = f1(y_test, preds_test)
    print('F1 Test {}'.format(f1_test))
    return train_accuracy, test_accuracy, f1(y_train, preds_train), f1_test
def forward_f1(self, x_spt, y_spt, x_qry, y_qry):
    """Run one meta-training step and return normalised per-step F1 sums.

    For every task ``i`` the network is adapted on the support pair
    ``(x_spt[i], y_spt[i])`` with manual gradient steps of size
    ``self.update_lr`` and scored on the query pair ``(x_qry[i], y_qry[i])``
    before the first step and after each step. The query loss after the last
    step, averaged over tasks, is back-propagated through ``self.meta_optim``.

    Returns:
        ``np.array`` with ``self.update_step + 1`` entries: the accumulated
        ``f1(...)`` results per adaptation step divided by
        ``querysz * task_num``.
    """
    task_num = self.task_num
    querysz = self.n_way * self.k_qry
    # Slot 0 holds the pre-adaptation value, slot k the value after k steps.
    losses_q = [0 for _ in range(self.update_step + 1)]
    f1s = [0 for _ in range(self.update_step + 1)]
    for i in range(task_num):
        # First adaptation step: gradient of the support loss w.r.t. the
        # network's own parameters.
        logits = self.net(x_spt[i], vars=None, bn_training=True)
        loss = F.cross_entropy(logits, y_spt[i])
        grad = torch.autograd.grad(loss, self.net.parameters())
        # p is a (gradient, parameter) pair: parameter - lr * gradient.
        fast_weights = list(map(lambda p: p[1] - self.update_lr * p[0], zip(grad, self.net.parameters())))
        # Query loss / F1 with the un-adapted parameters (no graph kept).
        with torch.no_grad():
            logits_q = self.net(x_qry[i], self.net.parameters(), bn_training=True)
            loss_q = F.cross_entropy(logits_q, y_qry[i])
            losses_q[0] += loss_q
            # pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
            result = f1(logits_q, y_qry[i])
            f1s[0] = f1s[0] + result
        # Query loss / F1 after the first update (no graph kept).
        # NOTE(review): losses_q[1] is accumulated under no_grad, so if
        # update_step == 1 the loss back-propagated below carries no graph.
        with torch.no_grad():
            logits_q = self.net(x_qry[i], fast_weights, bn_training=True)
            loss_q = F.cross_entropy(logits_q, y_qry[i])
            losses_q[1] += loss_q
            # pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
            result = f1(logits_q, y_qry[i])
            f1s[1] = f1s[1] + result
        # Remaining adaptation steps; the query loss here is computed with
        # gradients enabled so it can be back-propagated afterwards.
        for k in range(1, self.update_step):
            logits = self.net(x_spt[i], fast_weights, bn_training=True)
            loss = F.cross_entropy(logits, y_spt[i])
            grad = torch.autograd.grad(loss, fast_weights)
            fast_weights = list(map(lambda p: p[1] - self.update_lr * p[0], zip(grad, fast_weights)))
            logits_q = self.net(x_qry[i], fast_weights, bn_training=True)
            loss_q = F.cross_entropy(logits_q, y_qry[i])
            losses_q[k + 1] += loss_q
            with torch.no_grad():
                # pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                result = f1(logits_q, y_qry[i])
                f1s[k + 1] = f1s[k + 1] + result
    # Meta-update on the task-averaged query loss after the final step.
    loss_q = losses_q[-1] / task_num
    self.meta_optim.zero_grad()
    loss_q.backward()
    self.meta_optim.step()
    # NOTE(review): dividing by querysz treats the f1() result like a count
    # of correct query samples; if f1() already returns a score in [0, 1]
    # the divisor should probably be task_num only -- confirm.
    accs = np.array(f1s) / (querysz * task_num)
    return accs
def train(epochs):
    """Train the module-level ``model`` for ``epochs`` epochs.

    Each epoch iterates ``train_loader``, optimises ``criterion`` with
    ``optimizer`` and logs the per-batch averages of accuracy, precision, F1
    and loss to TensorBoard under ``./log``. The model's state dict is saved
    to ``PATH`` after the last epoch.

    Args:
        epochs: number of passes over ``train_loader``.
    """
    print("Train start")
    writer = tensorboard.SummaryWriter(log_dir='./log', comment='Train loop')
    try:
        for ep in range(1, epochs + 1):
            epoch_loss, epoch_accuracy, epoch_precision = 0, 0, 0
            epoch_f1 = 0
            # Count batches explicitly: the original divided by the last
            # enumerate index, which is one less than the batch count and
            # is zero (ZeroDivisionError) when the loader yields one batch.
            n_batches = 0
            for inp, label in train_loader:
                optimizer.zero_grad()
                op = model(inp)
                loss = criterion(op, label)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                epoch_accuracy += accuracy(op, label)
                epoch_precision += precision(op, label)
                epoch_f1 += f1(op, label)
                n_batches += 1
            if n_batches == 0:
                # Empty loader: there is nothing to average or log.
                continue
            writer.add_scalars(
                'Training', {
                    'Accuracy': epoch_accuracy / n_batches,
                    'Precision': epoch_precision / n_batches,
                    'F1': epoch_f1 / n_batches
                }, ep)
            writer.add_scalars('Loss', {'Training': epoch_loss / n_batches},
                               ep)
    finally:
        # Flush and release the event file even if training raises.
        writer.close()
    torch.save(model.state_dict(), PATH)
    print("Done training")
def vectorized_rf(x_train, y_train, x_test, y_test, checktrain=True, ngram_range=(1, 1), vector_type="count", dataset="default"):
    """Vectorise the text, train via ``train`` and print the metrics.

    A ``CountVectorizer`` is used when ``vector_type == "count"``, otherwise
    a ``TfidfVectorizer``. The vectoriser is fitted on ``x_train`` only.
    Train/test accuracy and test precision, recall and F1 are printed;
    nothing is returned.
    """
    if vector_type == "count":
        vectorizer = CountVectorizer(tokenizer=tokenize,
                                     ngram_range=ngram_range)
    else:
        vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                     ngram_range=ngram_range)

    train_matrix = vectorizer.fit_transform(x_train)
    test_matrix = vectorizer.transform(x_test)

    model = train(train_matrix, y_train, checktrain, ngram_range,
                  vector_type=vector_type, dataset=dataset)
    train_pred = predict(model, train_matrix)
    test_pred = predict(model, test_matrix)

    precision_test = precision(y_test, test_pred)
    recall_test = recall(y_test, test_pred)
    f1_test = f1(y_test, test_pred)

    train_acc = accuracy(y_train, train_pred)
    print("Accuracy training accuracy (" + dataset, vector_type,
          " vectorized joint data) =", train_acc)
    test_acc = accuracy(y_test, test_pred)
    print("Accuracy testing accuracy (" + dataset, vector_type,
          "vectorized joint data) =", test_acc, "\n")

    print("Precision (" + dataset, vector_type + " vectorized test data) =",
          precision_test)
    print("Recall (" + dataset, vector_type + " test data) =", recall_test)
    print("F1 (" + dataset, vector_type + " test data) =", f1_test, "\n")
def simple_lstm(x_train, y_train, x_test, y_test, dataset="default"):
    """Train via ``train`` and print accuracy, precision, recall and F1.

    Labels and predictions are reduced with ``np.argmax(..., axis=1)`` before
    scoring; precision, recall and F1 are computed for labels ``(0, 1, 2)``
    on the test split. Nothing is returned.
    """
    model = train(x_train, y_train, dataset=dataset)
    print(x_train.shape)
    print(x_test.shape)
    # print(model.summary())
    train_scores = model.predict(x_train)
    test_scores = model.predict(x_test)

    # Class indices for the test split, shared by all test metrics.
    test_true = np.argmax(y_test, axis=1)
    test_hat = np.argmax(test_scores, axis=1)
    class_labels = (0, 1, 2)

    precision_test = precision(test_true, test_hat, labels=class_labels)
    recall_test = recall(test_true, test_hat, labels=class_labels)
    f1_test = f1(test_true, test_hat, labels=class_labels)
    #
    train_acc = accuracy(np.argmax(y_train, axis=1),
                         np.argmax(train_scores, axis=1))
    print("Accuracy training accuracy = ", train_acc)
    test_acc = accuracy(test_true, test_hat)
    print("Accuracy testing accuracy =", test_acc, "\n")
    #
    print("Precision (test data) =", precision_test)
    print("Recall (test data) =", recall_test)
    print("F1 (test data) =", f1_test, "\n")
def train(trainloader, model, criterion, optimizer, epoch, use_cuda):
    """Run one optimisation pass over ``trainloader`` with a progress bar.

    Tracks data-loading time, batch time, loss, top-1 accuracy and the
    micro/macro values returned by ``f1`` in ``AverageMeter`` objects.

    Returns:
        ``(average loss, average top-1)`` over the pass.
    """
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    micro_f1 = AverageMeter()
    macro_f1 = AverageMeter()

    suffix_template = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | micro: {micro: .4f} | macro: {macro: .4f}'

    tick = time.time()
    bar = Bar('Processing', max=len(trainloader))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - tick)

        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
        inputs = torch.autograd.Variable(inputs)
        targets = torch.autograd.Variable(targets)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, _ = accuracy(outputs.data, targets.data, topk=(1, 2))
        # odb.set_trace()
        n_samples = inputs.size(0)
        losses.update(loss.item(), n_samples)
        top1.update(prec1.item(), n_samples)
        _macrof1, _microf1 = f1(outputs.data, targets.data)
        micro_f1.update(_microf1, n_samples)
        macro_f1.update(_macrof1, n_samples)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - tick)
        tick = time.time()

        # plot progress
        bar.suffix = suffix_template.format(
            batch=batch_idx + 1,
            size=len(trainloader),
            data=data_time.avg,
            bt=batch_time.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
            micro=micro_f1.avg,
            macro=macro_f1.avg,
        )
        bar.next()
    bar.finish()
    return (losses.avg, top1.avg)
def random_forest_various_seeds(x_train, y_train, x_test, y_test, best_max_features, best_n_trees):
    """Refit the chosen forest with 10 random seeds and tabulate results.

    One ``RandomForestClassifier`` (depth 7, given ``max_features`` and
    ``n_trees``) is refitted ten times, each time after assigning a fresh
    ``seed`` drawn from ``np.random.randint(1, 1000)``. Per-seed metrics and
    a final "Average" row are printed as a table. Nothing is returned.
    """
    # let the user know which test this is
    print("== Beginning test for best result with random seeds.\n")

    # to hold data points
    usedSeeds = []
    randseedTrain = []
    randseedTest = []
    randseedF1 = []

    rclf = RandomForestClassifier(max_depth=7,
                                  max_features=best_max_features,
                                  n_trees=best_n_trees)
    for _ in range(10):
        rclf.seed = np.random.randint(1, 1000)
        usedSeeds.append(rclf.seed)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        # Each metric is computed once and the test predictions are reused
        # instead of running a second predict pass.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)
        randseedTrain.append(train_accuracy)
        randseedTest.append(test_accuracy)
        randseedF1.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    def _mean(values):
        # Arithmetic mean of the per-seed values.
        return sum(values) / len(values)

    # get table for data + add averages at the end
    table3 = pd.DataFrame({
        "Seed": usedSeeds + ["Average"],
        "Train Accuracy": randseedTrain + [_mean(randseedTrain)],
        "Test Accuracy": randseedTest + [_mean(randseedTest)],
        "F1 Score": randseedF1 + [_mean(randseedF1)]
    })
    print(table3)
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner):
    """Fit ``AdaBoostClassifier(num_learner)`` and report its metrics.

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print('Ada Boost and L(', num_learner, ')')
    aba = AdaBoostClassifier(num_learner)
    aba.fit(x_train, y_train)
    preds_train = aba.predict(x_train)
    preds_test = aba.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the predictions (the original predicted both splits twice) and
    # compute each F1 once for both the print and the return value.
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))
    return train_accuracy, test_accuracy, f1_train, f1_test
def train(epoch):
    """Run one training epoch on the module-level model and validate it.

    Uses the globals ``model``, ``optimizer``, ``features``, ``labels``,
    ``adj_train``/``adj_val``, ``idx_train``/``idx_val`` and ``args``. The
    metric is ``f1`` for the 'reddit' dataset and ``accuracy`` otherwise.

    Args:
        epoch: zero-based epoch index (used only for progress printing).

    Returns:
        ``(validation loss, validation metric, epoch time in seconds)``.
    """

    def closure():
        # Re-evaluates the training loss for optimisers that call the
        # closure themselves. torch.optim.LBFGS requires the closure to
        # return the loss; the original returned None, which makes
        # optimizer.step(closure) fail.
        optimizer.zero_grad()
        output = model(features, adj_train)
        loss_train = F.cross_entropy(output[idx_train], labels[idx_train])
        loss_train.backward()
        return loss_train

    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj_train)
    loss_train = F.cross_entropy(output[idx_train], labels[idx_train])
    if args.dataset == 'reddit':
        acc_train = f1(output[idx_train], labels[idx_train])
    else:
        acc_train = accuracy(output[idx_train], labels[idx_train])

    if args.optimizer == 'lbfgs':
        optimizer.step(closure)
    else:
        loss_train.backward()
        if args.grad_clip:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            args.grad_clip)
        optimizer.step()

    if not args.fastmode:
        # Evaluate validation set performance separately,
        # deactivates dropout during validation run.
        model.eval()
        output = model(features, adj_val)

    # In fastmode the training-mode output computed above is reused here.
    loss_val = F.cross_entropy(output[idx_val], labels[idx_val])
    if args.dataset == 'reddit':
        acc_val = f1(output[idx_val], labels[idx_val])
    else:
        acc_val = accuracy(output[idx_val], labels[idx_val])
    epoch_time = time.time() - t

    # Print on the last epoch of every block of args.print epochs.
    if args.print and epoch % args.print == args.print - 1:
        print('Epoch: {:04d}'.format(epoch + 1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'loss_val: {:.4f}'.format(loss_val.item()),
              'acc_val: {:.4f}'.format(acc_val.item()),
              'time: {:.4f}s'.format(time.time() - t))
    return loss_val.item(), acc_val.item(), epoch_time
def random_forest_various_trees(x_train, y_train, x_test, y_test):
    """Sweep ``n_trees`` from 10 to 200 (step 10) for a random forest.

    Each forest uses depth 7 and ``max_features=11``. Train accuracy, test
    accuracy and test F1 are printed, tabulated and plotted on figure 2
    (saved as ``q2pb.png``).

    Returns:
        The first ``n_trees`` value that reaches the highest test F1.
    """
    tree_counts = list(range(10, 210, 10))
    graphTrain = []
    graphTest = []
    graphF1 = []

    # let the user know which test this is
    print("== Beginning test for various n_trees.\n")

    # plot accuracies for the number of trees specified in part b
    for n_trees in tree_counts:
        print("n_trees: ", n_trees)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=11,
                                      n_trees=n_trees)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        # Each metric is computed once and the test predictions are reused
        # instead of running a second predict pass.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)
        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    # table for easily reading data
    table = pd.DataFrame({
        "n_trees": tree_counts,
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.figure(2)
    plt.xlabel('Number of trees')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Number of Trees in the Forest')
    plt.plot('n_trees', 'Train Accuracy', data=table, color='blue')
    plt.plot('n_trees', 'Test Accuracy', data=table, color='green')
    plt.plot('n_trees', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q2pb.png')

    # return our best n_trees value for use in main
    return tree_counts[graphF1.index(max(graphF1))]
def random_forest_testing(x_train, y_train, x_test, y_test, n_trees, max_features):
    """Fit one depth-7 random forest and report its metrics.

    Args:
        x_train, y_train: training samples and labels.
        x_test, y_test: held-out samples and labels.
        n_trees: first positional argument of ``RandomForestClassifier``.
        max_features: second positional argument of the classifier.

    Returns:
        ``(train_accuracy, test_accuracy, f1_train, f1_test)``.
    """
    print('Random Forest')
    print("max_depth: %d, max_features: %d, n_trees: %d" %
          (7, max_features, n_trees))
    rclf = RandomForestClassifier(n_trees, max_features, max_depth=7)
    rclf.fit(x_train, y_train)
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the predictions (the original predicted both splits twice) and
    # compute each F1 once for both the print and the return value.
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))
    return train_accuracy, test_accuracy, f1_train, f1_test
def f1_metric(label, pred):
    """Mean of ``f1`` over consecutive groups of ``label`` / ``pred``.

    The group sizes are read from the module-level ``list_idx``; group ``j``
    covers the ``list_idx[j]`` entries following the previous group.

    Args:
        label: array of true labels (cast to ``int``).
        pred: predictions aligned with ``label``.

    Returns:
        ``('f1', mean_score, True)``.
    """
    # np.int was only an alias of the builtin int and has been removed from
    # NumPy; the builtin gives the same dtype and works on every version.
    label = label.astype(int)
    scores = []
    start = 0
    for size in list_idx:
        end = start + size
        scores.append(f1(label[start:end], pred[start:end]))
        start = end
    sc = np.mean(scores)
    return 'f1', sc, True
def macro_f1(y_true, y_pred, num_classes=3):
    """Average the per-class soft F1 over the last axis of the inputs.

    For each class index the predictions are thresholded at 0.5, precision
    and recall are computed with a 0.0001 smoothing term in every
    denominator, and the resulting F1 values are averaged over
    ``num_classes``. The result is cast to ``float32``.
    """
    eps = 0.0001

    def _class_f1(truth, scores):
        # F1 of one class slice after thresholding the scores at 0.5.
        hard = K.cast(scores >= 0.5, 'float32')
        true_pos = K.sum(hard * truth)
        precision = true_pos / (K.sum(hard) + eps)
        recall = true_pos / (K.sum(truth) + eps)
        return 2 * precision * recall / (precision + recall + eps)

    total = 0
    for cls in range(num_classes):
        total += _class_f1(y_true[..., cls], y_pred[..., cls])
    return K.cast(total / num_classes, 'float32')
def ababoost(x_train, y_train, x_test, y_test):
    """Run ``AdaBoostClassifier.adaboost`` for L = 3 rounds and print metrics.

    Starts from uniform sample weights, feeds the weights returned by each
    round into the next one, then relabels 0 as -1 in ``y_train`` / ``y_test``
    and prints accuracy and F1 for the predictions of the last round.

    NOTE(review): the layout of this function was reconstructed from
    whitespace-mangled source; it is assumed that only the ``adaboost`` call
    sits inside the loop -- confirm against the original file.
    """
    print('Ababoost\n\n')
    leni = len(x_train)
    L = 3
    D = np.array([1 / leni] * leni)  #The first D is 1/length of train set
    bclf = AdaBoostClassifier()
    for i in range(L):
        # Each round returns predictions, the updated weights D and `we`
        # (presumably the weighted error -- confirm in the classifier).
        preds_train, preds_test, D, we = bclf.adaboost(x_train, y_train, x_test, y_test, D)
    # Relabel 0 -> -1 IN PLACE: the caller's label arrays are modified.
    y_train[y_train == 0] = -1
    y_test[y_test == 0] = -1
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('L = ', L)
    print(D)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1(y_train, preds_train)))
    print('F1 Test {}'.format(f1(y_test, preds_test)))
    print('we = ', we)
def test():
    """Evaluate the module-level model on the test indices and print results.

    The metric is ``f1`` when ``args.dataset`` is 'reddit' and ``accuracy``
    otherwise.

    Returns:
        The test metric as a Python number (``.item()``).
    """
    model.eval()
    output = model(features, adj_test)
    test_output = output[idx_test]
    test_labels = labels[idx_test]
    loss_test = F.cross_entropy(test_output, test_labels)

    metric = f1 if args.dataset == 'reddit' else accuracy
    acc_test = metric(test_output, test_labels)

    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
    return acc_test.item()
def decision_tree_testing(x_train, y_train, x_test, y_test, depth=1):
    """Fit one decision tree and print train/test accuracy and test F1.

    Args:
        x_train, y_train: training samples and labels.
        x_test, y_test: held-out samples and labels.
        depth: ``max_depth`` of the tree. Defaults to 1, the value that was
            previously hard-coded, so existing callers are unaffected.
    """
    print('Decision Tree depth: ', depth)
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse preds_test instead of predicting the test set a second time.
    print('F1 Test {}'.format(f1(y_test, preds_test)))
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    """Fit a decision tree of depth ``maxdepth`` and return its metrics.

    Returns:
        ``(f1_test, train_accuracy, test_accuracy)``.
    """
    clf = DecisionTreeClassifier(max_depth=maxdepth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    # Reuse preds_test; the original ran a second, identical predict call.
    return f1(y_test, preds_test), train_accuracy, test_accuracy
def ada_boost_testing(x_train, y_train, x_test, y_test):
    """Sweep the AdaBoost ``n_trees`` parameter from 10 to 190 (step 10).

    Train accuracy, test accuracy and test F1 are printed for every setting,
    tabulated and plotted to ``q3.png``. Nothing is returned.
    """
    print('AdaBoost\n\n')
    learner_counts = list(range(10, 200, 10))
    graphTrain = []
    graphTest = []
    graphF1 = []
    for n_learners in learner_counts:
        weak = AdaBoostClassifier(n_trees=n_learners)
        weak.fit(x_train, y_train)
        preds_train = weak.predict(x_train)
        preds_test = weak.predict(x_test)
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        # Reuse the test predictions and compute the F1 once (the original
        # predicted the test set twice and evaluated f1 twice).
        test_f1 = f1(y_test, preds_test)
        print('L {}'.format(n_learners))
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(test_f1))
        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(test_f1)

    table = pd.DataFrame({
        "L Parameter": learner_counts,
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    plt.xlabel('L Parameter')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs L')
    plt.plot('L Parameter', 'Train Accuracy', data=table, color='blue')
    plt.plot('L Parameter', 'Test Accuracy', data=table, color='green')
    plt.plot('L Parameter', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q3.png')
def test_overfit(model, train, input_model=None):
    """
    Tests that model can overfit on small datasets.

    The training set is truncated to its first 32 examples and the model is
    trained on it for 100 epochs of 10 steps; the loss is printed at the end
    of every epoch together with the F1 on the truncated set.

    Args:
        model: QA model that has an instance variable 'answer' that returns
            answer span and takes placeholders question, question_length,
            paragraph, paragraph_length
        train: Training set
        input_model: Optional model whose ``run`` output replaces the
            question (first element) of every batch.
    """
    epochs = 100
    test_size = 32
    steps_per_epoch = 10
    (train.question, train.paragraph, train.question_length,
     train.paragraph_length, train.answer) = train[:test_size]
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            epoch_start = timer()
            for step in range(steps_per_epoch):
                feed_dict_inputs = train.get_batch(FLAGS.batch_size,
                                                   replace=False)
                if input_model:
                    # feed into siamese model instead
                    # Fixed: the original assigned to the undefined name
                    # `input_dict_inputs` (NameError). A list copy keeps the
                    # item assignment valid even if get_batch returns a tuple.
                    feed_dict_inputs = list(feed_dict_inputs)
                    question = feed_dict_inputs[0]
                    feed_dict_inputs[0] = input_model.run(question)
                feed_dict = model.fill_feed_dict(*feed_dict_inputs)
                fetch_dict = {
                    'step': tf.train.get_global_step(),
                    'loss': model.loss,
                    'train': model.train
                }
                result = session.run(fetch_dict, feed_dict)
                loss = result['loss']
                if step == 0 and epoch == 0:
                    print(
                        f'Entropy - Result: {loss:.2f}, Expected (approx.): {2*np.log(FLAGS.max_paragraph_length):.2f}'
                    )
                if step == steps_per_epoch - 1:
                    print(f'Cross entropy: {loss:.2f}')
                    train.length = test_size
                    prediction, truth = multibatch_prediction_truth(
                        session, model, train, 1, input_model=input_model)
                    overfit_f1 = f1(prediction, truth)
                    print(f'F1: {overfit_f1:.2f}')
            global_step = tf.train.get_global_step().eval()
            print(
                f'Epoch took {timer() - epoch_start:.2f} s (step: {global_step})'
            )
def find_best_c(x, y, share):
    """Pick the C from 2**-7 .. 2**6 with the highest F1 on a held-out split.

    ``x`` and ``y`` are split with ``utils.split_data(..., share)``; for each
    candidate a model is trained on the first part and scored on the second.
    The first candidate with the strictly highest positive F1 is returned;
    if no F1 exceeds 0 the result is ``2 ** -7``.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    winner, winner_score = 2 ** -7, 0
    for exponent in range(-7, 7):
        candidate = 2 ** exponent
        fitted = train(x_train, y_train, candidate)
        outcome = test(x_check, y_check, fitted)
        precision, recall = utils.process_result(outcome)
        score = utils.f1(precision, recall)
        if score > winner_score:
            winner, winner_score = candidate, score
    return winner
def find_best_c(x, y, share):
    """Search powers of two (2**-7 .. 2**6) for the C with the best F1.

    The data is divided with ``utils.split_data(..., share)``; every
    candidate is trained on the first part and evaluated on the second.
    Ties keep the earlier candidate; ``2**-7`` is returned when no candidate
    scores above 0.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    top_c = 2**-7
    top_score = 0
    for power in range(-7, 7):
        c_value = 2**power
        weights = train(x_train, y_train, c_value)
        p, r = utils.process_result(test(x_check, y_check, weights))
        current = utils.f1(p, r)
        if not current > top_score:
            continue
        top_score = current
        top_c = c_value
    return top_c
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner=50):
    """Fit ``AdaBoostClassifier(num_learner)`` and print its metrics.

    Prints the training data, the predictions for both splits, the train and
    test accuracy and the test F1. Nothing is returned.
    """
    print('Ada Boost')
    print(x_train, y_train)
    aba = AdaBoostClassifier(num_learner)
    aba.fit(x_train, y_train)
    preds_train = aba.predict(x_train)
    preds_test = aba.predict(x_test)
    print(preds_train, preds_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse preds_test instead of predicting the test set a second time.
    print('F1 Test {}'.format(f1(y_test, preds_test)))
def find_best_c(x, y, share, count):
    """Try C in 10, 20, 30, 40 and return the one with the best held-out F1.

    The data is split with ``utils.split_data(..., share)``; each candidate
    is trained via ``train(..., c, count)`` and scored on the second part.
    Returns -1 when no candidate reaches an F1 above 0.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    winner, winner_score = -1, 0
    for candidate in range(10, 41, 10):
        w1, w2 = train(x_train, y_train, candidate, count)
        outcome = test(x_check, y_check, w1, w2)
        precision, recall = utils.process_result(outcome)
        score = utils.f1(precision, recall)
        if score > winner_score:
            winner, winner_score = candidate, score
    return winner
def find_best_c(x, y, share, count):
    """Return the C among 10, 20, 30, 40 that maximises the held-out F1.

    Splits ``x`` / ``y`` with ``utils.split_data(..., share)``, trains with
    ``train(..., c, count)`` per candidate and evaluates on the check part.
    Ties keep the earlier candidate; -1 is returned if no F1 exceeds 0.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    top_c = -1
    top_score = 0
    for c_value in (10, 20, 30, 40):
        first_w, second_w = train(x_train, y_train, c_value, count)
        p, r = utils.process_result(test(x_check, y_check, first_w, second_w))
        current = utils.f1(p, r)
        if not current > top_score:
            continue
        top_score = current
        top_c = c_value
    return top_c
def test_maclaurin_series_0(self):
    """``maclaurin_series(utils.f1(50, 25, 20), 0)`` must equal 1."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 0)
    self.assertEqual(actual, 1)
def test_maclaurin_series_5(self):
    """Pin ``maclaurin_series(utils.f1(50, 25, 20), 5)`` to its known value."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 5)
    self.assertEqual(actual, 0.7297882727154748)
def test_maclaurin_series_1(self):
    """Pin ``maclaurin_series(utils.f1(50, 25, 20), 1)`` to its known value."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 1)
    self.assertEqual(actual, 0.7660444431189781)
def test_maclaurin_series_4(self):
    """Pin ``maclaurin_series(utils.f1(50, 25, 20), 4)`` to its known value."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 4)
    self.assertEqual(actual, 0.7304015754171275)
def print_result(name, precision, recall):
    """Print ``name``, the precision/recall pair, their F1 and a separator.

    The F1 is obtained from ``utils.f1(precision, recall)``; all numbers are
    shown with three decimals.
    """
    print(name)
    pr_line = "precision: %.3f recall: %.3f" % (precision, recall)
    print(pr_line)
    f1_value = utils.f1(precision, recall)
    print("f1: %.3f" % f1_value)
    print("###############")
def test_maclaurin_series_2(self):
    """Pin ``maclaurin_series(utils.f1(50, 25, 20), 2)`` to its known value."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 2)
    self.assertEqual(actual, 0.7386768418212236)
def test_maclaurin_series_3(self):
    """Pin ``maclaurin_series(utils.f1(50, 25, 20), 3)`` to its known value."""
    argument = utils.f1(50, 25, 20)
    actual = maclaurin.maclaurin_series(argument, 3)
    self.assertEqual(actual, 0.7322740394191096)