class TestKnnClassifier(unittest.TestCase):
    """Fixture-driven tests for ``KNNClassifier`` on top of a cover tree."""

    def setUp(self):
        """Create the classifier under test and register ``TestFeature``."""
        self.classifier = KNNClassifier(
            CoverTreeAlgorithm(euclidian_distance),
            NEIGHBOURS_COUNT,
        )
        self.classifier.register_feature(TestFeature)

    def build_object_description(self, number):
        """Wrap ``number`` in an ``ObjectDescription`` with one ``TestFeature``."""
        return ObjectDescription((TestFeature(number),))

    def check_fixture(self, fixture):
        """Train on the fixture's samples, then assert the query's label.

        ``fixture`` is read through the keys ``'train_set'`` (iterable of
        ``(number, label)`` pairs), ``'query'`` and ``'expected_result'``.
        """
        for sample, sample_label in fixture['train_set']:
            self.classifier.train(sample, sample_label)

        predicted = self.classifier.classify(fixture['query'])
        self.assertEqual(fixture['expected_result'], predicted)

    def test_classifier(self):
        """Run ``check_fixture`` over every entry of ``FIXTURES``."""
        # The same classifier instance is reused for all fixtures, so samples
        # trained for earlier fixtures stay in place for later ones.
        for case in FIXTURES:
            self.check_fixture(case)
if y_true[i] == pos: TP += 1 else: FP += 1 FPR = FP / num_neg TPR = TP / num_pos if verbose: print("{},{}".format(FPR, TPR)) if __name__ == "__main__": if len(sys.argv) == 1: k = 30 train = "datasets/votes_train.json" test = "datasets/votes_test.json" else: k = int(sys.argv[1]) train = str(sys.argv[2]) test = str(sys.argv[3]) # parse the json files for data X_train, y_train, meta_train = parse_json(train) X_test, y_test, meta_test = parse_json(test) # fit KNN and predict confidence knn = KNNClassifier(k=k) knn.fit(X_train, y_train, meta_train) y_conf = knn.predict(X_test, verbose=False, confidence=True) roc_curve(y_test, y_conf, meta_test, verbose=True)
def setUp(self):
    """Create the classifier under test and register ``TestFeature`` with it."""
    self.classifier = KNNClassifier(
        CoverTreeAlgorithm(euclidian_distance),
        NEIGHBOURS_COUNT,
    )
    self.classifier.register_feature(TestFeature)
# Create the output file try: file_name = "{}/output_knn.txt".format(args.outdir) f_out = open(file_name, 'w') except IOError: print("Output file {} cannot be created".format(file_name)) sys.exit(1) # Write header for output file f_out.write('{}\t{}\t{}\t{}\n'.format('Value of k', 'Accuracy', 'Precision', 'Recall')) ############################## KNN algorithm #################################### # Create the k-NN object. knn = KNNClassifier(train_X[:, 1:], train_y[:, 1:], metric='euclidean') # Iterate through all possible values of k: for k in range(min_k, max_k + 1): knn.set_k(k) # 1. Perform KNN training and classify all the test points. In this step, you will # obtain a prediction for each test point. y_pred = [] for i in range(test_X.shape[0]): result = knn.predict(test_X[i, 1:]) if result: y_pred.append(result)
def cf_gzsl(test_x, test_l, split):
    """Classify every sample of one test split with a per-sample classifier.

    This is a closure: ``self``, ``data``, ``opt`` and the flags
    ``use_train``, ``softmax_clf``, ``binary``, ``additional_train``,
    ``use_tde`` and ``deterministic`` are read from the enclosing scope.

    For each row of ``test_x`` a training set is assembled from features
    produced by ``self.generate_syn_feature_cf`` (plus real or extra
    synthesized seen-class features, depending on the flags), a classifier is
    built on it, and its prediction for that row is recorded.

    Args:
        test_x: test features; rows are addressed as ``test_x[i]``.
        test_l: labels aligned with ``test_x``.
        split: name of the split, used in progress messages and to pick the
            result file via ``self.get_save_result_file``.

    Returns:
        ``(acc, binary_acc)``. In binary mode both values are the same.
    """
    preds = []
    truths = []
    test_l_np = test_l.cpu().numpy()
    # True where the ground-truth label is one of the unseen classes.
    test_l_binary = np.array(
        [y in data.unseenclasses for y in test_l])

    if additional_train:
        gen_sx, gen_sl = generate_syn_feature(
            self.netG, self.data.seenclasses, self.data.attribute, 100,
            netF=self.netF, netDec=self.netDec, opt=opt)

    for i in range(test_x.shape[0]):
        gen_x, gen_l = self.generate_syn_feature_cf(
            test_x[i], data.unseenclasses, deterministic=deterministic)
        if use_train:
            train_x = torch.cat((data.train_feature, gen_x), 0)
            train_y = torch.cat((data.train_label.cuda(), gen_l), 0)
        else:
            gen_s_x, gen_s_l = self.generate_syn_feature_cf(
                test_x[i], data.seenclasses, deterministic=deterministic)
            train_x = torch.cat((gen_s_x, gen_x), 0)
            train_y = torch.cat((gen_s_l, gen_l), 0)
        if additional_train:
            train_x = torch.cat((train_x, gen_sx), 0)
            train_y = torch.cat((train_y, gen_sl.cuda()), 0)

        if softmax_clf:
            if not binary:
                clf = classifier.CLASSIFIER(
                    train_x, train_y, data, self.opt.nclass_all, opt.cuda,
                    opt.classifier_lr, opt.beta1, self.epoch, opt.syn_num,
                    generalized=True, netDec=self.cls_netDec,
                    dec_size=opt.attSize, dec_hidden_size=4096,
                    x=test_x[i], use_tde=use_tde, alpha=self.alpha)
                # Accumulate the per-sample logits on the owner object.
                if self.test_logits is None:
                    self.test_logits = clf.logits
                else:
                    self.test_logits = np.concatenate(
                        (self.test_logits, clf.logits), axis=0)
            else:
                clf = BINARY_CLASSIFIER(
                    train_x, train_y, data, 2, True, opt.classifier_lr, 0.5,
                    self.epoch, opt.syn_num, netDec=self.cls_netDec,
                    dec_size=opt.attSize, dec_hidden_size=4096,
                    use_tde=use_tde, alpha=self.alpha, x=test_x[i])
            pred = clf.pred
            truths.append(test_l_np[i])
            preds.append(pred.item())
        else:
            clf = KNNClassifier(
                train_x, train_y, test_x[i].unsqueeze(0), self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096, batch_size=100)
            pred = clf.fit()[0]
            preds.append(pred)
            truths.append(test_l_np[i])

        # Progress report every 500 samples.
        if (i + 1) % 500 == 0:
            if not binary:
                binary_acc = self.get_binary_acc(truths, preds)
                print("%s-%dth acc: %.3f, binary acc: %.3f" %
                      (split, i + 1, cal_macc(truth=truths, pred=preds),
                       binary_acc))
            else:
                test_l_binary_t = test_l_binary[:len(preds)].astype(int)
                preds_np = np.array(preds)
                acc = (preds_np == test_l_binary_t).mean()
                print("%s-%dth binary acc: %.3f" % (split, i + 1, acc))

        if self.opt.sanity:
            break  # Sanity check

    if not binary:
        acc = cal_macc(truth=truths, pred=preds)
        binary_acc = self.get_binary_acc(truths, preds)
        clf_results = {"truths": truths, "preds": preds}
    else:
        # Only compare against the labels that were actually processed.  When
        # the sanity break ends the loop early, ``preds`` is shorter than
        # ``test_l_binary`` and an unsliced comparison would silently
        # broadcast instead of failing.
        evaluated_binary = test_l_binary[:len(preds)]
        acc = (np.array(preds) == evaluated_binary.astype(int)).mean()
        binary_acc = acc
        clf_results = {"truths": evaluated_binary, "preds": preds}

    save_file = self.get_save_result_file(split)
    if self.log_to_file:
        with open(save_file, 'wb') as handle:
            pickle.dump(clf_results, handle)
    return acc, binary_acc
def gzsl(self, use_train, softmax_clf, cf, deterministic=False,
         additional_train=False, use_tde=False, binary=False):
    """Evaluate generalized zero-shot classification on seen and unseen splits.

    Args:
        use_train: if true, the real training features (``data.train_feature``)
            are combined with synthesized unseen-class features; otherwise
            the seen-class features are synthesized as well.
        softmax_clf: if true, use ``classifier.CLASSIFIER`` (or
            ``BINARY_CLASSIFIER`` when ``binary``); otherwise ``KNNClassifier``.
        cf: if true, build one classifier per test sample from features
            produced by ``self.generate_syn_feature_cf``; otherwise build a
            single classifier from ``generate_syn_feature`` output.
        deterministic: forwarded to ``self.generate_syn_feature_cf``.
        additional_train: in the ``cf`` path, append extra synthesized
            seen-class features to every per-sample training set.
        use_tde: forwarded to the softmax / binary classifiers.
        binary: use the two-class classifier instead of the full one.

    Returns:
        ``(s_acc, u_acc, h_acc)``: seen accuracy, unseen accuracy and their
        harmonic mean (0 when both accuracies are 0).
    """
    opt = self.opt
    data = self.data

    def harmonic_mean(seen_acc, unseen_acc):
        # Shared by the KNN and counterfactual paths so that a zero
        # denominator is handled the same way in both.
        total = unseen_acc + seen_acc
        if total == 0:
            return 0
        return 2 * unseen_acc * seen_acc / total

    if self.siamese:
        clf = SiameseClassifier(
            data, opt, self.netE, self.netG, self.netF, self.cls_netDec,
            dec_size=opt.attSize, cf=cf, n_epochs=opt.clf_epoch,
            distance="l1")
        if self.netS is None:
            clf.train()
        else:
            clf.network = self.netS
        # NOTE(review): every branch below reassigns s_acc / u_acc, so these
        # values look unused - confirm whether an early return was intended.
        s_acc, u_acc = clf.validate(gzsl=True)

    if not cf:
        with torch.no_grad():
            gen_x, gen_l = generate_syn_feature(
                self.netG, self.data.unseenclasses, self.data.attribute,
                opt.syn_num, netF=self.netF, netDec=self.netDec, opt=opt)
        if use_train:
            train_x = torch.cat((data.train_feature, gen_x), 0)
            train_y = torch.cat((data.train_label, gen_l), 0)
        else:
            with torch.no_grad():
                gen_s_x, gen_s_l = generate_syn_feature(
                    self.netG, self.data.seenclasses, self.data.attribute,
                    opt.syn_num, netF=self.netF, netDec=self.netDec, opt=opt)
            train_x = torch.cat((gen_s_x, gen_x), 0)
            train_y = torch.cat((gen_s_l, gen_l), 0)

        if softmax_clf:
            if not binary:
                gzsl_cls = classifier.CLASSIFIER(
                    train_x, train_y, data, data.allclasses.size(0),
                    opt.cuda, opt.classifier_lr, 0.5, self.epoch,
                    opt.syn_num, generalized=True, netDec=self.cls_netDec,
                    dec_size=opt.attSize, dec_hidden_size=4096,
                    use_tde=use_tde, alpha=self.alpha)
                self.test_logits = gzsl_cls.all_outputs
            else:
                gzsl_cls = BINARY_CLASSIFIER(
                    train_x, train_y, data, 2, True, opt.classifier_lr, 0.5,
                    self.epoch, opt.syn_num, netDec=self.cls_netDec,
                    dec_size=opt.attSize, dec_hidden_size=4096,
                    use_tde=use_tde, alpha=self.alpha)
            s_acc = gzsl_cls.acc_seen
            u_acc = gzsl_cls.acc_unseen
            h_acc = gzsl_cls.H
            self.s_bacc = gzsl_cls.s_bacc
            self.u_bacc = gzsl_cls.u_bacc

            # Predictions are only pickled in the non-binary case; the result
            # file name is still requested for both splits either way.
            if not binary:
                clf_results = {"preds": gzsl_cls.pred_s.cpu().numpy()}
            save_file = self.get_save_result_file("seen")
            if self.log_to_file and not binary:
                with open(save_file, 'wb') as handle:
                    pickle.dump(clf_results, handle)
            if not binary:
                clf_results = {"preds": gzsl_cls.pred_u.cpu().numpy()}
            save_file = self.get_save_result_file("unseen")
            if self.log_to_file and not binary:
                with open(save_file, 'wb') as handle:
                    pickle.dump(clf_results, handle)
        else:
            u_cls = KNNClassifier(
                train_x, train_y, data.test_unseen_feature, self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096, batch_size=100)
            preds = u_cls.fit()
            truths = data.test_unseen_label.cpu().numpy()
            u_acc = cal_macc(truth=truths, pred=preds)

            s_cls = KNNClassifier(
                train_x, train_y, data.test_seen_feature, self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096, batch_size=100)
            preds = s_cls.fit()
            truths = data.test_seen_label.cpu().numpy()
            s_acc = cal_macc(truth=truths, pred=preds)

            h_acc = harmonic_mean(s_acc, u_acc)
    else:
        self.test_logits = None

        def cf_gzsl(test_x, test_l, split):
            """Classify one split sample by sample; return (acc, binary_acc)."""
            preds = []
            truths = []
            test_l_np = test_l.cpu().numpy()
            # True where the ground-truth label is one of the unseen classes.
            test_l_binary = np.array(
                [y in data.unseenclasses for y in test_l])

            if additional_train:
                gen_sx, gen_sl = generate_syn_feature(
                    self.netG, self.data.seenclasses, self.data.attribute,
                    100, netF=self.netF, netDec=self.netDec, opt=opt)

            for i in range(test_x.shape[0]):
                gen_x, gen_l = self.generate_syn_feature_cf(
                    test_x[i], data.unseenclasses,
                    deterministic=deterministic)
                if use_train:
                    train_x = torch.cat((data.train_feature, gen_x), 0)
                    train_y = torch.cat(
                        (data.train_label.cuda(), gen_l), 0)
                else:
                    gen_s_x, gen_s_l = self.generate_syn_feature_cf(
                        test_x[i], data.seenclasses,
                        deterministic=deterministic)
                    train_x = torch.cat((gen_s_x, gen_x), 0)
                    train_y = torch.cat((gen_s_l, gen_l), 0)
                if additional_train:
                    train_x = torch.cat((train_x, gen_sx), 0)
                    train_y = torch.cat((train_y, gen_sl.cuda()), 0)

                if softmax_clf:
                    if not binary:
                        clf = classifier.CLASSIFIER(
                            train_x, train_y, data, self.opt.nclass_all,
                            opt.cuda, opt.classifier_lr, opt.beta1,
                            self.epoch, opt.syn_num, generalized=True,
                            netDec=self.cls_netDec, dec_size=opt.attSize,
                            dec_hidden_size=4096, x=test_x[i],
                            use_tde=use_tde, alpha=self.alpha)
                        # Accumulate the per-sample logits.
                        if self.test_logits is None:
                            self.test_logits = clf.logits
                        else:
                            self.test_logits = np.concatenate(
                                (self.test_logits, clf.logits), axis=0)
                    else:
                        clf = BINARY_CLASSIFIER(
                            train_x, train_y, data, 2, True,
                            opt.classifier_lr, 0.5, self.epoch, opt.syn_num,
                            netDec=self.cls_netDec, dec_size=opt.attSize,
                            dec_hidden_size=4096, use_tde=use_tde,
                            alpha=self.alpha, x=test_x[i])
                    pred = clf.pred
                    truths.append(test_l_np[i])
                    preds.append(pred.item())
                else:
                    clf = KNNClassifier(
                        train_x, train_y, test_x[i].unsqueeze(0),
                        self.cls_netDec, dec_size=opt.attSize,
                        dec_hidden_size=4096, batch_size=100)
                    pred = clf.fit()[0]
                    preds.append(pred)
                    truths.append(test_l_np[i])

                # Progress report every 500 samples.
                if (i + 1) % 500 == 0:
                    if not binary:
                        binary_acc = self.get_binary_acc(truths, preds)
                        print("%s-%dth acc: %.3f, binary acc: %.3f" %
                              (split, i + 1,
                               cal_macc(truth=truths, pred=preds),
                               binary_acc))
                    else:
                        test_l_binary_t = test_l_binary[:len(
                            preds)].astype(int)
                        preds_np = np.array(preds)
                        acc = (preds_np == test_l_binary_t).mean()
                        print("%s-%dth binary acc: %.3f" %
                              (split, i + 1, acc))

                if self.opt.sanity:
                    break  # Sanity check

            if not binary:
                acc = cal_macc(truth=truths, pred=preds)
                binary_acc = self.get_binary_acc(truths, preds)
                clf_results = {"truths": truths, "preds": preds}
            else:
                # Compare only against the labels that were processed: after
                # a sanity break ``preds`` is shorter than ``test_l_binary``
                # and an unsliced comparison would silently broadcast.
                evaluated_binary = test_l_binary[:len(preds)]
                acc = (np.array(preds) ==
                       evaluated_binary.astype(int)).mean()
                binary_acc = acc
                clf_results = {"truths": evaluated_binary, "preds": preds}

            save_file = self.get_save_result_file(split)
            if self.log_to_file:
                with open(save_file, 'wb') as handle:
                    pickle.dump(clf_results, handle)
            return acc, binary_acc

        s_acc, s_bacc = cf_gzsl(data.test_seen_feature,
                                data.test_seen_label, "seen")
        u_acc, u_bacc = cf_gzsl(data.test_unseen_feature,
                                data.test_unseen_label, "unseen")
        h_acc = harmonic_mean(s_acc, u_acc)
        self.s_bacc = s_bacc
        self.u_bacc = u_bacc

    return s_acc, u_acc, h_acc
def zsl(self, softmax_clf, cf, deterministic=False):
    """Evaluate zero-shot classification accuracy on the unseen test split.

    Args:
        softmax_clf: if true, classify with ``classifier.CLASSIFIER``;
            otherwise with ``KNNClassifier``.
        cf: if true, synthesize features and build a classifier separately
            for every test sample via ``self.generate_syn_feature_cf``;
            otherwise synthesize once with ``generate_syn_feature``.
        deterministic: forwarded to ``self.generate_syn_feature_cf``.

    Returns:
        The accuracy: ``zsl_cls.acc`` for the single softmax classifier,
        otherwise the value of ``cal_macc`` over the collected predictions.
    """
    opt = self.opt
    data = self.data
    unseen = data.unseenclasses

    if not cf:
        with torch.no_grad():
            gen_x, gen_l = generate_syn_feature(
                self.netG, self.data.unseenclasses, self.data.attribute,
                opt.syn_num, netF=self.netF, netDec=self.netDec, opt=opt)

        if softmax_clf:
            zsl_cls = classifier.CLASSIFIER(
                gen_x, util.map_label(gen_l, unseen), data, unseen.size(0),
                opt.cuda, opt.classifier_lr, 0.5, self.epoch, opt.syn_num,
                generalized=False, netDec=self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096)
            return zsl_cls.acc

        zsl_cls = KNNClassifier(
            gen_x, gen_l, data.test_unseen_feature, self.cls_netDec,
            dec_size=opt.attSize, dec_hidden_size=4096, batch_size=100)
        predicted = zsl_cls.fit()
        expected = data.test_unseen_label.cpu().numpy()
        return cal_macc(truth=expected, pred=predicted)

    # Counterfactual path: one classifier per test sample.
    predicted = []
    expected = []
    features = data.test_unseen_feature
    mapped_labels = util.map_label(data.test_unseen_label, unseen)
    raw_labels = data.test_unseen_label.cpu().numpy()

    for idx in range(features.shape[0]):
        sample = features[idx]
        gen_x, gen_l = self.generate_syn_feature_cf(
            sample, unseen, deterministic=deterministic)
        gen_l = util.map_label(gen_l, unseen)

        if softmax_clf:
            sample_clf = classifier.CLASSIFIER(
                gen_x, gen_l, data, unseen.size(0), opt.cuda,
                opt.classifier_lr, 0.5, self.epoch, opt.syn_num,
                generalized=False, netDec=self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096, x=sample)
            expected.append(mapped_labels[idx])
            predicted.append(sample_clf.pred)
        else:
            sample_clf = KNNClassifier(
                gen_x, gen_l, sample.unsqueeze(0), self.cls_netDec,
                dec_size=opt.attSize, dec_hidden_size=4096, batch_size=100)
            predicted.append(sample_clf.fit()[0])
            # NOTE(review): gen_l was remapped with util.map_label above,
            # while this truth comes from the unmapped label array - confirm
            # that predictions and truths share one label space here.
            expected.append(raw_labels[idx])

        # Progress report every 500 samples.
        done = idx + 1
        if done % 500 == 0:
            print("%dth acc: %.3f" %
                  (done, cal_macc(truth=expected, pred=predicted)))

        if self.opt.sanity:
            break  # Sanity check

    return cal_macc(truth=expected, pred=predicted)
test = "datasets/digits_test.json" else: max_k = int(sys.argv[1]) train = str(sys.argv[2]) val = str(sys.argv[3]) test = str(sys.argv[4]) # parse the json files for data X_train, y_train, meta_train = parse_json(train) X_val, y_val, meta_val = parse_json(val) X_test, y_test, meta_test = parse_json(test) # train classifier on TRAIN, predict on VAL (for k=1,2,...,max_k) acc = {} for k in range(1, max_k + 1): knn = KNNClassifier(k=k) knn.fit(X_train, y_train, meta_train) y_pred = knn.predict(X_val, verbose=False) acc[k] = accuracy_score(y_val, y_pred) print("{},{}".format(k, acc[k])) best_k = max(acc, key=lambda key: acc[ key]) # note that 'max' always returns first value in case of ties print(best_k) # train on TRAIN + VAL, predict on TEST knn_best = KNNClassifier(k=best_k) X_train_val = pd.concat([X_train, X_val], ignore_index=True) y_train_val = pd.concat([y_train, y_val], ignore_index=True)
import pandas as pd


def training_subset_sizes(n_samples, n_steps=10):
    """Return the cumulative training-set sizes used for the learning curve.

    Step ``s`` (1-based) uses the first ``floor(s * n_samples / n_steps)``
    rows, so the last step always uses all ``n_samples`` rows.

    >>> training_subset_sizes(20)
    [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    """
    return [(step * n_samples) // n_steps for step in range(1, n_steps + 1)]


if __name__ == "__main__":
    # Usage: <script> [k train_json test_json]; with no arguments the
    # defaults below are used.
    if len(sys.argv) == 1:
        k = 10
        train = "datasets/votes_train.json"
        test = "datasets/votes_test.json"
    else:
        k = int(sys.argv[1])
        train = str(sys.argv[2])
        test = str(sys.argv[3])

    # parse the json files for data
    X_train, y_train, meta_train = parse_json(train)
    X_test, y_test, meta_test = parse_json(test)

    # Fit on growing prefixes of the training data (10%, 20%, ..., 100%) and
    # print "<rows used>,<test accuracy>" for each prefix.
    for n_rows in training_subset_sizes(X_train.shape[0]):
        # ``.ix`` was removed from pandas; ``.iloc`` selects the first
        # ``n_rows`` rows by position regardless of the index labels.
        X_subset = X_train.iloc[:n_rows, :]
        y_subset = y_train.iloc[:n_rows]

        knn = KNNClassifier(k=k)
        knn.fit(X_subset, y_subset, meta_train)
        y_pred = knn.predict(X_test, verbose=False)
        acc = accuracy_score(y_test, y_pred)
        print("{},{}".format(X_subset.shape[0], acc))