# Evaluate K-means and HAC on the feature subsets selected by SFS and GA
# (the index arrays below are the selected feature columns). `test` and
# `data_instances` are provided by the surrounding script.
kmeans_model = KMeans(test[1])
hac_model = HAC(test[1])

# Glass dataset
if "glass" in test[0]:
    kmeans_sfs_glass = np.array([1, 3])
    kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
    print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

    kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
    print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

    hac_sfs_glass = np.array([0])
    hac_model.cluster(data_instances[:, hac_sfs_glass])
    print "HAC SFS glass performance = %f" % hac_model.calculate_performance()

# Iris dataset
elif "iris" in test[0]:
    kmeans_sfs_iris = np.array([1])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_sfs_iris])
    print "Kmeans SFS iris performance = %f" % kmeans_model.calculate_performance()

    kmeans_ga_iris = np.array([0, 1])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_ga_iris])
    print "Kmeans GA iris performance = %f" % kmeans_model.calculate_performance()

    hac_sfs_iris = np.array([0])
    hac_model.cluster(data_instances[:, hac_sfs_iris])
    print "HAC SFS iris performance = %f" % hac_model.calculate_performance()
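# --- Illustrative driver context (an assumption, not part of the original file) ---
# The fragment above expects `test` (a (dataset name, cluster count) pair) and
# `data_instances` (a numeric feature matrix) to come from an enclosing loop.
# A minimal sketch of such a driver; the file paths, cluster counts, and the
# "label in the last column" layout are all illustrative guesses.
import numpy as np

tests = [("data/glass.csv", 6), ("data/iris.csv", 3)]  # (preprocessed numeric CSV, k) -- assumed
for test in tests:
    raw = np.loadtxt(test[0], delimiter=',')
    data_instances = raw[:, :-1]  # assumes the last column holds the class label
    # ... the evaluation fragment above would run here for each `test`,
    # selecting feature columns such as data_instances[:, np.array([1, 3])] ...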
# Imports assumed for this training script (the original file header is not
# shown in this excerpt); DeepSetLinkage, HAC, process_pair_features and
# find_thresh come from this repository's own modules.
from copy import deepcopy

import numpy as np
import torch


def train(args, seed=0):
    blocks = np.array([
        'allen_d', 'moore_a', 'lee_l', 'robinson_h',
        'mcguire_j', 'blum_a', 'jones_s', 'young_s'
    ])
    use_gpu = args['use_gpu']

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Randomly split the 8 rexa blocks into 3 train / 2 val / 3 test blocks.
    idxs = np.random.permutation(len(blocks))
    train_blocks = list(blocks[idxs[0:3]])
    val_blocks = list(blocks[idxs[3:5]])
    test_blocks = list(blocks[idxs[5:8]])

    # train_blocks = ['robinson_h']
    # val_blocks = ['robinson_h']
    # test_blocks = list(blocks)
    # print(train_blocks)

    num_epochs = args['n_epochs']
    in_dim = 14
    margin = args['margin']

    model = DeepSetLinkage(in_dim=in_dim, lr=args['lr'], linear=args['linear'],
                           wd=args['wd'], feature_dim=args['feature_dim'])

    train_losses = []
    val_losses = []
    prev_train_loss = np.inf
    best_val_loss = np.inf
    best_model = deepcopy(model)
    irritation = 0

    # Train with early stopping: stop when the train loss goes up or when the
    # validation loss has not improved for `patience` epochs.
    for epoch in range(num_epochs):
        train_loss = 0
        for idx, tb in enumerate(train_blocks):
            pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(tb),
                                       delimiter=',', dtype=float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(tb),
                                     delimiter='\t', dtype=float)[:, 1]
            hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])
            loss = hac.train_epoch()
            # print(tb, 'train loss:', loss)
            train_loss += loss

        train_loss = train_loss / len(train_blocks)
        print('epoch:', epoch, 'train loss:', train_loss)

        val_loss = 0
        for idx, vb in enumerate(val_blocks):
            pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(vb),
                                       delimiter=',', dtype=float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                     delimiter='\t', dtype=float)[:, 1]
            hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])
            loss = hac.validate()
            # print(vb, 'val loss:', loss)
            val_loss += loss

        val_loss = val_loss / len(val_blocks)
        print('epoch:', epoch, 'val loss:', val_loss)

        if train_loss > prev_train_loss:
            print('train loss went up, stopping now')
            model = best_model
            break

        if val_loss >= best_val_loss:
            irritation += 1
        elif val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = deepcopy(model)
            irritation = 0

        if irritation >= args['patience']:
            print("val loss hasn't improved in {} epochs, stopping now".format(
                args['patience']))
            model = best_model
            break

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        prev_train_loss = train_loss

    print('saving results')
    np.save(args['path'] + '/train_losses_' + str(seed), np.array(train_losses))
    np.save(args['path'] + '/val_losses_' + str(seed), np.array(val_losses))
    print('done saving results')

    # find f1 score: cluster the train/val blocks to collect (link, f1) curves,
    # which are used to pick the linkage threshold.
    link_list = []
    f1_list = []
    # for idx, vb in enumerate(val_blocks):
    for idx, vb in enumerate(val_blocks + train_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(vb),
                                   delimiter=',', dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                 delimiter='\t', dtype=float)[:, 1]
        hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])
        links, f1s = hac.cluster()
        link_list.append(links)
        f1_list.append(f1s)

        idx = np.argmax(f1s)
        best_f1 = f1s[idx]
        best_link = links[idx]
        print('{} best f1: {} best link: {}'.format(vb, best_f1, best_link))

    if args['thresh'] == 'find':
        print('finding best thresh')
        best_thresh = find_thresh(link_list, f1_list)
    else:
        best_thresh = float(args['thresh'])
    print('best threshold:', best_thresh)

    # Evaluate on the held-out test blocks at the chosen threshold.
    test_f1s = []
    for idx, teb in enumerate(test_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(teb),
                                   delimiter=',', dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(teb),
                                 delimiter='\t', dtype=float)[:, 1]
        hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])
        f1, log = hac.get_test_f1(best_thresh)
        print('test f1 on {}: {}'.format(teb, f1))
        test_f1s.append(f1)
        np.savetxt(args['path'] + '/log_' + teb + '_' + str(seed) + '.csv', log,
                   delimiter=',')

    print('test f1:', np.mean(test_f1s))
    np.save(args['path'] + '/test_f1_' + str(seed), np.mean(test_f1s))
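# --- Illustrative usage (an assumption, not part of the original repository) ---
# A minimal sketch of how train() might be invoked. Every key below is read by
# the function above; the concrete values and the output directory name are
# placeholders, not defaults taken from the original code.
if __name__ == '__main__':
    import os

    args = {
        'use_gpu': False,        # passed through to HAC
        'n_epochs': 50,          # maximum number of training epochs
        'margin': 2.0,           # passed through to HAC
        'lr': 1e-3,              # learning rate for DeepSetLinkage
        'linear': False,         # passed to DeepSetLinkage
        'wd': 0.0,               # weight decay for DeepSetLinkage
        'feature_dim': 14,       # passed to DeepSetLinkage and HAC
        'teacher_force': False,  # passed to HAC during train/validate
        'patience': 10,          # early-stopping patience on validation loss
        'thresh': 'find',        # 'find' searches for the best linkage threshold, else a float
        'path': 'results',       # output directory for losses, logs, and test F1
    }
    os.makedirs(args['path'], exist_ok=True)

    # Run a few seeds; each seed produces its own random block split and outputs.
    for seed in range(3):
        train(args, seed=seed)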