def test_HAC():
    test = [[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2],
            [3.7, 3.5, 3.6], [3.9, 3.9, 3.5], [3.4, 3.5, 3.7],
            [15, 15, 15]]
    hac = HAC()
    for i in xrange(1, 4):
        hac.clusterLevel = i + 1
        hac.run(test, "Synthetic Data with Cluster Level " + str(i))
def main(video):
    """Main function."""
    ### Arguments used during training -- removed the args manager for simplicity during evaluation
    # --dspace sqeuclidean
    # --init_ctrdbias 0.1
    # --loss_components ctrd_pos ctrd_neg
    # --mlp_dims 256 128 64 64
    # --l2norm
    # --learn_ctrdbias
    # --critparam_train_epoch 0
    # --batch_size 2000
    # --ctrd_alpha_pos 4
    # --ctrd_alpha_neg 1
    # --gamma_eps 0.05

    gpu = 0
    global device
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and gpu != -1 else "cpu")
    print(device)

    ### Dataset ###
    # simplified evaluation example (normally uses PyTorch datasets)
    X, y = simple_read_dataset(video)

    ### Create Model ###
    model = modules.EmbedMLP(mlp_dims=[256, 256, 128, 64, 64], nonlin='relu',
                             use_bn=False, l2norm=True, dropout=False,
                             resnet_blocks=False, use_classifier=False)
    model = model.to(device)
    print(model)

    ### Load checkpoint ###
    print('Loading checkpoint')
    chkpt_fname = 'model_chkpts/20181113-235913.mlp-256-256-128-64-64.ep-0127.trn-03.5879.bs-2000.pth.tar'
    checkpoint = torch.load(chkpt_fname)
    model.load_state_dict(checkpoint['model_state'])

    ### HAC ###
    hac = HAC(stop_criterion='distance', distance_metric='sqeuclidean',
              linkage_method='complete')
    # set the HAC threshold to be 4*b!
    # IMPORTANT: the threshold is learned as part of the criterion module,
    # and not the main MLP model
    hac.thresh = 4 * F.softplus(
        checkpoint['criterion_state']['ctrd.h_bias']).item()

    ### Run evaluation ###
    val_metrics = validate(hac, [X, y], model, curve=False)
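# The stop distance set above comes from the learned centroid bias b via a
# softplus; a minimal standalone sketch of that computation, assuming a scalar
# tensor `h_bias` in place of checkpoint['criterion_state']['ctrd.h_bias']
# (0.1 is only the --init_ctrdbias value noted above, not the trained bias):
import torch
import torch.nn.functional as F

h_bias = torch.tensor(0.1)              # hypothetical learned bias value
thresh = 4 * F.softplus(h_bias).item()  # softplus keeps the threshold positive
print(thresh)                           # ~2.98 for h_bias = 0.1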
def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''Uses the given feature subset to cluster the given data and scores
    performance using an LDA-like objective function.'''
    # Convert candidate_feature_set from a binary mask f_1, ..., f_d to the
    # list of indices where f_i = 1 (for example, [1 0 0 1 0] -> [0 3])
    candidate_feature_set = \
        [idx for idx in xrange(len(candidate_feature_set))
         if candidate_feature_set[idx] == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
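# The mask-to-index conversion above can also be written with NumPy; a minimal
# sketch (np.flatnonzero is standard NumPy, the mask values below are made up):
import numpy as np

mask = np.array([1, 0, 0, 1, 0])
print(np.flatnonzero(mask))  # -> [0 3], the same indices as the list comprehension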
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    # Indices of features that have not been chosen yet
    feature_set = [i for i in xrange(data_set.shape[1])]
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # While there are still features to choose from...
    while len(feature_set) > 0:
        # Initialize performance metrics
        best_performance = float("-inf")
        best_clusters = []
        #print "best performance = %f" % best_performance
        # Pick a feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            #print "Modeling with %s" % chosen_features
            # Cluster on the currently selected feature subset only
            clusters = model.cluster(data_set[:, chosen_features])
            # Calculate performance via the LDA-like objective function
            current_performance = model.calculate_performance()
            #print "model performance = %f" % current_performance
            # If this combination of features beats the best performance
            # so far, take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
                #print "best performance updated to %f" % best_performance
            chosen_features.remove(feature)
        # If the best performance this round beats the best performance we've
        # seen so far, add the feature to the chosen features
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
            #print "base performance = %f" % base_performance
        else:
            #print "best performance = %f" % base_performance
            break
    return chosen_features, chosen_clusters
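# A minimal usage sketch for the SFS routine above, assuming the project's
# KMeans class (with cluster() and calculate_performance()) is importable;
# the toy data below is made up:
import numpy as np

toy_data = np.array([[1.0, 1.1, 9.0],
                     [0.9, 1.0, 9.2],
                     [4.0, 4.1, 9.1],
                     [4.2, 3.9, 8.9]])
features, clusters = perform_SFS_feature_selection(KMeans(2), "Kmeans", 2, toy_data)
print "SFS chose features: %s" % str(features)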
tests = [('data_sets/original/glass_data.txt', 7),
         ('data_sets/original/iris_data.txt', 3),
         ('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()
        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()
        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
        print "HAC SFS glass performance = %f" % hac_model.calculate_performance()
tests = [('data_sets/original/glass_data.txt', 7)]
         #('data_sets/original/iris_data.txt', 3)]
         #('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    '''chosen_features = perform_SFS_feature_selection( \
        kmeans_model, "Kmeans", test[1], data_instances)
    print "K-means chosen features: %s" % str(chosen_features)'''
    chosen_features = perform_SFS_feature_selection( \
        hac_model, "HAC", test[1], data_instances)
    print "HAC chosen features: %s" % str(chosen_features)

    # Run GA feature selection using k-means and HAC
    '''kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    chosen_features = perform_GA_feature_selection(kmeans_model, "Kmeans",
                                                   test[1], data_instances)
    print "Chosen features for K-means GA: %s" % str(chosen_features)
    chosen_features = perform_GA_feature_selection(hac_model, "HAC",
                                                   test[1], data_instances)
    print "Chosen features for HAC GA: %s" % str(chosen_features)'''
import os.path
import numpy as np
from sklearn.linear_model import Perceptron
from stepwise_forward_selection import perform_SFS_feature_selection
from genetic_algorithm_feature_selection import *
from k_means import KMeans
from hac import HAC

'''This program reads in the test data and runs SFS and GA feature selection
using k-means and HAC clustering'''

# Datasets to test
tests = [('data_sets/original/iris_data.txt', 3)]
         #('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run GA feature selection using HAC
    hac_model = HAC(test[1])
    chosen_features = perform_GA_feature_selection(hac_model, "HAC",
                                                   test[1], data_instances)
    feature_set = \
        [idx for idx in xrange(len(chosen_features[0]))
         if chosen_features[0][idx] == 1]
    print "Chosen features for HAC GA: %s" % str(chosen_features)
    for cluster in hac_model.get_clusters():
        print "HAC chosen cluster: %s" % str(cluster)
# Datasets to test
tests = [('data/glass_data.txt', 7),
         ('data/iris_data.txt', 3),
         ('data/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append([float(value) for value in line_split])
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print("K-means SFS glass performance = %f" % kmeans_model.calculate_performance())
        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print("K-means GA glass performance = %f" % kmeans_model.calculate_performance())
        hac_sfs_glass = np.array([0])
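# The manual CSV parsing above can also be done in one call; a minimal sketch
# using the standard NumPy loader, assuming the file is purely numeric and
# comma-separated (the same assumption the loop above makes):
import numpy as np

data_instances = np.loadtxt('data/glass_data.txt', delimiter=',')
print(data_instances.shape)  # (n_samples, n_features)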
def train(args, seed=0):
    blocks = np.array([
        'allen_d', 'moore_a', 'lee_l', 'robinson_h', 'mcguire_j', 'blum_a',
        'jones_s', 'young_s'
    ])
    use_gpu = args['use_gpu']

    np.random.seed(seed)
    torch.manual_seed(seed)
    idxs = np.random.permutation(len(blocks))
    train_blocks = list(blocks[idxs[0:3]])
    val_blocks = list(blocks[idxs[3:5]])
    test_blocks = list(blocks[idxs[5:8]])
    # train_blocks = ['robinson_h']
    # val_blocks = ['robinson_h']
    # test_blocks = list(blocks)
    # print(train_blocks)

    num_epochs = args['n_epochs']
    in_dim = 14
    margin = args['margin']
    model = DeepSetLinkage(in_dim=in_dim, lr=args['lr'], linear=args['linear'],
                           wd=args['wd'], feature_dim=args['feature_dim'])

    train_losses = []
    val_losses = []
    prev_train_loss = np.inf
    best_val_loss = np.inf
    best_model = deepcopy(model)
    irritation = 0  # epochs since the validation loss last improved

    for epoch in range(num_epochs):
        train_loss = 0
        for idx, tb in enumerate(train_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(tb),
                delimiter=',', dtype=np.float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(tb),
                                     delimiter='\t', dtype=np.float)[:, 1]
            hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])
            loss = hac.train_epoch()
            #print(tb, 'train loss:', loss)
            train_loss += loss
        train_loss = train_loss / len(train_blocks)
        print('epoch:', epoch, 'train loss:', train_loss)

        val_loss = 0
        for idx, vb in enumerate(val_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(vb),
                delimiter=',', dtype=np.float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                     delimiter='\t', dtype=np.float)[:, 1]
            hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])
            loss = hac.validate()
            #print(vb, 'val loss:', loss)
            val_loss += loss
        val_loss = val_loss / len(val_blocks)
        print('epoch:', epoch, 'val loss:', val_loss)

        if train_loss > prev_train_loss:
            print('train loss went up, stopping now')
            model = best_model
            break

        if val_loss >= best_val_loss:
            irritation += 1
        elif val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = deepcopy(model)
            irritation = 0

        if irritation >= args['patience']:
            print("val loss hasn't improved in {} epochs, stopping now".format(
                args['patience']))
            model = best_model
            break

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        prev_train_loss = train_loss

    print('saving results')
    np.save(args['path'] + '/train_losses_' + str(seed), np.array(train_losses))
    np.save(args['path'] + '/val_losses_' + str(seed), np.array(val_losses))
    print('done saving results')

    # Find the F1 score
    link_list = []
    f1_list = []
    # for idx, vb in enumerate(val_blocks):
    for idx, vb in enumerate(val_blocks + train_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(vb),
                                   delimiter=',', dtype=np.float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                 delimiter='\t', dtype=np.float)[:, 1]
        hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])
        links, f1s = hac.cluster()
        link_list.append(links)
        f1_list.append(f1s)
        idx = np.argmax(f1s)
        best_f1 = f1s[idx]
        best_link = links[idx]
        print('{} best f1: {} best link: {}'.format(vb, best_f1, best_link))

    if args['thresh'] == 'find':
        print('finding best thresh')
        best_thresh = find_thresh(link_list, f1_list)
    else:
        best_thresh = float(args['thresh'])
    print('best threshold:', best_thresh)

    test_f1s = []
    for idx, teb in enumerate(test_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(teb),
                                   delimiter=',', dtype=np.float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(teb),
                                 delimiter='\t', dtype=np.float)[:, 1]
        hac = HAC(pairs, gt_clusters, model, margin=margin, use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])
        f1, log = hac.get_test_f1(best_thresh)
        print('test f1 on {}: {}'.format(teb, f1))
        test_f1s.append(f1)
        np.savetxt(args['path'] + '/log_' + teb + '_' + str(seed) + '.csv',
                   log, delimiter=',')

    print('test f1:', np.mean(test_f1s))
    np.save(args['path'] + '/test_f1_' + str(seed), np.mean(test_f1s))
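# `find_thresh` is defined elsewhere in this project; purely as an illustrative
# sketch (not necessarily its actual implementation), one way to pick a single
# linkage threshold from the per-block (links, f1s) lists is to sweep the
# candidate link values and keep the one with the best mean F1 across blocks.
# `sweep_thresh` below is a hypothetical helper, not part of the codebase:
import numpy as np

def sweep_thresh(link_list, f1_list):
    candidates = np.unique(np.concatenate(link_list))
    best_thresh, best_score = None, -np.inf
    for t in candidates:
        scores = []
        for links, f1s in zip(link_list, f1_list):
            # F1 of the last merge whose linkage value is still <= t
            idxs = np.where(np.asarray(links) <= t)[0]
            scores.append(f1s[idxs[-1]] if len(idxs) > 0 else 0.0)
        if np.mean(scores) > best_score:
            best_score, best_thresh = np.mean(scores), t
    return best_thresh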
                  [3.9, 3.9, 3.5], [3.4, 3.5, 3.7], [15, 15, 15]])
    eps = 0.5
    min_points = 2
    dbscanalgo = DBSCAN(eps=eps, min_points=min_points)
    dbscanalgo.run(X, "Synthetic Data")


def test_HAC():
    test = [[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2],
            [3.7, 3.5, 3.6], [3.9, 3.9, 3.5], [3.4, 3.5, 3.7],
            [15, 15, 15]]
    hac = HAC()
    for i in xrange(1, 4):
        hac.clusterLevel = i + 1
        hac.run(test, "Synthetic Data with Cluster Level " + str(i))


dbscan = DBSCAN()
hac = HAC()
experiment = Experiments()
experiment.runSynthetic(dbscan)
experiment.runSynthetic(hac)
ind = 500
dim = 3
#experiment.run(dbscan, True, ind, dim)
# test_HAC()
# experiment.runSynthetic(hac)
# experiment.run(hac, True, ind, dim)