Example #1
def test_HAC():
    test = [[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2], [3.7, 3.5, 3.6],
            [3.9, 3.9, 3.5], [3.4, 3.5, 3.7], [15, 15, 15]]
    hac = HAC()
    for i in range(1, 4):
        hac.clusterLevel = i + 1
        hac.run(test, "Synthetic Data with Cluster Level " + str(i))
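For orientation, the same synthetic points can be clustered with SciPy's hierarchical clustering. This is only a reference sketch, not the custom HAC class used above, and it assumes SciPy is installed.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Same synthetic points as in test_HAC above
points = np.array([[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2], [3.7, 3.5, 3.6],
                   [3.9, 3.9, 3.5], [3.4, 3.5, 3.7], [15, 15, 15]])
Z = linkage(points, method='complete', metric='euclidean')
for k in (2, 3, 4):
    labels = fcluster(Z, t=k, criterion='maxclust')
    print(k, labels)  # flat cluster labels when cutting the tree into k clusters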
Example #2
def main(video):
    """Main function
    """

    ### Arguments used during training -- removed the args manager for simplicity during evaluation
    # --dspace sqeuclidean
    # --init_ctrdbias 0.1
    # --loss_components ctrd_pos ctrd_neg
    # --mlp_dims 256 128 64 64
    # --l2norm
    # --learn_ctrdbias
    # --critparam_train_epoch 0
    # --batch_size 2000
    # --ctrd_alpha_pos 4
    # --ctrd_alpha_neg 1
    # --gamma_eps 0.05

    gpu = 0
    global device
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and gpu != -1 else "cpu")
    print(device)

    ### Dataset ###
    # simplified evaluation example (normally uses PyTorch datasets)
    X, y = simple_read_dataset(video)

    ### Create Model ###
    model = modules.EmbedMLP(mlp_dims=[256, 256, 128, 64, 64],
                             nonlin='relu',
                             use_bn=False,
                             l2norm=True,
                             dropout=False,
                             resnet_blocks=False,
                             use_classifier=False)
    model = model.to(device)
    print(model)

    ### Load checkpoint ###
    print('Loading checkpoint')
    chkpt_fname = 'model_chkpts/20181113-235913.mlp-256-256-128-64-64.ep-0127.trn-03.5879.bs-2000.pth.tar'
    checkpoint = torch.load(chkpt_fname, map_location=device)
    model.load_state_dict(checkpoint['model_state'])

    ### HAC ###
    hac = HAC(stop_criterion='distance',
              distance_metric='sqeuclidean',
              linkage_method='complete')
    # Set the HAC distance threshold to 4 * softplus(b), where b is the learned centroid bias.
    # IMPORTANT: this bias is learned as part of the criterion module, not the main MLP model.
    hac.thresh = 4 * F.softplus(
        checkpoint['criterion_state']['ctrd.h_bias']).item()

    ### Run evaluation ###
    val_metrics = validate(hac, [X, y], model, curve=False)
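The non-obvious step above is recovering the HAC stopping distance: it is not stored directly in the checkpoint but derived from the criterion module's learned bias. Below is a minimal sketch of that computation, assuming a checkpoint dict with a scalar `ctrd.h_bias` tensor under `criterion_state` as in the snippet.

import torch.nn.functional as F

def hac_threshold_from_checkpoint(checkpoint, alpha=4.0):
    """Recover the HAC distance threshold as alpha * softplus(learned bias)."""
    # The factor of 4 matches the hard-coded multiplier above (it appears to
    # correspond to the --ctrd_alpha_pos 4 training argument, but that is an assumption).
    h_bias = checkpoint['criterion_state']['ctrd.h_bias']
    return alpha * F.softplus(h_bias).item()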
Example #3
def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''Cluster the data using the given feature subset and score the result with
    an LDA-like objective function.'''
    # Convert candidate_feature_set from a 0/1 mask f_1, ..., f_d to the list
    # of indices where f_i = 1 (for example, [1, 0, 0, 1, 0] -> [0, 3])
    candidate_feature_set = \
        [idx for idx, bit in enumerate(candidate_feature_set) if bit == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
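The mask-to-index conversion at the top of evaluate_model is easy to get backwards, so here is a standalone illustration of the same comprehension on a made-up mask:

mask = [1, 0, 0, 1, 0]          # 1 = feature included, 0 = feature excluded
selected = [idx for idx, bit in enumerate(mask) if bit == 1]
print(selected)                 # -> [0, 3], the column indices to cluster on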
Example #4
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    '''Greedy stepwise forward selection: repeatedly add the single feature that most
    improves the LDA-like clustering score, stopping once no remaining feature helps.'''
    # Indices of the features that are still available to be chosen
    feature_set = list(range(data_set.shape[1]))
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # while there are still features to choose from...
    while len(feature_set) > 0:
        # initialize performance metrics
        best_performance = float("-inf")
        best_clusters = []
        #print "best performance = %f" % best_performance
        # Try each feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            #print "Modeling with %s" % chosen_features
            clusters = model.cluster(data_set[:, chosen_features])
            # Calculate performance via LDA-like objective function
            current_performance = model.calculate_performance()
            #print "model performance = %f" % current_performance
            # if this combo of features beats the best performance so far
            # take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
                #print "best performance updated to %f" % best_performance
            chosen_features.remove(feature)
        # If best noted performance beats the best performance we've seen
        # so far, add to chosen features
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
            #print "base performance = %f" % base_performance
        else:
            #print "best performance = %f" % base_performance
            break
    return chosen_features, chosen_clusters
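A hypothetical usage sketch for perform_SFS_feature_selection follows; the HAC class and its cluster()/calculate_performance() methods are assumed from this module, and the toy array is invented for illustration.

import numpy as np

# Two informative columns plus one noisy third column; SFS should prefer a subset.
toy_data = np.array([[1.0, 1.1, 9.0],
                     [1.2, 0.8, 0.1],
                     [3.7, 3.5, 9.2],
                     [3.9, 3.9, 0.3]])
hac_model = HAC(2)  # assumed constructor: HAC(number_of_clusters)
features, clusters = perform_SFS_feature_selection(hac_model, "HAC", 2, toy_data)
print("chosen features:", features)
print("chosen clusters:", clusters)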
Example #5
tests = [('data_sets/original/glass_data.txt', 7),
         ('data_sets/original/iris_data.txt', 3),
         ('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append([float(value) for value in line_split])
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print("Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance())

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print("Kmeans GA glass performance = %f" % kmeans_model.calculate_performance())

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
        print("HAC SFS glass performance = %f" % hac_model.calculate_performance())
Example #6
tests = [('data_sets/original/glass_data.txt', 7)]
         #('data_sets/original/iris_data.txt', 3)]
         #('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append([float(value) for value in line_split])
    data_instances = np.array(data_instances)

    # Run SFS using HAC (the k-means and GA variants below are commented out)
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    '''chosen_features = perform_SFS_feature_selection(
        kmeans_model, "Kmeans", test[1], data_instances)
    print("K-means chosen features: %s" % str(chosen_features))'''
    chosen_features = perform_SFS_feature_selection(
        hac_model, "HAC", test[1], data_instances)
    print("HAC chosen features: %s" % str(chosen_features))

    # Run GA feature selection using k-means and HAC
    '''kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    chosen_features = perform_GA_feature_selection(kmeans_model, "Kmeans", test[1], data_instances)
    print("Chosen features for K-means GA: %s" % str(chosen_features))
    chosen_features = perform_GA_feature_selection(hac_model, "HAC", test[1], data_instances)
    print("Chosen features for HAC GA: %s" % str(chosen_features))'''
Example #7
import os.path
import numpy as np
from sklearn.linear_model import Perceptron
from stepwise_forward_selection import perform_SFS_feature_selection
from genetic_algorithm_feature_selection import *
from k_means import KMeans
from hac import HAC

# This program reads in the test data and runs SFS and GA feature selection using k-means and HAC clustering.
# Datasets to test
tests = [('data_sets/original/iris_data.txt', 3)]
         #('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append([float(value) for value in line_split])
    data_instances = np.array(data_instances)

    # Run GA feature selection using HAC
    hac_model = HAC(test[1])
    chosen_features = perform_GA_feature_selection(hac_model, "HAC", test[1], data_instances)
    # Convert the 0/1 feature mask returned by the GA into column indices
    feature_set = \
        [idx for idx, bit in enumerate(chosen_features[0]) if bit == 1]
    print("Chosen features for HAC GA: %s" % str(chosen_features))
    for cluster in hac_model.get_clusters():
        print("HAC chosen cluster: %s" % str(cluster))
Example #8
# Datasets to test
tests = [('data/glass_data.txt', 7), ('data/iris_data.txt', 3),
         ('data/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append([float(value) for value in line_split])
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print("K-means SFS glass performance = %f" %
              kmeans_model.calculate_performance())

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print("K-means GA glass performance = %f" %
              kmeans_model.calculate_performance())

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
        print("HAC SFS glass performance = %f" %
              hac_model.calculate_performance())
Example #9
def train(args, seed=0):
    blocks = np.array([
        'allen_d', 'moore_a', 'lee_l', 'robinson_h', 'mcguire_j', 'blum_a',
        'jones_s', 'young_s'
    ])

    use_gpu = args['use_gpu']

    np.random.seed(seed)
    torch.manual_seed(seed)
    idxs = np.random.permutation(len(blocks))
    train_blocks = list(blocks[idxs[0:3]])
    val_blocks = list(blocks[idxs[3:5]])
    test_blocks = list(blocks[idxs[5:8]])

    # train_blocks = ['robinson_h']
    # val_blocks = ['robinson_h']
    # test_blocks = list(blocks)

    # print(train_blocks)

    num_epochs = args['n_epochs']
    in_dim = 14
    margin = args['margin']
    model = DeepSetLinkage(in_dim=in_dim,
                           lr=args['lr'],
                           linear=args['linear'],
                           wd=args['wd'],
                           feature_dim=args['feature_dim'])

    train_losses = []
    val_losses = []

    prev_train_loss = np.inf
    best_val_loss = np.inf
    best_model = deepcopy(model)
    irritation = 0  # number of epochs since the validation loss last improved

    for epoch in range(num_epochs):

        train_loss = 0
        for idx, tb in enumerate(train_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(tb),
                delimiter=',',
                dtype=float)

            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(tb),
                                     delimiter='\t',
                                     dtype=float)[:, 1]
            hac = HAC(pairs,
                      gt_clusters,
                      model,
                      margin=margin,
                      use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])

            loss = hac.train_epoch()
            #print(tb, 'train loss:', loss)
            train_loss += loss
        train_loss = train_loss / len(train_blocks)
        print('epoch:', epoch, 'train loss:', train_loss)

        val_loss = 0
        for idx, vb in enumerate(val_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(vb),
                delimiter=',',
                dtype=float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                     delimiter='\t',
                                     dtype=float)[:, 1]
            hac = HAC(pairs,
                      gt_clusters,
                      model,
                      margin=margin,
                      use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])

            loss = hac.validate()
            #print(vb, 'val loss:', loss)
            val_loss += loss
        val_loss = val_loss / len(val_blocks)
        print('epoch:', epoch, 'val loss:', val_loss)

        if train_loss > prev_train_loss:
            print('train loss went up, stopping now')
            model = best_model
            break

        if val_loss >= best_val_loss:
            irritation += 1
        else:
            best_val_loss = val_loss
            best_model = deepcopy(model)
            irritation = 0

        if irritation >= args['patience']:
            print("val loss hasn't improved in {} epochs, stopping now".format(
                args['patience']))
            model = best_model
            break

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        prev_train_loss = train_loss

    print('saving results')
    np.save(args['path'] + '/train_losses_' + str(seed),
            np.array(train_losses))
    np.save(args['path'] + '/val_losses_' + str(seed), np.array(val_losses))
    print('done saving results')

    # find f1 score
    link_list = []
    f1_list = []
    # for idx, vb in enumerate(val_blocks):
    for idx, vb in enumerate(val_blocks + train_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(vb),
                                   delimiter=',',
                                   dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                 delimiter='\t',
                                 dtype=float)[:, 1]
        hac = HAC(pairs,
                  gt_clusters,
                  model,
                  margin=margin,
                  use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])

        links, f1s = hac.cluster()
        link_list.append(links)
        f1_list.append(f1s)

        best_idx = np.argmax(f1s)
        best_f1 = f1s[best_idx]
        best_link = links[best_idx]
        print('{} best f1: {} best link: {}'.format(vb, best_f1, best_link))

    if args['thresh'] == 'find':
        print('finding best thresh')
        best_thresh = find_thresh(link_list, f1_list)
    else:
        best_thresh = float(args['thresh'])
    print('best threshold:', best_thresh)

    test_f1s = []
    for idx, teb in enumerate(test_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(teb),
                                   delimiter=',',
                                   dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(teb),
                                 delimiter='\t',
                                 dtype=float)[:, 1]
        hac = HAC(pairs,
                  gt_clusters,
                  model,
                  margin=margin,
                  use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])

        f1, log = hac.get_test_f1(best_thresh)
        print('test f1 on {}: {}'.format(teb, f1))
        test_f1s.append(f1)
        np.savetxt(args['path'] + '/log_' + teb + '_' + str(seed) + '.csv',
                   log,
                   delimiter=',')

    print('test f1:', np.mean(test_f1s))
    np.save(args['path'] + '/test_f1_' + str(seed), np.mean(test_f1s))
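find_thresh is not shown in this example. Under the assumption that each block's merge distances are non-decreasing and paired with one F1 score per merge, a plausible sketch is to sweep candidate thresholds and keep the one with the best mean F1 across blocks (the real implementation may differ):

import numpy as np

def find_thresh_sketch(link_list, f1_list):
    """Pick the linkage threshold that maximizes mean F1 across blocks (sketch)."""
    candidates = np.unique(np.concatenate([np.asarray(l) for l in link_list]))
    best_thresh, best_score = float(candidates[0]), -np.inf
    for t in candidates:
        scores = []
        for links, f1s in zip(link_list, f1_list):
            links, f1s = np.asarray(links), np.asarray(f1s)
            # index of the last merge whose linkage distance does not exceed t
            idx = np.searchsorted(links, t, side='right') - 1
            scores.append(f1s[idx] if idx >= 0 else f1s[0])
        mean_score = float(np.mean(scores))
        if mean_score > best_score:
            best_score, best_thresh = mean_score, float(t)
    return best_thresh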
Example #10
def test_DBSCAN():
    X = np.array([[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2], [3.7, 3.5, 3.6],
                  [3.9, 3.9, 3.5], [3.4, 3.5, 3.7], [15, 15, 15]])
    eps = 0.5
    min_points = 2
    dbscanalgo = DBSCAN(eps=eps, min_points=min_points)
    dbscanalgo.run(X, "Synthetic Data")


def test_HAC():
    test = [[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2], [3.7, 3.5, 3.6],
            [3.9, 3.9, 3.5], [3.4, 3.5, 3.7], [15, 15, 15]]
    hac = HAC()
    for i in range(1, 4):
        hac.clusterLevel = i + 1
        hac.run(test, "Synthetic Data with Cluster Level " + str(i))


dbscan = DBSCAN()
hac = HAC()
experiment = Experiments()

experiment.runSynthetic(dbscan)
experiment.runSynthetic(hac)

ind = 500
dim = 3
#experiment.run(dbscan, True, ind, dim)

# test_HAC()
# experiment.runSynthetic(hac)
# experiment.run(hac, True, ind, dim)