Example #1
import logging
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.manifold import TSNE

# Project-local helpers assumed importable from the surrounding repository:
# load_config, select_dataset, TripletBatchesDataset, procrustes_disparity,
# knn_classification_error, logging_util, tste


def main(args):
    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    error_change_threshold = config['error_change_threshold']
    batch_size = config['batch_size']
    learning_rate = config['optimizer_params']['learning_rate']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    triplet_multiplier = config['triplets_multiplier']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']
    optimizer = config['optimizer']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name,
                                          n_samples=n_samples,
                                          input_dim=input_dim)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        n = vec_data.shape[0]
        logn = int(np.log2(n))
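        # heuristic triplet budget: ordinal embedding methods typically need on
        # the order of n * d * log(n) triplets to recover a d-dimensional embedding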
        triplet_num = triplet_multiplier * logn * n * embedding_dimension

        bs = min(batch_size, triplet_num)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        experiment_name = 'tste_' + \
                          'data_' + dataset_name + \
                          '_error_change_threshold_' + str(error_change_threshold) + \
                          '_input_dim_' + str(input_dim) + \
                          '_output_dim_' + str(embedding_dimension) + \
                          '_originaldimension_' + str(vec_data.shape[1]) + \
                          '_triplet_num_' + str(triplet_multiplier) + \
                          '_n_pts_' + str(n) + \
                          '_lr_' + str(learning_rate) + \
                          '_optimizer_' + str(optimizer) + \
                          '_bs_' + str(batch_size)

        # create a logging file for extensive logging
        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path,
                                               level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Logging Path: ' + logging_path)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Error Change Threshold: ' + str(error_change_threshold))
        logger.info('Epochs: ' + str(epochs))
        logger.info('Learning Rate: ' + str(learning_rate))
        logger.info('Number of Points: ' + str(n))
        logger.info('Input Dimension: ' + str(input_dim))
        logger.info('Output Dimension: ' + str(embedding_dimension))
        logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
        logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
        logger.info('Batch Size: ' + str(batch_size))

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, bs, device)

        logger.info('Computing tSTE...')

        x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
            triplets=train_triplets_dataset.trips_data_indices,
            n=n,
            emb_dim=embedding_dimension,
            epochs=epochs,
            batch_size=bs,
            learning_rate=learning_rate,
            device=device,
            logger=logger,
            error_change_threshold=error_change_threshold)

        logger.info('Evaluating the computed embeddings...')
        # compute triplet error for train and test data
        train_error = train_triplets_dataset.triplet_error(x)
        test_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                      number_of_test_triplets,
                                                      1000, device)
        test_error = test_triplets_dataset.triplet_error(x)
        procrustes_error = procrustes_disparity(vec_data, x)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(
            x, vec_data, labels)

        # sample points for tsne visualization
        subsample = np.random.permutation(n)[0:500]
        x = x[subsample, :]
        sub_labels = labels[subsample]

        x_embedded = TSNE(n_components=2, perplexity=15,
                          learning_rate=10).fit_transform(x)
        fig, ax = plt.subplots(1, 1)

        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Train Error: ' + str(train_error))
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' +
                    str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' +
                    str(knn_error_ord_emb))

        results = {
            'train_error': train_error,
            'test_error': test_error,
            'procrustes': procrustes_error,
            'knn_true': knn_error_true_emb,
            'knn_ord_emb': knn_error_ord_emb,
            'labels': labels,
            'loss_history': loss_history,
            'error_history': triplet_error_history,
            'ordinal_embedding': x,
            'time_taken': time_taken
        }
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
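
For reference, a minimal config covering the keys this example reads could look like the sketch below; the values are illustrative assumptions, not defaults from the original repository.

example_config = {
    'dataset_selected': 'mnist',               # assumed dataset name
    'error_change_threshold': 0.005,
    'batch_size': 10000,
    'optimizer': 'adam',
    'optimizer_params': {'learning_rate': 0.1},
    'nb_epochs': 100,
    'input_dimension': 10,
    'output_dimension': 2,
    'number_of_points': 1000,
    'n_test_triplets': 100000,
    'triplets_multiplier': 1,
    'log': {'path': 'logs/'},
    'hyper_search': {'activation': False},
}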
Example #2
from itertools import product


def run_hyper_search(config):
    """
    Important Hyperparameters for tSTE:
    Learning Rate: [1, 0.1, 0.01]
    """
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    triplet_multiplier_range = config['hyper_search']['triplets_multiplier']
    learning_rate_range = config['hyper_search']['learning_rate']
    optimizer = config['optimizer']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'tste_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_optimizer_' + str(optimizer) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + separator.join([str(i) for i in triplet_multiplier_range])

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)
    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path: ' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_train = {}
    best_params_test = {}
    all_results = {}

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    vec_data, labels = select_dataset(
        dataset_name, n_samples=n_samples, input_dim=input_dim
    )  # input_dim is only argument for uniform. Ignored otherwise

    n = vec_data.shape[0]
    logn = int(np.log2(n))
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        all_results[(emb_dim, triplet_multiplier)] = {}
        best_train_error = 1
        best_test_error = 1

        triplet_num = triplet_multiplier * logn * n * emb_dim

        bs = min(batch_size, triplet_num)

        train_triplets_dataset = TripletBatchesDataset(vec_data, labels,
                                                       triplet_num, bs, device)
        logger.info('Testing on: ' + dataset_name +
                    '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')
        for learning_rate in learning_rate_range:

            logger.info(10 * '-' + ' New parameters' + 10 * '-')
            logger.info('Learning Rate: ' + str(learning_rate))
            logger.info('Number of Points: ' + str(n))
            logger.info('Input Dimension: ' + str(input_dim))
            logger.info('Output Dimension: ' + str(emb_dim))
            logger.info('Number of Test Triplets: ' +
                        str(number_of_test_triplets))
            logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
            logger.info('Batch Size: ' + str(batch_size))

            logger.info('Computing tSTE...')

            x, loss_history, triplet_error_history, time_taken, time_history = tste.t_ste_adam(
                triplets=train_triplets_dataset.trips_data_indices,
                n=n,
                emb_dim=emb_dim,
                epochs=epochs,
                batch_size=bs,
                learning_rate=learning_rate,
                device=device,
                logger=logger)

            # compute triplet error for train and test data
            train_error = train_triplets_dataset.triplet_error(x)
            logger.info('Triplet Error on Training Triplets: ' +
                        str(train_error))
            test_triplets_dataset = TripletBatchesDataset(
                vec_data, labels, number_of_test_triplets, 1000, device)
            test_error = test_triplets_dataset.triplet_error(x)
            #procrustes_error = procrustes_disparity(vec_data, x)
            #knn_error_ord_emb, knn_error_true_emb = knn_classification_error(x, vec_data, labels)

            logger.info('Epochs: ' + str(epochs))
            logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
            logger.info('Train Error: ' + str(train_error))
            logger.info('Test Error: ' + str(test_error))
            #logger.info('Procrustes Disparity: ' + str(procrustes_error))
            #logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
            #logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

            results = {
                'train_error': train_error,
                'test_error': test_error,
                'loss_history': loss_history,
                'error_history': triplet_error_history,
                'last_embedding': x
            }
            all_results[(emb_dim, triplet_multiplier)][learning_rate] = results

            if test_error < best_test_error:
                best_params_test[(emb_dim, triplet_multiplier)] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': test_error
                }
                best_test_error = test_error
            if train_error < best_train_error:
                best_params_train[(emb_dim, triplet_multiplier)] = {
                    'learning_rate': learning_rate,
                    'optimizer': optimizer,
                    'error': train_error
                }
                best_train_error = train_error
        result_name = 'tste_convergence_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size) + \
                      '_triplet_number_' + str(triplet_multiplier)
        # store labels with the per-configuration results so they end up in the dump
        all_results[(emb_dim, triplet_multiplier)]['labels'] = labels
        joblib.dump(all_results[(emb_dim, triplet_multiplier)],
                    os.path.join(log_dir, result_name + '.pkl'))

    # print all results as well again
    logger.info(10 * '-' + ' ALL RESULTS ' + 10 * '-')
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        results = all_results[(emb_dim, triplet_multiplier)]
        logger.info('Results for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        for learning_rate in learning_rate_range:
            logger.info('learning rate ' + str(learning_rate) +
                        ' -- train error: ' +
                        str(results[learning_rate]['train_error']) +
                        ' test error: ' +
                        str(results[learning_rate]['test_error']))

    # print best parameter settings
    for (emb_dim, triplet_multiplier) in product(dimensions_range,
                                                 triplet_multiplier_range):
        logger.info('Best Parameters for emb dimension ' + str(emb_dim) +
                    ' and triplet multiplier ' + str(triplet_multiplier))
        best_on_train = best_params_train[(emb_dim, triplet_multiplier)]
        best_on_test = best_params_test[(emb_dim, triplet_multiplier)]
        logger.info('achieved ' + str(best_on_train['error']) +
                    ' train error with learning rate: ' +
                    str(best_on_train['learning_rate']))
        logger.info('achieved ' + str(best_on_test['error']) +
                    ' test error with learning rate: ' +
                    str(best_on_test['learning_rate']))
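
When hyper_search['activation'] is enabled, run_hyper_search expects the searched parameters as ranges. A sketch of that sub-dict, with the learning-rate range taken from the docstring above and the other ranges as illustrative assumptions:

example_hyper_search = {
    'activation': True,
    'learning_rate': [1, 0.1, 0.01],   # from the docstring above
    'output_dimension': [2, 5, 10],    # assumed range
    'triplets_multiplier': [1, 2],     # assumed range
}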
Example #3
# Project-local helpers used below: first_phase_soe, second_phase, Oracle,
# gen_triplet_indices, gen_triplet_data, triplet_error


def main(args):

    config = load_config(args.config_path)
    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    phase1_learning_rate = config['optimizer_params']['phase1_learning_rate']
    phase2_learning_rate = config['optimizer_params']['phase2_learning_rate']
    num_landmarks = config['optimizer_params']['num_landmarks']
    subset_size = config['optimizer_params']['subset_size']
    target_loss = config['optimizer_params']['target_loss']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    embedding_dimension = config['output_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']
    hyper_search = config['hyper_search']['activation']

    if hyper_search:
        run_hyper_search(config=config)
    else:
        vec_data, labels = select_dataset(dataset_name=dataset_name,
                                          input_dim=input_dim,
                                          n_samples=n_samples)

        n_points = vec_data.shape[0]  # do not remove
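        # num_landmarks is a fraction of the dataset; the absolute landmark
        # count is capped at 100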
        number_of_landmarks = min(int(num_landmarks * n_points), 100)
        subset_size = subset_size * number_of_landmarks

        experiment_name = 'lsoe_' + 'data_' + dataset_name \
                          + '_input_dim_' + str(input_dim) \
                          + '_emb_dimension_' + str(embedding_dimension) \
                          + '_originaldimension_' + str(vec_data.shape[1]) \
                          + '_n_' + str(n_samples) \
                          + '_landmarks_' + str(number_of_landmarks) \
                          + '_bs_' + str(batch_size) \
                          + '_pplr_' + str(phase2_learning_rate) \
                          + '_soe_lr_' + str(phase1_learning_rate) \
                          + '_epochs_' + str(epochs)

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        logging_path = os.path.join(log_dir, experiment_name + '.log')
        logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Dataset Name: ' + dataset_name)
        logger.info('Number of Points: ' + str(n_samples))
        logger.info('Dataset Dimension: ' + str(input_dim))
        logger.info('Number of Landmarks: ' + str(number_of_landmarks))
        logger.info('Subset Size: ' + str(subset_size))
        logger.info('First Phase Epochs: ' + str(epochs))

        # select the GPU if available, otherwise fall back to the CPU
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        logger.info('Computing SOE - Phase 1...')

        # first phase of the algorithm
        landmarks, first_phase_indices, \
        first_phase_subset_size, first_phase_reconstruction, \
        first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
            num_landmarks=number_of_landmarks,
            subset_size=subset_size,
            data=vec_data, dataset_size=n_points,
            embedding_dim=embedding_dimension, epochs=epochs,
            first_phase_lr=phase1_learning_rate,
            device=device,
            target_loss=target_loss,
            batch_size=batch_size,
            logger=logger)

        logger.info('First Phase Loss: ' + str(first_phase_loss))
        logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))
        logger.info('First Phase Number of Landmarks: ' + str(landmarks.shape))
        logger.info('First Phase Indices Number: ' + str(len(first_phase_indices)))
        logger.info('First Phase Reconstruction Size: ' + str(first_phase_reconstruction.shape))

        embedded_indices = first_phase_indices
        embedded_points = first_phase_reconstruction
        non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))
        my_oracle = Oracle(data=vec_data)
        logger.info('Second Phase: ')
        logger.info('Oracle Created...')

        logger.info('Computing LLOE - Phase 2...')
        # second phase for embedding point by point update
        second_phase_embeddings_index, \
        second_phase_embeddings, time_second_phase = second_phase(my_oracle=my_oracle,
                                                                  non_embedded_indices=non_embedded_indices,
                                                                  embedded_indices=embedded_indices,
                                                                  first_phase_embedded_points=embedded_points,
                                                                  dim=embedding_dimension,
                                                                  lr=phase2_learning_rate, logger=logger)

        # combine the first phase and second phase points and index
        final_embedding = np.zeros((vec_data.shape[0], embedding_dimension))
        # phase 1 points
        final_embedding[embedded_indices] = embedded_points
        # second phase points
        final_embedding[second_phase_embeddings_index] = second_phase_embeddings
        time_taken = time_first_phase + time_second_phase

        logger.info('Size of Dataset: ' + str(vec_data.shape[0]))
        logger.info('Size of First Phase Indices: ' + str(len(embedded_indices)))
        logger.info('Size of Second Phase Indices: ' + str(len(second_phase_embeddings_index)))

        # Evaluation
        logger.info('Evaluation of the Complete Embedding Dataset: ')
        random_trip_indices = gen_triplet_indices(n=vec_data.shape[0], num_trips=number_of_test_triplets)
        test_triplet_data = gen_triplet_data(data=vec_data, random_triplet_indices=random_trip_indices, batch_size=1000)

        test_error, embedding_error_list = triplet_error(final_embedding, test_triplet_data)
        procrustes_error = procrustes_disparity(vec_data, final_embedding)
        knn_error_ord_emb, knn_error_true_emb = knn_classification_error(final_embedding, vec_data, labels)

        # sample points for tsne visualization
        subsample = np.random.permutation(n_points)[0:500]
        x = final_embedding[subsample, :]
        sub_labels = labels[subsample]

        x_embedded = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(x)
        fig, ax = plt.subplots(1, 1)

        ax.scatter(x_embedded[:, 0], x_embedded[:, 1], s=3, c=sub_labels)
        fig.savefig(os.path.join(log_dir, experiment_name + '.png'))

        logger.info('Name of Experiment: ' + experiment_name)
        logger.info('Epochs: ' + str(epochs))
        logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
        logger.info('Test Error: ' + str(test_error))
        logger.info('Procrustes Disparity: ' + str(procrustes_error))
        logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
        logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

        results = {'test_error': test_error, 'procrustes': procrustes_error, 'knn_true': knn_error_true_emb,
                   'knn_ord_emb': knn_error_ord_emb, 'labels': labels,
                   'ordinal_embedding': final_embedding, 'time_taken': time_taken}
        joblib.dump(results, os.path.join(log_dir, experiment_name + '.pkl'))
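
This two-phase example reads additional keys from optimizer_params. A sketch of that sub-dict, with illustrative values:

example_optimizer_params = {
    'phase1_learning_rate': 0.1,   # SOE learning rate for the landmark phase
    'phase2_learning_rate': 0.01,  # per-point learning rate for the second phase
    'num_landmarks': 0.1,          # fraction of the points used as landmarks (capped at 100)
    'subset_size': 10,             # multiplier on the number of landmarks
    'target_loss': 0.1,            # early-stopping loss for the first phase
}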
Example #4
from collections import OrderedDict, defaultdict


def main(args):
    config = load_config(args.config_path)
    algorithm = config['algorithm']
    error_change_threshold = config['error_change_threshold']
    input_dim_range = config['tradeoff_set']['input_dimension']
    output_dimensions_range = config['tradeoff_set']['output_dimension']
    nmb_points_range = config['tradeoff_set']['number_of_points']
    batch_size_range = config['tradeoff_set']['batch_size']
    learning_rate_range = config['tradeoff_set']['learning_rate']
    triplet_multiplier_range = config['tradeoff_set']['triplets_multiplier']

    if args.data_set == 'not_selected':
        dataset_name = config['dataset_selected']
    else:
        dataset_name = args.data_set

    input_equals_output = config['tradeoff_set'].get('input_equals_output', False)

    epochs = config['nb_epochs']
    optimizer = config['optimizer']
    n_test_triplets = config['n_test_triplets']

    log_dir = config['log']['path']

    separator = '_'
    experiment_name = algorithm + \
                      '_data_' + dataset_name + \
                      '_input_dim_' + separator.join([str(i) for i in input_dim_range]) + \
                      '_output_dim_' + separator.join([str(i) for i in output_dimensions_range]) + \
                      '_n_pts_' + separator.join([str(i) for i in nmb_points_range]) + \
                      '_bs_' + separator.join([str(i) for i in batch_size_range]) + \
                      '_change_criterion_' + str(error_change_threshold) + \
                      '_lr_' + separator.join([str(i) for i in learning_rate_range]) + \
                      '_triplet_num_' + separator.join([str(i) for i in triplet_multiplier_range]) + \
                      '_ep_' + str(epochs)
    # create a log directory if it does not exist
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path,
                                           level=logging.INFO)
    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Error Change Threshold: ' + str(error_change_threshold))
    logger.info('Epochs: ' + str(epochs))

    tradeoff_results = defaultdict(dict)

    order = [
        'Input Dimension', 'Output Dimension', 'Number of Points',
        'Batch Size', 'Learning Rate', 'Triplet Multiplier',
        [
            'Train Error', 'Test Error', 'Procrustes Error', 'Knn Orig Error',
            'Knn Ordinal Error', 'Time', 'Embedding', 'Labels',
            'Train Triplets'
        ]
    ]
    experiment_range = OrderedDict({
        'input_dim': input_dim_range,
        'output_dim': output_dimensions_range,
        'number_of_points': nmb_points_range,
        'batch_size': batch_size_range,
        'learning_rate': learning_rate_range,
        'triplet_multiplier': triplet_multiplier_range
    })

    for input_dim_index, input_dim in enumerate(input_dim_range):
        for dimensions_index, embedding_dimension in enumerate(output_dimensions_range):
            for subset_index, nmb_points in enumerate(nmb_points_range):
                for batch_size_index, batch_size in enumerate(batch_size_range):
                    for lr_index, learning_rate in enumerate(learning_rate_range):
                        for trip_mindex, triplet_multiplier in enumerate(triplet_multiplier_range):

                            if not input_equals_output or input_dim == embedding_dimension:
                                logger.info('Learning Rate: ' + str(learning_rate))
                                logger.info('Number of Points: ' + str(nmb_points))
                                logger.info('Input Dimension: ' + str(input_dim))
                                logger.info('Output Dimension: ' + str(embedding_dimension))
                                logger.info('Number of Test Triplets: ' + str(n_test_triplets))
                                logger.info('Triplet Multiplier: ' + str(triplet_multiplier))
                                logger.info('Batch Size: ' + str(batch_size))

                                embedding, train_triplets, labels, train_error, test_error, \
                                procrustes_error, knn_orig, knn_embed, time_taken, \
                                loss_history, triplet_error_history, time_history = run_method(
                                    config, dataset_name, algorithm, nmb_points, input_dim,
                                    embedding_dimension, learning_rate, batch_size,
                                    triplet_multiplier, optimizer, epochs,
                                    n_test_triplets, logger, error_change_threshold)

                                # store every metric under the shared index tuple; the last
                                # tuple component selects the metric (see `order` above)
                                key = (input_dim_index, dimensions_index, subset_index,
                                       batch_size_index, lr_index, trip_mindex)
                                metrics = [train_error, test_error, procrustes_error,
                                           knn_orig, knn_embed, time_taken, embedding,
                                           labels, loss_history, triplet_error_history,
                                           time_history]
                                for metric_index, metric in enumerate(metrics):
                                    tradeoff_results[key + (metric_index,)] = metric
                                # train_triplets is not stored in tradeoff_results

    for input_dim_index, input_dim in enumerate(input_dim_range):
        for dimensions_index, embedding_dimension in enumerate(output_dimensions_range):
            for subset_index, nmb_points in enumerate(nmb_points_range):
                for batch_size_index, batch_size in enumerate(batch_size_range):
                    for lr_index, learning_rate in enumerate(learning_rate_range):
                        for trip_mindex, triplet_multiplier in enumerate(triplet_multiplier_range):
                            if not input_equals_output or input_dim == embedding_dimension:
                                key = (input_dim_index, dimensions_index, subset_index,
                                       batch_size_index, lr_index, trip_mindex)
                                logger.info('Input Dimension ' + str(input_dim) +
                                            ' Output Dimension ' + str(embedding_dimension) +
                                            ' Number of Points ' + str(nmb_points) +
                                            ' Batch Size ' + str(batch_size) +
                                            ' Learning Rate ' + str(learning_rate) +
                                            ' Triplet Multiplier ' + str(triplet_multiplier))
                                logger.info(' Train Error ' + str(tradeoff_results[key + (0,)]) +
                                            ' Test Error ' + str(tradeoff_results[key + (1,)]))
                                logger.info(' Procrustes Error ' + str(tradeoff_results[key + (2,)]))
                                logger.info(' kNN original Emb Loss ' + str(tradeoff_results[key + (3,)]) +
                                            ' kNN on Ordinal Emb Loss ' + str(tradeoff_results[key + (4,)]))
                                logger.info(' Time ' + str(tradeoff_results[key + (5,)]))
                                logger.info('-' * 20)
    data_dump = [order, experiment_range, tradeoff_results]
    joblib.dump(data_dump, os.path.join(log_dir, experiment_name + '.pkl'))
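
The dump can be read back with joblib. A minimal sketch, assuming log_dir and experiment_name as in main above; index tuples mirror the loop indices, with the last component selecting the metric:

order, experiment_range, tradeoff_results = joblib.load(
    os.path.join(log_dir, experiment_name + '.pkl'))
# e.g. the test error (metric index 1) of the first configuration:
first_test_error = tradeoff_results[0, 0, 0, 0, 0, 0, 1]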
Example #5
# Uses the same project-local helpers as Examples #2 and #3 (first_phase_soe,
# second_phase, Oracle, gen_triplet_indices, gen_triplet_data, triplet_error).
from itertools import product


def run_hyper_search(config):

    dataset_name = config['dataset_selected']
    batch_size = config['batch_size']
    epochs = config['nb_epochs']
    input_dim = config['input_dimension']
    n_samples = config['number_of_points']
    number_of_test_triplets = config['n_test_triplets']
    log_dir = config['log']['path']

    phase1_learning_rate_range = config['hyper_search']['phase1_learning_rate']
    phase2_learning_rate_range = config['hyper_search']['phase2_learning_rate']
    dimensions_range = config['hyper_search']['output_dimension']

    separator = '_'
    experiment_name = 'lloe_full_hyper_search_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_num_test_trips_' + str(number_of_test_triplets) + \
                      '_output_dim_' + separator.join([str(i) for i in dimensions_range]) + \
                      '_phase1lr_' + separator.join([str(i) for i in phase1_learning_rate_range]) + \
                      '_phase2lr_' + separator.join([str(i) for i in phase2_learning_rate_range]) + \
                      '_bs_' + str(batch_size)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logging_path = os.path.join(log_dir, experiment_name + '.log')
    logger = logging_util.my_custom_logger(logger_name=logging_path, level=logging.INFO)
    logger.info('Name of Experiment: ' + experiment_name)
    logger.info('Logging Path: ' + logging_path)
    logger.info('Dataset Name: ' + dataset_name)
    logger.info('Epochs: ' + str(epochs))

    best_params_test = {}
    all_results = {}

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    vec_data, labels = select_dataset(dataset_name, n_samples=n_samples, input_dim=input_dim) # input_dim is only argument for uniform. Ignored otherwise

    n_samples = vec_data.shape[0] # do not remove
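    # fixed first-phase budget: 10% of the points serve as landmarks, and each
    # subset is 10x the number of landmarks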
    number_of_landmarks = int(0.1 * n_samples)
    subset_size = 10 * number_of_landmarks

    for emb_dim in dimensions_range:
        all_results[emb_dim] = {}
        best_test_error = 1

        logger.info('Testing on: ' + dataset_name + '. Embedding dimension is ' + str(emb_dim))
        logger.info(' ')

        for (phase1_learning_rate, phase2_learning_rate) \
                in product(phase1_learning_rate_range, phase2_learning_rate_range):

                logger.info(10*'-'+' New parameters' + 10*'-')
                logger.info('phase1_Learning Rate: ' + str(phase1_learning_rate))
                logger.info('phase2_Learning Rate: ' + str(phase2_learning_rate))
                logger.info('Number of Points: ' + str(n_samples))
                logger.info('Input Dimension: ' + str(input_dim))
                logger.info('Output Dimension: ' + str(emb_dim))
                logger.info('Number of Test Triplets: ' + str(number_of_test_triplets))
                logger.info('Batch Size: ' + str(batch_size))

                logger.info('Computing LOE_FULL...')

                # first phase of the algorithm
                landmarks, first_phase_indices, \
                first_phase_subset_size, first_phase_reconstruction, \
                first_phase_loss, first_phase_triplet_error, time_first_phase = first_phase_soe(
                    num_landmarks=number_of_landmarks,
                    subset_size=subset_size,
                    data=vec_data, dataset_size=n_samples,
                    embedding_dim=emb_dim, epochs=epochs,
                    target_loss=0.1,
                    first_phase_lr=phase1_learning_rate,
                    device=device,
                    batch_size=batch_size, logger=logger)
                logger.info('First Phase Loss: ' + str(first_phase_loss))
                logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))
                logger.info('First Phase Number of Landmarks: ' + str(landmarks.shape))
                logger.info('First Phase Indices Number: ' + str(len(first_phase_indices)))
                logger.info('First Phase Reconstruction Size: ' + str(first_phase_reconstruction.shape))

                embedded_indices = first_phase_indices
                embedded_points = first_phase_reconstruction
                non_embedded_indices = list(set(range(vec_data.shape[0])).difference(set(embedded_indices)))
                non_embedded_points = vec_data[non_embedded_indices, :]

                my_oracle = Oracle(data=vec_data)
                logger.info('Second Phase: ')
                logger.info('Oracle Created...')

                logger.info('Computing LLOE - Phase 2...')
                # second phase for embedding point by point update
                second_phase_embeddings_index, \
                second_phase_embeddings, time_second_phase = second_phase(my_oracle=my_oracle,
                                                                          non_embedded_indices=non_embedded_indices,
                                                                          embedded_indices=embedded_indices,
                                                                          first_phase_embedded_points=embedded_points,
                                                                          dim=emb_dim,
                                                                          lr=phase2_learning_rate, logger=logger)

                # combine the first phase and second phase points and index
                final_embedding = np.zeros((vec_data.shape[0], emb_dim))
                # phase 1 points
                final_embedding[first_phase_indices] = first_phase_reconstruction
                # second phase points
                final_embedding[second_phase_embeddings_index] = second_phase_embeddings

                logger.info('Size of First Phase Indices: ' + str(len(first_phase_indices)))
                logger.info('Size of Second Phase Indices: ' + str(len(second_phase_embeddings_index)))
                logger.info('First Phase Triplet Error: ' + str(first_phase_triplet_error))

                # Evaluation
                logger.info('Evaluation of the Complete Embedding Dataset: ')
                random_trip_indices = gen_triplet_indices(n=vec_data.shape[0], num_trips=number_of_test_triplets)
                test_triplet_data = gen_triplet_data(data=vec_data, random_triplet_indices=random_trip_indices, batch_size=1000)

                test_error, embedding_error_list = triplet_error(final_embedding, test_triplet_data)
                time_taken = time_first_phase + time_second_phase

                logger.info('Time Taken: ' + str(time_taken) + ' seconds.')
                logger.info('Test Error: ' + str(test_error))
                #logger.info('Procrustes Disparity: ' + str(procrustes_error))
                #logger.info('kNN Classification Error on ground-truth: ' + str(knn_error_true_emb))
                #logger.info('kNN Classification Error on embedding: ' + str(knn_error_ord_emb))

                results = {'test_error': test_error, 'last_embedding': final_embedding}

                all_results[emb_dim].update({(phase1_learning_rate, phase2_learning_rate): results})

                if test_error < best_test_error:
                    best_params_test[emb_dim] = {'phase1_learning_rate': phase1_learning_rate, 'phase2_learning_rate': phase2_learning_rate,
                                              'error': test_error}
                    best_test_error = test_error

        result_name = 'lloe_full_hypersearch_' + \
                      'data_' + dataset_name + \
                      '_input_dim_' + str(input_dim) + \
                      '_n_pts_' + str(n_samples) + \
                      '_output_dim_' + str(emb_dim) + \
                      '_bs_' + str(batch_size)
        # store labels with the per-dimension results so they end up in the dump
        all_results[emb_dim]['labels'] = labels
        joblib.dump(all_results[emb_dim], os.path.join(log_dir, result_name + '.pkl'))


    # print all results again
    logger.info(10 * '-' + ' ALL RESULTS ' + 10 * '-')
    for emb_dim in dimensions_range:
        results = all_results[emb_dim]
        logger.info('Results for emb dimension ' + str(emb_dim))
        for (phase1_learning_rate, phase2_learning_rate) \
                in product(phase1_learning_rate_range, phase2_learning_rate_range):
            logger.info('phase1_learning_rate ' + str(phase1_learning_rate) + ' phase2_learning_rate ' + str(phase2_learning_rate)
                        + ' -- test error: ' + str(results[(phase1_learning_rate, phase2_learning_rate)]['test_error']))

    # print best parameter settings
    for emb_dim in dimensions_range:
        logger.info('Best Parameters for emb dimension ' + str(emb_dim))
        best_on_test = best_params_test[emb_dim]
        logger.info('achieved ' + str(best_on_test['error']) + ' test error with phase1_learning_rate: ' + str(best_on_test['phase1_learning_rate'])
                    + ' phase2_learning_rate: ' + str(best_on_test['phase2_learning_rate']))
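
Each per-dimension dump written above can be inspected the same way. A minimal sketch, assuming the result_name built in the loop; the 'labels' entry is skipped because it is not a per-parameter result:

results_for_dim = joblib.load(os.path.join(log_dir, result_name + '.pkl'))
for key, res in results_for_dim.items():
    if key == 'labels':
        continue
    phase1_lr, phase2_lr = key
    print(phase1_lr, phase2_lr, res['test_error'])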