Example #1
def compute_similarity_matrix_ngram_parallel(
    *,
    repr_vocab,
    full_vocab,
    processes,
    n,
    ngram_to_index,
) -> np.ndarray:
    """

    :param repr_vocab:
    :param full_vocab:
    :param processes:
    :param n:
    :param ngram_to_index:
    :return:
    """
    from ratvec.similarity import n_gram_sim_list

    secho(
        f"Splitting data for computing similarities in {processes} processes")
    elements = get_ngram_elements(
        full_vocab=full_vocab,
        repr_vocab=repr_vocab,
        ngram_to_index=ngram_to_index,
        n=n,
    )
    compute_similarities_on_splits = partial(n_gram_sim_list, n_ngram=n)
    return _calculate_similarity_matrix_parallel(
        full_vocab=full_vocab,
        repr_vocab=repr_vocab,
        processes=processes,
        elements=elements,
        compute_similarities_on_splits=compute_similarities_on_splits,
    )
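For orientation, here is a minimal usage sketch (not part of the original code): the toy vocabularies and the choice of n are hypothetical, and ngram_to_index is built the same way Example #10 builds it, from the Cartesian product of the alphabet.

# Hedged usage sketch with made-up inputs; only the call signature and the
# shape of the result follow the function above.
import itertools as itt

full_vocab = ["MKT", "MKV", "GAV"]   # hypothetical sequences
repr_vocab = ["MKT", "GAV"]          # hypothetical representatives
n = 2

alphabet = set(itt.chain.from_iterable(repr_vocab))
alphabet.add(" ")
ngram_to_index = {
    "".join(t): i
    for i, t in enumerate(itt.product(alphabet, repeat=n))
}

similarity_matrix = compute_similarity_matrix_ngram_parallel(
    repr_vocab=repr_vocab,
    full_vocab=full_vocab,
    processes=2,
    n=n,
    ngram_to_index=ngram_to_index,
)
# similarity_matrix.shape == (len(full_vocab), len(repr_vocab))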
Example #2
def _run_evaluation(
        *,
        y,
        save_dataset,
        family_labels,
        n_components,
        n_iterations,
        max_neighbors,
        pool,
        subdirectory,
) -> None:
    kpca = os.path.join(subdirectory, 'kpca.npy')
    secho(f'Loading embeddings file: {kpca}')
    x = np.load(kpca)

    balanced_datasets, counts = make_balanced(x, y)
    if save_dataset:
        family_labels_balanced_path = os.path.join(subdirectory, family_labels.name + "_balanced")
        with open(family_labels_balanced_path, "wb") as file:
            pickle.dump((balanced_datasets, counts), file)

    _sub_run_evaluation(
        balanced_datasets=balanced_datasets,
        counts=counts,
        n_components=n_components,
        n_iterations=n_iterations,
        max_neighbors=max_neighbors,
        pool=pool,
        subdirectory=subdirectory,
    )
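A small sketch of reading the pickle written above: the path is made up, and the (balanced_datasets, counts) structure follows the dump above together with Example #7, where each entry of balanced_datasets is an (X, y) pair.

import pickle

# Hypothetical path; the real one is family_labels.name + "_balanced".
with open("families.txt_balanced", "rb") as file:
    balanced_datasets, counts = pickle.load(file)

for (x_balanced, y_balanced), count in zip(balanced_datasets, counts):
    print(x_balanced.shape, len(y_balanced), count)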
Example #3
def main(
        family_labels,
        directory,
        n_components: int,
        max_neighbors: int,
        n_iterations: int,
        no_save_dataset: bool,
        load_dataset: bool,
) -> None:
    """Evaluate KPCA embeddings."""
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        if load_dataset:
            click.echo('Loading balanced datasets')
            subdirectory = os.path.dirname(family_labels.name)
            with open(family_labels.name + "_balanced", "rb") as file:
                balanced_datasets, counts = pickle.load(file)
                _sub_run_evaluation(
                    balanced_datasets=balanced_datasets,
                    counts=counts,
                    n_components=n_components,
                    n_iterations=n_iterations,
                    max_neighbors=max_neighbors,
                    pool=pool,
                    subdirectory=subdirectory,
                )
        else:
            secho(f'Loading family labels file: {family_labels}')
            y = np.array([
                l[:-1]
                for l in family_labels
            ])

            optim_dir = os.path.join(directory, 'optim')
            os.makedirs(optim_dir, exist_ok=True)

            secho(f'Dynamically generating balanced datasets from {optim_dir}')
            for subdirectory_name in os.listdir(optim_dir):
                subdirectory = os.path.join(optim_dir, subdirectory_name)
                if not os.path.isdir(subdirectory):
                    continue
                secho(f'Handling {subdirectory}')
                _run_evaluation(
                    y=y,
                    save_dataset=(not no_save_dataset),
                    family_labels=family_labels,
                    n_components=n_components,
                    n_iterations=n_iterations,
                    max_neighbors=max_neighbors,
                    pool=pool,
                    subdirectory=subdirectory,
                )

    secho(f"done. Enjoy your {make_ratvec(3)}")
Example #4
def infer(
    full_sim_matrix_file: str,
    repr_sim_matrix_file: str,
    output: str,
    n_components: int,
    sim: str,
    use_gpu: bool,
):
    """Load pre-computed similarity matrix."""
    secho(f"Loading the repr similarity matrix from {repr_sim_matrix_file}")
    repr_similarity_matrix = np.load(repr_sim_matrix_file)
    secho(f"Loading the full similarity matrix from {full_sim_matrix_file}")
    full_similarity_matrix = np.load(full_sim_matrix_file)
    optim_folder = os.path.join(output, 'optim')
    os.makedirs(optim_folder, exist_ok=True)

    optimize_projections(
        output=optim_folder,
        repr_similarity_matrix=repr_similarity_matrix,
        full_similarity_matrix=full_similarity_matrix,
        n_components=n_components,
        similarity_type=sim,
        use_gpu=use_gpu,
    )

    if use_gpu:  # only shut down after all loops have used this function
        import cudamat as cm
        cm.shutdown()

    secho(f"done. Enjoy your {make_ratvec(3)}")
Example #5
def _calculate_similarity_matrix_parallel(
    *,
    full_vocab,
    repr_vocab,
    processes,
    elements,
    compute_similarities_on_splits: Callable,
) -> np.ndarray:
    full_vocab_len = len(full_vocab)
    repr_vocab_len = len(repr_vocab)
    split_size = ceil((full_vocab_len * repr_vocab_len) / processes)
    splits: List[List[Any]] = compute_splits(
        elements=elements,
        split_size=split_size,
        processes=processes,
    )

    secho(f'Computing similarities in {processes} processes')
    with multiprocessing.Pool(processes=processes) as pool:
        res = pool.map(compute_similarities_on_splits, splits)
        res = np.hstack(res)

    return res.reshape(full_vocab_len, repr_vocab_len)
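The pooled results are concatenated with np.hstack and reshaped row-major, so entry [i, j] is the similarity between full_vocab[i] and repr_vocab[j], assuming the splits enumerate pairs in full-vocabulary-major order. A toy sketch of just the reshape step:

import numpy as np

# Toy numbers only; the flat list is assumed to be ordered full-vocab-major.
full_vocab_len, repr_vocab_len = 3, 2
flat = np.hstack([
    [0.9, 0.1],  # full_vocab[0] vs repr_vocab[0], repr_vocab[1]
    [0.2, 0.8],  # full_vocab[1] ...
    [0.5, 0.5],  # full_vocab[2] ...
])
matrix = flat.reshape(full_vocab_len, repr_vocab_len)
assert matrix[1, 1] == 0.8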
Example #6
def main(directory: str, force: bool):
    """Generate the protein vocabularies."""
    # Ensure data directory exists
    make_data_directory()

    sequences_path = os.path.join(directory, PROTEIN_FAMILY_SEQUENCES)
    metadata_path = os.path.join(directory, PROTEIN_FAMILY_METADATA)

    if not force and os.path.isfile(sequences_path) and os.path.isfile(metadata_path):
        secho(f"Files are already existing in {directory}. Use --force to re-compute.")
        sys.exit(0)

    secho(f"Downloading files from the internet. Please be patient.")
    # Download the protein files
    download_protein_files(directory)

    generate_protein_vocabularies(directory, directory)
    secho(f"done. Enjoy your {make_ratvec(3)}")
Example #7
def main(
    directory,
    n_components: int,
    max_neighbors: int,
    n_iterations: int,
) -> None:
    """Evaluate KPCA embeddings."""
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:

        optim_dir = os.path.join(directory, 'optim')
        os.makedirs(optim_dir, exist_ok=True)

        for subdirectory_name in os.listdir(optim_dir):
            subdirectory = os.path.join(optim_dir, subdirectory_name)
            if not os.path.isdir(subdirectory):
                continue
            secho(f'Handling {subdirectory}')

            kpca = os.path.join(subdirectory, 'kpca.npy')
            secho(f'Loading embeddings file: {kpca}')
            X = np.load(kpca)
            n_pos_seqs = int(X.shape[0] / 2)
            n_neg_seqs = n_pos_seqs
            y = np.array(n_pos_seqs * [True] + n_neg_seqs * [False])

            balanced_datasets = [(X, y)]
            counts = [len(y)]

            _sub_run_evaluation(
                balanced_datasets=balanced_datasets,
                counts=counts,
                n_components=n_components,
                n_iterations=n_iterations,
                max_neighbors=max_neighbors,
                pool=pool,
                subdirectory=subdirectory,
            )

    secho(f"done. Enjoy your {make_ratvec(3)}")
Example #8
def generate_protein_vocabularies(source_directory: str,
                                  output_directory: str) -> None:
    """Use the data in the source directory to pre-compute files for RatVec."""
    metadata_path = os.path.join(source_directory, PROTEIN_FAMILY_METADATA)
    seq_path = os.path.join(source_directory, PROTEIN_FAMILY_SEQUENCES)
    vocab_file_path = os.path.join(source_directory, "X.txt")
    labels_file_path = os.path.join(source_directory, "Y.txt")

    secho(f'Reading labels from {metadata_path}', fg="cyan")
    with codecs.open(metadata_path) as file:
        _ = next(file)  # skip the header
        # Parse each line to get the protein name
        protein_names = np.array([line[:-1].split("\t")[-2] for line in file])

    number_of_proteins = len(protein_names)

    # Ensure the number of proteins is 324018
    assert number_of_proteins == 324018, 'Wrong number of protein sequences'

    # Get a list from 0 to number of proteins and shuffle it
    idx = list(range(number_of_proteins))
    shuffle(idx)

    protein_names = protein_names[idx]

    assert len(set(protein_names)) == 7027, 'Wrong number of protein families'

    secho(f'Reading sequences from {seq_path}', fg="cyan")
    with codecs.open(seq_path) as file:
        _ = next(file)  # Skip the header
        seqs = np.array([line[:-1] for line in file])

    seqs = seqs[idx]

    # Use of a dictionary to remove duplicated sequences
    secho('Removing duplicates', fg="cyan")

    secho(
        f'Number of sequences before removing duplicates {number_of_proteins}',
        fg="cyan")

    dataset = {seqs[i]: protein_names[i] for i in idx}

    secho(f'Number of sequences after removing duplicates {len(dataset)}',
          fg="cyan")

    # Free up memory
    del seqs
    del protein_names

    # Get the keys and values of the cleaned up dictionary with no duplicates
    x, y = np.array(list(dataset.keys())), np.array(list(dataset.values()))

    secho(f'Saving vocabulary to {vocab_file_path}', fg="cyan")
    with codecs.open(vocab_file_path,
                     "w") as file:  # Store the sequences into X.txt
        file.write("\n".join(x))

    secho(f'Saving labels to {labels_file_path}', fg="cyan")
    with codecs.open(labels_file_path, "w") as file:
        file.write("\n".join(y))

    # Make a counter to get the representative vocabularies
    label_counter = Counter(y)
    d = np.array([(key, label_counter[key]) for key in label_counter])
    d_sorted = sorted(d, key=lambda tup: float(tup[1]))

    preset_lengths = 100, 200, 500, 1000, 2000, 3000, 4000
    length_to_subdirectory = {
        length: os.path.join(output_directory, str(length))
        for length in preset_lengths
    }
    length_to_subdirectory[len(set(y))] = os.path.join(output_directory,
                                                       'full')

    with open(os.path.join(output_directory, 'manifest.json'), 'w') as file:
        json.dump(
            [
                dict(length=length, subdirectory=subdirectory)
                for length, subdirectory in length_to_subdirectory.items()
            ],
            file,
            indent=2,
        )

    secho(
        f'Processing for lengths: {", ".join(map(str, sorted(length_to_subdirectory)))}'
    )
    for length, subdirectory in length_to_subdirectory.items():
        os.makedirs(subdirectory, exist_ok=True)
        secho(f'Processing top {length} in {subdirectory}', fg="cyan")

        top_labels = [t[0] for t in d_sorted[-length:]]
        idx_top = [l in top_labels for l in y]

        y_top = y[idx_top]
        x_top = x[idx_top]

        top_n_labels_path = os.path.join(subdirectory, "labels.txt")
        with codecs.open(top_n_labels_path, "w") as file:
            file.write("\n".join(y_top))

        top_n_vocab_path = os.path.join(subdirectory, "full_vocab.txt")
        with codecs.open(top_n_vocab_path, "w") as file:
            file.write("\n".join(x_top))

        single_family_representatives = []
        for family in top_labels:
            family_idx = np.where(y == family)
            # FIXME why does this look for the index, then just get the value? Why not just do
            #  min(x[family_idx], key=len)
            repr_idx = np.argmin([len(s) for s in x[family_idx]])
            single_family_representatives.append(x[family_idx][repr_idx])

        repr_top_n_path = os.path.join(subdirectory, "repr_vocab.txt")
        with codecs.open(repr_top_n_path, "w") as file:
            file.write("\n".join(single_family_representatives))
Example #9
def optimize_projections(
    *,
    output: str,
    repr_similarity_matrix,
    full_similarity_matrix,
    n_components: int,
    similarity_type: str,
    use_gpu: bool,
) -> None:
    """

    :param output: The output folder
    :param repr_similarity_matrix: A square matrix with dimensions |repr| x |repr|
    :param full_similarity_matrix: A rectangular matrix with dimensions |full| x |repr|
    :param n_components:
    :return:
    """
    khc = ((kernel_name, KERNEL_TO_PROJECTION[kernel_name], hyperparam)
           for kernel_name, hyperparams in kernels.items()
           for hyperparam in hyperparams)

    for kernel_name, project_with_kernel, hyperparam in khc:
        # Make output folder for the optimization with this kernel/hyper-parameter pair
        param_folder = os.path.join(output, f'{kernel_name}_{hyperparam}')
        os.makedirs(param_folder, exist_ok=True)

        secho(
            f"({kernel_name}/{hyperparam}) calculating normalized/symmetric kernel matrix"
        )
        repr_kernel_matrix = project_with_kernel(repr_similarity_matrix,
                                                 hyperparam)
        repr_kernel_matrix_normalized = normalize_kernel_matrix(
            repr_kernel_matrix)

        secho(
            f"({kernel_name}/{hyperparam}) solving eigenvector/eigenvalues problem"
        )
        eigenvalues, eigenvectors = eigh(repr_kernel_matrix_normalized)

        # Calculate alphas
        repr_alphas = np.column_stack(
            [eigenvectors[:, -i] for i in range(1, n_components + 1)])
        # Save Alphas
        _alphas_path = os.path.join(param_folder, "alphas.p")
        secho(
            f"({kernel_name}/{hyperparam}) outputting alphas to {_alphas_path}"
        )
        with open(_alphas_path, "wb") as file:
            pickle.dump(repr_alphas, file)

        # Calculate lambdas
        repr_lambdas = [eigenvalues[-i] for i in range(1, n_components + 1)]
        # Save lambdas
        _lambdas_path = os.path.join(param_folder, "lambdas.p")
        secho(
            f"({kernel_name}/{hyperparam}) outputting lambdas to {_lambdas_path}"
        )
        with open(_lambdas_path, 'wb') as file:
            pickle.dump(repr_lambdas, file)

        secho(
            f"({kernel_name}/{hyperparam}) projecting known vocabulary to KPCA embeddings"
        )
        repr_projection_matrix = repr_alphas / repr_lambdas

        # Calculate KPCA matrix
        if similarity_type == "ngram_intersec":  # There is no additional kernel function on top of the similarity function
            kpca_matrix = project_full_vocab_linear(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
            )
        elif use_gpu:
            kpca_matrix = project_words_gpu(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
                kernel_name=kernel_name,
                hyperparam=hyperparam,
            )
        else:
            kpca_matrix = project_similarity_matrix(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
                kernel_name=kernel_name,
                hyperparam=hyperparam,
            )

        # Save KPCA matrix
        _kpca_path = os.path.join(param_folder, "kpca.npy")
        secho(
            f"({kernel_name}/{hyperparam}) outputting KPCA matrix to {_kpca_path}"
        )
        np.save(_kpca_path, kpca_matrix)
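In matrix form, the projection above is the standard kernel-PCA step: an elementwise kernel is applied to the full-vs-repr similarity matrix (except in the ngram_intersec case, where the similarities are used directly) and the result is multiplied by alphas / lambdas. The sketch below uses made-up matrices and a stand-in exponential kernel, since KERNEL_TO_PROJECTION and project_similarity_matrix are defined elsewhere in RatVec.

import numpy as np

rng = np.random.default_rng(0)
full_similarity_matrix = rng.random((5, 3))  # |full| x |repr|, made up
repr_alphas = rng.random((3, 2))             # |repr| x n_components
repr_lambdas = np.array([2.0, 1.5])          # leading eigenvalues

projection_matrix = repr_alphas / repr_lambdas

# Stand-in elementwise kernel; the real ones come from KERNEL_TO_PROJECTION.
hyperparam = 0.1
kernel_matrix = np.exp(hyperparam * full_similarity_matrix)

kpca_matrix = kernel_matrix @ projection_matrix            # |full| x n_components
linear_case = full_similarity_matrix @ projection_matrix   # ngram_intersec case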
Example #10
def main(
    full_vocab_file: str,
    repr_vocab_file: str,
    output: str,
    n_components: int,
    sim: str,
    sim_alignment_matrix: str,
    n_ngram: int,
    use_gpu: bool,
    processes: int,
) -> None:
    """Compute KPCA embeddings on a given data set."""
    n = n_ngram  # meh
    output = os.path.abspath(output)
    os.makedirs(output, exist_ok=True)

    full_vocab = _preprocess_vocab_file(full_vocab_file)

    if repr_vocab_file is None:
        repr_vocab = full_vocab
    else:
        repr_vocab = _preprocess_vocab_file(repr_vocab_file)

    params_path = os.path.join(output, 'training_manifest.json')
    secho(f'Outputting training information to {params_path}')
    manifest = dict(
        sim=sim,
        n=n,
        len_full_vocab=len(full_vocab),
        len_repr_vocab=len(repr_vocab),
        kernels=kernels,
    )
    with open(params_path, 'w') as file:
        json.dump(manifest, file, sort_keys=True, indent=2)

    if use_gpu:
        import cudamat as cm
        cm.cublas_init()

    if sim == 'global-alignment':
        secho(
            f'Computing global alignment similarities with {sim_alignment_matrix}'
        )
        repr_similarity_matrix = calculate_global_alignment_similarity_matrix(
            full_vocab=repr_vocab,
            repr_vocab=repr_vocab,
            processes=processes,
            matrix=sim_alignment_matrix,
            tqdm_desc=f'{EMOJI} Computing self-similarity matrix for '
            f'repr vocab with global alignment ({sim_alignment_matrix})')
        full_similarity_matrix = calculate_global_alignment_similarity_matrix(
            full_vocab=full_vocab,
            repr_vocab=repr_vocab,
            processes=processes,
            matrix=sim_alignment_matrix,
            tqdm_desc=f'{EMOJI} Computing similarity matrix between '
            f'full/repr vocab with global alignment ({sim_alignment_matrix})')
    else:
        alphabet = set(itt.chain.from_iterable(repr_vocab))
        alphabet.add(" ")

        ngram_to_index = {
            ngram: i
            for i, ngram in enumerate(
                ["".join(t) for t in itt.product(alphabet, repeat=n)])
        }

        if sim == "ngram_intersec":
            secho(f'Computing n-gram sparse similarities with {sim}')
            repr_similarity_matrix = compute_similarity_matrix_ngram_sparse(
                full_vocab=repr_vocab,
                repr_vocab=repr_vocab,
                ngram_to_index=ngram_to_index,
                n=n,
            )
            full_similarity_matrix = compute_similarity_matrix_ngram_sparse(
                full_vocab=full_vocab,
                repr_vocab=repr_vocab,
                ngram_to_index=ngram_to_index,
                n=n,
            )
        else:  # sim == 'ngram_sim'
            secho(f'Computing n-gram similarities with {sim}')
            repr_similarity_matrix = compute_similarity_matrix_ngram_parallel(
                full_vocab=repr_vocab,
                repr_vocab=repr_vocab,
                n=n,
                ngram_to_index=ngram_to_index,
                processes=processes,  # Extra because this gets multi-processed
            )
            full_similarity_matrix = compute_similarity_matrix_ngram_parallel(
                full_vocab=full_vocab,
                repr_vocab=repr_vocab,
                n=n,
                ngram_to_index=ngram_to_index,
                processes=processes,  # Extra because this gets multi-processed
            )

    repr_similarity_matrix_path = os.path.join(output, "repr_similarity_matrix.npy")
    secho(
        f"Saving the repr similarity matrix for the full vocabulary to {repr_similarity_matrix_path}"
    )
    np.save(repr_similarity_matrix_path,
            repr_similarity_matrix,
            allow_pickle=False)

    full_similarity_matrix_path = os.path.join(output, "full_similarity_matrix.npy")
    secho(
        f"Saving the full similarity matrix for the full vocabulary to {full_similarity_matrix_path}"
    )
    np.save(full_similarity_matrix_path,
            full_similarity_matrix,
            allow_pickle=False)

    optim_folder = os.path.join(output, 'optim')
    os.makedirs(optim_folder, exist_ok=True)

    if n_components is None:
        n_components = int(0.5 + len(repr_vocab) * 2 / 3)

    optimize_projections(
        output=optim_folder,
        repr_similarity_matrix=repr_similarity_matrix,
        full_similarity_matrix=full_similarity_matrix,
        n_components=n_components,
        similarity_type=sim,
        use_gpu=use_gpu,
    )

    if use_gpu:  # only shut down after all loops have used this function
        import cudamat as cm
        cm.shutdown()

    secho(f"done. Enjoy your {make_ratvec(3)}")
Example #11
def _sub_run_evaluation(
    *,
    balanced_datasets,
    counts,
    n_components,
    n_iterations,
    max_neighbors,
    pool,
    subdirectory,
):
    with open(os.path.join(subdirectory, 'evaluation_params.json'),
              'w') as file:
        json.dump(
            dict(
                components=n_components,
                iterations=n_iterations,
                max_neighbors=max_neighbors,
            ),
            file,
            indent=2,
        )

    filt_counts = [family_size for family_size in counts if family_size >= 10]

    secho("Exploring different number of components")
    number_components_grid_search_results = {}
    number_components_low = 1
    number_components_high = int(n_components)
    it = tqdm(
        range(
            number_components_low,
            number_components_high,
            max(
                1,
                int(
                    np.floor((number_components_high - number_components_low) /
                             n_iterations))),
        ),
        desc=f'{EMOJI} Optimizing number of components',
    )
    it.write('Number Components\tMean CV Score')
    for reduced_n_components in it:
        n_neighbors = 1
        partial_eval_function = partial(
            score_overview,
            reduced_n_components,
            n_neighbors,
        )

        best_mean_score, _, _ = np.array(
            pool.starmap(partial_eval_function, balanced_datasets))[0]

        it.write(f"{reduced_n_components}\t{best_mean_score:.3f}")
        number_components_grid_search_results[
            reduced_n_components] = best_mean_score

    best_number_components = max(
        number_components_grid_search_results,
        key=number_components_grid_search_results.get,
    )
    best_result1 = number_components_grid_search_results[
        best_number_components]
    secho(
        f"Best at components={best_number_components}, score={best_result1:.3f}"
    )

    secho("Exploring different number of neighbors")
    number_neighbors_grid_search_results = {}

    it = tqdm(range(1, max_neighbors),
              desc=f'{EMOJI} Optimizing number of neighbors')
    for n_neighbors in it:
        partial_eval_function = partial(
            score_overview,
            best_number_components,
            n_neighbors,
        )
        best_mean_score, _, _ = np.array(
            pool.starmap(partial_eval_function, balanced_datasets))[0]

        it.write(f"{n_neighbors}\t{best_mean_score:.3f}\b")
        number_neighbors_grid_search_results[n_neighbors] = best_mean_score

    best_number_neighbors = max(number_neighbors_grid_search_results,
                                key=number_neighbors_grid_search_results.get)
    best_result2 = number_neighbors_grid_search_results[best_number_neighbors]
    secho(
        f"Best at neighbors={best_number_neighbors}, score={best_result2:.3f}")

    mean_score, pos_score, neg_score = score_overview(best_number_components,
                                                      best_number_neighbors,
                                                      balanced_datasets[0][0],
                                                      balanced_datasets[0][1])
    secho(
        f"10-fold-crossvalidation accuracy on positive examples={pos_score:.3f}"
    )
    secho(
        f"10-fold-crossvalidation accuracy on negative examples={neg_score:.3f}"
    )
    secho(f"Overall 10-fold-crossvalidation accuracy {mean_score:.3f}")

    with open(os.path.join(subdirectory, 'evaluation_results.json'),
              'w') as file:
        json.dump(
            {
                'number_components_grid_search': {
                    'best_number_components': best_number_components,
                    'results': number_components_grid_search_results,
                },
                'number_neighbors_grid_search': {
                    'best_number_neighbors': best_number_neighbors,
                    'results': number_neighbors_grid_search_results,
                },
            },
            file,
            indent=2,
        )
Example #12
def _sub_run_evaluation(
        *,
        balanced_datasets,
        counts,
        n_components,
        n_iterations,
        max_neighbors,
        pool,
        subdirectory,
):
    with open(os.path.join(subdirectory, 'evaluation_params.json'), 'w') as file:
        json.dump(
            dict(
                components=n_components,
                iterations=n_iterations,
                max_neighbors=max_neighbors,
            ),
            file,
            indent=2,
        )

    filt_counts = [
        family_size
        for family_size in counts
        if family_size >= 10
    ]

    secho("Exploring different number of components")
    number_components_grid_search_results = {}
    number_components_low = 1
    number_components_high = int(n_components)
    it = tqdm(
        range(
            number_components_low,
            number_components_high,
            max(1, int(np.floor((number_components_high - number_components_low) / n_iterations))),
        ),
        desc=f'{EMOJI} Optimizing number of components',
    )
    it.write('Number Components\tMean CV Score')
    for reduced_n_components in it:
        n_neighbors = 1
        partial_eval_function = partial(
            plos_cross_val_score,
            reduced_n_components,
            n_neighbors,
        )
        plos_scores = np.array(pool.starmap(partial_eval_function, balanced_datasets))
        weighted_score = np.dot(plos_scores, filt_counts) / np.sum(filt_counts)

        it.write(f"{reduced_n_components}\t{weighted_score:.3f}")
        number_components_grid_search_results[reduced_n_components] = weighted_score

    best_number_components = max(
        number_components_grid_search_results,
        key=number_components_grid_search_results.get,
    )
    best_result1 = number_components_grid_search_results[best_number_components]
    secho(f"Best at components={best_number_components}, score={best_result1:.3f}")

    secho("Exploring different number of neighbors")
    number_neighbors_grid_search_results = {}

    it = tqdm(range(1, max_neighbors), desc=f'{EMOJI} Optimizing number of neighbors')
    for n_neighbors in it:
        partial_eval_function = partial(
            plos_cross_val_score,
            best_number_components,
            n_neighbors,
        )
        plos_scores = np.array(pool.starmap(partial_eval_function, balanced_datasets))
        weighted_score = np.dot(plos_scores, filt_counts) / np.sum(filt_counts)

        it.write(f"{n_neighbors}\t{weighted_score:.3f}\b")
        number_neighbors_grid_search_results[n_neighbors] = weighted_score

    best_number_neighbors = max(number_neighbors_grid_search_results, key=number_neighbors_grid_search_results.get)
    best_result2 = number_neighbors_grid_search_results[best_number_neighbors]
    secho(f"Best at neighbors={best_number_neighbors}, score={best_result2:.3f}")

    with open(os.path.join(subdirectory, 'evaluation_results.json'), 'w') as file:
        json.dump(
            {
                'number_components_grid_search': {
                    'best_number_components': best_number_components,
                    'results': number_components_grid_search_results,
                },
                'number_neighbors_grid_search': {
                    'best_number_neighbors': best_number_neighbors,
                    'results': number_neighbors_grid_search_results,
                },
            },
            file,
            indent=2,
        )
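The weighted score above is a family-size-weighted mean of the per-dataset cross-validation scores, restricted to families with at least ten members; a toy check of that arithmetic (scores and counts are made up):

import numpy as np

plos_scores = np.array([0.90, 0.60])  # hypothetical per-dataset CV scores
filt_counts = [30, 10]                # hypothetical family sizes (>= 10)
weighted_score = np.dot(plos_scores, filt_counts) / np.sum(filt_counts)
assert abs(weighted_score - 0.825) < 1e-9  # (0.9 * 30 + 0.6 * 10) / 40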