Example #1
def create_hmms(aln_path_map,
                name=False,
                outdir=None,
                M=50,
                seq_id=90,
                add_cons=False,
                seq_lim=None,
                threads=1,
                verbose=False):
    verbose_num = 0  # verbosity level forwarded to each hhmake call

    work_items = []
    hmm_path_map = {}
    for pham, aln_path in aln_path_map.items():

        if outdir is not None:
            hmm_path_name = aln_path.with_suffix(".hmm").name
            hmm_path = outdir.joinpath(hmm_path_name)
        else:
            hmm_path = aln_path.with_suffix(".hmm")

        hmm_path_map[pham] = hmm_path

        hmm_name = None
        if name:
            hmm_name = str(pham)

        work_items.append((aln_path, hmm_path, hmm_name, add_cons, seq_lim, M,
                           seq_id, verbose_num))

    parallelize.parallelize(work_items, threads, hhmake, verbose=verbose)

    return hmm_path_map
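
A minimal usage sketch (the pham IDs and paths are illustrative, not from the source): the input maps pham identifiers to alignment files, and the return value maps the same identifiers to the profile HMMs built from them.

from pathlib import Path

# Hypothetical inputs: pham IDs mapped to existing .aln files.
aln_path_map = {1: Path("alns/1.aln"), 2: Path("alns/2.aln")}

# Builds alns/1.hmm and alns/2.hmm with hhmake (each named after its pham
# because name=True) and returns {1: Path("alns/1.hmm"), 2: Path("alns/2.hmm")}.
hmm_path_map = create_hmms(aln_path_map, name=True, threads=2)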
Example #2
def align_pham_out_fastas(working_dir,
                          pham_fasta_map,
                          threads=1,
                          verbose=False):
    """Uses multiple processes to align fasta-formatted multiple sequence files
    for all of the phams listed.

    :param working_dir: Path to the directory where the files will be written
    :type working_dir: pathlib.Path
    :param phams_dict: Dictionary that maps phams to their fasta file path
    :type phams_dict: dict{Path}
    :param threads: Number of processes/threads to spawn during alignment
    :type threads: int
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :return pham_aln_map: Dictionary that maps phams to their aln file path
    :rtype pham_aln_map: dict
    """
    pham_aln_map = dict()

    if verbose:
        print("...Aligning pham gene fasta files...")

    work_items = []
    for pham, filepath in pham_fasta_map.items():
        aln_name = filepath.with_suffix(".aln").name
        aln_path = working_dir.joinpath(aln_name)

        pham_aln_map[pham] = aln_path

        work_items.append((filepath, aln_path))

    parallelize.parallelize(work_items, threads, run_clustalo, verbose=verbose)

    return pham_aln_map
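
A hypothetical call mirroring the docstring's contract (the pham ID and paths are illustrative):

from pathlib import Path

pham_fasta_map = {40918: Path("fastas/40918.fasta")}
pham_aln_map = align_pham_out_fastas(Path("alns"), pham_fasta_map,
                                     threads=4, verbose=True)
# pham_aln_map[40918] == Path("alns/40918.aln")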
Example #3
def write_phams(fasta_dir,
                aln_dir,
                phams_translations_dict,
                cores=1,
                verbose=False):
    work_items = []
    for pham, pham_translations in phams_translations_dict.items():
        work_items.append((fasta_dir, aln_dir, pham, pham_translations))

    parallelize.parallelize(work_items,
                            cores,
                            write_phams_process,
                            verbose=verbose)
Example #4
def align_fastas(fasta_path_map,
                 mat_out=False,
                 tree_out=False,
                 file_type="fasta",
                 mode="clustalo",
                 override=False,
                 outdir=None,
                 threads=1,
                 verbose=False):
    verbose_num = 0  # verbosity level forwarded to each clustalo call

    work_items = []
    aln_path_map = {}
    for pham, fasta_path in fasta_path_map.items():
        if outdir is not None:
            working_dir = outdir
        else:
            working_dir = fasta_path.parent

        fasta_path_name = fasta_path.with_suffix("").name

        if override:
            aln_path = working_dir.joinpath(".".join(
                [fasta_path_name, "fasta"]))
        else:
            aln_path = working_dir.joinpath(".".join([fasta_path_name, "aln"]))

        aln_path_map[pham] = aln_path

        mat_path = None
        if mat_out:
            mat_path = working_dir.joinpath(".".join([fasta_path_name, "mat"]))

        tree_path = None
        if tree_out:
            tree_path = working_dir.joinpath(".".join(
                [fasta_path_name, "tree"]))

        work_items.append((fasta_path, aln_path, mat_path, tree_path,
                           file_type, "fasta", 1, verbose_num))

    if mode == "clustalo":
        aln_driver = clustalo
    else:
        raise NotImplementedError("Alignment program not supported.")

    parallelize.parallelize(work_items, threads, aln_driver, verbose=verbose)

    return aln_path_map
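
The override flag only changes the output suffix. A pure-pathlib sketch of the naming rule (paths illustrative):

from pathlib import Path

fasta_path = Path("phams/37.fasta")
stem = fasta_path.with_suffix("").name             # "37"
# override=False: phams/37.aln, written beside the input
# override=True: phams/37.fasta, replacing the input when outdir is None
print(fasta_path.parent.joinpath(stem + ".aln"))
print(fasta_path.parent.joinpath(stem + ".fasta"))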
Example #5
def create_centroid_graph(pan_alchemist,
                          clusters,
                          aln_dir,
                          threads=1,
                          verbose=False):
    thread_manager = multiprocessing.Manager()
    cluster_data = thread_manager.list()

    cluster_data_dicts = pan_handling.retrieve_cluster_data(
        pan_alchemist, clusters)

    for cluster_data_dict in cluster_data_dicts:
        cluster_data.append((cluster_data_dict["ClusterID"],
                             cluster_data_dict["CentroidSeq"].decode("utf-8")))

    work_items = []
    for i in range(len(cluster_data)):
        work_items.append((i, cluster_data))
    random.shuffle(work_items)

    temp_dir_path = Path(TEMP_DIR)
    if temp_dir_path.is_dir():
        shutil.rmtree(temp_dir_path)

    temp_dir_path.mkdir()

    if verbose:
        print("...Calculating centroid Levenshtein distances...")
    matrix_chunks = parallelize.parallelize(work_items,
                                            threads,
                                            create_centroid_graph_process,
                                            verbose=verbose)

    return matrix_chunks
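
The shuffle above is presumably for load balancing: with work keyed by row index i, early rows of an all-pairs pass have far more comparisons left than late rows, so randomizing the order mixes heavy and light items across workers. A tiny illustration of that skew (n is illustrative):

# Row i of an n-row all-pairs pass has n - i - 1 comparisons ahead of it,
# so dispatching rows in order would front-load the expensive items.
n = 6
costs = [n - i - 1 for i in range(n)]   # [5, 4, 3, 2, 1, 0]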
Example #6
def create_pham_hmms(alchemist,
                     working_dir,
                     pham_ts_to_id,
                     cores=1,
                     name_map=None,
                     M=50,
                     seq_id=90,
                     add_cons=False,
                     seq_lim=None,
                     verbose=False):
    if name_map is None:
        name_map = dict()

    work_items = []
    for pham, pham_ts in pham_ts_to_id.items():
        name = name_map.get(pham)
        work_items.append(
            (working_dir, pham, pham_ts, name, M, seq_id, add_cons, seq_lim))

    hmm_paths = parallelize.parallelize(work_items,
                                        cores,
                                        create_pham_hmms_process,
                                        verbose=verbose)

    return {pham: path for pham, path in hmm_paths}
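
Worker-contract sketch (hypothetical values): parallelize is assumed to collect one (pham, hmm_path) tuple per work item, which the comprehension above folds back into a dict.

from pathlib import Path

hmm_paths = [(1, Path("hmms/1.hmm")), (2, Path("hmms/2.hmm"))]
hmm_path_map = {pham: path for pham, path in hmm_paths}
# hmm_path_map == {1: Path("hmms/1.hmm"), 2: Path("hmms/2.hmm")}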
Example #7
def retrieve_intracluster_edges(db_filter, working_dir, node_names, matrix,
                                lookup_dict, gcs=DEFAULT_SETTINGS["gcs"],
                                kmer=DEFAULT_SETTINGS["kmer"],
                                sketch=DEFAULT_SETTINGS["sketch"],
                                threads=1, verbose=False):
    intracluster_edges = []
    node_name_set = set()

    for i in range(matrix.size):
        target_name = matrix.labels[i]
        target_cluster = lookup_dict[target_name]
        # start at i + 1 so unclustered (None-cluster) genomes are not
        # paired with themselves
        for j in range(i + 1, matrix.size):
            query_name = matrix.labels[j]
            query_cluster = lookup_dict[query_name]

            # skip pairs already assigned to the same (non-None) cluster
            if (target_cluster is not None) and (query_cluster is not None):
                if query_cluster == target_cluster:
                    continue

            pairwise_gcs = matrix.get_cell(i, j)
            if pairwise_gcs >= gcs:
                node_name_set.add(target_name)
                node_name_set.add(query_name)

                intracluster_edges.append((
                                        target_name, str(target_cluster),
                                        query_name, str(query_cluster),
                                        str(round(pairwise_gcs, 3))))

    db_filter.values = list(node_name_set)
    gs_and_ts = db_filter.select(["phage.PhageID", "phage.Sequence"],
                                 return_dict=False)

    fasta_dir = create_temp_path(str(working_dir.joinpath("fasta")))
    fasta_path_map = write_genome_fastas(
                                 gs_and_ts, fasta_dir, verbose=verbose,
                                 threads=threads)

    sketch_dir = create_temp_path(str(working_dir.joinpath("sketches")))
    sketch_path_map = sketch_genome_fastas(
                                      fasta_path_map, sketch_dir,
                                      verbose=verbose, threads=threads,
                                      kmer=kmer, sketch=sketch)

    work_items = []
    for edge in intracluster_edges:
        work_items.append((sketch_path_map[edge[0]], sketch_path_map[edge[2]],
                           edge))

    if verbose:
        print("Calculating phage genome ANI...")
    intracluster_edges = parallelize.parallelize(
                            work_items, threads, calculate_ani_process,
                            verbose=verbose)

    intracluster_edges.sort(reverse=True, key=lambda x: (
                                                float(x[4]) + float(x[5])) / 2)
    return intracluster_edges
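
The final sort assumes each edge returned by calculate_ani_process carries GCS at index 4 and ANI at index 5, ranking edges by the mean of the two. A sketch with made-up values:

edge = ("PhageA", "A1", "PhageB", "B2", "0.412", "0.967")
score = (float(edge[4]) + float(edge[5])) / 2   # 0.6895; higher sorts first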
Example #8
def sketch_genome_fastas(fasta_path_map, sketch_dir, verbose=False,
                         threads=1, kmer=DEFAULT_SETTINGS["kmer"],
                         sketch=DEFAULT_SETTINGS["sketch"]):
    if verbose:
        print("Sketching genome fasta files...")

    work_items = []
    sketch_path_map = {}
    for seq_id, fasta_path in fasta_path_map.items():
        sketch_path = sketch_dir.joinpath(f"{seq_id}.msh")
        sketch_path_map[seq_id] = sketch_path

        work_items.append((fasta_path, sketch_path, kmer, sketch))

    parallelize.parallelize(work_items, threads, alignment.mash_sketch,
                            verbose=verbose)

    return sketch_path_map
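
A hypothetical call (paths illustrative): one Mash sketch (.msh) is written per genome fasta, keyed by the same seq_id as the input map.

from pathlib import Path

fasta_path_map = {"PhageA": Path("fasta/PhageA.fasta")}
sketch_path_map = sketch_genome_fastas(fasta_path_map, Path("sketches"),
                                       threads=2)
# sketch_path_map["PhageA"] == Path("sketches/PhageA.msh")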
Example #9
def build_pan_towns(alchemist,
                    pan_alchemist,
                    hhdb_path,
                    pan_dict,
                    hmm_data_dir,
                    data_maps_tuple,
                    threads=1,
                    verbose=False):
    work_items = []
    hhr_path_map = {}
    for pham, hmm_path in data_maps_tuple[3].items():
        hhr_path = hmm_data_dir.joinpath(".".join([str(pham), "hhr"]))
        hhr_path_map[pham] = hhr_path

        work_items.append(
            (hmm_path, hhdb_path, hhr_path, None, False, 1, 0, 0, 1))

    if verbose:
        print("...Performing iterations of hhblitz to find HMM-HMM "
              "relationships...")
    parallelize.parallelize(work_items,
                            threads,
                            search.hhblits,
                            verbose=verbose)

    return hhr_path_map
Example #10
def build_symmetric_matrix(nodes,
                           distance_function,
                           is_distance=True,
                           names=None,
                           cores=1,
                           verbose=False):
    work_items = []

    row_indices = list(range(len(nodes)))
    chunk_size = int(floor(sqrt(len(nodes))))
    for i in row_indices:
        subject = nodes[i]
        if len(nodes) - 1 == i:
            work_items.append((distance_function, subject, [], i, 0))
        else:
            query_node_chunks = basic.partition_list(nodes[i + 1:], chunk_size)
            for j in range(len(query_node_chunks)):
                work_items.append(
                    (distance_function, subject, query_node_chunks[j], i, j))

    matrix_data = parallelize.parallelize(work_items,
                                          cores,
                                          build_matrix_process,
                                          verbose=verbose)

    matrix_data.sort(key=lambda x: (x[1], x[2]))

    if names is None or len(names) != len(row_indices):
        names = row_indices
    matrix = SymmetricMatrix(names, is_distance=is_distance)

    for data in matrix_data:
        for i in range(len(data[0])):
            col = (data[2] * chunk_size) + (data[1] + i + 1)
            matrix.fill_cell(data[1], col, data[0][i])

    diagonal_value = 1
    if is_distance:
        diagonal_value = 0

    matrix.fill_diagonal(diagonal_value)
    return matrix
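
The chunk-to-column arithmetic in the fill loop is easy to misread. This standalone check (illustrative sizes) verifies that element k of row i's j-th chunk maps back to absolute column j * chunk_size + i + k + 1, matching col = (data[2] * chunk_size) + (data[1] + i + 1) above, where the loop's i plays the role of k:

# Row i's upper-triangle columns i+1..n-1 are split into chunks of size c;
# element k of chunk j must map back to column j*c + i + k + 1.
n, c, i = 7, 2, 1
cols = list(range(i + 1, n))                        # [2, 3, 4, 5, 6]
chunks = [cols[k:k + c] for k in range(0, len(cols), c)]
for j, chunk in enumerate(chunks):
    for k, col in enumerate(chunk):
        assert col == (j * c) + (i + k + 1)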
Example #11
def create_pham_alns(alchemist,
                     working_dir,
                     pham_ts_to_id,
                     cores=1,
                     mat_out=False,
                     tree_out=False,
                     infile_type="fasta",
                     outfile_type="fasta",
                     verbose=False):
    work_items = []
    for pham, pham_ts in pham_ts_to_id.items():
        work_items.append((working_dir, pham, pham_ts, mat_out, tree_out,
                           infile_type, outfile_type))

    fasta_paths = parallelize.parallelize(work_items,
                                          cores,
                                          create_pham_alns_process,
                                          verbose=verbose)

    return {pham: path for pham, path in fasta_paths}
Example #12
def evaluate_clustering_scheme(matrix, cluster_scheme, cores=1, verbose=False,
                               matrix_cache=None):
    if matrix_cache is None:
        matrix_cache = dict()

    work_items = []
    for cluster, cluster_members in cluster_scheme.items():
        if cluster is None:
            continue

        cluster_matrix = matrix_cache.get(cluster)

        if cluster_matrix is None:
            cluster_matrix = matrix.get_submatrix_from_labels(cluster_members)

        work_items.append((cluster, cluster_matrix))

    evaluations = parallelize.parallelize(work_items, cores,
                                          cluster_evaluation_subprocess,
                                          verbose=verbose)

    return {data[0]: data[1] for data in evaluations}
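
Hypothetical usage (continuing with the same matrix and cluster_scheme names): the function reads matrix_cache but never populates it, so callers that want to skip re-slicing submatrices must fill the cache themselves.

cache = {}
for cluster, members in cluster_scheme.items():
    if cluster is not None:
        cache[cluster] = matrix.get_submatrix_from_labels(members)
scores = evaluate_clustering_scheme(matrix, cluster_scheme, cores=4,
                                    matrix_cache=cache)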
Example #13
def build_pan_neighborhoods(alchemist,
                            pan_alchemist,
                            values,
                            data_dir,
                            data_maps_tuple,
                            aD=75,
                            mD=65,
                            B=0.2,
                            threads=1,
                            verbose=False):
    matrix_chunks = create_centroid_graph(pan_alchemist,
                                          values,
                                          data_dir,
                                          threads=threads,
                                          verbose=verbose)

    thread_manager = multiprocessing.Manager()
    mD_cache = thread_manager.dict()
    data_cache = thread_manager.dict()
    path_cache = thread_manager.dict()

    temp_dir = Path(TEMP_DIR).joinpath("linker_files")
    temp_dir.mkdir()

    read_work_set = set()
    aln_work_items = []

    if verbose:
        print("...Constructing base for pham neighborhoods...")
    construct_neighborhood_base(pan_alchemist, matrix_chunks, read_work_set,
                                aln_work_items, data_dir, temp_dir, mD_cache,
                                data_cache, path_cache, aD, mD, B)

    if verbose:
        print("...Reloading pham neighborhood cluster data...")
    for cluster in read_work_set:
        path_cache[cluster] = (data_maps_tuple[0].get(int(cluster)),
                               data_maps_tuple[1].get(int(cluster)),
                               data_maps_tuple[2].get(int(cluster)))

    chunk_size = int(math.sqrt(len(aln_work_items)))
    if chunk_size > 0:
        aln_work_chunks = basic.partition_list(aln_work_items, chunk_size)
    else:
        aln_work_chunks = [aln_work_items]

    if verbose:
        print("...Computing pham cluster minimum distances...")
    identity_edge_chunks = parallelize.parallelize(
        aln_work_chunks,
        threads,
        build_neighborhood_edge_process,
        verbose=verbose)
    identity_edges = []
    for chunk in identity_edge_chunks:
        identity_edges.extend(chunk)

    if verbose:
        print("...Writing neighborhood data to PAN...")
    pan_alchemist.session.add_all(identity_edges)
    pan_alchemist.session.commit()
    pan_alchemist.session.close()
Example #14
def cluster_db(matrix, eps, cores=1, verbose=False, is_distance=False,
               emax=0.9, S=1.6, M=2):
    if verbose:
        print("...Performing clustering iterations...")

    iteration_counter = 1
    cluster_counter = 0
    iter_scheme = dict()
    unclustered_matrix = matrix
    while True:
        iter_scheme[None] = list()

        if verbose:
            print(f"...Starting clustering iteration {iteration_counter}")

        iteration_counter += 1

        greedy_scheme = clustering.dbscan(unclustered_matrix, eps, 1,
                                          is_distance=is_distance,
                                          return_matrix=True)

        work_items = []
        for greedy_cluster, submatrix in greedy_scheme.items():
            if greedy_cluster is None or submatrix.size <= 1:
                iter_scheme[None] = iter_scheme[None] + submatrix.labels
                continue

            work_items.append((submatrix, is_distance, eps, emax, S, M))

        layered_schemes = parallelize.parallelize(
                                work_items, cores, iter_cluster_process,
                                verbose=False)

        work_items = []
        for scheme in layered_schemes:
            for cluster, cluster_members in scheme.items():
                if cluster is None:
                    iter_scheme[None] = iter_scheme[None] + cluster_members
                    continue

                cluster_counter += 1
                iter_scheme[cluster_counter] = cluster_members

        new_unclustered_members = set(iter_scheme[None])

        diff_unclustered = set(unclustered_matrix.labels).difference(
                                                    new_unclustered_members)

        if not diff_unclustered:
            break

        unclustered_matrix = matrix.get_submatrix_from_labels(
                                                list(new_unclustered_members))

    if verbose:
        print("...Finalizing clustering scheme...")

    iter_scheme_centroids = list()
    for cluster, cluster_members in iter_scheme.items():
        if cluster is None:
            continue

        submatrix = matrix.get_submatrix_from_labels(cluster_members)
        iter_scheme_centroids.append(submatrix.get_centroid())

    final_scheme = clustering.lloyds(matrix, iter_scheme_centroids,
                                     eps=eps, is_distance=is_distance,
                                     return_matrix=False)

    return final_scheme
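
A hypothetical top-level call (eps and cores are illustrative; matrix is assumed to be a SymmetricMatrix of pairwise identities, consistent with is_distance=False):

scheme = cluster_db(matrix, eps=0.45, cores=4, verbose=True,
                    is_distance=False)
for cluster, members in scheme.items():
    print(cluster, len(members))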