Example #1
def generate_fasta_index(fasta, aligner, outdir):
    """Generate fasta index.

    Parameters:
    -----------
    fasta : str
        Path to the fasta reference to index.
    aligner : str
        Aligner to use to build the index.
    outdir : str
        Path to the directory to write the index.

    Returns:
    --------
    str:
        Path prefix of the built index.
    """
    logger.info("Build index from the given fasta.")
    index = join(outdir, "index")
    if aligner == "bowtie2":
        cmd = "bowtie2-build -q {0} {1}".format(fasta, index)
    elif aligner == "bwa":
        cmd = "bwa index -p {1} {0}".format(fasta, index)
    else:
        logger.error('aligner should be either "bowtie2" or "bwa"')
        raise ValueError
    process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    _out, _err = process.communicate()
    return index
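A minimal usage sketch for the function above. The import path (metator.io as mio, matching the alias used in the later examples), the file names and the output directory are assumptions; bowtie2-build (or bwa) must be installed and on the PATH.

import metator.io as mio  # assumed import path

# Hypothetical inputs: index a toy assembly with bowtie2 into ./tmp.
# The index files are written as tmp/index.* and the prefix is returned.
index_prefix = mio.generate_fasta_index("assembly.fa", "bowtie2", "tmp")
print(index_prefix)  # -> tmp/index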
Example #2
def leiden_iterations_java(
    network_file, iterations, resolution_parameter, tmp_dir, leiden_path
):
    """Use the java implementation of Leiden to prtition the network.

    Parameters:
    -----------
    network_file : str
        Path to the network computed previously. The file is a 3-column,
        tab-separated table with the id of the first contig, the id of the
        second one, and the weight of the edge (normalized or not).
    iterations : int
        Number of iterations of the Leiden algorithm.
    resolution_parameter : float
        Resolution parameter for Leiden clustering.
    tmp_dir : str
        Path to the temporary directory.
    leiden_path : str
        Path to the directory with network analysis java implementation.

    Returns:
    --------
    dict:
        Dictionary with the contig id as key and the results of each iteration,
        separated by semicolons, as value.
    """
    output_partition = dict()

    # Run the iterations of Leiden
    for i in range(iterations):
        logger.info("Iteration in progress: {0}".format(i))

        output = join(tmp_dir, "partition_{0}.txt".format(i))

        # Clusterize the network using Leiden.
        cmd = (
            " java -cp {0} nl.cwts.networkanalysis.run.RunNetworkClustering -i 4 -r {1} -w -o {2} -q Modularity -a Leiden {3}"
        ).format(leiden_path, resolution_parameter, output, network_file)
        process = sp.Popen(cmd, shell=True, stderr=sp.DEVNULL)
        process.communicate()

        # Save the results in a dictionary
        if i == 0:
            with open(output, "r") as out:
                for line in out:
                    result = line.split("\t")
                    output_partition[int(result[0])] = result[1][:-1]

        else:
            with open(output, "r") as out:
                for line in out:
                    result = line.split("\t")
                    output_partition[int(result[0])] += ";" + result[1][:-1]

    # Remove isolates (nodes with no contacts):
    output_partition.pop(0)
    output_partition = remove_isolates(output_partition, network_file)

    return output_partition
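For reference, the dictionary returned above maps each contig id to the semicolon-separated cluster labels of the successive Leiden iterations. A toy value (illustrative numbers only) looks like this:

# Hypothetical output_partition for 3 contigs and 3 iterations.
output_partition = {
    1: "0;0;1",  # contig 1: cluster 0, 0, then 1
    2: "0;0;1",  # contig 2: always clustered together with contig 1
    3: "4;2;7",  # contig 3: never grouped with the others
}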
Example #3
def detect_core_bins(output_partition, iterations):
    """Detect core bins from the output of the partition algorithm.

    The function searches for duplicated values in the output of the Louvain or
    Leiden algorithm in order to find contigs which are always in the same bin.
    The bins found with this method are called the core bins.

    Parameters:
    -----------
    output_partition : dict
        Dictionary with the contig id as key and the results of each iteration,
        separated by semicolons, as value.
    iterations : int
        Number of iterations made previously with the partition algorithm.

    Returns:
    --------
    dict:
        Dictionary with the core bin ids as keys and the lists of contig ids of
        the core bins as values.
    pandas.core.frame.DataFrame:
        Table with the core bin ids and their values for each iteration.
    """
    # Find duplicated values in the output of Louvain or Leiden using a flipped
    # dictionary.

    # Create dictionaries for the core bins
    core_bins = {}
    core_bins_contigs = {}
    core_bins_iterations = np.empty((0, iterations), int)
    core_bin_id = 0
    for key, value in output_partition.items():
        if value not in core_bins:
            # Create an entry in a dictionary keyed by the iterations string,
            # with the core bin id as value.
            core_bins[value] = core_bin_id

            # Create an entry in a dictionary keyed by the core bin id, with
            # the list of its contigs as value.
            core_bins_contigs[core_bin_id] = [key]
            core_bin_id += 1
            # Add a row to the array used to compute the distance between two
            # core bins.
            core_bins_iterations = np.append(
                core_bins_iterations,
                np.array([list(map(int, value.split(";")))]),
                axis=0,
            )
        # If an entry already exists for this bin, add the contig to its list.
        else:
            core_bins_contigs[core_bins[value]].append(key)

    # Transform the array into a dataframe
    core_bins_iterations = pd.DataFrame(core_bins_iterations)

    logger.info("{0} core bins were found.\n".format(len(core_bins)))

    return core_bins_contigs, core_bins_iterations
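The core-bin detection above is essentially a dictionary flip: contigs sharing the exact same iteration string fall into the same core bin. A self-contained toy run of the same logic (made-up values):

import numpy as np
import pandas as pd

output_partition = {1: "0;0;1", 2: "0;0;1", 3: "4;2;7"}

core_bins, core_bins_contigs, rows = {}, {}, []
for contig_id, value in output_partition.items():
    if value not in core_bins:
        # New iteration string: open a new core bin.
        core_bins[value] = len(core_bins)
        core_bins_contigs[core_bins[value]] = [contig_id]
        rows.append(list(map(int, value.split(";"))))
    else:
        # Known iteration string: same core bin as a previous contig.
        core_bins_contigs[core_bins[value]].append(contig_id)

core_bins_iterations = pd.DataFrame(np.array(rows))
print(core_bins_contigs)     # {0: [1, 2], 1: [3]}
print(core_bins_iterations)  # 2 core bins x 3 iterations of labels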
Example #4
def retrieve_fasta(in_file, aligner, tmpdir):
    """
    Function to retrieve the fasta from the given reference file. If an index is
    given, retrieve the fasta from it (bowtie2-inspect for bowtie2, or a fasta
    file with the same prefix for bwa). Raise an error if the reference is
    neither a fasta nor a valid index.

    Parameters:
    -----------
    in_file : str
        Path to the reference file given.
    aligner : str
        Name of the aligner used. Either 'bowtie2' or 'bwa'.
    tmpdir : str
        Path to the temp directory to write the fasta if necessary.

    Returns:
    --------
    str:
        Path to the fasta file.
    """
    if check_is_fasta(in_file):
        fasta = in_file
    else:
        if check_fasta_index(in_file, aligner):
            if aligner == "bowtie2":
                logger.info("Retrieve fasta from bowtie2 index.")
                fasta = join(tmpdir, "assembly.fa")
                cmd = "bowtie2-inspect {0} > {1}".format(in_file, fasta)
                process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
                _out, _err = process.communicate()
            elif aligner == "bwa":
                if isfile(in_file + ".fa") and check_is_fasta(in_file + ".fa"):
                    fasta = in_file + ".fa"
                elif isfile(in_file + ".fasta") and check_is_fasta(
                        in_file + ".fasta"):
                    fasta = in_file + ".fasta"
                else:
                    logger.error(
                        "If you give a bwa index, please make sure the fasta exists with the same prefix."
                    )
                    raise ValueError
        else:
            logger.error(
                "Please give as a reference a bowtie2 index or a fasta.")
            raise ValueError
    return fasta
Example #5
def generate_contact_map(
    assembly,
    contig_data_file,
    enzyme,
    name,
    pairs,
    out_dir,
    tmp_dir,
    filter_events=False,
    force=False,
    mat_fmt="graal",
    metator_object="final_bin",
    min_size=5000,
    pcr_duplicates=False,
    threads=1,
):
    """General function to extract pairs of the MetaTOR object and generate its
    the contact map.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file containing the contigs of interest. Could be the
        whole assembly or the extracted contigs of one bin.
    contig_data_file : str
        Path to the contig_data_final.txt file from MetaTOR output.
    enzyme : str
        Enzyme used to digest the genome in the HiC experiment. Example:
        HpaII,MluCI.
    name : str
        Name of the object. Could be the name of a contig, the id of a bin or
        the name of a bin. Example: "NODE_1" or "MetaTOR_1_0".
    pairs : str
        Path of the ".pairs" file or bgzip indexed pair file. If more than one
        is given, files should be separated by a comma.
    out_dir : str
        Path where output files should be written. Current directory by default.
    tmp_dir : str
        Path where temporary files will be written.
    filter_events : bool
        Filter spurious or uninformative 3C events. Requires a restriction
        enzyme. Default: False.
    force : bool
        If True, overwrite existing files with the same name as output.
        Default: False.
    mat_fmt : str
        Select the output matrix format. Can be either "bg2" for the bedgraph2
        format, "cool" for Mirnylab's cool format, or graal for a plain text COO
        format compatible with Koszullab's instagraal software.
        Default: "graal".
    metator_object : str
        Object to extract contigs to build the matrix. Either "contig",
        "core_bin", "overlapping_bin", "recursive_bin", "final_bin" or "other".
    min_size : int
        Minimum contig size required to keep it.
    pcr_duplicates : bool
        If True, PCR duplicates will be filtered based on genomic positions.
        Pairs where both reads have exactly the same coordinates are considered
        duplicates and only one of those will be conserved. Default: False.
    threads : int
        Number of threads to use. Default: 1.

    Returns:
    --------
    int:
        Number of pairs extracted for the object.
    """

    # Extract bin information from metaTOR outdir.
    logger.info("Generate HiC contact map for %s", name)
    metator_data = MetatorObject(metator_object, name, assembly,
                                 contig_data_file, pairs, min_size)
    metator_data.set_contigs()
    if min_size > 0:
        metator_data.set_large_contigs()
    metator_data.write_fasta(tmp_dir, out_dir)
    metator_data.pairs = join(tmp_dir, name + ".pairs")

    # Extract pairs of the bin.
    n_pairs = extract_pairs(metator_data)

    if n_pairs == 0:
        logger.info("No pairs have been extracted")

    else:
        logger.info("%d pairs have been extracted.", n_pairs)

        # Launch hicstuff pipeline.
        hcp.full_pipeline(
            genome=metator_data.fasta,
            input1=metator_data.pairs,
            distance_law=False,
            enzyme=enzyme,
            filter_events=filter_events,
            force=force,
            mat_fmt=mat_fmt,
            out_dir=out_dir,
            pcr_duplicates=pcr_duplicates,
            plot=False,
            start_stage="pairs",
            threads=threads,
            tmp_dir=tmp_dir,
        )
    return n_pairs
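A hedged call example, assuming generate_contact_map is importable from a MetaTOR contact-map module, hicstuff is installed, and a previous MetaTOR run produced the listed files (all paths and names hypothetical):

n_pairs = generate_contact_map(
    assembly="assembly.fa",
    contig_data_file="contig_data_final.txt",
    enzyme="HpaII,MluCI",
    name="MetaTOR_1_0",                   # a final bin name
    pairs="sample1.pairs,sample2.pairs",  # comma-separated pairs files
    out_dir="contact_map",
    tmp_dir="tmp",
    mat_fmt="cool",
    metator_object="final_bin",
    min_size=5000,
    threads=4,
)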
Example #6
def recursive_decontamination(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    final_fasta_dir,
    input_fasta_dir,
    iterations,
    network_file,
    outdir,
    overlapping_parameter,
    recursive_fasta_dir,
    resolution_parameter,
    size,
    temp_directory,
    threads,
):
    """Function to validate bins do the recursive decontamination using Louvain
    or Leiden algorithm

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to recursively partition the network. Either leiden or
        louvain.
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    contig_data_file : str
        Path to the contig data table to update.
    final_fasta_dir : str
        Path to write the final fasta decontaminated bins.
    input_fasta_dir : str
        Path to the directory where the fasta bin from the partition are.
    iterations : int
        Number of iterations to use for the recursive partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    overlapping_parameter : int
        Hamming distance threshold, as a percentage, used to consider two bins
        as one in the recursive partition.
    recursive_fasta_dir : str
        Path to write the fasta decontaminated bins.
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It will
        be a factor of the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher this
        parameter, the smaller the output bins will be.
    size : int
        Threshold size in base pair of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.

    Returns:
    --------
    str:
        Path to the saved recursive clustering matrix file, or None if
        cluster_matrix is False.
    """

    # Create folders in the temporary directory
    tmpdir_checkm = join(temp_directory, "checkm")
    os.makedirs(tmpdir_checkm, exist_ok=True)
    tmpdir_recursive_clustering = join(temp_directory, "recursive_clustering")
    os.makedirs(tmpdir_recursive_clustering, exist_ok=True)

    # Define the checkM output file paths
    overlapping_checkm_file = join(outdir, "overlapping_checkm_results.txt")
    overlapping_taxonomy_file = join(outdir, "overlapping_checkm_taxonomy.txt")
    recursive_checkm_file = join(outdir, "recursive_checkm_results.txt")
    recursive_taxonomy_file = join(outdir, "recursive_checkm_taxonomy.txt")

    # Launch checkM
    checkm(
        input_fasta_dir,
        overlapping_checkm_file,
        overlapping_taxonomy_file,
        tmpdir_checkm,
        threads,
    )

    # Iterate Louvain or Leiden on the complete but contaminated bins.
    contamination, contigs_data, clustering_matrix_file = recursive_clustering(
        assembly,
        iterations,
        overlapping_parameter,
        resolution_parameter,
        outdir,
        recursive_fasta_dir,
        algorithm,
        tmpdir_recursive_clustering,
        overlapping_checkm_file,
        overlapping_taxonomy_file,
        contig_data_file,
        network_file,
        cluster_matrix,
        size,
        threads,
    )

    # Recursive iterations of Louvain or Leiden on the contaminated bins. Save
    # bin information if the new bins have the same quality otherwise keep the
    # original bin information.
    if contamination:

        # Run checkm on the recursive bins.
        tmpdir_checkm = join(temp_directory, "checkm2")
        checkm(
            recursive_fasta_dir,
            recursive_checkm_file,
            recursive_taxonomy_file,
            tmpdir_checkm,
            threads,
        )

        # Compare
        bin_summary = compare_bins(
            overlapping_checkm_file,
            overlapping_taxonomy_file,
            recursive_checkm_file,
            recursive_taxonomy_file,
        )

    # Keep overlapping bin information
    else:
        logger.info("No contaminated bin have been found")
        bin_summary = mio.read_results_checkm(overlapping_checkm_file,
                                              overlapping_taxonomy_file)

    # Create fasta directory and copy final bins.
    for bin_name in bin_summary:
        dst = join(final_fasta_dir, bin_name + ".fa")
        if bin_name.split("_")[2] == "0":
            src = join(input_fasta_dir, bin_name + ".fa")
        else:
            src = join(recursive_fasta_dir, bin_name + ".fa")
        shutil.copyfile(src, dst)

    # Return some values of efficiency of the binning.
    give_results_info(bin_summary)

    # Write relevant bins/contigs information for anvio.
    binning_file = join(outdir, "binning.txt")
    contigs_data = write_bins_contigs(bin_summary, contigs_data, binning_file)

    # Compute the abundance of the mags.
    bin_summary = get_bin_coverage(bin_summary, contigs_data)

    # Save bin information in final file
    bin_summary_file = join(outdir, "bin_summary.txt")
    mio.write_checkm_summary(bin_summary, bin_summary_file)

    # Write the new file
    contig_data_file_final = join(outdir, "contig_data_final.txt")
    contigs_data.to_csv(contig_data_file_final,
                        sep="\t",
                        header=True,
                        index=False)

    # Plot some figures of contigs distribution inside bins:
    mtf.plot_figures(outdir, contigs_data, bin_summary, size)

    return clustering_matrix_file
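From the code above, a run of recursive_decontamination leaves at least the following files in outdir. A small, hedged sketch that checks for them after the call (directory name hypothetical; the recursive checkM files only exist when contaminated bins were found):

from os.path import exists, join

outdir = "metator_out"  # hypothetical output directory
expected = [
    "overlapping_checkm_results.txt",   # checkM on the overlapping bins
    "overlapping_checkm_taxonomy.txt",
    "binning.txt",                      # bins/contigs table written for anvio
    "bin_summary.txt",                  # summary of the kept bins
    "contig_data_final.txt",            # updated contigs data table
]
print("missing:", [f for f in expected if not exists(join(outdir, f))])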
Example #7
def checkm(fasta_dir, outfile, taxonomy_file, tmpdir, threads):
    """Function to evaluate fasta bins using CheckM. Write the checkM results
    summary in the outfile and the taxonomy results in the the taxonomy file.

    Parameters:
    -----------
    fasta_dir : str
        Path to the directory with the input fasta files of the bins to
        evaluate.
    outfile : str
        Path to the file where the results of checkm will be written.
    taxonomy_file : str
        Path to the file where the checkM taxonomy results will be written.
    tmpdir : str
        Path to the temporary directory where CheckM intermediary files will be
        written.
    threads : int
        Number of threads to use for CheckM.
    """

    logger.info("Start CheckM validation.")

    # Build CheckM tree
    cmd = "checkm tree -q -t {0} -x fa {1} {2}".format(threads, fasta_dir,
                                                       tmpdir)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build taxonomy values of the bins
    cmd = "checkm tree_qa {0} -q -o 1 -f {1}".format(tmpdir, taxonomy_file)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build lineage marker set
    markers_set = join(tmpdir, "markers.txt")
    cmd = "checkm lineage_set -q {0} {1}".format(tmpdir, markers_set)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Compute the analysis
    cmd = "checkm analyze -q -x fa -t {0} {1} {2} {3}".format(
        threads, markers_set, fasta_dir, tmpdir)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Write the summary file
    cmd = "checkm qa -q {0} {1} -o 2 > {2}".format(markers_set, tmpdir,
                                                   outfile)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()
Example #8
def recursive_clustering(
    assembly,
    iterations,
    overlapping_parameter,
    resolution_parameter,
    outdir,
    recursive_fasta_dir,
    algorithm,
    tmpdir,
    checkm_file,
    taxonomy_file,
    contigs_data_file,
    network_file,
    cluster_matrix,
    size,
    threads,
):
    """Function to run recursive iterations on contaminated bins in order to try
    to improve the quality of the bins using Louvain or Leiden algorthm.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file used as assembly.
    iterations : int
        Number of iterations to use for recursive iterations of Louvain or
        Leiden.
    overlapping_parameter : float
        Hamming distance threshold to consider two bins as the same bin.
    resolution_parameter : float
        Resolution parameter of Leiden algorithm.
    outdir : str
        Path to the output directory.
    recursive_fasta_dir : str
        Path to the directory where to write the decontaminated fasta.
    algorithm : str
        Algorithm to use, either louvain or leiden.
    tmpdir : str
        Path to the temp directory.
    checkm_file : str
        Path to the output file of CheckM from checkm function.
    taxonomy_file : str
        Path to the taxonomy CheckM file.
    contigs_data_file : str
        Path to the contigs data file from metator partition.
    network_file : str
        Path to the network file from metator network.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    size : int
        Size threshold in base pairs of the bins.
    threads : int
        Number of threads to use.

    Returns:
    --------
    boolean:
        True if at least one new recursive bin has been generated.
    pandas.DataFrame:
        Updated contigs data table with the recursive bin information.
    str:
        Path to the saved recursive clustering matrix file, or None if
        cluster_matrix is False.
    """

    # Create temporary folders
    tmpdir_subnetwork = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_subnetwork, exist_ok=True)
    tmpdir_clustering = join(tmpdir, "recursive_clustering")
    os.makedirs(tmpdir_clustering, exist_ok=True)
    tmpdir_binning = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_binning, exist_ok=True)

    # Load CheckM result:
    checkm_summary = mio.read_results_checkm(checkm_file, taxonomy_file)

    # Load network:
    network = nx.read_edgelist(network_file,
                               nodetype=int,
                               data=(("weight", float), ))

    # Load contigs data:
    contigs_data = pd.read_csv(contigs_data_file,
                               sep="\t",
                               header=0,
                               index_col=False)

    # Add new columns for the recursive information.
    contigs_data["Recursive_bin_ID"] = "0"
    contigs_data["Recursive_bin_contigs"] = "-"
    contigs_data["Recursive_bin_size"] = "-"
    contigs_data["Final_bin"] = "ND"

    # Default no contamination
    contamination = False

    # Create an empty matrix
    N = len(contigs_data.ID)
    clustering_matrix = sparse.coo_matrix((N + 1, N + 1), dtype=np.float32)

    # Iterate on the checkM summary to find contaminated bins:
    for bin_id in checkm_summary:
        if (float(checkm_summary[bin_id]["completness"]) >= 50) & (float(
                checkm_summary[bin_id]["contamination"]) >= 5):

            logger.info("Bin in progress: {0}".format(bin_id))
            subnetwork_file = join(tmpdir_subnetwork,
                                   "subnetwork_" + bin_id + ".txt")
            bin_id = str(bin_id.split("_")[1])

            # Extract contigs
            mask = contigs_data["Overlapping_bin_ID"].apply(str) == bin_id
            list_contigs = list(contigs_data.loc[mask, "ID"])

            # Extract subnetwork
            subnetwork = network.subgraph(list_contigs)

            # Write the new subnetwork
            nx.write_edgelist(subnetwork,
                              subnetwork_file,
                              delimiter="\t",
                              data=["weight"])

            # Stop reporting info logs.
            logger.setLevel(logging.WARNING)

            # Use the Louvain or Leiden algorithm on the subnetwork.
            if algorithm == "leiden":
                LEIDEN_PATH = os.environ["LEIDEN_PATH"]
                output_partition = mtp.leiden_iterations_java(
                    subnetwork_file,
                    iterations,
                    resolution_parameter,
                    tmpdir_clustering,
                    LEIDEN_PATH,
                )
            elif algorithm == "louvain":
                LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
                output_partition = mtp.louvain_iterations_cpp(
                    subnetwork_file,
                    iterations,
                    tmpdir_clustering,
                    LOUVAIN_PATH,
                )
            else:
                logger.error(
                    'algorithm should be either "louvain" or "leiden"')
                raise ValueError

            # Detect core bins
            (
                recursive_core_bins,
                recursive_bins_iterations,
            ) = mtp.detect_core_bins(output_partition, iterations)

            # Compute the Hamming distance between core bins.
            hamming_distance = mtp.get_hamming_distance(
                recursive_bins_iterations,
                iterations,
                threads,
            )

            # Define overlapping bins according to the threshold
            recursive_bins = mtp.defined_overlapping_bins(
                overlapping_parameter,
                hamming_distance,
                recursive_core_bins,
                recursive_bins_iterations,
            )

            # update bin data and generate fasta
            contamination, contigs_data = update_contigs_data_recursive(
                contigs_data,
                recursive_bins,
                assembly,
                recursive_fasta_dir,
                tmpdir_binning,
                size,
                contamination,
            )

            # Build the clustering matrix of the subnetwork and add it.
            if cluster_matrix:
                clustering_matrix += mtp.build_clustering_matrix(
                    recursive_core_bins, hamming_distance, N)

            # Put back the info log
            logger.setLevel(logging.INFO)

    # Save the clustering matrix
    if cluster_matrix:
        clustering_matrix_file = join(outdir, "clustering_matrix_recursive")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return contamination, contigs_data, clustering_matrix_file
Example #9
def give_results_info(bin_summary):
    """Function to return the general information about the binning results.

    Parameters:
    -----------
    bin_summary : dict
        Dictionary with the summary results of the kept bins.
    """

    # Define the categories of the bins
    HQ = 0  # Completeness >= 90 and Contamination <= 5
    total_size_HQ = 0
    MQ = 0  # Completeness >= 70 and Contamination <= 10
    total_size_MQ = 0
    LQ = 0  # Completeness >= 50 and Contamination <= 10
    total_size_LQ = 0
    conta_bins = 0  # Completeness >= 50 and Contamination > 10
    total_size_conta_bins = 0
    others = 0  # Undetermined bins.
    total_size_others = 0

    # Classify each bin in a category
    for bin_name in bin_summary:
        completness = float(bin_summary[bin_name]["completness"])
        contamination = float(bin_summary[bin_name]["contamination"])
        size = int(bin_summary[bin_name]["size"])
        if completness >= 50:
            if contamination > 10:
                conta_bins += 1
                total_size_conta_bins += size
            else:
                if completness >= 90 and contamination <= 5:
                    HQ += 1
                    total_size_HQ += size
                elif completness >= 70:
                    MQ += 1
                    total_size_MQ += size
                else:
                    LQ += 1
                    total_size_LQ += size
        else:
            others += 1
            total_size_others += size
    total = HQ + MQ + LQ + conta_bins + others
    total_size = (total_size_HQ + total_size_MQ + total_size_LQ +
                  total_size_conta_bins + total_size_others)

    # Return info in the logger:
    logger.info(
        "{0} bins have been kept after the recursive iterations.".format(
            total))
    logger.info("Total size of the extracted bins: {0}".format(total_size))
    logger.info("HQ MAGs: {0}\tTotal Size: {1}".format(HQ, total_size_HQ))
    logger.info("MQ MAGs: {0}\tTotal Size: {1}".format(MQ, total_size_MQ))
    logger.info("LQ MAGs: {0}\tTotal Size: {1}".format(LQ, total_size_LQ))
    logger.info("Contaminated potential MAGs: {0}\tTotal Size: {1}".format(
        conta_bins, total_size_conta_bins))
    logger.info("Others bins: {0}\tTotal Size: {1}".format(
        others, total_size_others))
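The bin_summary dictionary expected above has one entry per bin with at least the "completness", "contamination" and "size" fields ("completness" is the key actually used in the code). A toy example, with the category each bin would fall into:

bin_summary = {
    "MetaTOR_1_0": {"completness": "95.3", "contamination": "2.1", "size": "2500000"},  # HQ
    "MetaTOR_2_0": {"completness": "75.0", "contamination": "8.0", "size": "1800000"},  # MQ
    "MetaTOR_3_1": {"completness": "55.0", "contamination": "20.0", "size": "900000"},  # contaminated
    "MetaTOR_4_0": {"completness": "30.0", "contamination": "1.0", "size": "400000"},   # others
}
# give_results_info(bin_summary) would log 1 HQ MAG, 1 MQ MAG, 0 LQ MAGs,
# 1 contaminated potential MAG and 1 other bin.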
Example #10
def precompute_network(
    alignment_files,
    contig_data,
    hit_data,
    out_file,
    tmp_dir,
    self_contacts=False,
):
    """Write a file with only the contig id separated by a tabulation and count
    the contacts by contigs to be able to compute directlty the normalized
    network.

    Parameters:
    -----------
    alignment_files : list of str
        List of path to the alignment file(s).
    contig_data : dict
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    hit_data : dict
        Dictionary with the count of hits for each alignment file.
    out_file : str
        Path to write the output file which will be necessary to compute the
        network.
    tmp_dir : str
        Path to the temporary directory where the per-sample prenetwork files
        will be written.
    self_contacts : bool
        If True, the contacts on the same contigs will be kept. Otherwise only
        the inter-contig contacts are kept. Default: False.

    Returns:
    --------
    dict:
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data available with the following keys: "id",
        "length", "GC", "hit", "coverage", "RS". Coverage is still at 0 and
        needs to be updated later.
    list of str:
        List of paths to the per-sample prenetwork files.
    """
    # Initiate value to compute 3D ratio
    all_contacts = 0
    inter_contacts = 0
    out_files_list = []

    # Prepare a file to save contact with their global ID
    with open(out_file, "w") as pre_net:

        # Iterates on the alignment files
        for i, aligment_file in enumerate(alignment_files):

            all_contacts_temp = 0
            inter_contacts_temp = 0
            out_file_sample = join(tmp_dir, "prenetwork" + str(i) + ".txt")
            out_files_list.append(out_file_sample)

            # Read the alignment_file and build pairs for the network
            with open(aligment_file,
                      "r") as pairs, open(out_file_sample,
                                          "w") as pre_net_sample:
                for pair in pairs:
                    # Ignore header lines
                    if pair.startswith("#"):
                        continue

                    # Split the line on the tabulation
                    p = pair.split("\t")

                    # Extract the contig names which are at positions 2 and 4.
                    contig1, contig2 = p[1], p[3]
                    id1 = contig_data[contig1]["id"]
                    id2 = contig_data[contig2]["id"]

                    # Count the contact
                    all_contacts_temp += 1
                    contig_data[contig1]["hit"] += 1
                    contig_data[contig2]["hit"] += 1
                    if len(alignment_files) > 1:
                        hit_data[contig1]["hit"][i] += 1
                        hit_data[contig2]["hit"][i] += 1

                    # Write the file used for the computation of the network.
                    if self_contacts and id1 == id2:
                        pre_net.write("\t".join(map(str, [contig1, contig2])) +
                                      "\n")
                        pre_net_sample.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n")
                    elif id1 < id2:
                        inter_contacts_temp += 1
                        pre_net.write("\t".join(map(str, [contig1, contig2])) +
                                      "\n")
                        pre_net_sample.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n")
                    elif id1 > id2:
                        inter_contacts_temp += 1
                        pre_net.write("\t".join(map(str, [contig2, contig1])) +
                                      "\n")
                        pre_net_sample.write(
                            "\t".join(map(str, [contig2, contig1])) + "\n")

            # Count contacts and log the sample information.
            all_contacts += all_contacts_temp
            inter_contacts += inter_contacts_temp
            logger.info("Information of {0}:".format(basename(aligment_file)))
            logger.info(
                "{0} contacts in the library.".format(all_contacts_temp))
            logger.info("{0} contacts inter-contigs in the library.".format(
                inter_contacts_temp))
            logger.info("3D ratio : {0}\n".format(inter_contacts_temp /
                                                  all_contacts_temp))

    # Return information about the network
    if len(alignment_files) > 1:
        logger.info("General information:")
        logger.info("{0} contacts in the library.".format(all_contacts))
        logger.info("{0} contacts inter-contigs in the library.".format(
            inter_contacts))
        logger.info("3D ratio : {0}\n".format(inter_contacts / all_contacts))

    return contig_data, out_files_list
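To make the expected inputs concrete: each alignment file is a tab-separated pairs-like table with the contig names in columns 2 and 4, and contig_data maps each contig name to its metadata. A toy, illustrative setup (values made up):

# One data line of an alignment file (header lines start with "#"):
# readID<TAB>NODE_1<TAB>152<TAB>NODE_2<TAB>78<TAB>+<TAB>-
contig_data = {
    "NODE_1": {"id": 1, "length": 150000, "GC": 42.1, "hit": 0, "coverage": 0},
    "NODE_2": {"id": 2, "length": 98000, "GC": 39.8, "hit": 0, "coverage": 0},
}
# With self_contacts=False, only inter-contig pairs (id1 != id2) are written to
# out_file, always ordered so that the contig with the smaller id comes first.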
Example #11
    def execute(self):

        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and the output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)
        overlapping_fasta_dir = join(self.args["--outdir"], "overlapping_bin")
        if not exists(overlapping_fasta_dir):
            os.makedirs(overlapping_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(overlapping_fasta_dir)
                os.makedirs(overlapping_fasta_dir)
            else:
                logger.error(
                    "%s already existed. Remove directory or use -F argument to overwrite it.",
                    overlapping_fasta_dir,
                )
                raise ValueError

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"], ("metator_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Define variable
        min_qual = int(self.args["--min-quality"])
        iterations = int(self.args["--iterations"])
        recursive_iterations = int(self.args["--rec-iter"])
        overlapping_parameter = int(self.args["--overlap"]) / 100
        recursive_overlapping_parameter = int(self.args["--rec-overlap"]) / 100
        size = int(self.args["--size"])
        threads = int(self.args["--threads"])
        resolution_parameter = float(self.args["--res-param"])

        # Check correct algorithm value
        if self.args["--algorithm"] not in ["louvain", "leiden"]:
            logger.error('algorithm should be either "louvain" or "leiden"')
            raise ValueError

        # Check if the normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError

        # Sanity check for validation
        if not self.args["--skip-validation"]:
            recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
            if not exists(recursive_fasta_dir):
                os.makedirs(recursive_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(recursive_fasta_dir)
                    os.makedirs(recursive_fasta_dir)
                else:
                    logger.error(
                        "%s already existed. Remove directory or use -F argument to overwrite it",
                        recursive_fasta_dir,
                    )
                    raise ValueError
            final_fasta_dir = join(self.args["--outdir"], "final_bin")
            if not exists(final_fasta_dir):
                os.makedirs(final_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(final_fasta_dir)
                    os.makedirs(final_fasta_dir)
                else:
                    logger.error(
                        "%s already existed. Remove directory or use -F argument to overwrite it.",
                        final_fasta_dir,
                    )
                    raise ValueError

            # Check checkM availability
            if not mio.check_checkm():
                logger.error(
                    "CheckM is not in the path. Could not make the iterations")
                raise NameError

        # Manage start point.
        if self.args["--start"] == "fastq":
            start = 1
        elif self.args["--start"] == "bam":
            start = 2
        elif self.args["--start"] == "pair":
            start = 3
        elif self.args["--start"] == "network":
            start = 4
        else:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError

        # Check if forward and reverse reads are given for fastq and bam start.
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary for fastq with %s start and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Print information of the workflow:
        if start == 1:
            logger.info("Minimum mapping quality: %d", min_qual)
        if start <= 2:
            logger.info("Enzyme: %s", self.args["--enzyme"])
            logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Partition algorithm: %s", self.args["--algorithm"])
        logger.info("Partition iterations: %s", iterations)
        logger.info("Overlapping parameter: %s", overlapping_parameter)
        if not self.args["--skip-validation"]:
            logger.info("Recursive partition iterations: %d",
                        recursive_iterations)
            logger.info(
                "Recursive overlapping parameter: %s",
                recursive_overlapping_parameter,
            )

        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what the reference is. If a fasta is given, build the index. If
        # a bowtie2 index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                if start == 1:
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a bowtie2 index or a fasta."
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Run the whole workflow
        if start <= 3:
            if start <= 2:
                # Align paired-end reads with the chosen aligner.
                alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                    self.args["--forward"],
                    self.args["--reverse"],
                    index,
                    fasta,
                    self.args["--aligner"],
                    min_qual,
                    self.args["--start"],
                    self.args["--depth"],
                    self.args["--enzyme"],
                    self.args["--outdir"],
                    tmp_dir,
                    self.args["--threads"],
                )
            else:
                alignment_files = self.args["--forward"].split(",")
                nb_alignment = len(alignment_files)
                contig_data, hit_data = mtn.create_contig_data(
                    fasta,
                    nb_alignment,
                    self.args["--depth"],
                    self.args["--enzyme"],
                )
            # Build the network
            network_file, contigs_data_file = mtn.alignment_to_contacts(
                alignment_files,
                contig_data,
                hit_data,
                self.args["--outdir"],
                "network.txt",
                "contig_data_network.txt",
                tmp_dir,
                self.args["--threads"],
                self.args["--normalization"],
                False,
            )
        else:
            contigs_data_file = self.args["--contigs"]
            network_file = self.args["--network"]

        # Partition the network
        clustering_matrix_partition_file, contigs_data_file = mtp.partition(
            self.args["--algorithm"],
            fasta,
            self.args["--cluster-matrix"],
            contigs_data_file,
            iterations,
            network_file,
            self.args["--outdir"],
            overlapping_fasta_dir,
            overlapping_parameter,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        # remove contig_data_network if not an input
        if start <= 2:
            contig_data_network_file = join(self.args["--outdir"],
                                            "contig_data_network.txt")
            os.remove(contig_data_network_file)

        # Launch validation if desired.
        if not self.args["--skip-validation"]:
            clustering_matrix_recursive_file = mtv.recursive_decontamination(
                self.args["--algorithm"],
                fasta,
                self.args["--cluster-matrix"],
                contigs_data_file,
                final_fasta_dir,
                overlapping_fasta_dir,
                recursive_iterations,
                network_file,
                self.args["--outdir"],
                recursive_overlapping_parameter,
                recursive_fasta_dir,
                resolution_parameter,
                size,
                tmp_dir,
                threads,
            )

            if self.args["--cluster-matrix"]:
                # Average with the partition clustering matrix and save it.
                clustering_matrix = load_npz(clustering_matrix_partition_file +
                                             ".npz")
                clustering_matrix_recursive = load_npz(
                    clustering_matrix_recursive_file + ".npz")
                clustering_matrix = (
                    (clustering_matrix + clustering_matrix_recursive) /
                    2).tocoo()
                clustering_matrix_file = join(self.args["--outdir"],
                                              "clustering_matrix")
                save_npz(clustering_matrix_file, clustering_matrix)

            # Remove contig_data_partition file
            contig_data_partition_file = join(self.args["--outdir"],
                                              "contig_data_partition.txt")
            os.remove(contig_data_partition_file)

        # Delete pyfastx index:
        os.remove(fasta + ".fxi")
        # Delete the temporary folder.
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
Example #12
def defined_overlapping_bins(
    overlap, hamming_distance, core_bins_contigs, core_bins_iterations
):
    """This function extract the overlapped bins

    From the Hamming distances between the core bins, the function identifies
    the overlapping bins and creates a dictionary with the list of contig IDs
    for each overlapping bin.

    Two core bins are considered overlapping if they have a percentage of
    identity greater than or equal to the given threshold.

    Parameters:
    -----------
    overlap : float
        Hamming distance threshold used to consider that two bins are
        overlapping.
    hamming_distance : scipy.sparse.csr.csr_matrix
        Matrix with all the previously computed hamming distance between two
        core bins.
    core_bins_contigs : dict
        Dictionary with the core bin ids as keys and the lists of contig ids of
        the core bins as values.
    core_bins_iterations : pandas.core.frame.DataFrame
        Table with the core bin ids and their values for each iteration.

    Returns:
    --------
    dict:
        A dictionary with the ids of the overlapping bins as keys and the lists
        of the ids of their contigs as values.
    """
    # Extract bins which are connected, i.e. bins with a Hamming distance
    # greater than or equal to the given threshold. The small offset is
    # necessary as Python floats are not exactly equal to the true value
    # (i.e. 0.1 -> 0.09999999999999998).
    connections = hamming_distance >= (overlap - 1e-10)
    overlapping_bins_id = sparse.csgraph.connected_components(
        connections, directed=False
    )[1]

    # Create a dictionary of the overlapping bins (IDs from the previous step)
    # with the IDs of their contigs as values.
    overlapping_bins = {}
    cc_id = 0
    # Iterate on each core bin.
    for oc_id in overlapping_bins_id:
        # Extract contig ID from the core bin.
        core_bin_contigs = core_bins_contigs[cc_id].copy()
        # Add the contig IDs to the overlapping bin.
        if oc_id + 1 not in overlapping_bins:
            overlapping_bins[oc_id + 1] = core_bin_contigs
        else:
            overlapping_bins[oc_id + 1] += core_bin_contigs
        cc_id += 1

    logger.info(
        "{0} overlapping bins were found.".format(len(overlapping_bins))
    )

    return overlapping_bins
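The overlapping-bin grouping boils down to thresholding the identity matrix (called hamming_distance here) and taking connected components. A self-contained illustration with three core bins (made-up values):

import numpy as np
from scipy import sparse
from scipy.sparse import csgraph

overlap = 0.9
# Fraction of iterations in common between 3 core bins (toy values).
hamming_distance = sparse.csr_matrix(np.array([
    [1.00, 0.95, 0.00],
    [0.95, 1.00, 0.00],
    [0.00, 0.00, 1.00],
]))
connections = hamming_distance >= (overlap - 1e-10)
labels = csgraph.connected_components(connections, directed=False)[1]
print(labels)  # [0 0 1] -> core bins 0 and 1 merge into one overlapping bin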
Example #13
def partition(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    iterations,
    network_file,
    outdir,
    fasta_dir,
    overlapping_parameter,
    resolution_parameter,
    size,
    temp_directory,
    threads,
):
    """Function to call the others functions to partition the network.

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to partition the network. Either leiden or louvain.
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build and save the clustering matrix.
    contig_data_file : str
        Path to the contig data table to update.
    iterations : int
        Number of iterations to use for the partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    fasta_dir : str
        Path to directory where to write the fasta files.
    overlapping_parameter : int
        Hamming distance threshold to use to merge bins (percentage).
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It will
        be a factor of the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher this
        parameter, the smaller the output bins will be.
    size : int
        Threshold size in base pair of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.

    Returns:
    --------
    str:
        Path to the saved partition clustering matrix file, or None if
        cluster_matrix is False.
    str:
        Path to the new contig data file with the bin information in it.
    """

    # Create partition folders in the temporary directory
    temp_directory = join(temp_directory, "partition")
    os.makedirs(temp_directory, exist_ok=True)
    temp_directory_clustering = join(temp_directory, "clustering")
    os.makedirs(temp_directory_clustering, exist_ok=True)
    temp_directory_bins = join(temp_directory, "partition_bins")
    os.makedirs(temp_directory_bins, exist_ok=True)

    # Perform the iterations of Louvain or Leiden to partition the network.
    logger.info("Start iterations:")
    if algorithm == "leiden":
        LEIDEN_PATH = os.environ["LEIDEN_PATH"]
        output_partition = leiden_iterations_java(
            network_file,
            iterations,
            resolution_parameter,
            temp_directory_clustering,
            LEIDEN_PATH,
        )
    elif algorithm == "louvain":
        LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
        output_partition = louvain_iterations_cpp(
            network_file,
            iterations,
            temp_directory_clustering,
            LOUVAIN_PATH,
        )
    else:
        logger.error('algorithm should be either "louvain" or "leiden"')
        raise ValueError

    # Detect core bins
    logger.info("Detect core bins:")
    (
        core_bins_contigs,
        core_bins_iterations,
    ) = detect_core_bins(output_partition, iterations)

    # Compute the Hamming distance between core bins.
    logger.info("Detect overlapping bins:")
    hamming_distance = get_hamming_distance(
        core_bins_iterations,
        iterations,
        threads,
    )

    # Define overlapping bins according to the threshold
    overlapping_bins = defined_overlapping_bins(
        overlapping_parameter,
        hamming_distance,
        core_bins_contigs,
        core_bins_iterations,
    )

    # Update the contigs_data_file.
    logger.info("Extract bins:")
    contigs_data, contigs_data_file = update_contigs_data(
        contig_data_file,
        core_bins_contigs,
        overlapping_bins,
        outdir,
    )

    # Generate Fasta file
    generate_fasta(
        assembly,
        overlapping_bins,
        contigs_data,
        size,
        fasta_dir,
        temp_directory_bins,
    )

    if cluster_matrix:
        # Build clustering matrix and save it.
        logger.info("Build  clustering matrix")
        clustering_matrix = build_clustering_matrix(
            core_bins_contigs, hamming_distance, len(contigs_data.ID)
        )
        # Save the clustering matrix
        clustering_matrix_file = join(outdir, "clustering_matrix_partition")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return clustering_matrix_file, contigs_data_file
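A hedged end-to-end call of the partition step. It assumes the LEIDEN_PATH environment variable points at the networkanalysis jar (required by leiden_iterations_java above) and that network.txt and contig_data_network.txt come from a previous metator network run; every path and value here is hypothetical:

import os

os.environ.setdefault("LEIDEN_PATH", "/opt/networkanalysis.jar")  # hypothetical

clustering_matrix_file, contigs_data_file = partition(
    algorithm="leiden",
    assembly="assembly.fa",
    cluster_matrix=True,
    contig_data_file="contig_data_network.txt",
    iterations=100,
    network_file="network.txt",
    outdir="metator_out",
    fasta_dir="overlapping_bin",
    overlapping_parameter=0.9,    # 90% identity threshold
    resolution_parameter=1.0,
    size=500000,                  # keep bins larger than 500 kb
    temp_directory="tmp",
    threads=8,
)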
Example #14
def louvain_iterations_cpp(network_file, iterations, tmp_dir, louvain_path):
    """Use the cpp original Louvain to partition the network.

    Parameters:
    -----------
    network_file : str
        Path to the network computed previously. The file is a 3-column,
        tab-separated table with the id of the first contig, the id of the
        second one, and the weight of the edge (normalized or not).
    iterations : int
        Number of iterations of the Louvain algorithm.
    tmp_dir : str
        Path to the temporary directory.
    louvain_path : str
        Path to the directory with louvain functions.

    Returns:
    --------
    dict:
        Dictionary with the contig id as key and the results of each iteration,
        separated by semicolons, as value.
    """

    # Check if the Louvain cpp binaries are available on the computer. Raise an
    # error if they are not found.
    if not mio.check_louvain_cpp(louvain_path):
        logger.error("Louvain implementation was not found.")
        logger.error(
            "You should have a LOUVAIN_PATH variable in your environnement"
        )
        raise NameError

    # Define temporary files and arguments for the Louvain calls and the paths
    # of the binaries to call.
    network_bin = join(tmp_dir, "net_bin")
    network_weight = join(tmp_dir, "net_weight")
    network_tree = join(tmp_dir, "net_tree")
    network_labels = join(tmp_dir, "labels.txt")
    level_louvain = join(tmp_dir, "level.txt")
    output = join(tmp_dir, "output_louvain_")
    louvain = join(louvain_path, "louvain")
    convert = join(louvain_path, "convert")
    hierarchy = join(louvain_path, "hierarchy")
    output_louvain = dict()

    # Create a dictionary of all the arguments
    louvain_args = {
        "net_txt": network_file,
        "net_bin": network_bin,
        "net_weight": network_weight,
        "net_tree": network_tree,
        "net_labels": network_labels,
        "level_file": level_louvain,
        "output": output,
        "level": 0,
        "iteration": 0,
        "convert": convert,
        "louvain": louvain,
        "hierarchy": hierarchy,
    }

    # Convert the file into a binary file for Louvain partitioning.
    cmd = (
        "{convert} -i {net_txt} -o {net_bin} -r {net_labels} -w {net_weight}"
    ).format(**louvain_args)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Create a dictionary of Louvain labels and original contig id.
    labels = dict()
    with open(louvain_args["net_labels"]) as label_file:
        for label in label_file:
            label = label.split()
            labels[label[1]] = int(label[0])

    # Run the iterations of Louvain
    for i in range(iterations):
        logger.info("Iteration in progress: {0}".format(i))

        louvain_args["iteration"] = i

        # Partition with weights using Louvain and compute the bin tree.
        cmd = ("{louvain} {net_bin} -l -1 -w {net_weight} > {net_tree}").format(
            **louvain_args
        )
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        cmd = ("{hierarchy} {net_tree} > {level_file}").format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        level_file = open(level_louvain, "r")
        louvain_args["level"] = level_file.readlines()[-1][6]
        level_file.close()

        cmd = (
            "{hierarchy} {net_tree} -l {level} > {output}{iteration}.txt"
        ).format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        # Save the results in a dictionary
        if i == 0:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] = result[1][:-1]

        else:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] += ";" + result[1][:-1]

    return output_louvain
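For reference, each per-iteration file written by the hierarchy binary is parsed as space-separated "renumbered_node community" lines, and the labels dictionary maps the renumbered node back to the original contig id. A toy illustration of that mapping (made-up values):

labels = {"0": 12, "1": 37}  # renumbered Louvain node -> original contig id
line = "0 3\n"               # one line of output_louvain_<i>.txt
node, community = line.split(" ")
print({labels[node]: community[:-1]})  # {12: '3'}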
Example #15
def generate_fasta(
    assembly, overlapping_bins, contigs_data, size, output_dir, tmpdir
):
    """Generate the fasta files of each bins from the assembly.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file of the original assembly.
    overlapping_bins : dict
        A dictionary with the ids of the overlapping bins as keys and the lists
        of the ids of their contigs as values.
    contigs_data : pandas.core.frame.DataFrame
        Table with all the information on the contigs, including their
        membership of the bins.
    size : int
        Threshold size chosen to write the bins.
    output_dir : str
        Path to the output directory where the fasta files of all the bins will
        be written.
    tmpdir : str
        Path to the temporary directory to write the temporary contigs list
        files.
    """

    nb_bins = 0
    length_bins = 0
    # For each bin create a list of the contigs and extract them from the
    # assembly to create a new fasta file with only the bin.
    for bin_id in overlapping_bins:
        # Extract the list of the contigs from the contigs data file.
        list_contigs_id = overlapping_bins[bin_id]
        list_contigs_name = []
        # Test if the bin is bigger than the size threshold given.
        length_bin = contigs_data.loc[
            list_contigs_id[0] - 1, "Overlapping_bin_size"
        ]
        if length_bin >= size:
            nb_bins += 1
            length_bins += length_bin
            for contig_id in list_contigs_id:
                list_contigs_name.append(
                    contigs_data.loc[contig_id - 1, "Name"]
                )
            # Define the output file.
            output_file = join(output_dir, "MetaTOR_{0}_0.fa".format(bin_id))
            # Create the fasta file.
            contigs_file = join(tmpdir, "MetaTOR_{0}_0.txt".format(bin_id))
            with open(contigs_file, "w") as f:
                for contig_name in list_contigs_name:
                    f.write("%s\n" % contig_name)
            cmd = "pyfastx extract {0} -l {1} > {2}".format(
                assembly, contigs_file, output_file
            )
            process = sp.Popen(cmd, shell=True)
            process.communicate()
    logger.info("{0} bins have been extracted".format(nb_bins))
    logger.info(
        "Total size of the extracted bins: {0}Mb".format(
            round(length_bins / 10 ** 6, 3)
        )
    )
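
As a usage sketch (all paths are hypothetical), the extraction step amounts to writing one contig name per line and calling pyfastx on that list, mirroring the command built above:

# tmp/MetaTOR_3_0.txt holds one contig name per line, e.g. NODE_12 and NODE_87.
# The sequences are then pulled from the assembly into the bin fasta:
#   pyfastx extract assembly.fa -l tmp/MetaTOR_3_0.txt > out/MetaTOR_3_0.fa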
Example #16
0
def get_contact_pairs(
    for_in,
    rev_in,
    index,
    assembly,
    aligner,
    min_qual,
    start,
    depth_file,
    enzyme,
    out_dir,
    tmp_dir,
    n_cpu,
):
    """General function to do the whole alignment of both fastq.

    The Function write at the output directory location given as an argument and
    return a tsv file of the aligned reads with 9 columns: ReadID, ContigA,
    Position_startA, Position_endA, StrandA, ContigB, Position_startB,
    Position_endB, StrandB. The name of the file will be alignment.txt.

    Two start stages are possible, from fastq or bam files.

    Parameters:
    -----------
    for_in : str
        Path to input forward fastq or bam file to align. If multiple files
        are given, list of paths separated by commas.
    rev_in : str
        Path to input reverse fastq or bam file to align. If multiple files
        are given, list of paths separated by commas.
    index : str
        Path to the bowtie2 index of the assembly.
    assembly : str
        The initial assembly path acting as the alignment file's reference
        assembly.
    aligner : str
        Either 'bowtie2' or 'bwa', the aligner used or to be used to map the reads.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    start : str
        Either fastq or bam. Starting point for the pipeline.
    depth_file : str or None
        Path to the depth.txt file from jgi_summarize_bam_contig_depths from
        Metabat2 Software.
    enzyme : str or None
        String that contains the names of the enzymes separated by commas.
    out_dir : str
        Path to directory where to write the output file.
    tmp_dir : str
        Path where temporary files should be written.
    n_cpu : int
        The number of CPUs to use for the alignment.

    Returns:
    --------
    list of str:
        List of paths to the files containing the alignment data of the
        pairs: ReadID, ContigA, Position_startA, Position_endA, StrandA,
        ContigB, Position_startB, Position_endB, StrandB.
    dict:
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    dict:
        Dictionary of hit information on each contig.
    """

    # Iterates on all the input files:
    for_list = for_in.split(",")
    rev_list = rev_in.split(",")
    out_file_list = []
    total_aligned_pairs = 0

    # Create the contig data dictionary and hit data from each alignment.
    nb_alignment = len(for_list)
    contig_data, hit_data = mtn.create_contig_data(assembly, nb_alignment,
                                                   depth_file, enzyme)
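
    # Hypothetical shape of one contig_data entry, using the keys listed in
    # the docstring above (values are only illustrative):
    # contig_data["NODE_12"] = {"id": 12, "length": 45213, "GC": 41.2,
    #                           "hit": 0, "coverage": 0.0}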

    for i in range(len(for_list)):
        for_in = for_list[i]
        try:
            rev_in = rev_list[i]
        except IndexError:
            rev_in = None
        name = "alignment_" + str(i)
        out_file = join(out_dir, "alignment_" + str(i) + ".pairs")
        out_file_list.append(out_file)

        # Align if necessary
        if start == "fastq":
            if aligner == "bowtie2":
                # Create files to save the alignment.
                alignment_for = join(out_dir, name + "_for.bam")
                alignment_rev = join(out_dir, name + "_rev.bam")

                # Align the forward reads
                logger.info("Alignment of %s:", for_in)
                align(for_in, index, aligner, alignment_for, n_cpu)

                # Align the reverse reads
                logger.info("Alignment of %s:", rev_in)
                align(rev_in, index, aligner, alignment_rev, n_cpu)
            elif aligner == "bwa":
                # Create a file to save the alignment.
                alignment = join(out_dir, name + ".bam")
                logger.info("Alignment of %s and %s:", for_in, rev_in)
                align(for_in, index, aligner, alignment, n_cpu, rev_in)

        elif start == "bam":
            if aligner == "bowtie2":
                logger.info("Processing %s and %s:", for_in, rev_in)
                alignment_for = for_in
                alignment_rev = rev_in
            elif aligner == "bwa":
                alignment = for_in

        else:
            logger.error("Start argument should be either 'fastq' or 'bam'.")
            raise ValueError

        if aligner == "bowtie2":
            # Create files to save the alignment.
            alignment_temp_for = join(tmp_dir, name + "_for_temp.txt")
            alignment_temp_rev = join(tmp_dir, name + "_rev_temp.txt")

            # Filter the aligned and non-aligned reads from the forward and
            # reverse bam files.
            aligned_reads_for = process_bamfile(alignment_for, min_qual,
                                                alignment_temp_for)
            aligned_reads_rev = process_bamfile(alignment_rev, min_qual,
                                                alignment_temp_rev)
            logger.info(
                "%s forward reads aligned and %s reverse reads aligned",
                aligned_reads_for,
                aligned_reads_rev,
            )

            # Merge alignments to create a pairs file.
            logger.info("Merging the pairs:")
            n_pairs = merge_alignment(alignment_temp_for, alignment_temp_rev,
                                      contig_data, out_file)
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

        # Case where the alignment is a bam file produced by bwa.
        if aligner == "bwa":
            n_pairs = process_bwa_bamfile(alignment, min_qual, contig_data,
                                          out_file)
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

    if len(out_file_list) > 1:
        logger.info("TOTAL PAIRS MAPPED: %s", total_aligned_pairs)

    return out_file_list, contig_data, hit_data
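
For reference, a hedged example of one line of a resulting .pairs file, with the nine columns listed in the docstring (all values are hypothetical):

# ReadID     ContigA  Position_startA  Position_endA  StrandA  ContigB  Position_startB  Position_endB  StrandB
# read_00151 NODE_12  1507             1602           +        NODE_87  20310            20405          -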
Example #17
0
def alignment_to_contacts(
    alignment_files,
    contig_data,
    hit_data,
    out_dir,
    output_file_network,
    output_file_contig_data,
    tmp_dir,
    n_cpus,
    normalization,
    self_contacts,
):
    """Generates a network file (in edgelist form) from an alignment. Contigs
    are the network nodes and the edges are the contact counts.

    The network is in a strict barebone form so that it can be reused and
    imported quickly into other applications. Verbose information about
    every single node in the network is written to a 'contig data' file.

    Parameters:
    -----------
    alignment_files : list of str
        List of path to the alignment file(s) used as input.
    contig_data : dict
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    hit_data : dict
        Dictionary of hit information on each contig.
    out_dir : str
        The output directory to write the network and chunk data into.
    output_file_network : str, optional
        The specific file name for the output network file. Default is
        'network.txt'
    output_file_contig_data : str, optional
        The specific file name for the output chunk data file. Default is
        'idx_contig_length_GC_hit_cov.txt'
    tmp_dir : str
        Path to the temporary directory. Default is in the working directory.
    n_cpus : int
        Number of CPUs to use for the computation.
    normalization : str
        If None, do not normalize the count of a contact by the geometric mean
        of the coverages of the contigs. Otherwise it's the type of
        normalization.
    self_contacts : bool
        Whether to return network with self contact. Default is False.

    Returns:
    --------
    str:
        Path to the network file.
    str:
        Path to the verbose contig data file.
    """

    # Create the temporary and output files that will be needed.
    precompute_network_file = join(tmp_dir, "precompute_network_file.txt")
    pre_network_sorted_file = join(tmp_dir, "tmp_network_sorted.txt")
    network_file = join(out_dir, output_file_network)
    contig_data_file = join(out_dir, output_file_contig_data)
    hit_data_file = join(out_dir, "hit_data_alignment.txt")
    nb_alignment = len(alignment_files)
    logger.info("New time course network")

    # Create a contact file easily readable for counting the contacts.
    contig_data, out_files_list = precompute_network(
        alignment_files,
        contig_data,
        hit_data,
        precompute_network_file,
        tmp_dir,
        self_contacts,
    )

    # Compute network
    compute_network(
        precompute_network_file,
        network_file,
        contig_data,
        tmp_dir,
        pre_network_sorted_file,
        n_cpus,
        normalization,
    )

    # Compute sample network
    for i, precompute_network_file_sample in enumerate(out_files_list):
        network_file_sample = join(out_dir, "network_{0}.txt".format(i))
        pre_network_sorted_file = join(tmp_dir,
                                       "tmp_network_sorted_{0}.txt".format(i))
        compute_network(
            precompute_network_file_sample,
            network_file_sample,
            contig_data,
            tmp_dir,
            pre_network_sorted_file,
            n_cpus,
            normalization,
        )

    # Write the data from the contigs
    write_contig_data(contig_data, contig_data_file)
    if nb_alignment > 1:
        write_hit_data(hit_data, hit_data_file)

    return network_file, contig_data_file
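
A hedged sketch of the edgelist written to the network file: each line pairs two contig ids with their (possibly normalized) contact count, consistent with the docstring above (values are hypothetical):

# contig_id_A   contig_id_B   contacts
# 12            87            143
# 12            311           5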
Example #18
0
    def execute(self):

        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"],
                        ("metator_network_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Cast integer arguments to int.
        min_qual = int(self.args["--min-quality"])

        # Define boolean variables:
        self_contacts = self.args["--self-contacts"]

        # Check if forward and reverse arguments are given:
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary for fastq with %s start and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Check that the normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError
        if self.args["--start"] not in ["fastq", "bam", "pair", "network"]:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError
        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what the reference is. If a fasta is given, build the index.
        # If an aligner index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                # The index is only needed when starting from fastq; it can be skipped otherwise.
                if self.args["--start"] == "fastq":
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a %s index or a fasta.",
                    self.args["--aligner"],
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Print information about the workflow:
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Enzyme: %s", self.args["--enzyme"])
        logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Minimum mapping quality: %s", self.args["--min-quality"])

        # Do not align if starting from pairs.
        if self.args["--start"] == "pair":
            alignment_files = self.args["--forward"].split(",")
            nb_alignment = len(alignment_files)
            contig_data, hit_data = mtn.create_contig_data(
                fasta,
                nb_alignment,
                self.args["--depth"],
                self.args["--enzyme"],
            )

        else:
            # Align paired-end reads with the chosen aligner.
            alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                self.args["--forward"],
                self.args["--reverse"],
                index,
                fasta,
                self.args["--aligner"],
                min_qual,
                self.args["--start"],
                self.args["--depth"],
                self.args["--enzyme"],
                self.args["--outdir"],
                tmp_dir,
                self.args["--threads"],
            )

        # Build the network
        mtn.alignment_to_contacts(
            alignment_files,
            contig_data,
            hit_data,
            self.args["--outdir"],
            "network.txt",
            "contig_data_network.txt",
            tmp_dir,
            self.args["--threads"],
            self.args["--normalization"],
            self_contacts,
        )

        # Delete the temporary folder
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
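
For context, a minimal sketch of the docopt-style argument dictionary that drives execute(); only the option names read in the code above are used, every value is hypothetical, and the surrounding command class is not shown in this snippet:

# Hypothetical argument dictionary; values are placeholders only.
args = {
    "--assembly": "assembly.fa",
    "--forward": "lib1_for.fq.gz,lib2_for.fq.gz",
    "--reverse": "lib1_rev.fq.gz,lib2_rev.fq.gz",
    "--aligner": "bowtie2",
    "--start": "fastq",
    "--normalization": "empirical_hit",
    "--min-quality": "30",
    "--enzyme": None,
    "--depth": None,
    "--self-contacts": False,
    "--threads": "8",
    "--outdir": "out",
    "--tmpdir": None,
    "--no-clean-up": False,
}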