Example No. 1
def flexible_hic_saver(mat,
                       out_prefix,
                       frags=None,
                       chroms=None,
                       hic_fmt="graal"):
    """
    Save Hi-C objects to the desired file format.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        Sparse contact matrix to save.
    out_prefix : str
        Prefix path for the output file(s); the extension is appended based on
        the chosen format.
    frags : pandas.DataFrame or None
        Table of fragment information.
    chroms : pandas.DataFrame or None
        Table of chromosome / contig information.
    hic_fmt : str
        Output format. Can be one of "graal" for graal-compatible COO format,
        "bg2" for 2D bedgraph format, or "cool" for cooler-compatible format.
    """
    if hic_fmt == "graal":
        save_sparse_matrix(mat, out_prefix + ".mat.tsv")
        try:
            frags.to_csv(out_prefix + ".frag.tsv", sep="\t", index=False)
        except AttributeError:
            logger.warning(
                "Could not create fragments_list.txt from input files")
        try:
            chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False)
        except AttributeError:
            logger.warning(
                "Could not create info_contigs.txt from input files")
    elif hic_fmt == "cool":
        frag_sizes = frags.end_pos - frags.start_pos
        # Median absolute deviation of fragment sizes: 0 means fixed bin size
        size_mad = np.median(np.abs(frag_sizes - np.median(frag_sizes)))
        bin_type = 'variable' if size_mad else 'fixed'
        try:
            save_cool(out_prefix + ".cool",
                      mat,
                      frags,
                      metadata={
                          "hicstuff": __version__,
                          'bin-type': bin_type
                      })
        except NameError:
            raise NameError("frags is required to save a cool file")
    elif hic_fmt == "bg2":
        try:
            save_bedgraph2d(mat, frags, out_prefix + ".bg2")
        except NameError:
            raise NameError("frags is required to save a bg2 file")
    else:
        raise ValueError("Unknown output format: {0}".format(hic_fmt))
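A minimal usage sketch for the saver above, assuming the function and the save_sparse_matrix / save_cool / save_bedgraph2d helpers it calls are in scope; the toy matrix, tables and output prefix are made-up values:

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

# Toy 3x3 upper-triangle contact matrix
mat = coo_matrix(
    (np.array([5, 2, 1]), (np.array([0, 0, 1]), np.array([0, 1, 2]))),
    shape=(3, 3),
)
# Toy tables mimicking graal's fragments_list.txt and info_contigs.txt
frags = pd.DataFrame({
    "id": [1, 2, 3],
    "chrom": ["chr1"] * 3,
    "start_pos": [0, 1000, 2000],
    "end_pos": [1000, 2000, 3000],
})
chroms = pd.DataFrame(
    {"contig": ["chr1"], "length": [3000], "n_frags": [3], "cumul_length": [0]}
)

# Writes sample.mat.tsv, sample.frag.tsv and sample.chr.tsv
flexible_hic_saver(mat, "sample", frags=frags, chroms=chroms, hic_fmt="graal")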
Example No. 2
def normalize_distance_law(xs, ps, inf=3000, sup=None):
    """Normalize the distance in order to have the sum of the ps values between
    'inf' (default value is 3kb) until the end of the array equal to one and
    limit the effect of coverage between two conditions/chromosomes/arms when
    you compare them together. If we have a list of ps, it will normalize until
    the length of the shorter object or the value of sup, whichever is smaller.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to 
        have the same shape.
    inf : integer
        Lower bound of the interval over which the normalization is applied.
    sup : integer
        Upper bound of the interval over which the normalization is applied.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension
    if np.shape(xs) != np.shape(ps):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    # Define the length of shortest chromosomes as a lower bound for the sup boundary
    min_xs = len(min(xs, key=len))
    normed_ps = [None] * len(ps)
    if sup is None:
        sup = np.inf
    for chrom_id, chrom_ps in enumerate(ps):
        # Iterate on the different ps to normalize each of them separately
        chrom_sum = 0
        # Change the last value to have something continuous because the last
        # one is much bigger (computed on matrix corner = triangle instead of trapezoid).
        chrom_ps[-1] = chrom_ps[-2]
        for bin_id, bin_value in enumerate(chrom_ps):
            # Compute normalization factor based on values between inf and sup
            # Sup will be whatever is smaller between user-provided sup and length of
            # the shortest chromosome
            if (xs[chrom_id][bin_id] > inf) and (xs[chrom_id][bin_id] <
                                                 sup) and (bin_id < min_xs):
                chrom_sum += bin_value
        if chrom_sum == 0:
            chrom_sum += 1
            logger.warning("No values of p(s) in one segment")
        # Make the normalisation
        normed_ps[chrom_id] = np.array(ps[chrom_id]) / chrom_sum
    return normed_ps
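A small worked example of the normalization above, assuming the function is in scope; bins and p(s) values are made up. Note that the function overwrites the last value of each input array in place:

import numpy as np

# Two chromosomes with log-binned distances (bp) and raw p(s) values
xs = [np.array([1000, 5000, 10000, 50000]), np.array([1000, 5000, 10000, 20000])]
ps = [np.array([8.0, 4.0, 2.0, 6.0]), np.array([10.0, 3.0, 2.0, 1.0])]

normed = normalize_distance_law(xs, ps, inf=3000)
# For the first chromosome, the last value is first replaced by 2.0; the bins
# with x > 3000 then sum to 8.0, so normed[0] is [1.0, 0.5, 0.25, 0.25].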
Example No. 3
def normalize_distance_law(xs, ps, inf=3000):
    """Normalize the distance in order to have the sum of the ps values between
    'inf' (default value is 3kb) until the end of the array equal to one and
    limit the effect of coverage between two conditions/chromosomes/arms when
    you compare them together. If we have a list of ps, it will normalize until
    the length of the shorter object.

    Parameters
    ----------
    xs : list of numpy.ndarray
        list of logbins corresponding to the ps.
    ps : list of numpy.ndarray
        Average ps or list of ps of the chromosomes/arms. xs and ps have to 
        have the same shape.
    inf : integer
        Lower bound of the interval over which the normalization is applied.

    Returns
    -------
    list of numpy.ndarray :
        List of ps each normalized separately.
    """
    # Sanity check: xs and ps have the same dimension
    if np.shape(xs) != np.shape(ps):
        logger.error("xs and ps should have the same dimension.")
        sys.exit(1)
    # Take the min of xs as superior limit to choose the limits of the
    # interval use for the normalisation
    min_xs = len(min(xs, key=len))
    normed_ps = [None] * len(ps)
    for j, my_list in enumerate(ps):
        # Iterate on the different ps to normalize each of them separately
        sum_values = 0
        # Change the last value to have something continuous because the last
        # one is much bigger.
        my_list[-1] = my_list[-2]
        for i, value in enumerate(my_list):
            # Keep only the values above inf and below the length of the
            # shortest object given in the list
            if (xs[j][i] > inf) and (i < min_xs):
                sum_values += value
        if sum_values == 0:
            sum_values += 1
            logger.warning("No values of p(s) in one segment")
        # Make the normalisation
        normed_ps[j] = np.array(ps[j]) / sum_values
    return normed_ps
Example No. 4
def to_dade_matrix(M, annotations="", filename=None):
    """Returns a Dade matrix from input numpy matrix. Any annotations are added
    as header. If filename is provided and valid, said matrix is also saved
    as text.
    """

    n, m = M.shape
    A = np.zeros((n + 1, m + 1))
    A[1:, 1:] = M
    if not len(annotations):
        # One blank label per row/column of the padded matrix
        annotations = np.array(["" for _ in range(n + 1)], dtype=str)
    A[0, :] = annotations
    A[:, 0] = annotations.T
    if filename:
        try:
            np.savetxt(filename, A, fmt="%i")
            logger.info("I saved input matrix in dade format as {0}".format(
                str(filename)))
        except ValueError as e:
            logger.warning("I couldn't save input matrix.")
            logger.warning(str(e))

    return A
Example No. 5
def get_chr_segment_bins_index(fragments, centro_file=None, rm_centro=0):
    """Get the index positions of the start and end bins of different 
    chromosomes, or arms if the centromers position have been given from the
    fragments file made by hicstuff.
    
    Parameters
    ----------
    fragments : pandas.DataFrame
        Table containing the ID of the fragment in the first column, the name
        of the chromosome in the second, and the start and end positions of
        the fragment in the third and fourth columns. The table has no header.
        (File like the 'fragments_list.txt' from hicstuff.)
    centro_file : None or str
        None, or path to a file with the genomic positions of the centromeres,
        space-separated and in the same order as the chromosomes. The file has
        only one line.
    rm_centro : int
        If a value is given, contacts close to the centromeres will be removed.
        As many kb as the given value will be removed. Default is zero.
        
    Returns
    -------
    list of floats :
        The start and end indices of chromosomes/arms to compute the distance
        law on each chromosome/arm separately.
    """
    # Get bins where chromosomes start
    chr_start_bins = np.where(fragments == 0)[0]
    # Create a list of same length for the end of the bins
    chr_end_bins = np.zeros(len(chr_start_bins))
    # Get bins where chromosomes end
    for i in range(len(chr_start_bins) - 1):
        chr_end_bins[i] = chr_start_bins[i + 1]
    chr_end_bins[-1] = len(fragments.iloc[:, 0])
    # Combine start and end of bins in a single array. Values are the id of the
    # bins
    chr_segment_bins = np.sort(np.concatenate((chr_start_bins, chr_end_bins)))
    if centro_file is not None:
        # Read the centromere file
        with open(centro_file, "r", newline="") as centro:
            centro = csv.reader(centro, delimiter=" ")
            centro_pos = next(centro)
        # Sanity check: as many chroms as centromeres
        if len(chr_start_bins) != len(centro_pos):
            logger.warning(
                "Number of chromosomes and centromeres differ, centromeres position are not taking into account."
            )
            centro_file = None
    if centro_file is not None:
        # Get bins of centromeres
        centro_bins = np.zeros(2 * len(centro_pos))
        for i in range(len(chr_start_bins)):
            if (i + 1) < len(chr_start_bins):
                subfrags = fragments[chr_start_bins[i]:chr_start_bins[i + 1]]
            else:
                subfrags = fragments[chr_start_bins[i]:]
            # index of last fragment starting before centro in same chrom
            centro_bins[2 * i] = chr_start_bins[i] + max(
                np.where(subfrags["start_pos"][:] //
                         (int(centro_pos[i]) - rm_centro) == 0)[0])
            centro_bins[2 * i + 1] = chr_start_bins[i] + max(
                np.where(subfrags["start_pos"][:] //
                         (int(centro_pos[i]) + rm_centro) == 0)[0])
        # Combine centro and chrom bins into a single array. Values are the id
        # of the bins started and ending the arms.
        chr_segment_bins = np.sort(
            np.concatenate((chr_start_bins, chr_end_bins, centro_bins)))
    return list(chr_segment_bins)
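A minimal sketch of calling the function above without a centromere file, assuming it is in scope; the fragment table is a made-up two-chromosome example where chromosome starts are the rows with start_pos equal to 0:

import pandas as pd

fragments = pd.DataFrame({
    "id": [1, 2, 3, 1, 2, 3],
    "chrom": ["chr1"] * 3 + ["chr2"] * 3,
    "start_pos": [0, 1000, 2000, 0, 1500, 3000],
    "end_pos": [1000, 2000, 3000, 1500, 3000, 4500],
})

# Start/end bin indices of each chromosome, expected: [0.0, 3.0, 3.0, 6.0]
chr_segment_bins = get_chr_segment_bins_index(fragments)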
Example No. 6
def flexible_hic_loader(mat,
                        fragments_file=None,
                        chroms_file=None,
                        quiet=False):
    """
    Wrapper function to load COO, bg2 or cool input and return the same output.
    The COO format requires the fragments_file and chroms_file options. The bg2
    format can infer bin_size if it is fixed. When providing a bg2 matrix with
    uneven fragment lengths, fragments_file should be provided as well,
    otherwise empty bins will be missing from the output.
    
    Parameters
    ----------
    mat : str
        Path to the matrix in graal, bedgraph2 or cool format.
    fragments_file : str or None
        Path to the file with fragments information (fragments_list.txt).
        Only required if the matrix is in graal format.
    chroms_file : str or None
        Path to the file with chromosome information (info_contigs.txt). Only required
        if the matrix is in graal format.
    quiet : bool
        If True, will silence warnings for empty outputs.

    Returns
    -------
    mat : scipy.sparse.coo_matrix
        Sparse upper triangle Hi-C matrix.
    frags : pandas.DataFrame or None
        Table of fragment information. None if the information was not provided.
    chroms : pandas.DataFrame or None
        Table of chromosome/contig information. None if the information was not
        provided.
    """
    hic_format = get_hic_format(mat)
    # Load cool based on file extension
    if hic_format == "cool":
        mat, frags, chroms = load_cool(mat)
    # Use the first line to determine COO / bg2 format
    if hic_format == "bg2":
        # Use the frags file to define bins if available
        if fragments_file is not None:
            mat, frags, chroms = load_bedgraph2d(mat,
                                                 fragments_file=fragments_file)
        else:
            # Guess if bin size is fixed based on MAD
            bg2 = pd.read_csv(mat, sep="\t")
            sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1])
            size_mad = ss.median_abs_deviation(sizes, scale='normal')
            # Use only the bg2
            if size_mad > 0:
                mat, frags, chroms = load_bedgraph2d(mat)
                logger.warning(
                    "Input is a bedgraph2d file with uneven bin size, "
                    "but no fragments_file was provided. Empty bins will "
                    "be missing from the output. To avoid this, provide a "
                    "fragments file.")
            # Use fixed bin size
            else:
                mat, frags, chroms = load_bedgraph2d(mat,
                                                     bin_size=int(
                                                         np.median(sizes)))

    elif hic_format == "graal":
        mat = load_sparse_matrix(mat)
        try:
            frags = pd.read_csv(fragments_file, sep="\t")
        except ValueError:
            if not quiet:
                logger.warning(
                    "fragments_file was not provided when "
                    "loading a matrix in COO/graal format. frags will be None."
                )
            frags = None
        try:
            chroms = pd.read_csv(chroms_file, sep="\t")
        except ValueError:
            if not quiet:
                logger.warning(
                    "chroms_file was not provided when "
                    "loading a matrix in COO/graal format. chroms will be None."
                )

            chroms = None

    # Ensure the matrix is upper triangle symmetric
    if mat.shape[0] == mat.shape[1]:
        if (abs(mat - mat.T) > 1e-10).nnz > 0:
            mat = mat + tril(mat, k=-1).T
        mat = triu(mat, format="coo")

    return mat, frags, chroms
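A usage sketch for the loader above, assuming the function and its load_* helpers are in scope; the file names are hypothetical graal and cool inputs:

# Load a graal COO matrix together with its companion tables
mat, frags, chroms = flexible_hic_loader(
    "abs_fragments_contacts_weighted.txt",
    fragments_file="fragments_list.txt",
    chroms_file="info_contigs.txt",
)

# Load a cool file: fragment and chromosome tables are read from the file itself
mat, frags, chroms = flexible_hic_loader("sample.cool")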
Example No. 7
def load_bedgraph2d(filename, bin_size=None, fragments_file=None):
    """
    Loads matrix and fragment information from a 2D bedgraph file. Note this
    function assumes chromosomes are ordered alphabetically.
    
    Parameters
    ----------
    filename : str
        Path to the bedgraph2D file.
    bin_size : int
        The size of bins in the case of fixed bin size.
    fragments_file : str
        Path to a fragments file to explicitly provide fragment positions.
        If the matrix does not have a fixed bin size, this prevents errors.
    
    Returns
    -------
    mat : scipy.sparse.coo_matrix
        The Hi-C contact map as the upper triangle of a symmetric matrix, in
        sparse format.
    frags : pandas.DataFrame
        The list of fragments/bins present in the matrix with their genomic
        positions.
    """
    bed2d = pd.read_csv(filename, sep="\t", header=None)
    chrom_sizes = {}
    if bin_size is not None:
        # If bin size is provided, retrieve chromosome lengths; this will be
        # used when regenerating bin coordinates
        chroms_left = bed2d[[3, 5]]
        chroms_left.columns = [0, 2]
        chroms = (pd.concat([bed2d[[0, 2]],
                             chroms_left]).groupby([0], sort=False).max())
        for chrom, size in zip(chroms.index, np.array(chroms)):
            chrom_sizes[chrom] = size[0]
    elif fragments_file is None:
        logger.warning(
            "Please be aware that not all information can be restored from a "
            "bg2 file without fixed bin size; fragments without any contact "
            "will be lost")
    # Get all possible fragment chrom-positions into an array
    frag_pos = np.vstack(
        [np.array(bed2d[[0, 1, 2]]),
         np.array(bed2d[[3, 4, 5]])])
    # Sort by position (least important, col 1)
    frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")]
    # Then by chrom (most important, col 0)
    frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")]
    # Get unique names for fragments (chrom+pos)
    ordered_frag_pos = (pd.DataFrame(frag_pos).drop_duplicates().reset_index(
        drop=True))
    frag_pos_a = bed2d[[0, 1]].apply(lambda x: tuple(x), axis=1)
    frag_pos_b = bed2d[[3, 4]].apply(lambda x: tuple(x), axis=1)
    # If fragments file is provided, use fragments positions to indices mapping
    if fragments_file is not None:
        frags = pd.read_csv(fragments_file, delimiter="\t")
        frag_map = frags.apply(lambda x: (str(x.chrom), x.start_pos), axis=1)
        frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_map)}
    # If fixed fragment size available, use it to reconstruct original
    # fragments ID (even if they are absent from the bedgraph file).
    elif bin_size is not None:
        frag_map = {}
        chrom_frags = []
        for chrom, size in chrom_sizes.items():
            prev_frags = len(frag_map)
            for bin_id, bin_pos in enumerate(range(0, size, bin_size)):
                frag_map[(chrom, bin_pos)] = bin_id + prev_frags
            n_bins = size // bin_size
            chrom_frags.append(
                pd.DataFrame({
                    "id":
                    range(1, n_bins + 1),
                    "chrom":
                    np.repeat(chrom, n_bins),
                    "start_pos":
                    range(0, size, bin_size),
                    "end_pos":
                    range(bin_size, size + bin_size, bin_size),
                }))
        frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True)
        frags.insert(loc=3,
                     column="size",
                     value=frags.end_pos - frags.start_pos)
    # If None available, guess fragments indices from bedgraph (potentially wrong)
    else:
        frag_map = {(v[0], v[1]): i
                    for i, v in ordered_frag_pos.iloc[:, [0, 1]].iterrows()}
        frags = ordered_frag_pos.copy()
        frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1]
        frags.insert(loc=0, column="id", value=0)
        frags.id = frags.groupby([0], sort=False).cumcount() + 1
        frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"]
    # Match bin indices to their names
    frag_id_a = np.array(list(map(lambda x: frag_map[x], frag_pos_a)))
    frag_id_b = np.array(list(map(lambda x: frag_map[x], frag_pos_b)))
    contacts = np.array(bed2d.iloc[:, 6].tolist())
    # Use index to build matrix
    n_frags = len(frag_map.keys())
    mat = coo_matrix((contacts, (frag_id_a, frag_id_b)),
                     shape=(n_frags, n_frags))

    # Get size of each chromosome in basepairs
    chromsizes = frags.groupby(
        "chrom", sort=False).apply(lambda x: np.int64(max(x.end_pos)))
    chrom_bins = frags.groupby("chrom", sort=False).size()
    # Shift chromsizes by one to get starting bin, first one is zero
    # Make chromsize cumulative to get start bin of each chrom
    # Get chroms into a 1D array of bin starts
    chrom_start = chrom_bins.shift(1, fill_value=0).cumsum()
    chroms = pd.DataFrame({
        "contig": chromsizes.index,
        "length": chromsizes.values,
        "n_frags": chrom_bins,
        "cumul_length": chrom_start,
    })
    return mat, frags, chroms
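Two hedged usage sketches for the bg2 loader above, assuming the function is in scope; file names are hypothetical:

# Fixed 5 kb bins: bin coordinates are regenerated from chromosome lengths
mat, frags, chroms = load_bedgraph2d("matrix.bg2", bin_size=5000)

# Uneven fragment sizes: pass the fragments file so empty bins are not lost
mat, frags, chroms = load_bedgraph2d(
    "matrix.bg2", fragments_file="fragments_list.txt"
)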
Example No. 8
def dade_to_graal(
    filename,
    output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME,
    output_dir=None,
):
    """Convert a matrix from DADE format (https://github.com/scovit/dade)
    to a graal-compatible format. Since DADE matrices contain both fragment
    and contact information, all files are generated at the same time.
    """

    with open(output_matrix, "w") as sparse_file:
        sparse_file.write("id_frag_a\tid_frag_b\tn_contact\n")
        with open(filename) as file_handle:
            first_line = file_handle.readline()
            for row_index, line in enumerate(file_handle):
                dense_row = np.array(line.split("\t")[1:], dtype=np.int32)
                for col_index in np.nonzero(dense_row)[0]:
                    line_to_write = "{}\t{}\t{}\n".format(
                        row_index, col_index, dense_row[col_index])
                    sparse_file.write(line_to_write)

    header = first_line.split("\t")
    bin_type = header[0]
    if bin_type == '"RST"':
        logger.info("I detected fragment-wise binning")
    elif bin_type == '"BIN"':
        logger.info("I detected fixed size binning")
    else:
        logger.warning(("Sorry, I don't understand this matrix's "
                        "binning: I read {}".format(str(bin_type))))

    header_data = [
        header_elt.replace("'", "").replace('"', "").replace("\n",
                                                             "").split("~")
        for header_elt in header[1:]
    ]

    (
        global_frag_ids,
        contig_names,
        local_frag_ids,
        frag_starts,
        frag_ends,
    ) = np.array(list(zip(*header_data)))

    frag_starts = frag_starts.astype(np.int32) - 1
    frag_ends = frag_ends.astype(np.int32) - 1
    frag_lengths = frag_ends - frag_starts

    total_length = len(global_frag_ids)

    with open(output_contigs, "w") as info_contigs:

        info_contigs.write("contig\tlength\tn_frags\tcumul_length\n")

        cumul_length = 0

        for contig in collections.OrderedDict.fromkeys(contig_names):

            length_tig = np.sum(frag_lengths[contig_names == contig])
            n_frags = collections.Counter(contig_names)[contig]
            line_to_write = "%s\t%s\t%s\t%s\n" % (
                contig,
                length_tig,
                n_frags,
                cumul_length,
            )
            info_contigs.write(line_to_write)
            cumul_length += n_frags

    with open(output_frags, "w") as fragments_list:

        fragments_list.write("id\tchrom\tstart_pos\tend_pos"
                             "\tsize\tgc_content\n")
        bogus_gc = 0.5

        for i in range(total_length):
            line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                int(local_frag_ids[i]) + 1,
                contig_names[i],
                frag_starts[i],
                frag_ends[i],
                frag_lengths[i],
                bogus_gc,
            )
            fragments_list.write(line_to_write)
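A usage sketch, assuming the function above is in scope; input and output file names are hypothetical:

# Convert a DADE matrix into the three graal files
dade_to_graal(
    "dade_matrix.txt",
    output_matrix="abs_fragments_contacts_weighted.txt",
    output_contigs="info_contigs.txt",
    output_frags="fragments_list.txt",
)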
Example No. 9
def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"):
    """
    Sort a pairs file in batches using UNIX sort.

    Parameters
    ----------
    in_file : str
        Path to the unsorted input file
    out_file : str
        Path to the sorted output file.
    keys : list of str
        list of columns to use as sort keys. Each column can be one of readID,
        chr1, pos1, chr2, pos2, frag1, frag2. Key priorities are according to
        the order in the list.
    tmp_dir : str
        Path to the directory where temporary files will be created. Defaults
        to current directory.
    threads : int
        Number of parallel sorting threads.
    buffer : str
        Buffer size used for sorting. Consists of a number and a unit.
    """
    # TODO: Write a pure python implementation to drop the GNU coreutils dependency,
    # could be inspired from: https://stackoverflow.com/q/14465154/8440675

    # Check if UNIX sort version supports parallelism
    parallel_ok = True
    sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE)
    sort_ver = (sort_ver.communicate()[0].decode().split("\n")[0].split(" ")
                [-1].split("."))
    # If so, specify threads, otherwise don't mention it in the command line
    try:
        sort_ver = list(map(int, sort_ver))
        if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23):
            logger.warning(
                "GNU sort version is {0} but >8.23 is required for parallel "
                "sort. Sorting on a single thread.".format(".".join(
                    map(str, sort_ver))))
            parallel_ok = False
    # BSD sort has a different format and will throw error upon parsing. It does
    # not support parallel processes anyway.
    except ValueError:
        logger.warning(
            "Using BSD sort instead of GNU sort, sorting on a single thread.")
        parallel_ok = False

    key_map = {
        "readID": "-k1,1d",
        "chr1": "-k2,2V",
        "pos1": "-k3,3n",
        "chr2": "-k4,4V",
        "pos2": "-k5,5n",
        "strand1": "-k6,6d",
        "strand2": "-k7,7d",
        "frag1": "-k8,8n",
        "frag2": "-k9,9n",
    }

    # transform column names to corresponding sort keys
    try:
        sort_keys = map(lambda k: key_map[k], keys)
    except KeyError:
        print("Unknown column name.")
        raise
    # Rewrite header with new sorting order
    header = get_pairs_header(in_file)
    with open(out_file, "w") as output:
        for line in header:
            if line.startswith("#sorted"):
                output.write("#sorted: {0}\n".format("-".join(keys)))
            else:
                output.write(line + "\n")

    # Sort pairs and append to file.
    with open(out_file, "a") as output:
        grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE)
        sort_cmd = ["sort", "-S %s" % buffer] + list(sort_keys)
        if tmp_dir is not None:
            sort_cmd.append("--temporary-directory={0}".format(tmp_dir))
        if parallel_ok:
            sort_cmd.append("--parallel={0}".format(threads))
        sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output)
        sort_proc.communicate()
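A usage sketch for the sort wrapper above, assuming the function and get_pairs_header are in scope and that grep and sort are available on PATH; file names are hypothetical:

# Sort a pairs file by genomic coordinates on 4 threads with a 4G sort buffer
sort_pairs(
    "valid_idx.pairs",
    "valid_idx.sorted.pairs",
    keys=["chr1", "pos1", "chr2", "pos2"],
    tmp_dir="./tmp",
    threads=4,
    buffer="4G",
)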
Example No. 10
def attribute_fragments(pairs_file, idx_pairs_file, restriction_table):
    """
    Writes the indexed pairs file, which has two more columns than the input
    pairs file, corresponding to the restriction fragment index of each read.
    Note that pairs files use 1-based point positions whereas the restriction
    table uses 0-based point positions.

    Parameters
    ----------
    pairs_file: str
        Path to the input pairs file. Consists of 7 white-space separated
        columns: readID, chr1, pos1, chr2, pos2, strand1, strand2
    idx_pairs_file: str
        Path to the output indexed pairs file. Consists of 9 white space
        separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2,
        frag1, frag2. frag1 and frag2 are 0-based restriction fragments based
        on whole genome.
    restriction_table: dict
        Dictionary with chromosome identifiers (str) as keys and list of
        positions (int) of restriction sites as values.
    """

    # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow
    # 1. could be reduced by searching groups of N frags in parallel and 2. by
    # writing N frags simultaneously using a single call of writerows.

    # Parse and update header section
    pairs_header = hio.get_pairs_header(pairs_file)
    header_size = len(pairs_header)
    chrom_order = []
    with open(idx_pairs_file, "w") as idx_pairs:
        for line in pairs_header:
            # Add new column names to header
            if line.startswith("#columns"):
                line = line.rstrip() + " frag1 frag2"
            if line.startswith("#chromsize"):
                chrom_order.append(line.split()[1])
            idx_pairs.write(line + "\n")

    # Get number of fragments per chrom to allow genome-based indices
    shift_frags = {}
    prev_frags = 0
    for rank, chrom in enumerate(chrom_order):
        if rank > 0:
            # Note the "-1" because there are nfrags + 1 sites in rest table
            prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1
        # Idx of each chrom's frags will be shifted by n frags in previous chroms
        shift_frags[chrom] = prev_frags

    missing_contigs = set()
    # Attribute pairs to fragments and append them to output file (after header)
    with open(pairs_file, "r") as pairs, open(idx_pairs_file,
                                              "a") as idx_pairs:
        # Skip header lines
        for _ in range(header_size):
            next(pairs)

        # Define input and output fields
        pairs_cols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
        ]
        idx_cols = pairs_cols + ["frag1", "frag2"]

        # Use csv reader / writer to automatically parse columns into a dict
        pairs_reader = csv.DictReader(pairs,
                                      fieldnames=pairs_cols,
                                      delimiter="\t")
        pairs_writer = csv.DictWriter(idx_pairs,
                                      fieldnames=idx_cols,
                                      delimiter="\t")

        for pair in pairs_reader:
            # Get the 0-based indices of corresponding restriction fragments
            # Deducing 1 from pair position to get it into 0bp point
            pair["frag1"] = find_frag(
                int(pair["pos1"]) - 1, restriction_table[pair["chr1"]])
            pair["frag2"] = find_frag(
                int(pair["pos2"]) - 1, restriction_table[pair["chr2"]])
            # Shift fragment indices to make them genome-based instead of
            # chromosome-based
            try:
                pair["frag1"] += shift_frags[pair["chr1"]]
            except KeyError:
                missing_contigs.add(pair["chr1"])
            try:
                pair["frag2"] += shift_frags[pair["chr2"]]
            except KeyError:
                missing_contigs.add(pair["chr2"])

            # Write indexed pairs in the new file
            pairs_writer.writerow(pair)

        if missing_contigs:
            logger.warning(
                "Pairs on the following contigs were discarded as "
                "those contigs are not listed in the paris file header. "
                "This is normal if you filtered out small contigs: %s" %
                " ".join(list(missing_contigs)))
Example No. 11
def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
    """
    Make a .pairs file from two Hi-C bam files sorted by read names.
    The Hi-C mates are matched by read identifier. Pairs where at least one
    read maps with MAPQ below the min_qual threshold are discarded. Pairs are
    sorted by readID and stored in upper triangle (first pair higher).

    Parameters
    ----------
    bam1 : str
        Path to the name-sorted BAM file with aligned Hi-C forward reads.
    bam2 : str
        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
    out_pairs : str
        Path to the output tab-separated .pairs file with columns:
        readID, chr1, pos1, chr2, pos2, strand1, strand2
    info_contigs : str
        Path to the info contigs file, to get info on chromosome sizes and order.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    """
    forward = ps.AlignmentFile(bam1, "rb")
    reverse = ps.AlignmentFile(bam2, "rb")

    # Generate header lines
    format_version = "## pairs format v1.0\n"
    sorting = "#sorted: readID\n"
    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
    # Chromosome order will be identical in info_contigs and pair files
    chroms = pd.read_csv(info_contigs,
                         sep="\t").apply(lambda x: "#chromsize: %s %d\n" %
                                         (x.contig, x.length),
                                         axis=1)
    with open(out_pairs, "w") as pairs:
        pairs.writelines([format_version, sorting, cols] + chroms.tolist())
        pairs_writer = csv.writer(pairs, delimiter="\t")
        n_reads = {"total": 0, "mapped": 0}
        # Remember if some read IDs were missing from either file
        unmatched_reads = 0
        # Remember if all reads in one bam file have been read
        exhausted = [False, False]
        # Iterate on both BAM simultaneously
        for end1, end2 in itertools.zip_longest(forward, reverse):
            # Both file still have reads
            # Check if reads pass filter
            try:
                end1_passed = end1.mapping_quality >= min_qual
            # Happens if end1 bam file has been exhausted
            except AttributeError:
                exhausted[0] = True
                end1_passed = False
            try:
                end2_passed = end2.mapping_quality >= min_qual
            # Happens if end2 bam file has been exhausted
            except AttributeError:
                exhausted[1] = True
                end2_passed = False
            # Skip read if mate is not present until they match or reads
            # have been exhausted
            while sum(exhausted) == 0 and end1.query_name != end2.query_name:
                # Get next read and check filters again
                # Count single-read iteration
                unmatched_reads += 1
                n_reads["total"] += 1
                if end1.query_name < end2.query_name:
                    try:
                        end1 = next(forward)
                        end1_passed = end1.mapping_quality >= min_qual
                    # If EOF is reached in BAM 1
                    except (StopIteration, AttributeError):
                        exhausted[0] = True
                        end1_passed = False
                    n_reads["mapped"] += end1_passed
                elif end1.query_name > end2.query_name:
                    try:
                        end2 = next(reverse)
                        end2_passed = end2.mapping_quality >= min_qual
                    # If EOF is reached in BAM 2
                    except (StopIteration, AttributeError):
                        exhausted[1] = True
                        end2_passed = False
                    n_reads["mapped"] += end2_passed

            # 2 reads processed per iteration, unless one file is exhausted
            n_reads["total"] += 2 - sum(exhausted)
            n_reads["mapped"] += sum([end1_passed, end2_passed])
            # Keep only pairs where both reads have good quality
            if end1_passed and end2_passed:

                # Flipping to get upper triangle
                if (end1.reference_id == end2.reference_id
                        and end1.reference_start > end2.reference_start
                    ) or end1.reference_id > end2.reference_id:
                    end1, end2 = end2, end1
                pairs_writer.writerow([
                    end1.query_name,
                    end1.reference_name,
                    end1.reference_start + 1,
                    end2.reference_name,
                    end2.reference_start + 1,
                    "-" if end1.is_reverse else "+",
                    "-" if end2.is_reverse else "+",
                ])
    pairs.close()
    if unmatched_reads > 0:
        logger.warning(
            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
            unmatched_reads,
        )
    logger.info(
        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})"
        .format(
            total=n_reads["total"],
            mapped=n_reads["mapped"],
            perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
            qual=min_qual,
        ))
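A usage sketch, assuming the function above is in scope; the name-sorted BAM files and the info_contigs path are hypothetical:

# Build a .pairs file from two name-sorted BAM files, keeping pairs where both
# mates map with MAPQ >= 30
bam2pairs(
    "for.bam",
    "rev.bam",
    out_pairs="valid.pairs",
    info_contigs="info_contigs.txt",
    min_qual=30,
)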
Example No. 12
def iterative_align(
    fq_in,
    tmp_dir,
    ref,
    n_cpu,
    bam_out,
    aligner="bowtie2",
    min_len=20,
    min_qual=30,
    read_len=None,
):
    """Iterative alignment

    Aligns reads iteratively reads of fq_in with bowtie2, minimap2 or bwa. Reads are
    truncated to the 20 first nucleotides and unmapped reads are extended by 20
    nucleotides and realigned on each iteration.

    Parameters
    ----------
    fq_in : str
        Path to input fastq file to align iteratively.
    tmp_dir : str
        Path where temporary files should be written.
    ref : str
        Path to the reference genome if minimap2 is used for alignment, or to
        the genome index if bowtie2/bwa is used.
    n_cpu : int
        The number of CPUs to use for the iterative alignment.
    bam_out : str
        Path where the final alignment should be written in BAM format.
    aligner : str
        Choose between minimap2, bwa or bowtie2 for the alignment.
    min_len : int
        The initial length of the fragments to align.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    read_len : int
        Read length in the fastq file. If set to None, the length of the first read
        is used. Set this value to the longest read length in the file if you have
        different read lengths.
        
    Examples
    --------
    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2")
    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2")
    """
    # set with the name of the unaligned reads :
    remaining_reads = set()
    total_reads = 0
    # Store path of SAM containing aligned reads at each iteration.
    iter_out = []

    # If there is already a file with the same name as the output file,
    # remove it. Otherwise, ignore.
    with contextlib.suppress(FileNotFoundError):
        try:
            os.remove(bam_out)
        except IsADirectoryError:
            logger.error("You need to give the BAM output file, not a folder.")
            raise

    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
    if aligner == "bowtie2" and hio.is_compressed(fq_in):
        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
        with hio.read_compressed(fq_in) as inf:
            with open(uncomp_path, "w") as uncomp:
                st.copyfileobj(inf, uncomp)
    else:
        uncomp_path = fq_in

    # throw error if index does not exist
    index = hio.check_fasta_index(ref, mode=aligner)
    if index is None:
        logger.error(
            "Reference index is missing, please build the {} index "
            "first.".format(aligner))
        sys.exit(1)
    # Counting reads
    with hio.read_compressed(uncomp_path) as inf:
        for _ in inf:
            total_reads += 1
    total_reads /= 4

    # Use first read to guess read length if not provided.
    if read_len is None:
        with hio.read_compressed(uncomp_path) as inf:
            # Skip the first line (read header)
            _ = inf.readline()
            # Stripping newline from sequence line.
            read_len = len(inf.readline().rstrip())

    # initial length of the fragments to align
    # In case reads are shorter than provided min_len
    if read_len > min_len:
        n = min_len
    else:
        logger.warning(
            "min_len is longer than the reads. Iterative mapping will have no effect."
        )
        n = read_len
    logger.info("{0} reads to parse".format(int(total_reads)))

    first_round = True
    # iterative alignment per se
    while n <= read_len:
        logger.info(
            "Truncating unaligned reads to {size}bp and mapping{again}.".
            format(size=int(n), again="" if first_round else " again"))
        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
        # Generate a temporary input fastq file with the n first nucleotids
        # of the reads.
        truncated_reads = truncate_reads(tmp_dir, uncomp_path, remaining_reads,
                                         n, first_round)

        # Align the truncated reads on reference genome
        temp_alignment = join(tmp_dir, "temp_alignment.bam")
        map_args = {
            "fa": ref,
            "cpus": n_cpu,
            "fq": truncated_reads,
            "idx": index,
            "bam": temp_alignment,
        }
        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = ("bowtie2 -x {idx} -p {cpus}"
                   " --quiet --very-sensitive {fq}").format(**map_args)
        else:
            raise ValueError(
                "Unknown aligner. Select bowtie2, minimap2 or bwa.")

        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
        sort_process = sp.Popen(
            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
            shell=True,
            stdin=map_process.stdout,
        )
        out, err = sort_process.communicate()

        # filter the reads: the reads whose truncated end was aligned are written
        # to the output file.
        # The reads whose truncated end was not aligned are kept for the next round.
        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1],
                                         min_qual)

        n += 20
        first_round = False

    # one last round without trimming
    logger.info("Trying to map unaligned reads at full length ({0}bp).".format(
        int(read_len)))

    truncated_reads = truncate_reads(
        tmp_dir,
        infile=uncomp_path,
        unaligned_set=remaining_reads,
        trunc_len=n,
        first_round=first_round,
    )
    if aligner == "minimap2" or aligner == "Minimap2":
        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
            fa=ref, cpus=n_cpu, fq=truncated_reads)
    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(idx=index,
                                                         cpus=n_cpu,
                                                         fq=truncated_reads)
    else:
        cmd = ("bowtie2 -x {idx} -p {cpus} --quiet "
               "--very-sensitive {fq}").format(idx=index,
                                               cpus=n_cpu,
                                               fq=truncated_reads)
    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    # Keep reads sorted by name
    sort_process = sp.Popen(
        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
            cpus=n_cpu, bam=temp_alignment),
        shell=True,
        stdin=map_process.stdout,
    )
    out, err = sort_process.communicate()
    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)

    # Report unaligned reads as well
    iter_out += [join(tmp_dir, "unaligned.bam")]
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
    for r in temp_bam:
        # Do not write supplementary alignments (keeping 1 alignment/read)
        if r.query_name in remaining_reads and not r.is_supplementary:
            unmapped.write(r)
    unmapped.close()
    temp_bam.close()

    # Merge all aligned reads and unmapped reads into a single bam
    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)
    logger.info("{0} reads aligned / {1} total reads.".format(
        int(total_reads - len(remaining_reads)), int(total_reads)))

    return 0
Example No. 13
def full_pipeline(
    genome,
    input1,
    input2=None,
    aligner="bowtie2",
    centromeres=None,
    circular=False,
    distance_law=False,
    enzyme=5000,
    filter_events=False,
    force=False,
    mapping="normal",
    mat_fmt="graal",
    min_qual=30,
    min_size=0,
    no_cleanup=False,
    out_dir=None,
    pcr_duplicates=False,
    plot=False,
    prefix=None,
    read_len=None,
    remove_centros=None,
    start_stage="fastq",
    threads=1,
    tmp_dir=None,
):
    """
    Run the whole hicstuff pipeline, from fastq files and a genome to a
    contact matrix.

    Parameters
    ----------
    genome : str
        Path to the bowtie2/bwa index prefix if using bowtie2/bwa or to the genome 
        in fasta format if using minimap2.
    input1 : str
        Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
        in BAM format, or the pairs file, depending on the value of start_stage.
    input2 : str
        Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
        in BAM format, or None, depending on the value of start_stage.
    enzyme : int or str
        Name of the restriction enzyme used for the digestion (several enzymes
        can be given, comma-separated), or an integer to use fixed-size chunks
        of that many base pairs instead of a digestion.
    circular : bool
        Use if the genome is circular.
    out_dir : str or None
        Path where output files should be written. Current directory by default.
    tmp_dir : str or None
        Path where temporary files will be written. Creates a "tmp" folder in
        out_dir by default.
    plot : bool
        Whether plots should be generated at different steps of the pipeline.
        Plots are saved in a "plots" directory inside out_dir.
    min_qual : int
        Minimum mapping quality required to keep a pair of Hi-C reads.
    min_size : int
        Minimum contig size required to keep it.
    threads : int
        Number of threads to use for parallel operations.
    no_cleanup : bool
        Whether temporary files should be deleted at the end of the pipeline.
    mapping : str
        normal|iterative|cutsite. Use normal, iterative or cutsite mapping.
        "normal": normal alignment. "iterative": truncates and extends reads
        until unambiguous alignment. "cutsite": digests reads at religation
        sites and builds new pairs from the fragments created.
    filter_events : bool
        Filter spurious or uninformative 3C events. Requires a restriction enzyme.
    force : bool
        If True, overwrite existing files with the same name as output.
    prefix : str or None
        Choose a common name for output files instead of default graal names.
    start_stage : str
        Step at which the pipeline should start. Can be "fastq", "bam", "pairs"
        or "pairs_idx". Starting from "bam" allows skipping alignment and
        starting from name-sorted bam files. With "pairs", a single pairs file
        is given as input, and with "pairs_idx", the pairs in the input must
        already be attributed to fragments and fragment attribution is skipped.
    mat_fmt : str
        Select the output matrix format. Can be either "bg2" for the
        bedgraph2 format, "cool" for Mirnylab's cool format, or graal for a
        plain text COO format compatible with Koszullab's instagraal software.
    aligner : str
        Read alignment software to use. Can be either "minimap2", "bwa" or "bowtie2".
    pcr_duplicates : bool
        If True, PCR duplicates will be filtered based on genomic positions.
        Pairs where both reads have exactly the same coordinates are considered
        duplicates and only one of those will be conserved.
    distance_law : bool
        If True, generates a distance law file with the contact probabilities
        at different genomic distances for each chromosome, or for each arm if
        the file with centromere positions has been given. The values are
        neither normalized nor averaged.
    centromeres : None or str
        If not None, path to a file with the positions of the centromeres,
        space-separated and in the same order as the chromosomes.
    read_len : int
        Maximum read length to expect in the fastq file. Optionally used in iterative
        alignment mode. Estimated from the first read by default. Useful if input fastq
        is a composite of different read lengths.
    remove_centros : None or int
        If the distance law is computed, this is the number of kb that will be removed
        around the centromere position given by in the centromere file.
    """
    # Check if third parties can be run
    if aligner in ("bowtie2", "minimap2", "bwa"):
        if not check_tool(aligner):
            logger.error("%s is not installed or not on PATH", aligner)
            raise ImportError(f"{aligner} is required.")
    else:
        logger.error("Incompatible aligner software, choose bowtie2, minimap2 or bwa.")
        raise ValueError("aligner should be either bowtie2, minimap2 or bwa.")
    if not check_tool("samtools"):
        logger.error("samtools is not installed or not on PATH")
        raise ImportError("samtools is required.")
    if mat_fmt == 'cool':
        try:
            import cooler
        except ImportError:
            logger.error(
                "The cooler package is require to return matrix in cool format, please install it first."
            )
            raise ImportError("The cooler package is required.")

    # Pipeline can start from 3 input types
    start_time = datetime.now()
    stages = {"fastq": 0, "bam": 1, "pairs": 2, "pairs_idx": 3}
    start_stage = stages[start_stage]

    # Check if the number of input files is correct
    if start_stage <= 1:
        if input2 is None:
            logger.error(
                "You must provide 2 input files when --start-stage is fastq " "or bam."
            )
            sys.exit(1)
    else:
        if input2 is not None:
            logger.error(
                "You must provide a single input file when --start-stage is "
                "pairs or pairs_idx."
            )
            sys.exit(1)
    # sanitize enzyme
    enzyme = str(enzyme)
    # Remember whether fragments_file has been generated during this run
    fragments_updated = False

    if out_dir is None:
        out_dir = os.getcwd()

    if tmp_dir is None:
        tmp_dir = join(out_dir, "tmp")

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Define figures output paths
    if plot:
        fig_dir = join(out_dir, "plots")
        os.makedirs(fig_dir, exist_ok=True)
        if prefix:
            frag_plot = join(fig_dir, prefix + "_frags_hist.pdf")
            dist_plot = join(fig_dir, prefix + "_event_distance.pdf")
            pie_plot = join(fig_dir, prefix + "_event_distribution.pdf")
            distance_law_plot = join(fig_dir, prefix + "_distance_law.pdf")
        else:
            frag_plot = join(fig_dir, "frags_hist.pdf")
            dist_plot = join(fig_dir, "event_distance.pdf")
            pie_plot = join(fig_dir, "event_distribution.pdf")
            distance_law_plot = join(fig_dir, "distance_law.pdf")
        matplotlib.use("Agg")
    else:
        fig_dir = None
        dist_plot = pie_plot = frag_plot = None

    # Use current time for logging and to identify files
    now = time.strftime("%Y%m%d%H%M%S")

    def _tmp_file(fname):
        if prefix:
            fname = prefix + "." + fname
        full_path = join(tmp_dir, fname)
        if not force and os.path.exists(full_path):
            raise IOError(
                "Temporary file {} already exists. Use --force to overwrite".format(
                    full_path
                )
            )
        return full_path

    def _out_file(fname):
        if prefix:
            fname = prefix + "." + fname
        full_path = join(out_dir, fname)
        if not force and os.path.exists(full_path):
            raise IOError(
                "Output file {} already exists. Use --force to overwrite".format(
                    full_path
                )
            )

        return full_path

    # Define temporary file names
    log_file = _out_file("hicstuff_" + now + ".log")
    tmp_genome = _tmp_file("genome.fa.gz")
    bam1 = _tmp_file("for.bam")
    bam2 = _tmp_file("rev.bam")
    pairs = _tmp_file("valid.pairs")
    pairs_idx = _tmp_file("valid_idx.pairs")
    pairs_filtered = _tmp_file("valid_idx_filtered.pairs")
    pairs_pcr = _tmp_file("valid_idx_pcrfree.pairs")

    # Enable file logging
    hcl.set_file_handler(log_file)
    generate_log_header(log_file, input1, input2, genome, enzyme)

    # If the user chose bowtie2 and supplied an index, extract fasta from it
    # For later steps of the pipeline (digestion / frag attribution)
    # Check if the genome is an index or fasta file
    idx = hio.check_fasta_index(genome, mode=aligner)
    is_fasta = hio.check_is_fasta(genome)
    
    # Different aligners accept different files. Make sure the input format is good.
    # Note bowtie2 can extract fasta from the index, but bwa cannot
    sane_input = {
            'bowtie2': is_fasta or idx,
            'minimap2': is_fasta, 
            'bwa': is_fasta
    }

    if not sane_input[aligner]:
        logger.error("You must provide either a fasta or bowtie2 index prefix as genome")
        sys.exit(1)

    # Just use the input genome if it is indexed
    if is_fasta and idx:
        fasta = genome
    # Otherwise copy it in tmpdir (in compressed format) for indexing, unless the input is a
    # bt2 index, in which case fasta will be extracted later from it.
    else:
        if is_fasta:
            with hio.read_compressed(genome, 'rb') as src, gzip.open(tmp_genome, 'wb') as dst:
                dst.writelines(src)
            genome = tmp_genome
        fasta = tmp_genome
        

    # Bowtie2-specific feature: extract fasta from the index
    if aligner == 'bowtie2' and not is_fasta:
        # Index is present, extract fasta file from it and compress it
        bt2fa = sp.Popen(
            ["bowtie2-inspect", genome],
            stdout=sp.PIPE,
            stderr=sp.PIPE,
        )
        _ = sp.run(['gzip', '-c'], stdin=bt2fa.stdout, stdout=open(tmp_genome, "w"))
        _, bt2err = bt2fa.communicate()
        # bowtie2-inspect still has return code 0 when crashing, need to
        # actively look for error in stderr
        if re.search(r"[Ee]rror", bt2err.decode()):

            logger.error(bt2err)
            logger.error(
                "bowtie2-inspect has failed, make sure you provided "
                "the path to the bowtie2 index without the extension."
            )
            sys.exit(1)

    # Build index with bowtie2 / bwa if required
    if idx is None and aligner in ['bowtie2', 'bwa']:
        if aligner == 'bowtie2':
            index_cmd = ["bowtie2-build", '-q', fasta, fasta]
        elif aligner == 'bwa':
            index_cmd = ['bwa', 'index', fasta]
        # We only need the index if the user provided fastq input
        if start_stage == 0:
            # If no index present assume input is fasta, copy it in tmp and
            # index it (to avoid conflict between instances)
            logger.info(
                "%s index not found at %s, generating "
                "a local temporary index.", aligner, genome
            )
            sp.run(index_cmd, stderr=sp.PIPE)

    # Check for spaces in fasta headers and issue error if found
    for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
        if " " in record.id:
            logger.error(
                "Sequence identifiers contain spaces. Please clean the input genome."
            )
    # Define output file names (tsv files)
    if prefix:
        fragments_list = _out_file("frags.tsv")
        info_contigs = _out_file("chr.tsv")
        mat = _out_file("mat.tsv")
        # If matrix has a different format, give it the right extension
        if mat_fmt != "graal":
            mat = _out_file(mat_fmt)
    else:
        # Default graal file names
        fragments_list = _out_file("fragments_list.txt")
        info_contigs = _out_file("info_contigs.txt")
        mat = _out_file("abs_fragments_contacts_weighted.txt")
        if mat_fmt != "graal":
            mat = _out_file("abs_fragments_contacts_weighted." + mat_fmt)
    # Define what input files are given
    if start_stage == 0:
        reads1, reads2 = input1, input2
    elif start_stage == 1:
        bam1, bam2 = input1, input2
    elif start_stage == 2:
        pairs = input1
    elif start_stage == 3:
        pairs_idx = input1
 
    # Perform genome alignment
    if start_stage == 0:
        
        # Define mapping choice (default normal):
        if mapping == "normal":
            iterative = False
        elif mapping == "iterative":
            iterative = True   
        elif mapping == "cutsite":
            # If no enzyme given use iterative alignment.
            try:
                int(enzyme)
                logger.warning("No enzyme has been given. Can't map using cutsite, iterative mapping will be used instead.")
                iterative = True
            # If cutsite enabled and enzyme given, cut the reads before making a 
            # normal alignment.
            except ValueError:
                iterative = False
                digest_for = _tmp_file("digest_for.fq.gz")
                digest_rev = _tmp_file("digest_rev.fq.gz")
                hcc.cut_ligation_sites(
                    fq_for=reads1,
                    fq_rev=reads2,
                    digest_for=digest_for,
                    digest_rev=digest_rev,
                    enzyme=enzyme,
                    mode="for_vs_rev",
                    seed_size=20,
                    n_cpu=threads,
                )
                reads1, reads2 = digest_for, digest_rev
        else:
            logger.error("mapping must be either normal, iterative or cutsite.")
            raise ValueError
        
        align_reads(
            reads1,
            genome,
            bam1,
            tmp_dir=tmp_dir,
            threads=threads,
            aligner=aligner,
            iterative=iterative,
            min_qual=min_qual,
            read_len=read_len,
        )
        align_reads(
            reads2,
            genome,
            bam2,
            tmp_dir=tmp_dir,
            threads=threads,
            aligner=aligner,
            iterative=iterative,
            min_qual=min_qual,
            read_len=read_len,
        )

    # Detect if multiple enzymes are given
    if re.search(",", enzyme):
        enzyme = enzyme.split(",")
        
    # Starting from bam files
    if start_stage <= 1:

        fragments_updated = True
        # Generate info_contigs and fragments_list output files
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

        # Log fragment size distribution
        hcd.frag_len(frags_file_name=fragments_list, plot=plot, fig_path=frag_plot)

        # Make pairs file (readID, chr1, chr2, pos1, pos2, strand1, strand2)
        bam2pairs(bam1, bam2, pairs, info_contigs, min_qual=min_qual)

    # Starting from pairs file
    if start_stage <= 2:
        restrict_table = {}
        for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
            # Get chromosome restriction table
            restrict_table[record.id] = hcd.get_restriction_table(
                record.seq, enzyme, circular=circular
            )

        # Add fragment index to pairs (readID, chr1, pos1, chr2,
        # pos2, strand1, strand2, frag1, frag2)
        hcd.attribute_fragments(pairs, pairs_idx, restrict_table)

    # Sort pairs file by coordinates for next steps
    hio.sort_pairs(
        pairs_idx,
        pairs_idx + ".sorted",
        keys=["chr1", "pos1", "chr2", "pos2"],
        threads=threads,
        tmp_dir=tmp_dir,
    )
    os.rename(pairs_idx + ".sorted", pairs_idx)

    if filter_events:
        uncut_thr, loop_thr = hcf.get_thresholds(
            pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix
        )
        hcf.filter_events(
            pairs_idx,
            pairs_filtered,
            uncut_thr,
            loop_thr,
            plot_events=plot,
            fig_path=pie_plot,
            prefix=prefix,
        )
        use_pairs = pairs_filtered
    else:
        use_pairs = pairs_idx

    # Generate fragments file if it has not been already
    if not fragments_updated:
        hcd.write_frag_info(
            fasta,
            enzyme,
            min_size=min_size,
            circular=circular,
            output_contigs=info_contigs,
            output_frags=fragments_list,
        )

    # Generate distance law table if enabled
    if distance_law:
        out_distance_law = _out_file("distance_law.txt")
        if remove_centros is None:
            remove_centros = 0
        remove_centros = int(remove_centros)
        x_s, p_s, _ = hcdl.get_distance_law(
            pairs_idx,
            fragments_list,
            centro_file=centromeres,
            base=1.1,
            out_file=out_distance_law,
            circular=circular,
            rm_centro=remove_centros,
        )
        # Generate the distance law figure if plots are enabled
        if plot:
            # Retrieve chrom labels from distance law file
            _, _, chr_labels = hcdl.import_distance_law(out_distance_law)
            chr_labels = [lab[0] for lab in chr_labels]
            chr_labels_idx = np.unique(chr_labels, return_index=True)[1]
            chr_labels = [chr_labels[index] for index in sorted(chr_labels_idx)]
            p_s = hcdl.normalize_distance_law(x_s, p_s)
            hcdl.plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot)

    # Filter out PCR duplicates if requested
    if pcr_duplicates:
        filter_pcr_dup(use_pairs, pairs_pcr)
        use_pairs = pairs_pcr

    # Build matrix from pairs.
    if mat_fmt == "cool":
        # Name matrix file in .cool
        cool_file = os.path.splitext(mat)[0] + ".cool"
        pairs2cool(use_pairs, cool_file, fragments_list)
    else:
        pairs2matrix(
            use_pairs,
            mat,
            fragments_list,
            mat_fmt=mat_fmt,
            threads=threads,
            tmp_dir=tmp_dir,
        )

    # Clean temporary files
    if not no_cleanup:
        tempfiles = [
            pairs,
            pairs_idx,
            pairs_filtered,
            bam1,
            bam2,
            pairs_pcr,
            tmp_genome,
        ]
        # Do not delete files that were given as input
        try:
            tempfiles.remove(input1)
            tempfiles.remove(input2)
        except ValueError:
            pass
        for file in tempfiles:
            try:
                os.remove(file)
            except FileNotFoundError:
                pass

    end_time = datetime.now()
    duration = relativedelta(end_time, start_time)
    logger.info(
        "Contact map generated after {h}h {m}m {s}s".format(
            h=duration.hours, m=duration.minutes, s=duration.seconds
        )
    )
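A hedged end-to-end invocation of the pipeline above, assuming the function is in scope and bowtie2/samtools are installed; file names, enzyme and thread count are example values:

# From paired fastq files and a fasta genome to a cool contact matrix,
# filtering spurious 3C events and writing outputs under hicstuff_out/
full_pipeline(
    genome="genome.fa",
    input1="sample_R1.fastq.gz",
    input2="sample_R2.fastq.gz",
    aligner="bowtie2",
    enzyme="DpnII",
    filter_events=True,
    mat_fmt="cool",
    out_dir="hicstuff_out",
    threads=8,
)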