Exemplo n.º 1
0
def test_diag_trim(matrix):
    """Trimming at every distance keeps the shape and empties farther diagonals."""
    n_rows = matrix.shape[0]
    for max_dist in range(n_rows):
        result = preproc.diag_trim(matrix.tocsr(), max_dist)
        # Total of each upper diagonal of the trimmed matrix, indexed by offset
        per_diag_totals = []
        for offset in range(result.shape[0]):
            per_diag_totals.append(result.diagonal(offset).sum())
        # Trimming must never change the matrix dimensions
        assert result.shape == matrix.shape
        # Diagonals at offsets greater than max_dist must hold no signal
        assert np.sum(per_diag_totals[max_dist + 1:]) == 0
Exemplo n.º 2
0
 def test_make_missing_mask(self):
     """Check that missing-bin masks match hand-built masks for each matrix type."""
     max_dist = 3
     missing_bins = np.array([0, 4, 9])
     valid_bins = np.array([b for b in range(10) if b not in missing_bins])
     valid_cols = np.array([b for b in range(15) if b not in missing_bins])

     def build_expected(shape):
         """Boolean mask of `shape` with the missing rows and columns flagged."""
         expected = np.zeros(shape, dtype=bool)
         expected[:, missing_bins] = True
         expected[missing_bins, :] = True
         return expected

     # Expected masks: whole symmetric matrix, whole asymmetric matrix,
     # upper triangle only, and upper triangle truncated by distance
     exp_mask_sym = build_expected((10, 10))
     exp_mask_asym = build_expected((10, 15))
     exp_mask_sym_upper = np.triu(exp_mask_sym)
     exp_mask_sym_upper_maxdist = preproc.diag_trim(
         exp_mask_sym_upper, max_dist + 1
     )
     sym_shape = exp_mask_sym.shape

     # Whole symmetric matrix: the correct bins must be masked
     obs_mask_sym = preproc.make_missing_mask(
         sym_shape, valid_bins, valid_bins, sym_upper=False
     )
     assert np.all(obs_mask_sym == exp_mask_sym)

     # Upper symmetric matrix: only the upper triangle must be masked
     obs_mask_sym_upper = preproc.make_missing_mask(
         sym_shape, valid_bins, valid_bins, sym_upper=True
     )
     assert np.all(obs_mask_sym_upper == exp_mask_sym_upper)

     # Asymmetric matrix: rows and columns use distinct valid bins
     obs_mask_asym = preproc.make_missing_mask(
         exp_mask_asym.shape, valid_bins, valid_cols
     )
     assert np.all(obs_mask_asym == exp_mask_asym)

     # Requesting sym_upper on an asymmetric shape must be rejected
     with self.assertRaises(ValueError):
         preproc.make_missing_mask(
             obs_mask_asym.shape, valid_bins, valid_bins, sym_upper=True
         )

     # max_dist must give the same mask as manually truncating diagonals
     obs_mask_sym_upper_maxdist = preproc.make_missing_mask(
         sym_shape,
         valid_bins,
         valid_bins,
         sym_upper=True,
         max_dist=max_dist,
     )
     assert np.all(obs_mask_sym_upper_maxdist == exp_mask_sym_upper_maxdist)
Exemplo n.º 3
0
def pattern_detector(
    contact_map,
    kernel_config,
    kernel_matrix,
    coords=None,
    dump=None,
    full=False,
    tsvd=None,
):
    """
    Detect patterns in a contact map by kernel matching, and extract windows
    around the detected patterns. If coordinates are provided, detection is
    skipped and windows are extracted around those coordinates.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config. The keys read here are
        "max_perc_undetected", "max_perc_zero", "pearson" and "max_dist".
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    coords : numpy.array of ints or None
        A table with coordinates of patterns, with one pattern per row
        and 2 columns being the row and column number of the pattern in
        the input contact map. If this is provided, detection is skipped
        and quantification is performed on those coordinates. The input
        array is not modified: a copy is used internally.
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated
    full : bool
        If True, a mask of missing bins is built and given to normxcorr2,
        and the contact and correlation maps are zero-padded by half the
        kernel size before pattern validation. Coordinates are shifted
        accordingly and shifted back before being returned.
    tsvd : float or None
        Passed through to normxcorr2. If a float between 0 and 1 is given,
        the input kernel is factorised using truncated SVD, keeping enough
        singular vectors to retain this proportion of information.
        Factorisation speeds up convolution at the cost of a loss of
        information. Disabled by default (None).

    Returns
    -------
    filtered_coords : pandas.DataFrame or None
        The table of patterns returned by validate_patterns (it has at least
        the bin1 and bin2 columns), with an additional "pvalue" column.
    filtered_windows : numpy array or None
        The pile of windows around patterns, as returned by
        validate_patterns.

    Both values are None if the contact map is not larger than the kernel,
    or if no pattern was found in detection mode.
    """
    km, kn = kernel_matrix.shape
    # Half-sizes of the kernel, used as padding margins in full mode
    kh, kw = (km - 1) // 2, (kn - 1) // 2

    def save_dump(base, mat):
        """Save a sparse matrix in the dump folder, prefixed by the map name."""
        sp.save_npz(
            pathlib.Path(dump) / f"{contact_map.name}_{base}", mat
        )

    # Define type of analysis.
    run_mode = "detect" if coords is None else "quantify"

    # Do not attempt pattern detection unless matrix is larger than the kernel
    if min(contact_map.matrix.shape) <= max(kernel_matrix.shape):
        return None, None

    # If full is specified, missing bins are accounted for using a mask
    missing_mask = None
    if full:
        missing_mask = preproc.make_missing_mask(
            contact_map.matrix.shape,
            valid_rows=contact_map.detectable_bins[0],
            valid_cols=contact_map.detectable_bins[1],
            max_dist=contact_map.max_dist,
            sym_upper=not contact_map.inter,
        )

    # Pattern matching operates here
    mat_conv, mat_log10_pvals = normxcorr2(
        contact_map.matrix.tocsr(),
        kernel_matrix,
        max_dist=contact_map.max_dist,
        sym_upper=not contact_map.inter,
        full=full,
        missing_mask=missing_mask,
        tsvd=tsvd,
        pval=True,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
    )
    if dump:
        save_dump("03_normxcorr2", mat_conv)
    # Clean potential missing values
    mat_conv.data[np.isnan(mat_conv.data)] = 0
    # Only keep corrcoefs in scannable range
    if not contact_map.inter:
        mat_conv = preproc.diag_trim(mat_conv.tocsr(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", mat_conv)
    mat_conv = mat_conv.tocoo()
    mat_conv.eliminate_zeros()

    if run_mode == "detect":
        # Find foci of highly correlated pixels and pick local maxima
        coords, foci_mat = pick_foci(mat_conv, kernel_config["pearson"])
        # If nothing was detected, no point in resuming
        if coords is None:
            return None, None
        if dump:
            save_dump("05_foci", foci_mat)
    else:
        # Coordinates are shifted / overwritten in place below: work on a
        # private copy so that the caller's array is left untouched.
        coords = coords.copy()

    mat = contact_map.matrix.copy()
    det = [d.copy() for d in contact_map.detectable_bins]

    # Zero pad contact and convolution maps and shift missing bins and
    # pattern coords before validation if in full mode
    if full:
        mat = mat.tocoo()
        mat = preproc.zero_pad_sparse(mat, kh, kw, fmt="csr")
        mat_conv = preproc.zero_pad_sparse(mat_conv, kh, kw, fmt="csr")
        det[0] += kh
        det[1] += kw
        coords[:, 0] += kh
        coords[:, 1] += kw

    if not contact_map.inter:
        # Set the first max(km, kn) diagonals of the lower triangle to NaN
        # so that pileups do not count them
        big_k = max(km, kn)
        mat = mat.tocsr()
        mat += sp.diags(
            np.full(big_k, np.nan),
            -np.arange(1, big_k + 1),
            shape=mat.shape,
            format="csr",
        )
        # When detecting 1D pattern, enforce coordinates on diagonal
        # coordinates can be shifted by 1 since we keep the two first
        # diagonals to allow formation of foci via 4-way adjacency
        if kernel_config["max_dist"] == 0:
            coords[:, 0] = coords[:, 1]

    # Extract windows around coordinates and assign a correlation
    # to each pattern. In detection mode, we drop invalid patterns
    # in quantification mode, all input patterns are returned.
    filtered_coords, filtered_windows = validate_patterns(
        coords,
        mat,
        mat_conv.tocsr(),
        det,
        kernel_matrix,
        zero_tol=kernel_config["max_perc_zero"] / 100,
        missing_tol=kernel_config["max_perc_undetected"] / 100,
        drop=run_mode == "detect",
    )

    # Shift coordinates of detected patterns back if padding was added
    if full:
        filtered_coords.bin1 -= kh
        filtered_coords.bin2 -= kw

    try:
        filtered_coords["pvalue"] = mat_log10_pvals[
            filtered_coords.bin1, filtered_coords.bin2
        ].A1
    # No coordinate passed the validation filters
    except AttributeError:
        filtered_coords["pvalue"] = None
    # Remove the log10 transform (no multiple testing correction is done here)
    filtered_coords["pvalue"] = 10 ** filtered_coords["pvalue"]
    return filtered_coords, filtered_windows
Exemplo n.º 4
0
def _corrcoef2d_dense(
    signal, kernel, max_dist=None, sym_upper=False, scaling="pearson"
):
    """Implementation of signal-kernel 2D correlation for dense matrices
    Pearson correlation coefficient between signal and sliding kernel. Convolutes
    the input signal and kernel computes a cross correlation coefficient.

    Parameters
    ----------
    signal : numpy.array
        The input processed Hi-C matrix.
    kernel : numpy.array
        The pattern kernel to use for convolution.
    max_dist : int
        Maximum scan distance, in number of bins from the diagonal. If None, the whole
        matrix is convoluted. Otherwise, pixels further than this distance from the
        diagonal are set to 0 and ignored for performance. Only useful for 
        intrachromosomal matrices.
    sym_upper : False
        Whether the matrix is symmetric and upper triangle. True for intrachromosomal
        matrices.
    scaling : str
        Which metric to use when computing correlation coefficients. Either 'pearson'
        for Pearson correlation, or 'cross' for cross correlation.

    Returns
    -------
    numpy.array
        The sparse matrix of correlation coefficients
    """

    # Convert numpy matrices to array to avoid operator overloading
    if isinstance(signal, np.matrix):
        signal = np.array(signal)
    if isinstance(kernel, np.matrix):
        kernel = np.array(kernel)
    # If using only the upper triangle matrix, set diagonals that will
    # overlap the kernel in the lower triangle to their opposite diagonal
    # in the upper triangle
    if sym_upper:
        # Full matrix is stored for dense arrays anyway
        # -> make symmetric
        sys.stderr.write("Making dense matrix symmetric.\n")
        signal = signal + np.transpose(signal) - np.diag(np.diag(signal))

    kernel_size = kernel.shape[0] * kernel.shape[1]

    if scaling == "cross":
        # Compute convolution product
        conv = xcorr2(signal, kernel)
        # Generate constant kernel
        kernel1 = np.ones(kernel.shape)
        # Convolute squared signal with constant kernel
        signal2 = xcorr2(signal ** 2, kernel1)
        kernel2 = float(np.sum(kernel ** 2))
        denom = signal2 * kernel2
        denom = np.sqrt(denom)
    elif scaling == "pearson":
        mean_kernel = float(kernel.mean())
        std_kernel = float(kernel.std())
        if not (std_kernel > 0):
            raise ValueError(
                "Cannot have scaling=pearson when kernel"
                "is flat. Use scaling=cross."
            )

        kernel1 = np.ones(kernel.shape)
        mean_signal = xcorr2(signal, kernel1 / kernel_size)

        std_signal = (
            xcorr2(signal ** 2, kernel1 / kernel_size) - mean_signal ** 2
        )
        std_signal = np.sqrt(std_signal)
        conv = xcorr2(signal, kernel / kernel_size) - mean_signal * mean_kernel
        denom = std_signal * std_kernel

    conv /= denom

    if (max_dist is not None) and sym_upper:
        # Trim diagonals further than max_scan_distance
        conv = preproc.diag_trim(conv, max_dist)

    if sym_upper:
        conv = np.triu(conv)
    conv[~np.isfinite(conv)] = 0.0
    conv[conv < 0] = 0.0
    return conv
Exemplo n.º 5
0
def _corrcoef2d_sparse(
    signal, kernel, max_dist=None, sym_upper=False, scaling="pearson"
):
    """Implementation of signal-kernel 2D correlation for sparse matrices
    Pearson correlation coefficient between signal and sliding kernel. Convolutes
    the input signal and kernel computes a cross correlation coefficient.

    Parameters
    ----------
    signal : scipy.sparse.csr_matrix
        The input processed Hi-C matrix.
    kernel : numpy.array
        The pattern kernel to use for convolution.
    max_dist : int
        Maximum scan distance, in number of bins from the diagonal. If None, the whole
        matrix is convoluted. Otherwise, pixels further than this distance from the
        diagonal are set to 0 and ignored for performance. Only useful for 
        intrachromosomal matrices.
    sym_upper : False
        Whether the matrix is symmetric and upper triangle. True for intrachromosomal
        matrices.
    scaling : str
        Which metric to use when computing correlation coefficients. Either 'pearson'
        for Pearson correlation, or 'cross' for cross correlation.

    Returns
    -------
    scipy.sparse.csr_matrix
        The sparse matrix of correlation coefficients
    """
    # If using only the upper triangle matrix, set diagonals that will
    # overlap the kernel in the lower triangle to their opposite diagonal
    # in the upper triangle
    if sym_upper:
        signal = signal.tolil()
        for i in range(1, kernel.shape[0]):
            signal.setdiag(signal.diagonal(i), -i)
    signal = signal.tocsr()
    kernel_size = kernel.shape[0] * kernel.shape[1]

    if scaling == "cross":
        # Compute convolution product
        conv = xcorr2(signal, kernel)
        # Generate constant kernel
        kernel1 = np.ones(kernel.shape)
        # Convolute squared signal with constant kernel
        signal2 = xcorr2(signal.power(2), kernel1)
        kernel2 = float(np.sum(np.power(kernel, 2)))
        denom = signal2 * kernel2
        denom = denom.sqrt()
    elif scaling == "pearson":
        mean_kernel = float(kernel.mean())
        std_kernel = float(kernel.std())
        if not (std_kernel > 0):
            raise ValueError(
                "Cannot have scaling=pearson when kernel"
                "is flat. Use scaling=cross."
            )

        kernel1 = np.ones(kernel.shape)
        mean_signal = xcorr2(signal, kernel1 / kernel_size)
        std_signal = xcorr2(
            signal.power(2), kernel1 / kernel_size
        ) - mean_signal.power(2)
        std_signal = std_signal.sqrt()
        conv = xcorr2(signal, kernel / kernel_size) - mean_signal * mean_kernel
        denom = std_signal * std_kernel
    # Since elementwise sparse matrices division is not implemented, compute
    # numerator and denominator and perform division on the 1D array of nonzero
    # values.
    # Get coords of non-zero (nz) values in the numerator
    nz_vals = conv.nonzero()
    # Divide them by corresponding entries in the numerator
    denom = denom.tocsr()
    try:
        conv.data /= denom[nz_vals].A1
    # Case there are no nonzero corrcoef
    except AttributeError:
        pass

    if (max_dist is not None) and sym_upper:
        # Trim diagonals further than max_scan_distance
        conv = preproc.diag_trim(conv.todia(), max_dist)

    if sym_upper:
        conv = sp.triu(conv)
    conv = conv.tocoo()
    conv.data[~np.isfinite(conv.data)] = 0.0
    conv.data[conv.data < 0] = 0.0
    conv.eliminate_zeros()
    conv = conv.tocsr()
    return conv
Exemplo n.º 6
0
def pattern_detector(contact_map, kernel_config, kernel_matrix, dump=None):
    """Pattern detector

    Match a kernel against a contact map, pick foci of high correlation and
    extract windows around the patterns which pass validation.

    Parameters
    ----------
    contact_map : ContactMap object
        An object containing an inter- or intra-chromosomal Hi-C contact map
        and additional metadata.
    kernel_config : dict
        The kernel configuration, as documented in
        chromosight.utils.io.load_kernel_config
    kernel_matrix : numpy.array
        The kernel matrix to use for convolution as a 2D numpy array
    dump : str or None
        Folder in which dumps should be generated after each step of the
        detection process. If None, no dump is generated

    Returns
    -------
    filtered_chrom_patterns : numpy.array
        A 2D array of detected patterns with 3 columns: x, y, score.
    chrom_pattern_windows : numpy array
        A 3D array containing the pile of windows around detected patterns.
    """

    def save_dump(base, mat):
        """Write a sparse matrix to the dump folder, prefixed by the map name."""
        target = pathlib.Path(dump) / f"{contact_map.name}_{base}"
        sp.save_npz(target, mat)

    # A map which is not strictly larger than the kernel cannot be scanned
    smallest_side = min(contact_map.matrix.shape)
    if smallest_side <= max(kernel_matrix.shape):
        return None, None

    # Kernel matching: compute the map of correlation coefficients
    corr_map = corrcoef2d(
        contact_map.matrix,
        kernel_matrix,
        max_dist=kernel_config["max_dist"],
        sym_upper=not contact_map.inter,
    )
    if dump:
        save_dump("03_corrcoef2d", corr_map)

    # Replace missing coefficients by zeros
    corr_map = corr_map.tocoo()
    nan_entries = np.isnan(corr_map.data)
    corr_map.data[nan_entries] = 0

    # Intra-chromosomal maps: discard coefficients out of the scannable range
    if not contact_map.inter:
        corr_map = preproc.diag_trim(corr_map.todia(), contact_map.max_dist)
        if dump:
            save_dump("04_diag_trim", corr_map)
    corr_map = corr_map.tocoo()
    corr_map.eliminate_zeros()

    # Group highly correlated pixels into foci
    precision = kernel_config["precision"]
    chrom_pattern_coords, foci_mat = picker(corr_map, precision)
    if chrom_pattern_coords is None:
        return None, None
    if dump:
        save_dump("05_foci", foci_mat)

    # Filter the candidate patterns and extract their windows
    return validate_patterns(
        chrom_pattern_coords,
        contact_map.matrix,
        corr_map.tocsr(),
        contact_map.detectable_bins,
        kernel_matrix,
        kernel_config["max_perc_undetected"],
    )
Exemplo n.º 7
0
def detection_matrix(
    samples: pd.DataFrame,
    kernel: np.ndarray,
    region: Optional[str] = None,
    subsample: Optional[int] = None,
    max_dist: Optional[int] = None,
    pearson_thresh: Optional[float] = None,
    density_thresh: Optional[float] = None,
    snr_thresh: Optional[float] = 1.0,
    n_cpus: int = 4,
) -> Tuple[Optional[sp.csr_matrix], Optional[sp.csr_matrix]]:
    """
    Run the detection process for a single chromosome or region. This is abstracted from all
    notions of chromosomes and genomic coordinates.

    Parameters
    ----------
    samples : pandas.DataFrame
        Table with one row per sample. The `cool` and `cond` columns are
        read; a `mat` column is added (or overwritten) in place and ends up
        holding the correlation map of each sample.
    kernel : numpy.ndarray
        The 2D kernel matched against each sample's matrix.
    region : str or None
        Region passed to the matrix fetcher and to the preprocessing helpers.
    subsample : int or None
        If truthy, the lowest number of contacts among samples is computed
        and passed to preprocess_hic.
    max_dist : int or None
        Maximum distance from the diagonal. If given, input matrices are
        trimmed at max_dist plus the largest kernel dimension, and the
        output is trimmed at max_dist + 2.
    pearson_thresh : float or None
        If given, pixels where all samples are below this correlation value
        are set to 0.
    density_thresh : float or None
        If given and positive, the output is multiplied by the filter
        returned by make_density_filter.
    snr_thresh : float or None
        Passed to _median_bg_subtraction.
    n_cpus : int
        If greater than 1, a multiprocessing pool of that size is used to
        process samples; otherwise samples are processed sequentially.

    Returns
    -------
    diff, snr
        The two values returned by _median_bg_subtraction, `diff` being
        further filtered by density and distance. Both are None if the
        first sample's matrix is not larger than the kernel in every
        dimension.
    """
    # We consider the matrix is symmetric upper (i.e. intrachromosomal)
    sym_upper = True
    # Diagonals will be trimmed at max_dist with a margin for convolution
    if max_dist is None:
        trim_dist = None
    else:
        mat_size = samples.cool[0].matrix(sparse=True).fetch(region).shape[0]
        trim_dist = min(mat_size, max_dist + max(kernel.shape))
    # Compute number of contacts in the matrix with the lowest coverage
    if subsample:
        min_contacts = get_min_contacts(samples.cool, region=region)
    else:
        min_contacts = None
    # Define the condition of the first sample as the baseline condition
    control = samples.cond.values[0]
    # Samples are processed in parallel if requested
    pool = mp.Pool(n_cpus) if n_cpus > 1 else None

    def map_fun(func, arg_tuples):
        """Apply func to each tuple of arguments, in the pool if there is one."""
        if pool is not None:
            return pool.starmap(func, arg_tuples)
        return [func(*args) for args in arg_tuples]

    # The pool must be closed on every exit path, including the early return
    # and exceptions raised by any processing step.
    try:
        # Hi-C specific preprocessing individual matrices (subsample, balance, detrend)
        samples["mat"] = map_fun(
            preprocess_hic,
            zip(samples.cool, it.repeat(min_contacts), it.repeat(region)),
        )
        print(f"{region} preprocessed", file=sys.stderr)
        # Return nothing if the matrix is smaller than kernel
        if np.any(np.array(samples["mat"][0].shape) <= np.array(kernel.shape)):
            return None, None
        # Retrieve the indices of bins which are valid in all samples (not missing
        # because of repeated sequences or low coverage)
        common_bins = pap.get_common_valid_bins(samples["mat"])
        # Trim diagonals beyond max_dist (with kernel margin for the convolution)
        # to spare resources
        if trim_dist is not None:
            samples["mat"] = map_fun(
                cup.diag_trim, zip(samples["mat"], it.repeat(trim_dist))
            )
        # Generate a missing mask from these bins
        missing_mask = cup.make_missing_mask(
            samples["mat"][0].shape,
            common_bins,
            common_bins,
            max_dist=trim_dist,
            sym_upper=sym_upper,
        )
        # Remove all missing values from each sample's matrix
        samples["mat"] = map_fun(
            cup.erase_missing,
            zip(
                map(sp.triu, samples["mat"]),
                it.repeat(common_bins),
                it.repeat(common_bins),
                it.repeat(sym_upper),
            ),
        )
        print(f"{region} missing bins erased", file=sys.stderr)

        # Compute a density filter: regions with sufficient proportion of
        # nonzero pixels in kernel windows, in all samples. It is applied
        # to the final output below.
        if (density_thresh is not None) and (density_thresh > 0):
            density_filter = make_density_filter(
                samples["mat"],
                density_thresh=density_thresh,
                win_size=kernel.shape[0],
                sym_upper=sym_upper,
            )
        # Generate correlation maps for all samples using chromosight's algorithm.
        # Arguments are positional; after signal and kernel they are presumably
        # max_dist, sym_upper, full, missing_mask, missing_tol, tsvd, pval
        # -- verify against normxcorr2's signature.
        corrs = map_fun(
            cud.normxcorr2,
            zip(
                samples.mat.values,
                it.repeat(kernel),
                it.repeat(max_dist),
                it.repeat(True),
                it.repeat(True),
                it.repeat(missing_mask),
                it.repeat(0.75),
                it.repeat(None),
                it.repeat(False),
            ),
        )
        # Only the first element of each result is kept
        samples["mat"] = [tup[0] for tup in corrs]
        del corrs
        print(f"{region} correlation matrices computed", file=sys.stderr)
        # Get the union of nonzero coordinates across all samples
        total_nnz_set = pap.get_nnz_union(samples["mat"])
        # Fill zeros at these coordinates
        samples["mat"] = samples["mat"].apply(
            lambda cor: pap.fill_nnz(cor, total_nnz_set)
        )
        # Erase pixels where all samples are below pearson threshold
        if pearson_thresh is not None:
            pearson_fail = [
                (m.data < pearson_thresh).astype(bool) for m in samples["mat"]
            ]
            pearson_fail = np.bitwise_and.reduce(pearson_fail)
            # Threshold maps using pearson correlations to reduce noisy detections
            for i, m in enumerate(samples["mat"]):
                m.data[pearson_fail] = 0.0
                samples["mat"][i] = m
    finally:
        if pool is not None:
            pool.close()

    # Use median background
    diff, snr = _median_bg_subtraction(samples, control, snr_thresh)

    # Erase pixels which do not pass the density filter in all samples
    if (density_thresh is not None) and (density_thresh > 0):
        diff = diff.multiply(density_filter)
    # Remove all values beyond user-specified max_dist
    if max_dist is not None:
        diff = cup.diag_trim(diff, max_dist + 2)

    return diff, snr