Пример #1
0
def test_get_coolers():
    """Check that get_coolers loads lists of cool files and rejects bad input."""
    # A list with several paths yields one loaded object per path
    loaded = pai.get_coolers(COOLS)
    assert len(loaded) == len(COOLS)

    # A one-element list yields a Cooler object
    single = pai.get_coolers([COOLS[0]])
    assert isinstance(single[0], cooler.Cooler)

    # A bare path (not wrapped in a list) must be rejected
    with pytest.raises(TypeError):
        assert pai.get_coolers(COOLS[0])

    # Appending an incompatible file must fail with an explicit message:
    # first a file with unmatched dimensions, then one with an unmatched
    # resolution.
    incompatible = (
        ("natural.cool", "Shapes are inconsistent."),
        ("B_rebin.cool", "Resolutions are inconsistent."),
    )
    for extra_file, expected_msg in incompatible:
        with pytest.raises(ValueError) as exc_info:
            assert pai.get_coolers(COOLS + [str(DATA / extra_file)])
        assert str(exc_info.value) == expected_msg
Пример #2
0
def test_get_min_contacts_region():
    """Check the lowest contact value found when restricting to REGION."""
    coolers = pai.get_coolers(COOLS)
    observed = pah.get_min_contacts(coolers, REGION)
    expected = 415
    assert observed == expected
Пример #3
0
def test_get_min_contacts():
    """Check the lowest contact value found when no region is given."""
    coolers = pai.get_coolers(COOLS)
    observed = pah.get_min_contacts(coolers)
    expected = 200606
    assert observed == expected
Пример #4
0
def change_detection_pipeline(
    cool_files: Iterable[str],
    conditions: Iterable[str],
    kernel: Union[np.ndarray, str] = "loops",
    bed2d_file: Optional[str] = None,
    region: Optional[Union[Iterable[str], str]] = None,
    max_dist: Optional[int] = None,
    min_dist: Optional[int] = None,
    subsample: bool = True,
    pearson_thresh: Optional[float] = None,
    density_thresh: Optional[float] = 0.10,
    snr_thresh: Optional[float] = 1.0,
    n_cpus: int = 4,
) -> pd.DataFrame:
    """
    Run end to end pattern change detection pipeline on input cool files. A
    list of conditions of the same lengths as the sample list must be provided.
    The first condition in the list is used as the reference (control) state.

    Changes for a specific pattern are computed. A valid chromosight pattern
    name can be supplied (e.g. loops, borders, hairpins, ...) or a kernel
    matrix can be supplied directly instead. Minimum and maximum scanning
    distances can be specified directly (in basepairs) to override the kernel
    default values.

    Positions with changes will be reported in a pandas dataframe, along with
    their diff_score and snr values. Optionally, a 2D bed file with positions
    of interest can be specified, in which case the change values at these
    positions will be reported instead.

    Positive diff_scores mean the pattern intensity was increased relative to
    control (first condition).

    Parameters
    ----------
    cool_files :
        The list of paths to cool files for the input samples.
    conditions :
        The list of conditions matching the samples.
    kernel :
        Either the kernel to use as pattern as a numpy array, or the name of a
        valid chromosight pattern.
    bed2d_file :
        Path to a bed2D file containing a list of 2D positions. If this is
        provided, pattern changes at these coordinates will be quantified.
        Otherwise, they will be detected.
    region :
        Either a single UCSC format region string, or a list of multiple
        regions. The analysis will be restricted to those regions. If None,
        each chromosome of the first sample is used as a region.
    max_dist :
        Maximum interaction distance (in basepairs) to consider in the analysis.
        If this is not specified and a chromosight kernel was specified, the
        default max_dist for that kernel is used. If the case of a custom kernel,
        the whole matrix will be scanned if no max_dist is specified.
    min_dist :
        Minimum interaction distance (in basepairs). Reported positions whose
        two bins are closer than this distance are dropped from the output.
        If this is not specified and a chromosight kernel was specified, the
        default min_dist for that kernel is used. Otherwise, 0 is used.
    subsample :
        Whether all input matrices should be subsampled to the same number of
        contacts as the least covered sample.
    pearson_thresh :
        The pearson correlation threshold to use when detecting patterns. If None,
        the default value for the kernel is used.
    density_thresh :
        The pixel density threshold to require. Low coverage windows with a
        proportion of nonzero pixels below this value are discarded.
    snr_thresh :
        Signal-to-noise ratio threshold, forwarded as is to detection_matrix.
    n_cpus :
        Number of CPU cores to allocate for parallel operations.

    Returns
    -------
    pd.DataFrame :
        The list of reported 2D coordinates and their change intensities.

    Raises
    ------
    ValueError :
        If cool_files and conditions have different lengths, or if kernel is
        neither a string nor a numpy array.
    AttributeError :
        If kernel is a string that is not a valid pattern name.
    """
    # Make sure each sample has an associated condition
    if len(cool_files) != len(conditions):
        raise ValueError(
            "The lists of cool files and conditions must have the same length")

    # If a pattern name was provided, load corresponding chromosight kernel
    if isinstance(kernel, str):
        kernel_name = kernel
        # Only the attribute lookup can signal an unknown pattern name, so
        # keep the try body limited to it and chain the original error.
        try:
            kernel_config = getattr(ck, kernel_name)
        except AttributeError as err:
            raise AttributeError(
                f"{kernel_name} is not a valid pattern name") from err
        kernel = kernel_config["kernels"][0]
        # Explicit arguments take precedence over the kernel defaults
        if max_dist is None:
            max_dist = kernel_config["max_dist"]
        if min_dist is None:
            min_dist = kernel_config["min_dist"]
        if pearson_thresh is None:
            pearson_thresh = kernel_config["pearson"]
        print(f"Loading default parameter for kernel '{kernel_name}'...")
        print(f"pearson_thresh: {pearson_thresh}")
        print(f"min_dist: {min_dist}")
        print(f"max_dist: {max_dist}")
    elif not isinstance(kernel, np.ndarray):
        raise ValueError(
            "Kernel must either be a valid chromosight pattern name, or a 2D numpy.ndarray of floats"
        )
    # Associate samples with their conditions
    samples = pd.DataFrame({
        "cond": conditions,
        "cool": pai.get_coolers(cool_files)
    })
    print(
        f"Changes will be computed relative to condition: {samples.cond.values[0]}"
    )
    # The first sample provides the bin size, chromosomes and bin table
    clr = samples.cool.values[0]
    # Convert distances from basepairs to number of bins
    if max_dist is not None:
        max_dist = max_dist // clr.binsize
    min_dist = 0 if min_dist is None else min_dist // clr.binsize
    # Define each chromosome as a region, if None specified
    if region is None:
        regions = clr.chroms()[:]["name"].tolist()
    elif isinstance(region, str):
        regions = [region]
    else:
        regions = region
    pos_cols = [
        "chrom1",
        "start1",
        "end1",
        "chrom2",
        "start2",
        "end2",
        "bin1",
        "bin2",
        "diff_score",
        "snr",
    ]
    if bed2d_file:
        positions = cio.load_bed2d(bed2d_file)
        # Placeholder values, filled in region by region below
        for col in ["diff_score", "snr", "bin1", "bin2"]:
            positions[col] = np.nan
    else:
        positions = pd.DataFrame(columns=pos_cols)
    for reg in regions:
        # Subset bins to the range of interest
        bins = clr.bins().fetch(reg).reset_index(drop=True)
        diff, snr = detection_matrix(
            samples,
            kernel,
            region=reg,
            subsample=subsample,
            max_dist=max_dist,
            pearson_thresh=pearson_thresh,
            density_thresh=density_thresh,
            n_cpus=n_cpus,
            snr_thresh=snr_thresh,
        )

        # If the matrix was too small or no difference was found, skip it
        if diff is None or diff.nnz == 0:
            continue
        # If positions were provided, return the change value for each of them
        if bed2d_file:
            tmp_chr = reg.split(":")[0]
            tmp_rows = (positions.chrom1 == tmp_chr) & (positions.chrom2
                                                        == tmp_chr)
            # If there are no positions of interest on this chromosome, just
            # skip it
            if not np.any(tmp_rows):
                continue
            # Work on an explicit copy: scratch columns are added below and
            # must not be written through to (or warned about on) `positions`.
            tmp_pos = positions.loc[tmp_rows, :].copy()
            # Convert both coordinates from genomic coords to bins
            for i in [1, 2]:
                tmp_pos["chrom"] = tmp_pos[f"chrom{i}"]
                tmp_pos["pos"] = (tmp_pos[f"start{i}"] +
                                  tmp_pos[f"end{i}"]) // 2
                tmp_pos[f"bin{i}"] = coords_to_bins(clr, tmp_pos).astype(int)
                # Save bin coordinates from current chromosome to the full table
                positions.loc[tmp_rows, f"bin{i}"] = tmp_pos[f"bin{i}"]
            tmp_pos = tmp_pos.drop(columns=["pos", "chrom"])
            # NOTE(review): the matrices are indexed with start // binsize
            # rather than with the bin columns computed above; presumably
            # this assumes the region matrix starts at position 0 of the
            # chromosome -- confirm for sub-chromosomal regions.
            mat_rows = tmp_pos.start1 // clr.binsize
            mat_cols = tmp_pos.start2 // clr.binsize
            # Retrieve diff and snr values for each coordinate
            positions.loc[tmp_rows, "diff_score"] = diff[mat_rows,
                                                         mat_cols].A1
            positions.loc[tmp_rows, "snr"] = snr[mat_rows, mat_cols].A1
        # Otherwise report individual spots of change using chromosight
        else:
            # Pick "foci" of changed pixels and their local maxima
            tmp_pos, _ = cud.pick_foci(abs(diff), 0.01, min_size=3)
            # Get genomic positions from matrix coordinates
            tmp_pos = pd.DataFrame(tmp_pos, columns=["bin1", "bin2"])
            for i in [1, 2]:
                coords = (bins.loc[tmp_pos[f"bin{i}"],
                                   ["chrom", "start", "end"]].reset_index(
                                       drop=True).rename(
                                           columns={
                                               "chrom": f"chrom{i}",
                                               "start": f"start{i}",
                                               "end": f"end{i}",
                                           }))
                # Add axis' columns to  dataframe
                tmp_pos = pd.concat([coords, tmp_pos], axis=1)
            # Retrieve diff values for each coordinate
            try:
                tmp_pos["diff_score"] = diff[tmp_pos.bin1, tmp_pos.bin2].A1
            # No position found, go to next region
            except AttributeError:
                continue
            tmp_pos["snr"] = snr[tmp_pos.bin1, tmp_pos.bin2].A1
            # Append new chromosome's rows
            positions = pd.concat([positions, tmp_pos], axis=0)
            # For 1D patterns (e.g. borders) set diagonal positions.
            if max_dist == 0:
                positions[["bin1", "chrom1", "start1", "end1"
                           ]] = positions[["bin2", "chrom2", "start2", "end2"]]
    # Keep only the output columns, in a fixed order
    positions = positions.loc[:, pos_cols]
    # Drop positions closer to the diagonal than min_dist (in bins). Rows
    # whose bins are still NaN fail the comparison and are dropped as well.
    far_enough = abs(positions.bin2 - positions.bin1) >= min_dist
    positions = positions.loc[far_enough, :].reset_index(drop=True)
    print(positions)
    return positions
Пример #5
0
def change_detection_pipeline(
    cool_files: Iterable[str],
    conditions: Iterable[str],
    kernel: Union[np.ndarray, str] = "loops",
    bed2d_file: Optional[str] = None,
    region: Optional[Union[Iterable[str], str]] = None,
    max_dist: Optional[int] = None,
    subsample: bool = True,
    percentile_thresh: float = 95.0,
    n_cpus: int = 4,
) -> pd.DataFrame:
    """
    Run end to end pattern change detection pipeline on input cool files. A
    list of conditions of the same lengths as the sample list must be provided.
    The first condition in the list is used as the reference (control) state.

    Changes for a specific pattern are computed. A valid chromosight pattern
    name can be supplied (e.g. loops, borders, hairpins, ...) or a kernel matrix
    can be supplied directly instead.

    Positions with significant changes will be reported in a pandas
    dataframe. Optionally, a 2D bed file with positions of interest can be
    specified, in which case change value at these positions will be reported
    instead.

    Positive diff_scores mean the pattern intensity was increased relative to
    control (first condition).

    Parameters
    ----------
    cool_files :
        The list of paths to cool files for the input samples.
    conditions :
        The list of conditions matching the samples.
    kernel :
        Either the kernel to use as pattern as a numpy array, or the name of a
        valid chromosight pattern.
    bed2d_file :
        Path to a bed2D file containing a list of 2D positions. If provided,
        change values at these coordinates are reported instead of detected
        positions.
    region :
        Either a single UCSC format region string, or a list of multiple
        regions. If None, each chromosome of the first sample is used as a
        region.
    max_dist :
        Maximum interaction distance, forwarded as is to detection_matrix.
    subsample :
        Forwarded as is to detection_matrix.
    percentile_thresh :
        Percentile forwarded to detection_matrix, which returns the threshold
        subsequently used to pick changed pixels (presumably a value between
        1 and 100 -- confirm against detection_matrix).
    n_cpus :
        Number of CPU cores, forwarded as is to detection_matrix.

    Returns
    -------
    pd.DataFrame :
        The list of reported 2D coordinates and their diff_score.

    Raises
    ------
    ValueError :
        If cool_files and conditions have different lengths, or if kernel is
        neither a string nor a numpy array.
    AttributeError :
        If kernel is a string that is not a valid pattern name.
    """
    # Make sure each sample has an associated condition
    if len(cool_files) != len(conditions):
        raise ValueError(
            "The lists of cool files and conditions must have the same length")

    # If a pattern name was provided, load corresponding chromosight kernel
    if isinstance(kernel, str):
        kernel_name = kernel
        # Only the attribute lookup can signal an unknown pattern name, so
        # keep the try body limited to it and chain the original error.
        try:
            kernel_config = getattr(ck, kernel_name)
        except AttributeError as err:
            raise AttributeError(
                f"{kernel_name} is not a valid pattern name") from err
        kernel = kernel_config["kernels"][0]
    elif not isinstance(kernel, np.ndarray):
        raise ValueError(
            "Kernel must either be a valid chromosight pattern name, or a 2D numpy.ndarray of floats"
        )
    # Associate samples with their conditions
    samples = pd.DataFrame({
        "cond": conditions,
        "cool": pai.get_coolers(cool_files)
    })
    print(
        f"Changes will be computed relative to condition: {samples.cond.values[0]}"
    )
    # The first sample provides the bin size, chromosomes and bin table
    clr = samples.cool.values[0]
    # Define each chromosome as a region, if None specified. `regions` must
    # be bound in every branch since the main loop iterates over it.
    if region is None:
        regions = clr.chroms()[:]["name"].tolist()
    elif isinstance(region, str):
        regions = [region]
    else:
        regions = region
    pos_cols = [
        "chrom1",
        "start1",
        "end1",
        "chrom2",
        "start2",
        "end2",
        "bin1",
        "bin2",
        "diff_score",
    ]
    if bed2d_file:
        positions = cio.load_bed2d(bed2d_file)
        # Placeholder values, filled in region by region below. These names
        # must match pos_cols exactly for the final column selection to work.
        for col in ["diff_score", "bin1", "bin2"]:
            positions[col] = np.nan
    else:
        positions = pd.DataFrame(columns=pos_cols)
    for reg in regions:
        # Subset bins to the range of interest
        bins = clr.bins().fetch(reg).reset_index(drop=True)
        diff, thresh = detection_matrix(
            samples,
            kernel,
            region=reg,
            subsample=subsample,
            max_dist=max_dist,
            percentile_thresh=percentile_thresh,
            n_cpus=n_cpus,
        )
        # If positions were provided, return the change value for each of them
        if bed2d_file:
            tmp_chr = reg.split(":")[0]
            tmp_rows = (positions.chrom1 == tmp_chr) & (positions.chrom2
                                                        == tmp_chr)
            # If there are no positions of interest on this chromosome, just
            # skip it
            if not np.any(tmp_rows):
                continue
            # Work on an explicit copy: scratch columns are added below and
            # must not be written through to (or warned about on) `positions`.
            tmp_pos = positions.loc[tmp_rows, :].copy()
            # Convert both coordinates from genomic coords to bins
            for i in [1, 2]:
                tmp_pos["chrom"] = tmp_pos[f"chrom{i}"]
                tmp_pos["pos"] = (tmp_pos[f"start{i}"] +
                                  tmp_pos[f"end{i}"]) // 2
                tmp_pos[f"bin{i}"] = coords_to_bins(clr, tmp_pos).astype(int)
                # Save bin coordinates from current chromosome to the full table
                positions.loc[tmp_rows, f"bin{i}"] = tmp_pos[f"bin{i}"]
            tmp_pos = tmp_pos.drop(columns=["pos", "chrom"])
            # NOTE(review): the matrix is indexed with start // binsize
            # rather than with the bin columns computed above; presumably
            # this assumes the region matrix starts at position 0 of the
            # chromosome -- confirm for sub-chromosomal regions.
            mat_rows = tmp_pos.start1 // clr.binsize
            mat_cols = tmp_pos.start2 // clr.binsize
            # Retrieve diff values for each coordinate
            positions.loc[tmp_rows, "diff_score"] = diff[mat_rows,
                                                         mat_cols].A1
        # Otherwise report individual spots of change using chromosight
        else:
            # Pick "foci" of changed pixels and their local maxima
            tmp_pos, _ = cud.picker(abs(diff), thresh)
            # Get genomic positions from matrix coordinates
            tmp_pos = pd.DataFrame(tmp_pos, columns=["bin1", "bin2"])
            for i in [1, 2]:
                coords = (bins.loc[tmp_pos[f"bin{i}"],
                                   ["chrom", "start", "end"]].reset_index(
                                       drop=True).rename(
                                           columns={
                                               "chrom": f"chrom{i}",
                                               "start": f"start{i}",
                                               "end": f"end{i}",
                                           }))
                # Add axis' columns to  dataframe
                tmp_pos = pd.concat([coords, tmp_pos], axis=1)
            # Retrieve diff values for each coordinate
            tmp_pos["diff_score"] = diff[tmp_pos.bin1, tmp_pos.bin2].A1
            # Append new chromosome's rows
            positions = pd.concat([positions, tmp_pos], axis=0)
    # Keep only the output columns, in a fixed order
    positions = positions.loc[:, pos_cols]
    return positions