def test_get_coolers():
    """Test loading of cool files from multiple samples."""
    # Loading several files yields one object per input path
    assert len(pai.get_coolers(COOLS)) == len(COOLS)
    # A single path wrapped in a list is loaded as a Cooler
    assert isinstance(pai.get_coolers([COOLS[0]])[0], cooler.Cooler)
    # A bare string instead of a list of paths should crash
    with pytest.raises(TypeError):
        pai.get_coolers(COOLS[0])
    # Dimensions unmatched, should give an explicit error
    with pytest.raises(ValueError) as err:
        pai.get_coolers(COOLS + [str(DATA / "natural.cool")])
    # The message is checked once the context manager has exited: any
    # statement placed after the raising call inside the `with` body is
    # never executed, so the check would silently be skipped there.
    assert str(err.value) == "Shapes are inconsistent."
    # Resolutions unmatched, should give an explicit error
    with pytest.raises(ValueError) as err:
        pai.get_coolers(COOLS + [str(DATA / "B_rebin.cool")])
    assert str(err.value) == "Resolutions are inconsistent."
def test_get_min_contacts_region():
    """Check the lowest contact value found when restricting to REGION."""
    expected = 415
    coolers = pai.get_coolers(COOLS)
    observed = pah.get_min_contacts(coolers, REGION)
    assert observed == expected
def test_get_min_contacts():
    """Check the lowest contact value found when no region is given."""
    expected = 200606
    coolers = pai.get_coolers(COOLS)
    observed = pah.get_min_contacts(coolers)
    assert observed == expected
def change_detection_pipeline(
    cool_files: Iterable[str],
    conditions: Iterable[str],
    kernel: Union[np.ndarray, str] = "loops",
    bed2d_file: Optional[str] = None,
    region: Optional[Union[Iterable[str], str]] = None,
    max_dist: Optional[int] = None,
    min_dist: Optional[int] = None,
    subsample: bool = True,
    pearson_thresh: Optional[float] = None,
    density_thresh: Optional[float] = 0.10,
    snr_thresh: Optional[float] = 1.0,
    n_cpus: int = 4,
) -> pd.DataFrame:
    """
    Run end to end pattern change detection pipeline on input cool files.

    A list of conditions of the same length as the sample list must be
    provided. The first condition in the list is used as the reference
    (control) state.

    Changes for a specific pattern are computed. A valid chromosight pattern
    name can be supplied (e.g. loops, borders, hairpins, ...) or a kernel
    matrix can be supplied directly instead. When a pattern name is given,
    any of max_dist, min_dist and pearson_thresh left to None is filled in
    with the default stored for that pattern. Distances are given in
    basepairs and converted to bins using the bin size of the first sample.

    Positions with changes are reported in a pandas dataframe. Optionally, a
    2D bed file with positions of interest can be specified, in which case
    the change value at these positions is reported instead.

    Positive diff_scores mean the pattern intensity was increased relative to
    control (first condition).

    Parameters
    ----------
    cool_files :
        The paths to cool files for the input samples. Any iterable of
        strings is accepted.
    conditions :
        The conditions matching the samples, in the same order.
    kernel :
        Either the kernel to use as pattern as a numpy array, or the name of a
        valid chromosight pattern.
    bed2d_file :
        Path to a bed2D file containing a list of 2D positions. If this is
        provided, pattern changes at these coordinates will be quantified.
        Otherwise, they will be detected.
    region :
        Either a single UCSC format region string, or a list of multiple
        regions. The analysis will be restricted to those regions. If None,
        every chromosome of the first sample is used as a region.
    max_dist :
        Maximum interaction distance (in basepairs) to consider in the
        analysis. If this is not specified and a chromosight kernel was
        specified, the default max_dist for that kernel is used. In the case
        of a custom kernel, the whole matrix will be scanned if no max_dist
        is specified.
    min_dist :
        Minimum interaction distance (in basepairs). Reported positions whose
        two bins are closer than this are dropped from the output. If not
        specified, the kernel default is used for named patterns and 0 for
        custom kernels.
    subsample :
        Whether all input matrices should be subsampled to the same number of
        contacts as the least covered sample.
    pearson_thresh :
        The pearson correlation threshold to use when detecting patterns. If
        None, the default value for the kernel is used.
    density_thresh :
        The pixel density threshold to require. Low coverage windows with a
        proportion of nonzero pixels below this value are discarded.
    snr_thresh :
        Threshold forwarded unchanged to detection_matrix (presumably a
        signal-to-noise ratio cutoff; see that function for its exact
        meaning).
    n_cpus :
        Number of CPU cores to allocate for parallel operations.

    Returns
    -------
    pd.DataFrame :
        The list of reported 2D coordinates and their change intensities,
        with columns chrom1, start1, end1, chrom2, start2, end2, bin1, bin2,
        diff_score and snr.

    Raises
    ------
    ValueError :
        If cool_files and conditions have different lengths, or if kernel is
        neither a string nor a numpy array.
    AttributeError :
        If kernel is a string that is not a valid pattern name.
    """
    # The arguments are typed as iterables, but their length is needed and
    # they are consumed again below: materialise one-shot iterables such as
    # generators. Strings are left untouched so that a single path passed by
    # mistake is still handled (rejected) exactly as before downstream.
    if not isinstance(cool_files, str):
        cool_files = list(cool_files)
    if not isinstance(conditions, str):
        conditions = list(conditions)

    # Make sure each sample has an associated condition
    if len(cool_files) != len(conditions):
        raise ValueError(
            "The lists of cool files and conditions must have the same length")

    # If a pattern name was provided, load corresponding chromosight kernel
    if isinstance(kernel, str):
        kernel_name = kernel
        # Only the attribute lookup can signal an unknown pattern name
        try:
            kernel_config = getattr(ck, kernel_name)
        except AttributeError as err:
            raise AttributeError(
                f"{kernel_name} is not a valid pattern name") from err
        kernel = kernel_config["kernels"][0]
        # User supplied values take precedence over the pattern defaults
        if max_dist is None:
            max_dist = kernel_config["max_dist"]
        if min_dist is None:
            min_dist = kernel_config["min_dist"]
        if pearson_thresh is None:
            pearson_thresh = kernel_config["pearson"]
        print(f"Loading default parameter for kernel '{kernel_name}'...")
        print(f"pearson_thresh: {pearson_thresh}")
        print(f"min_dist: {min_dist}")
        print(f"max_dist: {max_dist}")
    elif isinstance(kernel, np.ndarray):
        kernel_name = "custom kernel"
    else:
        raise ValueError(
            "Kernel must either be a valid chromosight pattern name, or a 2D numpy.ndarray of floats"
        )

    # Associate samples with their conditions
    samples = pd.DataFrame({
        "cond": conditions,
        "cool": pai.get_coolers(cool_files)
    })
    print(
        f"Changes will be computed relative to condition: {samples.cond.values[0]}"
    )

    # The first sample is used as reference for bin size and chromosomes
    clr = samples.cool.values[0]
    # Convert distances from basepairs to bins
    if max_dist is not None:
        max_dist = max_dist // clr.binsize
    if min_dist is None:
        min_dist = 0
    else:
        min_dist = min_dist // clr.binsize

    # Define each chromosome as a region, if None specified
    if region is None:
        regions = clr.chroms()[:]["name"].tolist()
    elif isinstance(region, str):
        regions = [region]
    else:
        regions = region

    pos_cols = [
        "chrom1",
        "start1",
        "end1",
        "chrom2",
        "start2",
        "end2",
        "bin1",
        "bin2",
        "diff_score",
        "snr",
    ]
    if bed2d_file:
        positions = cio.load_bed2d(bed2d_file)
        # Filled in region by region; rows never visited stay NaN
        for col in ["diff_score", "snr", "bin1", "bin2"]:
            positions[col] = np.nan
    else:
        positions = pd.DataFrame(columns=pos_cols)

    # Per-region tables of detected positions (detection mode only); they are
    # concatenated once after the loop instead of growing `positions` at
    # every iteration.
    detected = []
    for reg in regions:
        # Subset bins to the range of interest
        bins = clr.bins().fetch(reg).reset_index(drop=True)
        diff, snr = detection_matrix(
            samples,
            kernel,
            region=reg,
            subsample=subsample,
            max_dist=max_dist,
            pearson_thresh=pearson_thresh,
            density_thresh=density_thresh,
            n_cpus=n_cpus,
            snr_thresh=snr_thresh,
        )
        # If the matrix was too small or no difference was found, skip it
        if diff is None or diff.nnz == 0:
            continue

        # If positions were provided, return the change value for each of them
        if bed2d_file:
            tmp_chr = reg.split(":")[0]
            tmp_rows = (positions.chrom1 == tmp_chr) & (positions.chrom2
                                                        == tmp_chr)
            # If there are no positions of interest on this chromosome, just
            # skip it
            if not np.any(tmp_rows):
                continue
            # Work on an explicit copy: scratch columns are added below and
            # must not be written through a slice of `positions`.
            tmp_pos = positions.loc[tmp_rows, :].copy()
            # Convert both coordinates from genomic coords to bins
            for i in [1, 2]:
                tmp_pos["chrom"] = tmp_pos[f"chrom{i}"]
                tmp_pos["pos"] = (tmp_pos[f"start{i}"] +
                                  tmp_pos[f"end{i}"]) // 2
                tmp_pos[f"bin{i}"] = coords_to_bins(clr, tmp_pos).astype(int)
                # Save bin coordinates from current chromosome to the full table
                positions.loc[tmp_rows, f"bin{i}"] = tmp_pos[f"bin{i}"]
            tmp_pos = tmp_pos.drop(columns=["pos", "chrom"])
            # Retrieve diff values for each coordinate
            # NOTE(review): indices are start // binsize, i.e. relative to the
            # chromosome start; confirm this matches the matrices returned by
            # detection_matrix when `reg` is a sub-chromosomal region.
            row_idx = tmp_pos.start1 // clr.binsize
            col_idx = tmp_pos.start2 // clr.binsize
            positions.loc[tmp_rows, "diff_score"] = diff[row_idx, col_idx].A1
            positions.loc[tmp_rows, "snr"] = snr[row_idx, col_idx].A1
        # Otherwise report individual spots of change using chromosight
        else:
            # Pick "foci" of changed pixels and their local maxima
            tmp_pos, _ = cud.pick_foci(abs(diff), 0.01, min_size=3)
            # Get genomic positions from matrix coordinates
            tmp_pos = pd.DataFrame(tmp_pos, columns=["bin1", "bin2"])
            for i in [1, 2]:
                coords = (bins.loc[tmp_pos[f"bin{i}"],
                                   ["chrom", "start", "end"]]
                          .reset_index(drop=True)
                          .rename(columns={
                              "chrom": f"chrom{i}",
                              "start": f"start{i}",
                              "end": f"end{i}",
                          }))
                # Add axis' columns to dataframe
                tmp_pos = pd.concat([coords, tmp_pos], axis=1)
            # Retrieve diff values for each coordinate
            try:
                tmp_pos["diff_score"] = diff[tmp_pos.bin1, tmp_pos.bin2].A1
            # No position found, go to next region
            except AttributeError:
                continue
            tmp_pos["snr"] = snr[tmp_pos.bin1, tmp_pos.bin2].A1
            # Keep new chromosome's rows for the final concatenation
            detected.append(tmp_pos)

    if detected:
        positions = pd.concat([positions, *detected], axis=0)

    # For 1D patterns (e.g. borders) set diagonal positions.
    if max_dist == 0:
        positions[["bin1", "chrom1", "start1",
                   "end1"]] = positions[["bin2", "chrom2", "start2", "end2"]]
    positions = positions.loc[:, pos_cols]
    # Drop positions closer than min_dist (in bins). Rows whose bins are
    # still NaN compare False and are dropped as well.
    far_enough = abs(positions.bin2 - positions.bin1) >= min_dist
    positions = positions.loc[far_enough, :].reset_index(drop=True)
    # NOTE(review): prints the whole result table on every call; this looks
    # like a debugging leftover - confirm nothing relies on it before removing.
    print(positions)
    return positions
def change_detection_pipeline(
    cool_files: Iterable[str],
    conditions: Iterable[str],
    kernel: Union[np.ndarray, str] = "loops",
    bed2d_file: Optional[str] = None,
    region: Optional[Union[Iterable[str], str]] = None,
    max_dist: Optional[int] = None,
    subsample: bool = True,
    percentile_thresh: float = 95.0,
    n_cpus: int = 4,
) -> pd.DataFrame:
    """
    Run end to end pattern change detection pipeline on input cool files.

    A list of conditions of the same length as the sample list must be
    provided. The first condition in the list is used as the reference
    (control) state.

    Changes for a specific pattern are computed. A valid chromosight pattern
    name can be supplied (e.g. loops, borders, hairpins, ...) or a kernel
    matrix can be supplied directly instead.

    Positions with significant changes will be reported in a pandas
    dataframe. Optionally, a 2D bed file with positions of interest can be
    specified, in which case change value at these positions will be reported
    instead.

    Positive diff_scores mean the pattern intensity was increased relative to
    control (first condition).

    Parameters
    ----------
    cool_files :
        The paths to cool files for the input samples. Any iterable of
        strings is accepted.
    conditions :
        The conditions matching the samples, in the same order.
    kernel :
        Either the kernel to use as pattern as a numpy array, or the name of a
        valid chromosight pattern.
    bed2d_file :
        Path to a bed2D file containing a list of 2D positions. If provided,
        the change value is reported at these coordinates instead of at
        detected positions.
    region :
        Either a single UCSC format region string, or a list of multiple
        regions. If None, every chromosome of the first sample is used as a
        region.
    max_dist :
        Forwarded unchanged to detection_matrix (presumably the maximum
        interaction distance to scan; see that function for the unit).
    subsample :
        Forwarded unchanged to detection_matrix.
    percentile_thresh :
        Forwarded unchanged to detection_matrix, which returns the threshold
        then used to pick changed positions.
    n_cpus :
        Number of CPU cores, forwarded to detection_matrix.

    Returns
    -------
    pd.DataFrame :
        The reported 2D coordinates and their change intensities, with
        columns chrom1, start1, end1, chrom2, start2, end2, bin1, bin2 and
        diff_score.

    Raises
    ------
    ValueError :
        If cool_files and conditions have different lengths, or if kernel is
        neither a string nor a numpy array.
    AttributeError :
        If kernel is a string that is not a valid pattern name.
    """
    # The arguments are typed as iterables, but their length is needed and
    # they are consumed again below: materialise one-shot iterables such as
    # generators. Strings are left untouched so that a single path passed by
    # mistake is still handled (rejected) exactly as before downstream.
    if not isinstance(cool_files, str):
        cool_files = list(cool_files)
    if not isinstance(conditions, str):
        conditions = list(conditions)

    # Make sure each sample has an associated condition
    if len(cool_files) != len(conditions):
        raise ValueError(
            "The lists of cool files and conditions must have the same length")

    # If a pattern name was provided, load corresponding chromosight kernel
    if isinstance(kernel, str):
        kernel_name = kernel
        # Only the attribute lookup can signal an unknown pattern name
        try:
            kernel_config = getattr(ck, kernel_name)
        except AttributeError as err:
            raise AttributeError(
                f"{kernel_name} is not a valid pattern name") from err
        kernel = kernel_config["kernels"][0]
    elif isinstance(kernel, np.ndarray):
        kernel_name = "custom kernel"
    else:
        raise ValueError(
            "Kernel must either be a valid chromosight pattern name, or a 2D numpy.ndarray of floats"
        )

    # Associate samples with their conditions
    samples = pd.DataFrame({
        "cond": conditions,
        "cool": pai.get_coolers(cool_files)
    })
    print(
        f"Changes will be computed relative to condition: {samples.cond.values[0]}"
    )

    # Define each chromosome as a region, if None specified. `regions` must
    # be bound in every branch: it is what the loop below iterates over.
    clr = samples.cool.values[0]
    if region is None:
        regions = clr.chroms()[:]["name"].tolist()
    elif isinstance(region, str):
        regions = [region]
    else:
        regions = region

    pos_cols = [
        "chrom1",
        "start1",
        "end1",
        "chrom2",
        "start2",
        "end2",
        "bin1",
        "bin2",
        "diff_score",
    ]
    if bed2d_file:
        positions = cio.load_bed2d(bed2d_file)
        # Filled in region by region; rows never visited stay NaN. All three
        # names must match pos_cols exactly, since they are selected by name
        # when the table is returned.
        for col in ["diff_score", "bin1", "bin2"]:
            positions[col] = np.nan
    else:
        positions = pd.DataFrame(columns=pos_cols)

    for reg in regions:
        # Subset bins to the range of interest
        bins = clr.bins().fetch(reg).reset_index(drop=True)
        diff, thresh = detection_matrix(
            samples,
            kernel,
            region=reg,
            subsample=subsample,
            max_dist=max_dist,
            percentile_thresh=percentile_thresh,
            n_cpus=n_cpus,
        )
        # If positions were provided, return the change value for each of them
        if bed2d_file:
            tmp_chr = reg.split(":")[0]
            tmp_rows = (positions.chrom1 == tmp_chr) & (positions.chrom2
                                                        == tmp_chr)
            # If there are no positions of interest on this chromosome, just
            # skip it
            if not np.any(tmp_rows):
                continue
            # Work on an explicit copy: scratch columns are added below and
            # must not be written through a slice of `positions`.
            tmp_pos = positions.loc[tmp_rows, :].copy()
            # Convert both coordinates from genomic coords to bins
            for i in [1, 2]:
                tmp_pos["chrom"] = tmp_pos[f"chrom{i}"]
                tmp_pos["pos"] = (tmp_pos[f"start{i}"] +
                                  tmp_pos[f"end{i}"]) // 2
                tmp_pos[f"bin{i}"] = coords_to_bins(clr, tmp_pos).astype(int)
                # Save bin coordinates from current chromosome to the full table
                positions.loc[tmp_rows, f"bin{i}"] = tmp_pos[f"bin{i}"]
            tmp_pos = tmp_pos.drop(columns=["pos", "chrom"])
            # Retrieve diff values for each coordinate
            # NOTE(review): indices are start // binsize, i.e. relative to the
            # chromosome start; confirm this matches the matrix returned by
            # detection_matrix when `reg` is a sub-chromosomal region.
            positions.loc[tmp_rows, "diff_score"] = diff[
                tmp_pos.start1 // clr.binsize,
                tmp_pos.start2 // clr.binsize].A1
        # Otherwise report individual spots of change using chromosight
        else:
            # Pick "foci" of changed pixels and their local maxima
            tmp_pos, _ = cud.picker(abs(diff), thresh)
            # Get genomic positions from matrix coordinates
            tmp_pos = pd.DataFrame(tmp_pos, columns=["bin1", "bin2"])
            for i in [1, 2]:
                coords = (bins.loc[tmp_pos[f"bin{i}"],
                                   ["chrom", "start", "end"]]
                          .reset_index(drop=True)
                          .rename(columns={
                              "chrom": f"chrom{i}",
                              "start": f"start{i}",
                              "end": f"end{i}",
                          }))
                # Add axis' columns to dataframe
                tmp_pos = pd.concat([coords, tmp_pos], axis=1)
            # Retrieve diff values for each coordinate
            tmp_pos["diff_score"] = diff[tmp_pos.bin1, tmp_pos.bin2].A1
            # Append new chromosome's rows
            positions = pd.concat([positions, tmp_pos], axis=0)

    # Keep only the documented columns, in a fixed order
    positions = positions.loc[:, pos_cols]
    return positions