def __init__(self, clr, expected, cooler_opts=None, view_df=None):
    """
    Bind a cooler to its expected table and check that the two agree.

    Parameters
    ----------
    clr : cooler
        cooler object; its chromsizes, binsize and bin extents are used.
    expected : DataFrame
        expected table with "region1" and "region2" columns. Every
        (region1, region2) group has to be symmetric (region1 == region2)
        and contain as many rows as the bin extent of that region in `clr`.
    cooler_opts : dict, optional
        Options stored as `self.cooler_opts`; "sparse" defaults to True.
        The dict is copied, so the caller's object is never modified.
    view_df : viewframe, optional
        Regions to use. Full chromosomes of `clr` are used when omitted.

    Raises
    ------
    ValueError
        If `expected` lacks the region1/region2 columns, if `view_df` is not
        a valid viewframe or exceeds the chromosome bounds of `clr`, if a
        region pair is not symmetric, or if the number of rows of a region
        in `expected` differs from its bin extent in `clr`.
    """
    self.clr = clr
    self.expected = expected

    # Detecting the columns for the detection of regions
    columns = expected.columns
    assert len(columns) > 0
    if ("region1" not in columns) or ("region2" not in columns):
        if ("chrom" in columns) or ("region" in columns):
            raise ValueError(
                "Provided expected appears to have old format, it has to comply with the format of expected v1.0"
            )
        else:
            raise ValueError(
                "Please check the expected dataframe, it has to comply with the format of expected v1.0"
            )

    # get chromosomes from cooler, if view_df not specified:
    if view_df is None:
        view_df = bioframe.make_viewframe([
            (chrom, 0, length, chrom)
            for chrom, length in clr.chromsizes.items()
        ])
    else:
        # appropriate viewframe checks:
        if not bioframe.is_viewframe(view_df):
            raise ValueError("view_df is not a valid viewframe.")
        if not bioframe.is_contained(
                view_df, bioframe.make_viewframe(clr.chromsizes)):
            raise ValueError(
                "view_df is out of the bounds of chromosomes in cooler.")

    self.view_df = view_df.set_index("name")

    # every region in expected must be symmetric and match the cooler binning
    for (name1, name2), group in self.expected.groupby(["region1", "region2"]):
        if name1 != name2:
            raise ValueError(
                "Only symmetric regions are supported, e.g. chromosomes, arms, etc"
            )
        n_diags = group.shape[0]
        region = self.view_df.loc[name1]
        lo, hi = self.clr.extent(region)
        if n_diags != (hi - lo):
            raise ValueError(
                "Region shape mismatch between expected and cooler. "
                "Are they using the same resolution?")

    self.binsize = self.clr.binsize
    self.offsets = {}
    self.pad = True
    # copy, so that setdefault below does not leak into the caller's dict
    self.cooler_opts = {} if cooler_opts is None else dict(cooler_opts)
    self.cooler_opts.setdefault("sparse", True)
def make_cooler_view(clr, ucsc_names=False):
    """
    Build a viewframe covering every chromosome of a cooler in full.

    Parameters
    ----------
    clr : cooler
        cooler-object whose chromsizes define the regions
    ucsc_names : bool
        Keep the names produced by bioframe.make_viewframe when True,
        otherwise overwrite the "name" column with the "chrom" column.

    Returns
    -------
    cooler_view : viewframe
        full chromosome viewframe
    """
    chrom_view = bioframe.make_viewframe(clr.chromsizes)
    if not ucsc_names:
        # rename back to short chromnames
        chrom_view["name"] = chrom_view["chrom"]
    return chrom_view
def _make_cooler_view(view_df, clr):
    """
    Return `view_df` as a viewframe that fits inside the chromosomes of `clr`.

    Input that fails the strict viewframe check triggers a
    DeprecationWarning and is converted with bioframe.make_viewframe.

    Parameters
    ----------
    view_df : viewframe or input accepted by bioframe.make_viewframe
        regions to validate
    clr : cooler
        cooler object providing chromsizes for the bounds check

    Returns
    -------
    view_df : viewframe

    Raises
    ------
    ValueError
        If the regions are not contained in the chromosomes of `clr`.
    """
    try:
        passes_check = bioframe.is_viewframe(view_df, raise_errors=True)
        if not passes_check:
            raise ValueError("view_df is not a valid viewframe.")
    except Exception:
        # AssertionError or ValueError, see
        # https://github.com/gfudenberg/bioframe/blob/main/bioframe/core/checks.py#L177
        warnings.warn(
            "view_df has to be a proper viewframe from next release",
            DeprecationWarning,
            stacklevel=2,
        )
        view_df = bioframe.make_viewframe(view_df)

    chrom_bounds = bioframe.make_viewframe(clr.chromsizes)
    if bioframe.is_contained(view_df, chrom_bounds):
        return view_df
    raise ValueError(
        "View table is out of the bounds of chromosomes in cooler.")
def read_viewframe(
    fname,
    verify_cooler_view=None,
):
    """
    Load regions from a BED file and turn them into a viewframe.

    Parameters
    ----------
    fname : str
        Path to a BED file with regions (4 columns, or 3 as a fallback).
    verify_cooler_view : None or viewframe
        Viewframe with entire chromosome sizes. When given, its "end"
        values indexed by "chrom" are used as bounds and the result has to
        be contained in it.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the table cannot be converted into a viewframe, or the regions
        are not contained in `verify_cooler_view`.
    """
    must_verify = verify_cooler_view is not None

    # chromosome bounds are only available when a cooler view is supplied
    if must_verify:
        chromsizes = verify_cooler_view.set_index("chrom")["end"]
    else:
        chromsizes = None

    # try the 4-column layout first, fall back to plain 3 columns
    try:
        bed_table = bioframe.read_table(fname, schema="bed4", index_col=False)
    except Exception:
        bed_table = bioframe.read_table(fname, schema="bed3", index_col=False)

    # table -> viewframe, with bounds checking when possible
    try:
        if must_verify:
            view_df = bioframe.make_viewframe(bed_table, check_bounds=chromsizes)
        else:
            view_df = bioframe.make_viewframe(bed_table)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # the view has to fit into the cooler, the reverse is not required
    # (cooler may have more regions)
    if must_verify and not bioframe.is_contained(view_df, verify_cooler_view):
        raise ValueError(
            "View regions are not contained in cooler chromsizes bounds")

    return view_df
def __init__(self, clr, cooler_opts=None, view_df=None):
    """
    Store a cooler together with the view it will be queried over.

    Parameters
    ----------
    clr : cooler
        cooler object; its chromsizes and binsize are used.
    cooler_opts : dict, optional
        Options stored as `self.cooler_opts`; "sparse" defaults to True.
        The dict is copied, so the caller's object is never modified.
    view_df : viewframe, optional
        Regions to use. Full chromosomes of `clr` are used when omitted.

    Raises
    ------
    ValueError
        If `view_df` is not a valid viewframe or exceeds the chromosome
        bounds of `clr`.
    """
    # get chromosomes from bins, if view_df not specified:
    if view_df is None:
        view_df = bioframe.make_viewframe([
            (chrom, 0, length, chrom)
            for chrom, length in clr.chromsizes.items()
        ])
    else:
        # appropriate viewframe checks:
        if not bioframe.is_viewframe(view_df):
            raise ValueError("view_df is not a valid viewframe.")
        if not bioframe.is_contained(
                view_df, bioframe.make_viewframe(clr.chromsizes)):
            raise ValueError(
                "view_df is out of the bounds of chromosomes in cooler.")

    self.view_df = view_df.set_index("name")

    self.clr = clr
    self.binsize = self.clr.binsize
    self.offsets = {}
    self.pad = True
    # copy, so that setdefault below does not leak into the caller's dict
    self.cooler_opts = {} if cooler_opts is None else dict(cooler_opts)
    self.cooler_opts.setdefault("sparse", True)
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    smooth=False,
    cutoff=3,
    max_levels=8,
    OE_log=False,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions (cis-regions), or for each chromosome.

    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the columns in `bins` table, if provided, eigenvectors are
        flipped to achieve a positive correlation with `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used. It is used
        both for the plain and for the smoothed (`smooth=True`) matrix.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from the metadata of the
        balancing weight column in the cooler if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    smooth : boolean, optional
        This option lets you coarsegrain the matrix prior to calling
        eigendecomposition.
    cutoff: int, optional
        Cutoff to pass to adaptive_coarsegrain's cutoff argument
    max_levels: int, optional
        Max level to pass to adaptive_coarsegrain's max_levels argument
    OE_log: boolean, optional
        Pass OE_log to cis_eig's OE_log argument. This works only if matrix
        does not contain zeroes (eg. after using adaptive_coarsegrain)
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        If `phasing_track_col` is requested but missing from `bins`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
              use `bad_bins` to ignore small translocations.
    """
    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(bins["chrom"].unique())  # make_viewframe fills in the rest

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.make_viewframe(regions)

    # ignore diags as in cooler unless specified; the metadata is read from
    # the weight column that is actually used for balancing
    if ignore_diags is None:
        weight_name = balance if isinstance(balance, str) else "weight"
        ignore_diags = clr._load_attrs(f"bins/{weight_name}").get("ignore_diags", 2)

    # accept any array-like (list, Series, ...) for the comparisons below
    if bad_bins is not None:
        bad_bins = np.asarray(bad_bins)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        print("now doing region:", _region)

        if smooth:
            # balanced matrix first, raw counts second
            A = numutils.adaptive_coarsegrain(
                clr.matrix(balance=balance).fetch(_region),
                clr.matrix(balance=False).fetch(_region),
                cutoff=cutoff,
                max_levels=max_levels,
            )
        else:
            A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # filter bad_bins for the _region and turn relative:
            lo, hi = clr.extent(_region)
            in_region = (bad_bins >= lo) & (bad_bins < hi)
            bad_bins_region = bad_bins[in_region] - lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (
            bioframe.select(bins, _region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
            OE_log=OE_log,
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        # .loc (not .at) because several rows and columns are assigned at once
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
def is_compatible_viewframe(view_df, verify_cooler, check_sorting=False, raise_errors=False):
    """
    Tell whether view_df is a viewframe that can be used with a given cooler.

    Parameters
    ----------
    view_df : DataFrame
        view_df DataFrame to be validated
    verify_cooler : cooler
        cooler object to use for verification
    check_sorting : bool
        Also require regions in view_df to be sorted as the chromosomes
        in the cooler.
    raise_errors : bool
        raise a ValueError instead of returning False

    Returns
    -------
    is_compatible_viewframe : bool
        True when view_df is compatible, False otherwise
    """
    try:
        # 1. strict viewframe check; on failure report whether it is fixable
        try:
            bioframe.is_viewframe(view_df, raise_errors=True)
        except Exception as not_viewframe_err:
            try:
                bioframe.make_viewframe(view_df)
            except Exception as unrecoverable_err:
                # view_df is not viewframe and cannot be easily converted
                raise ValueError(
                    "view_df is not a valid viewframe and cannot be recovered"
                ) from unrecoverable_err
            # conversion worked, so it is a formatting issue (name-column ?)
            raise ValueError(
                "view_df is not a valid viewframe, apply bioframe.make_viewframe to convert"
            ) from not_viewframe_err

        # 2. view_df has to lie inside the cooler chromosomes
        chrom_view = make_cooler_view(verify_cooler)
        fits_in_cooler = bioframe.is_contained(
            view_df, chrom_view, raise_errors=False)
        if not fits_in_cooler:
            raise ValueError(
                "View table is out of the bounds of chromosomes in cooler.")

        # 3. optionally: sorted by coord and chrom order as in cooler
        if check_sorting and not bioframe.is_sorted(
                view_df, chrom_view, df_view_col="chrom"):
            raise ValueError(
                "regions in the view_df must be sorted by coordinate"
                " and chromosomes order as as in the verify_cooler.")
    except Exception as err:
        if not raise_errors:
            # something went wrong: it's not a viewframe
            return False
        raise ValueError(
            "view_df is not compatible, or not a viewframe") from err

    # no exceptions were raised: it's a compatible viewframe
    return True
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes] # test the most frequent use cases, balancing applied, no bad bins, etc. common_regions = [] for i in range(4): chrom = chromosomes[i] halfway_chrom = int(chromsizes[chrom] / 2) # make halfway_chrom point "bin-aligned" according to anticipated binsize halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize reg1 = (chrom, 0, halfway_chrom) reg2 = (chrom, halfway_chrom, chromsizes[chrom]) common_regions.append(reg1) common_regions.append(reg2) view_df = bioframe.make_viewframe(common_regions, name_style='ucsc') def test_diagsum_symm(request): # perform test: clr = cooler.Cooler( op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")) res = cooltools.api.expected.diagsum_symm( clr, view_df=view_df, transforms=transforms, clr_weight_name=clr_weight_name, bad_bins=bad_bins, ignore_diags=ignore_diags, chunksize=chunksize, )
def saddle(
    cool_path,
    track_path,
    expected_path,
    contact_type,
    min_dist,
    max_dist,
    n_bins,
    vrange,
    qrange,
    clr_weight_name,
    strength,
    view,
    out_prefix,
    fig,
    scale,
    cmap,
    vmin,
    vmax,
    hist_color,
    verbose,
):
    """
    Calculate saddle statistics and generate saddle plots for an arbitrary
    signal track on the genomic bins of a contact matrix.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : The path to bedGraph-like file with a binned compartment track
    (eigenvector), including a header. Use the '::' syntax to specify a column
    name.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in TRACK_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH and EXPECTED_PATH.

    COOL_PATH, TRACK_PATH and EXPECTED_PATH must be binned at the same
    resolution (expect for EXPECTED_PATH in case of trans contact type).

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'chrom', 'diag', 'n_valid', value_name and the following columns for trans
    contacts: 'chrom1', 'chrom2', 'n_valid', value_name value_name is controlled
    using options. Header must be present in a file.

    """
    # ---- inputs: cooler plus (path, column) pairs for expected and track
    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path
    track_path, track_name = track_path

    # ---- track: bedGraph-like table, dtypes double as a light validation
    track = pd.read_table(
        track_path,
        usecols=["chrom", "start", "end", track_name],
        dtype={
            "chrom": np.str_,
            "start": np.int64,
            "end": np.int64,
            track_name: np.float64,
        },
        comment=None,
        verbose=verbose,
    )

    # ---- viewframes
    # whole chromosomes of the cooler
    cooler_view_df = make_cooler_view(clr)
    # working view: the "view" BED file when given, all chromosomes otherwise
    view_df = (
        cooler_view_df
        if view is None
        else read_viewframe_from_file(view, clr, check_sorting=True)
    )
    # span covered by the track on every chromosome
    track_view_df = bioframe.make_viewframe([
        (chrom_rows.chrom.iloc[0],
         np.nanmin(chrom_rows.start),
         np.nanmax(chrom_rows.end))
        for _, chrom_rows in track.reset_index().groupby("chrom")
    ])

    # ---- expected
    expected = read_expected_from_file(
        expected_path,
        contact_type=contact_type,
        expected_value_cols=[expected_value_col],
        verify_view=view_df,
        verify_cooler=clr,
    )

    # ---- distance limits, converted from bases to diagonals
    min_diag = 3 if min_dist < 0 else int(np.ceil(min_dist / clr.binsize))
    max_diag = int(np.floor(max_dist / clr.binsize)) if max_dist >= 0 else -1

    if clr_weight_name:
        track = mask_cooler_bad_bins(
            (track, track_name), (clr.bins()[:], clr_weight_name))

    # a range whose first element is None counts as "not supplied"
    if vrange[0] is None:
        vrange = None
    if qrange[0] is None:
        qrange = None
    if not (qrange is None or vrange is None):
        raise ValueError("only one of vrange or qrange can be supplied")

    # digitize outside of saddle so that we have binedges to save below
    track = align_track_with_cooler(
        track,
        clr,
        view_df=view_df,
        clr_weight_name=clr_weight_name,
        mask_bad_bins=True,
    )
    digitized_track, binedges = api.saddle.digitize(
        track.iloc[:, :4],
        n_bins,
        vrange=vrange,
        qrange=qrange,
        digitized_suffix=".d",
    )

    S, C = api.saddle.saddle(
        clr,
        expected,
        digitized_track,
        contact_type,
        None,
        vrange=None,
        qrange=None,
        view_df=view_df,
        clr_weight_name=clr_weight_name,
        expected_value_col=expected_value_col,
        view_name_col="name",
        min_diag=min_diag,
        max_diag=max_diag,
        verbose=verbose,
    )
    saddledata = S / C

    to_save = {
        "saddledata": saddledata,
        "binedges": binedges,
        "digitized": digitized_track,
        "saddlecounts": C,
    }
    if strength:
        # drop outlier bins
        to_save["saddle_strength"] = api.saddle.saddle_strength(S, C)[1:-1]

    # ---- dump results
    np.savez(out_prefix + ".saddledump", **to_save)  # .npz auto-added
    digitized_track.to_csv(out_prefix + ".digitized.tsv", sep="\t", index=False)

    # ---- figure, only when at least one format was requested
    if len(fig) == 0:
        return

    try:
        import matplotlib as mpl

        mpl.use("Agg")  # savefig only for now:
        import matplotlib.pyplot as plt
    except ImportError:
        print("Install matplotlib to use ", file=sys.stderr)
        sys.exit(1)

    if hist_color is not None:
        color = mpl.colors.colorConverter.to_rgb(hist_color)
    else:
        # sns.color_palette('muted')[2]
        color = (
            0.41568627450980394,
            0.8,
            0.39215686274509803,
        )

    title = f"{op.basename(cool_path)} ({contact_type})"
    track_label = track_name if qrange is None else track_name + " quantiles"
    clabel = "(contact frequency / expected)"

    api.saddle.saddleplot(
        track,
        saddledata,
        n_bins,
        vrange=vrange,
        qrange=qrange,
        scale=scale,
        vmin=vmin,
        vmax=vmax,
        color=color,
        title=title,
        xlabel=track_label,
        ylabel=track_label,
        clabel=clabel,
        cmap=cmap,
    )

    for ext in fig:
        plt.savefig(out_prefix + "." + ext, bbox_inches="tight")
def compute_expected(
    cool_path,
    nproc,
    chunksize,
    output,
    contact_type,
    view,
    balance,
    clr_weight_name,
    ignore_diags,
):
    """
    Calculate expected Hi-C signal either for cis or for trans regions
    of chromosomal interaction map.

    When balancing weights are not applied to the data, there is no
    masking of bad bins performed.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    """
    # Raises ValueError when the view table cannot be converted into a
    # viewframe, or when contact_type is neither "cis" nor "trans".

    clr = cooler.Cooler(cool_path)

    if view is not None:
        # Read view_df dataframe (4-column BED first, 3-column as fallback):
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view dataframe to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e
    else:
        view_df = None  # full chromosome case

    # weights are only passed on when balancing is requested
    weight_name = clr_weight_name if balance else None

    if contact_type == "cis":
        result = expected.get_cis_expected(
            clr,
            view_df=view_df,
            intra_only=True,
            clr_weight_name=weight_name,
            ignore_diags=ignore_diags,
            chunksize=chunksize,
            nproc=nproc,
        )
    elif contact_type == "trans":
        result = expected.get_trans_expected(
            clr,
            view_df=view_df,
            clr_weight_name=weight_name,
            chunksize=chunksize,
            nproc=nproc,
        )
    else:
        # without this branch `result` would be unbound below
        raise ValueError(
            f"Unknown contact_type {contact_type!r}, use 'cis' or 'trans'."
        )

    # output to file if specified:
    if output:
        result.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(result.to_csv(sep="\t", index=False, na_rep="nan"))
def read_viewframe_from_file(
    view_fname,
    verify_cooler=None,
    check_sorting=False,
):
    """
    Load a BED file with regions and return it as a viewframe
    (non-overlaping, unique names, etc).

    Parameters
    ----------
    view_fname : str
        Path to a BED file with regions.
    verify_cooler : cooler | None
        cooler object the viewframe is checked against with
        is_compatible_viewframe. No checks are done when None.
    check_sorting : bool
        Passed on to is_compatible_viewframe: require regions in view_df
        to be sorted as the chromosomes in the cooler.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the file is neither a 3 nor a 4 column BED, cannot be turned
        into a viewframe, or is not compatible with `verify_cooler`.
    """
    # 4-column BED (with names) first, 3-column BED as a fallback
    try:
        bed_table = bioframe.read_table(
            view_fname, schema="bed4", index_col=False)
    except Exception as err_bed4:
        try:
            bed_table = bioframe.read_table(
                view_fname, schema="bed3", index_col=False)
        except Exception:
            raise ValueError(
                f"{view_fname} is not a BED file with 3 or 4 columns"
            ) from err_bed4

    # table -> viewframe
    try:
        view_df = bioframe.make_viewframe(bed_table)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # no cooler for checking, returning
    if verify_cooler is None:
        return view_df

    try:
        is_compatible_viewframe(
            view_df, verify_cooler, check_sorting, raise_errors=True)
    except Exception as err:
        raise ValueError(
            "view_df is not compatible with the cooler") from err

    # view_df is compatible, returning
    return view_df
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes] # test the most frequent use cases, balancing applied, no bad bins, etc. common_regions = [] for i in range(4): chrom = chromosomes[i] halfway_chrom = int(chromsizes[chrom] / 2) # make halfway_chrom point "bin-aligned" according to anticipated binsize halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize reg1 = (chrom, 0, halfway_chrom) reg2 = (chrom, halfway_chrom, chromsizes[chrom]) common_regions.append(reg1) common_regions.append(reg2) view_df = bioframe.make_viewframe(common_regions) def test_diagsum_symm(request): # perform test: clr = cooler.Cooler( op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")) res = cooltools.expected.diagsum_symm( clr, view_df=view_df, transforms=transforms, weight_name=weight_name, bad_bins=bad_bins, ignore_diags=ignore_diags, chunksize=chunksize, )
def compute_pileup(
    cool_path,
    features,
    view,
    expected,
    flank,
    features_format,
    weight_name,
    out,
    out_format,
    store_snips,
    nproc,
    ignore_diags,
    aggregate,
    force,
    verbose,
):
    """
    Perform retrieval of the snippets from .cool file.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    FEATURES_PATH : the path to a BED or BEDPE-like file that contains features for snipping windows.
    If BED, then the features are on-diagonal. If BEDPE, then the features
    can be off-diagonal (but not in trans or between different regions in the view).

    """
    # Raises ValueError for an unsupported features format, features outside
    # of the view, an invalid view table or an unknown aggregation mode.

    clr = cooler.Cooler(cool_path)

    #### Read the features:
    buf, names = sniff_for_header(features)

    features_format = features_format.lower()
    if features_format == "bedpe":
        feature_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    elif features_format == "bed":
        feature_cols = ["chrom", "start", "end"]
    else:
        raise ValueError(
            "Automatic detection of features format is not implemented yet. "
            "Please provide BED or BEDPE as --features-format")

    # chromosome columns are strings, coordinates are integers
    dtypes = {
        col: (str if col.startswith("chrom") else np.int64)
        for col in feature_cols
    }
    if names is None:
        # no header: take the leading columns by position and name them
        read_kwargs = dict(
            header=None,
            names=feature_cols,
            usecols=list(range(len(feature_cols))),
        )
    else:
        # header present: select the columns by name
        read_kwargs = dict(header="infer", usecols=feature_cols)

    # every option is passed exactly once; supplying usecols/dtype both
    # directly and through **kwargs raises a TypeError
    features_df = pd.read_table(
        buf,
        comment="#",
        dtype=dtypes,
        verbose=verbose,
        **read_kwargs,
    )

    ###### Define the view for the pileup
    # use input "view" BED file or all chromosomes of the cooler:
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe(
            [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames])
        if not bioframe.is_contained(features_df, view_df):
            raise ValueError(
                "Features are not contained in chromosomes bounds")
    else:
        # Read view_df (4-column BED first, 3-column as fallback):
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

        if not bioframe.is_contained(features_df, view_df):
            raise ValueError("Features are not contained in view bounds")

    ##### Read expected, should be cis-expected:
    if expected is not None:
        expected_path, expected_value_col = expected
        expected = read_expected(
            expected_path,
            contact_type="cis",
            expected_value_cols=[expected_value_col],
            verify_view=view_df,
            verify_cooler=clr,
        )

    ##### Create the pileup:
    stack = snipping.pileup(
        clr,
        features_df,
        view_df=view_df,
        expected_df=expected,
        flank=flank,
        min_diag=ignore_diags,  # TODO: implement in pileup API
        clr_weight_name=weight_name,  # TODO: implement in pileup API
        force=force,  # TODO: implement in pileup API
        nproc=nproc,
    )

    ##### Aggregate the signal:
    # None has to be handled before .lower() can be called
    aggregate = "mean" if aggregate is None else aggregate.lower()
    agg_funcs = {
        "mean": np.nanmean,
        "none": np.nanmean,
        "median": np.nanmedian,
        "min": np.nanmin,
        "max": np.nanmax,
        "std": np.nanstd,
    }
    if aggregate not in agg_funcs:
        raise ValueError(
            f"Aggregation mode {aggregate} not supported. Please use mean/median/min/max/std."
        )
    pileup = agg_funcs[aggregate](stack, axis=2)

    ##### Store the data; the stack of snippets goes in only on request:
    out_format = out_format.lower()
    if out_format == "npz":
        arrays = {"pileup": pileup}
        if store_snips:
            arrays["stack"] = stack
        np.savez(out, **arrays)
    elif out_format == "hdf5":
        # context manager makes sure the file is flushed and closed
        with h5py.File(out, "w") as h5:
            h5.create_dataset("pileup", data=pileup)
            if store_snips:
                h5.create_dataset("stack", data=stack)
def call_dots(
    cool_path,
    expected_path,
    view,
    clr_weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also chromosomes referred to in EXPECTED_PATH must be non-trivial,
    i.e., contain not-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region1/2', 'diag', 'n_valid', value_name. value_name is controlled using
    options. Header must be present in a file.

    """
    # Raises ValueError for inconsistent kernel parameters, a too permissive
    # max_nans_tolerated or an unsupported num_lambda_chunks.

    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path

    #### Generate viewframes ####
    # 1:cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = bioframe.make_viewframe(clr.chromsizes)

    # 2:view_df. Define global view for calling dots
    # use input "view" BED file or all chromosomes :
    view_df = cooler_view_df if (view is None) else read_viewframe(view, cooler_view_df)

    #### Read expected: ####
    expected_summary_cols = [expected_value_col, ]
    expected = read_expected(
        expected_path,
        contact_type="cis",
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )
    # add checks to make sure cis-expected is symmetric

    # Prepare some parameters (bases -> bins).
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so supress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}")
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        if not w > p:
            raise ValueError(f"Wrong inner/outer kernel parameters w={w}, p={p}")
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are setup check max_nans_tolerated
    # to make sure kernel footprints overlaping 1 side with the
    # NaNs filled row/column are not "allowed"
    # this requires dynamic adjustment for the "shrinking donut"
    if not max_nans_tolerated <= 2 * w:
        raise ValueError("Too many NaNs allowed!")
    # may lead to scoring the same pixel twice, - i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(
            clr, view_df, w, tile_size_bins, loci_separation_bins
        )
    )

    # lambda-chunking edges ...
    if not dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50:
        raise ValueError("Incompatible num_lambda_chunks")
    base = 2 ** (1 / 3)
    ledges = np.concatenate(
        (
            [-np.inf],
            np.logspace(
                0,
                num_lambda_chunks - 1,
                num=num_lambda_chunks,
                base=base,
                # builtin float: the np.float alias no longer exists in NumPy
                dtype=float,
            ),
            [np.inf],
        )
    )

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected.set_index(["region1", "region2", "diag"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr
    )

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected.set_index(["region1", "region2", "diag"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix), op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels
    )
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using annotated DataFrame of filtered pixels
    # why ? - because - clustering has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df)
    # consider reseting index here
    centroids = dotfinder.clustering_step(
        filtered_pixels_annotated,
        view_df["name"],
        dots_clustering_radius,
        verbose,
    )

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:
        postprocessed_fname = op.join(
            op.dirname(out_prefix), op.basename(out_prefix) + ".postproc.bedpe"
        )
        postprocessed_calls.to_csv(
            postprocessed_fname, sep="\t", header=True, index=False, compression=None
        )
# Detect the layout of the features from the columns that are present.
if {"chrom", "start", "end"}.issubset(features_df.columns):
    feature_type = "bed"
# all six BEDPE columns are required, "end2" included
elif {"chrom1", "start1", "end1", "chrom2", "start2", "end2"}.issubset(features_df.columns):
    feature_type = "bedpe"
else:
    raise ValueError("Unknown feature_df format")

# optionally pass the features through expand_align_features with the
# requested flank and the cooler bin size
if flank is not None:
    features_df = expand_align_features(features_df, flank, clr.binsize, format=feature_type)

# default to whole chromosomes, otherwise the view has to fit into the cooler
if view_df is None:
    view_df = bioframe.make_viewframe(clr.chromsizes)
else:
    if not bioframe.is_contained(view_df, bioframe.make_viewframe(clr.chromsizes)):
        raise ValueError(
            "view_df is out of the bounds of chromosomes in cooler.")

features_df = assign_regions(features_df, view_df)

# TODO Expected checks are now implemented in the snippers, maybe move them out to here
# when there is a neat function?

# without expected use the plain cooler snipper, otherwise the
# observed/expected snipper
if expected_df is None:
    snipper = CoolerSnipper(clr, view_df=view_df)
else:
    snipper = ObsExpSnipper(clr, expected_df, view_df=view_df)
def _view_from_track(track_df):
    """Build a viewframe with one region per chromosome found in a track.

    The track must have "chrom", "start" and "end" columns (verified first).
    Each chromosome group contributes a single region that spans from the
    smallest "start" to the largest "end" observed for that chromosome.
    """
    bioframe.core.checks._verify_columns(track_df, ["chrom", "start", "end"])

    regions = []
    for chrom_name, chrom_rows in track_df.groupby("chrom"):
        span_start = chrom_rows.start.min()
        span_end = chrom_rows.end.max()
        regions.append((chrom_name, span_start, span_end))

    return bioframe.make_viewframe(regions)
def call_compartments(
    cool_path,
    reference_track,
    view,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably rendered as command-line help -- confirm before rewording.
    #
    # Parameters, as used in the body below:
    #   cool_path       : cooler URI handed to cooler.Cooler.
    #   reference_track : None, or a (track_path, col) pair where col is either
    #                     an integer column position or a header column name.
    #   view            : None, or a path to a BED3/BED4 table of regions.
    #   contact_type    : "cis" or "trans"; selects the decomposition routine
    #                     and is embedded in the output file names.
    #   n_eigs          : forwarded to the eigendecomposition routine.
    #   verbose         : forwarded to pd.read_table.
    #   out_prefix      : prefix for the ".lam.txt", ".vecs.tsv" and ".bw" outputs.
    #   bigwig          : when truthy, the "E1" column is also written as bigwig.
    #
    # Raises:
    #   click.BadParameter : the requested track column cannot be resolved.
    #   ValueError         : the track cannot be merged onto the cooler bins,
    #                        the view table is invalid, or contact_type is
    #                        neither "cis" nor "trans".
    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # Headerless file: a column can only be addressed by position.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            # File with a header: resolve a positional column to its name,
            # or verify that the requested name is present.
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError as err:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names))) from err
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer", usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(
            buf,
            dtype={
                "chrom": str,
                "start": np.int64,
                "end": np.int64,
                track_name: np.float64,
            },
            comment="#",
            verbose=verbose,
            **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(
            left=clr.bins()[:],
            right=track_df,
            how="left",
            on=["chrom", "start", "end"])

        # Sanity check: a left merge can only return more rows than there are
        # bins when several track rows share the same ["chrom", "start", "end"]
        # key, i.e. the track does not map one-to-one onto the cooler bins.
        # The error has to be raised -- merely constructing it does nothing.
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path)
                + "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define view for cis compartment-calling
    # use input "view" BED file or all chromosomes from the cooler:
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe(
            [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames])
    else:
        # Make viewframe out of table:
        # Read view_df, trying the 4-column layout first and falling back
        # to the 3-column one:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    # TODO: Add check that view_df has the same bins as track

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            view_df=view_df,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    elif contact_type == "trans":
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )
    else:
        # Fail with a clear message instead of an unbound-name error below.
        raise ValueError(
            f'Unknown contact_type "{contact_type}", expected "cis" or "trans".')

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt", sep="\t", index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv", sep="\t", index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )