Exemplo n.º 1
0
    def __init__(self, clr, expected, cooler_opts=None, view_df=None):
        """
        Validate the inputs and store them on the instance.

        Parameters
        ----------
        clr : cooler
            Cooler object; its ``chromsizes``, ``extent`` and ``binsize``
            are used here.
        expected : DataFrame
            Expected table that must contain the "region1" and "region2"
            columns. Every (region1, region2) group must be symmetric
            (region1 == region2) and must have exactly as many rows as
            there are bins in the matching region of the view.
        cooler_opts : dict, optional
            Options kept in ``self.cooler_opts``. "sparse" is set to True
            unless provided. The dict is copied, so the caller's object
            is never modified.
        view_df : viewframe, optional
            Regions to work with. When omitted, one region per chromosome
            of `clr` is generated, named after the chromosome.

        Raises
        ------
        ValueError
            If `expected` lacks the region columns, if `view_df` is not a
            viewframe or is not contained in the cooler chromosomes, if
            a non-symmetric region pair is found in `expected`, or if the
            number of rows per region does not match the cooler.
        """
        self.clr = clr
        self.expected = expected

        # Detecting the columns for the detection of regions
        columns = expected.columns
        assert len(columns) > 0
        if ("region1" not in columns) or ("region2" not in columns):
            if ("chrom" in columns) or ("region" in columns):
                raise ValueError(
                    "Provided expected appears to have old format, it has to comply with the format of expected v1.0"
                )
            else:
                raise ValueError(
                    "Please check the expected dataframe, it has to comply with the format of expected v1.0"
                )

        # get chromosomes from cooler, if view_df not specified:
        if view_df is None:
            view_df = bioframe.make_viewframe([
                (chrom, 0, length, chrom)
                for chrom, length in clr.chromsizes.items()
            ])
        else:
            # appropriate viewframe checks:
            if not bioframe.is_viewframe(view_df):
                raise ValueError("view_df is not a valid viewframe.")
            if not bioframe.is_contained(
                    view_df, bioframe.make_viewframe(clr.chromsizes)):
                raise ValueError(
                    "view_df is out of the bounds of chromosomes in cooler.")

        # regions are looked up by name from here on
        self.view_df = view_df.set_index("name")

        for (name1,
             name2), group in self.expected.groupby(["region1", "region2"]):
            if name1 != name2:
                raise ValueError(
                    "Only symmetric regions a supported, e.g. chromosomes, arms, etc"
                )
            n_diags = group.shape[0]
            region = self.view_df.loc[name1]
            lo, hi = self.clr.extent(region)
            # the number of rows per region has to equal the number of
            # bins spanned by that region in the cooler
            if n_diags != (hi - lo):
                raise ValueError(
                    "Region shape mismatch between expected and cooler. "
                    "Are they using the same resolution?")

        self.binsize = self.clr.binsize
        self.offsets = {}
        self.pad = True
        # copy the options so that setdefault() below cannot leak a
        # "sparse" key into the dictionary owned by the caller
        self.cooler_opts = {} if cooler_opts is None else dict(cooler_opts)
        self.cooler_opts.setdefault("sparse", True)
Exemplo n.º 2
0
def make_cooler_view(clr, ucsc_names=False):
    """
    Build a viewframe that covers every chromosome of a cooler.

    Parameters
    ----------
    clr :  cooler
        cooler-object whose chromsizes define the regions
    ucsc_names : bool
        When True, keep the region names exactly as generated by
        bioframe.make_viewframe; when False, overwrite the "name"
        column with the plain chromosome names.

    Returns
    -------
    cooler_view : viewframe
        full chromosome viewframe
    """
    full_view = bioframe.make_viewframe(clr.chromsizes)
    if not ucsc_names:
        # fall back to the short chromosome names
        full_view["name"] = full_view["chrom"]
    return full_view
Exemplo n.º 3
0
def _make_cooler_view(view_df, clr):
    """
    Return `view_df` as a viewframe checked against the cooler bounds.

    An input that is not a proper viewframe triggers a
    DeprecationWarning and is converted with bioframe.make_viewframe.

    Raises
    ------
    ValueError
        If the (converted) view is not contained in the chromosomes
        of `clr`.
    """
    try:
        # is_viewframe may either raise or return False for a bad input;
        # both outcomes are routed to the same fallback below
        is_valid = bioframe.is_viewframe(view_df, raise_errors=True)
        if not is_valid:
            raise ValueError("view_df is not a valid viewframe.")
    except Exception:
        warnings.warn(
            "view_df has to be a proper viewframe from next release",
            DeprecationWarning,
            stacklevel=2,
        )
        view_df = bioframe.make_viewframe(view_df)

    chromosome_view = bioframe.make_viewframe(clr.chromsizes)
    if bioframe.is_contained(view_df, chromosome_view):
        return view_df
    raise ValueError(
        "View table is out of the bounds of chromosomes in cooler.")
Exemplo n.º 4
0
def read_viewframe(
    fname,
    verify_cooler_view=None,
):
    """
    Load a BED3/BED4 file and turn it into a viewframe
    (non-overlapping regions, unique names, etc).

    Parameters
    ----------
    fname : str
        Path to a BED file with regions.
    verify_cooler_view : None or viewframe
        Viewframe with entire chromosome sizes. When given, its "end"
        column (indexed by "chrom") is used for bound checking and the
        loaded view must be contained in it.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the table cannot be converted to a viewframe or if it is
        not contained in `verify_cooler_view`.
    """
    # bound checking is requested only when a reference view is supplied
    if verify_cooler_view is None:
        viewframe_kwargs = {}
    else:
        chromsizes = verify_cooler_view.set_index("chrom")["end"]
        viewframe_kwargs = {"check_bounds": chromsizes}

    # try the 4-column layout (with names) first, then the 3-column one
    try:
        bed_table = bioframe.read_table(fname, schema="bed4", index_col=False)
    except Exception:
        bed_table = bioframe.read_table(fname, schema="bed3", index_col=False)

    # turn the raw table into a viewframe
    try:
        view_df = bioframe.make_viewframe(bed_table, **viewframe_kwargs)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # The view has to fit into the cooler view; the opposite is not
    # required, as the cooler may hold more regions than the view.
    if (verify_cooler_view is not None) and (
            not bioframe.is_contained(view_df, verify_cooler_view)):
        raise ValueError(
            "View regions are not contained in cooler chromsizes bounds")

    return view_df
Exemplo n.º 5
0
    def __init__(self, clr, cooler_opts=None, view_df=None):
        """
        Validate the view and store the cooler and its options.

        Parameters
        ----------
        clr : cooler
            Cooler object; its ``chromsizes`` and ``binsize`` are used here.
        cooler_opts : dict, optional
            Options kept in ``self.cooler_opts``. "sparse" is set to True
            unless provided. The dict is copied, so the caller's object
            is never modified.
        view_df : viewframe, optional
            Regions to work with. When omitted, one region per chromosome
            of `clr` is generated, named after the chromosome.

        Raises
        ------
        ValueError
            If `view_df` is not a viewframe or is not contained in the
            chromosomes of the cooler.
        """

        # get chromosomes from the cooler, if view_df not specified:
        if view_df is None:
            view_df = bioframe.make_viewframe([
                (chrom, 0, length, chrom)
                for chrom, length in clr.chromsizes.items()
            ])
        else:
            # appropriate viewframe checks:
            if not bioframe.is_viewframe(view_df):
                raise ValueError("view_df is not a valid viewframe.")
            if not bioframe.is_contained(
                    view_df, bioframe.make_viewframe(clr.chromsizes)):
                raise ValueError(
                    "view_df is out of the bounds of chromosomes in cooler.")

        # regions are looked up by name from here on
        self.view_df = view_df.set_index("name")

        self.clr = clr
        self.binsize = self.clr.binsize
        self.offsets = {}
        self.pad = True
        # copy the options so that setdefault() below cannot leak a
        # "sparse" key into the dictionary owned by the caller
        self.cooler_opts = {} if cooler_opts is None else dict(cooler_opts)
        self.cooler_opts.setdefault("sparse", True)
Exemplo n.º 6
0
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    smooth=False,
    cutoff=3,
    max_levels=8,
    OE_log=False,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions (cis-regions), or for each chromosome.
    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the columns in `bins` table, if provided, eigenvectors are
        flipped to achieve a positive correlation with `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    smooth : boolean, optional
        This option lets you coarsegrain the matrix prior to calling eigendecomposition.
    cutoff: int, optional
        Cutoff to pass to adaptive_coarsegrain's cutoff argument
    max_levels: int, optional
        Max level to pass to adaptive_coarsegrain's max_levels argument
    OE_log: boolean, optional
        Pass OE_log to cis_eig's OE_log argument.
        This works only if matrix does not contain zeroes (eg. after using adaptive_coarsegrain)
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        If `phasing_track_col` is requested but missing from `bins`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
              use `bad_bins` to ignore small translocations.
    """

    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(bins["chrom"].unique())  # make_viewframe fills in the rest

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.make_viewframe(regions)

    # ignore diags as in cooler unless specified
    ignore_diags = (
        clr._load_attrs("bins/weight").get("ignore_diags", 2)
        if ignore_diags is None
        else ignore_diags
    )

    # `bad_bins` is documented as array-like: normalise it once so that
    # the element-wise comparisons below also work for plain lists/tuples
    if bad_bins is not None:
        bad_bins = np.asarray(bad_bins)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        print("now doing region:", _region)

        if smooth:
            # NOTE(review): the balanced matrix is fetched with balance=True
            # here, i.e. the `balance` column name is not used in this
            # branch - confirm that this is intended.
            A = numutils.adaptive_coarsegrain(
                clr.matrix(balance=True).fetch(_region),
                clr.matrix(balance=False).fetch(_region),
                cutoff=cutoff,
                max_levels=max_levels)
        else:
            A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # keep the bad bins that fall into the _region and make their
            # indices relative to the first bin of the region:
            lo, hi = clr.extent(_region)
            bad_bins_region = bad_bins[(bad_bins >= lo) & (bad_bins < hi)] - lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (
            bioframe.select(bins, _region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
            OE_log=OE_log
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        # `.loc` is the label-based accessor for array-valued assignment;
        # `.at` is meant for a single scalar cell only
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
Exemplo n.º 7
0
def is_compatible_viewframe(view_df,
                            verify_cooler,
                            check_sorting=False,
                            raise_errors=False):
    """
    Tell whether view_df is a viewframe that can be used
    together with the provided cooler.

    Parameters
    ----------
    view_df :  DataFrame
        view_df DataFrame to be validated
    verify_cooler : cooler
        cooler object to use for verification
    check_sorting : bool
        Also require the regions of view_df to be sorted by coordinate
        and in the chromosome order of the cooler.
    raise_errors : bool
        raise a ValueError instead of returning False

    Returns
    -------
    is_compatible_viewframe : bool
        True when view_df is compatible, False otherwise
    """
    try:
        # step 1: view_df has to be a viewframe already
        try:
            bioframe.is_viewframe(view_df, raise_errors=True)
        except Exception as not_viewframe_err:
            # find out whether a conversion would have been possible,
            # only to report a more helpful message
            try:
                bioframe.make_viewframe(view_df)
            except Exception as unrecoverable_err:
                # not a viewframe and not convertible either
                raise ValueError(
                    "view_df is not a valid viewframe and cannot be recovered"
                ) from unrecoverable_err
            # convertible: most likely a formatting / name-column issue
            raise ValueError(
                "view_df is not a valid viewframe, apply bioframe.make_viewframe to convert"
            ) from not_viewframe_err

        # step 2: view_df has to fit into the chromosomes of the cooler
        cooler_view = make_cooler_view(verify_cooler)
        contained = bioframe.is_contained(view_df,
                                          cooler_view,
                                          raise_errors=False)
        if not contained:
            raise ValueError(
                "View table is out of the bounds of chromosomes in cooler.")

        # step 3 (optional): same ordering as in the cooler
        if check_sorting and not bioframe.is_sorted(
                view_df, cooler_view, df_view_col="chrom"):
            raise ValueError(
                "regions in the view_df must be sorted by coordinate"
                " and chromosomes order as as in the verify_cooler.")

    except Exception as e:
        # any failure above means "not compatible"
        if not raise_errors:
            return False
        raise ValueError(
            "view_df is not compatible, or not a viewframe") from e

    # every check passed
    return True
Exemplo n.º 8
0
# Module-level fixtures for the tests below.
# `chromosomes`, `chromsizes` and `assumed_binsize` are defined outside of
# this snippet.

# one full-length (chrom, start, end) region per chromosome
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

# test the most frequent use cases, balancing applied, no bad bins, etc.

# split each of the first 4 chromosomes into two adjacent halves
common_regions = []
for i in range(4):
    chrom = chromosomes[i]
    halfway_chrom = int(chromsizes[chrom] / 2)
    # make halfway_chrom point "bin-aligned" according to anticipated binsize
    halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize
    reg1 = (chrom, 0, halfway_chrom)
    reg2 = (chrom, halfway_chrom, chromsizes[chrom])
    common_regions.append(reg1)
    common_regions.append(reg2)

# viewframe over the half-chromosome regions, with UCSC-style region names
view_df = bioframe.make_viewframe(common_regions, name_style='ucsc')


def test_diagsum_symm(request):
    """Run diagsum_symm on the test cooler using the module-level settings."""
    # the test cooler is stored in "data/" next to this test module
    cool_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")
    clr = cooler.Cooler(cool_path)

    # perform test:
    diagsum_kwargs = dict(
        view_df=view_df,
        transforms=transforms,
        clr_weight_name=clr_weight_name,
        bad_bins=bad_bins,
        ignore_diags=ignore_diags,
        chunksize=chunksize,
    )
    res = cooltools.api.expected.diagsum_symm(clr, **diagsum_kwargs)
Exemplo n.º 9
0
def saddle(
    cool_path,
    track_path,
    expected_path,
    contact_type,
    min_dist,
    max_dist,
    n_bins,
    vrange,
    qrange,
    clr_weight_name,
    strength,
    view,
    out_prefix,
    fig,
    scale,
    cmap,
    vmin,
    vmax,
    hist_color,
    verbose,
):
    """
    Calculate saddle statistics and generate saddle plots for an arbitrary
    signal track on the genomic bins of a contact matrix.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : The path to bedGraph-like file with a binned compartment track
    (eigenvector), including a header. Use the '::' syntax to specify a column
    name.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in TRACK_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH and EXPECTED_PATH.

    COOL_PATH, TRACK_PATH and EXPECTED_PATH must be binned at the same
    resolution (expect for  EXPECTED_PATH in case of trans contact type).

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'chrom', 'diag', 'n_valid', value_name and the following columns for trans
    contacts: 'chrom1', 'chrom2', 'n_valid', value_name value_name is controlled
    using options. Header must be present in a file.

    """
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text, so it is intentionally left untouched.
    #
    # Outputs written by this function:
    #   <out_prefix>.saddledump.npz - saddle data, bin edges, digitized track,
    #                                 counts and (optionally) saddle strength
    #   <out_prefix>.digitized.tsv  - digitized track
    #   <out_prefix>.<ext>          - one figure per extension listed in `fig`

    #### Read inputs: ####
    clr = cooler.Cooler(cool_path)

    # both arguments arrive as (path, column name) pairs
    expected_path, expected_value_col = expected_path
    track_path, track_name = track_path

    #### Read track: ####
    # read bedGraph-file :
    track_columns = ["chrom", "start", "end", track_name]
    # specify dtype as a rudimentary form of validation:
    track_dtype = {
        "chrom": np.str_,
        "start": np.int64,
        "end": np.int64,
        track_name: np.float64,
    }
    track = pd.read_table(
        track_path,
        usecols=track_columns,
        dtype=track_dtype,
        comment=None,
        verbose=verbose,
    )

    #### Generate viewframes ####
    # 1:cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = make_cooler_view(clr)

    # 2:view_df. Define the global view used for the saddle computation:
    # use input "view" BED file or all chromosomes :
    if view is None:
        view_df = cooler_view_df
    else:
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    # 3:track_view_df. Generate viewframe from track table
    # (one region per chromosome, spanning min(start)..max(end) of the track):
    # NOTE(review): track_view_df is not used anywhere below in this function.
    track_view_df = bioframe.make_viewframe([
        (group.chrom.iloc[0], np.nanmin(group.start), np.nanmax(group.end))
        for i, group in track.reset_index().groupby("chrom")
    ])

    #### Read expected: ####

    expected_summary_cols = [
        expected_value_col,
    ]

    expected = read_expected_from_file(
        expected_path,
        contact_type=contact_type,
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )

    # convert the distance limits (in bp) into diagonal indices;
    # a negative min_dist selects the default of 3 diagonals
    if min_dist < 0:
        min_diag = 3
    else:
        min_diag = int(np.ceil(min_dist / clr.binsize))

    # a negative max_dist is translated into max_diag = -1
    if max_dist >= 0:
        max_diag = int(np.floor(max_dist / clr.binsize))
    else:
        max_diag = -1

    # mask the track values using the balancing weights column of the cooler
    if clr_weight_name:
        track = mask_cooler_bad_bins((track, track_name),
                                     (clr.bins()[:], clr_weight_name))

    # a range whose first element is None is treated as "not provided"
    if vrange[0] is None:
        vrange = None
    if qrange[0] is None:
        qrange = None
    if (qrange is not None) and (vrange is not None):
        raise ValueError("only one of vrange or qrange can be supplied")

    # digitize outside of saddle so that we have binedges to save below
    track = align_track_with_cooler(
        track,
        clr,
        view_df=view_df,
        clr_weight_name=clr_weight_name,
        mask_bad_bins=True,
    )
    digitized_track, binedges = api.saddle.digitize(
        track.iloc[:, :4],
        n_bins,
        vrange=vrange,
        qrange=qrange,
        digitized_suffix=".d",
    )

    # the track is already digitized, hence n_bins/vrange/qrange are None here
    S, C = api.saddle.saddle(
        clr,
        expected,
        digitized_track,
        contact_type,
        None,
        vrange=None,
        qrange=None,
        view_df=view_df,
        clr_weight_name=clr_weight_name,
        expected_value_col=expected_value_col,
        view_name_col="name",
        min_diag=min_diag,
        max_diag=max_diag,
        verbose=verbose,
    )
    # element-wise ratio of the two matrices returned by api.saddle.saddle
    saddledata = S / C

    to_save = dict(
        saddledata=saddledata,
        binedges=binedges,
        digitized=digitized_track,
        saddlecounts=C,
    )

    if strength:
        ratios = api.saddle.saddle_strength(S, C)
        ratios = ratios[1:-1]  # drop outlier bins
        to_save["saddle_strength"] = ratios

    # Save data
    np.savez(out_prefix + ".saddledump", **to_save)  # .npz auto-added
    digitized_track.to_csv(out_prefix + ".digitized.tsv",
                           sep="\t",
                           index=False)

    # Generate figure
    if len(fig):
        # matplotlib is imported lazily: it is only needed when figures
        # are requested
        try:
            import matplotlib as mpl

            mpl.use("Agg")  # savefig only for now:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Install matplotlib to use ", file=sys.stderr)
            sys.exit(1)

        if hist_color is None:
            color = (
                0.41568627450980394,
                0.8,
                0.39215686274509803,
            )  # sns.color_palette('muted')[2]
        else:
            color = mpl.colors.colorConverter.to_rgb(hist_color)
        title = op.basename(cool_path) + " ({})".format(contact_type)

        if qrange is not None:
            track_label = track_name + " quantiles"
        else:
            track_label = track_name

        clabel = "(contact frequency / expected)"

        api.saddle.saddleplot(
            track,
            saddledata,
            n_bins,
            vrange=vrange,
            qrange=qrange,
            scale=scale,
            vmin=vmin,
            vmax=vmax,
            color=color,
            title=title,
            xlabel=track_label,
            ylabel=track_label,
            clabel=clabel,
            cmap=cmap,
        )

        # save the same figure once per requested file extension
        for ext in fig:
            plt.savefig(out_prefix + "." + ext, bbox_inches="tight")
Exemplo n.º 10
0
def compute_expected(
    cool_path,
    nproc,
    chunksize,
    output,
    contact_type,
    view,
    balance,
    clr_weight_name,
    ignore_diags,
):
    """
    Calculate expected Hi-C signal either for cis or for trans regions
    of chromosomal interaction map.

    When balancing weights are not applied to the data, there is no
    masking of bad bins performed.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    """
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text, so it is intentionally left untouched.
    #
    # The resulting table is written to `output` as TSV, or printed to
    # stdout when `output` is empty.
    # Raises ValueError for a malformed view table or for a contact_type
    # other than "cis"/"trans".

    clr = cooler.Cooler(cool_path)
    if view is not None:
        # Read view_df dataframe, trying the 4-column layout (with names)
        # first and the 3-column one as a fallback:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view dataframe to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e
    else:
        view_df = None  # full chromosome case

    # balancing weights are used only when balancing is requested
    weight_name = clr_weight_name if balance else None

    if contact_type == "cis":
        result = expected.get_cis_expected(
            clr,
            view_df=view_df,
            intra_only=True,
            clr_weight_name=weight_name,
            ignore_diags=ignore_diags,
            chunksize=chunksize,
            nproc=nproc
        )
    elif contact_type == "trans":
        result = expected.get_trans_expected(
            clr,
            view_df=view_df,
            clr_weight_name=weight_name,
            chunksize=chunksize,
            nproc=nproc,
        )
    else:
        # fail with a clear message instead of an unbound `result` below
        raise ValueError(
            f'Unknown contact type "{contact_type}", use "cis" or "trans".'
        )

    # output to file if specified:
    if output:
        result.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(result.to_csv(sep="\t", index=False, na_rep="nan"))
Exemplo n.º 11
0
def read_viewframe_from_file(
    view_fname,
    verify_cooler=None,
    check_sorting=False,
):
    """
    Load a BED3/BED4 file and turn it into a viewframe
    (non-overlapping regions, unique names, etc).

    Parameters
    ----------
    view_fname : str
        Path to a BED file with regions.
    verify_cooler : cooler | None
        cooler object used to verify the compatibility of the view.
        No checks are done when None.
    check_sorting : bool
        Also require the regions of the view to be sorted as the
        chromosomes in the cooler.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the file cannot be read as BED3/BED4, if the table cannot be
        converted to a viewframe, or if the view is not compatible with
        `verify_cooler`.
    """

    # try the 4-column layout (with names) first, then the 3-column one
    try:
        bed_table = bioframe.read_table(view_fname,
                                        schema="bed4",
                                        index_col=False)
    except Exception as err_bed4:
        try:
            bed_table = bioframe.read_table(view_fname,
                                            schema="bed3",
                                            index_col=False)
        except Exception:
            # both layouts failed: report the failure of the first attempt
            raise ValueError(
                f"{view_fname} is not a BED file with 3 or 4 columns"
            ) from err_bed4

    # turn the raw table into a viewframe
    try:
        view_df = bioframe.make_viewframe(bed_table)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # nothing to verify against: return the viewframe as is
    if verify_cooler is None:
        return view_df

    try:
        is_compatible_viewframe(view_df,
                                verify_cooler,
                                check_sorting,
                                raise_errors=True)
    except Exception as e:
        raise ValueError(
            "view_df is not compatible with the cooler") from e

    # the view passed the compatibility checks
    return view_df
Exemplo n.º 12
0
# Module-level fixtures for the tests below.
# `chromosomes`, `chromsizes` and `assumed_binsize` are defined outside of
# this snippet.

# one full-length (chrom, start, end) region per chromosome
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

# test the most frequent use cases, balancing applied, no bad bins, etc.

# split each of the first 4 chromosomes into two adjacent halves
common_regions = []
for i in range(4):
    chrom = chromosomes[i]
    halfway_chrom = int(chromsizes[chrom] / 2)
    # make halfway_chrom point "bin-aligned" according to anticipated binsize
    halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize
    reg1 = (chrom, 0, halfway_chrom)
    reg2 = (chrom, halfway_chrom, chromsizes[chrom])
    common_regions.append(reg1)
    common_regions.append(reg2)

# viewframe over the half-chromosome regions (default region naming)
view_df = bioframe.make_viewframe(common_regions)


def test_diagsum_symm(request):
    """Run diagsum_symm on the test cooler using the module-level settings."""
    # the test cooler is stored in "data/" next to this test module
    cool_path = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")
    clr = cooler.Cooler(cool_path)

    # perform test:
    diagsum_kwargs = dict(
        view_df=view_df,
        transforms=transforms,
        weight_name=weight_name,
        bad_bins=bad_bins,
        ignore_diags=ignore_diags,
        chunksize=chunksize,
    )
    res = cooltools.expected.diagsum_symm(clr, **diagsum_kwargs)
Exemplo n.º 13
0
def compute_pileup(
    cool_path,
    features,
    view,
    expected,
    flank,
    features_format,
    weight_name,
    out,
    out_format,
    store_snips,
    nproc,
    ignore_diags,
    aggregate,
    force,
    verbose,
):
    """
    Perform retrieval of the snippets from .cool file.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    FEATURES_PATH : the path to a BED or BEDPE-like file that contains features for snipping windows.
    If BED, then the features are on-diagonal. If BEDPE, then the features
    can be off-diagonal (but not in trans or between different regions in the view).

    """
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text, so it is intentionally left untouched.
    #
    # The aggregated pileup is stored in `out` (NPZ or HDF5); the full stack
    # of snippets is stored next to it only when `store_snips` is set.
    # Raises ValueError for an unsupported features format, a malformed view
    # table, features outside of the view, or an unknown aggregation mode.

    clr = cooler.Cooler(cool_path)

    #### Read the features:
    buf, names = sniff_for_header(features)
    features_format = features_format.lower()
    if features_format == "bedpe":
        positional_cols = [0, 1, 2, 3, 4, 5]
        feature_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
        dtypes = {
            "chrom1": str,
            "start1": np.int64,
            "end1": np.int64,
            "chrom2": str,
            "start2": np.int64,
            "end2": np.int64,
        }
    elif features_format == "bed":
        positional_cols = [0, 1, 2]
        feature_cols = ["chrom", "start", "end"]
        dtypes = {"chrom": str, "start": np.int64, "end": np.int64}
    else:
        raise ValueError(
            "Automatic detection of features format is not implemented yet. "
            "Please provide BED or BEDPE as --features-format")

    # Every reader option is specified exactly once: `usecols` (and `dtype`)
    # used to be passed both explicitly and through **kwargs, which makes
    # the call fail with "got multiple values for keyword argument".
    if names is None:
        # no header: pick the leading columns by position and name them
        kwargs = dict(
            header=None,
            usecols=positional_cols,
            names=feature_cols,
        )
    else:
        # header present: pick the columns by their names
        kwargs = dict(header="infer", usecols=feature_cols)

    features_df = pd.read_table(buf,
                                comment="#",
                                dtype=dtypes,
                                verbose=verbose,
                                **kwargs)

    ###### Define the view for the pileup
    # use input "view" BED file or all chromosomes of the cooler:
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe([(chrom, 0, clr.chromsizes[chrom])
                                           for chrom in clr.chromnames])
        if not bioframe.is_contained(features_df, view_df):
            raise ValueError(
                "Features are not contained in chromosomes bounds")
    else:
        # Make viewframe out of table:
        # Read view_df, trying the 4-column layout (with names) first:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df,
                                              check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    if not bioframe.is_contained(features_df, view_df):
        raise ValueError("Features are not contained in view bounds")

    ##### Read expected, should be cis-expected:
    if expected is not None:
        # `expected` arrives as a (path, column name) pair
        expected_path, expected_value_col = expected
        expected_summary_cols = [
            expected_value_col,
        ]
        expected = read_expected(
            expected_path,
            contact_type="cis",
            expected_value_cols=expected_summary_cols,
            verify_view=view_df,
            verify_cooler=clr,
        )

    ##### Create the pileup:
    stack = snipping.pileup(
        clr,
        features_df,
        view_df=view_df,
        expected_df=expected,
        flank=flank,
        min_diag=ignore_diags,  # TODO: implement in pileup API
        clr_weight_name=weight_name,  # TODO: implement in pileup API
        force=force,  # TODO: implement in pileup API
        nproc=nproc,
    )

    ##### Aggregate the signal:
    # None (checked before calling .lower()) and "none" both mean "mean"
    aggregate = "mean" if aggregate is None else aggregate.lower()
    agg_funcs = {
        "mean": np.nanmean,
        "none": np.nanmean,
        "median": np.nanmedian,
        "min": np.nanmin,
        "max": np.nanmax,
        "std": np.nanstd,
    }
    if aggregate not in agg_funcs:
        raise ValueError(
            f"Aggregation mode {aggregate} not supported. Please use mean/median/min/max/std."
        )
    agg_func = agg_funcs[aggregate]

    # snippets are stacked along the last axis
    pileup = agg_func(stack, axis=2)

    ##### Store the data as NPZ or HDF5 file:
    out_format = out_format.lower()
    if out_format == "npz":
        # the individual snippets are saved only on request, exactly as
        # in the HDF5 branch below (the condition used to be inverted)
        if store_snips:
            np.savez(out, pileup=pileup, stack=stack)
        else:
            np.savez(out, pileup=pileup)
    elif out_format == "hdf5":
        # the context manager guarantees that the file is flushed and closed
        with h5py.File(out, "w") as h5:
            h5.create_dataset("pileup", data=pileup)
            if store_snips:
                h5.create_dataset("stack", data=stack)
Exemplo n.º 14
0
def call_dots(
    cool_path,
    expected_path,
    view,
    clr_weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap that are not larger than max_loci_separation.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also chromosomes refered to in EXPECTED_PATH must be non-trivial,
    i.e., contain not-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region1/2', 'diag', 'n_valid', value_name. value_name is controlled using
    options. Header must be present in a file.

    """
    # NOTE(review): the docstring above is intentionally left untouched, since
    # it may be rendered as command-line help text by a decorator that is not
    # visible in this chunk. Developer documentation therefore lives here.
    #
    # Parameters, as used by the code below:
    #   cool_path              - passed to cooler.Cooler().
    #   expected_path          - a pair unpacked into (path, value column name).
    #   view                   - None to use whole chromosomes of the cooler,
    #                            otherwise handed to read_viewframe().
    #   clr_weight_name        - forwarded to both dotfinder scoring steps.
    #   nproc, verbose         - forwarded to the dotfinder steps.
    #   max_loci_separation,
    #   tile_size              - divided by the cooler bin size and truncated to
    #                            int to obtain sizes in bins.
    #   max_nans_tolerated     - must not exceed 2 * w (kernel outer size).
    #   kernel_width,
    #   kernel_peak            - if either is None, recommended values for the
    #                            bin size are used; otherwise w > p is enforced.
    #   num_lambda_chunks      - must lie in
    #                            [dotfinder.HiCCUPS_W1_MAX_INDX, 50].
    #   fdr                    - forwarded to dotfinder.determine_thresholds().
    #   dots_clustering_radius - forwarded to dotfinder.clustering_step().
    #   out_prefix             - prefix for "<prefix>.enriched.tsv" and
    #                            "<prefix>.postproc.bedpe".
    #
    # Returns None; results are written to files derived from out_prefix.
    # Raises ValueError on inconsistent kernel parameters, too permissive
    # max_nans_tolerated, or an out-of-range num_lambda_chunks.
    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path

    #### Generate viewframes ####
    # 1:cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = bioframe.make_viewframe(clr.chromsizes)

    # 2:view_df. Define global view for calculating calling dots
    # use input "view" BED file or all chromosomes :
    view_df = cooler_view_df if (view is None) else read_viewframe(view, cooler_view_df)

    #### Read expected: ####
    expected_summary_cols = [expected_value_col, ]
    expected = read_expected(
        expected_path,
        contact_type="cis",
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )
    # add checks to make sure cis-expected is symmetric

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with bases-units for now, so supress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}")
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        if not w > p:
            raise ValueError(f"Wrong inner/outer kernel parameters w={w}, p={p}")
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are setup check max_nans_tolerated
    # to make sure kernel footprints overlaping 1 side with the
    # NaNs filled row/column are not "allowed"
    # this requires dynamic adjustment for the "shrinking donut"
    if not max_nans_tolerated <= 2 * w:
        raise ValueError("Too many NaNs allowed!")
    # may lead to scoring the same pixel twice, - i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(
            clr, view_df, w, tile_size_bins, loci_separation_bins
        )
    )

    # lambda-chunking edges ...
    if not dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50:
        raise ValueError("Incompatible num_lambda_chunks")
    base = 2 ** (1 / 3)
    # Bin edges are base**0 .. base**(num_lambda_chunks - 1), padded with
    # -inf/+inf so every value falls into some chunk.
    ledges = np.concatenate(
        (
            [-np.inf],
            np.logspace(
                0,
                num_lambda_chunks - 1,
                num=num_lambda_chunks,
                base=base,
                # builtin float: the `np.float` alias was removed in NumPy 1.24
                dtype=float,
            ),
            [np.inf],
        )
    )

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected.set_index(["region1", "region2", "diag"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr
    )

    # 3. Filter using FDR thresholds calculated in the histogramming step
    # NOTE(review): out_prefix is dereferenced here unconditionally, although
    # the final output step below guards against out_prefix being None -
    # confirm whether None is a supported value for this entry point.
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected.set_index(["region1", "region2", "diag"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix), op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels
    )
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using annotated DataFrame of filtered pixels
    # why ? - because - clustering has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df)
    # consider reseting index here
    centroids = dotfinder.clustering_step(
        filtered_pixels_annotated,
        view_df["name"],
        dots_clustering_radius,
        verbose,
    )

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:

        postprocessed_fname = op.join(
            op.dirname(out_prefix), op.basename(out_prefix) + ".postproc.bedpe"
        )

        postprocessed_calls.to_csv(
            postprocessed_fname, sep="\t", header=True, index=False, compression=None
        )
Exemplo n.º 15
0
    # Infer the feature format from the columns present: single intervals
    # ("bed") or interval pairs ("bedpe").
    if {"chrom", "start", "end"}.issubset(features_df.columns):
        feature_type = "bed"
    # Both anchors must be fully described; the second anchor's end column is
    # "end2" (the original check listed "end1" twice and never required it).
    elif {"chrom1", "start1", "end1", "chrom2", "start2",
          "end2"}.issubset(features_df.columns):
        feature_type = "bedpe"
    else:
        raise ValueError("Unknown feature_df format")
    # Only expand/align the features when a flank size was requested.
    if flank is not None:
        features_df = expand_align_features(features_df,
                                            flank,
                                            clr.binsize,
                                            format=feature_type)

    # Default view: whole chromosomes of the cooler. A user-supplied view must
    # lie within the cooler's chromosome bounds.
    if view_df is None:
        view_df = bioframe.make_viewframe(clr.chromsizes)
    else:
        if not bioframe.is_contained(view_df,
                                     bioframe.make_viewframe(clr.chromsizes)):
            raise ValueError(
                "view_df is out of the bounds of chromosomes in cooler.")

    features_df = assign_regions(features_df, view_df)

    # TODO Expected checks are now implemented in the snippers, maybe move them out to here
    # when there is a neat function?

    # Without an expected table snip from the cooler directly; otherwise use
    # the snipper that also takes the expected dataframe.
    if expected_df is None:
        snipper = CoolerSnipper(clr, view_df=view_df)
    else:
        snipper = ObsExpSnipper(clr, expected_df, view_df=view_df)
Exemplo n.º 16
0
def _view_from_track(track_df):
    """
    Derive a view from a track table, one region per chromosome.

    After verifying that `track_df` has "chrom", "start" and "end" columns,
    each chromosome group contributes a single (chrom, min start, max end)
    tuple, and the collected tuples are handed to bioframe.make_viewframe.
    """
    bioframe.core.checks._verify_columns(track_df, ["chrom", "start", "end"])

    # Collect the covered extent of every chromosome present in the track.
    extents = []
    for chrom_name, chrom_rows in track_df.groupby("chrom"):
        lowest_start = chrom_rows["start"].min()
        highest_end = chrom_rows["end"].max()
        extents.append((chrom_name, lowest_start, highest_end))

    return bioframe.make_viewframe(extents)
Exemplo n.º 17
0
def call_compartments(
    cool_path,
    reference_track,
    view,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring above is intentionally left untouched, since
    # it may be rendered as command-line help text by a decorator that is not
    # visible in this chunk. Developer documentation therefore lives here.
    #
    # Parameters, as used by the code below:
    #   cool_path       - passed to cooler.Cooler().
    #   reference_track - None, or a pair (track_path, col) where col is either
    #                     an integer column position or a column name.
    #   view            - None to use whole chromosomes of the cooler,
    #                     otherwise a path read as a bed4 (fallback bed3) table.
    #   contact_type    - "cis" or "trans"; selects the eigdecomp routine.
    #   n_eigs          - forwarded to the eigdecomp routine.
    #   verbose         - forwarded to pd.read_table().
    #   out_prefix      - prefix of the ".lam.txt", ".vecs.tsv" and ".bw" files.
    #   bigwig          - when truthy, the "E1" column is also written as bigwig.
    #
    # Returns None; results are written to files derived from out_prefix.
    # Raises click.BadParameter for an unusable track column and ValueError for
    # a track that cannot be aligned with the cooler bins, a malformed view
    # table, or an unsupported contact_type.
    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # Headerless file: the value column can only be addressed by position.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            # File with a header: resolve a positional column to its name, or
            # check that the requested name is present.
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # sanity check would be to check if len(bins) becomes > than nbins ...
        # that would imply there was something in the track_df that didn't match
        # ["chrom", "start", "end"] - keys from the c.bins()[:] .
        if len(track) > len(clr.bins()):
            # The exception was previously constructed but never raised, which
            # made this check a silent no-op.
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define view for cis compartment-calling
    # use input "view" BED file or all chromosomes mentioned in "track":
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe([(chrom, 0, clr.chromsizes[chrom])
                                           for chrom in clr.chromnames])
    else:
        # Make viewframe out of table:
        # Read view_df:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df,
                                              check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    # TODO: Add check that view_df has the same bins as track

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            view_df=view_df,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    elif contact_type == "trans":
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `eigvals` in the output section below.
        raise ValueError(
            "Unsupported contact_type {!r}, expected 'cis' or 'trans'".format(
                contact_type))

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )