Exemplo n.º 1
0
    def _each(region):
        """
        Eigendecompose the contact matrix of a single region.

        No validation is performed here; the enclosing function is
        expected to have checked its inputs already.

        Parameters
        ----------
        region: tuple-like
            (chrom, start, end, *) - only the first three fields are used.

        Returns
        -------
        _region, eigvals, eigvecs
            The (chrom, start, end) part of ``region`` followed by the
            eigenvalues and eigenvectors returned by ``cis_eig``.
        """
        _region = region[:3]
        print("now doing region:", _region)

        if not smooth:
            A = clr.matrix(balance=balance).fetch(_region)
        else:
            # coarsegrain using the balanced matrix together with raw counts
            balanced = clr.matrix(balance=True).fetch(_region)
            raw = clr.matrix(balance=False).fetch(_region)
            A = numutils.adaptive_coarsegrain(
                balanced, raw, cutoff=cutoff, max_levels=max_levels
            )

        if bad_bins is not None:
            # keep the bad bins that fall inside this region and shift them
            # so that they index into A rather than into the whole genome
            lo, hi = clr.extent(_region)
            inside = (bad_bins >= lo) & (bad_bins < hi)
            relative_bad = bad_bins[inside]
            relative_bad -= lo
            if len(relative_bad) > 0:
                # A is symmetric: blank out both the columns and the rows
                A[:, relative_bad] = np.nan
                A[relative_bad, :] = np.nan

        # slice of the phasing column that corresponds to this region
        if phasing_track_col:
            phasing_track = bioframe.select(bins, _region)[phasing_track_col].values
        else:
            phasing_track = None

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
            OE_log=OE_log,
        )
        return _region, eigvals, eigvecs
Exemplo n.º 2
0
def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are `n + 1`
        edges. See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions: sequence of str or tuples
        List of genomic regions to include. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. If given, the track is
        subset to these regions and re-ordered to follow them.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements count the left and right outliers.
        Missing values (code `-1`) are not counted.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)

    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        track = pd.concat(
            [bioframe.select(track, region) for region in regions])

    digitized_col = name + ".d"
    digitized = track.copy()
    digitized[digitized_col] = np.digitize(track[name].values,
                                           binedges,
                                           right=False)
    # np.digitize has no notion of missing data, so NaNs are re-coded here
    digitized.loc[track[name].isnull(), digitized_col] = -1

    # histogram every valid code (0..n+1); only the -1 "missing" code is
    # dropped, so the left-outlier count ends up in hist[0] as documented
    codes = digitized[digitized_col].values
    hist = np.bincount(codes[codes >= 0], minlength=len(binedges) + 1)
    return digitized, hist
Exemplo n.º 3
0
    def _each(region):
        """
        Eigendecompose the contact matrix of a single region.

        No validation is performed here; the enclosing function is
        expected to have checked its inputs already.

        Parameters
        ----------
        region: tuple-like
            (chrom, start, end, *) - only the first three fields are used.

        Returns
        -------
        _region, eigvals, eigvecs
            The (chrom, start, end) part of ``region`` followed by the
            eigenvalues and eigenvectors returned by ``cis_eig``.
        """
        _region = region[:3]
        A = clr.matrix(balance=clr_weight_name).fetch(_region)

        if bad_bins is not None:
            # keep the bad bins that fall inside this region and shift them
            # so that they index into A rather than into the whole genome
            lo, hi = clr.extent(_region)
            inside = (bad_bins >= lo) & (bad_bins < hi)
            relative_bad = bad_bins[inside]
            relative_bad -= lo
            if len(relative_bad) > 0:
                # A is symmetric: blank out both the columns and the rows
                A[:, relative_bad] = np.nan
                A[relative_bad, :] = np.nan

        # values of the phasing track restricted to this region, if any
        phasing_track_region_values = None
        if phasing_track is not None:
            phasing_track_region_values = bioframe.select(
                phasing_track, _region
            )["value"].values

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track_region_values,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )
        return _region, eigvals, eigvecs
Exemplo n.º 4
0
def plot_insulation(clr, insulation, windows, resolution, out_path, exclude_chroms, title):
    """
    Plot, per chromosome, a contact map with insulation score tracks below it.

    One figure is written for every sacCer3 chromosome except ``chrM``. The
    files go into a directory named ``title`` next to ``out_path`` and are
    called ``<chrom>_<basename of out_path>``.

    Parameters
    ----------
    clr : cooler.Cooler
        Source of the balanced contact matrices (``clr.matrix(balance=True)``).
    insulation : DataFrame
        Table with ``start``, ``end`` and one
        ``log2_insulation_score_<window>`` column per entry of ``windows``.
    windows : iterable
        Window sizes whose insulation scores are drawn.
    resolution : int
        Forwarded to ``plot_45_mat``.
    out_path : str
        Determines the output directory (its dirname) and the file name
        suffix (its basename).
    exclude_chroms : any
        Currently unused. NOTE(review): only ``chrM`` is dropped; confirm
        whether this argument was meant to be honoured as well.
    title : str
        Name of the output sub-directory and prefix of the figure title.
    """
    dir_path = os.path.join(os.path.dirname(out_path), title)
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(dir_path, exist_ok=True)

    chromsizes = bioframe.fetch_chromsizes('sacCer3', filter_chroms=False)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    regions = [(chrom, 0, size)
               for chrom, size in chromsizes.drop('chrM').items()]
    out_name = os.path.basename(out_path)

    for region in regions:
        norm = LogNorm(vmax=0.1, vmin=0.001)
        data = clr.matrix(balance=True).fetch(region)
        fig, ax = plt.subplots(figsize=(20, 4))

        img = plot_45_mat(ax, data, start=0, resolution=resolution, norm=norm, cmap='fall')

        ax.set_aspect(0.5)
        ax.set_ylim(0, 30000)
        format_ticks(ax, rotate=False)
        ax.xaxis.set_visible(False)

        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='1%', pad=0.1, aspect=6)
        plt.colorbar(img, cax=cax)

        insul_region = bioframe.select(insulation, region)

        # insulation panel sits directly below the map and shares its x axis
        ins_ax = divider.append_axes('bottom', size='50%', pad=0.0, sharex=ax)
        ins_ax.set_prop_cycle(plt.cycler('color', plt.cm.plasma(np.linspace(0, 1, 5))))

        # x position of every score is the midpoint of its bin
        midpoints = insul_region[['start', 'end']].mean(axis=1)
        for window in windows:
            ins_ax.plot(midpoints,
                        insul_region[f'log2_insulation_score_{window}'],
                        label=f'{window} bp window', lw=1)

        ins_ax.legend(bbox_to_anchor=(1.125, 1.05), loc='upper right')
        fig.suptitle(f'{title}: {region[0]}')

        path = os.path.join(dir_path, '_'.join((region[0], out_name)))

        fig.savefig(path, dpi=300)
        # release the figure; otherwise one figure per chromosome stays alive
        plt.close(fig)
Exemplo n.º 5
0
    def _each(region):
        """
        Eigendecompose the contact matrix of a single region.

        Parameters
        ----------
        region : region accepted by ``clr.matrix(...).fetch`` and
            ``bioframe.select``

        Returns
        -------
        eigvals, eigvecs
            Output of ``cis_eig`` for the region.

        Raises
        ------
        ValueError
            If a phasing column is requested but absent from ``bins``.
        """
        A = clr.matrix(balance=balance).fetch(region)

        if not phasing_track_col:
            phasing_track = None
        elif phasing_track_col not in bins:
            raise ValueError(
                'No column "{}" in the bin table'.format(phasing_track_col)
            )
        else:
            # slice of the phasing column that corresponds to this region
            phasing_track = bioframe.select(bins, region)[phasing_track_col].values

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )
        return eigvals, eigvecs
Exemplo n.º 6
0
def get_saddle(
    clr,
    expected,
    digitized_track,
    contact_type,
    view_df=None,
    clr_weight_name="weight",
    expected_value_col="balanced.avg",
    view_name_col="name",
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Get a matrix of average interactions between genomic bin
    pairs as a function of a specified genomic track.

    The provided genomic track must be a dataframe whose fourth column is
    categorical, as generated by `get_digitized()`.

    Parameters
    ----------
    clr : cooler.Cooler
        Observed matrix.
    expected : DataFrame in expected format
        Diagonal summary statistics for each chromosome, and name of the column
        with the values of expected to use.
    digitized_track : DataFrame with digitized value column
        A track, i.e. BedGraph-like dataframe, of digitized signal.
        The value column (the fourth one) specifies a category for every
        position in the track. Generated by get_digitized() from track.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    view_df: viewframe
        Viewframe with genomic regions. If none, generate from track chromosomes.
    clr_weight_name : str
        Name of the column in the clr.bins to use as balancing weights.
    expected_value_col : str
        Name of the column in expected used for normalizing.
    view_name_col : str
        Name of column in view_df with region names.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If the fourth column of ``digitized_track`` is not categorical, if
        ``contact_type`` is neither 'cis' nor 'trans', or (cis only) if the
        regions in ``expected`` are not cataloged in ``view_df``.
    """

    # TODO: add input validation for track and expected
    # .iloc: the dtypes Series is labelled by column name, so the fourth
    # column has to be addressed by position explicitly
    if not isinstance(digitized_track.dtypes.iloc[3],
                      pd.api.types.CategoricalDtype):
        raise ValueError(
            "a digitized track, where the value column is a "
            "pandas categorical must be provided as input. see get_digitized()."
        )
    digitized_col = digitized_track.columns[3]
    cats = digitized_track[digitized_col].dtype.categories.values
    # non-negative categories include the two open-ended outlier bins
    n_bins = len(cats[cats > -1]) - 2

    if view_df is None:
        view_df = _view_from_track(digitized_track)
    else:
        view_df = _make_cooler_view(view_df, clr)

    # digitized values of every view region, keyed by region name
    digitized_tracks = {}
    for _, reg in view_df.iterrows():
        digitized_reg = bioframe.select(digitized_track, reg)
        digitized_tracks[reg[view_name_col]] = digitized_reg[digitized_col]

    # set "cis" or "trans" for supports (region pairs to iterate over) and
    # the matching observed/expected matrix fetcher
    if contact_type == "cis":
        supports = list(zip(view_df[view_name_col], view_df[view_name_col]))
        if not bioframe.is_cataloged(expected,
                                     view_df,
                                     df_view_col="region1",
                                     view_name_col=view_name_col):
            raise ValueError(
                "Region names in expected are not cataloged in view_df.")
        getmatrix = _make_cis_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    elif contact_type == "trans":
        # look up each region's chromosome once instead of scanning view_df
        # for both members of every pair
        chrom_of = dict(zip(view_df[view_name_col], view_df["chrom"]))
        # keep only pairs of regions that lie on different chromosomes
        supports = [
            (name1, name2)
            for name1, name2 in combinations(view_df[view_name_col], 2)
            if chrom_of[name1] != chrom_of[name2]
        ]

        getmatrix = _make_trans_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    else:
        raise ValueError(
            "Allowed values for contact_type are 'cis' or 'trans'.")

    # the output has n_bins + 2 rows/columns: the 2 extra ones are the open
    # bins for values <lo and >hi.
    interaction_sum = np.zeros((n_bins + 2, n_bins + 2))
    interaction_count = np.zeros((n_bins + 2, n_bins + 2))

    for reg1, reg2 in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            reg1,
            reg2,
            min_diag=min_diag,
            max_diag=max_diag,
            verbose=verbose,
        )

    # symmetrize the accumulated matrices
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count
Exemplo n.º 7
0
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    smooth=False,
    cutoff=3,
    max_levels=8,
    OE_log=False,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions (cis-regions), or for each chromosome.
    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the columns in `bins` table, if provided, eigenvectors are
        flipped to achieve a positive correlation with `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    smooth : boolean, optional
        This option lets you coarsegrain the matrix prior to calling eigendecomposition.
        NOTE(review): the smoothing branch fetches with ``balance=True`` and
        does not use the ``balance`` argument - confirm this is intended.
    cutoff: int, optional
        Cutoff to pass to adaptive_coarsegrain's cutoff argument
    max_levels: int, optional
        Max level to pass to adaptive_coarsegrain's max_levels argument
    OE_log: boolean, optional
        Pass OE_log to cis_eig's OE_log argument.
        This works only if matrix does not contain zeroes (eg. after using adaptive_coarsegrain)
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled in the `bins` table.

    Raises
    ------
    ValueError
        If `phasing_track_col` is set but is not a column of `bins`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
              use `bad_bins` to ignore small translocations.
    """

    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(bins["chrom"].unique())

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.make_viewframe(regions)

    # ignore diags as in cooler unless specified
    if ignore_diags is None:
        ignore_diags = clr._load_attrs("bins/weight").get("ignore_diags", 2)

    # bad_bins is documented as array-like; the comparisons and boolean
    # indexing below need an ndarray, so plain lists are converted once here
    if bad_bins is not None:
        bad_bins = np.asarray(bad_bins)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.

        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)

        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        print("now doing region:", _region)

        if smooth:
            A = numutils.adaptive_coarsegrain(
                clr.matrix(balance=True).fetch(_region),
                clr.matrix(balance=False).fetch(_region),
                cutoff=cutoff,
                max_levels=max_levels)
        else:
            A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # filter bad_bins for the _region and turn relative
            # (boolean indexing returns a copy, so bad_bins is not modified):
            lo, hi = clr.extent(_region)
            bad_bins_region = bad_bins[(bad_bins >= lo) & (bad_bins < hi)]
            bad_bins_region -= lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (
            bioframe.select(bins, _region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
            OE_log=OE_log
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table.
    # .loc (not .at) is required: .at addresses a single cell, while whole
    # row/column blocks are assigned here.
    for _region, _eigvals, _eigvecs in results:
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
Exemplo n.º 8
0
def make_saddle(
    getmatrix,
    binedges,
    digitized,
    contact_type,
    regions=None,
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Build the saddle matrices: interaction sums and pair counts between
    genomic bins, grouped by the digitized value of a genomic track.

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two chromosomes
        given their names/indicies.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    regions : sequence of str or tuple, optional
        A list of genomic regions to use. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. Defaults to one region
        per chromosome of the track, spanning its min start to max end.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``contact_type`` is neither 'cis' nor 'trans'.

    """
    digitized_df, name = digitized
    digitized_df = digitized_df[["chrom", "start", "end", name]]

    if regions is None:
        # one region per chromosome, covering the extent of the track
        regions = [
            (chrom, chrom_df.start.min(), chrom_df.end.max())
            for chrom, chrom_df in digitized_df.groupby("chrom")
        ]

    regions = bioframe.parse_regions(regions)

    # digitized values per region, keyed by region name (4th field)
    digitized_tracks = {
        reg[3]: bioframe.select(digitized_df, reg)[name]
        for reg in regions.values
    }

    region_names = regions["name"]
    if contact_type == "cis":
        # every region paired with itself
        supports = list(zip(region_names, region_names))
    elif contact_type == "trans":
        # every unordered pair of distinct regions
        supports = list(combinations(region_names, 2))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # the matrices have two extra rows/columns: the open bins
    # for values <lo and >hi.
    size = len(binedges) + 1
    interaction_sum = np.zeros((size, size))
    interaction_count = np.zeros((size, size))

    for first, second in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            first,
            second,
            min_diag,
            max_diag,
            verbose,
        )

    # symmetrize the accumulated matrices
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if not trim_outliers:
        return interaction_sum, interaction_count

    return interaction_sum[1:-1, 1:-1], interaction_count[1:-1, 1:-1]
Exemplo n.º 9
0
def compute_scaling(pairs_paths, out_path, region, exclude_chroms, assembly,
                    centromeres_path, split_arms, normalized, plot_slope,
                    show_average_trans, labels, title, no_cache):
    """
    Compute and plot contact frequency vs genomic separation curves for one or more pairs files.

    Parameters
    ----------
    pairs_paths : sequence of str
        Pairs files to process.
    out_path : str
        Output plot path. With ``split_arms`` one plot per chromosome is
        written to ``<dirname(out_path)>/<basename(pairs file)>/`` instead.
    region : str or tuple or None
        If given, scalings are computed on this region only; otherwise on
        chromosomal arms.
    exclude_chroms : iterable of str
        Entries of the form ``chrom``, ``chrom:left`` or ``chrom:right``.
    assembly : str
        Assembly name passed to ``bioframe`` for chromosome sizes and
        centromeres.
    centromeres_path : str or None
        Whitespace-separated file with ``chrom start end`` per line. If
        empty, centromeres are fetched via ``bioframe``.
    split_arms : bool
        Plot left and right arm of every chromosome separately.
    normalized, show_average_trans
        Forwarded to ``calc_pair_freqs``.
    plot_slope
        Forwarded to ``plot_scalings``.
    labels : iterable of str
        Curve labels; pairs file names are used if none are given.
    title : str
        Title of the combined plot.
    no_cache : bool
        Skip reading from and writing to the cache.
    """
    labels = list(labels)
    # parse left/right arm parameter of chromosomes to exclude
    exclude_chroms = [chrom.split(':') for chrom in exclude_chroms]
    # entries without an arm specifier exclude the entire chromosome; isin()
    # needs plain names, not the [chrom, arm] lists produced by the split
    excluded_whole = [parts[0] for parts in exclude_chroms if len(parts) == 1]

    chromsizes = bioframe.fetch_chromsizes(assembly,
                                           filter_chroms=False,
                                           as_bed=True)
    chromsizes = chromsizes[~chromsizes.chrom.isin(excluded_whole)]

    if centromeres_path:
        centromeres = {}
        with open(centromeres_path) as file:
            for line in file:
                cols = line.split()
                if not cols:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                # centromere position = midpoint of the given interval
                centromeres[cols[0]] = (int(cols[1]) + int(cols[2])) // 2
    else:
        centromeres = bioframe.fetch_centromeres(assembly)
        centromeres.set_index('chrom', inplace=True)
        centromeres = centromeres.mid.to_dict()

    if labels and len(pairs_paths) != len(labels) and not split_arms:
        sys.exit('Please provide as many labels as pairs paths.')

    if region:
        regions = bioframe.select(chromsizes, region).reset_index()
    else:
        # use chromosomal arms as separate regions if no regions are specified
        arms = bioframe.split(chromsizes, centromeres)
        # remove user-excluded chromosomes/arms
        for chrom in exclude_chroms:
            if len(chrom) == 1:
                # no arm specified, remove entire chromosome
                arms = arms[arms.chrom != chrom[0]]
            elif chrom[1] == 'left':
                # remove specified chromosome with start == 0 (left arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start == 0))]
            elif chrom[1] == 'right':
                # remove specified chromosome with start != 0 (right arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start != 0))]

        # shrink every arm by ARM_PADDING on each side to remove centromere
        # and telomere regions
        arms = bioframe.ops.expand(arms, -ARM_PADDING)
        # drop arms that have no length left after removing the side regions
        regions = arms[arms.start < arms.end].reset_index()

    def _run_pairlib(path):
        # single place for the pairlib call so both branches use equal settings
        return pairlib.scalings.compute_scaling(
            path,
            regions,
            chromsizes,
            dist_range=(int(1e1), int(1e9)),
            n_dist_bins=128,
            chunksize=int(1e7))

    def _drop_unassigned(scalings):
        # remove unassigned pairs with start/end positions < 0
        assigned = ((scalings.start1 >= 0) & (scalings.end1 >= 0)
                    & (scalings.start2 >= 0) & (scalings.end2 >= 0))
        return scalings[assigned]

    all_scalings = []
    all_avg_trans_levels = []
    n_files = len(pairs_paths)

    for idx, path in enumerate(pairs_paths):
        if split_arms:
            # calculate scalings per arm per chromosome
            cis_scalings, trans_levels = _run_pairlib(path)
            cis_scalings = _drop_unassigned(cis_scalings)

            sc_agg = (cis_scalings
                      .groupby(['chrom1', 'start1', 'min_dist', 'max_dist'])
                      .agg({'n_pairs': 'sum', 'n_bp2': 'sum'})
                      .reset_index())

            dir_path = os.path.join(os.path.dirname(out_path),
                                    os.path.basename(path))
            # left arms start at position 0 + ARM_PADDING
            is_left = sc_agg.start1 == ARM_PADDING

            for chrom in set(sc_agg.chrom1):
                on_chrom = sc_agg.chrom1 == chrom
                sc_left, avg_trans_left = calc_pair_freqs(
                    sc_agg[on_chrom & is_left], trans_levels,
                    show_average_trans, normalized)
                sc_right, avg_trans_right = calc_pair_freqs(
                    sc_agg[on_chrom & ~is_left], trans_levels,
                    show_average_trans, normalized)

                os.makedirs(dir_path, exist_ok=True)
                chrom_path = os.path.join(
                    dir_path, '_'.join((chrom, os.path.basename(out_path))))
                plot_scalings(
                    scalings=[sc_left, sc_right],
                    avg_trans_levels=[avg_trans_left, avg_trans_right],
                    plot_slope=plot_slope,
                    labels=['left', 'right'],
                    title=chrom,
                    out_path=chrom_path)
            continue

        cis_scalings, avg_trans = None, None

        if not no_cache:
            # get cached values
            cached = cache.get(path)
            if cached is not None:
                # cached scalings are only valid for the same normalization
                if cached['normalized'] == normalized:
                    cis_scalings = cached['cis_scalings']
                avg_trans = cached['avg_trans']

        needs_compute = (no_cache or cis_scalings is None
                         or (avg_trans is None and show_average_trans))
        if needs_compute:
            # caching disabled or no usable cached values found
            print(
                f'Computing scalings for file {idx + 1}/{n_files} ...',
                end='\r')

            cis_scalings, trans_levels = _run_pairlib(path)
            cis_scalings = _drop_unassigned(cis_scalings)

            sc_agg = (cis_scalings
                      .groupby(['min_dist', 'max_dist'])
                      .agg({'n_pairs': 'sum', 'n_bp2': 'sum'})
                      .reset_index())

            cis_scalings, avg_trans = calc_pair_freqs(
                sc_agg, trans_levels, show_average_trans, normalized)

            if not no_cache:
                cache.set(
                    path, {
                        'cis_scalings': cis_scalings,
                        'avg_trans': avg_trans,
                        'normalized': normalized
                    })
        else:
            print(
                f'Retrieved cached values for file {idx + 1}/{n_files}.',
                end='\r')

        # use file names as labels if labels have not been provided
        if len(labels) < n_files:
            labels.append(os.path.basename(path))

        all_scalings.append(cis_scalings)
        if avg_trans is not None:
            all_avg_trans_levels.append(avg_trans)

    # draw the combined plot once, after all files have been processed
    if all_scalings and not split_arms:
        plot_scalings(all_scalings, all_avg_trans_levels, plot_slope,
                      labels, title, out_path)
Exemplo n.º 10
0
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    clip_percentile=99.9,
    sort_metric=None,
):
    """
    Compute cis eigenvectors of a cooler, one genomic region at a time.

    For every region the matrix is fetched from `clr` and handed to
    `cis_eig`. The resulting eigenvectors are written into a copy of `bins`
    (columns ``E1`` .. ``E<n_eigs>``) and the eigenvalues are gathered into a
    per-region table.

    Parameters
    ----------
    clr : cooler
        Cooler object to fetch matrices from.
    bins : DataFrame
        Bin table with a "chrom" column and, when `phasing_track_col` is
        set, a column of that name.
        NOTE(review): presumably this is the bin table of `clr`, so that the
        rows selected per region line up with the fetched matrix - confirm.
    regions : iterable, optional
        Regions to process. Every item is normalized with
        ``bioframe.parse_region`` into a ``(chrom, start, end)`` tuple.
        If None, one whole-chromosome region is used for every chromosome
        present in ``bins["chrom"]``.
    n_eigs : int
        Number of eigenvectors to compute; forwarded to `cis_eig`.
    phasing_track_col : str or None
        Column of `bins` forwarded to `cis_eig` as the phasing track.
        A falsy value disables phasing.
    balance : str or bool
        Forwarded to ``clr.matrix(balance=...)``.
    ignore_diags : int, optional
        Number of diagonals to ignore. If None, the "ignore_diags" attribute
        stored with the balancing weight column is used (default 2 when the
        attribute is absent).
    clip_percentile : float
        Forwarded to `cis_eig`.
    sort_metric : str or None
        Forwarded to `cis_eig`.

    Returns
    -------
    eigvals : DataFrame
        Indexed by region name, with columns ``eigval1`` .. ``eigval<n_eigs>``.
    eigvec_table : DataFrame
        Copy of `bins` with additional columns ``E1`` .. ``E<n_eigs>``; bins
        outside the processed regions keep NaN.

    Raises
    ------
    ValueError
        If `regions` is None and `bins` contains chromosomes missing from
        the cooler, or if `phasing_track_col` is not a column of `bins`.
    """
    if regions is None:
        chroms = bins["chrom"].unique()

        # Consistency check: every chromosome of the bin table must exist
        # in the cooler, otherwise the default regions cannot be fetched.
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes
        ]
        if chroms_not_in_clr:
            raise ValueError(
                "The following chromosomes are found in the bin table, but not "
                "in the cooler: " + str(chroms_not_in_clr)
            )

        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # Normalize user-supplied regions (strings or tuples) to
        # (chrom, start, end) so that they can be unpacked below; passing
        # chromsizes lets open-ended regions get a concrete end coordinate.
        regions = [
            bioframe.parse_region(r, chromsizes=clr.chromsizes) for r in regions
        ]

    # Validate the phasing column once, before any matrix is fetched.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(
            'No column "{}" in the bin table'.format(phasing_track_col)
        )

    if ignore_diags is None:
        # Read the setting from the weight column that is actually used for
        # balancing; fall back to "weight" when `balance` is not a name.
        weight_name = balance if isinstance(balance, str) else "weight"
        ignore_diags = clr._load_attrs("bins/" + weight_name).get(
            "ignore_diags", 2
        )

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table["E" + str(i + 1)] = np.nan

    def _each(region):
        # Eigendecompose a single region; returns (eigvals, eigvecs).
        A = clr.matrix(balance=balance).fetch(region)

        # Same selection call as the one used to fill `eigvec_table` below.
        phasing_track = (
            bioframe.select(bins, region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # Copy every eigenvector into the rows of the bins covered by its region.
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        idx = bioframe.select(bins, region).index
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.loc[idx, "E" + str(i + 1)] = eigvec

    # Whole-chromosome regions are labelled by the chromosome name alone.
    region_strs = [
        (
            chrom
            if (start == 0 and end == clr.chromsizes[chrom])
            else "{}:{}-{}".format(chrom, start, end)
        )
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=["eigval" + str(i + 1) for i in range(n_eigs)],
    )

    eigvals.index.name = "region"

    return eigvals, eigvec_table
Exemplo n.º 11
0
def eigs_cis(
    clr,
    phasing_track=None,
    view_df=None,
    n_eigs=3,
    clr_weight_name="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    map=map,
):
    """
    Compute compartment eigenvector for a given cooler `clr` in a number of
    symmetric intra chromosomal regions defined in view_df (cis-regions), or for each
    chromosome.

    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue. Eigenvectors can be oriented by passing a binned
    `phasing_track` with the same resolution as the cooler.


    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    phasing_track : DataFrame, optional
        binned track with the same resolution as cooler bins, the fourth column is
        used to phase the eigenvectors, flipping them to achieve a positive correlation.
        The track is aligned to the cooler bin table with
        `align_track_with_cooler` before use.
    view_df : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions of the view only,
        otherwise the view is built from the cooler with `make_cooler_view(clr)`.
    n_eigs : int
        number of eigenvectors to compute
    clr_weight_name : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like, optional
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        Rows and columns of these bins are set to NaN in the fetched
        matrices, in addition to whatever balancing has already masked.
        Any sequence accepted by `np.asarray` can be passed.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals_table : DataFrame
        copy of `view_df` with additional columns eigval1 .. eigval<n_eigs>.
    eigvec_table : DataFrame
        copy of the cooler bin table with additional columns E1 .. E<n_eigs>.

    Raises
    ------
    ValueError
        if `view_df` is not a valid/compatible viewframe, or if the cooler
        is not balanced with `clr_weight_name`.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `view_df` (e.g. arms) to avoid issues with chromosomal arms,
              use `bad_bins` to ignore small translocations.
    """

    # get chromosomes from cooler, if view_df not specified:
    if view_df is None:
        view_df = make_cooler_view(clr)
    else:
        # Make sure view_df is a proper viewframe
        try:
            _ = is_compatible_viewframe(
                view_df,
                clr,
                check_sorting=True,
                raise_errors=True,
            )
        except Exception as e:
            raise ValueError("view_df is not a valid viewframe or incompatible") from e

    # check if cooler is balanced
    try:
        _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
    except Exception as e:
        raise ValueError(
            f"provided cooler is not balanced or {clr_weight_name} is missing"
        ) from e

    # ignore diags as in cooler unless specified
    ignore_diags = (
        clr._load_attrs(f"bins/{clr_weight_name}").get("ignore_diags", 2)
        if ignore_diags is None
        else ignore_diags
    )

    # coerce once so that plain lists/tuples support the vectorized
    # comparisons performed for every region below
    if bad_bins is not None:
        bad_bins = np.asarray(bad_bins)

    bins = clr.bins()[:]

    if phasing_track is not None:
        phasing_track = align_track_with_cooler(
            phasing_track,
            clr,
            view_df=view_df,
            clr_weight_name=clr_weight_name,
            mask_bad_bins=True,
        )

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = view_df.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.
        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)
        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        A = clr.matrix(balance=clr_weight_name).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # keep bad bins inside the region and make them region-relative;
            # the subtraction builds a new array, `bad_bins` stays untouched
            lo, hi = clr.extent(_region)
            bad_bins_region = bad_bins[(bad_bins >= lo) & (bad_bins < hi)] - lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        if phasing_track is not None:
            phasing_track_region = bioframe.select(phasing_track, _region)
            phasing_track_region_values = phasing_track_region["value"].values
        else:
            phasing_track_region_values = None

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track_region_values,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, view_df.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
Exemplo n.º 12
0
def align_track_with_cooler(track,
                            clr,
                            view_df=None,
                            clr_weight_name="weight",
                            mask_bad_bins=True):
    """
    Sync a track dataframe with a cooler bintable.

    Checks that bin sizes match between a track and a cooler,
    merges the cooler bintable with the track, and
    propagates masked regions from a cooler bintable to a track.

    Parameters
    ----------
    track : pd.DataFrame
        bedGraph-like track DataFrame to check. Its first four columns are
        interpreted as chrom, start, end and value, whatever their names.
    clr : cooler
        cooler object to check against
    view_df : bioframe.viewframe or None
        Optional viewframe of regions to check for their number of bins with assigned track values.
        If None, constructs a view_df from cooler chromsizes.
    clr_weight_name : str
        Name of the column in the bin table with weight. If falsy, every
        bin is treated as valid.
    mask_bad_bins : bool
        Whether to propagate null bins from cooler bintable column clr_weight_name
        to the 'value' column of the output clr_track. Default True.

    Returns
    -------
    clr_track
        track dataframe that has been aligned with the cooler bintable
        and has columns ['chrom','start','end','value']

    Raises
    ------
    ValueError
        If the track is invalid, its bin size differs from the cooler's,
        the weight column is missing, no valid bin received a track value,
        or a region of the view has no assigned track values at all.
    """
    from .checks import is_track, is_cooler_balanced

    try:
        is_track(track, raise_errors=True)
    except Exception as err:
        raise ValueError("invalid input track") from err

    # since tracks are currently allowed to have flexible column names
    chrom_col, start_col, end_col, value_col = track.columns[:4]

    # using median to allow for shorter / longer last bin on any chromosome
    track_bin_width = int((track[end_col] - track[start_col]).median())
    if track_bin_width != clr.binsize:
        raise ValueError(
            "mismatch between track and cooler bin size, check track resolution"
        )

    # left-merge keeps every cooler bin; bins without a matching track row
    # get NaN in "value". Colliding track columns receive the "_" suffix.
    clr_track = ((clr.bins()[:]).copy().merge(
        track.rename(columns={
            chrom_col: "chrom",
            start_col: "start",
            end_col: "end",
            value_col: "value"
        }),
        how="left",
        on=["chrom", "start"],
        suffixes=("", "_")))

    if clr_weight_name:
        try:
            is_cooler_balanced(clr,
                               clr_weight_name=clr_weight_name,
                               raise_errors=True)
        except Exception as err:
            raise ValueError(
                f"no column {clr_weight_name} detected in input cooler bintable"
            ) from err
        valid_bins = clr_track[clr_weight_name].notna()
    else:
        # no weights requested: every bin counts as valid
        valid_bins = np.ones(len(clr_track), dtype=bool)

    num_valid_bins = valid_bins.sum()
    num_assigned_bins = (clr_track["value"][valid_bins].notna()).sum()
    if num_assigned_bins == 0:
        raise ValueError("no track values assigned to cooler bintable")
    elif num_assigned_bins < 0.5 * num_valid_bins:
        warnings.warn("less than 50% of valid bins have been assigned a value")

    # every region of the view must contain at least one assigned value
    view_df = make_cooler_view(clr) if view_df is None else view_df
    for region in view_df.itertuples(index=False):
        track_region = bioframe.select(clr_track, region)
        num_assigned_region_bins = track_region["value"].notna().sum()
        if num_assigned_region_bins == 0:
            raise ValueError(
                f"no track values assigned to region {bioframe.to_ucsc_string(region)}"
            )
    if mask_bad_bins:
        clr_track.loc[~valid_bins, "value"] = np.nan

    return clr_track[["chrom", "start", "end", "value"]]
Exemplo n.º 13
0
def saddle(
    clr,
    expected,
    track,
    contact_type,
    n_bins,
    vrange=None,
    qrange=None,
    view_df=None,
    clr_weight_name="weight",
    expected_value_col="balanced.avg",
    view_name_col="name",
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Get a matrix of average interactions between genomic bin
    pairs as a function of a specified genomic track.

    The provided genomic track is either:
    (a) digitized inside this function by passing `n_bins`, and one of `vrange` or `qrange`
    (b) passed as a pre-digitized track with a categorical value column as generated by `get_digitized()`.

    Parameters
    ----------
    clr : cooler.Cooler
        Observed matrix.
    expected : DataFrame in expected format
        Diagonal summary statistics for each chromosome, and name of the column
        with the values of expected to use.
    track : DataFrame
        A track, i.e. BedGraph-like dataframe, which is digitized with
        the options n_bins, vrange and qrange. Can optionally be passed
        as a pre-digitized dataFrame with a categorical value column,
        as generated by get_digitized(), also passing n_bins as None.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    n_bins : int or None
        number of bins for signal quantization. Python and NumPy integers
        are accepted. If None, then track must be passed as a
        pre-digitized track.
    vrange : tuple
        Low and high values used for binning track values.
        See get_digitized().
    qrange : tuple
        Low and high values for quantile binning track values.
        Low must be 0.0 or more, high must be 1.0 or less.
        Only one of vrange or qrange can be passed. See get_digitized().
    view_df: viewframe
        Viewframe with genomic regions. If None, generate from track chromosomes.
    clr_weight_name : str
        Name of the column in the clr.bins to use as balancing weights.
        Using raw unbalanced data is not supported for saddles.
    expected_value_col : str
        Name of the column in expected used for normalizing.
    view_name_col : str
        Name of column in view_df with region names.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If `n_bins` is neither an integer nor None, the pre-digitized track
        is not categorical, `view_df` or `expected` are incompatible, the
        cooler is not balanced, or `contact_type` is not 'cis' or 'trans'.
    """

    # bool is an int subclass but is not a meaningful bin count
    if isinstance(n_bins, (int, np.integer)) and not isinstance(n_bins, bool):
        n_bins = int(n_bins)
        # perform digitization
        track = align_track_with_cooler(
            track,
            clr,
            view_df=view_df,
            clr_weight_name=clr_weight_name,
            mask_bad_bins=True,
        )
        digitized_track, binedges = digitize(
            track.iloc[:, :4],
            n_bins,
            vrange=vrange,
            qrange=qrange,
            digitized_suffix=".d",
        )
        digitized_col = digitized_track.columns[3]

    elif n_bins is None:
        # assume and test if track is pre-digitized
        digitized_track = track
        digitized_col = digitized_track.columns[3]
        is_track(track.astype({digitized_col: "float"}), raise_errors=True)
        # explicit positional access: the dtypes Series is labelled by
        # column name, so plain [3] would be a label lookup
        value_dtype = digitized_track.dtypes.iloc[3]
        if not isinstance(value_dtype, pd.CategoricalDtype):
            raise ValueError(
                "when n_bins=None, saddle assumes the track has been " +
                "pre-digitized and the value column is a " +
                "pandas categorical. See get_digitized().")
        cats = digitized_track[digitized_col].dtype.categories.values
        # cats has two additional categories, 0 and n_bins+1, for values
        # falling outside range, as well as -1 for NAs.
        n_bins = len(cats[cats > -1]) - 2
    else:
        raise ValueError("n_bins must be provided as int or None")

    if view_df is None:
        view_df = view_from_track(digitized_track)
    else:
        # Make sure view_df is a proper viewframe
        try:
            _ = is_compatible_viewframe(
                view_df,
                clr,
                check_sorting=True,  # just in case
                raise_errors=True,
            )
        except Exception as e:
            raise ValueError(
                "view_df is not a valid viewframe or incompatible") from e

    # make sure provided expected is compatible
    try:
        _ = is_valid_expected(
            expected,
            contact_type,
            view_df,
            verify_cooler=clr,
            expected_value_cols=[
                expected_value_col,
            ],
            raise_errors=True,
        )
    except Exception as e:
        raise ValueError("provided expected is not compatible") from e

    # check if cooler is balanced
    if clr_weight_name:
        try:
            _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
        except Exception as e:
            raise ValueError(
                f"provided cooler is not balanced or {clr_weight_name} is missing"
            ) from e

    # digitized values of every view region, keyed by region name
    digitized_tracks = {}
    for _, reg in view_df.iterrows():
        digitized_reg = bioframe.select(digitized_track, reg)
        digitized_tracks[reg[view_name_col]] = digitized_reg[digitized_col]

    # set "cis" or "trans" for supports (regions to iterate over) and matrix fetcher
    if contact_type == "cis":
        # only symmetric intra-chromosomal regions :
        supports = list(zip(view_df[view_name_col], view_df[view_name_col]))

        getmatrix = _make_cis_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    elif contact_type == "trans":
        # asymmetric inter-chromosomal regions: keep only pairs of regions
        # that lie on different chromosomes
        chrom_of = dict(zip(view_df[view_name_col], view_df["chrom"]))
        supports = [
            (name1, name2)
            for name1, name2 in combinations(view_df[view_name_col], 2)
            if chrom_of[name1] != chrom_of[name2]
        ]

        getmatrix = _make_trans_obsexp_fetcher(
            clr,
            expected,
            view_df,
            view_name_col=view_name_col,
            expected_value_col=expected_value_col,
            clr_weight_name=clr_weight_name,
        )
    else:
        raise ValueError(
            "Allowed values for contact_type are 'cis' or 'trans'.")

    # n_bins here includes 2 open bins for values <lo and >hi.
    interaction_sum = np.zeros((n_bins + 2, n_bins + 2))
    interaction_count = np.zeros((n_bins + 2, n_bins + 2))

    for reg1, reg2 in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            reg1,
            reg2,
            min_diag=min_diag,
            max_diag=max_diag,
            verbose=verbose,
        )

    # add the transpose so that both returned matrices are symmetric
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        # drop the two open-ended bins (<lo and >hi)
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count