Пример #1
0
        def _fetch_trans_oe(reg1, reg2):
            """Return the fetched matrix of a region pair divided by its expected.

            Both arguments are normalised with ``bioframe.parse_region``. The
            block fetched from ``clr.matrix()`` for the parsed pair is divided
            by ``_fetch_trans_exp`` evaluated on the first element of each
            parsed region.
            """
            parsed1 = bioframe.parse_region(reg1)
            parsed2 = bioframe.parse_region(reg2)

            observed = clr.matrix().fetch(parsed1, parsed2)
            expected = _fetch_trans_exp(parsed1[0], parsed2[0])
            return observed / expected
Пример #2
0
def cooler_global_scaling(cool,
                          genome,
                          trans=True,
                          mapper=map,
                          balance='weight',
                          thres=None,
                          ignore_diags=2):
    """
    Run ``cis_binning`` (and optionally ``trans_binning``) over a cooler.

    Parameters
    ----------
    cool : cooler-like object
        Must expose ``info['bin-size']`` and ``chromnames``; it is also handed
        to ``cooler_mask`` and ``cooler_matrix_generator``.
    genome : object
        Passed to ``DNA_info.get_chromosome_arms`` and, when ``trans`` is
        truthy, to ``bioframe.fetch_chromsizes``.
    trans : bool
        When truthy, the trans table is computed and returned as well.
    mapper : callable
        Forwarded as ``mapper=`` to the binning helpers.
    balance : str
        Forwarded as ``header=`` to the mask and matrix-generator helpers.
    thres : object
        Forwarded as ``thres=`` to ``cooler_mask``.
    ignore_diags : int
        Forwarded positionally to ``cis_binning``.

    Returns
    -------
    DataFrame or tuple of DataFrame
        The cis table indexed by ``('region', 'diag')``; when ``trans`` is
        truthy, a ``(cis, trans)`` pair whose second item is indexed by
        ``('chrom1', 'chrom2')``.
    """
    # The same mask object serves both rows and columns.
    masker = cooler_mask(cool, header=balance, thres=thres)
    fetch_matrix = cooler_matrix_generator(cool, header=balance)
    bin_size = cool.info['bin-size']

    arms = DNA_info.get_chromosome_arms(genome)
    cis_pairs = [(arm, arm) for arm in arms]

    cis_table = pd.concat(
        cis_binning(cis_pairs,
                    fetch_matrix,
                    masker,
                    masker,
                    bin_size,
                    ignore_diags,
                    mapper=mapper))
    cis_table = cis_table.reset_index().rename(columns={'region1': 'region'})
    # Each cis pair is (arm, arm), so the second region column is dropped.
    cis_table = cis_table.drop(columns='region2')
    cis_table = cis_table.set_index(['region', 'diag'], drop=True)

    if not trans:
        return cis_table

    print('Computing trans expected')
    chromsizes = bioframe.fetch_chromsizes(genome)

    # Every unordered pair of distinct chromosomes, in chromnames order.
    n_chroms = len(cool.chromnames)
    trans_pairs = []
    for i in range(n_chroms):
        for j in range(i + 1, n_chroms):
            first = bioframe.parse_region(cool.chromnames[i],
                                          chromsizes=chromsizes)
            second = bioframe.parse_region(cool.chromnames[j],
                                           chromsizes=chromsizes)
            trans_pairs.append((first, second))

    trans_frames = [
        frame
        for frame in trans_binning(trans_pairs,
                                   fetch_matrix,
                                   masker,
                                   masker,
                                   bin_size,
                                   mapper=mapper)
        if frame is not None
    ]
    trans_table = pd.concat(trans_frames)

    # Index entries are (region1, region2) pairs; keep only the first element
    # of each region as the chromosome label.
    trans_table['chrom1'] = trans_table.index.map(
        lambda pair: pair[0][0]).values
    trans_table['chrom2'] = trans_table.index.map(
        lambda pair: pair[1][0]).values
    trans_table = trans_table.set_index(['chrom1', 'chrom2'])

    return cis_table, trans_table
Пример #3
0
def make_diag_tables(clr, supports):
    """
    Build one diagonal table per support region.

    Parameters
    ----------
    clr : cooler-like object
        Must provide ``bins()``, ``extent()``, ``offset()`` and
        ``chromsizes``. The bin table is expected to have a ``weight``
        column; bins whose weight is null are treated as bad.
    supports : iterable
        Support regions. A string is parsed with ``bioframe.parse_region``.
        Otherwise an item must be a sequence of one of these shapes:

        - ``(chrom,)``: the whole chromosome against itself;
        - ``((chrom, start1, end1), (_, start2, end2))``: two intervals; the
          chromosome is taken from the first one only;
        - ``(chrom, start, end)``: one interval against itself;
        - ``(chrom, start1, end1, start2, end2)``: two intervals on ``chrom``.

    Returns
    -------
    dict
        Maps each (parsed) region to the value returned by
        ``make_diag_table`` for that region. Regions are used as dictionary
        keys, so they must be hashable.

    Raises
    ------
    ValueError
        If a region has a length other than 1, 2, 3 or 5.
    """
    diag_tables = {}
    for region in supports:
        if isinstance(region, str):
            region = bioframe.parse_region(region)

        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        # Bad bins of this chromosome: those with a missing weight.
        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins['weight'].isnull())

        # Convert genome-wide bin extents into chromosome-relative ones so
        # that they index into ``bad_mask``.
        chrom_offset = clr.offset(chrom)
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        lo1 -= chrom_offset
        hi1 -= chrom_offset
        lo2 -= chrom_offset
        hi2 -= chrom_offset

        diag_tables[region] = make_diag_table(
            bad_mask, [lo1, hi1], [lo2, hi2])
    return diag_tables
Пример #4
0
def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are `n + 1`
        edges. See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions: sequence of str or tuples
        List of genomic regions to include. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. When given, the track is
        subset to these regions and re-ordered to follow them.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'. The input
        dataframe is not modified.
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements count the left and right outliers.
        Missing values are not counted.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)

    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        grouped = track.groupby('chrom')
        track = pd.concat(
            bioframe.bedslice(grouped, chrom, st, end)
            for (chrom, st, end) in regions)

    # Quantize the signal. NaNs get an arbitrary code from np.digitize, so
    # they are overwritten with the dedicated missing-data code. The mask is
    # applied positionally, which stays correct even if the concatenated
    # track carries a non-unique index.
    codes = np.digitize(track[name].values, binedges, right=False)
    codes[np.asarray(track[name].isnull())] = -1

    digitized = track.copy()
    digitized[name + '.d'] = codes

    # Histogram every valid code, including both outlier codes (0 and n+1);
    # only the missing-data code (-1) is left out.
    hist = np.bincount(codes[codes >= 0], minlength=len(binedges) + 1)
    return digitized, hist
Пример #5
0
def exclude_regions(df, regions_to_keep=None, genome=None, print_final=False):
    """
    Keep only the rows of ``df`` whose region falls in ``regions_to_keep``.

    Parameters
    ----------
    df : DataFrame
        Table with a ``region`` column holding values accepted by
        ``bioframe.parse_region``. It is never modified.
    regions_to_keep : sequence, optional
        Regions to retain, each accepted by ``bioframe.parse_region``. When
        ``None`` or empty, ``df`` itself is returned unchanged.
    genome : object, optional
        Passed to ``bioframe.fetch_chromsizes``; required when
        ``regions_to_keep`` is non-empty.
    print_final : bool
        When true, print the unique regions of the returned table.

    Returns
    -------
    DataFrame
        ``df`` itself when there is nothing to filter; otherwise the
        concatenation of the ``bioframe.bedslice`` results, one per kept
        region, without the temporary ``chrom``/``start``/``end`` columns.

    Raises
    ------
    AssertionError
        If ``regions_to_keep`` is non-empty and ``genome`` is ``None``, or if
        ``df`` has no ``region`` column.
    """
    if regions_to_keep is None or len(regions_to_keep) == 0:
        if print_final:
            print(np.asarray(df.region.unique()))
        return df

    assert genome is not None, 'Please provide valid genome'
    chromsizes = bioframe.fetch_chromsizes(genome)

    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]

    assert 'region' in df.columns

    parsed = [bioframe.parse_region(reg, chromsizes) for reg in df['region']]

    # Work on a copy: the coordinate columns are only scaffolding for
    # ``bedslice`` and must not leak into the caller's dataframe.
    annotated = df.copy()
    annotated['chrom'] = [reg[0] for reg in parsed]
    annotated['start'] = [reg[1] for reg in parsed]
    annotated['end'] = [reg[2] for reg in parsed]

    kept = pd.concat([
        bioframe.bedslice(annotated, (chrom, start, end))
        for chrom, start, end in regions_to_keep
    ])

    if print_final:
        print(np.asarray(kept.region.unique()))

    return kept.drop(columns=['chrom', 'start', 'end'])
Пример #6
0
def make_diag_tables(clr, supports):
    """
    Build one diagonal table per support region.

    Bad bins are taken from the ``weight`` column of the bin table (null
    weight means bad). When the bin table has no ``weight`` column, no bin is
    considered bad.

    Parameters
    ----------
    clr : cooler-like object
        Must provide ``bins()``, ``extent()``, ``offset()`` and
        ``chromsizes``.
    supports : iterable
        Support regions. A string is parsed with ``bioframe.parse_region``.
        Otherwise an item must be a sequence of one of these shapes:

        - ``(chrom,)``: the whole chromosome against itself;
        - ``((chrom, start1, end1), (_, start2, end2))``: two intervals; the
          chromosome is taken from the first one only;
        - ``(chrom, start, end)``: one interval against itself;
        - ``(chrom, start1, end1, start2, end2)``: two intervals on ``chrom``.

    Returns
    -------
    dict
        Maps each (parsed) region to the value returned by
        ``make_diag_table`` for that region. Regions are used as dictionary
        keys, so they must be hashable.

    Raises
    ------
    ValueError
        If a region has a length other than 1, 2, 3 or 5.
    """
    bins = clr.bins()[:]
    if 'weight' in bins.columns:
        weights_by_chrom = dict(iter(bins.groupby('chrom')['weight']))
        bad_bin_dict = {
            chrom: np.array(weights.isnull())
            for chrom, weights in weights_by_chrom.items()
        }
    else:
        # No balancing weights available: treat every bin as valid.
        sizes = dict(bins.groupby('chrom').size())
        bad_bin_dict = {
            chrom: np.zeros(n_bins, dtype=bool)
            for chrom, n_bins in sizes.items()
        }

    diag_tables = {}
    for region in supports:
        if isinstance(region, str):
            region = bioframe.parse_region(region)

        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        # Convert genome-wide bin extents into chromosome-relative ones so
        # that they index into the per-chromosome bad-bin mask.
        chrom_offset = clr.offset(chrom)
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        lo1 -= chrom_offset
        hi1 -= chrom_offset
        lo2 -= chrom_offset
        hi2 -= chrom_offset

        bad_mask = bad_bin_dict[chrom]
        diag_tables[region] = make_diag_table(
            bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
Пример #7
0
        def _fetch_trans_oe(reg1, reg2):
            """Return the fetched matrix of a region pair divided by its expected.

            Both arguments are normalised with ``bioframe.parse_region``. The
            block fetched from ``clr.matrix(balance=weight_name)`` for the
            parsed pair is divided by ``_fetch_trans_exp`` evaluated on the
            first element of each parsed region.
            """
            parsed1 = bioframe.parse_region(reg1)
            parsed2 = bioframe.parse_region(reg2)

            observed = clr.matrix(balance=weight_name).fetch(parsed1, parsed2)
            expected = _fetch_trans_exp(parsed1[0], parsed2[0])
            return observed / expected
Пример #8
0
def make_saddle(getmatrix,
                binedges,
                digitized,
                contact_type,
                regions=None,
                min_diag=3,
                max_diag=-1,
                trim_outliers=False,
                verbose=False):
    """
    Accumulate interaction sums and counts between digitized-track categories.

    Every support (a pair of regions) is handed to ``_accumulate`` together
    with the per-region digitized tracks; the two accumulated matrices are
    then symmetrised by adding their transposes.

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two chromosomes
        given their names/indices. It is forwarded to ``_accumulate``.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        ``'cis'`` pairs every region with itself; ``'trans'`` uses every
        combination of two regions.
    regions : sequence of str or tuple, optional
        Genomic regions to use, each parsed with ``bioframe.parse_region``.
        By default one region per chromosome of the digitized dataframe,
        spanning its smallest start to its largest end.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrices.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``contact_type`` is neither ``'cis'`` nor ``'trans'``.
    """
    track_df, value_col = digitized

    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
    else:
        regions = [
            (chrom, chrom_df.start.min(), chrom_df.end.max())
            for chrom, chrom_df in track_df.groupby('chrom')
        ]

    # Slice the digitized value column once per region.
    digitized_tracks = {}
    for reg in regions:
        chrom, start, end = reg[0], reg[1], reg[2]
        sliced = bioframe.bedslice(track_df.groupby('chrom'), chrom, start, end)
        digitized_tracks[reg] = sliced[value_col]

    if contact_type == 'cis':
        supports = [(reg, reg) for reg in regions]
    elif contact_type == 'trans':
        supports = list(combinations(regions, 2))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # Two extra categories hold the open-ended bins for values below the
    # lowest edge and above the highest edge.
    n_categories = len(binedges) + 1
    shape = (n_categories, n_categories)
    interaction_sum = np.zeros(shape)
    interaction_count = np.zeros(shape)

    for first, second in supports:
        _accumulate(interaction_sum, interaction_count, getmatrix,
                    digitized_tracks, first, second, min_diag, max_diag,
                    verbose)

    # Symmetrise both matrices.
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        inner = slice(1, -1)
        interaction_sum = interaction_sum[inner, inner]
        interaction_count = interaction_count[inner, inner]

    return interaction_sum, interaction_count
Пример #9
0
def cooler_cis_eig(
        clr,
        bins,
        regions=None,
        n_eigs=3,
        phasing_track_col='GC',
        balance='weight',
        ignore_diags=None,
        clip_percentile=99.9,
        sort_metric=None):
    """
    Run ``cis_eig`` on every region of a cooler and collect the results.

    Parameters
    ----------
    clr : cooler-like object
        Must provide ``chromsizes`` and ``matrix(balance=...)``.
    bins : DataFrame
        Bin table with a ``chrom`` column and, when ``phasing_track_col`` is
        set, a column of that name.
    regions : sequence, optional
        Regions to process, each parsed with ``bioframe.parse_region``
        against ``clr.chromsizes``. By default every chromosome found in
        ``bins`` is processed in full.
    n_eigs : int
        Number of eigenvectors; one ``E<i>`` column is added per eigenvector.
    phasing_track_col : str or None
        Name of the ``bins`` column passed to ``cis_eig`` as the phasing
        track; a falsy value disables phasing.
    balance : str
        Forwarded to ``clr.matrix(balance=...)``.
    ignore_diags : int, optional
        Forwarded to ``cis_eig``. When ``None`` it is read from the
        ``ignore_diags`` attribute of ``bins/weight`` (default 2).
    clip_percentile, sort_metric
        Forwarded to ``cis_eig``.

    Returns
    -------
    eigvals : DataFrame
        One row per region (index named ``region``) with columns
        ``eigval1..eigval<n_eigs>``.
    eigvec_table : DataFrame
        Copy of ``bins`` with columns ``E1..E<n_eigs>`` filled for the bins
        of each processed region and NaN elsewhere.

    Raises
    ------
    ValueError
        If ``regions`` is ``None`` and ``bins`` mentions chromosomes missing
        from the cooler, or if ``phasing_track_col`` is set but absent from
        ``bins``.
    """
    if regions is None:
        # Consistency check: every chromosome of the bin table must exist
        # in the cooler before whole-chromosome regions can be built.
        chroms = bins['chrom'].unique()
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes]
        if len(chroms_not_in_clr) > 0:
            raise ValueError(
                'The following chromosomes are found in the bin table, but not '
                'in the cooler: '+str(chroms_not_in_clr)
            )
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # User-supplied regions must be normalised to (chrom, start, end)
        # tuples: the code below unpacks them as triples.
        regions = [
            bioframe.parse_region(r, chromsizes=clr.chromsizes)
            for r in regions]

    # The phasing column does not depend on the region, so check it once.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError('No column "{}" in the bin table'.format(
            phasing_track_col))

    ignore_diags = (
        clr._load_attrs('bins/weight').get('ignore_diags', 2)
        if ignore_diags is None
        else ignore_diags)

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table['E'+str(i+1)] = np.nan

    def _each(region):
        # Eigendecompose the contact matrix of a single region.
        A = clr.matrix(balance=balance).fetch(region)
        phasing_track = (
            bioframe.slice_bedframe(bins, region)[phasing_track_col].values
            if phasing_track_col else None)

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric)

        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # Write each region's eigenvectors into the matching rows of the table.
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        lo, hi = bioframe.bisect_bedframe(bins, region)
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.iloc[
                lo:hi,
                eigvec_table.columns.get_loc('E'+str(i+1))] = eigvec

    # Whole-chromosome regions are labelled by the chromosome name alone.
    region_strs = [
        (chrom
         if (start == 0 and end == clr.chromsizes[chrom])
         else '{}:{}-{}'.format(chrom, start, end)
         )
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=['eigval'+str(i+1) for i in range(n_eigs)],
    )

    eigvals.index.name = 'region'

    return eigvals, eigvec_table
Пример #10
0
def make_diag_tables(clr, supports, weight_name="weight", bad_bins=None):
    """
    For every support region infer diagonals that intersect this region
    and calculate the size of these intersections in pixels, both "total" and
    "n_valid", where "n_valid" does not include "bad" bins into counting.

    "Bad" pixels are inferred from the balancing weight column `weight_name` or
    provided directly in the form of an array `bad_bins`.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels per
    diagonal per support region.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    supports : list
        a list of genomic support regions. A string is parsed with
        ``bioframe.parse_region``; otherwise a region is a sequence of
        length 1 ``(chrom,)``, 2 ``((chrom, start1, end1), (_, start2,
        end2))``, 3 ``(chrom, start, end)`` or 5 ``(chrom, start1, end1,
        start2, end2)``.
    weight_name : str
        name of the weight vector in the "bins" table,
        if weight_name is None returns 0 for each block.
        Balancing weights are used to infer bad bins.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Overwrites inference of bad bins from balancing
        weight [to be implemented].

    Returns
    -------
    diag_tables : dict
        dictionary with DataFrames of relevant diagonals for every support.

    Raises
    ------
    NotImplementedError
        If ``bad_bins`` is provided.
    KeyError
        If ``weight_name`` is a string that is not a column of the bin table.
    ValueError
        If ``weight_name`` is neither a string nor ``None``, or if a region
        has a length other than 1, 2, 3 or 5.
    """

    if bad_bins is not None:
        raise NotImplementedError(
            "providing external list of bad bins is not implemented.")

    bins = clr.bins()[:]
    if weight_name is None:
        # ignore bad bins
        sizes = dict(bins.groupby("chrom").size())
        bad_bin_dict = {
            chrom: np.zeros(n_bins, dtype=bool)
            for chrom, n_bins in sizes.items()
        }
    elif isinstance(weight_name, str):
        # using balancing weight to infer bad bins
        if weight_name not in bins.columns:
            raise KeyError(f"Balancing weight {weight_name} not found!")
        groups = dict(iter(bins.groupby("chrom")[weight_name]))
        bad_bin_dict = {
            chrom: np.array(weights.isnull())
            for chrom, weights in groups.items()
        }
    else:
        raise ValueError("`weight_name` can be `str` or `None`")

    diag_tables = {}
    for region in supports:
        # parse region if str
        if isinstance(region, str):
            region = bioframe.parse_region(region)
        # unpack region(s) into chroms,starts,ends
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 2:
            chrom, start1, end1 = region[0]
            _, start2, end2 = region[1]
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError(
                "Regions must be sequences of length 1, 2, 3 or 5")

        # translate regions into relative bin id-s:
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        diag_tables[region] = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
Пример #11
0
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    clip_percentile=99.9,
    sort_metric=None,
):
    """
    Run ``cis_eig`` on every region of a cooler and collect the results.

    Parameters
    ----------
    clr : cooler-like object
        Must provide ``chromsizes`` and ``matrix(balance=...)``.
    bins : DataFrame
        Bin table with a ``chrom`` column and, when ``phasing_track_col`` is
        set, a column of that name.
    regions : sequence, optional
        Regions to process, each parsed with ``bioframe.parse_region``
        against ``clr.chromsizes``. By default every chromosome found in
        ``bins`` is processed in full.
    n_eigs : int
        Number of eigenvectors; one ``E<i>`` column is added per eigenvector.
    phasing_track_col : str or None
        Name of the ``bins`` column passed to ``cis_eig`` as the phasing
        track; a falsy value disables phasing.
    balance : str
        Forwarded to ``clr.matrix(balance=...)``.
    ignore_diags : int, optional
        Forwarded to ``cis_eig``. When ``None`` it is read from the
        ``ignore_diags`` attribute of ``bins/weight`` (default 2).
    clip_percentile, sort_metric
        Forwarded to ``cis_eig``.

    Returns
    -------
    eigvals : DataFrame
        One row per region (index named ``region``) with columns
        ``eigval1..eigval<n_eigs>``.
    eigvec_table : DataFrame
        Copy of ``bins`` with columns ``E1..E<n_eigs>`` filled for the bins
        of each processed region and NaN elsewhere.

    Raises
    ------
    ValueError
        If ``regions`` is ``None`` and ``bins`` mentions chromosomes missing
        from the cooler, or if ``phasing_track_col`` is set but absent from
        ``bins``.
    """
    if regions is None:
        # Consistency check: every chromosome of the bin table must exist
        # in the cooler before whole-chromosome regions can be built.
        chroms = bins["chrom"].unique()
        chroms_not_in_clr = [
            chrom for chrom in chroms if chrom not in clr.chromsizes
        ]
        if len(chroms_not_in_clr) > 0:
            raise ValueError(
                "The following chromosomes are found in the bin table, but not "
                "in the cooler: " + str(chroms_not_in_clr)
            )
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in chroms]
    else:
        # User-supplied regions must be normalised to (chrom, start, end)
        # tuples: the code below unpacks them as triples.
        regions = [
            bioframe.parse_region(r, chromsizes=clr.chromsizes) for r in regions
        ]

    # The phasing column does not depend on the region, so check it once.
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(
            'No column "{}" in the bin table'.format(phasing_track_col)
        )

    ignore_diags = (
        clr._load_attrs("bins/weight").get("ignore_diags", 2)
        if ignore_diags is None
        else ignore_diags
    )

    eigvec_table = bins.copy()
    for i in range(n_eigs):
        eigvec_table["E" + str(i + 1)] = np.nan

    def _each(region):
        # Eigendecompose the contact matrix of a single region.
        A = clr.matrix(balance=balance).fetch(region)
        phasing_track = (
            bioframe.slice_bedframe(bins, region)[phasing_track_col].values
            if phasing_track_col
            else None
        )

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return eigvals, eigvecs

    eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))

    # Write each region's eigenvectors into the matching rows of the table.
    for region, eigvecs in zip(regions, eigvecs_per_reg):
        idx = bioframe.select(bins, region).index
        for i, eigvec in enumerate(eigvecs):
            eigvec_table.loc[idx, "E" + str(i + 1)] = eigvec

    # Whole-chromosome regions are labelled by the chromosome name alone.
    region_strs = [
        (
            chrom
            if (start == 0 and end == clr.chromsizes[chrom])
            else "{}:{}-{}".format(chrom, start, end)
        )
        for chrom, start, end in regions
    ]

    eigvals = pd.DataFrame(
        index=region_strs,
        data=np.vstack(eigvals_per_reg),
        columns=["eigval" + str(i + 1) for i in range(n_eigs)],
    )

    eigvals.index.name = "region"

    return eigvals, eigvec_table