Пример #1
0
def digitize_track(binedges, track, regions=None):
    """
    Digitize genomic signal tracks into integers between `1` and `n`.

    Parameters
    ----------
    binedges : 1D array (length n + 1)
        Bin edges for quantization of signal. For `n` bins, there are `n + 1`
        edges. See encoding details in Notes.
    track : tuple of (DataFrame, str)
        bedGraph-like dataframe along with the name of the value column.
    regions: sequence of str or tuples
        List of genomic regions to include. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. When given, the track is
        subset to these regions and re-ordered to follow them.

    Returns
    -------
    digitized : DataFrame
        New bedGraph-like dataframe with value column and an additional
        digitized value column with name suffixed by '.d'
    hist : 1D array (length n + 2)
        Histogram of digitized signal values. Its length is `n + 2` because
        the first and last elements correspond to outliers. Missing values
        (code `-1`) are not counted. See notes.

    Raises
    ------
    ValueError
        If ``track`` is not a tuple.

    Notes
    -----
    The digital encoding is as follows:

    - `1..n` <-> values assigned to histogram bins
    - `0` <-> left outlier values
    - `n+1` <-> right outlier values
    - `-1` <-> missing data (NaNs)

    """
    if not isinstance(track, tuple):
        raise ValueError(
            "``track`` should be a tuple of (dataframe, column_name)")
    track, name = track

    # subset and re-order chromosome groups
    if regions is not None:
        regions = [bioframe.parse_region(reg) for reg in regions]
        grouped = track.groupby('chrom')
        track = pd.concat(
            bioframe.bedslice(grouped, chrom, st, end)
            for (chrom, st, end) in regions)

    # Quantize the signal: np.digitize yields 0 for values below the first
    # edge and len(binedges) for values at or above the last edge.
    codes = np.digitize(track[name].values, binedges, right=False)
    # NaNs are given an arbitrary code by np.digitize; mark them explicitly.
    codes[track[name].isnull().values] = -1

    digitized = track.copy()
    digitized[name + '.d'] = codes

    # Histogram every valid code, including both outlier codes (0 and n + 1).
    # Only the missing-data code (-1) is excluded, since np.bincount does not
    # accept negative values.
    hist = np.bincount(codes[codes >= 0], minlength=len(binedges) + 1)
    return digitized, hist
Пример #2
0
def bedpeslice(df, chrom, start, end):
    """
    Slice a BEDPE-like dataframe by a genomic region on both anchors.

    Each anchor (columns suffixed ``1`` and ``2``) is viewed as a BED-like
    frame with columns ``chrom``, ``start``, ``end`` and passed through
    ``bioframe.bedslice``. A row is kept only if both of its anchors are
    selected by that call.

    Parameters
    ----------
    df : DataFrame
        Frame with columns ``chrom1, start1, end1, chrom2, start2, end2``.
    chrom, start, end
        Region forwarded unchanged to ``bioframe.bedslice``.

    Returns
    -------
    DataFrame
        Copy of the rows of ``df`` whose two anchors were both selected.
    """
    row_labels = df.index.values

    def _anchor_selected(suffix):
        # Strip the trailing '1'/'2' so the anchor looks like a plain BED frame.
        anchor_cols = ['chrom' + suffix, 'start' + suffix, 'end' + suffix]
        anchor = df[anchor_cols].rename(columns=lambda col: col[0:-1])
        hits = bioframe.bedslice(anchor.groupby('chrom'), chrom, start, end)
        # Translate the surviving index labels back into a row mask over df.
        return np.isin(row_labels, hits.index.values)

    left_selected = _anchor_selected('1')
    right_selected = _anchor_selected('2')

    return df.loc[left_selected & right_selected].copy()
Пример #3
0
def sort_by_eigenvalue(lams, vectors):
    """
    Re-order eigenvalues and eigenvector columns by decreasing |eigenvalue|.

    For every region in ``lams`` the three eigenvalues are sorted by
    descending absolute value, and the ``E1``/``E2``/``E3`` columns of the
    matching rows of ``vectors`` are permuted the same way. If any eigenvalue
    of a region is NaN, that region is left in its original order.

    Parameters
    ----------
    lams : DataFrame
        One row per region with three eigenvalue columns. The index holds the
        region names: either a bare chromosome name or a ``chrom:start-end``
        string.
    vectors : DataFrame
        Frame with a ``chrom`` column and ``E1``, ``E2``, ``E3`` columns.
        It is not modified.

    Returns
    -------
    sorted_lams : DataFrame
        Sorted eigenvalues, same columns as ``lams``, indexed by ``region``.
    sorted_vectors : DataFrame
        Rows of ``vectors`` with permuted eigenvector columns, followed by
        the unchanged rows of chromosomes that no region covered; exact
        duplicate rows are dropped.
    """
    eig_cols = ['E1', 'E2', 'E3']
    lam_list = []
    vector_list = []
    seen_chroms = set()

    for reg, lambdas in lams.iterrows():
        if fnmatch.fnmatch(reg, '*:*-*'):
            # Split on the LAST ':' and the first '-' after it, so chromosome
            # names that themselves contain '-' or ':' are parsed correctly.
            chrom, _, span = reg.rpartition(':')
            start_txt, _, end_txt = span.partition('-')
            # Copy so the column assignment below never writes into a slice.
            region_vector = bedslice(vectors.groupby('chrom'), chrom,
                                     int(start_txt), int(end_txt)).copy()
        else:
            region_vector = vectors[vectors.chrom == reg].copy(deep=True)

        lam_values = lambdas.values
        if np.any(np.isnan(lam_values)):
            # Cannot rank undefined eigenvalues; keep the original order.
            order = np.array([0, 1, 2])
        else:
            order = np.argsort(-np.abs(lam_values))

        region_vector[eig_cols] = region_vector[eig_cols].values[:, order]

        lam_list.append(lam_values[order])
        vector_list.append(region_vector)
        seen_chroms.update(region_vector.chrom.unique())

    # Carry over, untouched, chromosomes that no region contributed rows for.
    for chrom in vectors.chrom.unique():
        if chrom not in seen_chroms:
            vector_list.append(vectors[vectors.chrom == chrom].copy(deep=True))

    sorted_lams = pd.DataFrame(
        data=np.concatenate(tuple(lam_list)).reshape(-1, 3),
        columns=lams.columns)
    sorted_lams['region'] = lams.index
    sorted_lams.set_index('region', inplace=True)
    sorted_vectors = pd.concat(vector_list).drop_duplicates()

    return sorted_lams, sorted_vectors
Пример #4
0
    def _mask_fetcher(region):
        """
        Build a 0/1 mask over the bins of ``region`` that carry label ``val``.

        ``cool``, ``vector`` and ``val`` come from the enclosing scope.
        Bins fully contained in an interval of ``vector`` inherit that
        interval's ``label``; the mask marks bins whose label equals ``val``
        and whose ``weight`` is not NaN.

        Parameters
        ----------
        region : sequence
            ``(chrom, start, end)``; indexed positionally.

        Returns
        -------
        ndarray
            Integer 0/1 array with one entry per bin, or an all-zero float
            array when the chromosome is absent from ``vector`` or the slice
            of ``vector`` for this region is empty.
        """
        bins = cool.bins().fetch(region)
        bins['label'] = np.nan
        n_bins = len(bins)

        chrom = region[0]
        if chrom not in vector.chrom.unique():
            return np.zeros(n_bins)

        region_eig = bedslice(vector, chrom, region[1], region[2])
        if len(region_eig) == 0:
            return np.zeros(n_bins)

        # Paint every bin that lies entirely inside a labelled interval.
        for _, interval in region_eig.iterrows():
            inside = (bins.start >= interval.start) & (bins.end <= interval.end)
            bins.loc[bins.index[inside], 'label'] = interval.label

        wanted = (bins.label == val) & ~np.isnan(bins.weight)
        mask = wanted.values.astype(int)

        # Sanity checks: selected bins have a weight and the requested label.
        chosen = bins[mask.astype(bool)]
        assert not np.any(np.isnan(chosen['weight']))
        assert np.all(chosen['label'] == val)

        return mask
Пример #5
0
def exclude_regions(df, regions_to_keep=(), genome=None, print_final=False):
    """
    Keep only the rows of ``df`` whose ``region`` falls in the given regions.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``region`` column of region strings. It is not modified.
    regions_to_keep : sequence of str or tuple, optional
        Regions to retain, each parsed with ``bioframe.parse_region``. When
        empty, ``df`` itself is returned unchanged.
    genome : str, optional
        Genome name passed to ``bioframe.fetch_chromsizes``. Required when
        ``regions_to_keep`` is non-empty.
    print_final : bool, optional
        If True, print the unique regions present in the result.

    Returns
    -------
    DataFrame
        ``df`` itself when there is nothing to filter by; otherwise a new
        frame made of the slices selected for each region to keep, in the
        order of ``regions_to_keep``, without helper coordinate columns.

    Raises
    ------
    AssertionError
        If regions are given without a genome, or ``df`` has no ``region``
        column.
    """
    if regions_to_keep is None or len(regions_to_keep) == 0:
        if print_final:
            print(np.asarray(df.region.unique()))
        return df

    # Kept as assertions so the exception type seen by callers is unchanged.
    assert genome is not None, 'Please provide valid genome'
    chromsizes = bioframe.fetch_chromsizes(genome)

    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]

    assert 'region' in df.columns

    # Parse every row's region into coordinates. Handle an empty frame
    # explicitly, since unpacking zip(*[]) would fail.
    parsed = [bioframe.parse_region(reg, chromsizes) for reg in df['region']]
    if parsed:
        chroms, starts, ends = (list(col) for col in zip(*parsed))
    else:
        chroms, starts, ends = [], [], []

    # Attach the coordinates to a copy so the caller's frame is not polluted
    # with temporary 'chrom'/'start'/'end' columns.
    with_coords = df.assign(chrom=chroms, start=starts, end=ends)

    # NOTE(review): bedslice is called here as (df, region_tuple); confirm
    # this matches the installed bioframe version's signature.
    kept = [
        bioframe.bedslice(with_coords, (chrom, start, end))
        for chrom, start, end in regions_to_keep
    ]
    new_df = pd.concat(kept)

    if print_final:
        print(np.asarray(new_df.region.unique()))

    return new_df.drop(columns=['chrom', 'start', 'end'])
Пример #6
0
def make_saddle(getmatrix,
                binedges,
                digitized,
                contact_type,
                regions=None,
                min_diag=3,
                max_diag=-1,
                trim_outliers=False,
                verbose=False):
    """
    Make a matrix of average interaction probabilities between genomic bin
    pairs as a function of a specified genomic track. The provided genomic
    track must be pre-quantized as integers (i.e. digitized).

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two chromosomes
        given their names/indices.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    regions : sequence of str or tuple, optional
        A list of genomic regions to use. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple. By default, one region
        per chromosome spanning the extent of the digitized track.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    Raises
    ------
    ValueError
        If ``contact_type`` is neither 'cis' nor 'trans'.

    """
    # Validate up front so a bad argument fails before any slicing work.
    if contact_type not in ('cis', 'trans'):
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    digitized_df, name = digitized

    # Group once and reuse for both region discovery and per-region slicing.
    by_chrom = digitized_df.groupby('chrom')

    if regions is None:
        regions = [(chrom, df.start.min(), df.end.max())
                   for chrom, df in by_chrom]
    else:
        regions = [bioframe.parse_region(reg) for reg in regions]

    digitized_tracks = {
        reg: bioframe.bedslice(by_chrom, reg[0], reg[1], reg[2])[name]
        for reg in regions
    }

    if contact_type == 'cis':
        # Each region is paired with itself.
        supports = list(zip(regions, regions))
    else:
        # Every unordered pair of distinct regions.
        supports = list(combinations(regions, 2))

    # n_bins here includes 2 open bins
    # for values <lo and >hi.
    n_bins = len(binedges) + 1
    interaction_sum = np.zeros((n_bins, n_bins))
    interaction_count = np.zeros((n_bins, n_bins))

    for reg1, reg2 in supports:
        _accumulate(interaction_sum, interaction_count, getmatrix,
                    digitized_tracks, reg1, reg2, min_diag, max_diag, verbose)

    # Symmetrize: add each matrix to its own transpose.
    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count
Пример #7
0
    print(name)
    os.makedirs(f'{savepath}{name}/100000/cis', exist_ok=True)
    os.makedirs(f'{savepath}{name}/100000/trans', exist_ok=True)
    c = row['cooler_100000']
    lams = row['lams_100000']
    vector = row['vectors_100000']

    #     supports = {'cis': DNA_info.get_chromosome_arms(genome),
    #                 'trans': [(chrom, 0, c.chromsizes[chrom])
    #                                    for chrom in c.chromnames[0:22]]}

    for region in DNA_info.get_chromosome_arms(genome):
        print(region)
        chrom, start, end = region
        mat = c.matrix(balance=True).fetch(region)
        vec = bioframe.bedslice(vector.groupby('chrom'), chrom, start, end)
        vec = vec['E1_cis'].values
        if np.all(np.isnan(vec)):
            continue
        if len(vec) == 0:
            continue
        S, C = saddleplot.construct_cis_saddleplot(mat, vec, num_percentile=20)

        np.save(f'{savepath}{name}/100000/cis/{chrom}:{start}-{end}.npy',
                np.dstack((S, C)))

    for i in np.arange(len(c.chromnames[0:22])):
        for j in np.arange(i + 1, len(c.chromnames[0:22])):

            chrom1 = c.chromnames[i]
            start1, end1 = 0, c.chromsizes[chrom1]