Example #1
def _diagsum_asymm(clr, fields, transforms, regions1, regions2, span):
    """
    Calculates diagonal summaries for a collection of
    rectangular regions defined as combinations of
    regions1 and regions2.
    Returns a dictionary of DataFrames with diagonal
    sums as values, and 0-based indexes of rectangular
    genomic regions as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    # this could be further expanded to allow for custom groupings:
    pixels["dist"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    diag_sums = {}
    # r1 and r2 define rectangular block i:
    for i, (r1, r2) in enumerate(zip(regions1, regions2)):
        r1 = assign_supports(pixels, [r1], suffix="1")
        r2 = assign_supports(pixels, [r2], suffix="2")
        # calculate diag_sums on the spot to allow for overlapping blocks:
        diag_sums[i] = pixels[(r1 == r2)].groupby("dist")[fields].sum()

    return diag_sums
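A minimal driver sketch for _diagsum_asymm, assuming a cooler file path, region tuples in the (chrom, start, end) form expected by assign_supports, and the usual balancing transform; all names here are illustrative, not part of the snippet above.

import cooler

clr = cooler.Cooler("test.cool")  # assumed file
transforms = {
    # standard balancing transform over annotated pixels
    "balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]
}
regions1 = [("chr1", 0, 50_000_000)]           # assumed regions
regions2 = [("chr1", 50_000_000, 100_000_000)]
span = (0, len(clr.pixels()))                  # a single chunk over all pixels

diag_sums = _diagsum_asymm(
    clr, ["count", "balanced"], transforms, regions1, regions2, span
)
print(diag_sums[0].head())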
Example #2
def getData3(cooler_matrix, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    c = cooler_matrix['cooler']

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]

    if not len(pixels):
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[['chrom', 'start', 'end', 'weight']][lo:hi]
    bins['chrom'] = bins['chrom'].cat.codes
    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start'] = cumul_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_end'] = cumul_lengths[pixels['chrom2']] + pixels['end2']
    pixels['balanced'] = pixels['count'] * \
        pixels['weight1'] * pixels['weight2']

    return pixels[['genome_start', 'genome_end', 'balanced']]
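absCoord2bin is not shown in these snippets; the version above takes only (c, pos), presumably closing over precomputed chromosome tables. A hypothetical standalone equivalent with those tables made explicit:

import numpy as np

def abs_coord_to_bin(c, abs_pos, cumul_lengths, chrom_names):
    """Hypothetical: map an absolute genomic coordinate to a global bin index,
    assuming cumul_lengths[i] is the genome-wide offset of chromosome i."""
    cid = np.searchsorted(cumul_lengths, abs_pos, side="right") - 1
    rel_pos = abs_pos - cumul_lengths[cid]
    # offset of the chromosome's first bin plus whole bins into the chromosome
    return c.offset(chrom_names[cid]) + int(rel_pos // c.binsize)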
Example #3
def getData3(fpath, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    t1 = time.time()
    f = h5py.File(fpath, 'r')
    c = cooler.Cooler(f[str(zoomLevel)])
    matrix = c.matrix(balance=True, as_pixels=True, join=True)
    cooler_matrix = {'cooler': c, 'matrix': matrix}
    c = cooler_matrix['cooler']

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]

    if not len(pixels):
        return pd.DataFrame(columns=['genome_start', 'genome_end', 'balanced'])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[['chrom', 'start', 'end', 'weight']][lo:hi]
    bins['chrom'] = bins['chrom'].cat.codes
    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start'] = cumul_lengths[pixels['chrom1']] + pixels['start1']
    pixels['genome_end'] = cumul_lengths[pixels['chrom2']] + pixels['end2']
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']

    return pixels[['genome_start', 'genome_end', 'balanced']]
Example #4
def getData3(cooler_matrix, zoomLevel, startPos1, endPos1, startPos2, endPos2):
    c = cooler_matrix["cooler"]

    i0 = absCoord2bin(c, startPos1)
    i1 = absCoord2bin(c, endPos1)
    j0 = absCoord2bin(c, startPos2)
    j1 = absCoord2bin(c, endPos2)

    if (i1 - i0) == 0 or (j1 - j0) == 0:
        return pd.DataFrame(columns=["genome_start", "genome_end", "balanced"])

    pixels = c.matrix(as_pixels=True, max_chunk=np.inf)[i0:i1, j0:j1]

    if not len(pixels):
        return pd.DataFrame(columns=["genome_start", "genome_end", "balanced"])

    lo = min(i0, j0)
    hi = max(i1, j1)
    bins = c.bins()[["chrom", "start", "end", "weight"]][lo:hi]
    bins["chrom"] = bins["chrom"].cat.codes
    pixels = cooler.annotate(pixels, bins)
    pixels["genome_start"] = cumul_lengths[pixels["chrom1"]] + pixels["start1"]
    pixels["genome_end"] = cumul_lengths[pixels["chrom2"]] + pixels["end2"]
    pixels["balanced"] = pixels["count"] * pixels["weight1"] * pixels["weight2"]

    return pixels[["genome_start", "genome_end", "balanced"]]
Example #5
def _accum_by_cisdiag(c, bins, span):
    """Sum properties along the diagonals of the intrachromosomal matrices"""
    lo, hi = span
    pixels = c.pixels()[lo:hi]

    # assign chroms and filter for cis records
    pixels = cooler.annotate(pixels, bins[['chrom', 'weight']], replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2].copy()

    # assign diagonal indices
    pixels = pixels.rename(columns={'chrom1': 'chrom'})
    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']

    # balance
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']
    pixels['balanced2'] = pixels['balanced'] * pixels['balanced']

    # group by diagonal and accumulate
    grouped = pixels.groupby(['chrom', 'diag'], sort=False)
    agg = grouped.aggregate({
        'balanced': 'sum',
        'balanced2': 'sum',
    })
    return agg.reset_index()
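Since _accum_by_cisdiag runs per pixel chunk, its outputs still need to be combined; the sums are additive across chunks, so a reduction sketch (on synthetic frames shaped like its output) is just a second groupby:

import pandas as pd

a = pd.DataFrame({"chrom": ["chr1"], "diag": [1], "balanced": [2.0], "balanced2": [4.0]})
b = pd.DataFrame({"chrom": ["chr1"], "diag": [1], "balanced": [3.0], "balanced2": [9.0]})

combined = (
    pd.concat([a, b])
    .groupby(["chrom", "diag"], sort=False)
    .sum()          # per-chunk sums add up
    .reset_index()
)
print(combined)     # chr1, diag 1: balanced=5.0, balanced2=13.0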
Example #6
def _blocksum_asymm(clr, fields, transforms, regions1, regions2, span):
    """
    Calculates block summaries for a collection of
    rectangular regions defined as combinations of
    regions1 and regions2.
    Returns a dictionary with block sums as values,
    and 0-based indexes of rectangular genomic regions
    as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    for field, t in transforms.items():
        pixels[field] = t(pixels)

    block_sums = {}
    # r1 and r2 define rectangular block i:
    for i, (r1, r2) in enumerate(zip(regions1, regions2)):
        r1 = assign_supports(pixels, [r1], suffix="1")
        r2 = assign_supports(pixels, [r2], suffix="2")
        # calculate sum on the spot to allow for overlapping blocks:
        block_sums[i] = pixels[(r1 == r2)][fields].sum()

    return block_sums
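_blocksum_asymm returns one Series of field sums per block and per pixel chunk; because sums are additive, per-chunk dictionaries can be reduced key-wise. A self-contained sketch with synthetic chunk outputs:

import pandas as pd
from collections import defaultdict

chunk_a = {0: pd.Series({"count": 10, "balanced": 2.5})}
chunk_b = {0: pd.Series({"count": 4, "balanced": 1.0})}

total = defaultdict(lambda: 0)
for chunk_sums in (chunk_a, chunk_b):
    for i, s in chunk_sums.items():
        total[i] = total[i] + s      # Series addition is element-wise
print(total[0])                      # count 14.0, balanced 3.5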
Example #7
def get_count_df(cool_mat):
    logger.debug(f"Creating counts dataframe for {cool_mat}")
    df = (
        annotate(cool_mat.pixels()[:], cool_mat.bins()[:]["chrom"])
        .eval("is_cis = (chrom1 == chrom2)")
        .pipe(get_distance, cool_mat.binsize)
        .set_index(["bin1_id", "bin2_id"])
        .sort_index()
    )
    num_pixels = len(df)
    logger.info(f"Read {num_pixels} pixels from {cool_mat}")
    return df
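get_distance is not defined in this snippet; a plausible, hypothetical version consistent with the way it is piped (the frame plus binsize) would add a genomic-distance column that is NaN for trans pixels:

import numpy as np

def get_distance(df, binsize):
    """Hypothetical helper: genomic separation in bp for cis pixels."""
    dist = (df["bin2_id"] - df["bin1_id"]) * binsize
    return df.assign(distance=np.where(df["is_cis"], dist, np.nan))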
Example #8
def get_data(f, zoom_level, start_pos_1, end_pos_1, start_pos_2, end_pos_2):
    """Get balanced pixel data.
 
    Args:
        f (File): File pointer to a .cool filer.
        zoom_level (int): Test.
        start_pos_1 (int): Test.
        end_pos_1 (int): Test.
        start_pos_2 (int): Test.
        end_pos_2 (int): Test.
 
    Returns:
        DataFrame: Annotated cooler pixels.
    """

    c = cooler.Cooler(f[str(zoom_level)])

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths,
                         chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths,
                         chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    pixels = c.matrix(as_pixels=True, balance=False,
                      max_chunk=np.inf)[i0:i1 + 1, j0:j1 + 1]

    if not len(pixels):
        return pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced'])

    if 'weight' in c.bins():
        bins = c.bins(convert_enum=False)[['chrom', 'start', 'end', 'weight']]
    else:
        bins = c.bins(convert_enum=False)[['chrom', 'start', 'end']]

    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start1'] = chrom_cum_lengths[
        pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[
        pixels['chrom2']] + pixels['start2']

    if 'weight' in c.bins():
        pixels['balanced'] = (pixels['count'] * pixels['weight1'] *
                              pixels['weight2'])
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    else:
        return pixels[['genome_start1', 'genome_start2', 'count']]
Example #9
def _diagsum_symm(clr, fields, transforms, supports, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)
    
    pixels = pixels[pixels['chrom1'] == pixels['chrom2']].copy()
    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']
    for field, t in transforms.items():
        pixels[field] = t(pixels)
    
    pixels['support'] = assign_supports(pixels, supports, suffix='1')
    
    pixel_groups = dict(iter(pixels.groupby('support')))
    return {int(i): group.groupby('diag')[fields].sum()
                  for i, group in pixel_groups.items()}
Example #10
def _blocksum_asymm(clr, fields, transforms, supports1, supports2, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    pixels = pixels[pixels["chrom1"] != pixels["chrom2"]].copy()
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels["support1"] = assign_supports(pixels, supports1, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports2, suffix="2")
    pixels = pixels.dropna()

    pixel_groups = dict(iter(pixels.groupby(["support1", "support2"])))
    return {(int(i), int(j)): group[fields].sum()
            for (i, j), group in pixel_groups.items()}
Example #11
def _diagsum_symm(clr, fields, transforms, supports, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    pixels["support1"] = assign_supports(pixels, supports, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports, suffix="2")
    pixels = pixels[pixels["support1"] == pixels["support2"]].copy()

    pixels["diag"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixelgroups = dict(iter(pixels.groupby("support1")))
    return {
        int(i): group.groupby("diag")[fields].sum() for i, group in pixelgroups.items()
    }
Example #12
def _diagsum_asymm(clr, fields, transforms, contact_type, supports1, supports2, span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)
    
    if contact_type == 'cis':
        pixels = pixels[pixels['chrom1'] == pixels['chrom2']].copy()
    elif contact_type == 'trans':
        pixels = pixels[pixels['chrom1'] != pixels['chrom2']].copy()

    pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']
    for field, t in transforms.items():
        pixels[field] = t(pixels)
    
    pixels['support1'] = assign_supports(pixels, supports1, suffix='1')
    pixels['support2'] = assign_supports(pixels, supports2, suffix='2')

    pixel_groups = dict(iter(pixels.groupby(['support1', 'support2'])))
    return {(int(i), int(j)): group.groupby('diag')[fields].sum()
                  for (i, j), group in pixel_groups.items()}
Example #13
def _diagsum_asymm(clr, fields, transforms, contact_type, supports1, supports2,
                   span):
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    if contact_type == "cis":
        pixels = pixels[pixels["chrom1"] == pixels["chrom2"]].copy()
    elif contact_type == "trans":
        pixels = pixels[pixels["chrom1"] != pixels["chrom2"]].copy()

    pixels["diag"] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    pixels["support1"] = assign_supports(pixels, supports1, suffix="1")
    pixels["support2"] = assign_supports(pixels, supports2, suffix="2")

    pixel_groups = dict(iter(pixels.groupby(["support1", "support2"])))
    return {(int(i), int(j)): group.groupby("diag")[fields].sum()
            for (i, j), group in pixel_groups.items()}
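These _diagsum_* workers each return, per pixel chunk, a dictionary of per-region frames of diagonal sums; a reduction sketch on synthetic chunk outputs adds the frames diagonal-wise with DataFrame.add(..., fill_value=0):

import pandas as pd

chunk_a = {(0, 0): pd.DataFrame({"count": [5, 2]}, index=pd.Index([0, 1], name="diag"))}
chunk_b = {(0, 0): pd.DataFrame({"count": [1, 3]}, index=pd.Index([1, 2], name="diag"))}

total = {}
for chunk in (chunk_a, chunk_b):
    for key, df in chunk.items():
        total[key] = df if key not in total else total[key].add(df, fill_value=0)
print(total[(0, 0)])  # diag 0 -> 5, diag 1 -> 3, diag 2 -> 3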
Example #14
def get_data(f,
             zoom_level,
             start_pos_1,
             end_pos_1,
             start_pos_2,
             end_pos_2,
             transform='default'):
    """Get balanced pixel data.
 
    Args:
        f (File): File pointer to a .cool filer.
        zoom_level (int): Test.
        start_pos_1 (int): Test.
        end_pos_1 (int): Test.
        start_pos_2 (int): Test.
        end_pos_2 (int): Test.
 
    Returns:
        DataFrame: Annotated cooler pixels.
    """

    c = cooler.Cooler(f[str(zoom_level)])

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths,
                         chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)
    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths,
                         chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    pixels = c.matrix(as_pixels=True, balance=False,
                      max_chunk=np.inf)[i0:i1 + 1, j0:j1 + 1]

    if not len(pixels):
        return pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced'])

    # select bin columns to extract
    cols = ['chrom', 'start', 'end']
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        cols.append('weight')
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)
    pixels['genome_start1'] = chrom_cum_lengths[
        pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[
        pixels['chrom2']] + pixels['start2']

    # apply transform
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        pixels['balanced'] = (pixels['count'] * pixels['weight1'] *
                              pixels['weight2'])
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        pixels['balanced'] = (pixels['count'] / pixels[transform + '1'] /
                              pixels[transform + '2'])
        return pixels[['genome_start1', 'genome_start2', 'balanced']]
    else:
        return pixels[['genome_start1', 'genome_start2', 'count']]
Example #15
def trans_expected(clr, chromosomes, chunksize=1000000, use_dask=False):
    """
    Aggregate the signal in interchromosomal blocks.
    Can be used as a background for contact frequencies between chromosomes.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    chromosomes : list of str
        List of chromosome names
    chunksize : int, optional
        Size of dask chunks
    use_dask : bool, optional
        option to use dask
    
    Returns
    -------
    pandas.DataFrame that stores the total number of
    interactions between each pair of chromosomes ('balanced.sum'),
    the corresponding number of bins involved
    in the inter-chromosomal interactions ('n_valid'),
    and the ratio 'balanced.avg' = balanced.sum/n_valid, which is
    the actual expected value for every interchromosomal pair.

    """
    def n_total_trans_elements(clr, chromosomes):
        n = len(chromosomes)
        x = [clr.extent(chrom)[1] - clr.extent(chrom)[0] 
                 for chrom in chromosomes]
        pairblock_list = []
        for i in range(n):
            for j in range(i + 1, n):
                # appending to the list of tuples
                pairblock_list.append((chromosomes[i],
                                       chromosomes[j],
                                       x[i] * x[j] ))
        return pd.DataFrame(pairblock_list, 
            columns=['chrom1', 'chrom2', 'n_total'])

    def n_bad_trans_elements(clr, chromosomes):
        # bad bins are ones with
        # the weight vector being NaN:
        x = [np.sum(clr.bins()['weight']
                       .fetch(chrom)
                       .isnull()
                       .astype(int)
                       .values)
                 for chrom in chromosomes]
        pairblock_list = []
        for i in range(len(x)):
            for j in range(i + 1, len(x)):
                # appending to the list of tuples
                pairblock_list.append((chromosomes[i],
                                       chromosomes[j],
                                       x[i] * x[j] ))
        return pd.DataFrame(pairblock_list,
            columns=['chrom1', 'chrom2', 'n_bad'])

    if use_dask:
        # pixels = daskify(clr.filename, clr.root + '/pixels', chunksize=chunksize)
        raise NotImplementedError("To be implemented once dask supports MultiIndex")
    else:
        pixels = clr.pixels()[:]
    # getting pixels that belong to trans-area,
    # defined by the list of chromosomes:
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[
        (pixels.chrom1.isin(chromosomes)) &
        (pixels.chrom2.isin(chromosomes)) &
        (pixels.chrom1 != pixels.chrom2)
    ]
    pixels['balanced'] = pixels['count'] * pixels['weight1'] * pixels['weight2']
    ntot = n_total_trans_elements(clr, chromosomes).groupby(['chrom1', 'chrom2'])['n_total'].sum()
    nbad = n_bad_trans_elements(clr, chromosomes).groupby(['chrom1', 'chrom2'])['n_bad'].sum()
    trans_area = ntot - nbad
    trans_area.name = 'n_valid'
    # processing with use_dask=True is different:
    if use_dask:
        # trans_sum = pixels.groupby(('chrom1', 'chrom2'))['balanced'].sum().compute()
        pass
    else:
        trans_sum = pixels.groupby(['chrom1', 'chrom2'])['balanced'].sum()
    # for consistency with the cis_expected function:
    trans_sum.name = trans_sum.name + '.sum'

    # returning a DataFrame with MultiIndex, that stores
    # pairs of 'balanced.sum' and 'n_valid' values for each
    # pair of chromosomes.
    dtable = pd.merge(
        trans_sum.to_frame(),
        trans_area.to_frame(),
        left_index=True,
        right_index=True)

    # the actual expected is balanced.sum/n_valid:
    dtable['balanced.avg'] = dtable['balanced.sum'] / dtable['n_valid']
    return dtable
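The returned table can then normalize raw trans contacts by their chromosome-pair expected; a minimal usage sketch, assuming clr is the same balanced cooler and that chromosomes named 'chr1' and 'chr2' exist in it:

dtable = trans_expected(clr, ['chr1', 'chr2'])
exp = dtable.loc[('chr1', 'chr2'), 'balanced.avg']    # expected per valid pixel
obs = clr.matrix(balance=True).fetch('chr1', 'chr2')  # balanced trans block
oe = obs / exp                                        # observed-over-expected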
Example #16
def get_data(f,
             start_pos_1,
             end_pos_1,
             start_pos_2,
             end_pos_2,
             transform='default',
             resolution=None):
    """Get balanced pixel data.

    Args:
        f: h5py.File
            An HDF5 Group that contains the cooler for this resolution
        start_pos_1 (int): Test.
        end_pos_1 (int): Test.
        start_pos_2 (int): Test.
        end_pos_2 (int): Test.

    Returns:
        DataFrame: Annotated cooler pixels.
    """

    c = cooler.Cooler(f)

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths,
                         chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)

    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths,
                         chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    matrix = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)

    if i0 >= matrix.shape[0] or j0 >= matrix.shape[1]:
        # query beyond the bounds of the matrix
        # return an empty matrix
        i0, i1, j0, j1 = 0, 0, 0, 0

        return (pd.DataFrame(
            columns=['genome_start1', 'genome_start2', 'balanced']),
                (pd.DataFrame({
                    'genome_start': [],
                    'genome_end': [],
                    'weight': []
                }),
                 pd.DataFrame({
                     'genome_start': [],
                     'genome_end': [],
                     'weight': []
                 })))
    else:
        # limit the range of the query to be within bounds
        i1 = min(i1, matrix.shape[0] - 1)
        j1 = min(j1, matrix.shape[1] - 1)

    pixels = matrix[i0:i1 + 1, j0:j1 + 1]
    '''
    if not len(pixels):
        return (pd.DataFrame(columns=['genome_start1', 'genome_start2', 'balanced']), (None, None))
    '''

    # select bin columns to extract
    cols = ['chrom', 'start', 'end']
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        cols.append('weight')
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
    pixels = cooler.annotate(pixels, bins)

    pixels['genome_start1'] = chrom_cum_lengths[
        pixels['chrom1']] + pixels['start1']
    pixels['genome_start2'] = chrom_cum_lengths[
        pixels['chrom2']] + pixels['start2']

    bins1 = bins[i0:i1 + 1]
    bins2 = bins[j0:j1 + 1]

    bins1['genome_start'] = chrom_cum_lengths[bins1['chrom']] + bins1['start']
    bins2['genome_start'] = chrom_cum_lengths[bins2['chrom']] + bins2['start']

    bins1['genome_end'] = chrom_cum_lengths[bins1['chrom']] + bins1['end']
    bins2['genome_end'] = chrom_cum_lengths[bins2['chrom']] + bins2['end']

    # apply transform
    if (transform == 'default'
            and 'weight' in c.bins()) or transform == 'weight':
        pixels['balanced'] = (pixels['count'] * pixels['weight1'] *
                              pixels['weight2'])

        return (pixels[['genome_start1', 'genome_start2',
                        'balanced']], (bins1, bins2))
    elif transform in ('KR', 'VC', 'VC_SQRT'):
        pixels['balanced'] = (pixels['count'] / pixels[transform + '1'] /
                              pixels[transform + '2'])

        bins1['weight'] = bins1[transform]
        bins2['weight'] = bins2[transform]

        return (pixels[['genome_start1', 'genome_start2',
                        'balanced']], (bins1, bins2))
    else:
        return (pixels[['genome_start1', 'genome_start2',
                        'count']], (None, None))
Example #17
def get_frag(c: cooler.api.Cooler,
             resolution: int,
             offsets: pd.core.series.Series,
             chrom1: str,
             start1: int,
             end1: int,
             chrom2: str,
             start2: int,
             end2: int,
             width: int = 22,
             height: int = -1,
             padding: int = 10,
             normalize: bool = True,
             balanced: bool = True,
             percentile: float = 100.0,
             ignore_diags: int = 0,
             no_normalize: bool = False) -> np.ndarray:
    """
    Retrieves a matrix fragment.

    Args:
        c:
            Cooler object.
        chrom1:
            Chromosome 1. E.g.: `1` or `chr1`.
        start1:
            First start position in base pairs relative to `chrom1`.
        end1:
            First end position in base pairs relative to `chrom1`.
        chrom2:
            Chromosome 2. E.g.: `1` or `chr1`.
        start2:
            Second start position in base pairs relative to `chrom2`.
        end2:
            Second end position in base pairs relative to `chrom2`.
        offsets:
            Pandas Series of chromosome offsets in bins.
        width:
            Width of the fragment in pixels.
        height:
            Height of the fragments in pixels. If `-1` `height` will equal
            `width`. Defaults to `-1`.
        padding: Percentage padding relative to the dimension of the fragment.
            E.g., 10 = 10% padding (5% per side). Defaults to `10`.
        normalize:
            If `True` the fragment will be normalized to [0, 1].
            Defaults to `True`.
        balanced:
            If `True` the fragment will be balanced using Cooler.
            Defaults to `True`.
        percentile:
            Percentile clip. E.g., For 99 the maximum will be
            capped at the 99-percentile. Defaults to `100.0`.
        ignore_diags:
            Number of diagonals to be ignored, i.e., set to 0.
            Defaults to `0`.
        no_normalize:
            If `true` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:
        np.ndarray: The extracted 2D matrix fragment.
    """

    if height == -1:
        height = width

    # Restrict padding to be [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we will fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1

    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentile padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not match the desired dimension we
    # increase the end bin. This can happen when the padding is not
    # divisible by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Maximum width / height is 512
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()
    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    # Finally, adjust to negative values.
    # Since relative bin IDs are adjusted by the start this will lead to a
    # white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(as_pixels=True, balance=False,
                    max_chunk=np.inf)[real_start_bin1:end_bin1,
                                      real_start_bin2:end_bin2]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values))
            diags_start_row = (rel_bin1[diags_start_idx] -
                               rel_bin2[diags_start_idx])
        except ValueError:
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)
    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            scaled_row = int(np.rint(diags_start_row / scale_x))

            idx = np.diag_indices(width)
            scaled_idx = (idx if scaled_row == 0 else
                          [idx[0][scaled_row:], idx[0][:-scaled_row]])

            for i in range(ignore_diags):

                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above diagonal
                    frag[((scaled_idx[0] - i)[off:],
                          (scaled_idx[1])[off:])] = -1

                    # Extra cutoff at the bottom right
                    frag[(range(
                        scaled_idx[0][-1] - i,
                        scaled_idx[0][-1] + 1 + dist_neg,
                    ),
                          range(scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg))] = -1

                    # Below diagonal
                    frag[((scaled_idx[0] + i)[:-i], (scaled_idx[1])[:-i])] = -1

            # Save the final selection of ignored cells for fast access
            # later and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0

        else:
            logger.warning(
                'Ignoring the diagonal is only supported for square features')

    # Cap by percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx is not None:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag
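The fragment assembly above scatters sparse pixel values into a flat buffer using row-major indices (rel_bin1 * abs_dim1 + rel_bin2; for the square fragments assumed there, the row stride coincides with the column count). A tiny self-contained demo of that pattern:

import numpy as np

n_rows, n_cols = 4, 4
rows = np.array([0, 1, 3])
cols = np.array([2, 1, 0])
vals = np.array([7.0, 5.0, 2.0], dtype=np.float32)

flat = np.zeros(n_rows * n_cols, dtype=np.float32)
flat[rows * n_cols + cols] = vals   # row-major flat index
mat = flat.reshape((n_rows, n_cols))
print(mat)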
Example #18
def insul_diamond(
    pixel_query,
    bins,
    window=10,
    ignore_diags=2,
    norm_by_median=True,
    clr_weight_name="weight",
):
    """
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    pixel_query : RangeQuery object <TODO:update description>
        A table of Hi-C interactions. Must follow the Cooler columnar format:
        bin1_id, bin2_id, count, balanced (optional).
    bins : pandas.DataFrame
        A table of bins, is used to determine the span of the matrix
        and the locations of bad bins.
    window : int
        The width (in bins) of the diamond window to calculate the insulation
        score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, a few first diagonals
        of the Hi-C map should be ignored due to contamination with Hi-C
        artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    clr_weight_name : str or None
        Name of balancing weight column from the cooler to use.
        Using raw unbalanced data is not supported for insulation.
    """
    lo_bin_id = bins.index.min()
    hi_bin_id = bins.index.max() + 1
    N = hi_bin_id - lo_bin_id
    sum_counts = np.zeros(N)
    sum_balanced = np.zeros(N)

    if clr_weight_name is None:
        # define n_pixels
        n_pixels = get_n_pixels(np.repeat(False, len(bins)),
                                window=window,
                                ignore_diags=ignore_diags)
    else:
        # calculate n_pixels
        n_pixels = get_n_pixels(
            bins[clr_weight_name].isnull().values,
            window=window,
            ignore_diags=ignore_diags,
        )
        # define transform - balanced and raw ('count') for now
        weight1 = clr_weight_name + "1"
        weight2 = clr_weight_name + "2"
        transform = lambda p: p["count"] * p[weight1] * p[weight2]

    for chunk_dict in pixel_query.read_chunked():
        chunk = pd.DataFrame(chunk_dict,
                             columns=["bin1_id", "bin2_id", "count"])
        diag_pixels = chunk[chunk.bin2_id - chunk.bin1_id <= (window - 1) * 2]

        if clr_weight_name:
            diag_pixels = cooler.annotate(diag_pixels, bins[[clr_weight_name]])
            diag_pixels["balanced"] = transform(diag_pixels)
            valid_pixel_mask = ~diag_pixels["balanced"].isnull().values

        i = diag_pixels.bin1_id.values - lo_bin_id
        j = diag_pixels.bin2_id.values - lo_bin_id

        for i_shift in range(0, window):
            for j_shift in range(0, window):
                if i_shift + j_shift < ignore_diags:
                    continue

                mask = ((i + i_shift == j - j_shift)
                        & (i + i_shift < N)
                        & (j - j_shift >= 0))

                sum_counts += np.bincount(i[mask] + i_shift,
                                          diag_pixels["count"].values[mask],
                                          minlength=N)

                if clr_weight_name:
                    sum_balanced += np.bincount(
                        i[mask & valid_pixel_mask] + i_shift,
                        diag_pixels["balanced"].values[mask
                                                       & valid_pixel_mask],
                        minlength=N,
                    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if clr_weight_name:
            score = sum_balanced / n_pixels
        else:
            score = sum_counts / n_pixels

        if norm_by_median:
            score /= np.nanmedian(score)

    return score, n_pixels, sum_balanced, sum_counts
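The nested i_shift/j_shift loop accumulates, for each output bin, the sum over a window x window block straddling the diagonal. A dense-matrix equivalent on toy data (illustrative only, not the library's code):

import numpy as np

M = np.arange(36, dtype=float).reshape(6, 6)
M = np.triu(M) + np.triu(M, 1).T    # symmetrize like a Hi-C map
window = 2

score = np.full(6, np.nan)
for k in range(window - 1, 6 - window + 1):
    # rows (k - window, k], columns [k, k + window); the real code also
    # skips the first `ignore_diags` separations inside this block
    score[k] = M[k - window + 1 : k + 1, k : k + window].sum()
print(score)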
Example #19
def dots(
    cool_path,
    expected_path,
    view,
    clr_weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap, considering only loci separated by no more
    than max_loci_separation.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also, chromosomes referred to in EXPECTED_PATH must be non-trivial,
    i.e., contain non-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region1/2', 'dist', 'n_valid', value_name. value_name is controlled using
    options. Header must be present in a file.

    """
    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path

    #### Generate viewframes ####
    # 1:cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = make_cooler_view(clr)

    # 2:view_df. Define global view for calling dots
    # use input "view" BED file or all chromosomes :
    if view is None:
        view_df = cooler_view_df
    else:
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    #### Read expected: ####
    expected_summary_cols = [
        expected_value_col,
    ]
    expected = read_expected_from_file(
        expected_path,
        contact_type="cis",
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )
    # add checks to make sure cis-expected is symmetric

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering would deal with base-pair units for now, so suppress this for now
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = api.dotfinder.recommend_kernel_params(binsize)
        logging.info(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        if not w > p:
            raise ValueError(
                f"Wrong inner/outer kernel parameters w={w}, p={p}")
        logging.info(f"Using kernel parameters w={w}, p={p} provided by user")

    # once kernel parameters are set up, check max_nans_tolerated
    # to make sure kernel footprints overlapping 1 side with the
    # NaN-filled row/column are not "allowed"
    # this requires dynamic adjustment for the "shrinking donut"
    if not max_nans_tolerated <= 2 * w:
        raise ValueError("Too many NaNs allowed!")
    # may lead to scoring the same pixel twice, i.e., duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: api.dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        api.dotfinder.heatmap_tiles_generator_diag(clr, view_df, w,
                                                   tile_size_bins,
                                                   loci_separation_bins))

    # lambda-chunking edges ...
    if not 40 <= num_lambda_chunks <= 50:
        raise ValueError("Incompatible num_lambda_chunks")
    base = 2**(1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = api.dotfinder.scoring_and_histogramming_step(
        clr,
        expected.set_index(["region1", "region2", "dist"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        logging.info("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = api.dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = api.dotfinder.scoring_and_extraction_step(
        clr,
        expected.set_index(["region1", "region2", "dist"]),
        expected_value_col,
        clr_weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        logging.info(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        logging.info("preparing to extract needed q-values ...")

    filtered_pixels_qvals = api.dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done using annotated DataFrame of filtered pixels
    # why ? - because - clustering has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               view_df)
    # consider resetting index here
    centroids = api.dotfinder.clustering_step(
        filtered_pixels_annotated,
        view_df["name"],
        dots_clustering_radius,
        verbose,
    )

    # 4b. filter by enrichment and qval
    postprocessed_calls = api.dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:

        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
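The lambda-chunk edges built above are geometrically spaced with ratio 2^(1/3) and padded with infinities on both sides; evaluating the same expression on its own shows the shape:

import numpy as np

num_lambda_chunks = 45
base = 2 ** (1 / 3)
ledges = np.concatenate((
    [-np.inf],
    np.logspace(0, num_lambda_chunks - 1, num=num_lambda_chunks, base=base),
    [np.inf],
))
print(ledges[:4])  # [-inf  1.  1.2599...  1.5874...]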
Example #20
def cis_expected(clr,
                 regions,
                 field="balanced",
                 chunksize=1000000,
                 use_dask=True,
                 ignore_diags=2):
    """
    Compute the mean signal along diagonals of one or more regional blocks of
    intra-chromosomal contact matrices. Typically used as a background model
    for contact frequencies on the same polymer chain.

    Parameters
    ----------
    clr : cooler.Cooler
        Input Cooler
    regions : iterable of genomic regions or pairs of regions
        Iterable of genomic region strings or 3-tuples, or 5-tuples for pairs
        of regions
    field : str, optional
        Which values of the contact matrix to aggregate. This is currently a
        no-op. *FIXME*
    chunksize : int, optional
        Size of dask chunks.

    Returns
    -------
    Dataframe of diagonal statistics, indexed by region and diagonal number

    """
    warnings.warn(
        "`cooltools.expected.cis_expected()` is deprecated in 0.3.2, will be removed subsequently. "
        "Use `cooltools.expected.diagsum()` and `cooltools.expected.diagsum_asymm()` instead.",
        category=FutureWarning,
        stacklevel=2,
    )

    def _bg2slice_frame(bg2, region1, region2):
        """
        Slice a dataframe with columns ['chrom1', 'start1', 'end1', 'chrom2',
        'start2', 'end2']. Assumes no proper nesting of intervals.

        [Warning] this function does not follow the same logic as
        cooler.matrix.fetch when start/end are at the edges of the bins.
        """
        chrom1, start1, end1 = region1
        chrom2, start2, end2 = region2
        if end1 is None:
            end1 = np.inf
        if end2 is None:
            end2 = np.inf
        out = bg2[(bg2["chrom1"] == chrom1)
                  & (bg2["start1"] >= start1)
                  & (bg2["end1"] < end1)
                  & (bg2["chrom2"] == chrom2)
                  & (bg2["start2"] >= start2)
                  & (bg2["end2"] < end2)]
        return out

    import dask.dataframe as dd
    from cooler.sandbox.dask import read_table

    if use_dask:
        pixels = read_table(clr.uri + "/pixels", chunksize=chunksize)
    else:
        pixels = clr.pixels()[:]
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2]

    named_regions = False
    if isinstance(regions, pd.DataFrame):
        named_regions = True
        chroms = regions["chrom"].values
        names = regions["name"].values
        regions = regions[["chrom", "start", "end"]].to_records(index=False)
    else:
        chroms = [region[0] for region in regions]
        names = chroms
    cis_maps = {chrom: pixels[pixels.chrom1 == chrom] for chrom in chroms}

    diag_tables = []
    data_sums = []

    for region in regions:
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError("Regions must be sequences of length 1, 3 or 5")

        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins["weight"].isnull())
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        dt = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])
        sel = _bg2slice_frame(cis_maps[chrom], (chrom, start1, end1),
                              (chrom, start2, end2)).copy()
        sel["diag"] = sel["bin2_id"] - sel["bin1_id"]
        sel["balanced"] = sel["count"] * sel["weight1"] * sel["weight2"]
        agg = _sum_diagonals(sel, field)
        diag_tables.append(dt)
        data_sums.append(agg)

    # run dask scheduler
    if len(data_sums) and isinstance(data_sums[0], dd.Series):
        data_sums = dd.compute(*data_sums)

    # append to tables
    for dt, agg in zip(diag_tables, data_sums):
        dt[agg.name] = 0
        dt[agg.name] = dt[agg.name].add(agg, fill_value=0)
        dt.iloc[:ignore_diags, dt.columns.get_loc(agg.name)] = np.nan

    # merge and return
    if named_regions:
        dtable = pd.concat(diag_tables,
                           keys=zip(names, chroms),
                           names=["name", "chrom"])
    else:
        dtable = pd.concat(diag_tables, keys=list(chroms), names=["chrom"])

    # the actual expected is balanced.sum/n_valid:
    dtable["balanced.avg"] = dtable["balanced.sum"] / dtable["n_valid"]
    return dtable
Example #21
def score_tile(tile_cij, clr, cis_exp, exp_v_name, bal_v_name, kernels,
               nans_tolerated, band_to_cover, verbose):
    """
    The main working function that, given a tile of a heatmap, applies kernels
    to perform convolution, calculating locally-adjusted expected, and then
    calculates a p-value for every meaningful pixel against these l.a.
    expected values.
    
    Parameters
    ----------
    tile_cij : tuple
        Tuple of 3: chromosome name, tile span row-wise, tile span column-wise: 
        (chrom, tile_i, tile_j), where tile_i = (start_i, end_i), and 
        tile_j = (start_j, end_j).
    clr : cooler
        Cooler object to use to extract Hi-C heatmap data.
    cis_exp : pandas.DataFrame
        DataFrame with 1 dimensional expected, indexed with 'chrom' and 'diag'.
    exp_v_name : str
        Name of a value column in expected DataFrame
    bal_v_name : str
        Name of a value column with balancing weights in a cooler.bins() 
        DataFrame. Typically 'weight'.
    kernels : dict
        A dictionary with keys being kernels names and values being ndarrays 
        representing those kernels.
    nans_tolerated : int
        Number of NaNs tolerated in a footprint of every kernel.
    band_to_cover : int
        Results would be stored only for pixels connecting loci closer than 
        'band_to_cover'.
    verbose : bool
        Enable verbose output.
        
    Returns
    -------
    res_df : pandas.DataFrame
        results: annotated pixels with calculated locally adjusted expected
        for every kernel, observed, precalculated p-values, and the number of
        NaNs in the footprint of every kernel, all in the form of an annotated
        pixel DataFrame for eligible pixels of a given tile.

    """
    # unpack tile's coordinates
    chrom, tilei, tilej = tile_cij
    origin = (tilei[0], tilej[0])

    # we have to do it for every tile, because
    # chrom is not known a priori (maybe move outside):
    lazy_exp = LazyToeplitz(cis_exp.loc[chrom][exp_v_name].values)
    
    # RAW observed matrix slice:
    observed = clr.matrix(balance=False)[slice(*tilei), slice(*tilej)]
    # expected as a rectangular tile :
    expected = lazy_exp[slice(*tilei), slice(*tilej)]
    # slice of balance_weight for row-span and column-span :
    bal_weight_i = clr.bins()[slice(*tilei)][bal_v_name].values
    bal_weight_j = clr.bins()[slice(*tilej)][bal_v_name].values
    
    # do the convolutions
    result = dotfinder.get_adjusted_expected_tile_some_nans(
        origin=origin,
        observed=observed,
        expected=expected,
        bal_weight=(bal_weight_i,bal_weight_j),
        kernels=kernels,
        verbose=verbose)

    # Post-processing filters
    # (1) exclude pixels that connect loci further than 'band_to_cover' apart: 
    is_inside_band = (result["bin1_id"] > (result["bin2_id"]-band_to_cover))

    # (2) identify pixels that pass number of NaNs compliance test for ALL kernels:
    does_comply_nans = np.all(
        result[["la_exp."+k+".nnans" for k in kernels]] < nans_tolerated, 
        axis=1)
    # so, selecting inside-band and nNaN-compliant results:
    # (maybe drop the index here???) ...
    res_df = result[is_inside_band & does_comply_nans].reset_index(drop=True)
    # do Poisson tests:
    get_pval = lambda la_exp : 1.0 - poisson.cdf(res_df["obs.raw"], la_exp)
    for k in kernels:
        res_df["la_exp."+k+".pval"] = get_pval( res_df["la_exp."+k+".value"] )
    
    # annotate and return
    return cooler.annotate(res_df.reset_index(drop=True), clr.bins()[:])
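The Poisson test at the end treats the locally adjusted expected as the rate parameter and computes the tail probability of the observed raw count; a standalone check of the same computation:

import numpy as np
from scipy.stats import poisson

obs = np.array([5, 12])          # raw observed counts
la_exp = np.array([2.0, 10.0])   # locally adjusted expected (Poisson lambda)

# P(X > obs) under rate la_exp, matching 1.0 - poisson.cdf(obs, la_exp)
pval = 1.0 - poisson.cdf(obs, la_exp)
print(pval)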
Example #22
def cis_expected(clr, regions, field='balanced', chunksize=1000000, 
                 use_dask=True, ignore_diags=2):
    """
    Compute the mean signal along diagonals of one or more regional blocks of
    intra-chromosomal contact matrices. Typically used as a background model 
    for contact frequencies on the same polymer chain. 

    Parameters
    ----------
    clr : cooler.Cooler
        Input Cooler
    regions : iterable of genomic regions or pairs of regions
        Iterable of genomic region strings or 3-tuples, or 5-tuples for pairs
        of regions
    field : str, optional
        Which values of the contact matrix to aggregate. This is currently a
        no-op. *FIXME*
    chunksize : int, optional
        Size of dask chunks.

    Returns
    -------
    Dataframe of diagonal statistics, indexed by region and diagonal number

    """
    if use_dask:
        pixels = daskify(clr.filename, clr.root + '/pixels', chunksize=chunksize)
    else:
        pixels = clr.pixels()[:]
    pixels = cooler.annotate(pixels, clr.bins(), replace=False)
    pixels = pixels[pixels.chrom1 == pixels.chrom2]

    named_regions = False
    if isinstance(regions, pd.DataFrame):
        named_regions = True
        chroms = regions['chrom'].values
        names = regions['name'].values
        regions = regions[['chrom', 'start', 'end']].to_records(index=False)
    else:
        chroms = [region[0] for region in regions]
        names = chroms
    cis_maps = {chrom: pixels[pixels.chrom1==chrom] for chrom in chroms}

    diag_tables = []
    data_sums = []

    for region in regions:
        if len(region) == 1:
            chrom, = region
            start1, end1 = 0, clr.chromsizes[chrom]
            start2, end2 = start1, end1
        elif len(region) == 3:
            chrom, start1, end1 = region
            start2, end2 = start1, end1
        elif len(region) == 5:
            chrom, start1, end1, start2, end2 = region
        else:
            raise ValueError("Regions must be sequences of length 1, 3 or 5")

        bins = clr.bins().fetch(chrom).reset_index(drop=True)
        bad_mask = np.array(bins['weight'].isnull())
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        dt = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])
        sel = bg2slice_frame(
            cis_maps[chrom], 
            (chrom, start1, end1), 
            (chrom, start2, end2)
        ).copy()
        sel['diag'] = sel['bin2_id'] - sel['bin1_id']
        sel['balanced'] = sel['count'] * sel['weight1'] * sel['weight2']
        agg = _sum_diagonals(sel, field)
        diag_tables.append(dt)
        data_sums.append(agg)

    # run dask scheduler
    if len(data_sums) and isinstance(data_sums[0], dd.Series):
        data_sums = dd.compute(*data_sums)

    # append to tables
    for dt, agg in zip(diag_tables, data_sums):
        dt[agg.name] = 0
        dt[agg.name] = dt[agg.name].add(agg, fill_value=0)
        dt.iloc[:ignore_diags, dt.columns.get_loc(agg.name)] = np.nan

    # merge and return
    if named_regions:
        dtable = pd.concat(
            diag_tables, 
            keys=zip(names, chroms), 
            names=['name', 'chrom'])
    else:
        dtable = pd.concat(
            diag_tables, 
            keys=list(chroms), 
            names=['chrom'])  

    # the actual expected is balanced.sum/n_valid:
    dtable['balanced.avg'] = dtable['balanced.sum'] / dtable['n_valid']
    return dtable
Example #23
def insul_diamond(pixel_query,
                  bins,
                  window=10,
                  ignore_diags=2,
                  norm_by_median=True):
    """
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    pixel_query : RangeQuery object <TODO:update description>
        A table of Hi-C interactions. Must follow the Cooler columnar format:
        bin1_id, bin2_id, count, balanced (optional).
    bins : pandas.DataFrame
        A table of bins, is used to determine the span of the matrix
        and the locations of bad bins.
    window : int
        The width (in bins) of the diamond window to calculate the insulation
        score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, a few first diagonals
        of the Hi-C map should be ignored due to contamination with Hi-C
        artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    """
    lo_bin_id = bins.index.min()
    hi_bin_id = bins.index.max() + 1
    N = hi_bin_id - lo_bin_id
    sum_counts = np.zeros(N)
    sum_balanced = np.zeros(N)

    n_pixels = get_n_pixels(bins.weight.isnull().values,
                            window=window,
                            ignore_diags=ignore_diags)

    for chunk_dict in pixel_query.read_chunked():
        chunk = pd.DataFrame(chunk_dict,
                             columns=["bin1_id", "bin2_id", "count"])
        diag_pixels = chunk[chunk.bin2_id - chunk.bin1_id <= (window - 1) * 2]

        diag_pixels = cooler.annotate(diag_pixels, bins[["weight"]])
        diag_pixels["balanced"] = (diag_pixels["count"] *
                                   diag_pixels["weight1"] *
                                   diag_pixels["weight2"])
        valid_pixel_mask = ~diag_pixels["balanced"].isnull().values

        i = diag_pixels.bin1_id.values - lo_bin_id
        j = diag_pixels.bin2_id.values - lo_bin_id

        for i_shift in range(0, window):
            for j_shift in range(0, window):
                if i_shift + j_shift < ignore_diags:
                    continue

                mask = ((i + i_shift == j - j_shift)
                        & (i + i_shift < N)
                        & (j - j_shift >= 0))

                sum_counts += np.bincount(i[mask] + i_shift,
                                          diag_pixels["count"].values[mask],
                                          minlength=N)

                sum_balanced += np.bincount(
                    i[mask & valid_pixel_mask] + i_shift,
                    diag_pixels["balanced"].values[mask & valid_pixel_mask],
                    minlength=N,
                )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        score = sum_balanced / n_pixels

        if norm_by_median:
            score /= np.nanmedian(score)

    return score, n_pixels, sum_balanced, sum_counts
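To make the diamond geometry concrete, here is a minimal dense-matrix sketch of the same statistic (the function name and the dense-input assumption are mine, not part of the snippet): for each bin k it averages balanced values over rows (k - window, k] and columns [k, k + window), skipping separations below ignore_diags. These are exactly the pixels that the bincount loop above accumulates sparsely.

import numpy as np

def insul_diamond_dense(mat, window=10, ignore_diags=2, norm_by_median=True):
    # mat: dense, symmetric, balanced matrix with NaN at bad bins
    N = mat.shape[0]
    score = np.full(N, np.nan)
    for k in range(N):
        # the sliding diamond: rows (k - window, k], columns [k, k + window)
        lo, hi = max(0, k - window + 1), min(N, k + window)
        block = mat[lo:k + 1, k:hi]
        ii, jj = np.indices(block.shape)
        sep = (jj + k) - (ii + lo)  # diagonal separation of each pixel
        valid = np.isfinite(block) & (sep >= ignore_diags)
        if valid.any():
            score[k] = block[valid].sum() / valid.sum()
    if norm_by_median:
        score = score / np.nanmedian(score)
    return score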
Exemplo n.º 24
0
def get_data(
    f,
    start_pos_1,
    end_pos_1,
    start_pos_2,
    end_pos_2,
    transform="default",
    resolution=None,
):
    """Get balanced pixel data.

    Args:
        f: h5py.File
            An HDF5 Group that contains the cooler for this resolution
        start_pos_1 (int): Test.
        end_pos_1 (int): Test.
        start_pos_2 (int): Test.
        end_pos_2 (int): Test.

    Returns:
        DataFrame: Annotated cooler pixels.
    """

    c = cooler.Cooler(f)

    (chroms, chrom_sizes,
     chrom_cum_lengths) = get_chromosome_names_cumul_lengths(c)

    i0 = abs_coord_2_bin(c, start_pos_1, chroms, chrom_cum_lengths,
                         chrom_sizes)
    i1 = abs_coord_2_bin(c, end_pos_1, chroms, chrom_cum_lengths, chrom_sizes)

    j0 = abs_coord_2_bin(c, start_pos_2, chroms, chrom_cum_lengths,
                         chrom_sizes)
    j1 = abs_coord_2_bin(c, end_pos_2, chroms, chrom_cum_lengths, chrom_sizes)

    matrix = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)

    if i0 >= matrix.shape[0] or j0 >= matrix.shape[1]:
        # query beyond the bounds of the matrix: return empty results
        return (
            pd.DataFrame(
                columns=["genome_start1", "genome_start2", "balanced"]),
            (
                pd.DataFrame({
                    "genome_start": [],
                    "genome_end": [],
                    "weight": []
                }),
                pd.DataFrame({
                    "genome_start": [],
                    "genome_end": [],
                    "weight": []
                }),
            ),
        )
    else:
        # limit the range of the query to be within bounds
        i1 = min(i1, matrix.shape[0] - 1)
        j1 = min(j1, matrix.shape[1] - 1)

    pixels = matrix[i0:i1 + 1, j0:j1 + 1]
    """
    if not len(pixels):
        return (pd.DataFrame(columns=['genome_start1', 'genome_start2', 'balanced']), (None, None))
    """

    # select bin columns to extract
    cols = ["chrom", "start", "end"]
    if (transform == "default"
            and "weight" in c.bins()) or transform == "weight":
        cols.append("weight")
    elif transform in ("KR", "VC", "VC_SQRT"):
        cols.append(transform)

    bins = c.bins(convert_enum=False)[cols]
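    # join bin annotations onto both pixel axes; cooler.annotate() appends
    # the bin columns with "1"/"2" suffixes for bin1_id/bin2_id respectively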
    pixels = cooler.annotate(pixels, bins)

    pixels["genome_start1"] = chrom_cum_lengths[
        pixels["chrom1"]] + pixels["start1"]
    pixels["genome_start2"] = chrom_cum_lengths[
        pixels["chrom2"]] + pixels["start2"]

    bins1 = bins[i0:i1 + 1]
    bins2 = bins[j0:j1 + 1]

    bins1["genome_start"] = chrom_cum_lengths[bins1["chrom"]] + bins1["start"]
    bins2["genome_start"] = chrom_cum_lengths[bins2["chrom"]] + bins2["start"]

    bins1["genome_end"] = chrom_cum_lengths[bins1["chrom"]] + bins1["end"]
    bins2["genome_end"] = chrom_cum_lengths[bins2["chrom"]] + bins2["end"]

    # apply transform
    if (transform == "default"
            and "weight" in c.bins()) or transform == "weight":
        pixels["balanced"] = pixels["count"] * pixels["weight1"] * pixels[
            "weight2"]

        return (pixels[["genome_start1", "genome_start2",
                        "balanced"]], (bins1, bins2))
    elif transform in ("KR", "VC", "VC_SQRT"):
        pixels["balanced"] = (pixels["count"] / pixels[transform + "1"] /
                              pixels[transform + "2"])

        bins1["weight"] = bins1[transform]
        bins2["weight"] = bins2[transform]

        return (pixels[["genome_start1", "genome_start2",
                        "balanced"]], (bins1, bins2))
    else:
        return (pixels[["genome_start1", "genome_start2",
                        "count"]], (None, None))
Exemplo n.º 25
0
def call_dots(
    cool_path,
    expected_path,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    output_scores,
    output_hists,
    output_calls,
    score_dump_mode,
    temp_dir,
    no_delete_temp,
):
    """
    Call dots on a Hi-C heatmap, considering only loci separated by no more
    than max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with the expected signal.

    Analysis will be performed for the chromosomes referred to in
    EXPECTED_PATH, and therefore these chromosomes must be a subset of the
    chromosomes referred to in COOL_PATH. The chromosomes referred to in
    EXPECTED_PATH must also be non-trivial, i.e., contain non-NaN signal.
    Thus, make sure to prune your EXPECTED_PATH before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'chrom', 'diag', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.

    """
    clr = cooler.Cooler(cool_path)

    expected_columns = ["chrom", "diag", "n_valid", expected_name]
    expected_index = ["chrom", "diag"]
    expected_dtypes = {
        "chrom": np.str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }
    expected = pd.read_table(
        expected_path,
        usecols=expected_columns,
        dtype=expected_dtypes,
        comment=None,
        verbose=verbose,
    )
    expected.set_index(expected_index, inplace=True)

    # Input validation
    # unique list of chroms mentioned in expected_path
    # do simple column-name validation for now
    get_exp_chroms = lambda df: df.index.get_level_values("chrom").unique()
    expected_chroms = get_exp_chroms(expected)
    if not set(expected_chroms).issubset(clr.chromnames):
        raise ValueError(
            "Chromosomes in {} must be subset of ".format(expected_path) +
            "chromosomes in cooler {}".format(cool_path))
    # check number of bins
    # compute # of bins by comparing matching indexes
    get_exp_bins = lambda df, ref_chroms: (df.index.get_level_values("chrom").
                                           isin(ref_chroms).sum())
    expected_bins = get_exp_bins(expected, expected_chroms)
    cool_bins = clr.bins()[:]["chrom"].isin(expected_chroms).sum()
    if expected_bins != cool_bins:
        raise ValueError(
            "Number of bins does not match: "
            "{} in {}, and {} in {} for chromosomes {}".format(
                expected_bins, expected_path, cool_bins, cool_path,
                expected_chroms))
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering deals with base-pair units for now, so suppress this conversion
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print("Using kernel parameters w={}, p={} recommended for binsize {}".
              format(w, p, binsize))
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, "Wrong inner/outer kernel parameters w={}, p={}".format(
            w, p)
        print("Using kernel parameters w={}, p={} provided by user".format(
            w, p))

    # Once the kernel parameters are set up, check max_nans_tolerated to make
    # sure kernel footprints overlapping a NaN-filled row/column on one side
    # are not "allowed"; this requires dynamic adjustment for the
    # "shrinking donut".
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # Otherwise this may lead to scoring the same pixel twice, i.e., duplicates.

    # generate standard kernels - consider providing custom ones
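    # Each kernel is assumed to be a square 0/1 footprint of side 2 * w + 1
    # with the inner neighborhood of half-width p around the scored pixel
    # excluded (hence the w > p sanity check above).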
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, expected_chroms, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2**(1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))
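    # With base = 2 ** (1 / 3), the finite edges are 1, 2^(1/3), 2^(2/3), 2, ...
    # i.e. successive lambda-chunk boundaries grow by a factor of ~1.26,
    # in the spirit of HiCCUPS-style geometric ("lambda") chunking.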

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        output_calls,
        nproc,
        verbose,
    )

    # 4. Post-processing
    if verbose:
        print("Begin post-processing of {} filtered pixels".format(
            len(filtered_pixels)))
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering must operate on the annotated DataFrame of filtered pixels,
    # because it is performed chromosome by chromosome!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_chroms,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if output_calls is not None:

        postprocessed_fname = op.join(op.dirname(output_calls),
                                      op.basename(output_calls) + ".postproc")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
Exemplo n.º 26
0
def call_dots(
    cool_path,
    expected_path,
    regions,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap, considering only loci separated by no more
    than max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The path to a tsv-like file with the expected cis signal.

    Analysis will be performed for the regions referred to in EXPECTED_PATH,
    and therefore these regions must be a subset of the chromosomes referred
    to in COOL_PATH. The regions referred to in EXPECTED_PATH must also be
    non-trivial, i.e., contain non-NaN signal. Thus, make sure to prune your
    EXPECTED_PATH before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'region', 'diag', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.

    """
    clr = cooler.Cooler(cool_path)

    # preliminary SCHEMA for cis-expected
    region_column_name = "region"
    expected_columns = [region_column_name, "diag", "n_valid", expected_name]
    expected_dtypes = {
        region_column_name: str,
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }

    try:
        expected = pd.read_table(
            expected_path,
            usecols=expected_columns,
            dtype=expected_dtypes,
            comment=None,
            verbose=verbose,
        )
    except ValueError as e:
        raise ValueError(
            "Input expected file does not match the schema:\n"
            "the tab-separated expected file must have a header as well"
        ) from e
    expected_index = [
        region_column_name,
        "diag",
    ]
    expected.set_index(expected_index, inplace=True)
    # end of SCHEMA for cis-expected

    # Optional reading region table provided by the user:
    if regions is None:
        try:
            uniq_regions = expected.index.get_level_values(
                region_column_name).unique()
            regions_table = bioframe.parse_regions(uniq_regions,
                                                   clr.chromsizes)
            regions_table["name"] = regions_table["chrom"]
        except ValueError as e:
            print(e)
            raise ValueError(
                "Cannot interpret regions from EXPECTED_PATH\n"
                "specify regions definitions using --regions option.")
    else:
        # Flexible reading of the regions table:
        regions_buf, names = util.sniff_for_header(regions)
        regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a BED-like file with columns chrom, start, end, and an optional name."
            )
        if regions_table.shape[1] == 4:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end",
                3: "name"
            })
            regions_table = bioframe.parse_regions(regions_table)
        else:
            regions_table = regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end"
            })
            regions_table = bioframe.parse_regions(regions_table)
        regions_table = regions_table[regions_table["chrom"].isin(
            clr.chromnames)].reset_index(drop=True)

    # Verify appropriate columns order (required for heatmap_tiles_generator_diag):
    regions_table = regions_table[["chrom", "start", "end", "name"]]

    # Input validation
    get_exp_regions = lambda df: df.index.get_level_values(region_column_name
                                                           ).unique()
    expected_regions = get_exp_regions(expected)

    # unique list of regions mentioned in expected_path
    # are also in regions table
    if not set(expected_regions).issubset(regions_table["name"]):
        raise ValueError(
            "Regions in {} must be a subset of ".format(expected_path) +
            ("regions in the regions table {}".format(regions)
             if regions is not None else "regions in the cooler"))

    # check number of bins per region in cooler and expected table
    # compute # of bins by comparing matching indexes
    try:
        for region_name, group in expected.reset_index().groupby(
                region_column_name):
            n_diags = group.shape[0]
            region = regions_table.set_index("name").loc[region_name]
            lo, hi = clr.extent(region)
            assert n_diags == (hi - lo)
    except AssertionError:
        raise ValueError("Region shape mismatch between expected and cooler. "
                         "Are they using the same resolution?")
    # All the checks have passed:
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # by now we have a usable region_table and expected for most scenarios

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]

    # clustering deals with base-pair units for now, so suppress this conversion
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, f"Wrong inner/outer kernel parameters w={w}, p={p}"
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # Once the kernel parameters are set up, check max_nans_tolerated to make
    # sure kernel footprints overlapping a NaN-filled row/column on one side
    # are not "allowed"; this requires dynamic adjustment for the
    # "shrinking donut".
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # Otherwise this may lead to scoring the same pixel twice, i.e., duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, regions_table, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2**(1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,
        ),
        [np.inf],
    ))

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering must operate on the annotated DataFrame of filtered pixels,
    # because it is performed independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               regions_table)
    # consider resetting the index here
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_regions,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final-postprocessed result
    if out_prefix is not None:

        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
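For reference, a minimal regions file accepted by the flexible reader above could look like this (tab-separated, three or four columns; the chromosome names and GRCh38 lengths are illustrative):

chr1	0	248956422	chr1
chr2	0	242193529	chr2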