示例#1
0
def diagsum_asymm(clr, supports1, supports2, contact_type='cis',
                  transforms=None, chunksize=10000000, ignore_diags=2, map=map):
    """
    Diagonal summary statistics over paired support regions.

    The pixel table of ``clr`` is processed in chunks; every chunk is handed
    to ``_diagsum_asymm`` and the per-chunk sums are accumulated into one
    table per ``(support1, support2)`` pair.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports1 : sequence of genomic range tuples
        First member of each support pair. Must be indexable: chunk results
        refer to its elements by position.
    supports2 : sequence of genomic range tuples
        Second member of each support pair, zipped with ``supports1`` to
        build the diagonal tables.
    contact_type : str, optional
        Forwarded unchanged to the per-chunk worker ``_diagsum_asymm``;
        the accepted values are defined there.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics: the first
        ``ignore_diags`` rows of every "<field>.sum" column are set to NaN.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> dataframe of diagonal statistics
        The tables produced by ``make_diag_tables`` with one additional
        "<field>.sum" column for "count" and for every key of ``transforms``.

    """
    # The documented default (None) used to fail on `transforms.keys()`;
    # treat it as an empty set of transforms instead.
    if transforms is None:
        transforms = {}

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ['count'] + list(transforms.keys())
    agg_names = ['{}.sum'.format(field) for field in fields]
    areas = list(zip(supports1, supports2))
    dtables = make_diag_tables(clr, areas)

    # Start every accumulator column at zero.
    for dt in dtables.values():
        for agg_name in agg_names:
            dt[agg_name] = 0

    job = partial(_diagsum_asymm, clr, fields, transforms,
                  contact_type, supports1, supports2)
    for result in map(job, spans):
        for (i, j), agg in result.items():
            dt = dtables[supports1[i], supports2[j]]
            for field, agg_name in zip(fields, agg_names):
                # fill_value=0 keeps diagonals that are absent from this
                # chunk's result from turning the running sum into NaN.
                dt[agg_name] = dt[agg_name].add(agg[field], fill_value=0)

    if ignore_diags:
        for dt in dtables.values():
            for agg_name in agg_names:
                j = dt.columns.get_loc(agg_name)
                dt.iloc[:ignore_diags, j] = np.nan

    return dtables
示例#2
0
def blocksum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=1000000,
    map=map,
):
    """
    Summary statistics on rectangular blocks of genomic regions.

    Matching elements of `regions1` and `regions2` (same position) define
    the blocks. The pixel table is processed in chunks by
    ``_blocksum_asymm`` and the per-chunk sums are accumulated per block.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions of the blocks
    regions2 : sequence of genomic range tuples
        "right"-side support regions of the blocks
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        The dictionary is only read, never modified.
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per block.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Combines with the list of bad bins from balancing
        weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with one row per block and the columns region1, region2,
    n_valid, count.sum, followed by one "<name>.sum" column per transform.

    """

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    # Output column for every aggregated field. Derived from `fields` (not
    # from `transforms`) because `transforms` is replaced below by a dict
    # that also contains "count"; deriving the columns from that dict would
    # list "count.sum" twice in the returned frame.
    agg_names = ["{}.sum".format(field) for field in fields]

    # similar with diagonal summations, pre-generate a block_table listing
    # all of the rectangular blocks and "n_valid" number of pixels per each block:
    records = make_block_table(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a 0/1 mask with one entry per bin of clr:
        bad_bins_mask = np.ones(len(clr.bins()), dtype=int)
        bad_bins_mask[bad_bins] = 0

        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform; defaults bind the current
                # loop values so every lambda keeps its own transform:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count": mask the raw column itself
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # substitute transforms to the masked_transforms:
        transforms = masked_transforms

    job = partial(
        _blocksum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    for result in map(job, spans):
        for i, agg in result.items():
            n1 = regions1.loc[i, "name"]
            n2 = regions2.loc[i, "name"]
            for field, agg_name in zip(fields, agg_names):
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    records[n1, n2][agg_name] += s

    # returning a dataframe for API consistency:
    return pd.DataFrame(
        [{"region1": n1, "region2": n2, **rec} for (n1, n2), rec in records.items()],
        columns=["region1", "region2", "n_valid"] + agg_names,
    )
示例#3
0
def diagsum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    map=map,
):
    """
    Per-diagonal sums over asymmetric rectangular blocks.

    The i-th element of `regions1` is paired with the i-th element of
    `regions2`; each pair defines one block whose diagonals are summarized.
    Only intra-chromosomal blocks are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for diagonal summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for diagonal summation
    transforms : dict of str -> callable, optional
        Extra per-pixel quantities to sum. Each callable receives the
        current chunk of the (annotated) pixel dataframe and its result is
        stored in a temporary column named after the key.
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per diagonal.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        bins to ignore per support region, in addition to the bad bins
        derived from the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame obtained by stacking the diagonal tables of all blocks, with
    "region1" and "region2" columns prepended and one "<field>.sum" column
    for "count" and for every transform.

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    sum_columns = ["{}.sum".format(name) for name in fields]
    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    dtables = make_diag_tables(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    if bad_bins is not None:
        # 1 for usable bins, 0 for the explicitly excluded ones.
        keep = np.ones(len(clr.bins()), dtype=int)
        keep[bad_bins] = 0

        def _masked(value_of):
            # Wrap a pixel -> value callable so that pixels touching an
            # excluded bin contribute zero.
            def apply(pixels):
                return (
                    value_of(pixels)
                    * keep[pixels["bin1_id"]]
                    * keep[pixels["bin2_id"]]
                )

            return apply

        def _raw_column(name):
            # Plain column lookup, used for fields without a transform
            # (presumably only "count").
            return lambda pixels: pixels[name]

        masked = {}
        for name in fields:
            if name in transforms:
                masked[name] = _masked(transforms[name])
            else:
                masked[name] = _masked(_raw_column(name))
        transforms = masked

    # Zero-initialize the accumulator columns of every block table.
    for table in dtables.values():
        for column in sum_columns:
            table[column] = 0

    worker = partial(
        _diagsum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    for chunk_result in map(worker, spans):
        for idx, agg in chunk_result.items():
            key = (regions1.loc[idx, "name"], regions2.loc[idx, "name"])
            for name, column in zip(fields, sum_columns):
                dtables[key][column] = dtables[key][column].add(
                    agg[name], fill_value=0
                )

    # Flatten the dict of tables into a single frame for API consistency.
    frames = []
    for (name1, name2), table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region1", name1)
        flat.insert(1, "region2", name2)
        frames.append(flat)
    return pd.concat(frames).reset_index(drop=True)
示例#4
0
def diagsum(
    clr,
    regions,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    ignore_diags=2,
    map=map,
):
    """
    Per-diagonal sums within each support region.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions : sequence of genomic range tuples
        Support regions for intra-chromosomal diagonal summation
    transforms : dict of str -> callable, optional
        Extra per-pixel quantities to sum. Each callable receives the
        current chunk of the (annotated) pixel dataframe and its result is
        stored in a temporary column named after the key.
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per diagonal.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        bins to ignore per support region, in addition to the bad bins
        derived from the balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics; the sums in
        the first `ignore_diags` rows of every region table become NaN.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all regions: the per-region tables
    stacked on top of each other with a leading "region" column.

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    sum_columns = ["{}.sum".format(name) for name in fields]

    regions = bioframe.parse_regions(regions, clr.chromsizes)

    dtables = make_diag_tables(clr, regions, weight_name=weight_name, bad_bins=bad_bins)

    if bad_bins is not None:
        # Multiplicative mask over all bins: 0 where a bin is excluded.
        bin_ok = np.ones(len(clr.bins()), dtype=int)
        bin_ok[bad_bins] = 0

        def _mask(compute):
            # Zero out the computed value of pixels touching an excluded bin.
            def masked(pixels):
                return (
                    compute(pixels)
                    * bin_ok[pixels["bin1_id"]]
                    * bin_ok[pixels["bin2_id"]]
                )

            return masked

        def _select(column):
            # Fallback for fields without a transform (presumably "count").
            return lambda pixels: pixels[column]

        replacement = {}
        for name in fields:
            source = transforms[name] if name in transforms else _select(name)
            replacement[name] = _mask(source)
        transforms = replacement

    # Every accumulator column starts from zero.
    for table in dtables.values():
        for column in sum_columns:
            table[column] = 0

    worker = partial(_diagsum_symm, clr, fields, transforms, regions.values)
    for chunk_result in map(worker, spans):
        for idx, agg in chunk_result.items():
            table = dtables[regions.loc[idx, "name"]]
            for name, column in zip(fields, sum_columns):
                table[column] = table[column].add(agg[name], fill_value=0)

    if ignore_diags:
        for table in dtables.values():
            for column in sum_columns:
                position = table.columns.get_loc(column)
                table.iloc[:ignore_diags, position] = np.nan

    # Stack the per-region tables into one frame for API consistency.
    frames = []
    for region_name, table in dtables.items():
        flat = table.reset_index()
        flat.insert(0, "region", region_name)
        frames.append(flat)
    return pd.concat(frames).reset_index(drop=True)
def compute_expected(c,
                     binsize,
                     drop_diags,
                     chunksize,
                     map_impl=map,
                     regions=None,
                     smooth_factor=None):
    """
    Build a table of balanced-signal averages keyed by (chrom, diag).

    Parameters
    ----------
    c : cooler.Cooler
        Source of the bin table (``c.bins()``) and pixel table
        (``c.pixels()``). The bin table must have a 'weight' column.
    binsize : int
        Divisor turning bin start coordinates into the 'diag' index.
    drop_diags : int
        Rows with ``diag < drop_diags`` get NaN 'average' and 'std'.
    chunksize : int
        Size of pixel table chunks to process.
    map_impl : callable, optional
        Map functor implementation.
    regions : DataFrame, optional
        Rows with 'chrom', 'start' and 'end'; when given, bins are grouped
        by slicing each region out of the bin table instead of grouping by
        whole chromosome.
    smooth_factor : optional
        When not None, 'balanced' and 'balanced2' are passed through
        ``logsmooth`` per chromosome before averaging.

    Returns
    -------
    DataFrame with 'chrom' and 'diag' plus the accumulated columns
    ('balanced', 'bad', 'total' and whatever ``_accum_by_cisdiag``
    contributes — 'balanced2' is expected among them), and the derived
    'average' and 'std' columns.

    """
    bins = c.bins()[:]
    by_chrom = bins.groupby('chrom', sort=False)

    if regions is None:
        groups = [chunk for _, chunk in by_chrom]
    else:
        groups = [
            bedslice(by_chrom, row['chrom'], row['start'], row['end'])
            for _, row in regions.iterrows()
        ]

    group_sizes = [len(grp) for grp in groups]
    group_bad_bins = [where(np.isnan(grp['weight'].values)) for grp in groups]

    # Skeleton table: one row per bin, counters filled group by group.
    ex = bins[['chrom']].copy()
    ex['diag'] = bins['start'] // binsize
    ex['balanced'] = 0
    bad_counts = map_impl(count_bad_pixels_per_diag, group_sizes,
                          group_bad_bins)
    ex['bad'] = list(concat(bad_counts))
    total_counts = map_impl(count_all_pixels_per_diag, group_sizes)
    ex['total'] = list(concat(total_counts))

    # Accumulate the pixel table chunk by chunk, then merge the chunks.
    spans = partition(0, len(c.pixels()), chunksize)
    accumulate = partial(_accum_by_cisdiag, c, bins)
    combined = pandas.concat(map_impl(accumulate, spans),
                             axis=0,
                             ignore_index=True)
    combined = combined.groupby(['chrom', 'diag']).sum()

    ex = (ex.set_index(['chrom', 'diag'])
            .add(combined, fill_value=0)
            .reset_index())

    if smooth_factor is not None:
        def _smoother(column):
            return lambda _chrom, grp: logsmooth(grp[column], smooth_factor)

        # 'balanced' first, then 'balanced2' — each sees the table as left
        # by the previous assignment.
        for column in ('balanced', 'balanced2'):
            ex[column] = apply_by_chrom(ex, _smoother(column))

    # Mean and spread over the valid (non-bad) elements.
    n_valid = ex['total'] - ex['bad']
    ex['average'] = ex['balanced'] / n_valid
    ex['std'] = np.sqrt(ex['balanced2'] / n_valid - ex['average']**2)

    # Blank out the diagonals the caller asked to drop.
    dropped = ex['diag'] < drop_diags
    ex.loc[dropped, 'average'] = np.nan
    ex.loc[dropped, 'std'] = np.nan

    return ex
示例#6
0
def blocksum_pairwise(clr,
                      supports,
                      transforms=None,
                      weight_name="weight",
                      bad_bins=None,
                      chunksize=1000000,
                      map=map):
    """
    Summary statistics on inter-chromosomal rectangular blocks.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports : sequence of genomic range tuples
        Support regions for summation. Blocks for all pairs of support regions
        will be used.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per block.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Overwrites inference of bad bins from balancing
        weight [to be implemented].
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> (field name -> summary)
        One record per unordered pair of supports, holding "n_valid" and a
        "<field>.sum" entry for every field that received a non-NaN sum.
        Empty when fewer than two supports are given.

    """
    # The documented default (None) used to fail on `transforms.keys()`.
    if transforms is None:
        transforms = {}

    blocks = list(combinations(supports, 2))
    if not blocks:
        # Fewer than two supports: there is no pair to summarize, and
        # unpacking `zip(*blocks)` below would raise ValueError.
        return {}
    supports1, supports2 = zip(*blocks)
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())

    n_tot = count_all_pixels_per_block(clr, supports)
    n_bad = count_bad_pixels_per_block(clr,
                                       supports,
                                       weight_name=weight_name,
                                       bad_bins=bad_bins)
    # defaultdict(int) lets the "<field>.sum" entries start from 0 on
    # first accumulation.
    records = {block: defaultdict(int) for block in blocks}
    for block in blocks:
        records[block]["n_valid"] = n_tot[block] - n_bad[block]

    job = partial(_blocksum_asymm, clr, fields, transforms, supports1,
                  supports2)
    for result in map(job, spans):
        for (i, j), agg in result.items():
            for field in fields:
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    agg_name = "{}.sum".format(field)
                    records[supports1[i], supports2[j]][agg_name] += s

    return records
示例#7
0
def diagsum_asymm(clr,
                  supports1,
                  supports2,
                  contact_type="cis",
                  transforms=None,
                  weight_name="weight",
                  bad_bins=None,
                  chunksize=10000000,
                  ignore_diags=2,
                  map=map):
    """
    Diagonal summary statistics over paired support regions.

    The pixel table of ``clr`` is processed in chunks; every chunk is handed
    to ``_diagsum_asymm`` and the per-chunk sums are accumulated into one
    table per ``(support1, support2)`` pair.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    supports1 : sequence of genomic range tuples
        First member of each support pair. Must be indexable: chunk results
        refer to its elements by position.
    supports2 : sequence of genomic range tuples
        Second member of each support pair, zipped with ``supports1`` to
        build the diagonal tables.
    contact_type : str, optional
        Forwarded unchanged to the per-chunk worker ``_diagsum_asymm``;
        the accepted values are defined there.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    weight_name : str
        name of the balancing weight vector used to count
        "bad"(masked) pixels per diagonal.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Overwrites inference of bad bins from balancing
        weight [to be implemented].
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics: the first
        ``ignore_diags`` rows of every "<field>.sum" column are set to NaN.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> dataframe of diagonal statistics
        The tables produced by ``make_diag_tables`` with one additional
        "<field>.sum" column for "count" and for every key of ``transforms``.

    """
    # The documented default (None) used to fail on `transforms.keys()`;
    # treat it as an empty set of transforms instead.
    if transforms is None:
        transforms = {}

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    agg_names = ["{}.sum".format(field) for field in fields]
    areas = list(zip(supports1, supports2))
    dtables = make_diag_tables(clr,
                               areas,
                               weight_name=weight_name,
                               bad_bins=bad_bins)

    # Start every accumulator column at zero.
    for dt in dtables.values():
        for agg_name in agg_names:
            dt[agg_name] = 0

    job = partial(_diagsum_asymm, clr, fields, transforms, contact_type,
                  supports1, supports2)
    for result in map(job, spans):
        for (i, j), agg in result.items():
            dt = dtables[supports1[i], supports2[j]]
            for field, agg_name in zip(fields, agg_names):
                # fill_value=0 keeps diagonals that are absent from this
                # chunk's result from turning the running sum into NaN.
                dt[agg_name] = dt[agg_name].add(agg[field], fill_value=0)

    if ignore_diags:
        for dt in dtables.values():
            for agg_name in agg_names:
                j = dt.columns.get_loc(agg_name)
                dt.iloc[:ignore_diags, j] = np.nan

    return dtables
示例#8
0
def blocksum_pairwise(clr, supports, transforms=None, chunksize=1000000, map=map):
    """
    Summary statistics on inter-chromosomal rectangular blocks.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object. Its bin table must have a 'weight' column; bins whose
        weight is NaN are treated as bad.
    supports : sequence of genomic range tuples
        Support regions for summation. Blocks for all pairs of support regions
        will be used.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
        ``None`` (the default) means "no extra transforms".
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    dict of (support1, support2) -> (field name -> summary)
        One record per unordered pair of supports, holding "n_valid" (number
        of pixels in the block whose two bins both have a non-NaN weight)
        and a "<field>.sum" entry for every field that received a non-NaN
        sum. Empty when fewer than two supports are given.

    """
    # The documented default (None) used to fail on `transforms.keys()`.
    if transforms is None:
        transforms = {}

    blocks = list(combinations(supports, 2))
    if not blocks:
        # Fewer than two supports: there is no pair to summarize, and
        # unpacking `zip(*blocks)` below would raise ValueError.
        return {}
    supports1, supports2 = zip(*blocks)
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ['count'] + list(transforms.keys())

    # Number of bins with a usable (non-NaN) weight in every support region.
    n_good_bins = {}
    for region in supports:
        extent = clr.extent(region)
        n_bad_bins = np.sum(clr.bins()['weight']
                            .fetch(region)
                            .isnull()
                            .astype(int)
                            .values)
        n_good_bins[region] = (extent[1] - extent[0]) - n_bad_bins

    # A pixel is bad as soon as EITHER of its bins is bad, so the valid
    # pixels of a block are exactly the good-by-good combinations. (The
    # previous `total - bad1 * bad2` only discounted pixels whose bins were
    # both bad and therefore overestimated n_valid.)
    records = {block: defaultdict(int) for block in blocks}
    for c1, c2 in blocks:
        records[c1, c2]['n_valid'] = n_good_bins[c1] * n_good_bins[c2]

    job = partial(_blocksum_asymm, clr, fields, transforms, supports1, supports2)
    for result in map(job, spans):
        for (i, j), agg in result.items():
            for field in fields:
                # `.item()` replaces `np.asscalar`, which was removed from
                # NumPy (it was a thin alias of `.item()`).
                s = agg[field].item()
                # NaN partial sums are skipped so they cannot poison the total.
                if not np.isnan(s):
                    agg_name = '{}.sum'.format(field)
                    records[supports1[i], supports2[j]][agg_name] += s

    return records