Пример #1
0
def random_intersection(x, y, genome_fn, shuffle_kwargs, intersect_kwargs):
    z = x.shuffle(g=genome_fn, **shuffle_kwargs)
    zz = z.intersect(y, stream=True, **intersect_kwargs)
    result = len(zz)
    helpers.close_or_delete(z)
    helpers.close_or_delete(zz)
    return result
Пример #2
0
def random_intersection(x, y, genome_fn, shuffle_kwargs, intersect_kwargs):
    z = x.shuffle(g=genome_fn, **shuffle_kwargs)
    zz = z.intersect(y, stream=True, **intersect_kwargs)
    result = len(zz)
    helpers.close_or_delete(z)
    helpers.close_or_delete(zz)
    return result
Пример #3
0
def random_jaccard(x, y, genome_fn, shuffle_kwargs=None, intersect_kwargs=None):
    """
    Shuffles `x`, using the chromsizes in `genome_fn` and any additional
    `shuffle_kwargs`, then call `jaccard()` with any additional
    `intersect_kwargs`.
    """
    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    z = x.shuffle(g=genome_fn, **shuffle_kwargs)
    result = jaccard(z, y, intersect_kwargs)
    helpers.close_or_delete(z)
    return result
Пример #4
0
def random_jaccard(x,
                   y,
                   genome_fn,
                   shuffle_kwargs=None,
                   intersect_kwargs=None):
    """
    Shuffles `x`, using the chromsizes in `genome_fn` and any additional
    `shuffle_kwargs`, then call `jaccard()` with any additional
    `intersect_kwargs`.
    """
    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    z = x.shuffle(g=genome_fn, **shuffle_kwargs)
    result = jaccard(z, y, intersect_kwargs)
    helpers.close_or_delete(z)
    return result
Пример #5
0
def jaccard(x, y, intersect_kwargs=None):
    """
    Returns the naive Jaccard statistic (intersection over union; both in bp)
    for BedTools `x` and `y`.

    Optionally specify `intersect_kwargs` to adjust how the intersection is
    calculated.

    See comments in source for details on multiprocessing support.
    """
    # Note: This function may run many thousands of times, so we need to be
    # really careful about cleaning up after each operation.
    #
    # It seems that IntervalFile leaks file handles when iterating -- say, when
    # we try to do sum(len(i) for i in bedtool). A workaround is to use an
    # IntervalIterator which cleans up when done.  IntervalIterators are used
    # automatically when specifying using stream=True, so we create a streaming
    # BedTool just before computing the total bp.
    #
    if intersect_kwargs is None:
        intersect_kwargs = {}

    # Streaming so we can iterate over features and then clean up
    intersection = x.intersect(y, stream=True, **intersect_kwargs)

    # To get the union with current BEDTools programs, we first cat the files
    # then do a merge.
    #
    # BedTool.cat() is a little more sophisticated, but returns a file-based
    # BedTool. So a streaming cat is implemented here:
    tmp = pybedtools.BedTool._tmp()
    z = open(tmp, "w")
    z.write(open(x.fn).read())
    z.write(open(y.fn).read())
    z.close()
    catted = pybedtools.BedTool(tmp)

    # Merge now requires a sort.
    sortd = catted.sort()

    # Streaming so we can iterate over features and then clean up
    union = sortd.merge(stream=True)

    # This is the part that leads to too many open file handles when working
    # with large numbers of files at once (10,000+).  But iterating over
    # a streaming BedTool works fine.
    numerator = sum(len(i) for i in intersection)
    denominator = sum(len(i) for i in union)

    # Be obsessive about deleting tempfiles created and filehandles opened.
    helpers.close_or_delete(intersection)
    helpers.close_or_delete(sortd)
    helpers.close_or_delete(union)
    helpers.close_or_delete(catted)

    return float(numerator) / denominator
Пример #6
0
def jaccard(x, y, intersect_kwargs=None):
    """
    Returns the naive Jaccard statistic (intersection over union; both in bp)
    for BedTools `x` and `y`.

    Optionally specify `intersect_kwargs` to adjust how the intersection is
    calculated.

    See comments in source for details on multiprocessing support.
    """
    # Note: This function may run many thousands of times, so we need to be
    # really careful about cleaning up after each operation.
    #
    # It seems that IntervalFile leaks file handles when iterating -- say, when
    # we try to do sum(len(i) for i in bedtool). A workaround is to use an
    # IntervalIterator which cleans up when done.  IntervalIterators are used
    # automatically when specifying using stream=True, so we create a streaming
    # BedTool just before computing the total bp.
    #
    if intersect_kwargs is None:
        intersect_kwargs = {}

    # Streaming so we can iterate over features and then clean up
    intersection = x.intersect(y, stream=True, **intersect_kwargs)

    # To get the union with current BEDTools programs, we first cat the files
    # then do a merge.
    #
    # BedTool.cat() is a little more sophisticated, but returns a file-based
    # BedTool. So a streaming cat is implemented here:
    tmp = pybedtools.BedTool._tmp()
    z = open(tmp, 'w')
    z.write(open(x.fn).read())
    z.write(open(y.fn).read())
    z.close()
    catted = pybedtools.BedTool(tmp)

    # Merge now requires a sort.
    sortd = catted.sort()

    # Streaming so we can iterate over features and then clean up
    union = sortd.merge(stream=True)

    # This is the part that leads to too many open file handles when working
    # with large numbers of files at once (10,000+).  But iterating over
    # a streaming BedTool works fine.
    numerator = sum(len(i) for i in intersection)
    denominator = sum(len(i) for i in union)

    # Be obsessive about deleting tempfiles created and filehandles opened.
    helpers.close_or_delete(intersection)
    helpers.close_or_delete(sortd)
    helpers.close_or_delete(union)
    helpers.close_or_delete(catted)

    return float(numerator) / denominator
Пример #7
0
def _parallel_wrap(orig_bedtool, shuffle_kwargs, genome_fn, method,
                   method_args, method_kwargs, sort=False, shuffle=True,
                   reduce_func=None):
    """
    Given a BedTool object `orig_bedtool`, call its `method` with `args` and
    `kwargs` and then call `reduce_func` on the results.

    See parallel_apply docstring for details

    """

    # note: be careful about cleaning up tempfiles
    if not shuffle:
        to_use = orig_bedtool
    else:
        shuffled = orig_bedtool.shuffle(g=genome_fn, **shuffle_kwargs)
        if sort:
            to_use = shuffled.sort()
            helpers.close_or_delete(shuffled)
        else:
            to_use = shuffled

    result = getattr(to_use, method)(*method_args, **method_kwargs)

    if shuffle:
        helpers.close_or_delete(to_use)

    if reduce_func:
        reduced = reduce_func(result)
        if isinstance(result, pybedtools.BedTool):
            helpers.close_or_delete(result)
        return reduced
    else:
        return result
Пример #8
0
def _parallel_wrap(orig_bedtool,
                   shuffle_kwargs,
                   genome_fn,
                   method,
                   method_args,
                   method_kwargs,
                   sort=False,
                   shuffle=True,
                   reduce_func=None):
    """
    Given a BedTool object `orig_bedtool`, call its `method` with `args` and
    `kwargs` and then call `reduce_func` on the results.

    See parallel_apply docstring for details

    """

    # note: be careful about cleaning up tempfiles
    if not shuffle:
        to_use = orig_bedtool
    else:
        shuffled = orig_bedtool.shuffle(g=genome_fn, **shuffle_kwargs)
        if sort:
            to_use = shuffled.sort()
            helpers.close_or_delete(shuffled)
        else:
            to_use = shuffled

    result = getattr(to_use, method)(*method_args, **method_kwargs)

    if shuffle:
        helpers.close_or_delete(to_use)

    if reduce_func:
        reduced = reduce_func(result)
        if isinstance(result, pybedtools.BedTool):
            helpers.close_or_delete(result)
        return reduced
    else:
        return result
Пример #9
0
def random_jaccard(x, y, genome_fn, shuffle_kwargs, jaccard_kwargs):
    z = x.shuffle(g=genome_fn, **shuffle_kwargs).sort()
    result = z.jaccard(y, **jaccard_kwargs)
    helpers.close_or_delete(z)
    return result