Example #1
import os

import h5py
import numpy as np

import cooler


def test_roundtrip(f_hm, f_cool):
    # Fetch hg19 chromosome sizes for the autosomes and chrX
    chromsizes = cooler.read_chromsizes(
        "http://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes",
        name_patterns=(r"^chr[0-9]+$", r"chrX$"),
    )
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    # Load a dense contact matrix and ingest it into a new .cool file
    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly="hg19")

    # Read the file back and verify the round trip
    h5 = h5py.File(f_cool, "r")
    new_chromtable = cooler.api.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable["name"])

    new_bintable = cooler.api.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.api.info(h5)
    assert info["genome-assembly"] == "hg19"
    assert info["bin-type"] == "fixed"
    assert info["bin-size"] == binsize

    mat = cooler.api.matrix(h5, 0, 100, 0, 100, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.api.matrix(h5, 100, 200, 100, 200, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    h5.close()
    try:
        os.remove(f_cool)
    except OSError:
        pass
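
Here f_hm and f_cool are pytest fixtures defined elsewhere in the test module. A minimal sketch of what they might provide, assuming the heatmap path used in the later examples and a pytest-managed temporary output path (both names are assumptions):

import os.path as op

import pytest

testdir = op.dirname(__file__)


@pytest.fixture
def f_hm():
    # Assumed path to the bundled dense 2 Mb heatmap
    return op.join(testdir, "data", "IMR90-MboI-matrix.2000kb.npy")


@pytest.fixture
def f_cool(tmp_path):
    # Output .cool file in a per-test temporary directory
    return str(tmp_path / "test.cool")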
Example #2
import os

import h5py
import numpy as np

import cooler

# testdir and testfile_path are module-level paths defined elsewhere in the
# original test module.


def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    reader = cooler.io.DenseLoader(heatmap)
    cooler.io.create(testfile_path,
                     chromsizes,
                     bintable,
                     reader,
                     assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')
    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)
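
For reference, cooler.binnify returns a pandas DataFrame of fixed-width bins with chrom, start, and end columns, and the last bin of each chromosome is clipped to the chromosome length. A minimal sketch with a toy chromsizes Series:

import pandas as pd

import cooler

chromsizes = pd.Series({'chr1': 5000000})  # toy example: one 5 Mb chromosome
bins = cooler.binnify(chromsizes, 2000000)
print(bins)
#   chrom    start      end
# 0  chr1        0  2000000
# 1  chr1  2000000  4000000
# 2  chr1  4000000  5000000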
Example #3
import os

import h5py
import numpy as np
from six import iteritems

import cooler

# testdir and testfile_path are module-level paths defined elsewhere in the
# original test module.


def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    chroms, lengths = zip(*iteritems(chromsizes))

    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    with h5py.File(testfile_path, 'w') as h5:
        reader = cooler.io.DenseLoader(heatmap)
        cooler.io.create(h5, chroms, lengths, bintable, reader, assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')
    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat.toarray())

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat.toarray())
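
This older API returned sparse matrices, hence the .toarray() calls, while the balance=False queries in Example #1 received dense arrays. In current cooler releases the choice is explicit through the sparse flag; a minimal sketch, assuming an existing file at a hypothetical path:

import cooler

c = cooler.Cooler('test.cool')  # hypothetical path

# Dense selection: a numpy ndarray
dense = c.matrix(balance=False)[:100, :100]

# Sparse selection: a scipy.sparse.coo_matrix, cheaper for mostly-empty regions
sparse = c.matrix(balance=False, sparse=True)[:100, :100]
print(dense.shape, sparse.nnz)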
Example #4
from __future__ import division, print_function
import json

import numpy as np
import pandas as pd
import cooler
import h5py
import time

TILESIZE = 256

where = np.flatnonzero
chromsizes = cooler.read_chromsizes(
    "http://s3.amazonaws.com/pkerp/data/mm9/chromInfo.txt"
)  # defaults to reading chr#,X,Y,M
chromosomes = list(chromsizes.keys())
chromid_map = dict(zip(chromosomes, range(len(chromosomes))))
cumul_lengths = np.r_[0, np.cumsum(chromsizes)]


def absCoord2bin(c, pos):
    """Map an absolute genomic coordinate to a bin offset in Cooler c."""
    try:
        cid = where(cumul_lengths > pos)[0] - 1
    except IndexError:
        # Position lies beyond the end of the genome
        return c.info["nbins"]
    chrom = chromosomes[cid]
    relPos = pos - cumul_lengths[cid]
    return c.offset((chrom, relPos, chromsizes[chrom]))


def getData(FILEPATH, zoomLevel, startPos1, endPos1, startPos2, endPos2):
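
(The body of getData is truncated in the source.) A minimal usage sketch for absCoord2bin, reusing the module-level mm9 chromsizes loaded above and assuming a hypothetical cooler file:

c = cooler.Cooler('mm9.1000kb.cool')  # hypothetical path

# Map an absolute coordinate (bp from the start of the concatenated genome)
# to the offset of the bin that contains it
abs_pos = 250000000
print(absCoord2bin(c, abs_pos))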
Example #5
import argparse
import sys

import cooler


def main():
    parser = argparse.ArgumentParser(
        description="Output a genome segmentation of restriction fragments as a BED file.")
    parser.add_argument(
        "chromsizes",
        help="UCSC-like chromsizes file, with chromosomes in desired order",
        metavar="CHROMSIZES_PATH")
    parser.add_argument(
        "binsize",
        help="Resolution (bin size) in base pairs <int>",
        metavar="BINSIZE")
    parser.add_argument(
        "--out", "-o",
        help="Output file (defaults to stdout)")
    args = vars(parser.parse_args())

    binsize = int(args['binsize'])
    chromsizes = cooler.read_chromsizes(args['chromsizes'])
    bins = cooler.binnify(chromsizes, binsize)

    # Write output: stream to stdout unless an output path was given
    out = args['out']
    f = sys.stdout if out is None else open(out, 'wt')
    try:
        bins.to_csv(f, sep='\t', index=False, header=False)
    finally:
        if f is not sys.stdout:
            f.close()
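
Run as, e.g., python make_bins.py hg19.chrom.sizes 2000000 -o bins.bed (the script name here is hypothetical); the output is a headerless, tab-separated, three-column table (chrom, start, end), i.e. BED3.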

import argparse
import os.path as op
import random
import time
from multiprocessing import Pool

import h5py

import cooler
from cooler import ice  # ICE balancing lived at cooler.ice in older releases


def main():
    parser = argparse.ArgumentParser(
        description="python matrix_storage_benchmark.py matrix_tsv")
    parser.add_argument('matrix_tsv')
    parser.add_argument(
        '-s', '--square',
        default=256,
        type=int,
        help="The size of the square within which to return values")
    parser.add_argument(
        '-i', '--iterations',
        default=100,
        type=int,
        help="The number of times to run the range query")

    args = vars(parser.parse_args())
    binsize = 5000

    infilepath = args['matrix_tsv']
    outfilepath = op.join(op.dirname(infilepath), 'chrX.{}kb.cool'.format(binsize//1000))


    # Build "index"
    t1 = time.time()
    chromsizes = cooler.read_chromsizes('test/data/hg19.chrom.sizes')
    chroms = ['chrX']
    lengths = [chromsizes['chrX']]
    bins = cooler.binnify(chromsizes.loc['chrX':'chrX'], binsize)
    chunksize = int(100e6)
    reader = cooler.io.SparseLoader(infilepath, chunksize)
    h5opts = dict(compression='gzip', compression_opts=6)
    with h5py.File(outfilepath, 'w') as h5:
        cooler.io.create(h5, chroms, lengths, bins, reader, binsize, h5opts=h5opts)

    c = cooler.Cooler(outfilepath)
    print("Time creating index: {:.3f} seconds".format(time.time() - t1))


    # Normalization
    t15 = time.time()
    N_CPUS = 8
    chunksize = int(100e6)
    with h5py.File(outfilepath, 'r+') as h5, Pool(N_CPUS) as pool:
        bias = ice.iterative_correction(
            h5, chunksize=chunksize, tol=1e-05, mad_max=3,
            cis_only=False, ignore_diags=3, map=pool.map)

        h5opts = dict(compression='gzip', compression_opts=6)
        h5['bins'].create_dataset('weight', data=bias, **h5opts)
    print("Time for normalization (cis and trans): {:.3f} seconds".format(time.time() - t15))


    # The bounds of the contact coordinates
    c = cooler.Cooler(outfilepath)
    matrix = c.matrix()
    min_x = 0
    min_y = 0
    max_x = c.shape[0]
    max_y = c.shape[1]
    print("max_x:", max_x)
    print("max_y:", max_y)


    # Range queries
    square_size = args['square']
    t2 = time.time()

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t25 = time.time()
    print("Time performing range queries (256x256): {:.3f} seconds (per query): {:.3f} seconds".format(t25 - t2, (t25 - t2) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        bias1 = weights[point1:point1+square_size]
        bias2 = weights[point2:point2+square_size]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t26 = time.time()
    print("Time performing range queries (256x256) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t26 - t25, (t26 - t25) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t3 = time.time()
    print("Time performing range queries (2048 x 2048): {:.3f} seconds (per query): {:.3f} seconds".format(t3 - t26, (t3 - t26) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        # Apply the balancing weights to the selected pixels
        bias1 = weights[point1:point1+square_size*8]
        bias2 = weights[point2:point2+square_size*8]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t35 = time.time()
    print("Time performing range queries (2048 x 2048) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t35 - t3, (t35 - t3) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        mat = matrix[point1, :]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t4 = time.time()
    print("Time slicing across first dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t4 - t35, (t4 - t35) / args['iterations']))

    for i in range(args['iterations']):
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[:, point2]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t5 = time.time()
    print("Time slicing across second dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t5 - t4, (t5 - t4) / args['iterations']))

    selected_points = []
    for i in range(args['iterations']):
        for pix in c.pixels().iterchunks(size=1000000):
            diag = pix[pix.bin1_id == pix.bin2_id]
            selected_points.extend( list(zip(diag['bin1_id'], diag['bin2_id'], diag['count'])) )

    t6 = time.time()
    print("Time slicing across the diagonal: {:.3f} seconds (per query): {:.3f} seconds".format(t6 - t5, (t6 - t5) / args['iterations']))


    # Dump
    print("Size of index: {} bytes".format(op.getsize(outfilepath)))
    with open('/tmp/tmp.tsv', 'wt') as f:
        for pix in c.pixels().iterchunks(size=100000):
            pix.to_csv(f, sep='\t', index=False, header=False)
    print("Time outputting the index: {:.3f}".format(time.time() - t6))
    print("Size of output: {} bytes".format(op.getsize('/tmp/tmp.tsv')))