import os

import h5py
import numpy as np

import cooler


def test_roundtrip(f_hm, f_cool):
    chromsizes = cooler.read_chromsizes(
        "http://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes",
        name_patterns=(r"^chr[0-9]+$", r"chrX$"),
    )
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    # Aggregate the dense heatmap into a new .cool file.
    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly="hg19")

    h5 = h5py.File(f_cool, "r")

    # Round-trip checks: chromosomes, bins, and metadata.
    new_chromtable = cooler.api.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable["name"])

    new_bintable = cooler.api.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.api.info(h5)
    assert info["genome-assembly"] == "hg19"
    assert info["bin-type"] == "fixed"
    assert info["bin-size"] == binsize

    # Matrix slices must match the original dense heatmap.
    mat = cooler.api.matrix(h5, 0, 100, 0, 100, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.api.matrix(h5, 100, 200, 100, 200, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    # Close the handle before cleanup so the file can be removed.
    h5.close()
    try:
        os.remove(f_cool)
    except OSError:
        pass
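# A minimal pytest-fixture sketch showing one way the f_hm / f_cool arguments
# above might be supplied. The fixture names match the test signature, but the
# data path and the use of pytest's built-in tmp_path fixture are assumptions,
# not part of the original test module.
import os.path as op

import pytest


@pytest.fixture
def f_hm():
    # Hypothetical path to a precomputed dense heatmap (.npy); the
    # IMR90-MboI filename mirrors the one used in the variants below.
    return op.join(op.dirname(__file__), "data", "IMR90-MboI-matrix.2000kb.npy")


@pytest.fixture
def f_cool(tmp_path):
    # Temporary output path for the .cool file written by the test.
    return str(tmp_path / "test_roundtrip.cool")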
import os

import h5py
import numpy as np

import cooler

# testdir and testfile_path are defined at module scope in the original test
# module; the values below are assumptions for self-containedness.
testdir = os.path.dirname(os.path.realpath(__file__))
testfile_path = os.path.join(testdir, 'test_roundtrip.cool')


def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    # Aggregate a dense 2 Mb heatmap into a new .cool file.
    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    reader = cooler.io.DenseLoader(heatmap)
    cooler.io.create(testfile_path, chromsizes, bintable, reader,
                     assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')

    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)
import os

import h5py
import numpy as np
from six import iteritems

import cooler

# As in the variant above, testdir and testfile_path are assumed to be
# module-level paths.


def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    chroms, lengths = zip(*iteritems(chromsizes))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))

    # This older API wrote into an already-open HDF5 file handle.
    with h5py.File(testfile_path, 'w') as h5:
        reader = cooler.io.DenseLoader(heatmap)
        cooler.io.create(h5, chroms, lengths, bintable, reader,
                         assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')

    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    # These queries return sparse matrices, hence .toarray().
    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat.toarray())

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat.toarray())
from __future__ import division, print_function

import json
import time

import h5py
import numpy as np
import pandas as pd

import cooler

TILESIZE = 256
where = np.flatnonzero

# Defaults to reading the chr#, X, Y, M chromosomes.
chromsizes = cooler.read_chromsizes(
    "http://s3.amazonaws.com/pkerp/data/mm9/chromInfo.txt"
)
chromosomes = list(chromsizes.keys())
chromid_map = dict(zip(chromosomes, range(len(chromosomes))))
cumul_lengths = np.r_[0, np.cumsum(chromsizes)]


def absCoord2bin(c, pos):
    # Map an absolute (genome-wide) coordinate to a bin index in Cooler `c`.
    try:
        cid = where(cumul_lengths > pos)[0] - 1
    except IndexError:
        # Past the end of the genome: return the total number of bins.
        return c.info["nbins"]
    chrom = chromosomes[cid]
    relPos = pos - cumul_lengths[cid]
    return c.offset((chrom, relPos, chromsizes[chrom]))


def getData(FILEPATH, zoomLevel, startPos1, endPos1, startPos2, endPos2):
description="Output a genome segmentation of restriction fragments as a BED file.") parser.add_argument( "chromsizes", help="UCSC-like chromsizes file, with chromosomes in desired order", metavar="CHROMSIZES_PATH") parser.add_argument( "binsize", help="Resolution (bin size) in base pairs <int>", metavar="BINSIZE") parser.add_argument( "--out", "-o", help="Output file (defaults to stdout)") args = vars(parser.parse_args()) binsize = int(args['binsize']) chromsizes = cooler.read_chromsizes(args['chromsizes']) bins = cooler.binnify(chromsizes, binsize) # Write output out = args['out'] try: if out is None: f = sys.stdout else: f = open(out, 'wt') bins.to_csv(f, sep='\t', index=False, header=False) except OSError: pass finally: f.close()
from __future__ import division, print_function

import argparse
import os.path as op
import random
import time
from multiprocessing import Pool

import h5py

import cooler
from cooler import ice


def main():
    parser = argparse.ArgumentParser(
        description="""python matrix_storage_benchmark.py matrix_tsv""")
    parser.add_argument('matrix_tsv')
    parser.add_argument(
        '-s', '--square', default=256, type=int,
        help="The size of the square within which to return values")
    parser.add_argument(
        '-i', '--iterations', default=100, type=int,
        help="The number of times to run the range query")
    args = vars(parser.parse_args())

    binsize = 5000
    infilepath = args['matrix_tsv']
    outfilepath = op.join(
        op.dirname(infilepath), 'chrX.{}kb.cool'.format(binsize // 1000))

    # Build "index"
    t1 = time.time()
    chromsizes = cooler.read_chromsizes('test/data/hg19.chrom.sizes')
    chroms = ['chrX']
    lengths = [chromsizes['chrX']]
    bins = cooler.binnify(chromsizes.loc['chrX':'chrX'], binsize)
    chunksize = int(100e6)
    reader = cooler.io.SparseLoader(infilepath, chunksize)
    h5opts = dict(compression='gzip', compression_opts=6)
    with h5py.File(outfilepath, 'w') as h5:
        cooler.io.create(h5, chroms, lengths, bins, reader, binsize,
                         h5opts=h5opts)
    c = cooler.Cooler(outfilepath)
    print("Time creating index: {:.3f} seconds".format(time.time() - t1))

    # Normalization (matrix balancing)
    t15 = time.time()
    N_CPUS = 8
    chunksize = int(100e6)
    with h5py.File(outfilepath, 'r+') as h5, Pool(N_CPUS) as pool:
        bias = ice.iterative_correction(
            h5, chunksize=chunksize, tol=1e-05, mad_max=3,
            cis_only=False, ignore_diags=3, map=pool.map)
        h5opts = dict(compression='gzip', compression_opts=6)
        h5['bins'].create_dataset('weight', data=bias, **h5opts)
    print("Time for normalization (cis and trans): {:.3f} seconds".format(
        time.time() - t15))

    # The bounds of the contact coordinates
    c = cooler.Cooler(outfilepath)
    matrix = c.matrix()
    min_x = 0
    min_y = 0
    max_x = c.shape[0]
    max_y = c.shape[1]
    print("max_x:", max_x)
    print("max_y:", max_y)

    # Range queries
    square_size = args['square']
    t2 = time.time()
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1:point1 + square_size, point2:point2 + square_size]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t25 = time.time()
    print("Time performing range queries (256x256): {:.3f} seconds "
          "(per query): {:.3f} seconds".format(
              t25 - t2, (t25 - t2) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1:point1 + square_size, point2:point2 + square_size]
        bias1 = weights[point1:point1 + square_size]
        bias2 = weights[point2:point2 + square_size]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t26 = time.time()
    print("Time performing range queries (256x256) with balancing: {:.3f} "
          "seconds (per query): {:.3f} seconds".format(
              t26 - t25, (t26 - t25) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1:point1 + square_size * 8,
                     point2:point2 + square_size * 8]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t3 = time.time()
    print("Time performing range queries (2048 x 2048): {:.3f} seconds "
          "(per query): {:.3f} seconds".format(
              t3 - t26, (t3 - t26) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1:point1 + square_size * 8,
                     point2:point2 + square_size * 8]
        # Apply the balancing weights so this query is actually normalized,
        # mirroring the 256x256 balanced loop above.
        bias1 = weights[point1:point1 + square_size * 8]
        bias2 = weights[point2:point2 + square_size * 8]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t35 = time.time()
    print("Time performing range queries (2048 x 2048) with balancing: "
          "{:.3f} seconds (per query): {:.3f} seconds".format(
              t35 - t3, (t35 - t3) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        mat = matrix[point1, :]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t4 = time.time()
    print("Time slicing across first dimension: {:.3f} seconds "
          "(per query): {:.3f} seconds".format(
              t4 - t35, (t4 - t35) / args['iterations']))

    for i in range(args['iterations']):
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[:, point2]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t5 = time.time()
    print("Time slicing across second dimension: {:.3f} seconds "
          "(per query): {:.3f} seconds".format(
              t5 - t4, (t5 - t4) / args['iterations']))

    selected_points = []
    for i in range(args['iterations']):
        for pix in c.pixels().iterchunks(size=1000000):
            diag = pix[pix.bin1_id == pix.bin2_id]
            selected_points.extend(
                list(zip(diag['bin1_id'], diag['bin2_id'], diag['count'])))
    t6 = time.time()
    print("Time slicing across the diagonal: {:.3f} seconds "
          "(per query): {:.3f} seconds".format(
              t6 - t5, (t6 - t5) / args['iterations']))

    # Dump
    print("Size of index: {} bytes".format(op.getsize(outfilepath)))
    with open('/tmp/tmp.tsv', 'wt') as f:
        for pix in c.pixels().iterchunks(size=100000):
            pix.to_csv(f, sep='\t', index=False, header=False)
    print("Time outputting the index: {:.3f}".format(time.time() - t6))
    print("Size of output: {} bytes".format(op.getsize('/tmp/tmp.tsv')))
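# Entry-point guard, assumed: the fragment above defines main() but the call
# site was not included. Hypothetical example invocation (paths and options
# are placeholders):
#   python matrix_storage_benchmark.py test/data/chrX.5kb.tsv -s 256 -i 100
if __name__ == '__main__':
    main()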