def test_aggregate_records():
    """Smoke test: sanitize the toy pairs file and feed it to ``aggregate_records``.

    There is no assertion; the test passes as long as every call completes
    without raising.
    """
    chromsizes_path = op.join(datadir, "toy.chrom.sizes")
    pairs_path = op.join(datadir, "toy.pairs")

    # Bin table built with a bin size of 1.
    bins = cooler.binnify(cooler.util.read_chromsizes(chromsizes_path), 1)

    pair_columns = [
        "read_id",
        "chrom1",
        "pos1",
        "chrom2",
        "pos2",
        "strand1",
        "strand2",
        "value",
    ]
    records = pd.read_csv(pairs_path, sep='\t', names=pair_columns)

    sanitizer_options = dict(
        schema="pairs",
        validate=False,
        tril_action="reflect",
        is_one_based=False,
        sort=False,
    )
    chunk = sanitize_records(bins, **sanitizer_options)(records)

    aggregate_records()(chunk)
def hdf2cool(infile, outfile, chrms_sizes, assembly='dm3', correct=True):
    """
    Convert a hiclib .hdf5 file to a .cool file.

    The datasets "heatmap" (whole-genome heatmap), "resolution" and
    "genomeIdxToLabel" are required in the .hdf5 file.

    :param infile: input .hdf5 file
    :param outfile: output .cool file
    :param chrms_sizes: tab-separated file with chromosome lengths
        (two columns: name, length; no header)
    :param assembly: genome assembly (dm3 by default)
    :param correct: iteratively correct the heatmap? (True by default)
    :return: Python cooler object which was written to the file.
    """
    # Read everything we need up front and close the HDF5 handle before the
    # cooler is written (the original left the file open).
    with h5py.File(infile, 'r') as a:
        # ``dataset[()]`` replaces ``Dataset.value``, which was deprecated and
        # then removed in h5py 3.0.
        heatmap = a['heatmap'][()]
        # NOTE(security): pickle.loads runs arbitrary code embedded in the
        # payload -- only use this on .hdf5 files from a trusted source.
        labels = pickle.loads(a['genomeIdxToLabel'][()]).values()
        binsize = pickle.loads(a['resolution'][()])

    # Prefix labels with 'chr' unless they already contain it.
    chrms = ['chr' + x if 'chr' not in x else x for x in labels]

    # Select the chromosome lengths in the same order as the heatmap labels.
    chromsizes = pd.read_csv(
        chrms_sizes, sep='\t', names=['name', 'length']
    ).set_index('name').loc[chrms, 'length']

    bins = cooler.binnify(chromsizes, binsize)

    # NOTE(review): the third positional argument is `binsize` here, while the
    # other ArrayLoader callers pass a chunk size -- confirm this is intended.
    iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
    cooler.io.create(outfile, bins, iterator, assembly=assembly)

    c = cooler.Cooler(outfile)
    if correct:
        # The result is stored through `store=c`; the return value is unused.
        cooler.ice.iterative_correction(c, store=c)
    return c
def _write_mtx(self, output_cooler, binsize, assembly_name, ch, chromsize):
    """Write ``self._mtx`` to a cooler file describing a single chromosome.

    :param output_cooler: destination passed to ``cooler.io.create``
    :param binsize: bin size used to build the bin table
    :param assembly_name: stored as the cooler's ``assembly``
    :param ch: chromosome name
    :param chromsize: length of that chromosome
    """
    # Runs first; presumably populates self._mtx -- verify against compute().
    self.compute()

    single_chrom = pd.Series({ch: chromsize}, name='length')
    bin_table = cooler.binnify(single_chrom, binsize)
    loader = cooler.io.ArrayLoader(bin_table, self._mtx, chunksize=10000000)
    cooler.io.create(
        output_cooler,
        bin_table,
        loader,
        assembly=assembly_name,
    )
def test_from_readhdf5():
    """Generator test: yield each check paired with a bin table.

    The three checks are yielded first with a uniform bin table (bin size
    100) and then with the table returned by ``_alternating_bins`` for
    steps ``[10, 100]``.
    """
    checks = (
        should_not_depend_on_chunksize,
        should_raise_if_input_not_sorted,
        should_work_with_int32_cols,
    )

    # uniform bins
    uniform_bins = cooler.binnify(chromsizes, 100)
    for check in checks:
        yield check, uniform_bins

    # non-uniform bins
    mixed_bins = _alternating_bins(chromsizes, [10, 100])
    for check in checks:
        yield check, mixed_bins
def test_sanitize_pixels():
    """Exercise ``sanitize_pixels`` on the toy upper-triangular COO file.

    Covers the default call, one-based bin IDs, and the three
    ``tril_action`` modes ("reflect", "drop", "raise") applied to a chunk
    whose bin1/bin2 columns have been swapped.
    """
    chromsizes = cooler.util.read_chromsizes(
        op.join(datadir, "toy.chrom.sizes"))
    bins = cooler.binnify(chromsizes, 1)

    chunk = pd.read_csv(
        op.join(datadir, "toy.symm.upper.1.zb.coo"),
        sep='\t',
        names=['bin1_id', 'bin2_id', 'count'],
    )
    # Two constant "sided" columns so the reflect case can detect a swap.
    chunk['foo1'] = 4
    chunk['foo2'] = 2

    # default options: only checks that the call succeeds
    default_sanitizer = sanitize_pixels(bins)
    default_sanitizer(chunk.copy())

    # one-based bin IDs
    one_based_sanitizer = sanitize_pixels(bins, is_one_based=True)
    out = one_based_sanitizer(chunk.copy())
    assert (out['bin1_id'] == chunk['bin1_id'] - 1).all()

    # Input for the tril cases: bin1 and bin2 swapped.
    tril_chunk = chunk.assign(
        bin1_id=chunk['bin2_id'],
        bin2_id=chunk['bin1_id'],
    )

    # tril action: reflect
    reflecting_sanitizer = sanitize_pixels(
        bins, tril_action="reflect", sided_fields=['foo'])
    out = reflecting_sanitizer(tril_chunk.copy())
    assert len(out) == len(chunk)
    expected_pairs = [
        ('foo2', 'foo1'),
        ('foo1', 'foo2'),
        ('bin1_id', 'bin1_id'),
        ('bin2_id', 'bin2_id'),
    ]
    for out_col, ref_col in expected_pairs:
        assert (out[out_col] == chunk[ref_col]).all()

    # tril action: drop
    dropping_sanitizer = sanitize_pixels(bins, tril_action="drop")
    out = dropping_sanitizer(tril_chunk.copy())
    assert len(out) == 0

    # tril action: raise
    with pytest.raises(BadInputError):
        raising_sanitizer = sanitize_pixels(bins, tril_action="raise")
        raising_sanitizer(tril_chunk.copy())
def test_roundtrip(f_hm, f_cool):
    """Write a dense heatmap to a cooler file and read it back.

    :param f_hm: path of the ``.npy`` heatmap to load
    :param f_cool: path of the cooler file to create (removed at the end)

    Checks that the chromosome table, bin table, metadata and two 100x100
    matrix windows survive the round trip, through both the functional
    ``cooler.api`` interface and the ``cooler.Cooler`` wrapper.
    """
    # Chromosome sizes are read from the UCSC URL.
    chromsizes = cooler.read_chromsizes(
        "http://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes",
        name_patterns=(r"^chr[0-9]+$", r"chrX$"),
    )
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly="hg19")

    # Use a context manager so the handle is closed before the file is
    # removed below (the original left it open).
    with h5py.File(f_cool, "r") as h5:
        new_chromtable = cooler.api.chroms(h5)
        assert np.all(chromsizes.index == new_chromtable["name"])

        new_bintable = cooler.api.bins(h5)
        assert np.all(bintable == new_bintable)

        info = cooler.api.info(h5)
        assert info["genome-assembly"] == "hg19"
        assert info["bin-type"] == "fixed"
        assert info["bin-size"] == binsize

        # Same two windows as before: [0, 100) and [100, 200) on both axes.
        for lo, hi in ((0, 100), (100, 200)):
            expected = heatmap[lo:hi, lo:hi]

            mat = cooler.api.matrix(h5, lo, hi, lo, hi, "count", balance=False)
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)

            mat = cooler.Cooler(h5).matrix("count", balance=False)[lo:hi, lo:hi]
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)

    # Best-effort cleanup of the generated file.
    try:
        os.remove(f_cool)
    except OSError:
        pass
def test_roundtrip(f_hm, f_cool):
    """Write a dense heatmap to a cooler file and read it back.

    :param f_hm: path of the ``.npy`` heatmap to load
    :param f_cool: path of the cooler file to create (removed at the end)

    Checks that the chromosome table, bin table, metadata and two 100x100
    matrix windows survive the round trip, through both the functional
    ``cooler.api`` interface and the ``cooler.Cooler`` wrapper.
    """
    # Chromosome sizes are read from the UCSC URL.
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly='hg19')

    # Use a context manager so the handle is closed before the file is
    # removed below (the original left it open).
    with h5py.File(f_cool, 'r') as h5:
        new_chromtable = cooler.api.chroms(h5)
        assert np.all(chromsizes.index == new_chromtable['name'])

        new_bintable = cooler.api.bins(h5)
        assert np.all(bintable == new_bintable)

        info = cooler.api.info(h5)
        assert info['genome-assembly'] == 'hg19'
        assert info['bin-type'] == 'fixed'
        assert info['bin-size'] == binsize

        # Same two windows as before: [0, 100) and [100, 200) on both axes.
        for lo, hi in ((0, 100), (100, 200)):
            expected = heatmap[lo:hi, lo:hi]

            mat = cooler.api.matrix(h5, lo, hi, lo, hi, 'count', balance=False)
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)

            mat = cooler.Cooler(h5).matrix('count', balance=False)[lo:hi, lo:hi]
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)

    # Best-effort cleanup of the generated file.
    try:
        os.remove(f_cool)
    except OSError:
        pass
def test_roundtrip():
    """Write the IMR90 2 Mb heatmap to ``testfile_path`` and read it back.

    Checks that the chromosome table, bin table, metadata and two 100x100
    matrix windows survive the round trip, through both the module-level
    ``cooler.*`` functions and the ``cooler.Cooler`` wrapper.
    """
    # Chromosome sizes are read from the UCSC URL.
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    reader = cooler.io.DenseLoader(heatmap)
    cooler.io.create(testfile_path, chromsizes, bintable, reader,
                     assembly='hg19')

    # Context manager: the original never closed this read handle.
    with h5py.File(testfile_path, 'r') as h5:
        new_chromtable = cooler.chroms(h5)
        assert np.all(chromsizes.index == new_chromtable['name'])

        new_bintable = cooler.bins(h5)
        assert np.all(bintable == new_bintable)

        info = cooler.info(h5)
        assert info['genome-assembly'] == 'hg19'
        assert info['bin-type'] == 'fixed'
        assert info['bin-size'] == binsize

        # Same two windows as before: [0, 100) and [100, 200) on both axes.
        for lo, hi in ((0, 100), (100, 200)):
            expected = heatmap[lo:hi, lo:hi]

            mat = cooler.matrix(h5, lo, hi, lo, hi, 'count', balance=False)
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)

            mat = cooler.Cooler(h5).matrix('count', balance=False)[lo:hi, lo:hi]
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat)
def test_validate_pixels():
    """Check that ``validate_pixels`` accepts valid input and rejects bad input.

    Valid input is passed both as a DataFrame and as a dict of Series.
    ``BadInputError`` is expected for negative bin IDs, an out-of-bounds
    bin ID, lower-triangular data and duplicated pixels.
    """
    bins = cooler.binnify(
        cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes")),
        1
    )
    chunk = pd.read_csv(
        op.join(datadir, "toy.symm.upper.1.zb.coo"),
        sep='\t',
        names=['bin1_id', 'bin2_id', 'count']
    )
    validator = validate_pixels(
        len(bins),
        boundscheck=True,
        triucheck=True,
        dupcheck=True,
        ensure_sorted=True
    )

    # Valid input: DataFrame and dict-of-Series forms.
    validator(chunk.copy())
    validator(chunk.to_dict(orient='series'))

    # wrongly assume one-based, producing -1 bin IDs
    chunk_ = sanitize_pixels(
        bins,
        is_one_based=True,
    )(chunk.copy())
    with pytest.raises(BadInputError):
        validator(chunk_)

    # out-of-bounds bin ID
    chunk_ = chunk.copy()
    # `.at[-1, ...]` is label-based: on the default integer index it appends a
    # new row labelled -1 (NaN in the other columns) instead of editing the
    # last row. Address the last row by its actual label.
    chunk_.at[chunk_.index[-1], 'bin1_id'] = len(bins) + 1
    with pytest.raises(BadInputError):
        validator(chunk_)

    # pass in non-triu data (bin1/bin2 swapped)
    tril_chunk = chunk.copy()
    tril_chunk['bin2_id'] = chunk['bin1_id']
    tril_chunk['bin1_id'] = chunk['bin2_id']
    with pytest.raises(BadInputError):
        validator(tril_chunk)

    # pass in duplicates
    with pytest.raises(BadInputError):
        validator(pd.concat([chunk, chunk], ignore_index=True))
def test_roundtrip():
    """Write the IMR90 2 Mb heatmap to ``testfile_path`` and read it back.

    Checks that the chromosome table, bin table, metadata and two 100x100
    matrix windows survive the round trip. Matrix queries are converted
    with ``.toarray()`` before being compared to the dense heatmap.
    """
    # Chromosome sizes are read from the UCSC URL.
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    chroms, lengths = zip(*iteritems(chromsizes))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    with h5py.File(testfile_path, 'w') as h5:
        reader = cooler.io.DenseLoader(heatmap)
        cooler.io.create(h5, chroms, lengths, bintable, reader,
                         assembly='hg19')

    # Re-open read-only under a context manager; the original never closed
    # this second handle.
    with h5py.File(testfile_path, 'r') as h5:
        new_chromtable = cooler.chroms(h5)
        assert np.all(chromsizes.index == new_chromtable['name'])

        new_bintable = cooler.bins(h5)
        assert np.all(bintable == new_bintable)

        info = cooler.info(h5)
        assert info['genome-assembly'] == 'hg19'
        assert info['bin-type'] == 'fixed'
        assert info['bin-size'] == binsize

        # Same two windows as before: [0, 100) and [100, 200) on both axes.
        for lo, hi in ((0, 100), (100, 200)):
            expected = heatmap[lo:hi, lo:hi]

            mat = cooler.matrix(h5, lo, hi, lo, hi, 'count')
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat.toarray())

            mat = cooler.Cooler(h5).matrix('count')[lo:hi, lo:hi]
            assert mat.shape == (100, 100)
            assert np.allclose(expected, mat.toarray())
def test_from_hdf5_pairs():
    """Test building coolers from in-memory pair columns via ``HDF5Aggregator``.

    Three nested checks are run against mock pairs generated by
    ``_mock_hdf5_pairs``: first with a uniform bin table (bin size 100),
    then with the table returned by ``_alternating_bins`` for ``[10, 100]``.
    Each check writes its output to ``testcool_path``.
    """

    def should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs):
        """Aggregating with chunksize 66 and 666 must give identical output."""
        # try different chunk sizes
        binner = cooler.create.HDF5Aggregator(mock_pairs, chromsizes,
                                              bintable, chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc1 = h5['indexes']['chrom_offset'][:]
            ob1 = h5['indexes']['bin1_offset'][:]
            p1 = cooler.api.pixels(h5, join=False)

        binner = cooler.create.HDF5Aggregator(mock_pairs, chromsizes,
                                              bintable, chunksize=666)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc2 = h5['indexes']['chrom_offset'][:]
            ob2 = h5['indexes']['bin1_offset'][:]
            p2 = cooler.api.pixels(h5, join=False)

        # Both index arrays and the pixel table must match.
        assert np.all(oc1 == oc2)
        assert np.all(ob1 == ob2)
        assert np.all(p1.values == p2.values)

    def should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs):
        """Unsorted or non-upper-triangular input must raise ``ValueError``."""
        # not sorted by chrm1: side 1 and side 2 are swapped wholesale
        bad_reads = {
            'chrms1': mock_pairs['chrms2'],
            'cuts1': mock_pairs['cuts2'],
            'chrms2': mock_pairs['chrms1'],
            'cuts2': mock_pairs['cuts1'],
        }
        # Here the error is expected from the constructor itself.
        with pytest.raises(ValueError):
            cooler.create.HDF5Aggregator(bad_reads, chromsizes, bintable,
                                         chunksize=66)

        # not triu: copies are taken so mock_pairs itself is not modified
        bad_reads = {
            'chrms1': mock_pairs['chrms1'].copy(),
            'cuts1': mock_pairs['cuts1'].copy(),
            'chrms2': mock_pairs['chrms2'].copy(),
            'cuts2': mock_pairs['cuts2'].copy(),
        }
        # First record: same chromosome, cut 1 (10) after cut 2 (9).
        bad_reads['chrms1'][0] = 0
        bad_reads['chrms2'][0] = 0
        bad_reads['cuts1'][0] = 10
        bad_reads['cuts2'][0] = 9
        binner = cooler.create.HDF5Aggregator(bad_reads, chromsizes, bintable,
                                              chunksize=66)
        # Here the error is expected when the cooler is created.
        with pytest.raises(ValueError):
            cooler.create.create(testcool_path, bintable, binner)

    def should_work_with_int32_cols(chromsizes, bintable, mock_pairs):
        """Input columns cast to int32 must give the same output as the originals."""
        # int64
        binner = cooler.create.HDF5Aggregator(mock_pairs, chromsizes,
                                              bintable, chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc1 = h5['indexes']['chrom_offset'][:]
            ob1 = h5['indexes']['bin1_offset'][:]
            p1 = cooler.api.pixels(h5, join=False)

        # int32
        mock_pairs32 = {
            'chrms1': mock_pairs['chrms1'].astype(np.int32),
            'cuts1': mock_pairs['cuts1'].astype(np.int32),
            'chrms2': mock_pairs['chrms2'].astype(np.int32),
            'cuts2': mock_pairs['cuts2'].astype(np.int32),
        }
        binner = cooler.create.HDF5Aggregator(mock_pairs32, chromsizes,
                                              bintable, chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc2 = h5['indexes']['chrom_offset'][:]
            ob2 = h5['indexes']['bin1_offset'][:]
            p2 = cooler.api.pixels(h5, join=False)

        assert np.all(oc1 == oc2)
        assert np.all(ob1 == ob2)
        assert np.all(p1.values == p2.values)

    def _mock_hdf5_pairs():
        """Build random pair columns, flipped to upper-triangular and sorted.

        Reads ``n_chroms``, ``clen`` and ``n_records`` from the enclosing
        scope. Returns a dict with keys 'chrms1', 'cuts1', 'chrms2', 'cuts2'.
        """
        # Fixed seed so the generated pairs are reproducible.
        np.random.seed(1)
        chrms = np.random.randint(0, n_chroms, n_records * 2)
        cuts = np.random.randint(0, clen, n_records * 2)
        # Genome-wide coordinate: every chromosome has the same length clen.
        abs_cuts = np.array(
            [clen * chrm + cut for chrm, cut in zip(chrms, cuts)])
        abs_cuts1, abs_cuts2 = abs_cuts[:n_records], abs_cuts[n_records:]
        # First half of the draws is side 1, second half is side 2.
        mock_pairs = {
            'chrms1': chrms[:n_records],
            'cuts1': cuts[:n_records],
            'chrms2': chrms[n_records:],
            'cuts2': cuts[n_records:],
        }

        # Triu-sort: swap sides wherever side 1 lies after side 2
        mask = abs_cuts1 > abs_cuts2
        mock_pairs['chrms1'][mask], mock_pairs['chrms2'][mask] = mock_pairs[
            'chrms2'][mask], mock_pairs['chrms1'][mask]
        mock_pairs['cuts1'][mask], mock_pairs['cuts2'][mask] = mock_pairs[
            'cuts2'][mask], mock_pairs['cuts1'][mask]
        abs_cuts1[mask], abs_cuts2[mask] = abs_cuts2[mask], abs_cuts1[mask]

        # lexsort uses the LAST key as primary: sort by abs_cuts1, then abs_cuts2.
        idx = np.lexsort([abs_cuts2, abs_cuts1])
        for key in mock_pairs:
            mock_pairs[key] = mock_pairs[key][idx]
        return mock_pairs

    # Parameters read by _mock_hdf5_pairs through its closure.
    n_chroms = 2
    clen = 2000
    n_records = 3000
    chromsizes = pd.Series(index=['chr1', 'chr2'], data=[clen, clen])
    mock_pairs = _mock_hdf5_pairs()

    # uniform bins
    bintable = cooler.binnify(chromsizes, 100)
    should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs)
    should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs)
    should_work_with_int32_cols(chromsizes, bintable, mock_pairs)

    # non-uniform bins
    bintable = _alternating_bins(chromsizes, [10, 100])
    should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs)
    should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs)
    should_work_with_int32_cols(chromsizes, bintable, mock_pairs)
def convert(self, input_filename, output_filename, input_format=None,
            output_format=None, **kwargs):
    """
    Convert files/matrix without reading them into InteractionMatrix._mtx

    :param input_filename: path of the input file, or a numpy array holding
        the matrix itself (``input_format`` is then forced to ``'mtx'``)
    :param output_filename: path of the output file; several branches
        derive their own output name from ``input_filename`` instead
    :param input_format: format tag of the input; when None it is set to
        ``input_filename.split('.')``
    :param output_format: format tag of the output; when None it is set to
        ``output_filename.split('.')``
    :param kwargs: optional settings:
        ``chr`` (chromosome name, default 'chr2L'),
        ``res`` (resolution, default 20000),
        ``remove_intermediary_files`` (default True),
        ``balance`` (default False);
        for .hic output also ``binary_path`` (default 'java'),
        ``juicer_path`` and ``genome`` (default 'dm3')
    :return: name of the output file
    """
    # isinstance (rather than an exact type comparison) also accepts
    # ndarray subclasses.
    if isinstance(input_filename, np.ndarray):
        input_format = 'mtx'
        mtx = input_filename

    # NOTE(review): inferred formats are lists of filename parts, so the
    # `output_format == '...'` comparisons below can only match formats that
    # were passed explicitly as strings.
    if input_format is None:
        input_format = input_filename.split('.')
    if output_format is None:
        output_format = output_filename.split('.')

    TADselect_logger.info("Converting %s -> %s: from %s to %s",
                          input_format, output_format,
                          input_filename, output_filename)

    chromosome = kwargs.get('chr', 'chr2L')
    resolution = kwargs.get('res', 20000)
    remove_intermediary_files = kwargs.get('remove_intermediary_files', True)
    balance = kwargs.get('balance', False)

    # NOTE(review): the command strings below embed file names unquoted;
    # confirm how run_command executes them before passing untrusted paths.

    if 'cool' in input_format and 'txt' in output_format:
        # cool -> dense tab-separated text for one chromosome
        output_prefix = '.'.join(input_filename.split('.')[:-1])
        output_filename = output_prefix + '.{}.txt'.format(chromosome)

        c = cooler.Cooler(input_filename)
        mtx = c.matrix(balance=balance, as_pixels=False).fetch(
            chromosome, chromosome)
        np.savetxt(output_filename, mtx, delimiter='\t')

        if 'gz' in output_format:
            command = 'gzip {}'.format(output_filename)
            run_command(command)
            output_filename += '.gz'

        return output_filename

    elif 'cool' in input_format and output_format == 'sparse':
        # cool -> annotated pixel table for one chromosome
        output_prefix = '.'.join(input_filename.split('.')[:-1])
        output_filename = output_prefix + '.{}.sparse.txt'.format(
            chromosome)

        c = cooler.Cooler(input_filename)
        mtx_df = c.matrix(balance=balance, as_pixels=True, join=True,
                          ignore_index=False).fetch(chromosome, chromosome)
        if balance:
            # Report balanced values in the 'count' column.
            mtx_df.loc[:, 'count'] = mtx_df.loc[:, 'balanced']
            mtx_df = mtx_df.drop('balanced', axis=1)
        mtx_df = mtx_df.dropna()
        mtx_df.to_csv(output_filename, index=False, sep='\t', header=False)

        return output_filename

    elif 'cool' in input_format and 'mr_sparse' in output_format:
        # cool -> pixel table with bin IDs shifted by +1, plus two helper
        # files describing the bins
        output_prefix = '.'.join(input_filename.split('.')[:-1])
        output_filename = output_prefix + '.{}.mr_sparse.txt'.format(
            chromosome)

        c = cooler.Cooler(input_filename)
        mtx = c.matrix(balance=True, as_pixels=True).fetch(chromosome,
                                                           chromosome)
        mtx.loc[:, "bin1_id":"bin2_id"] += 1
        mtx.loc[:, 'bin1_id':'count'].to_csv(output_filename, header=False,
                                             index=False, sep='\t')

        max_bin = mtx.loc[:, 'bin1_id':'bin2_id'].max().max()
        with open(output_prefix + ".{}.genome_bin.txt".format(chromosome),
                  'w') as outfile:
            outfile.write("1\tchr1\t0\t{}".format(max_bin - 1))

        with open(output_prefix + ".{}.all_bins.txt".format(chromosome),
                  'w') as outfile:
            for i in range(max_bin):
                outfile.write("0\t{}\t{}\n".format(i * resolution + 1,
                                                   (i + 1) * resolution))

        return output_filename

    elif 'txt' in input_format and 'cool' in output_format:
        # dense text -> single-chromosome cool
        mtx = np.loadtxt(input_filename)
        # Fixed: the original used the undefined name `ch` here (NameError).
        chromsizes = pd.Series({chromosome: resolution * mtx.shape[0]},
                               name='length')
        bins = cooler.binnify(chromsizes, resolution)
        pixels = cooler.io.ArrayLoader(bins, mtx, chunksize=10000000)
        cooler.io.create(output_filename, bins, pixels)

    elif 'cool' in input_format and output_format == 'h5':
        # cool -> h5 through the external hicExport tool
        output_prefix = '.'.join(input_filename.split('.')[:-1])
        output_filename = output_prefix + '.h5'

        command = "hicExport --inFile {} --outFileName {} --inputFormat cool --outputFormat h5" \
            .format(input_filename, output_filename)
        run_command(command)

        return output_filename

    elif 'cool' in input_format and output_format == 'hic':
        # cool -> hic through a sparse text dump and juicer tools "pre"
        binary_path = kwargs.get('binary_path', 'java')
        juicer_path = kwargs.get('juicer_path',
                                 './juicer_tools.1.8.9_jcuda.0.8.jar')
        genome = kwargs.get('genome', 'dm3')

        output_prefix = '.'.join(input_filename.split('.')[:-1])
        outfile_hic = "{}.{}.hic".format(output_prefix, chromosome)
        outfile_txt = outfile_hic + '.txt'
        outfile_tmp = outfile_hic + '.tmp'

        with open(outfile_tmp, 'w'):
            pass

        # Fixed: forward the requested chromosome. The original call dropped
        # it, so the sparse dump was always made for the default 'chr2L'
        # while juicer was invoked with `-c <chromosome>`.
        outfile_tmp = self.convert(input_filename=input_filename,
                                   output_filename=outfile_tmp,
                                   input_format='cool',
                                   output_format='sparse',
                                   chr=chromosome)

        command1 = "awk '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {} > {}".format(
            outfile_tmp, outfile_txt)
        command2 = "gzip -f {}".format(outfile_txt)
        command3 = "{} -Xmx2g -jar {} pre -r {} -c {} {}.gz {} {}".format(
            binary_path, juicer_path, resolution, chromosome, outfile_txt,
            outfile_hic, genome)
        run_command(command1)
        run_command(command2)
        run_command(command3)

        if remove_intermediary_files:
            os.remove(outfile_txt + '.gz')

    elif input_format == 'mtx':
        # in-memory matrix -> text or cool
        if 'txt' in output_format:
            np.savetxt(output_filename, mtx, delimiter='\t')

            if 'gz' in output_format:
                command = 'gzip {}'.format(output_filename)
                run_command(command)
                output_filename += '.gz'

        elif output_format == 'cool':
            # Fixed: the original used the undefined name `ch` here too.
            chromsizes = pd.Series({chromosome: resolution * mtx.shape[0]},
                                   name='length')
            bins = cooler.binnify(chromsizes, resolution)
            pixels = cooler.io.ArrayLoader(bins, mtx, chunksize=10000000)
            cooler.io.create(output_filename, bins, pixels)

    return output_filename
description="Output a genome segmentation of restriction fragments as a BED file.") parser.add_argument( "chromsizes", help="UCSC-like chromsizes file, with chromosomes in desired order", metavar="CHROMSIZES_PATH") parser.add_argument( "binsize", help="Resolution (bin size) in base pairs <int>", metavar="BINSIZE") parser.add_argument( "--out", "-o", help="Output file (defaults to stdout)") args = vars(parser.parse_args()) binsize = int(args['binsize']) chromsizes = cooler.read_chromsizes(args['chromsizes']) bins = cooler.binnify(chromsizes, binsize) # Write output out = args['out'] try: if out is None: f = sys.stdout else: f = open(out, 'wt') bins.to_csv(f, sep='\t', index=False, header=False) except OSError: pass finally: f.close()
def main():
    """Benchmark cooler as a matrix store for a chrX contact matrix.

    Command line: ``matrix_tsv`` (input file), ``-s/--square`` (query window
    size, default 256) and ``-i/--iterations`` (queries per benchmark,
    default 100).

    Steps, each timed and printed: build the cooler next to the input file,
    run iterative correction and store it as ``bins/weight``, run random
    square range queries (plain and with balancing weights applied, at the
    base window size and at 8x that size), slice single rows and columns,
    collect the diagonal, and dump all pixels to ``/tmp/tmp.tsv``.
    """
    parser = argparse.ArgumentParser(description="""python matrix_storage_benchmark.py matrix_tsv""")
    parser.add_argument('matrix_tsv')
    parser.add_argument(
        '-s', '--square',
        default=256,
        type=int,
        help="The size of the square within which to return values")
    parser.add_argument(
        '-i', '--iterations',
        default=100,
        type=int,
        help="The number of times to run the range query")
    args = vars(parser.parse_args())

    binsize = 5000
    infilepath = args['matrix_tsv']
    outfilepath = op.join(op.dirname(infilepath), 'chrX.{}kb.cool'.format(binsize//1000))

    # Build "index"
    t1 = time.time()
    chromsizes = cooler.read_chromsizes('test/data/hg19.chrom.sizes')
    chroms = ['chrX']
    lengths = [chromsizes['chrX']]
    bins = cooler.binnify(chromsizes.loc['chrX':'chrX'], binsize)
    chunksize = int(100e6)
    reader = cooler.io.SparseLoader(infilepath, chunksize)
    h5opts = dict(compression='gzip', compression_opts=6)
    with h5py.File(outfilepath, 'w') as h5:
        cooler.io.create(h5, chroms, lengths, bins, reader, binsize, h5opts=h5opts)
    c = cooler.Cooler(outfilepath)
    print("Time creating index: {:.3f} seconds".format(time.time() - t1))

    # Normalization
    t15 = time.time()
    N_CPUS = 8
    chunksize = int(100e6)
    with h5py.File(outfilepath, 'r+') as h5, Pool(N_CPUS) as pool:
        bias = ice.iterative_correction(
            h5, chunksize=chunksize, tol=1e-05, mad_max=3,
            cis_only=False, ignore_diags=3, map=pool.map)
        h5opts = dict(compression='gzip', compression_opts=6)
        h5['bins'].create_dataset('weight', data=bias, **h5opts)
    print("Time for normalization (cis and trans): {:.3f} seconds".format(time.time() - t15))

    # The bounds of the contact coordinates
    c = cooler.Cooler(outfilepath)
    matrix = c.matrix()
    min_x = 0
    min_y = 0
    max_x = c.shape[0]
    max_y = c.shape[1]
    print("max_x:", max_x)
    print("max_y:", max_y)

    # Range queries
    square_size = args['square']
    t2 = time.time()
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t25 = time.time()
    print("Time performing range queries (256x256): {:.3f} seconds (per query): {:.3f} seconds".format(t25 - t2, (t25 - t2) / args['iterations']))

    # Same queries, multiplying each pixel by the row and column weights.
    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        bias1 = weights[point1:point1+square_size]
        bias2 = weights[point2:point2+square_size]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t26 = time.time()
    print("Time performing range queries (256x256) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t26 - t25, (t26 - t25) / args['iterations']))

    # Larger windows: 8x the base square size on each side.
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t3 = time.time()
    print("Time performing range queries (2048 x 2048): {:.3f} seconds (per query): {:.3f} seconds".format(t3 - t26, (t3 - t26) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        # Fixed: this loop is reported as "with balancing" but never applied
        # the weights; mirror the balanced loop for the smaller window above.
        bias1 = weights[point1:point1+square_size*8]
        bias2 = weights[point2:point2+square_size*8]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t35 = time.time()
    print("Time performing range queries (2048 x 2048) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t35 - t3, (t35 - t3) / args['iterations']))

    # Single-row slices
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        mat = matrix[point1, :]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t4 = time.time()
    print("Time slicing across first dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t4 - t35, (t4 - t35) / args['iterations']))

    # Single-column slices
    for i in range(args['iterations']):
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[:, point2]
        selected_points = list(zip(mat.row, mat.col, mat.data))
    t5 = time.time()
    print("Time slicing across second dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t5 - t4, (t5 - t4) / args['iterations']))

    # Diagonal: scan all pixels in chunks, keeping those with bin1 == bin2.
    selected_points = []
    for i in range(args['iterations']):
        for pix in c.pixels().iterchunks(size=1000000):
            diag = pix[pix.bin1_id == pix.bin2_id]
            selected_points.extend(
                list(zip(diag['bin1_id'], diag['bin2_id'], diag['count']))
            )
    t6 = time.time()
    print("Time slicing across the diagonal: {:.3f} seconds (per query): {:.3f} seconds".format(t6 - t5, (t6 - t5) / args['iterations']))

    # Dump
    print("Size of index: {} bytes".format(op.getsize(outfilepath)))
    with open('/tmp/tmp.tsv', 'wt') as f:
        for pix in c.pixels().iterchunks(size=100000):
            pix.to_csv(f, sep='\t', index=False, header=False)
    print("Time outputting the index: {:.3f}".format(time.time() - t6))
    print("Size of output: {} bytes".format(op.getsize('/tmp/tmp.tsv')))
mpl.style.use('seaborn-white') import multiprocess as mp import numpy as np import pandas as pd import bioframe import cooltools import cooler from cooltools.eigdecomp import cooler_cis_eig mm10 = bioframe.fetch_chromsizes('mm10') chromsizes = bioframe.fetch_chromsizes('mm10') chromosomes = list(chromsizes.index) binsize = 10000 bins = cooler.binnify(mm10, binsize) fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa') bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records) bins.head() import fnmatch import os for file in os.listdir('.'): if fnmatch.fnmatch(file, '*_10kb.cool'): clr = cooler.Cooler(file) cond = file.split('.')[0] lam, eigs = cooler_cis_eig(clr, bins, n_eigs=3, phasing_track_col='GC',
def hic2cool_extractnorms(infile, outfile, exclude_mt=False,
                          show_warnings=False, silent=False):
    """
    Find all normalization vectors in the given hic file at all resolutions
    and attempts to add them to the given cooler file. Does not add any
    metadata to the cooler file.
    TODO: should we add `extract-norms-date` attr?

    Params:
    <infile> str .hic filename
    <outfile> str .cool output filename
    <exclude_mt> bool. If True, ignore MT contacts. Defaults to False.
    <show_warnings> bool. If True, print out WARNING messages
    <silent> bool. If true, hide standard output

    Each vector is written as a column of the matching cooler's 'bins'
    table, named after the normalization. Resolutions missing from the
    cooler file are skipped; chromosomes without a vector get NaN.
    """
    unit = 'BP'  # only using base pair unit for now
    # Global hic normalization types used
    # (reset here; presumably filled in by the header/footer readers)
    global NORMS
    NORMS = []
    global WARN
    WARN = False

    req = open(infile, 'rb')
    buf = None
    try:
        buf = mmap.mmap(req.fileno(), 0, access=mmap.ACCESS_READ)
        used_chrs, resolutions, masteridx, genome, metadata = read_header(req)
        pair_footer_info, expected, factors, norm_info = read_footer(
            req, buf, masteridx)
        # expected/factors unused for now
        del expected
        del factors

        chr_names = [used_chrs[key][1] for key in used_chrs.keys()]
        if not silent:  # print hic header info for command line usage
            print('################################')
            print('### hic2cool / extract-norms ###')
            print('################################')
            print('Header info from hic:')
            print('... Chromosomes: ', chr_names)
            print('... Resolutions: ', resolutions)
            print('... Normalizations: ', NORMS)
            print('... Genome: ', genome)

        if exclude_mt:  # remove mitchondrial chr by name if this flag is set
            # try to find index of chrM (a.k.a chrMT) if it is present
            mt_names = ['m', 'mt', 'chrm', 'chrmt']
            found_idxs = [
                idx for idx, fv in used_chrs.items()
                if fv[1].lower() in mt_names
            ]
            # Fixed: these were two separate `if`s, so the trailing `else`
            # printed "No chromosome found" right after a successful
            # exclusion.
            if len(found_idxs) == 1:
                excl = used_chrs.pop(found_idxs[0], None)
                if not silent:
                    print('... Excluding chromosome %s with index %s' %
                          (excl[1], excl[0]))
            elif len(found_idxs) > 1:
                error_str = (
                    'ERROR. More than one chromosome was found when attempting to'
                    ' exclude MT. Found chromosomes: %s' % chr_names)
                force_exit(error_str, req)
            else:
                if not silent:
                    print('... No chromosome found when attempting to exclude MT.')

        # exclude 'all' from chromsomes
        chromosomes = [
            uc[1] for uc in used_chrs.values() if uc[1].lower() != 'all'
        ]
        lengths = [
            uc[2] for uc in used_chrs.values() if uc[1].lower() != 'all'
        ]
        chromsizes = pd.Series(index=chromosomes, data=lengths)

        # Map bin size -> group path for every cooler found in the output file.
        cooler_groups = {}
        for path in cooler.fileops.list_coolers(outfile):
            binsize = cooler.Cooler(outfile + '::' + path).info['bin-size']
            cooler_groups[binsize] = path
        if not silent:
            print('### Found cooler contents:')
            print('... %s' % cooler_groups)

        for norm in NORMS:
            for binsize in resolutions:
                if binsize not in cooler_groups:
                    if not silent:
                        print('... Skip resolution %s; it is not in cooler file' %
                              binsize)
                    continue
                if not silent:
                    print('... Extracting %s normalization vector at %s BP' %
                          (norm, binsize))

                chrom_map = {}
                bins = cooler.binnify(chromsizes, binsize)
                lengths_in_bins = bins.groupby('chrom').size()
                for chr_val in [
                        uc for uc in used_chrs.values()
                        if uc[1].lower() != 'all'
                ]:
                    chr_num_bins = lengths_in_bins.loc[chr_val[1]]
                    try:
                        norm_key = norm_info[norm, unit, binsize, chr_val[0]]
                    except KeyError:
                        WARN = True
                        if show_warnings and not silent:
                            print_stderr(
                                '!!! WARNING. Normalization vector %s does not exist for %s.'
                                % (norm, chr_val[1]))
                        # no vector for this chromosome: fill with NaN, one
                        # entry per bin
                        norm_vector = [np.nan] * chr_num_bins
                    else:
                        # hic normalization vector lengths have inconsistent
                        # lengths... truncate to the number of bins
                        norm_vector = read_normalization_vector(
                            req, buf, norm_key)[:chr_num_bins]
                    chrom_map[chr_val[1]] = norm_vector

                # Concatenate in chromosome order so rows line up with `bins`.
                bins[norm] = np.concatenate(
                    [chrom_map[chrom] for chrom in chromosomes])
                if not silent:
                    print('... Writing to cool file ...')
                    print('%s\n... Truncated ...' % bins.head())
                group_path = cooler_groups[binsize]
                cooler.create.append(outfile + '::' + group_path,
                                     'bins', {norm: bins[norm].values},
                                     force=True)
    finally:
        # Release the mapping and the file on every exit path; the original
        # only closed the file after a fully successful run.
        if buf is not None:
            buf.close()
        req.close()

    if not silent:
        if WARN and not show_warnings:
            print(
                '... Warnings were found in this run. Run with -v to display them.'
            )
        print('### Finished! Output written to: %s' % outfile)