def _multivec(
    filepath,
    output_file,
    assembly,
    tile_size,
    chromsizes_filename,
    starting_resolution,
    row_infos_filename=None,
    method=None,
):
    """
    Aggregate a multivec file.

    This is a file containing nxn data that is aggregated along only one
    axis. This data should be in an HDF5 file where each dataset is named
    for a chromosome and contains a 'resolutions' group containing values
    for the base level resolution.

    Example: f['chr1']['resolutions']['1000'] = [[1,2,3],[4,5,6]]

    The resulting data will be organized by resolution and chromosome.

    Example: f_out['chr1']['resolutions']['5000']=[[1000,2000,3000],[4000,5000,6000]]

    Parameters
    ----------
    filepath : str
        Path of the input HDF5 file (opened read-only).
    output_file : str or None
        Destination path. When None it is derived from ``filepath`` by
        replacing the extension with ``.multires.mv5``.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    tile_size, starting_resolution
        Forwarded to ``cmv.create_multivec_multires``.
    row_infos_filename : str or None
        Optional text file; each stripped line is utf8-encoded and passed
        on as ``row_infos``.
    method : str or None
        ``'logsumexp'`` aggregates adjacent pairs with ``sm.logsumexp``;
        any other value (the default) sums adjacent pairs with
        ``np.nansum``, i.e. NaNs count as zero.
    """
    if method == "logsumexp":

        def agg(x):
            pairs = x.T.reshape((x.shape[1], -1, 2))
            return sm.logsumexp(pairs, axis=2).T

    else:

        def agg(x):
            # Group adjacent rows in pairs and add them up, ignoring NaNs.
            # NOTE(review): the reshape presumably needs an even number of
            # rows; confirm that the callee pads the data accordingly.
            pairs = x.T.reshape((x.shape[1], -1, 2))
            return np.nansum(pairs, axis=2).T

    # The context manager guarantees the input handle is released even
    # when the aggregation fails.
    with h5py.File(filepath, "r") as f_in:
        if output_file is None:
            output_file = op.splitext(filepath)[0] + ".multires.mv5"

        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        print("agg:", agg)

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as fr:
                row_infos = [line.strip().encode("utf8") for line in fr]
        else:
            row_infos = None

        print("row_infos:", row_infos)

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_sizes),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
def abs2genome_fn(chromsizes_filename, start, end):
    """Convert an absolute genomic range to sections of genomic ranges.

    E.g. (1000,2000) => [('chr1', 1000, 1500), ('chr2', 1500, 2000)]

    Parameters
    ----------
    chromsizes_filename
        Passed to ``load_chromsizes`` to obtain chromosome names and sizes.
    start, end
        Absolute range handed to ``abs2genomic`` together with the
        chromosome sizes.

    Yields
    ------
    ChromosomeInterval
        One interval per ``(cid, start, end)`` triple produced by
        ``abs2genomic``. Iteration stops as soon as a chromosome id has no
        corresponding entry in the list of chromosome names.
    """
    (_, chrom_names, chrom_sizes) = load_chromsizes(chromsizes_filename)

    # Use distinct loop names so the `start` / `end` arguments are not
    # overwritten while iterating.
    for cid, rel_start, rel_end in abs2genomic(chrom_sizes, start, end):
        try:
            name = chrom_names[cid]
        except IndexError:
            # we've gone beyond the last chromosome so stop iterating
            return

        yield ChromosomeInterval(cid=cid, name=name, start=rel_start, end=rel_end)
def _bedgraph_to_multivec(
    filepaths,
    output_file,
    assembly,
    chrom_col,
    from_pos_col,
    to_pos_col,
    value_col,
    has_header,
    chunk_size,
    nan_value,
    chromsizes_filename,
    starting_resolution,
    num_rows,
    format,
    row_infos_filename,
    tile_size,
    method,
):
    """Convert bedgraph-like files into a multi-resolution multivec file.

    The input files are first written into a temporary HDF5 file (one
    NaN-filled dataset per chromosome) via ``cmv.bedfile_to_multivec``; that
    file is then aggregated with ``cmv.create_multivec_multires``.

    Parameters
    ----------
    filepaths : list of str
        Input files. The first one is used to derive the default output
        name (extension replaced by ``.multires.mv5``).
    output_file : str or None
        Destination path; an existing file at that path is removed first.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    chrom_col, from_pos_col, to_pos_col, value_col : int
        One-based column numbers used by the default line parser.
    has_header, chunk_size
        Forwarded to ``cmv.bedfile_to_multivec``.
    nan_value
        Accepted for interface compatibility; not used in this function.
    starting_resolution, tile_size
        Dataset sizing and parameters for the multires aggregation.
    num_rows : int
        Number of value columns read from each line.
    format : str
        ``'epilogos'`` and ``'states'`` select dedicated line parsers; any
        other value uses the generic chrom/start/end/values parser.
    row_infos_filename : str or None
        Optional file with one row label per line (required for
        ``format == 'states'``).
    method : str
        ``'logsumexp'`` selects NaN-aware log-sum-exp aggregation; anything
        else sums adjacent pairs.

    Raises
    ------
    AssertionError
        If ``format == 'states'`` and no row infos file was given.
    ValueError
        From the line parser when grouped lines disagree on position, or
        from the logsumexp aggregator when values are too small to be
        distinguished from the NaN placeholder.
    """
    print("chrom_col:", chrom_col)

    def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
        """Merge matching lines (one per input file) into a single vector."""
        chrom_set = set()
        start_set = set()
        end_set = set()
        all_vector = []

        for bedline in bedlines:
            parts = bedline.strip().split()
            values = parts[value_col - 1 : value_col - 1 + num_rows]

            chrom_set.add(parts[chrom_col - 1])
            start_set.add(int(parts[from_pos_col - 1]))
            end_set.add(int(parts[to_pos_col - 1]))
            vector = [np.nan if v == "NA" else float(v) for v in values]

            if len(chrom_set) > 1:
                raise ValueError("Chromosomes don't match in these lines:", bedlines)
            if len(start_set) > 1:
                raise ValueError(
                    "Start positions don't match in these lines:", bedlines
                )
            if len(end_set) > 1:
                raise ValueError("End positions don't match in these lines:", bedlines)

            all_vector += vector

        return (list(chrom_set)[0], list(start_set)[0], list(end_set)[0], all_vector)

    if method == "logsumexp":
        # NaNs are temporarily replaced by a very small number so they do
        # not poison the log-sum-exp; results that are still below the
        # threshold afterwards (both inputs were NaN) are turned back into
        # NaN.
        SMALL_NUM = -1e8
        NAN_THRESHOLD_NUM = SMALL_NUM / 100

        def agg(x):
            pairs = x.T.reshape((x.shape[1], -1, 2))

            if np.nanmin(pairs) < NAN_THRESHOLD_NUM:
                raise ValueError(
                    "Error removing nan's when running logsumexp aggregation"
                )

            # np.where builds a new array: the caller's data is never
            # modified and the filled values are what gets aggregated.
            filled = np.where(np.isnan(pairs), SMALL_NUM, pairs)
            res = sm.logsumexp(filled, axis=2).T
            return np.where(res < NAN_THRESHOLD_NUM, np.nan, res)

    else:

        def agg(x):
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)
        temp_file = op.join(td, "temp.mv5")

        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as fr:
                row_infos = [line.strip().encode("utf8") for line in fr]
        else:
            row_infos = None

        with h5py.File(temp_file, "w") as f_out:
            for chrom in chrom_info.chrom_order:
                f_out.create_dataset(
                    chrom,
                    (
                        math.ceil(
                            chrom_info.chrom_lengths[chrom] / starting_resolution
                        ),
                        num_rows * len(filepaths),
                    ),
                    fillvalue=np.nan,
                    compression="gzip",
                )

            if format == "epilogos":
                cmv.bedfile_to_multivec(
                    filepaths,
                    f_out,
                    epilogos_bedline_to_vector,
                    starting_resolution,
                    has_header,
                    chunk_size,
                )
            elif format == "states":
                # Raised explicitly (rather than via `assert`) so the check
                # also runs under `python -O`; the exception type is kept.
                if row_infos is None:
                    raise AssertionError(
                        "A row_infos file must be provided for --format = 'states' "
                    )
                states_dic = {label: idx for idx, label in enumerate(row_infos)}
                cmv.bedfile_to_multivec(
                    filepaths,
                    f_out,
                    states_bedline_to_vector,
                    starting_resolution,
                    has_header,
                    chunk_size,
                    states_dic,
                )
            else:
                cmv.bedfile_to_multivec(
                    filepaths,
                    f_out,
                    bedline_to_chrom_start_end_vector,
                    starting_resolution,
                    has_header,
                    chunk_size,
                )

        if output_file is None:
            output_file = op.splitext(filepaths[0])[0] + ".multires.mv5"
        print("output_file:", output_file)

        # Override the output file if it exists
        if op.exists(output_file):
            os.remove(output_file)

        # Close the temporary file before the directory is cleaned up.
        with h5py.File(temp_file, "r") as f_in:
            cmv.create_multivec_multires(
                f_in,
                chromsizes=zip(chrom_names, chrom_sizes),
                agg=agg,
                starting_resolution=starting_resolution,
                tile_size=tile_size,
                output_file=output_file,
                row_infos=row_infos,
            )
def _bedfile(
    filepath,
    output_file,
    assembly,
    importance_column,
    has_header,
    chromosome,
    max_per_tile,
    tile_size,
    delimiter,
    chromsizes_filename,
    offset,
):
    """Convert a BED-like file into a tiled SQLite (``.beddb``) database.

    Every parsed line becomes one row of the ``intervals`` table plus one
    row of the ``position_index`` rtree. Entries are placed from most to
    least important; each one goes into the lowest zoom level in which none
    of the tiles visited for it already holds ``max_per_tile`` entries.
    Entries that fit at no zoom level are not stored.

    Parameters
    ----------
    filepath : str
        Input file; a ``.gz`` suffix selects gzip decoding.
    output_file : str or None
        Destination database (default ``filepath + ".beddb"``). An existing
        file is removed.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    importance_column : None, 'size', 'random' or one-based column number
        How the importance of an entry is determined. ``None`` behaves like
        ``'random'``.
    has_header : bool
        Whether the first line holds column names. Otherwise the columns
        are labelled "1", "2", ...
    chromosome : str or None
        If given, only entries on this chromosome are kept.
    max_per_tile : int
        Capacity of a tile at any zoom level.
    tile_size : int
        Width of a tile at the highest zoom level.
    delimiter : str or None
        Field separator passed to ``str.split``.
    offset : int
        Added to both genome coordinates of every entry.

    Returns
    -------
    True on success, None when the chromosome sizes could not be loaded or
    the first data line names an unknown chromosome.
    """
    BEDDB_VERSION = 3

    if output_file is None:
        output_file = filepath + ".beddb"

    if op.exists(output_file):
        os.remove(output_file)

    if filepath.endswith(".gz"):
        import gzip

        bed_file = gzip.open(filepath, "rt")
    else:
        bed_file = open(filepath, "r")

    # `with` makes sure the input is closed on every exit path, including
    # the early `return None` branches below.
    with bed_file:
        try:
            (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
                chromsizes_filename, assembly
            )
        except FileNotFoundError:
            if chromsizes_filename is None:
                print("Assembly not found:", assembly, file=sys.stderr)
            else:
                print(
                    "Chromsizes filename not found:",
                    chromsizes_filename,
                    file=sys.stderr,
                )
            return None

        # fixed seed so that "random" importances are reproducible
        rand = random.Random(3)

        def line_to_np_array(line):
            """
            Convert the fields of a bed file line to the dictionary that is
            later stored as a database row.
            """
            try:
                start = int(line[1])
                stop = int(line[2])
            except ValueError:
                raise ValueError("Error parsing the position, line: {}".format(line))

            chrom = line[0]

            # Normalise reversed intervals first so that a "size" importance
            # is never negative.
            if stop < start:
                print("WARNING: stop < start:", line, file=sys.stderr)
                start, stop = stop, start

            if importance_column is None or importance_column == "random":
                # assume a random importance when no aggregation strategy
                # is given
                importance = rand.random()
            elif importance_column == "size":
                importance = stop - start
            else:
                importance = float(line[int(importance_column) - 1])

            bedline_name = line[3] if len(line) > 3 else ""

            # convert chromosome coordinates to genome coordinates
            genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
            genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset

            return {
                "startPos": genome_start,
                "endPos": genome_end,
                "uid": slugid.nice(),
                "name": bedline_name,
                "chrOffset": genome_start - start,
                "fields": "\t".join(line),
                "importance": importance,
                "chromosome": str(chrom),
            }

        dset = []

        print("delimiter:", delimiter)
        if has_header:
            header = bed_file.readline().strip().split(delimiter)
        else:
            line = bed_file.readline().strip()
            line_parts = line.split(delimiter)
            try:
                dset.append(line_to_np_array(line_parts))
            except KeyError:
                print(
                    f"Unable to find {line_parts[0]} in the list of chromosome sizes. "
                    "Please make sure the correct assembly or chromsizes filename "
                    "is passed in as a parameter",
                    file=sys.stderr,
                )
                return None
            except IndexError:
                print("Invalid line:", line)
            # A real list (not a one-shot `map` iterator) so the header can
            # be both printed and stored.
            header = [str(i) for i in range(1, len(line_parts) + 1)]

        for line in bed_file:
            line_parts = line.strip().split(delimiter)
            try:
                dset.append(line_to_np_array(line_parts))
            except IndexError:
                print("Invalid line:", line)

    if chromosome is not None:
        dset = [d for d in dset if d["chromosome"] == chromosome]

    # We need chromosome information as well as the assembly size to
    # properly tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    def tile_ids(zoom, start, end):
        """Yield the ids of the tiles visited for [start, end) at `zoom`.

        Positions are sampled every tile width beginning at `start`.
        """
        width = tile_size * 2 ** (max_zoom - zoom)
        pos = start
        while pos < end:
            yield "{}.{}".format(zoom, math.floor(pos / width))
            pos += width

    # this script stores data in a sqlite database
    import sqlite3

    sqlite3.register_adapter(np.int64, lambda val: int(val))
    print("output_file:", output_file, "header:", header)
    conn = sqlite3.connect(output_file)

    try:
        # store some meta data
        store_meta_data(
            conn,
            1,
            max_length=assembly_size,
            assembly=assembly,
            chrom_names=chrom_names,
            chrom_sizes=chrom_sizes,
            tile_size=tile_size,
            max_zoom=max_zoom,
            max_width=tile_size * 2 ** max_zoom,
            header=header,
            version=BEDDB_VERSION,
        )

        c = conn.cursor()
        c.execute(
            """
            CREATE TABLE intervals
            (
                id int PRIMARY KEY,
                zoomLevel int,
                importance real,
                startPos int,
                endPos int,
                chrOffset int,
                uid text,
                name text,
                fields text
            )
            """
        )

        c.execute(
            """
            CREATE VIRTUAL TABLE position_index USING rtree(
                id,
                rStartZoomLevel, rEndZoomLevel,
                rStartPos, rEndPos
            )
            """
        )

        print("max_per_tile:", max_per_tile)
        tile_counts = col.defaultdict(int)
        counter = 0

        # go through each entry from most important to least (the sort is
        # stable, so ties keep their file order)
        for value in sorted(dset, key=lambda d: -d["importance"]):
            start, end = value["startPos"], value["endPos"]

            # try to place it in the lowest zoom level and go up from there
            for curr_zoom in range(max_zoom + 1):
                if any(
                    tile_counts[tile_id] >= max_per_tile
                    for tile_id in tile_ids(curr_zoom, start, end)
                ):
                    # no space at this zoom level
                    continue

                for tile_id in tile_ids(curr_zoom, start, end):
                    tile_counts[tile_id] += 1

                c.execute(
                    "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)",
                    (
                        counter,  # primary key
                        curr_zoom,
                        value["importance"],
                        value["startPos"],
                        value["endPos"],
                        value["chrOffset"],
                        value["uid"],
                        value["name"],
                        value["fields"],
                    ),
                )

                if counter % 1000 == 0:
                    print("counter:", counter, value["endPos"] - value["startPos"])

                c.execute(
                    "INSERT INTO position_index VALUES (?,?,?,?,?)",
                    # counter doubles as the primary key
                    (counter, curr_zoom, curr_zoom, value["startPos"], value["endPos"]),
                )

                counter += 1
                break

        conn.commit()
    finally:
        conn.close()

    return True
def _bedpe(
    filepath,
    output_file=None,
    assembly=None,
    importance_column="random",
    has_header=False,
    max_per_tile=100,
    tile_size=1024,
    chromosome=None,
    chromsizes_filename=None,
    chr1_col=1,
    from1_col=2,
    to1_col=3,
    chr2_col=4,
    from2_col=5,
    to2_col=6,
    max_zoom=None,
    sqlite_cache_size=500,  # 500 MB
    sqlite_batch_size=100000,
    verbose=0,
):
    """Convert a BEDPE-like file into a tiled 2D SQLite (``.bedpedb``) database.

    Each line describes a pair of intervals. Entries are sorted by
    decreasing importance and assigned to the lowest zoom level in which no
    covered 2D tile is considered full, then written in batches to the
    ``intervals`` table and the ``position_index`` rtree.

    Parameters
    ----------
    filepath : str
        Input file, ``"-"`` for stdin; a ``.gz`` suffix selects gzip.
    output_file : str or None
        Destination; by default derived from ``filepath`` (``.gz`` and the
        last extension stripped, ``.bedpedb`` appended). An existing file
        is removed.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    importance_column : None, 'random' or one-based column number
        ``None`` uses the larger of the two interval lengths.
    has_header : bool
        Skip the first line when True.
    max_per_tile, tile_size : int
        Tile capacity and width of a tile at the highest zoom level.
    chromosome : str or None
        Keep only entries where either side lies on this chromosome.
    chr1_col ... to2_col : int
        One-based column numbers of the six coordinate fields.
    max_zoom
        NOTE(review): currently ignored; the value is always recomputed
        from the assembly size and ``tile_size``.
    sqlite_cache_size : int
        SQLite page cache size in megabytes.
    sqlite_batch_size : int
        Number of buffered rows that triggers a batch insert.
    verbose : int
        Values > 0 enable progress output.

    Raises
    ------
    KeyError
        If a chromosome of a line is not part of the assembly.
    ValueError
        If a coordinate of the first line is not an integer.
    """
    BED2DDB_VERSION = 1

    if verbose > 0:
        print(f"BEDPEDB Version {BED2DDB_VERSION}")

    if filepath == "-":
        f = sys.stdin
    elif filepath.endswith(".gz"):
        f = gzip.open(filepath, "rt")
    else:
        f = open(filepath, "r")

    try:
        if output_file is None:
            output_file = filepath
            if filepath.endswith(".gz"):
                output_file = os.path.splitext(output_file)[0]
            output_file = os.path.splitext(output_file)[0] + ".bedpedb"

        if op.exists(output_file):
            os.remove(output_file)

        chrom_info, chrom_names, chrom_sizes = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        def line_to_dict(line):
            """Parse one (stripped) input line into an entry dictionary."""
            parts = line.split()
            d = {}
            try:
                chrom1 = parts[chr1_col - 1]
                chrom2 = parts[chr2_col - 1]

                chrom1_offset = chrom_info.cum_chrom_lengths[chrom1]
                chrom2_offset = chrom_info.cum_chrom_lengths[chrom2]

                d["xs"] = [
                    chrom1_offset + int(parts[from1_col - 1]),
                    chrom1_offset + int(parts[to1_col - 1]),
                ]
                d["ys"] = [
                    chrom2_offset + int(parts[from2_col - 1]),
                    chrom2_offset + int(parts[to2_col - 1]),
                ]
            except KeyError:
                error_str = (
                    "ERROR converting chromosome position to genome position. "
                    "Please make sure you've specified the correct assembly "
                    "using the --assembly option or a chromsizes file using the "
                    "--chromsizes-filename option. "
                    "Current assembly: {}, chromosomes: {},{}".format(
                        assembly, parts[chr1_col - 1], parts[chr2_col - 1]
                    )
                )
                raise KeyError(error_str)

            d["uid"] = slugid.nice()
            d["chrOffset"] = d["xs"][0] - int(parts[from1_col - 1])
            d["chrom1"] = str(chrom1)
            d["chrom2"] = str(chrom2)

            if importance_column is None:
                d["importance"] = max(d["xs"][1] - d["xs"][0], d["ys"][1] - d["ys"][0])
            elif importance_column == "random":
                d["importance"] = random.random()
            else:
                # We seem to use one-based numbering for columns...
                d["importance"] = float(parts[int(importance_column) - 1])

            d["fields"] = line
            return d

        entries = []

        if has_header:
            f.readline()
        else:
            first_line = f.readline().strip()
            # An empty input (or a blank first line) is skipped instead of
            # failing with an IndexError.
            if first_line:
                parts = first_line.split()
                try:
                    int(parts[from1_col - 1])
                    int(parts[to1_col - 1])
                    int(parts[from2_col - 1])
                    int(parts[to2_col - 1])
                except ValueError:
                    error_str = (
                        "Couldn't convert one of the bedpe coordinates to an "
                        "integer. If the input file contains a header, make sure to "
                        "indicate that with the --has-header option. Line: {}".format(
                            first_line
                        )
                    )
                    raise ValueError(error_str)
                entries.append(line_to_dict(first_line))

        for raw_line in f:
            line = raw_line.strip()
            if line:
                entries.append(line_to_dict(line))
    finally:
        # never close the interpreter's stdin
        if f is not sys.stdin:
            f.close()

    if chromosome is not None:
        entries = [
            d for d in entries if d["chrom1"] == chromosome or d["chrom2"] == chromosome
        ]

    if verbose > 0:
        print(f"Found {len(entries)} entries")

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file, isolation_level=None)

    try:
        # store some meta data
        store_meta_data(
            conn,
            1,
            max_length=assembly_size,
            assembly=assembly,
            chrom_names=chrom_names,
            chrom_sizes=chrom_sizes,
            tile_size=tile_size,
            max_zoom=max_zoom,
            max_width=tile_size * 2 ** max_zoom,
            version=BED2DDB_VERSION,
        )

        c = conn.cursor()
        c.execute("PRAGMA synchronous = OFF;")
        c.execute("PRAGMA journal_mode = OFF;")
        # A negative cache_size is interpreted by SQLite as a size in KiB,
        # a positive one as a number of pages. The parameter is given in
        # megabytes, so the negative form is required.
        c.execute(f"PRAGMA cache_size = -{int(sqlite_cache_size * 1000)};")

        c.execute(
            """
            CREATE TABLE intervals
            (
                id int PRIMARY KEY,
                zoomLevel int,
                importance real,
                fromX int,
                toX int,
                fromY int,
                toY int,
                chrOffset int,
                uid text,
                fields text
            )
            """
        )

        c.execute(
            """
            CREATE VIRTUAL TABLE position_index USING rtree(
                id,
                rFromX, rToX,
                rFromY, rToY
            )
            """
        )

        counter = 0

        # tile_counts[zoom][x_tile][y_tile] -> number of entries placed
        tile_counts = col.defaultdict(
            lambda: col.defaultdict(lambda: col.defaultdict(int))
        )

        # Sort from high to low importance
        entries = sorted(entries, key=lambda x: -x["importance"])

        interval_inserts = []
        position_index_inserts = []

        def batch_insert(conn, c, interval_inserts, position_index_inserts):
            """Write the buffered rows in one transaction and empty the buffers."""
            if verbose > 0:
                print(f"Insert batch ({counter})")

            with transaction(conn):
                c.executemany(
                    "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)",
                    interval_inserts,
                )
                c.executemany(
                    "INSERT INTO position_index VALUES (?,?,?,?,?)",
                    position_index_inserts,
                )

            interval_inserts.clear()
            position_index_inserts.clear()

        for d in entries:
            for curr_zoom in range(max_zoom + 1):
                tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
                x_tiles = range(
                    int(d["xs"][0] / tile_width), int(d["xs"][1] / tile_width) + 1
                )
                y_tiles = range(
                    int(d["ys"][0] / tile_width), int(d["ys"][1] / tile_width) + 1
                )
                level_counts = tile_counts[curr_zoom]

                # go through and check if any of the tiles at this zoom
                # level are full
                # NOTE(review): with `>` a tile accepts max_per_tile + 1
                # entries before it counts as full - confirm this is
                # intended.
                if any(
                    level_counts[i][j] > max_per_tile for i in x_tiles for j in y_tiles
                ):
                    continue

                # none is full so add this interval to this zoom level
                for i in x_tiles:
                    for j in y_tiles:
                        level_counts[i][j] += 1

                interval_inserts.append(
                    (
                        counter,
                        curr_zoom,
                        d["importance"],
                        d["xs"][0],
                        d["xs"][1],
                        d["ys"][0],
                        d["ys"][1],
                        d["chrOffset"],
                        d["uid"],
                        d["fields"],
                    )
                )
                position_index_inserts.append(
                    (counter, d["xs"][0], d["xs"][1], d["ys"][0], d["ys"][1])
                )

                counter += 1
                break

            if len(interval_inserts) >= sqlite_batch_size:
                batch_insert(conn, c, interval_inserts, position_index_inserts)

        batch_insert(conn, c, interval_inserts, position_index_inserts)
        c.close()
    finally:
        conn.close()

    return
def _bedfile(
    filepath,
    output_file,
    assembly,
    importance_column,
    has_header,
    chromosome,
    max_per_tile,
    tile_size,
    delimiter,
    chromsizes_filename,
    offset,
):
    """Convert a BED-like file into a tiled SQLite (``.beddb``) database.

    Every valid line becomes one row of the ``intervals`` table plus one row
    of the ``position_index`` rtree. Entries are placed from most to least
    important; each one goes into the lowest zoom level in which none of the
    tiles visited for it already holds ``max_per_tile`` entries. Entries
    that fit at no zoom level are not stored.

    Parameters
    ----------
    filepath : str
        Input file; a ``.gz`` suffix selects gzip decoding.
    output_file : str or None
        Destination database (default ``filepath + ".beddb"``). An existing
        file is removed.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    importance_column : None, 'size', 'random' or one-based column number
        How the importance of an entry is determined. ``None`` behaves like
        ``'random'``.
    has_header : bool
        Whether the first line holds column names. Otherwise the columns
        are labelled "1", "2", ...
    chromosome : str or None
        If given, only entries on this chromosome are kept.
    max_per_tile : int
        Capacity of a tile at any zoom level.
    tile_size : int
        Width of a tile at the highest zoom level.
    delimiter : str or None
        Field separator passed to ``str.split``.
    offset : int
        Added to both genome coordinates of every entry.
    """
    if output_file is None:
        output_file = filepath + ".beddb"

    if op.exists(output_file):
        os.remove(output_file)

    if filepath.endswith(".gz"):
        import gzip

        bed_file = gzip.open(filepath, "rt")
    else:
        bed_file = open(filepath, "r")

    # `with` closes the input on every exit path.
    with bed_file:
        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        # fixed seed so that "random" importances are reproducible
        rand = random.Random(3)

        def line_to_np_array(line):
            """
            Convert the fields of a bed file line to the dictionary that is
            later stored as a database row. Returns None (after printing a
            warning) for lines whose stop lies before their start.
            """
            try:
                start = int(line[1])
                stop = int(line[2])
            except ValueError:
                raise ValueError("Error parsing the position, line: {}".format(line))

            chrom = line[0]

            if importance_column is None or importance_column == "random":
                # assume a random importance when no aggregation strategy
                # is given
                importance = rand.random()
            elif importance_column == "size":
                importance = stop - start
            else:
                # float() also accepts integer text and matches the `real`
                # column the value is stored in
                importance = float(line[int(importance_column) - 1])

            if stop < start:
                print("WARNING: stop < start:", line, file=sys.stderr)
                return None

            # convert chromosome coordinates to genome coordinates
            genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
            genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset

            # slugid.nice() returns bytes or str depending on the installed
            # slugid version
            uid = slugid.nice()
            if isinstance(uid, bytes):
                uid = uid.decode("utf-8")

            return {
                "startPos": genome_start,
                "endPos": genome_end,
                "uid": uid,
                "chrOffset": genome_start - start,
                "fields": "\t".join(line),
                "importance": importance,
                "chromosome": str(chrom),
            }

        dset = []

        def add_line(line):
            """Parse `line` and store the entry unless it was rejected."""
            try:
                entry = line_to_np_array(line.strip().split(delimiter))
            except IndexError:
                print("Invalid line:", line)
                return
            # Rejected (reversed) intervals yield None and must not end up
            # in dset, where they would break the processing below.
            if entry is not None:
                dset.append(entry)

        print("delimiter:", delimiter)
        if has_header:
            header = bed_file.readline().strip().split(delimiter)
        else:
            line = bed_file.readline().strip()
            add_line(line)
            # A real list (not a one-shot `map` iterator) so the header can
            # be safely re-used.
            header = [str(i) for i in range(1, len(line.split(delimiter)) + 1)]

        for line in bed_file:
            add_line(line)

    if chromosome is not None:
        dset = [d for d in dset if d["chromosome"] == chromosome]

    # We need chromosome information as well as the assembly size to
    # properly tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    def tile_ids(zoom, start, end):
        """Yield the ids of the tiles visited for [start, end) at `zoom`.

        Positions are sampled every tile width beginning at `start`.
        """
        width = tile_size * 2 ** (max_zoom - zoom)
        pos = start
        while pos < end:
            yield "{}.{}".format(zoom, math.floor(pos / width))
            pos += width

    # this script stores data in a sqlite database
    import sqlite3

    sqlite3.register_adapter(np.int64, lambda val: int(val))
    print("output_file:", output_file)
    conn = sqlite3.connect(output_file)

    try:
        # store some meta data
        store_meta_data(
            conn,
            1,
            max_length=assembly_size,
            assembly=assembly,
            chrom_names=chrom_names,
            chrom_sizes=chrom_sizes,
            tile_size=tile_size,
            max_zoom=max_zoom,
            max_width=tile_size * 2 ** max_zoom,
            header=header,
        )

        c = conn.cursor()
        c.execute(
            """
            CREATE TABLE intervals
            (
                id int PRIMARY KEY,
                zoomLevel int,
                importance real,
                startPos int,
                endPos int,
                chrOffset int,
                uid text,
                fields text
            )
            """
        )

        c.execute(
            """
            CREATE VIRTUAL TABLE position_index USING rtree(
                id,
                rStartPos, rEndPos
            )
            """
        )

        print("max_per_tile:", max_per_tile)
        tile_counts = col.defaultdict(int)
        counter = 0

        # go through each entry from most important to least (the sort is
        # stable, so ties keep their file order)
        for value in sorted(dset, key=lambda d: -d["importance"]):
            start, end = value["startPos"], value["endPos"]

            # try to place it in the lowest zoom level and go up from there
            for curr_zoom in range(max_zoom + 1):
                if any(
                    tile_counts[tile_id] >= max_per_tile
                    for tile_id in tile_ids(curr_zoom, start, end)
                ):
                    # no space at this zoom level
                    continue

                for tile_id in tile_ids(curr_zoom, start, end):
                    tile_counts[tile_id] += 1

                c.execute(
                    "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?)",
                    (
                        counter,  # primary key
                        curr_zoom,
                        value["importance"],
                        value["startPos"],
                        value["endPos"],
                        value["chrOffset"],
                        value["uid"],
                        value["fields"],
                    ),
                )

                if counter % 1000 == 0:
                    print("counter:", counter, value["endPos"] - value["startPos"])

                c.execute(
                    "INSERT INTO position_index VALUES (?,?,?)",
                    # counter doubles as the primary key
                    (counter, value["startPos"], value["endPos"]),
                )

                counter += 1
                break

        conn.commit()
    finally:
        conn.close()
def _bedpe(
    filepath,
    output_file,
    assembly,
    importance_column,
    has_header,
    max_per_tile,
    tile_size,
    max_zoom=None,
    chromosome=None,
    chromsizes_filename=None,
    chr1_col=0,
    from1_col=1,
    to1_col=2,
    chr2_col=3,
    from2_col=4,
    to2_col=5,
):
    """Convert a BEDPE-like file into a tiled 2D SQLite database.

    Each line describes a pair of intervals. Entries are sorted by
    decreasing importance and assigned to the lowest zoom level in which no
    covered 2D tile is considered full; they are stored in the ``intervals``
    table and the ``position_index`` rtree.

    Parameters
    ----------
    filepath : str
        Input file, ``"-"`` for stdin; a ``.gz`` suffix selects gzip.
    output_file : str or None
        Destination (default ``filepath + ".multires.db"``). An existing
        file is removed.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    importance_column : None, 'random' or one-based column number
        ``None`` uses the larger of the two interval lengths.
    has_header : bool
        Skip the first line when True.
    max_per_tile, tile_size : int
        Tile capacity and width of a tile at the highest zoom level.
    max_zoom
        NOTE(review): currently ignored; the value is always recomputed
        from the assembly size and ``tile_size``.
    chromosome
        Accepted for interface compatibility; not used in this function.
    chr1_col ... to2_col : int
        Zero-based indices of the six coordinate fields (unlike
        ``importance_column``, which is one-based).

    Raises
    ------
    KeyError
        If a chromosome of a line is not part of the assembly.
    ValueError
        If a coordinate of the first line is not an integer.
    """
    print("output_file:", output_file)

    if filepath == "-":
        f = sys.stdin
    elif filepath.endswith(".gz"):
        f = gzip.open(filepath, "rt")
    else:
        print("plain")
        f = open(filepath, "r")

    try:
        if output_file is None:
            output_file = filepath + ".multires.db"

        if op.exists(output_file):
            os.remove(output_file)

        (chrom_info, chrom_names, chrom_sizes) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        def line_to_dict(line):
            """Parse one (stripped) input line into an entry dictionary."""
            parts = line.split()
            d = {}
            try:
                offset1 = chrom_info.cum_chrom_lengths[parts[chr1_col]]
                d["xs"] = [
                    offset1 + int(parts[from1_col]),
                    offset1 + int(parts[to1_col]),
                ]
                offset2 = chrom_info.cum_chrom_lengths[parts[chr2_col]]
                d["ys"] = [
                    offset2 + int(parts[from2_col]),
                    offset2 + int(parts[to2_col]),
                ]
            except KeyError:
                error_str = (
                    "ERROR converting chromosome position to genome position. "
                    "Please make sure you've specified the correct assembly "
                    "using the --assembly option. "
                    "Current assembly: {}, chromosomes: {},{}".format(
                        assembly, parts[chr1_col], parts[chr2_col]
                    )
                )
                raise KeyError(error_str)

            # slugid.nice() returns bytes or str depending on the installed
            # slugid version
            uid = slugid.nice()
            if isinstance(uid, bytes):
                uid = uid.decode("utf-8")
            d["uid"] = uid

            d["chrOffset"] = d["xs"][0] - int(parts[from1_col])

            if importance_column is None:
                d["importance"] = max(d["xs"][1] - d["xs"][0], d["ys"][1] - d["ys"][0])
            elif importance_column == "random":
                d["importance"] = random.random()
            else:
                # We seem to use one-based numbering for columns...
                d["importance"] = float(parts[int(importance_column) - 1])

            d["fields"] = line
            return d

        entries = []

        if has_header:
            f.readline()
        else:
            first_line = f.readline().strip()
            # An empty input (or a blank first line) is skipped instead of
            # failing with an IndexError.
            if first_line:
                parts = first_line.split()
                try:
                    int(parts[from1_col])
                    int(parts[to1_col])
                    int(parts[from2_col])
                    int(parts[to2_col])
                except ValueError:
                    error_str = (
                        "Couldn't convert one of the bedpe coordinates to an "
                        "integer. If the input file contains a header, make sure to "
                        "indicate that with the --has-header option. Line: {}".format(
                            first_line
                        )
                    )
                    raise ValueError(error_str)
                entries.append(line_to_dict(first_line))

        for raw_line in f:
            line = raw_line.strip()
            # blank lines (e.g. a trailing newline) carry no entry
            if line:
                entries.append(line_to_dict(line))
    finally:
        # never close the interpreter's stdin
        if f is not sys.stdin:
            f.close()

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file)

    try:
        # store some meta data
        store_meta_data(
            conn,
            1,
            max_length=assembly_size,
            assembly=assembly,
            chrom_names=chrom_names,
            chrom_sizes=chrom_sizes,
            tile_size=tile_size,
            max_zoom=max_zoom,
            max_width=tile_size * 2 ** max_zoom,
        )

        c = conn.cursor()
        c.execute(
            """
            CREATE TABLE intervals
            (
                id int PRIMARY KEY,
                zoomLevel int,
                importance real,
                fromX int,
                toX int,
                fromY int,
                toY int,
                chrOffset int,
                uid text,
                fields text
            )
            """
        )

        print("creating rtree")
        c.execute(
            """
            CREATE VIRTUAL TABLE position_index USING rtree(
                id,
                rFromX, rToX,
                rFromY, rToY
            )
            """
        )

        # tile_counts[zoom][x_tile][y_tile] -> number of entries placed
        tile_counts = col.defaultdict(
            lambda: col.defaultdict(lambda: col.defaultdict(int))
        )

        # most important entries first
        entries = sorted(entries, key=lambda x: -x["importance"])

        counter = 0
        for d in entries:
            for curr_zoom in range(max_zoom + 1):
                tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
                x_tiles = range(
                    int(d["xs"][0] / tile_width), int(d["xs"][1] / tile_width) + 1
                )
                y_tiles = range(
                    int(d["ys"][0] / tile_width), int(d["ys"][1] / tile_width) + 1
                )
                level_counts = tile_counts[curr_zoom]

                # go through and check if any of the tiles at this zoom
                # level are full
                # NOTE(review): with `>` a tile accepts max_per_tile + 1
                # entries before it counts as full - confirm this is
                # intended.
                if any(
                    level_counts[i][j] > max_per_tile for i in x_tiles for j in y_tiles
                ):
                    continue

                # none is full so add this interval to this zoom level
                for i in x_tiles:
                    for j in y_tiles:
                        level_counts[i][j] += 1

                c.execute(
                    "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)",
                    (
                        counter,
                        curr_zoom,
                        d["importance"],
                        d["xs"][0],
                        d["xs"][1],
                        d["ys"][0],
                        d["ys"][1],
                        d["chrOffset"],
                        d["uid"],
                        d["fields"],
                    ),
                )
                c.execute(
                    "INSERT INTO position_index VALUES (?,?,?,?,?)",
                    # counter doubles as the primary key
                    (counter, d["xs"][0], d["xs"][1], d["ys"][0], d["ys"][1]),
                )

                counter += 1
                break

        # One commit for the whole import instead of two per inserted row.
        conn.commit()
    finally:
        conn.close()

    return
def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    """Combine several bigWig files into one multi-resolution multivec file.

    A temporary HDF5 file with one NaN-filled dataset per chromosome (one
    column per input file, resolution 1) is filled from the bigWig files
    and then aggregated with ``cmv.create_multivec_multires``.

    Parameters
    ----------
    filepaths : list of str
        Input files; entries for which ``bbi.is_bigwig`` is false are
        reported and their column stays NaN.
    output_file : str
        Destination passed to ``cmv.create_multivec_multires``.
    assembly, chromsizes_filename
        Forwarded to ``cch.load_chromsizes``.
    row_infos_filename : str or None
        Optional file with one row label per line; each stripped line is
        utf8-encoded and passed on as ``row_infos``.
    tile_size : int
        Forwarded to ``cmv.create_multivec_multires``.
    """

    def agg(x):
        # sum adjacent pairs of rows
        return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)
        temp_file = op.join(td, "temp.mv5")

        (chrom_info, chrom_names, chrom_lengths) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]
        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution

        # Context managers make sure both HDF5 handles are closed before
        # the temporary directory is removed, even on errors.
        with h5py.File(temp_file, "w") as f_out:
            for chrom in chrom_info.chrom_order:
                f_out.create_dataset(
                    chrom,
                    (
                        math.ceil(
                            chrom_info.chrom_lengths[chrom] / starting_resolution
                        ),
                        len(filepaths),
                    ),
                    fillvalue=np.nan,
                    compression="gzip",
                )

            # Fill in data for each bigwig file.
            for bw_index, bw_file in tqdm(list(enumerate(filepaths)), desc="bigwigs"):
                if bbi.is_bigwig(bw_file):
                    chromsizes = bbi.chromsizes(bw_file)
                    matching_chromosomes = set(chromsizes.keys()).intersection(
                        set(chrom_names)
                    )

                    # Sorted so that processing order (and the log output)
                    # is the same on every run.
                    for chr_name in sorted(matching_chromosomes):
                        print("chr_name:", chr_name, resolution)

                        chr_len = chrom_info.chrom_lengths[chr_name]
                        num_bins = math.ceil(chr_len / resolution)

                        arr = bbi.fetch(
                            bw_file, chr_name, 0, chr_len, num_bins, summary="sum"
                        )
                        f_out[chr_name][:, bw_index] = arr
                else:
                    print(f"{bw_file} not is_bigwig")

                f_out.flush()

        with h5py.File(temp_file, "r") as f_in:
            cmv.create_multivec_multires(
                f_in,
                chromsizes=zip(chrom_names, chrom_lengths),
                agg=agg,
                starting_resolution=starting_resolution,
                tile_size=tile_size,
                output_file=output_file,
                row_infos=row_infos,
            )