def chromsizes_negspy_to_zarr(assembly, output, has_header):
    chrom_order = nc.get_chromorder(assembly)
    chrom_info = nc.get_chrominfo(assembly)
    chrom_rows = [{
        0: chrom_name,
        1: chrom_info.chrom_lengths[chrom_name]
    } for chrom_name in chrom_order]
    df = pd.DataFrame(columns=[0, 1], data=chrom_rows)

    num_chroms = df.shape[0]
    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values

    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)
    z.create_dataset("names", shape=(num_chroms, ),
                     dtype=f"S{max_name_len}", compressor=compressor)
    z.create_dataset("sizes", shape=(num_chroms, ),
                     dtype="u4", compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
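# Usage sketch (an assumption, not part of the original module): write the
# hg19 chromsizes to a Zarr store and read the parallel "names"/"sizes"
# arrays back. The output path is hypothetical; module-level imports of
# negspy.coordinates as nc, pandas as pd, zarr, and numcodecs.Zlib are
# assumed, matching the names used above.
def _demo_chromsizes_zarr():
    import zarr
    chromsizes_negspy_to_zarr('hg19', '/tmp/hg19.chromsizes.zarr', False)
    z = zarr.open('/tmp/hg19.chromsizes.zarr', mode='r')
    # names are fixed-width bytes (S<max_name_len>), sizes are uint32
    for name, size in zip(z['names'][:5], z['sizes'][:5]):
        print(name.decode('utf-8'), int(size))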
def load_chromsizes(chromsizes_filename, assembly=None):
    """
    Load a set of chromosomes from a file or using an assembly identifier.
    If using just an assembly identifier, the chromsizes will be loaded
    from the negspy repository.

    Parameters
    ----------
    chromsizes_filename: string
        The file containing the tab-delimited chromosome sizes
    assembly: string
        Assembly name (e.g. 'hg19'). Not necessary if a
        chromsizes_filename is passed in.
    """
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_names = chrom_info.chrom_order
        chrom_sizes = [
            chrom_info.chrom_lengths[c] for c in chrom_info.chrom_order
        ]
    else:
        if assembly is None:
            raise ValueError("No assembly or chromsizes specified")

        chrom_info = nc.get_chrominfo(assembly)
        chrom_names = nc.get_chromorder(assembly)
        chrom_sizes = nc.get_chromsizes(assembly)

    return (chrom_info, chrom_names, chrom_sizes)
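# Usage sketch (hypothetical, not from the original source): the two call
# styles supported by load_chromsizes. The file path and assembly name are
# examples only.
def _demo_load_chromsizes():
    # from a tab-delimited chromsizes file
    chrom_info, names, sizes = load_chromsizes('/tmp/hg19.chrom.sizes')
    # or from a negspy assembly identifier
    chrom_info, names, sizes = load_chromsizes(None, assembly='hg19')
    print(names[0], sizes[0], chrom_info.total_length)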
def main():
    parser = argparse.ArgumentParser(description="""
    python chrom_sizes.py assembly

    Print the chromosome sizes for the given assembly.
""")

    parser.add_argument('assembly')
    args = parser.parse_args()

    # look up the chromosome info once rather than on every iteration
    chrom_info = nc.get_chrominfo(args.assembly)
    for chrom in nc.get_chromorder(args.assembly):
        print(chrom + "\t" + str(chrom_info.chrom_lengths[chrom]))
def _bigwig(filepath,
            chunk_size=14,
            zoom_step=8,
            tile_size=1024,
            output_file=None,
            assembly='hg19',
            chromsizes_filename=None,
            chromosome=None):
    last_end = 0
    data = []

    if output_file is None:
        if chromosome is None:
            output_file = op.splitext(filepath)[0] + '.hitile'
        else:
            output_file = op.splitext(filepath)[0] + '.' + chromosome + '.hitile'

    # overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)

    f = h5py.File(output_file, 'w')

    # get the information about the chromosomes in this assembly
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    print("chrom_order:", chrom_order)
    assembly_size = chrom_info.total_length

    # how many values to read in at once while tiling
    chunk_size = tile_size * 2**chunk_size

    dsets = []      # data sets at each zoom level
    nan_dsets = []  # counts of NaN values at each zoom level

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # how far we've filled in each dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f', compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f', compression='gzip')
        ]
        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    # load the bigWig file
    bwf = pbw.open(filepath)

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')

    if chromosome is not None:
        d.attrs['min-pos'] = chrom_info.cum_chrom_lengths[chromosome]
        d.attrs['max-pos'] = (chrom_info.cum_chrom_lengths[chromosome] +
                              bwf.chroms()[chromosome])
    else:
        d.attrs['min-pos'] = 0
        d.attrs['max-pos'] = assembly_size

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()
    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print("position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}"
              .format(positions[curr_zoom] + 1, percent_progress, curr_time,
                      curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

            dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = curr_chunk
            nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                                 chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    # do we only want values from a single chromosome?
    if chromosome is not None:
        chroms_to_use = [chromosome]
    else:
        chroms_to_use = chrom_order

    for chrom in chroms_to_use:
        print("chrom:", chrom)

        counter = 0
        chrom_size = chrom_info.chrom_lengths[chrom]
        d.attrs['max-position'] += chrom_size

        while counter < chrom_size:
            remaining = min(chunk_size, chrom_size - counter)

            if chrom not in bwf.chroms():
                # a chromosome in the assembly that's absent from the
                # bigWig file: fill its extent with NaNs
                values = [np.nan] * remaining
                nan_values = [1] * remaining
            else:
                values = bwf.values(chrom, counter, counter + remaining)
                nan_values = np.isnan(values).astype('i4')

            counter += remaining
            add_values_to_data_buffers(list(values), list(nan_values))

    # flush the remaining partial chunks through every zoom level
    while True:
        # whatever is left in the buffer is the final (partial) chunk
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                         chunk_size] = curr_chunk
        nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = nan_curr_chunk

        # aggregate and store aggregated values in the next zoom level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break

    # still need to take care of the last chunk
    data = np.array(data)
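# Usage sketch (hypothetical path, not part of the original source): read
# back one tile's worth of values from the .hitile file written above.
# Stored zoom levels are the datasets values_0, values_<zoom_step>, ...,
# and tiles are tile_size values wide within a stored level.
def _demo_read_hitile(path='/tmp/sample.hitile'):
    import h5py
    import numpy as np
    with h5py.File(path, 'r') as f:
        meta = f['meta'].attrs
        tile_size = int(meta['tile-size'])
        print('max zoom:', meta['max-zoom'], 'max width:', meta['max-width'])
        # the first tile of the highest-resolution stored level
        first_tile = f['values_0'][:tile_size]
        print('first tile mean (ignoring NaNs):', np.nanmean(first_tile))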
def _bedfile(filepath, output_file, assembly, importance_column, has_header,
             chromosome, max_per_tile, tile_size, delimiter,
             chromsizes_filename, offset):
    if output_file is None:
        output_file = filepath + ".multires"

    if op.exists(output_file):
        os.remove(output_file)

    bed_file = open(filepath, 'r')

    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_names = chrom_info.chrom_order
        chrom_sizes = [
            chrom_info.chrom_lengths[c] for c in chrom_info.chrom_order
        ]
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_names = nc.get_chromorder(assembly)
        chrom_sizes = nc.get_chromsizes(assembly)

    print("chrom_names:", chrom_info.chrom_order)
    print("chrom_sizes:", chrom_sizes)

    def line_to_dict(line):
        '''
        Convert a bed file line to a dict which can later be stored
        as an entry in the output sqlite file.
        '''
        try:
            start = int(line[1])
            stop = int(line[2])
        except ValueError:
            raise ValueError(
                "Error parsing the position, line: {}".format(line))

        chrom = line[0]

        if importance_column is None:
            # by default, longer entries are more important
            importance = stop - start
        elif importance_column == 'random':
            importance = random.random()
        else:
            importance = int(line[int(importance_column) - 1])

        # convert chromosome coordinates to genome coordinates
        genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
        genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset
        pos_offset = genome_start - start

        parts = {
            'startPos': genome_start,
            'endPos': genome_end,
            'uid': slugid.nice().decode('utf-8'),
            'chrOffset': pos_offset,
            'fields': '\t'.join(line),
            'importance': importance,
            'chromosome': str(chrom)
        }

        return parts

    dset = []

    if has_header:
        line = bed_file.readline()
        header = line.strip().split(delimiter)
    else:
        line = bed_file.readline().strip()
        dset += [line_to_dict(line.strip().split(delimiter))]
        # no header: number the columns 1..n
        header = list(map(str,
                          range(1, len(line.strip().split(delimiter)) + 1)))
    print("header:", header)

    for line in bed_file:
        dset += [line_to_dict(line.strip().split(delimiter))]

    if chromosome is not None:
        dset = [d for d in dset if d['chromosome'] == chromosome]

    # we need chromosome information as well as the assembly size to
    # properly tile this data
    assembly_size = chrom_info.total_length + 1
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    import sqlite3
    sqlite3.register_adapter(np.int64, lambda val: int(val))

    print("output_file:", output_file)
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(conn, 1,
                    max_length=assembly_size,
                    assembly=assembly,
                    chrom_names=chrom_names,
                    chrom_sizes=chrom_sizes,
                    tile_size=tile_size,
                    max_zoom=max_zoom,
                    max_width=tile_size * 2**max_zoom,
                    header=header)

    max_width = tile_size * 2**max_zoom
    uid_to_entry = {}

    intervals = []

    # store each bed file entry as an interval
    for d in dset:
        uid = d['uid']
        uid_to_entry[uid] = d
        intervals += [(d['startPos'], d['endPos'], uid)]

    tile_width = tile_size

    c = conn.cursor()
    c.execute('''
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            startPos int,
            endPos int,
            chrOffset int,
            uid text,
            fields text
        )
        ''')

    c.execute('''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rStartPos, rEndPos
        )
        ''')

    curr_zoom = 0
    counter = 0
    max_viewable_zoom = max_zoom

    while curr_zoom <= max_viewable_zoom and len(intervals) > 0:
        # at each zoom level, add the most important remaining entries
        tile_width = tile_size * 2**(max_zoom - curr_zoom)

        for tile_num in range(max_width // tile_width):
            # go over each tile and distribute the remaining values
            from_value = tile_num * tile_width
            to_value = (tile_num + 1) * tile_width
            entries = [
                i for i in intervals if (i[0] < to_value and i[1] > from_value)
            ]

            # the importance is always the last column; take the negative
            # because we want to prioritize higher values
            values_in_tile = sorted(
                entries,
                key=lambda x: -uid_to_entry[x[-1]]['importance'])[:max_per_tile]

            if len(values_in_tile) > 0:
                for v in values_in_tile:
                    counter += 1
                    value = uid_to_entry[v[-1]]

                    # one extra question mark for the primary key
                    exec_statement = 'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?)'
                    c.execute(
                        exec_statement,
                        # primary key, zoomLevel, importance, startPos,
                        # endPos, chrOffset, uid, fields
                        (counter, curr_zoom, value['importance'],
                         value['startPos'], value['endPos'],
                         value['chrOffset'], value['uid'], value['fields']))
                    conn.commit()

                    exec_statement = 'INSERT INTO position_index VALUES (?,?,?)'
                    c.execute(
                        exec_statement,
                        # the same primary key links the rtree entry back
                        # to the intervals table
                        (counter, value['startPos'], value['endPos']))
                    conn.commit()

                    intervals.remove(v)
        curr_zoom += 1

    conn.commit()
    conn.close()
    return
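# Usage sketch (an assumption, not part of the original source): query the
# sqlite file produced by _bedfile for the intervals visible in one tile.
# The tile math mirrors the store conventions above; the path and the
# max_zoom default are hypothetical.
def _demo_query_bed_tiles(db_path='/tmp/exons.bed.multires',
                          zoom_level=0, tile_pos=0,
                          tile_size=1024, max_zoom=22):
    import sqlite3
    tile_width = tile_size * 2 ** (max_zoom - zoom_level)
    from_pos, to_pos = tile_pos * tile_width, (tile_pos + 1) * tile_width
    conn = sqlite3.connect(db_path)
    # the rtree index stores each interval's genome-coordinate extent
    rows = conn.execute(
        'SELECT i.uid, i.fields FROM intervals i '
        'JOIN position_index p ON i.id = p.id '
        'WHERE i.zoomLevel <= ? AND p.rStartPos < ? AND p.rEndPos > ?',
        (zoom_level, to_pos, from_pos)).fetchall()
    conn.close()
    return rows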
def main():
    parser = argparse.ArgumentParser(description="""
    python chr_pos_to_genome_pos.py -c 1,2:3,4

    Convert chromosome,position pairs to genome positions. Assumes that the
    coordinates refer to the hg19 assembly (unless otherwise specified).

    Example:

    2 NM_000014 chr12 - 9220303 9268825

    -> python scripts/chr_pos_to_genome_pos.py -c 3:5,3:6

    2 NM_000014 genome - 2115405269 2115453791

    --------------------------------

    This also works with space-delimited fields:

    chr5 56765,56766

    -> python scripts/chr_pos_to_genome_pos.py -c 1:2

    genome 881683465,881683466
""")

    parser.add_argument('-a', '--assembly', default='hg19')
    parser.add_argument('-s', '--chromsizes-file', default=None)
    parser.add_argument('-n', '--new-chrom', default=None)
    parser.add_argument(
        '-c', '--columns', default='1,2',
        help="Which columns to translate to genome positions. "
             "Column pairs should be 1-based and separated by colons")

    args = parser.parse_args()

    if args.chromsizes_file is not None:
        chrom_info = nc.get_chrominfo_from_file(args.chromsizes_file)
    else:
        chrom_info = nc.get_chrominfo(args.assembly)

    for line in sys.stdin:
        try:
            line_output = []
            line_parts = line.strip().split()
            translated_positions = {}
            translated_chroms = {}

            for translate_pair in [[int(y) for y in x.split(':')]
                                   for x in args.columns.split(',')]:
                # go through the pairs of columns that need to be translated
                # to genome positions; assume that the position column is a
                # comma-separated list of values (although it doesn't
                # actually need to be)
                chrom = line_parts[translate_pair[0] - 1]
                poss = line_parts[translate_pair[1] - 1].strip(",").split(',')
                genome_pos = ",".join(
                    map(str, [
                        nc.chr_pos_to_genome_pos(chrom, int(pos), chrom_info)
                        for pos in poss
                    ]))

                # note that we've translated these columns and shouldn't
                # include them in the output as-is
                translated_positions[translate_pair[1] - 1] = genome_pos
                translated_chroms[translate_pair[0] - 1] = chrom

            for i, part in enumerate(line_parts):
                if i in translated_chroms:
                    # replace chromosome identifiers (e.g. 'chr1') with
                    # 'genome' to indicate genome-wide positions
                    if args.new_chrom is None:
                        line_output += ['genome({})'.format(chrom)]
                    else:
                        line_output += [args.new_chrom]
                elif i in translated_positions:
                    # this column used to contain a position, so replace it
                    # with the translated position
                    line_output += [translated_positions[i]]
                else:
                    # this column didn't contain a translated position;
                    # output it as-is
                    line_output += [part]

            try:
                print("\t".join(map(str, line_output)))
            except BrokenPipeError:
                # output is probably being piped through "head" or similar
                break
        except KeyError as ke:
            print("KeyError:", ke, line.strip(), file=sys.stderr)
def END_ABS(self, CHROM, END):
    chrom_info = nc.get_chrominfo("hg38")
    return nc.chr_pos_to_genome_pos("chr" + CHROM, END, chrom_info)
def START_ABS(self, CHROM, START):
    chrom_info = nc.get_chrominfo("hg38")
    return nc.chr_pos_to_genome_pos("chr" + CHROM, START, chrom_info)
def bigwigs_to_multivec(input_bigwig_files, input_metadata_files, output_file,
                        starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Zip the input to create (bw, metadata) tuples
    zipped_input = zip(input_bigwig_files, input_metadata_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name, chr_shape,
                                                   dtype="f4",
                                                   fillvalue=np.nan,
                                                   compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), num_samples)
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len,
                                    chr_shape[0], summary="sum")
                    resolutions_group[str(resolution)]["values"][chr_name][:, bw_index] = arr
        else:
            print(f"{bw_file} is not a bigwig file")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("max memory usage:", max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for metadata_index, metadata_file in enumerate(input_metadata_files):
        with open(metadata_file) as mf:
            try:
                metadata_json = json.load(mf)
            except Exception as e:
                print(f"Error loading metadata file: {metadata_file}")
                print(e)
                metadata_json = None
        row_info = metadata_json_to_row_info(metadata_json)
        row_infos.append(row_info)

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
def bigwigs_to_multivec(input_bigwig_files, output_file, starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder(GENOME_BUILD)
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    # chromosome lengths come from the same assembly as the chromosome order
    chroms_length_arr = np.array(
        [nc.get_chrominfo(GENOME_BUILD).chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    num_zoom_levels = math.floor(
        math.log2(GENOME_LENGTH / starting_resolution))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x)
                   for x in range(num_zoom_levels)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name, chr_shape,
                                                   dtype="f4",
                                                   fillvalue=np.nan,
                                                   compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in enumerate(input_bigwig_files):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    num_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len, num_bins,
                                    summary="sum")
                    resolutions_group[str(resolution)]["values"][chr_name][:, bw_index] = arr
        else:
            print(f"{bw_file} is not a bigwig file")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("max memory usage:", max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for input_bigwig_file in input_bigwig_files:
        _, filename = os.path.split(input_bigwig_file)
        name, _ = os.path.splitext(filename)
        row_infos.append({'id': name})

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
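# Usage sketch (hypothetical path, not part of the original source): read
# one resolution back out of a multivec file written by either variant of
# bigwigs_to_multivec above. The layout is resolutions/<res>/values/<chrom>
# with shape (num_bins, num_samples).
def _demo_read_multivec(path='/tmp/samples.multivec', resolution=5000):
    import json
    import h5py
    with h5py.File(path, 'r') as f:
        row_infos = json.loads(f['info']['row_infos'][()])
        chr1 = f['resolutions'][str(resolution)]['values']['chr1']
        print(len(row_infos), 'samples;', 'chr1 bins:', chr1.shape[0])
        print('first bin, all samples:', chr1[0, :])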
def main():
    """
    python make_tiles.py input_file

    Create tiles for all of the entries in the JSON file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--min-pos", help="The minimum range for the tiling")
    parser.add_argument("--max-pos", help="The maximum range for the tiling")
    parser.add_argument("--assembly", default=None)
    parser.add_argument("-r", "--resolution",
                        help="The resolution of the data",
                        default=None, type=int)
    parser.add_argument(
        "-k", "--position-cols",
        help="The position columns (defaults to all but the last, 1-based)",
        default=None)
    parser.add_argument(
        "-v", "--value-pos",
        help="The value column (defaults to the last one, 1-based)",
        default=None, type=str)
    parser.add_argument("-z", "--max-zoom", help="The maximum zoom value",
                        default=None, type=int)
    parser.add_argument("--expand-range", help="Expand ranges of values")
    parser.add_argument("--ignore-0",
                        help="Ignore ranges with a zero value",
                        default=False, action="store_true")
    parser.add_argument(
        "-b", "--bins-per-dimension", default=1,
        help="The number of bins to consider in each dimension", type=int)
    parser.add_argument(
        "-e", "--elasticsearch-url", default=None,
        help="The url of the elasticsearch database where to save the tiles")
    parser.add_argument(
        "-f", "--columnfile-path", default=None,
        help="The path to the column file where to save the tiles")
    parser.add_argument("-n", "--num-threads", default=4, type=int)
    parser.add_argument("--triangular", default=False, action="store_true")
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--max-queue-size", default=40000, type=int)
    parser.add_argument("--print-status", default=None, type=int)

    args = parser.parse_args()

    if args.resolution is None and args.max_zoom is None:
        print("One of --resolution and --max-zoom must be set",
              file=sys.stderr)
        sys.exit(1)

    first_line = sys.stdin.readline()
    first_line_parts = first_line.strip().split()
    if len(first_line_parts) == 0:
        print("ERROR: no input")
        return

    if args.position_cols is not None:
        position_cols = list(map(int, args.position_cols.split(",")))
    else:
        position_cols = None

    # if specific position columns aren't specified, use all but the last column
    if position_cols is None:
        position_cols = list(range(1, len(first_line_parts)))

    if args.assembly is not None:
        mins = [1 for p in position_cols]
        maxs = [
            nc.get_chrominfo(args.assembly).total_length
            for p in position_cols
        ]
    else:
        mins = [float(p) for p in args.min_pos.split(",")]
        maxs = [float(p) for p in args.max_pos.split(",")]

    max_width = max([b - a for (a, b) in zip(mins, maxs)])

    if args.expand_range is not None:
        expand_range = list(map(int, args.expand_range.split(",")))
    else:
        expand_range = None

    if args.max_zoom is None:
        # determine the maximum zoom level based on the domain of the data
        # and the resolution
        bins_to_display_at_max_resolution = (
            max_width // args.resolution // args.bins_per_dimension)
        max_max_zoom = math.ceil(
            math.log(bins_to_display_at_max_resolution) / math.log(2.0))

        if max_max_zoom < 0:
            max_max_zoom = 0

        max_zoom = int(max_max_zoom)
    else:
        max_zoom = args.max_zoom

    max_width = args.resolution * args.bins_per_dimension * 2**max_zoom

    value_pos = args.value_pos

    # if no column is designated as the value column, use the last column
    if value_pos is None:
        value_pos = [len(first_line_parts) - 1]
    else:
        value_pos = [int(vp) - 1 for vp in value_pos.split(",")]

    max_data_in_sparse = args.bins_per_dimension**len(position_cols) // 10

    print("maxs:", maxs, "max_zoom:", max_zoom,
          "max_data_in_sparse:", max_data_in_sparse,
          "url:", args.elasticsearch_url)

    q = mpr.Queue(maxsize=args.max_queue_size)

    tilesaver_processes = []
    finished = mpr.Value("b", False)

    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.elasticsearch_url,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos])
    else:
        tile_saver = cst.ColumnFileTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.columnfile_path,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos])

    for i in range(args.num_threads):
        p = mpr.Process(target=cst.tile_saver_worker,
                        args=(q, tile_saver, finished))
        p.daemon = True
        p.start()
        tilesaver_processes += [(tile_saver, p)]

    tileset_info = {
        "max_value": [0 for vp in value_pos],
        "min_value": [0 for vp in value_pos],
        "min_pos": mins,
        "max_pos": maxs,
        "max_zoom": max_zoom,
        "bins_per_dimension": args.bins_per_dimension,
        "max_width": max_width,
    }

    tile_saver.save_tile({"tile_id": "tileset_info",
                          "tile_value": tileset_info})
    tile_saver.flush()

    try:
        tileset_info = create_tiles(
            q,
            [first_line],
            sys.stdin,
            position_cols,
            value_pos,
            max_zoom,
            args.bins_per_dimension,
            tile_saver,
            expand_range,
            args.ignore_0,
            tileset_info,
            max_width,
            args.triangular,
            args.max_queue_size,
            print_status=args.print_status)
    except KeyboardInterrupt:
        for (ts, p) in tilesaver_processes:
            ts.flush()
            p.terminate()
            p.join()
        raise

    finished.value = True

    # wait for the worker processes to finish
    for (ts, p) in tilesaver_processes:
        p.join()

    print("tileset_info:", tileset_info)
    tile_saver.save_tile({"tile_id": "tileset_info",
                          "tile_value": tileset_info})
    tile_saver.flush()
def main():
    """
    python make_tiles.py input_file

    Create tiles for all of the entries in the JSON file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('input_file')
    parser.add_argument('-b', '--bins-per-dimension',
                        help='The number of bins to divide the data into',
                        default=1, type=int)
    parser.add_argument('--use-spark', default=False, action='store_true',
                        help='Use spark to distribute the workload')
    parser.add_argument(
        '-r', '--resolution',
        help='The resolution of the data (applies only to matrix data)',
        type=int)
    parser.add_argument('--importance', action='store_true',
                        help='Create tiles by importance')
    parser.add_argument(
        '-i', '--importance-field', dest='importance_field',
        default='importance_field',
        help='The field in each JSON entry that indicates how important '
             'that entry is', type=str)
    parser.add_argument(
        '-v', '--value', dest='value_field', default='count',
        help='The field that has the value of each point. '
             'Used for aggregation and display')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-p', '--position', dest='position',
                       default='position',
                       help='Where this entry would be placed on the x axis',
                       type=str)
    group.add_argument('-s', '--sort-by', default=None,
                       help='Sort by a field and use as the position')

    parser.add_argument(
        '--end-position', default=None,
        help='Use a field to indicate the end of a particular element so '
             'that it appears in all tiles that intersect it')
    parser.add_argument(
        '-e', '--max-entries-per-tile', dest='max_entries_per_tile',
        default=15,
        help='The maximum number of entries that can be displayed on a '
             'single tile', type=int)
    parser.add_argument('-c', '--column-names', dest='column_names',
                        default=None)
    parser.add_argument('-m', '--max-zoom', dest='max_zoom',
                        help='The maximum zoom level', type=int,
                        required=True)
    parser.add_argument('--min-pos', dest='min_pos', default=None,
                        help='The minimum x position', type=float)
    parser.add_argument('--max-pos', dest='max_pos', default=None,
                        help='The maximum x position', type=float)
    parser.add_argument('--assembly', default=None)
    parser.add_argument(
        '--min-value',
        help='The field which will be used to determine the minimum value '
             'for any data point', default='min_y')
    parser.add_argument(
        '--max-value',
        help='The field which will be used to determine the maximum value '
             'for any data point', default='max_y')
    parser.add_argument(
        '--range',
        help='Use two columns to create a range (i.e. pos1,pos2)',
        default=None)
    parser.add_argument(
        '--range-except-0',
        help="Don't expand rows which have values less than 0",
        default=None)
    parser.add_argument('--gzip',
                        help='Compress the output JSON files using gzip',
                        action='store_true')
    parser.add_argument(
        '--output-format',
        help='The format for the output matrix, can be either "dense" or '
             '"sparse"', default='sparse')
    parser.add_argument('--add-uuid', help='Add a uuid to each element',
                        action='store_true', default=False)
    parser.add_argument('--reverse-importance',
                        help='Reverse the ordering of the importance',
                        action='store_true', default=False)

    output_group = parser.add_mutually_exclusive_group(required=True)
    output_group.add_argument(
        '--elasticsearch-path',
        help='Send the output to an elasticsearch instance', default=None)
    output_group.add_argument('-o', '--output-dir',
                              help='The directory to place the tiles',
                              default=None)

    parser.add_argument(
        '--delimiter',
        help='The delimiter separating the different columns in the input '
             'files', default=None)
    parser.add_argument(
        '--elasticsearch-nodes',
        help='Specify elasticsearch nodes to push the completions to',
        default=None)
    parser.add_argument('--elasticsearch-index',
                        help='The index to place the results in',
                        default='test')
    parser.add_argument('--elasticsearch-doctype',
                        help='The type of document to index',
                        default='autocomplete')
    parser.add_argument('--print-status', action='store_true',
                        help='Print status messages')

    args = parser.parse_args()

    if not args.importance:
        if args.output_format not in ['sparse', 'dense']:
            print('ERROR: The output format must be one of "dense" or "sparse"',
                  file=sys.stderr)

    dim_names = args.position.split(',')
    position_cols = dim_names

    sc = None
    if args.use_spark:
        from pyspark import SparkContext
        sc = SparkContext()
    else:
        sys.stderr.write("setting sc:")
        sc = cfp.FakeSparkContext

    if args.column_names is not None:
        args.column_names = args.column_names.split(',')

    if args.assembly is not None:
        mins = [1 for p in position_cols]
        maxs = [
            nc.get_chrominfo(args.assembly).total_length
            for p in position_cols
        ]
    else:
        mins = [float(p) for p in args.min_pos.split(',')]
        maxs = [float(p) for p in args.max_pos.split(',')]

    max_width = max([b - a for (a, b) in zip(mins, maxs)])

    print("start time:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    entries = cti.load_entries_from_file(
        sc, args.input_file, args.column_names,
        delimiter=args.delimiter,
        elasticsearch_path=args.elasticsearch_path)
    print("load entries time:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    if args.range is not None:
        # if a pair of columns specifies a range of values, then create
        # multiple entries for each value within that range (e.g. bed files)
        range_cols = args.range.split(',')
        entries = entries.flatMap(lambda x: cti.expand_range(
            x, *range_cols, range_except_0=args.range_except_0))

    if args.importance:
        # Data will be aggregated by importance. Only more "important"
        # pieces of information will be passed on to the lower resolution
        # tiles if they are too crowded.
        tileset = cti.make_tiles_by_importance(
            sc, entries,
            dim_names=args.position.split(','),
            end_dim_names=(args.end_position.split(',')
                           if args.end_position is not None else None),
            max_zoom=args.max_zoom,
            importance_field=args.importance_field,
            output_dir=args.output_dir,
            max_entries_per_tile=args.max_entries_per_tile,
            gzip_output=args.gzip,
            add_uuid=args.add_uuid,
            reverse_importance=args.reverse_importance,
            adapt_zoom=False,
            mins=mins, maxs=maxs)
    else:
        # Data will be aggregated by binning. This means that two adjacent
        # bins should be able to be reduced into one using some function
        # (i.e. 'sum', 'min', 'max')
        tileset = cti.make_tiles_by_binning(
            sc, entries,
            args.position.split(','),
            args.max_zoom,
            args.value_field,
            args.importance_field,
            bins_per_dimension=args.bins_per_dimension,
            resolution=args.resolution)

    all_tiles = tileset['tiles']

    if args.elasticsearch_nodes is not None:
        # save the tiles to an elasticsearch database
        save_tile_to_elasticsearch = ft.partial(
            cst.save_tile_to_elasticsearch,
            elasticsearch_nodes=args.elasticsearch_nodes,
            elasticsearch_path=args.elasticsearch_path,
            print_status=args.print_status)

        (all_tiles.map(lambda x: {
            "tile_id": ".".join(map(str, x[0])),
            "tile_value": x[1]
        }).foreachPartition(save_tile_to_elasticsearch))

        dataset_info = cdd.describe_dataset(sys.argv, args)

        print("saving tileset_info to:", args.elasticsearch_path)
        (sc.parallelize([{
            "tile_value": tileset['tileset_info'],
            "tile_id": "tileset_info"
        }]).foreachPartition(save_tile_to_elasticsearch))
        (sc.parallelize([{
            "tile_value": dataset_info,
            "tile_id": "dataset_info"
        }]).foreachPartition(save_tile_to_elasticsearch))

        if 'histogram' in tileset:
            histogram_rdd = sc.parallelize([{
                "tile_value": tileset['histogram'],
                "tile_id": "histogram"
            }])
            histogram_rdd.foreachPartition(save_tile_to_elasticsearch)
    else:
        # dump tiles to a directory structure
        all_tiles.foreach(
            ft.partial(cst.save_tile,
                       output_dir=args.output_dir,
                       gzip_output=args.gzip))

        dataset_info = cdd.describe_dataset(sys.argv, args)

        with open(op.join(args.output_dir, 'dataset_info'), 'w') as f:
            json.dump({
                "_source": {
                    "tile_id": "dataset_info",
                    "tile_value": dataset_info
                }
            }, f, indent=2)

        with open(op.join(args.output_dir, 'tileset_info'), 'w') as f:
            json.dump({
                "_source": {
                    "tile_id": "tileset_info",
                    "tile_value": tileset['tileset_info']
                }
            }, f, indent=2)

        if 'histogram' in tileset:
            with open(op.join(args.output_dir, 'value_histogram'), 'w') as f:
                json.dump({
                    "_source": {
                        "tile_id": "histogram",
                        "tile_value": tileset['histogram']
                    }
                }, f, indent=2)
def POS_ABS(self, CHROM, POS):
    chrom_info = nc.get_chrominfo('hg38')
    return nc.chr_pos_to_genome_pos('chr' + CHROM, POS, chrom_info)
def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution,
                    name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)

        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution), shape=chr_shape,
                                     dtype="f4", fill_value=np.nan,
                                     compressor=compressor)

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len,
                                    chr_shape[1], summary="sum")
                    chromosomes_group[chr_name][str(resolution)][bw_index, :] = arr
        else:
            print(f"{bw_file} is not a bigwig file")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("max memory usage:", max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file)
        })

    # f.attrs should contain all tileset_info properties.
    # For zarr, more attributes are used here to allow "serverless" usage.
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [{
        "version": "0.1",
        "name": chr_name,
        "datasets": [{
            "path": f"chromosomes/{chr_name}/{resolution}"
        } for resolution in sorted(resolutions, reverse=True)],
        "type": "zarr-multivec",
        "metadata": {
            "chromoffset": int(chrom_name_to_cumsum[chr_name]),
            "chromsize": int(chr_len),
        }
    } for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))]
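# Usage sketch (hypothetical path, not part of the original source): open
# the Zarr store written by bigwigs_to_zarr and pull one chromosome at the
# finest resolution. The layout is chromosomes/<chrom>/<resolution> with
# shape (num_samples, num_bins).
def _demo_read_zarr_multivec(path='/tmp/samples.zarr'):
    import zarr
    z = zarr.open(path, mode='r')
    resolutions = z.attrs['resolutions']  # stored sorted high -> low
    finest = str(min(resolutions))
    chr1 = z['chromosomes']['chr1'][finest]
    print('row_infos:', z.attrs['row_infos'][:2])
    print('chr1 @', finest, 'bp/bin:', chr1.shape)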
def _bedgraph(filepath, output_file, assembly, chrom_col, from_pos_col,
              to_pos_col, value_col, has_header, chromosome, tile_size,
              chunk_size, method, nan_value, transform, count_nan,
              closed_interval, chromsizes_filename, zoom_step):
    last_end = 0
    data = []

    if output_file is None:
        output_file = op.splitext(filepath)[0] + '.hitile'

    print("output file:", output_file)

    # overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)

    f = h5py.File(output_file, 'w')

    # get the information about the chromosomes in this assembly
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a.encode('utf-8')
            for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a.encode('utf-8') for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    assembly_size = chrom_info.total_length
    print('assembly_size:', assembly_size)

    # how many values to read in at once while tiling
    chunk_size = tile_size * 2**chunk_size

    dsets = []      # data sets at each zoom level
    nan_dsets = []  # store nan values

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f', compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f', compression='gzip')
        ]
        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')
    print("assembly:", assembly)

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = chrom_order
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = chrom_order
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()

    # are we reading the input from stdin or from a file?
    # (use a separate name so the h5py file handle `f` isn't shadowed)
    if filepath == '-':
        infile = sys.stdin
    else:
        if filepath.endswith('.gz'):
            import gzip
            infile = gzip.open(filepath, 'rt')
        else:
            infile = open(filepath, 'r')

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print("position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}"
              .format(positions[curr_zoom] + 1, percent_progress, curr_time,
                      curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

            dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = curr_chunk
            nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                                 chunk_size] = nan_curr_chunk

            # aggregate and store aggregated values in the next zoom level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    values = []
    nan_values = []

    if has_header:
        infile.readline()

    # the genome position up to which we've filled in values
    curr_genome_pos = 0

    # keep track of the previous value so that we can use it to fill in NaN values
    prev_value = 0

    for line in infile:
        # each line should indicate a chromosome, start position and end position
        parts = line.strip().split()

        start_genome_pos = (chrom_info.cum_chrom_lengths[parts[chrom_col - 1]] +
                            int(parts[from_pos_col - 1]))

        if start_genome_pos - curr_genome_pos > 1:
            # fill the gap since the last seen position with NaNs
            values += [np.nan] * (start_genome_pos - curr_genome_pos - 1)
            nan_values += [1] * (start_genome_pos - curr_genome_pos - 1)

            curr_genome_pos += (start_genome_pos - curr_genome_pos - 1)

        # count how many nan values there are in the dataset
        nan_count = 1 if parts[value_col - 1] == nan_value else 0

        # if the provided values are log2 transformed, we have to un-transform them
        if transform == 'exp2':
            value = (2**float(parts[value_col - 1])
                     if not parts[value_col - 1] == nan_value else np.nan)
        else:
            value = (float(parts[value_col - 1])
                     if not parts[value_col - 1] == nan_value else np.nan)

        # we're going to add as many values as are specified in the bedgraph line
        values_to_add = [value] * (int(parts[to_pos_col - 1]) -
                                   int(parts[from_pos_col - 1]))
        nan_counts_to_add = [nan_count] * (int(parts[to_pos_col - 1]) -
                                           int(parts[from_pos_col - 1]))

        if closed_interval:
            values_to_add += [value]
            nan_counts_to_add += [nan_count]

        values += values_to_add
        nan_values += nan_counts_to_add

        d.attrs['max-position'] = start_genome_pos + len(values_to_add)

        curr_genome_pos += len(values_to_add)

        while len(values) > chunk_size:
            add_values_to_data_buffers(values[:chunk_size],
                                       nan_values[:chunk_size])
            values = values[chunk_size:]
            nan_values = nan_values[chunk_size:]

    add_values_to_data_buffers(values, nan_values)

    # store the remaining data
    while True:
        # whatever is left in the buffer is the final (partial) chunk
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                         chunk_size] = curr_chunk
        nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = nan_curr_chunk

        # aggregate and store aggregated values in the next zoom level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break
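# Clarifying sketch (an assumption about ct.aggregate's behavior, inferred
# from how it is used above, not a copy of its implementation): each
# coarser zoom level is derived from the finer one by collapsing
# 2**zoom_step consecutive bins into a single bin by summation.
def _demo_aggregate(chunk, num_to_agg):
    import numpy as np
    chunk = np.asarray(chunk, dtype=float)
    # pad to a multiple of num_to_agg so the reshape is valid (hypothetical
    # edge handling; the real ct.aggregate may treat remainders differently)
    pad = (-len(chunk)) % num_to_agg
    padded = np.concatenate([chunk, np.full(pad, np.nan)])
    return np.nansum(padded.reshape(-1, num_to_agg), axis=1)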
def _bedpe(filepath, output_file, assembly, importance_column, has_header,
           max_per_tile, tile_size, max_zoom=None, chromosome=None,
           chr1_col=0, from1_col=1, to1_col=2,
           chr2_col=3, from2_col=4, to2_col=5):
    print('output_file:', output_file)

    if filepath.endswith('.gz'):
        f = gzip.open(filepath, 'rt')
    else:
        f = open(filepath, 'r')

    if output_file is None:
        output_file = filepath + ".multires.db"

    if op.exists(output_file):
        os.remove(output_file)

    def line_to_dict(line):
        parts = line.split()
        d = {}

        try:
            d['xs'] = [
                nc.chr_pos_to_genome_pos(parts[chr1_col],
                                         int(parts[from1_col]), assembly),
                nc.chr_pos_to_genome_pos(parts[chr1_col],
                                         int(parts[to1_col]), assembly)
            ]
            d['ys'] = [
                nc.chr_pos_to_genome_pos(parts[chr2_col],
                                         int(parts[from2_col]), assembly),
                nc.chr_pos_to_genome_pos(parts[chr2_col],
                                         int(parts[to2_col]), assembly)
            ]
        except KeyError:
            error_str = (
                "ERROR converting chromosome position to genome position. "
                "Please make sure you've specified the correct assembly "
                "using the --assembly option. "
                "Current assembly: {}, chromosomes: {},{}".format(
                    assembly, parts[chr1_col], parts[chr2_col]))
            raise KeyError(error_str)

        d['uid'] = slugid.nice().decode('utf-8')
        d['chrOffset'] = d['xs'][0] - int(parts[from1_col])

        if importance_column is None:
            d['importance'] = max(d['xs'][1] - d['xs'][0],
                                  d['ys'][1] - d['ys'][0])
        elif importance_column == 'random':
            d['importance'] = random.random()
        else:
            # the importance comes from a 1-based column index
            d['importance'] = float(parts[int(importance_column) - 1])

        d['fields'] = line

        return d

    entries = []

    if has_header:
        f.readline()
    else:
        first_line = f.readline().strip()
        try:
            parts = first_line.split()

            # make sure the position columns actually contain numbers
            pos = int(parts[from1_col])
            pos = int(parts[to1_col])
            pos = int(parts[from2_col])
            pos = int(parts[to2_col])
        except ValueError:
            error_str = (
                "Couldn't convert one of the bedpe coordinates to an "
                "integer. If the input file contains a header, make sure "
                "to indicate that with the --has-header option. "
                "Line: {}".format(first_line))
            raise ValueError(error_str)
        entries = [line_to_dict(first_line)]

    entries += [line_to_dict(line.strip()) for line in f]

    # we need chromosome information as well as the assembly size to
    # properly tile this data
    chrom_info = nc.get_chrominfo(assembly)
    assembly_size = chrom_info.total_length + 1

    # a user-supplied max_zoom only caps the viewable zoom level; the full
    # zoom depth is derived from the assembly size and tile size
    user_max_zoom = max_zoom
    max_zoom = int(math.ceil(math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(conn, 1,
                    max_length=assembly_size,
                    assembly=assembly,
                    chrom_names=nc.get_chromorder(assembly),
                    chrom_sizes=nc.get_chromsizes(assembly),
                    tile_size=tile_size,
                    max_zoom=max_zoom,
                    max_width=tile_size * 2**max_zoom)

    max_width = tile_size * 2**max_zoom
    uid_to_entry = {}

    c = conn.cursor()
    c.execute('''
        CREATE TABLE intervals
        (
            id int PRIMARY KEY,
            zoomLevel int,
            importance real,
            fromX int,
            toX int,
            fromY int,
            toY int,
            chrOffset int,
            uid text,
            fields text
        )
        ''')

    print("creating rtree")
    c.execute('''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rFromX, rToX,
            rFromY, rToY
        )
        ''')

    max_viewable_zoom = max_zoom
    if user_max_zoom is not None and user_max_zoom < max_zoom:
        max_viewable_zoom = user_max_zoom

    tile_counts = col.defaultdict(
        lambda: col.defaultdict(lambda: col.defaultdict(int)))
    entries = sorted(entries, key=lambda x: -x['importance'])

    counter = 0
    for d in entries:
        curr_zoom = 0

        while curr_zoom <= max_viewable_zoom:
            tile_width = tile_size * 2**(max_zoom - curr_zoom)
            tile_from = list(
                map(lambda x: x / tile_width, [d['xs'][0], d['ys'][0]]))
            tile_to = list(
                map(lambda x: x / tile_width, [d['xs'][1], d['ys'][1]]))

            empty_tiles = True

            # go through and check if any of the tiles at this zoom level are full
            for i in range(int(tile_from[0]), int(tile_to[0]) + 1):
                if not empty_tiles:
                    break

                for j in range(int(tile_from[1]), int(tile_to[1]) + 1):
                    if tile_counts[curr_zoom][i][j] > max_per_tile:
                        empty_tiles = False
                        break

            if empty_tiles:
                # they all have space, so add this interval at this zoom level
                for i in range(int(tile_from[0]), int(tile_to[0]) + 1):
                    for j in range(int(tile_from[1]), int(tile_to[1]) + 1):
                        tile_counts[curr_zoom][i][j] += 1

                exec_statement = 'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)'
                c.execute(
                    exec_statement,
                    (counter, curr_zoom, d['importance'],
                     d['xs'][0], d['xs'][1],
                     d['ys'][0], d['ys'][1],
                     d['chrOffset'], d['uid'], d['fields']))
                conn.commit()

                exec_statement = 'INSERT INTO position_index VALUES (?,?,?,?,?)'
                c.execute(
                    exec_statement,
                    # the counter also serves as the rtree primary key
                    (counter, d['xs'][0], d['xs'][1], d['ys'][0], d['ys'][1]))
                conn.commit()

                counter += 1
                break

            curr_zoom += 1

    return
def test_clodius_aggregate_bedgraph1():
    input_file = op.join(testdir, 'sample_data', 'dm3_values.tsv')
    output_file = '/tmp/dm3_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph,
        [input_file, '--output-file', output_file, '--assembly', 'dm3'])

    a, b, tb = result.exc_info

    f = h5py.File('/tmp/dm3_values.hitile', 'r')

    values = f['values_0']

    # genome positions are 0-based as stored in hitile files
    assert np.isnan(values[8])
    assert values[9] == 1
    assert values[10] == 1
    assert values[13] == 1
    assert np.isnan(values[14])
    assert np.isnan(values[15])

    chrom_info = nc.get_chrominfo('dm3')
    chr_2r_pos = nc.chr_pos_to_genome_pos('chr2R', 0, chrom_info)

    assert np.isnan(values[chr_2r_pos + 28])
    assert values[chr_2r_pos + 29] == 77
    assert values[chr_2r_pos + 38] == 77
    assert values[chr_2r_pos + 39] == 0

    assert result.exit_code == 0

    d = cht.get_data(f, 0, 0)
    assert np.nansum(d) > 1.0 and np.nansum(d) < 10.0

    return

    # the remainder exercises the three-chromosome test assembly but is
    # unreachable because of the early return above
    input_file = op.join(testdir, 'sample_data', 'test3chroms_values.tsv')
    output_file = '/tmp/test3chroms_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(cca.bedgraph, [
        input_file, '--output-file', output_file, '--assembly', 'test3chroms'
    ])

    f = h5py.File('/tmp/test3chroms_values.hitile', 'r')

    values = f['values_0']

    # genome positions are 0-based as stored in hitile files
    assert values[8] == 0
    assert values[9] == 1
    assert values[10] == 1
    assert values[13] == 1
    assert values[14] == 0
    assert values[15] == 0

    chr2_pos = nc.chr_pos_to_genome_pos('chr2', 0, 'test3chroms')

    assert values[chr2_pos + 28] == 0
    assert values[chr2_pos + 29] == 77
    assert values[chr2_pos + 38] == 77
    assert values[chr2_pos + 39] == 0

    assert result.exit_code == 0

    d = cht.get_data(f, 0, 0)
    assert sum(d) == 770 + 880 + 5
def __init__(self, f, profile_paths, assembly='hg38',
             starting_resolution=5000, name="Genomic Profiles"):
    """
    Constructor method

    :param f: The opened Zarr store object.
    :type f: zarr.Group
    :param list[list[str]] profile_paths: A list of cell set paths,
        one path for each profile.
    :param str assembly: The genome assembly to use for chromosome
        lengths, passed to negspy. By default, 'hg38'.
    :param int starting_resolution: The starting resolution.
        By default, 5000.
    :param str name: The name for this set of profiles.
        By default, 'Genomic Profiles'.
    """
    self.f = f
    num_profiles = len(profile_paths)
    compressor = 'default'

    # TODO: should more than chr1-chrM be used?
    chromosomes = [
        str(chr_name) for chr_name in nc.get_chromorder(assembly)[:25]
    ]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo(assembly).chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions datasets.
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    chromosomes_group = f.create_group("chromosomes")
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)
        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_profiles, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution), shape=chr_shape,
                                     dtype="f4", fill_value=np.nan,
                                     compressor=compressor)

    # f.attrs should contain the properties required for HiGlass's
    # "tileset_info" requests.
    f.attrs['row_infos'] = [{
        "path": profile_path
    } for profile_path in profile_paths]
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_profiles, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = assembly

    self.resolutions = resolutions
    self.chromosomes = chromosomes
    self.chromosomes_group = chromosomes_group
    self.chrom_name_to_length = chrom_name_to_length
    self.num_profiles = num_profiles

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [{
        "version": "0.1",
        "name": chr_name,
        "datasets": [{
            "path": f"chromosomes/{chr_name}/{resolution}"
        } for resolution in sorted(resolutions, reverse=True)],
        "type": "zarr-multivec",
        "metadata": {
            "chromoffset": int(chrom_name_to_cumsum[chr_name]),
            "chromsize": int(chr_len),
        }
    } for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))]
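# Usage sketch (hypothetical: assumes the enclosing class is named
# GenomicProfiles, which the excerpt above does not show). It constructs
# the writer on a fresh Zarr root group and fills one profile's chr1
# values at the starting resolution through the layout __init__ creates.
def _demo_genomic_profiles():
    import math
    import numpy as np
    import zarr
    f = zarr.open('/tmp/profiles.zarr', mode='w')
    gp = GenomicProfiles(f, profile_paths=[['cluster_1'], ['cluster_2']],
                         assembly='hg38', starting_resolution=5000)
    # write zeros for profile 0 on chr1 at 5000 bp/bin
    chr1_len = gp.chrom_name_to_length['chr1']
    num_bins = math.ceil(chr1_len / 5000)
    f['chromosomes']['chr1']['5000'][0, :] = np.zeros((num_bins,), dtype='f4')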