import math
import os
import xml.etree.ElementTree as ET

import numpy as np
import rasterio
import tiledb
import xarray as xr
from affine import Affine
from rasterio.dtypes import _gdal_typename


def stack(_input, output, tile_x_size, tile_y_size, config=None, attrs=None, bbox=None):
    with rasterio.open(_input) as src:
        profile = src.profile
        trans = Affine.to_gdal(src.transform)
        dt = np.dtype(src.dtypes[0])  # read first band data type

    # read initial image metadata
    profile['driver'] = 'TileDB'
    profile['blockxsize'] = tile_x_size
    profile['blockysize'] = tile_y_size
    if 'tiled' in profile:
        del profile['tiled']

    arr = xr.open_rasterio(_input, chunks={'x': tile_x_size, 'y': tile_y_size})

    if bbox is None:
        w = profile['width']
        h = profile['height']
        bbox = (0, 0, w, h)
    else:
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]

    nBlocksX = math.ceil(w / (tile_x_size * 1.0))
    nBlocksY = math.ceil(h / (tile_y_size * 1.0))

    # GDAL TileDB driver writes/reads blocks, so bypass rasterio
    dom = tiledb.Domain(
        tiledb.Dim(name='BANDS', domain=(0, profile['count'] - 1), tile=1),
        tiledb.Dim(name='Y', domain=(0, (nBlocksY * tile_y_size) - 1),
                   tile=tile_y_size, dtype=np.uint64),
        tiledb.Dim(name='X', domain=(0, (nBlocksX * tile_x_size) - 1),
                   tile=tile_x_size, dtype=np.uint64))

    cfg = tiledb.Config(config)
    ctx = tiledb.Ctx(config=cfg)
    schema = tiledb.ArraySchema(domain=dom,
                                sparse=False,
                                attrs=[tiledb.Attr(name="TDB_VALUES", dtype=dt)],
                                ctx=ctx)
    tiledb.DenseArray.create(output, schema)

    with tiledb.DenseArray(output, 'w', ctx=ctx) as arr_output:
        # xr.open_rasterio yields dims (band, y, x), so slice y with the
        # bbox y-range and x with the bbox x-range
        arr[:, bbox[1]:bbox[3], bbox[0]:bbox[2]].data.to_tiledb(arr_output, storage_options=config)

    # write the GDAL metadata file from the source profile
    vfs = tiledb.VFS()
    meta_path = f"{output}/{os.path.basename(output)}.tdb.aux.xml"
    f = vfs.open(meta_path, "w")
    try:
        root = ET.Element('PAMDataset')
        geo = ET.SubElement(root, 'GeoTransform')
        geo.text = ', '.join(map(str, trans))
        meta = ET.SubElement(root, 'Metadata')
        meta.set('domain', 'IMAGE_STRUCTURE')
        t = ET.SubElement(meta, 'MDI')
        t.set('key', 'DATA_TYPE')
        t.text = _gdal_typename(dt.name)  # record the source dtype, consistent with NBITS below
        nbits = ET.SubElement(meta, 'MDI')
        nbits.set('key', 'NBITS')
        nbits.text = str(dt.itemsize * 8)
        xsize = ET.SubElement(meta, 'MDI')
        xsize.set('key', 'X_SIZE')
        xsize.text = str(w)
        ysize = ET.SubElement(meta, 'MDI')
        ysize.set('key', 'Y_SIZE')
        ysize.text = str(h)
        vfs.write(f, ET.tostring(root))
    finally:
        vfs.close(f)
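# Usage sketch for stack(): ingest a GeoTIFF into a TileDB dense array tiled
# 1024x1024. The input path and output URI are hypothetical; config is any
# TileDB configuration dict (e.g. S3 settings) and may be omitted.
if __name__ == "__main__":
    stack("input.tif", "stacked_array", tile_x_size=1024, tile_y_size=1024)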
import gc
import os
import signal
import time

import pandas as pd
import psutil
import tiledb
from multiprocessing import Process, Queue, get_context
from multiprocessing.pool import Pool, ThreadPool

# config: loosen consistency checks and raise thread counts for bulk ingestion
tdb_Config = tiledb.Config({
    "sm.check_coord_dups": "false",
    "sm.check_coord_oob": "false",
    "sm.check_global_order": "false",
    "sm.num_writer_threads": "50",
    "sm.num_reader_threads": "50",
    "sm.num_async_threads": "50",
    "vfs.num_threads": "50",
    "sm.memory_budget": "5000000000",
})
tdb_Context = tiledb.Ctx(config=tdb_Config)


# graceful shutdown
def init_worker():
    # pool workers ignore SIGINT so the parent process can handle Ctrl-C
    # and tear the pool down itself
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def kill_child_processes(parent_pid, sig=signal.SIGTERM):
    try:
        parent = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        return
    # assumed completion (the source is truncated here): forward the signal
    # to every child of the parent process, the standard psutil pattern
    for child in parent.children(recursive=True):
        child.send_signal(sig)
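# Minimal sketch of the shutdown pattern these helpers support: workers ignore
# SIGINT via init_worker, and the parent reaps children on Ctrl-C. The worker
# function square() is hypothetical.
def square(x):
    return x * x


if __name__ == "__main__":
    pool = Pool(processes=4, initializer=init_worker)
    try:
        print(pool.map(square, range(10)))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        pool.terminate()
        raise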
import click
import tiledb


def config():
    """
    Output TileDB's default configuration parameters and values.
    """
    click.echo(tiledb.Config())
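# The function above reads like a click subcommand with its decorator stripped;
# a minimal sketch of one way to expose it on the command line (the group name
# `cli` and the decorator wiring are assumptions, not from the source):
@click.group()
def cli():
    """TileDB utility commands."""


cli.add_command(click.command(name="config")(config))

if __name__ == "__main__":
    cli()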
def ingest(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)

    if args.write_chunk > max_write_chunk:
        print("WARNING: You have specified a write_chunk size of " + str(args.write_chunk) +
              " but the maximum supported with python serialization is: " + str(max_write_chunk) +
              ". It will be reset to " + str(max_write_chunk))
        args.write_chunk = max_write_chunk

    # create a queue to write the array
    global write_queue
    write_queue = Queue(maxsize=args.max_queue_size)

    # config
    tdb_Config = tiledb.Config(tdb_config_params)
    tdb_write_Context = tiledb.Ctx(config=tdb_Config)
    tdb_read_Context = tiledb.Ctx(config=tdb_Config)

    overwrite = args.overwrite
    coord_tile_size = args.coord_tile_size
    task_tile_size = args.task_tile_size
    attribute_config = args.attribute_config
    attribute_config_file = args.attribute_config_file
    updating = False

    attribute_info = get_attribute_info(args.attribute_config, args.attribute_config_file)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    num_tasks = tiledb_metadata.shape[0]
    print("num_tasks:" + str(num_tasks))
    print("loaded tiledb metadata")

    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")
    chrom_indices, num_indices = transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:" + str(num_indices))

    array_out_name = args.array_name
    if tiledb.object_type(array_out_name) == "array":
        if not overwrite:
            raise Exception("array:" + str(array_out_name) +
                            " already exists; use the --overwrite flag to overwrite it. Exiting")
        print("warning: the array: " + str(array_out_name) +
              " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
        updating = True
    else:
        # create the array:
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices, num_tasks - 1),
                         attribute_config=attribute_config,
                         attribute_config_file=attribute_config_file,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array:" + str(array_out_name))

    # store task/chrom bookkeeping as array metadata
    metadata_dict = {}
    metadata_dict['tasks'] = [i for i in tiledb_metadata['dataset']]
    metadata_dict['chroms'] = [i for i in chrom_indices.keys()]
    metadata_dict['sizes'] = [i[2] for i in list(chrom_indices.values())]
    metadata_dict['offsets'] = [i[0] for i in list(chrom_indices.values())]

    num_tasks = tiledb_metadata['dataset'].shape[0]
    num_chroms = len(chrom_indices.keys())
    with tiledb.DenseArray(array_out_name, ctx=tdb_write_Context, mode='w') as cur_array:
        cur_array.meta['num_tasks'] = num_tasks
        cur_array.meta['num_chroms'] = num_chroms
        for task_index in range(num_tasks):
            cur_array.meta['_'.join(['task', str(task_index)])] = metadata_dict['tasks'][task_index]
        for chrom_index in range(num_chroms):
            cur_array.meta['_'.join(['chrom', str(chrom_index)])] = metadata_dict['chroms'][chrom_index]
            cur_array.meta['_'.join(['size', str(chrom_index)])] = metadata_dict['sizes'][chrom_index]
            cur_array.meta['_'.join(['offset', str(chrom_index)])] = metadata_dict['offsets'][chrom_index]
    print("created tiledb metadata")

    pool = Pool(processes=args.threads, initializer=init_worker)
    print("made pool")

    pool_inputs = []
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        # read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        for start_chunk_index in range(0, num_indices, args.write_chunk):
            # clamp the final chunk so it does not run past the last index
            end_chunk_index = min(start_chunk_index + args.write_chunk, num_indices)
            # convert global indices to chrom+pos indices
            chunk_chrom_coords = transform_indices_to_chrom_coords(start_chunk_index,
                                                                   end_chunk_index,
                                                                   chrom_indices)
            if chunk_chrom_coords is None:
                raise Exception("failed to transform indices:" + str(start_chunk_index) + "-" +
                                str(end_chunk_index) + " to chrom coords;" + str(chrom_indices))
            for coord_set in chunk_chrom_coords:
                pool_inputs.append((task_index, data_dict, attribute_info, coord_set, args))

    pool_feed_chunk_start = 0
    pool_feed_chunk_max = len(pool_inputs)
    chunks_to_process = len(pool_inputs)
    array_writer = Process(target=write_array, args=(args, updating, chunks_to_process))
    array_writer.start()
    try:
        while pool_feed_chunk_start < pool_feed_chunk_max:
            pool_feed_chunk_end = min(pool_feed_chunk_start + queue_feed_chunk_size,
                                      pool_feed_chunk_max)
            print("mapping to pool, queue size:" + str(write_queue.qsize()))
            print("mapping to pool, mem used:" + str(psutil.virtual_memory().used / (10 ** 9)))
            # only map more work while the queue and total memory use are below
            # their limits; re-read both on every iteration so the wait can end
            while (write_queue.qsize() >= args.max_queue_size) or \
                  (psutil.virtual_memory().used / (10 ** 9) >= args.max_mem_g):
                time.sleep(10)
            print("sending to pool:" + str(pool_feed_chunk_start) + "-" +
                  str(pool_feed_chunk_end) + "/" + str(chunks_to_process))
            pool.map(process_chunk, pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
            pool_feed_chunk_start += queue_feed_chunk_size
        time.sleep(60)
        pool.close()
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        pool.terminate()
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        raise
    # wait until we're done writing to the tiledb array
    array_writer.join()
    print("array_writer.join() is complete")
    print("shutting down pool")
    pool.join()
    print('done!')
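# Sketch: read back the bookkeeping that ingest() stores in array metadata.
# The helper name and the read-only open are mine; the metadata keys
# (num_tasks, task_<i>, chrom_<i>, size_<i>, offset_<i>) come from ingest() above.
def dump_ingest_metadata(array_out_name):
    with tiledb.DenseArray(array_out_name, mode='r') as cur_array:
        print("tasks:", [cur_array.meta['task_' + str(i)]
                         for i in range(cur_array.meta['num_tasks'])])
        for i in range(cur_array.meta['num_chroms']):
            print(cur_array.meta['chrom_' + str(i)],
                  "size:", cur_array.meta['size_' + str(i)],
                  "offset:", cur_array.meta['offset_' + str(i)])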
import os

import numpy as np
import tiledb


class TileHelper(object):
    """ The TileHelper class for convenient tiledb setup """
    # Note: written against the legacy tiledb-py 0.x API, where a Ctx is
    # passed positionally and Attr takes compressor= rather than filters=.
    config = tiledb.Config()
    config["vfs.s3.scheme"] = "https"
    config["vfs.s3.region"] = "us-west-1"
    config["vfs.s3.use_virtual_addressing"] = "true"
    ctx = tiledb.Ctx(config)

    def __init__(self, backend=None, tile_size=1000000, compressor='lz4'):
        if backend is None:
            self.root = os.environ.get('TILEDB_ROOT', os.path.realpath('./tiledb/'))
            if not os.path.isdir(self.root):
                try:
                    os.makedirs(self.root)
                except OSError:
                    print(f"Warning: TileHelper not able to create {self.root} for backend {backend}")
        elif backend == 's3':
            self.root = 's3://sirius-tiledb/'
        self.tile_size = tile_size
        if isinstance(compressor, str):
            self.compressor = (compressor, -1)
        elif isinstance(compressor, tuple):
            self.compressor = compressor

    def create_dense_array(self, arrayID, data):
        assert isinstance(data, np.ndarray), "data should be an np.ndarray"
        tile_dims = []
        for i_dim, dim_size in enumerate(data.shape):
            name = f'd{i_dim}'
            tile = min(self.tile_size, dim_size)
            tiledim = tiledb.Dim(self.ctx, name=name, domain=(0, dim_size - 1), tile=tile)
            tile_dims.append(tiledim)
        domain = tiledb.Domain(self.ctx, *tile_dims)
        attr = tiledb.Attr(self.ctx, compressor=self.compressor, dtype=data.dtype)
        schema = tiledb.ArraySchema(self.ctx, domain=domain, sparse=False, attrs=[attr])
        tile_array_id = os.path.join(self.root, arrayID)
        tiledb.DenseArray.create(tile_array_id, schema)
        dense_array = tiledb.DenseArray(self.ctx, tile_array_id, mode='w')
        dense_array[:] = data
        return dense_array

    def load_dense_array(self, arrayID):
        tile_array_id = os.path.join(self.root, arrayID)
        try:
            return tiledb.DenseArray(self.ctx, tile_array_id)
        except tiledb.TileDBError as e:
            print(e)
            return np.array([])

    def remove(self, arrayID):
        tile_array_id = os.path.join(self.root, arrayID)
        tiledb.remove(self.ctx, tile_array_id)

    def ls(self):
        paths = []
        tiledb.ls(self.ctx, self.root, lambda p, l: paths.append(p))
        if self.root.startswith("s3://"):
            results = [p[len(self.root):-1] for p in paths]
        else:
            results = [os.path.basename(p) for p in paths]
        return results
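# Usage sketch for TileHelper: round-trip a small array through the local
# filesystem backend. The array name "demo_array" is hypothetical, and this
# assumes a tiledb-py release matching the legacy API the class targets.
if __name__ == "__main__":
    helper = TileHelper(backend=None, tile_size=1000)
    data = np.arange(10000, dtype=np.float64).reshape(100, 100)
    helper.create_dense_array("demo_array", data)
    print(helper.ls())  # should include "demo_array"
    loaded = helper.load_dense_array("demo_array")
    helper.remove("demo_array")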
import tiledb


def vacuum_array_metadata(uri):
    """
    Vacuum the already consolidated array metadata in an array located at uri.
    """
    config = tiledb.Config({"sm.vacuum.mode": "array_meta"})
    tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
def vacuum_fragments(uri):
    """
    Vacuum the already consolidated fragments in an array located at uri.
    """
    config = tiledb.Config({"sm.vacuum.mode": "fragments"})
    tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
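# The two helpers above only vacuum; consolidation must happen first. A minimal
# sketch pairing each vacuum mode with the matching consolidation mode (the
# function name and array URI are hypothetical):
def consolidate_and_vacuum(uri):
    # consolidate fragments, then remove the now-redundant originals
    tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.mode": "fragments"}))
    vacuum_fragments(uri)
    # same for array metadata
    tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.mode": "array_meta"}))
    vacuum_array_metadata(uri)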