def ingest_single_threaded(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)
    overwrite = args.overwrite
    tile_size = args.tile_size
    attribute_config = args.attribute_config
    updating = False
    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")
    # Check whether the tiledb group exists, and if not, create it.
    # (String comparison must use !=, not `is not`.)
    if tiledb.object_type(args.tiledb_group) != 'group':
        group_uri = tiledb.group_create(args.tiledb_group)
        print("created tiledb group")
    else:
        group_uri = args.tiledb_group
        print("tiledb group already exists")
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        # Read in filenames for bigwigs.
        data_dict = open_data_for_parsing(task_row, attribute_info)
        array_outf_prefix = "/".join([args.tiledb_group, dataset])
        for chrom_index, chrom_row in chrom_sizes.iterrows():
            chrom = chrom_row[0]
            size = chrom_row[1]
            array_out_name = '.'.join([array_outf_prefix, chrom])
            if tiledb.object_type(array_out_name) == "array":
                if overwrite is False:
                    raise Exception("array: " + str(array_out_name) +
                                    " already exists; use the --overwrite flag to overwrite it. Exiting")
                print("warning: the array: " + str(array_out_name) +
                      " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
                updating = True
            else:
                # Create the array.
                create_new_array(size=size,
                                 attribute_config=attribute_config,
                                 array_out_name=array_out_name,
                                 tile_size=tile_size)
                print("created new array: " + str(array_out_name))
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            process_chrom(data_dict, attribute_info, chrom, size,
                          array_out_name, updating, args)
            print("wrote chrom array for task: " + str(dataset))
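
# Hypothetical invocation sketch for ingest_single_threaded above. The function
# accepts either an argparse-style namespace or a plain dict (converted via
# args_object_from_args_dict); every value below is illustrative only.
# ingest_single_threaded({
#     'tiledb_metadata': 'tasks.tsv',       # assumed path
#     'chrom_sizes': 'hg38.chrom.sizes',    # assumed path
#     'tiledb_group': 'db/my_group',        # assumed group URI
#     'overwrite': False,
#     'tile_size': 9000,                    # assumed tile size
#     'attribute_config': 'default',        # assumed config name
# })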

def create_tiledb_array(self, n_slots, description):
    array_name = self.sensor_data_path(description['code'])
    if tiledb.object_type(array_name) is not None:
        raise ValueError('duplicate object with path %s' % array_name)
    shape = description['shape']
    assert len(shape) > 0 and n_slots > 0
    # First dimension indexes time slots (TileDB domain bounds are inclusive);
    # the remaining dimensions mirror the sensor's grid shape, one tile each.
    dims = [
        tiledb.Dim(name="delta_t", domain=(0, n_slots), tile=1, dtype=np.int32)
    ]
    dims = dims + [
        tiledb.Dim(name=f"dim{i}", domain=(0, n - 1), tile=n, dtype=np.int32)
        for i, n in enumerate(shape)
    ]
    dom = tiledb.Domain(*dims, ctx=self.tiledb_ctx)
    # One float32 attribute per controlled property.
    attrs = [
        tiledb.Attr(name=aname, dtype=np.float32)
        for aname in description['controlledProperty']
    ]
    schema = tiledb.ArraySchema(domain=dom, sparse=False, attrs=attrs,
                                ctx=self.tiledb_ctx)
    # Create the (empty) array on disk.
    tiledb.DenseArray.create(array_name, schema)
    return array_name
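
# Hypothetical usage sketch for create_tiledb_array above; `storage` stands in
# for an instance of the enclosing class, and the sensor description values are
# illustrative assumptions. TileDB dense writes take a dict keyed by attribute.
# description = {
#     'code': 'sensor-42',                    # assumed sensor code
#     'shape': [4, 4],                        # one 4x4 grid per time slot
#     'controlledProperty': ['temperature'],  # one float32 attribute
# }
# array_name = storage.create_tiledb_array(n_slots=100, description=description)
# with tiledb.DenseArray(array_name, mode='w', ctx=storage.tiledb_ctx) as A:
#     A[0:1, 0:4, 0:4] = {'temperature': np.random.rand(1, 4, 4).astype(np.float32)}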

def isvalid(url):
    """
    Return True if this looks like a valid CXG, False if not.
    Just a quick/cheap test, not to be fully trusted.
    """
    if tiledb.object_type(url, ctx=CxgAdaptor.tiledb_ctx) != "group":
        return False
    if tiledb.object_type(path_join(url, "obs"), ctx=CxgAdaptor.tiledb_ctx) != "array":
        return False
    if tiledb.object_type(path_join(url, "var"), ctx=CxgAdaptor.tiledb_ctx) != "array":
        return False
    if tiledb.object_type(path_join(url, "X"), ctx=CxgAdaptor.tiledb_ctx) != "array":
        return False
    if tiledb.object_type(path_join(url, "emb"), ctx=CxgAdaptor.tiledb_ctx) != "group":
        return False
    return True
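
# Minimal usage sketch for isvalid above: gate opening a CXG before handing the
# URL to the adaptor. The URL and the CxgAdaptor constructor call are
# illustrative assumptions; only isvalid itself is from the source.
url = "s3://my-bucket/datasets/pbmc3k.cxg"  # assumed example URL
if isvalid(url):
    adaptor = CxgAdaptor(url)  # constructor signature assumed
else:
    raise ValueError(f"{url} does not look like a valid CXG")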

def to_tiledb(self, uri: Union[str, PurePath]) -> None:
    uri = URL(uri) if not isinstance(uri, PurePath) else uri
    if tiledb.object_type(str(uri)) != "group":
        tiledb.group_create(str(uri))

    # One dense array holds the trace headers.
    headers_uri = str(uri / "headers")
    if tiledb.object_type(headers_uri) != "array":
        dims = self._get_dims(TRACE_FIELDS_SIZE)
        header_schema = tiledb.ArraySchema(
            domain=tiledb.Domain(*dims),
            sparse=False,
            attrs=[
                tiledb.Attr(f.name, f.dtype, filters=TRACE_FIELD_FILTERS)
                for f in TRACE_FIELDS
            ],
        )
        with self._tiledb_array(headers_uri, header_schema) as tdb:
            self._fill_headers(tdb)

    # A second dense array holds the trace samples, with an extra "samples"
    # dimension tiled to roughly self.tile_size bytes.
    data_uri = str(uri / "data")
    if tiledb.object_type(data_uri) != "array":
        samples = len(self.segy_file.samples)
        sample_dtype = self.segy_file.dtype
        sample_size = sample_dtype.itemsize
        dims = list(self._get_dims(sample_size * samples))
        dims.append(
            tiledb.Dim(
                name="samples",
                domain=(0, samples - 1),
                dtype=dims[0].dtype,
                tile=np.clip(self.tile_size // sample_size, 1, samples),
            ))
        data_schema = tiledb.ArraySchema(
            domain=tiledb.Domain(*dims),
            sparse=False,
            attrs=[
                tiledb.Attr("trace", sample_dtype,
                            filters=(tiledb.LZ4Filter(), ))
            ],
        )
        with self._tiledb_array(data_uri, data_schema) as tdb:
            self._fill_data(tdb)
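
# Sketch of driving to_tiledb above, assuming the enclosing converter class
# (called SegyToTiledb here purely for illustration) wraps an open segyio file
# and exposes tile_size; only the method itself is from the source.
# converter = SegyToTiledb(segy_file=segyio.open("survey.sgy"), tile_size=4 * 1024**2)
# converter.to_tiledb("survey_tdb")  # creates a group with "headers" and "data" arrays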

def inject_config(self, env_config: ConfigEnvironment) -> None:
    super(DefaultScenario, self).inject_config(env_config)
    # Initialize TileDB storage needed for the scenario-specific data.
    exp_tmp_dir = self._env_config.temp_dir(experiment_id=self.experiment_id)
    if self._env_config is not None and exp_tmp_dir is not None:
        abs_path = pathlib.Path(exp_tmp_dir).resolve().joinpath('def_tdb_arrays')
        self.__tiledb_group_name = abs_path.as_uri()
        self.__tiledb_stats_array = abs_path.joinpath('stats').as_uri()
    # Create the TileDB group of arrays used by this scenario.
    tdb_gtype = tiledb.object_type(self.__tiledb_group_name)
    if tdb_gtype is None:  # Group does not exist
        tiledb.group_create(self.__tiledb_group_name)
    elif tdb_gtype == 'array':  # Exists but is an array
        tiledb.remove(self.__tiledb_group_name)  # Remove the array
        tiledb.group_create(self.__tiledb_group_name)  # Create a group instead
    self._clear_arrays()
    self._mdde_result_folder_root = env_config.result_dir(self, get_root=True)

def __init__(
    self,
    uri,
    key=None,
    timestamp=None,
):
    """
    Parameters
    ----------
    uri : str
        Uniform Resource Identifier (URI) for the TileDB array. May be a
        path to a local TileDB array or a URI for a remote resource.
    key : Optional[str]
        If not None, the key for accessing the TileDB array at the
        provided URI.
    timestamp : Optional[int]
        If not None, time in milliseconds to open the array at.
    """
    if tiledb.object_type(uri) != "array":
        raise ValueError(
            f"Unable to read from URI '{uri}'. URI is not a TileDB array.")
    self._uri = uri
    self._key = key
    self._timestamp = timestamp
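
# Usage sketch for the constructor above; the enclosing class name is not shown
# in the source, so TileDBReader is an assumed stand-in. Passing a timestamp
# pins reads to fragments written at or before that time.
# reader = TileDBReader("s3://bucket/my_array", key=None, timestamp=1600000000000)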

def _validate_and_initialize(self):
    """
    Remember, preload_validation() has already been called, so there is no
    need to repeat anything it has done.

    Load the CXG "group" metadata and cache instance values.

    Be very aware of multiple versions of the CXG object.

    CXG versions in the wild:
    * version 0, aka "no version" -- can be detected by the lack of a
      cxg_group_metadata array.
    * version 0.1 -- metadata attached to the cxg_group_metadata array.
      Same as 0, except it adds group metadata.
    """
    # Bind defaults up front so they are defined even for unrecognized
    # version values, which would otherwise raise NameError below.
    cxg_version = None
    title = None
    about = None
    a_type = tiledb.object_type(path_join(self.url, "cxg_group_metadata"),
                                ctx=self.tiledb_ctx)
    if a_type is None:
        # version 0
        cxg_version = "0.0"
    elif a_type == "array":
        # version > 0
        gmd = self.open_array("cxg_group_metadata")
        cxg_version = gmd.meta["cxg_version"]
        if cxg_version == "0.1":
            cxg_properties = json.loads(gmd.meta["cxg_properties"])
            title = cxg_properties.get("title", None)
            about = cxg_properties.get("about", None)
    if cxg_version not in ["0.0", "0.1"]:
        raise DatasetAccessError(f"cxg matrix is not valid: {self.url}")

    self.title = title
    self.about = about
    self.cxg_version = cxg_version

def ingest_single_threaded(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)
    # Config: separate read and write contexts built from the same config.
    tdb_Config = tiledb.Config(tdb_config_params)
    tdb_write_Context = tiledb.Ctx(config=tdb_Config)
    tdb_read_Context = tiledb.Ctx(config=tdb_Config)
    overwrite = args.overwrite
    coord_tile_size = args.coord_tile_size
    task_tile_size = args.task_tile_size
    attribute_config = args.attribute_config
    updating = False
    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    num_tasks = tiledb_metadata.shape[0]
    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")
    chrom_indices, num_indices = transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:" + str(num_indices))
    array_out_name = args.tiledb_group
    if tiledb.object_type(array_out_name) == "array":
        if overwrite is False:
            raise Exception("array: " + str(array_out_name) +
                            " already exists; use the --overwrite flag to overwrite it. Exiting")
        print("warning: the array: " + str(array_out_name) +
              " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
        updating = True
    else:
        # Create the array.
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices, num_tasks),
                         attribute_config=attribute_config,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array: " + str(array_out_name))
    # Store task/chrom bookkeeping as array metadata.
    metadata_dict = {}
    metadata_dict['tasks'] = list(tiledb_metadata['dataset'])
    metadata_dict['chroms'] = list(chrom_indices.keys())
    metadata_dict['sizes'] = [i[2] for i in chrom_indices.values()]
    metadata_dict['offsets'] = [i[0] for i in chrom_indices.values()]
    num_tasks = tiledb_metadata['dataset'].shape[0]
    num_chroms = len(chrom_indices.keys())
    with tiledb.DenseArray(array_out_name, ctx=tdb_write_Context, mode='w') as cur_array:
        cur_array.meta['num_tasks'] = num_tasks
        cur_array.meta['num_chroms'] = num_chroms
        for task_index in range(num_tasks):
            cur_array.meta['_'.join(['task', str(task_index)])] = metadata_dict['tasks'][task_index]
        for chrom_index in range(num_chroms):
            cur_array.meta['_'.join(['chrom', str(chrom_index)])] = metadata_dict['chroms'][chrom_index]
            cur_array.meta['_'.join(['size', str(chrom_index)])] = metadata_dict['sizes'][chrom_index]
            cur_array.meta['_'.join(['offset', str(chrom_index)])] = metadata_dict['offsets'][chrom_index]
    print("created tiledb metadata")
    if updating is True:
        cur_array_toread = tiledb.DenseArray(array_out_name, ctx=tdb_read_Context, mode='r')
    else:
        cur_array_toread = None
    cur_array_towrite = tiledb.DenseArray(array_out_name, ctx=tdb_write_Context, mode='w')
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        print(dataset)
        # Read in filenames for bigwigs.
        data_dict = open_data_for_parsing(task_row, attribute_info)
        for start_chunk_index in range(0, num_indices, args.write_chunk):
            print(str(start_chunk_index) + '/' + str(num_indices))
            # Clamp the chunk end to the total number of indices.
            end_chunk_index = min(num_indices, start_chunk_index + args.write_chunk)
            print("end chunk index:" + str(end_chunk_index))
            # Convert global indices to chrom+pos indices.
            chunk_chrom_coords = transform_indices_to_chrom_coords(
                start_chunk_index, end_chunk_index, chrom_indices)
            print("processing:" + str(chunk_chrom_coords))
            for coord_set in chunk_chrom_coords:
                print("\t" + "coord_set:" + str(coord_set))
                process_chunk(task_index, data_dict, attribute_info, coord_set,
                              updating, args, cur_array_toread, cur_array_towrite)
                print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            print("wrote chrom array for task: " + str(dataset) +
                  " for index: " + str(start_chunk_index))
    print("closing arrays")
    if cur_array_toread is not None:
        cur_array_toread.close()
    cur_array_towrite.close()
    print('done!')

def has_array(self, name):
    a_type = tiledb.object_type(path_join(self.url, name), ctx=self.tiledb_ctx)
    return a_type == "array"
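
# Usage sketch: has_array above backs cheap presence checks on CXG members,
# e.g. guarding optional arrays (the adaptor instance and array name below are
# assumptions, not from the source).
# if adaptor.has_array("emb/umap"):
#     ...  # load the UMAP embedding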

# NOTE: the opening of create_array() was truncated in the original; the
# imports, array_name, and domain below are reconstructed assumptions based on
# the legacy TileDB-Py quickstart this snippet follows (ctx-first call style).
import numpy as np
import tiledb

array_name = "quickstart_sparse"  # assumed; the original name was cut off


def create_array():
    ctx = tiledb.Ctx()
    # Domain must cover the coordinates written below (1..10 on both dims).
    dom = tiledb.Domain(ctx,
                        tiledb.Dim(ctx, name="rows", domain=(1, 10), tile=2, dtype=np.int32),
                        tiledb.Dim(ctx, name="cols", domain=(1, 10), tile=2, dtype=np.int32))
    schema = tiledb.ArraySchema(ctx, domain=dom,
                                sparse=True,
                                attrs=[tiledb.Attr(ctx, name="a", dtype=np.int32)])
    tiledb.SparseArray.create(array_name, schema)


def write_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='w') as A:
        I, J = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        A[I, J] = data


def read_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        data = A[1:11]
        a_vals = data["a"]
        for i, coord in enumerate(data["coords"]):
            print("Cell (%d, %d) has data %d" % (coord[0], coord[1], a_vals[i]))


ctx = tiledb.Ctx()
if tiledb.object_type(ctx, array_name) != "array":
    create_array()
    write_array()
read_array()

def ingest(args):
    pool = None  # bound early so the except blocks can safely terminate it
    try:
        if isinstance(args, dict):
            args = args_object_from_args_dict(args)
        overwrite = args.overwrite
        chrom_threads = args.chrom_threads
        batch_size = args.batch_size
        tile_size = args.tile_size
        attribute_config = args.attribute_config
        updating = False
        attribute_info = get_attribute_info(args.attribute_config)
        tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
        print("loaded tiledb metadata")
        chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
        print("loaded chrom sizes")
        # Check whether the tiledb group exists, and if not, create it.
        # (String comparison must use !=, not `is not`.)
        if tiledb.object_type(args.tiledb_group) != 'group':
            group_uri = tiledb.group_create(args.tiledb_group)
            print("created tiledb group")
        else:
            group_uri = args.tiledb_group
            print("tiledb group already exists")
        for task_index, task_row in tiledb_metadata.iterrows():
            dataset = task_row['dataset']
            # Read in filenames for bigwigs.
            data_dict = open_data_for_parsing(task_row, attribute_info)
            array_outf_prefix = "/".join([args.tiledb_group, dataset])
            pool_inputs = []
            for chrom_index, chrom_row in chrom_sizes.iterrows():
                chrom = chrom_row[0]
                size = chrom_row[1]
                array_out_name = '.'.join([array_outf_prefix, chrom])
                if tiledb.object_type(array_out_name) == "array":
                    if overwrite is False:
                        raise Exception("array: " + str(array_out_name) +
                                        " already exists; use the --overwrite flag to overwrite it. Exiting")
                    print("warning: the array: " + str(array_out_name) +
                          " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
                    updating = True
                else:
                    # Create the array.
                    create_new_array(size=size,
                                     attribute_config=attribute_config,
                                     array_out_name=array_out_name,
                                     tile_size=tile_size)
                    print("created new array: " + str(array_out_name))
                pool_inputs.append((data_dict, attribute_info, chrom, size,
                                    array_out_name, updating, args))
            with Pool(chrom_threads, initializer=init_worker) as pool:
                # with ThreadPool(chrom_threads) as pool:
                print("made pool")
                res = pool.map(process_chrom, pool_inputs)
                pool.close()
                pool.join()
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            print("wrote chrom array for task: " + str(dataset))
    except KeyboardInterrupt:
        print('detected keyboard interrupt')
        # Shut down the pool (if it was created) and kill remaining children.
        if pool is not None:
            pool.terminate()
        kill_child_processes(os.getpid())
        raise
    except Exception as e:
        print(repr(e))
        # Shut down the pool (if it was created) and kill remaining children.
        if pool is not None:
            pool.terminate()
        kill_child_processes(os.getpid())
        raise e

def main():
    # Legacy TileDB-Py API: the Ctx is passed as the first positional argument.
    ctx = tiledb.Ctx()
    print("{!r}".format(tiledb.object_type(ctx, "my_group")))
    print("{!r}".format(tiledb.object_type(ctx, "my_dense_array")))
    print("{!r}".format(tiledb.object_type(ctx, "my_kv")))
    print("{!r}".format(tiledb.object_type(ctx, "invalid_path")))
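
# The same checks against the modern TileDB-Py API, where the context is passed
# as a keyword argument instead of positionally (paths reuse the examples above;
# object_type returns "group", "array", or None for a non-TileDB path).
def main_modern():
    ctx = tiledb.Ctx()
    for path in ("my_group", "my_dense_array", "my_kv", "invalid_path"):
        print("{!r}".format(tiledb.object_type(path, ctx=ctx)))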

def guess_can_open(self, filename_or_obj):
    try:
        return tiledb.object_type(filename_or_obj) == "array"
    except tiledb.TileDBError:
        return False
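
# Context sketch: guess_can_open above follows xarray's BackendEntrypoint
# protocol, so xarray can auto-select this engine when opening a dataset (the
# entrypoint registration and path below are assumptions, not in the source).
# import xarray as xr
# ds = xr.open_dataset("my_tiledb_array")  # engine chosen via guess_can_open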

def ingest(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)
    if args.write_chunk > max_write_chunk:
        print("WARNING: You have specified a write_chunk size of " + str(args.write_chunk) +
              " but the maximum supported with python serialization is: " + str(max_write_chunk) +
              ". It will be reset to " + str(max_write_chunk))
        args.write_chunk = max_write_chunk
    # Create a queue to write the array.
    global write_queue
    write_queue = Queue(maxsize=args.max_queue_size)
    # Config: separate read and write contexts built from the same config.
    tdb_Config = tiledb.Config(tdb_config_params)
    tdb_write_Context = tiledb.Ctx(config=tdb_Config)
    tdb_read_Context = tiledb.Ctx(config=tdb_Config)
    overwrite = args.overwrite
    coord_tile_size = args.coord_tile_size
    task_tile_size = args.task_tile_size
    attribute_config = args.attribute_config
    attribute_config_file = args.attribute_config_file
    updating = False
    attribute_info = get_attribute_info(args.attribute_config, args.attribute_config_file)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    num_tasks = tiledb_metadata.shape[0]
    print("num_tasks:" + str(num_tasks))
    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")
    chrom_indices, num_indices = transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:" + str(num_indices))
    array_out_name = args.array_name
    if tiledb.object_type(array_out_name) == "array":
        if overwrite is False:
            raise Exception("array: " + str(array_out_name) +
                            " already exists; use the --overwrite flag to overwrite it. Exiting")
        print("warning: the array: " + str(array_out_name) +
              " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
        updating = True
    else:
        # Create the array.
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices, num_tasks - 1),
                         attribute_config=attribute_config,
                         attribute_config_file=attribute_config_file,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array: " + str(array_out_name))
    # Store task/chrom bookkeeping as array metadata.
    metadata_dict = {}
    metadata_dict['tasks'] = list(tiledb_metadata['dataset'])
    metadata_dict['chroms'] = list(chrom_indices.keys())
    metadata_dict['sizes'] = [i[2] for i in chrom_indices.values()]
    metadata_dict['offsets'] = [i[0] for i in chrom_indices.values()]
    num_tasks = tiledb_metadata['dataset'].shape[0]
    num_chroms = len(chrom_indices.keys())
    with tiledb.DenseArray(array_out_name, ctx=tdb_write_Context, mode='w') as cur_array:
        cur_array.meta['num_tasks'] = num_tasks
        cur_array.meta['num_chroms'] = num_chroms
        for task_index in range(num_tasks):
            cur_array.meta['_'.join(['task', str(task_index)])] = metadata_dict['tasks'][task_index]
        for chrom_index in range(num_chroms):
            cur_array.meta['_'.join(['chrom', str(chrom_index)])] = metadata_dict['chroms'][chrom_index]
            cur_array.meta['_'.join(['size', str(chrom_index)])] = metadata_dict['sizes'][chrom_index]
            cur_array.meta['_'.join(['offset', str(chrom_index)])] = metadata_dict['offsets'][chrom_index]
    print("created tiledb metadata")
    pool = Pool(processes=args.threads, initializer=init_worker)
    print("made pool")
    pool_inputs = []
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        # Read in filenames for bigwigs.
        data_dict = open_data_for_parsing(task_row, attribute_info)
        for start_chunk_index in range(0, num_indices, args.write_chunk):
            # Clamp the chunk end to the total number of indices.
            end_chunk_index = min(num_indices, start_chunk_index + args.write_chunk)
            # Convert global indices to chrom+pos indices.
            chunk_chrom_coords = transform_indices_to_chrom_coords(
                start_chunk_index, end_chunk_index, chrom_indices)
            if chunk_chrom_coords is None:
                raise Exception("failed to transform indices:" + str(start_chunk_index) +
                                "-" + str(end_chunk_index) + " to chrom coords;" + str(chrom_indices))
            for coord_set in chunk_chrom_coords:
                pool_inputs.append((task_index, data_dict, attribute_info, coord_set, args))
    pool_feed_chunk_start = 0
    pool_feed_chunk_max = len(pool_inputs)
    chunks_to_process = len(pool_inputs)
    array_writer = Process(target=write_array, args=([args, updating, chunks_to_process]))
    try:
        array_writer.start()
    except Exception as e:
        raise e
    try:
        while pool_feed_chunk_start < pool_feed_chunk_max:
            pool_feed_chunk_end = min([pool_feed_chunk_start + queue_feed_chunk_size, pool_feed_chunk_max])
            # Only map more work if the queue size and total memory consumption
            # are within their limits; refresh both inside the loop so the
            # wait can actually terminate.
            write_queue_size = write_queue.qsize()
            mem_used = psutil.virtual_memory().used / (10**9)
            print("mapping to pool, queue size:" + str(write_queue_size))
            print("mapping to pool, mem used:" + str(mem_used))
            while (write_queue_size >= args.max_queue_size) or (mem_used >= args.max_mem_g):
                time.sleep(10)
                write_queue_size = write_queue.qsize()
                mem_used = psutil.virtual_memory().used / (10**9)
            print("sending to pool:" + str(pool_feed_chunk_start) + "-" +
                  str(pool_feed_chunk_end) + "/" + str(chunks_to_process))
            pool.map(process_chunk, pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
            pool_feed_chunk_start += queue_feed_chunk_size
        time.sleep(60)
        pool.close()
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        pool.terminate()
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        raise
    # Wait until we're done writing to the tiledb array.
    array_writer.join()
    print("array_writer.join() is complete")
    print("shutting down pool")
    pool.join()
    print('done!')