Example #1
def ingest_single_threaded(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)

    overwrite = args.overwrite
    tile_size = args.tile_size
    attribute_config = args.attribute_config
    updating = False

    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')

    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")

    #check if the tiledb_group exists, and if not, create it
    #(use != rather than `is not`: identity comparison against a string
    #literal is a bug and a SyntaxWarning on Python >= 3.8)
    if tiledb.object_type(args.tiledb_group) != 'group':
        group_uri = tiledb.group_create(args.tiledb_group)
        print("created tiledb group")
    else:
        group_uri = args.tiledb_group
        print("tiledb group already exists")

    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        #read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        array_outf_prefix = "/".join([args.tiledb_group, dataset])

        for chrom_index, chrom_row in chrom_sizes.iterrows():
            chrom = chrom_row[0]
            size = chrom_row[1]
            array_out_name = '.'.join([array_outf_prefix, chrom])
            
            if tiledb.object_type(array_out_name) == "array":
                if not overwrite:
                    raise Exception("array: " + str(array_out_name) + " already exists; use the --overwrite flag to overwrite it. Exiting")
                else:
                    print("warning: the array: " + str(array_out_name) + " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
                    updating = True
            else:
                #create the array:
                create_new_array(size=size,
                                 attribute_config=attribute_config,
                                 array_out_name=array_out_name,
                                 tile_size=tile_size)
                
                print("created new array:"+str(array_out_name))
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            process_chrom(data_dict, attribute_info, chrom, size, array_out_name, updating, args)
            print("wrote chrom array for task:" + str(dataset))
Example #2
 def create_tiledb_array(self, n_slots, description):
     array_name = self.sensor_data_path(description['code'])
     if tiledb.object_type(array_name) is not None:
         raise ValueError('duplicate object with path %s' % array_name)
     shape = description['shape']
     assert len(shape) > 0 and n_slots > 0
     dims = [
         tiledb.Dim(name="delta_t",
                    domain=(0, n_slots),
                    tile=1,
                    dtype=np.int32)
     ]
     dims = dims + [
         tiledb.Dim(
             name=f"dim{i}", domain=(0, n - 1), tile=n, dtype=np.int32)
         for i, n in enumerate(shape)
     ]
     dom = tiledb.Domain(*dims, ctx=self.tiledb_ctx)
     attrs = [
         tiledb.Attr(name=aname, dtype=np.float32)
         for aname in description['controlledProperty']
     ]
     schema = tiledb.ArraySchema(domain=dom,
                                 sparse=False,
                                 attrs=attrs,
                                 ctx=self.tiledb_ctx)
     # Create the (empty) array on disk.
     tiledb.DenseArray.create(array_name, schema)
     return array_name
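
A hypothetical usage sketch for the method above, with a made-up sensor description (a 2x2 grid, one controlled property, 100 time slots); `storage` stands in for whatever object owns create_tiledb_array():

import numpy as np
import tiledb

desc = {'code': 'sensor-001',
        'shape': [2, 2],
        'controlledProperty': ['temperature']}
array_name = storage.create_tiledb_array(n_slots=100, description=desc)

#write one time slot of readings into the new dense array
with tiledb.DenseArray(array_name, mode='w') as A:
    A[0:1, 0:2, 0:2] = {'temperature': np.zeros((1, 2, 2), dtype=np.float32)}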
Example #3
 def isvalid(url):
     """
     Return True if this looks like a valid CXG, False if not.  Just a quick/cheap
     test, not to be fully trusted.
     """
     if not tiledb.object_type(url, ctx=CxgAdaptor.tiledb_ctx) == "group":
         return False
     if not tiledb.object_type(path_join(url, "obs"), ctx=CxgAdaptor.tiledb_ctx) == "array":
         return False
     if not tiledb.object_type(path_join(url, "var"), ctx=CxgAdaptor.tiledb_ctx) == "array":
         return False
     if not tiledb.object_type(path_join(url, "X"), ctx=CxgAdaptor.tiledb_ctx) == "array":
         return False
     if not tiledb.object_type(path_join(url, "emb"), ctx=CxgAdaptor.tiledb_ctx) == "group":
         return False
     return True
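
This works because tiledb.object_type() returns "array", "group", or None (for a nonexistent or unrecognized path), so plain equality checks are enough. For example:

print(tiledb.object_type("/tmp/no_such_object"))  # None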
Example #4
    def to_tiledb(self, uri: Union[str, PurePath]) -> None:
        uri = URL(uri) if not isinstance(uri, PurePath) else uri

        if tiledb.object_type(str(uri)) != "group":
            tiledb.group_create(str(uri))

        headers_uri = str(uri / "headers")
        if tiledb.object_type(headers_uri) != "array":
            dims = self._get_dims(TRACE_FIELDS_SIZE)
            header_schema = tiledb.ArraySchema(
                domain=tiledb.Domain(*dims),
                sparse=False,
                attrs=[
                    tiledb.Attr(f.name, f.dtype, filters=TRACE_FIELD_FILTERS)
                    for f in TRACE_FIELDS
                ],
            )
            with self._tiledb_array(headers_uri, header_schema) as tdb:
                self._fill_headers(tdb)

        data_uri = str(uri / "data")
        if tiledb.object_type(data_uri) != "array":
            samples = len(self.segy_file.samples)
            sample_dtype = self.segy_file.dtype
            sample_size = sample_dtype.itemsize
            dims = list(self._get_dims(sample_size * samples))
            dims.append(
                tiledb.Dim(
                    name="samples",
                    domain=(0, samples - 1),
                    dtype=dims[0].dtype,
                    tile=np.clip(self.tile_size // sample_size, 1, samples),
                ))
            data_schema = tiledb.ArraySchema(
                domain=tiledb.Domain(*dims),
                sparse=False,
                attrs=[
                    tiledb.Attr("trace",
                                sample_dtype,
                                filters=(tiledb.LZ4Filter(), ))
                ],
            )
            with self._tiledb_array(data_uri, data_schema) as tdb:
                self._fill_data(tdb)
Example #5
    def inject_config(self, env_config: ConfigEnvironment) -> None:
        super(DefaultScenario, self).inject_config(env_config)
        # Initialize TileDB storage needed for the scenario specific data
        exp_tmp_dir = self._env_config.temp_dir(
            experiment_id=self.experiment_id)
        if self._env_config is not None and exp_tmp_dir is not None:
            abs_path = pathlib.Path(exp_tmp_dir).resolve().joinpath(
                'def_tdb_arrays')
            self.__tiledb_group_name = abs_path.as_uri()
            self.__tiledb_stats_array = abs_path.joinpath('stats').as_uri()
        # Create the tileDB group of arrays used by this scenario
        tdb_gtype = tiledb.object_type(self.__tiledb_group_name)
        if tdb_gtype is None:  # Group does not exist
            tiledb.group_create(self.__tiledb_group_name)
        elif tdb_gtype == 'array':  # Exists, but as an array
            tiledb.remove(self.__tiledb_group_name)  # Remove the array
            tiledb.group_create(
                self.__tiledb_group_name)  # Create a group instead
        self._clear_arrays()

        self._mdde_result_folder_root = env_config.result_dir(self,
                                                              get_root=True)
Example #6
 def __init__(
     self,
     uri,
     key=None,
     timestamp=None,
 ):
     """
     Parameters
     ----------
     uri : str
          Uniform Resource Identifier (URI) for TileDB array. May be a path to a
         local TileDB array or a URI for a remote resource.
     key : Optional[str]
         If not None, the key for accessing the TileDB array at the provided URI.
     timestamp : Optional[int]
         If not None, time in milliseconds to open the array at.
     """
     if tiledb.object_type(uri) != "array":
         raise ValueError(
             f"Unable to read from URI '{uri}'. URI is not a TileDB array.")
     self._uri = uri
     self._key = key
     self._timestamp = timestamp
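
With the URI validated, the stored key and timestamp can later be passed straight to tiledb.open(); a minimal sketch of how a read method on this class might use them (the method itself is an assumption, not shown in the original):

 def _open(self):
     return tiledb.open(self._uri, mode="r", key=self._key,
                        timestamp=self._timestamp)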
Example #7
    def _validate_and_initialize(self):
        """
        remember, preload_validation() has already been called, so
        no need to repeat anything it has done.

        Load the CXG "group" metadata and cache instance values.
        Be very aware of multiple versions of the CXG object.

        CXG versions in the wild:
        * version 0, aka "no version" -- can be detected by the lack
          of a cxg_group_metadata array.
        * version 0.1 -- metadata attached to the cxg_group_metadata array.
          Same as 0, except it adds group metadata.
        """
        # defaults, so that an unexpected object type fails the version check
        # below instead of raising a NameError
        cxg_version = None
        title = None
        about = None
        a_type = tiledb.object_type(path_join(self.url, "cxg_group_metadata"),
                                    ctx=self.tiledb_ctx)
        if a_type is None:
            # version 0
            cxg_version = "0.0"
        elif a_type == "array":
            # version >0
            gmd = self.open_array("cxg_group_metadata")
            cxg_version = gmd.meta["cxg_version"]
            if cxg_version == "0.1":
                cxg_properties = json.loads(gmd.meta["cxg_properties"])
                title = cxg_properties.get("title", None)
                about = cxg_properties.get("about", None)

        if cxg_version not in ["0.0", "0.1"]:
            raise DatasetAccessError(f"cxg matrix is not valid: {self.url}")

        self.title = title
        self.about = about
        self.cxg_version = cxg_version
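
Note: open_array() is used above but not shown; a plausible sketch, assuming it simply opens a named array inside the CXG group with the shared context:

 def open_array(self, name):
     return tiledb.open(path_join(self.url, name), mode="r", ctx=self.tiledb_ctx)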
Example #8
def ingest_single_threaded(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)

    #config
    tdb_Config = tiledb.Config(tdb_config_params)
    tdb_write_Context = tiledb.Ctx(config=tdb_Config)
    tdb_read_Context = tiledb.Ctx(config=tdb_Config)

    overwrite = args.overwrite
    coord_tile_size = args.coord_tile_size
    task_tile_size = args.task_tile_size
    attribute_config = args.attribute_config
    updating = False

    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    num_tasks = tiledb_metadata.shape[0]

    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")
    chrom_indices, num_indices = transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:" + str(num_indices))
    array_out_name = args.tiledb_group
    if tiledb.object_type(array_out_name) == "array":
        if not overwrite:
            raise Exception(
                "array: " + str(array_out_name) +
                " already exists; use the --overwrite flag to overwrite it. Exiting"
            )
        else:
            print(
                "warning: the array: " + str(array_out_name) +
                " already exists. You provided the --overwrite flag, so it will be updated/overwritten"
            )
            updating = True
    else:
        #create the array:
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices, num_tasks),
                         attribute_config=attribute_config,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array:" + str(array_out_name))
        #create metadata array
        metadata_dict = {}
        metadata_dict['tasks'] = [i for i in tiledb_metadata['dataset']]
        metadata_dict['chroms'] = [i for i in chrom_indices.keys()]
        metadata_dict['sizes'] = [i[2] for i in list(chrom_indices.values())]
        metadata_dict['offsets'] = [i[0] for i in list(chrom_indices.values())]
        num_tasks = tiledb_metadata['dataset'].shape[0]
        num_chroms = len(chrom_indices.keys())
        with tiledb.DenseArray(array_out_name, ctx=tdb_write_Context,
                               mode='w') as cur_array:
            cur_array.meta['num_tasks'] = num_tasks
            cur_array.meta['num_chroms'] = num_chroms
            for task_index in range(num_tasks):
                cur_array.meta['_'.join(['task', str(task_index)])] = metadata_dict['tasks'][task_index]
            for chrom_index in range(num_chroms):
                cur_array.meta['_'.join(['chrom', str(chrom_index)])] = metadata_dict['chroms'][chrom_index]
                cur_array.meta['_'.join(['size', str(chrom_index)])] = metadata_dict['sizes'][chrom_index]
                cur_array.meta['_'.join(['offset', str(chrom_index)])] = metadata_dict['offsets'][chrom_index]
        print("created tiledb metadata")
    if updating:
        cur_array_toread = tiledb.DenseArray(array_out_name,
                                             ctx=tdb_read_Context,
                                             mode='r')
    else:
        cur_array_toread = None
    cur_array_towrite = tiledb.DenseArray(array_out_name,
                                          ctx=tdb_write_Context,
                                          mode='w')
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        print(dataset)
        #read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        for start_chunk_index in range(0, num_indices, args.write_chunk):
            print(str(start_chunk_index) + '/' + str(num_indices))
            #clamp the chunk end to the total number of indices
            end_chunk_index = min(num_indices,
                                  start_chunk_index + args.write_chunk)
            print("end chunk index:" + str(end_chunk_index))
            #convert global indices to chrom+pos indices
            chunk_chrom_coords = transform_indices_to_chrom_coords(
                start_chunk_index, end_chunk_index, chrom_indices)
            print("processing:" + str(chunk_chrom_coords))
            for coord_set in chunk_chrom_coords:
                print("\t" + "coord_set:" + str(coord_set))
                process_chunk(task_index, data_dict, attribute_info, coord_set,
                              updating, args, cur_array_toread,
                              cur_array_towrite)
                print('Gigs:', round(psutil.virtual_memory().used / (10**9),
                                     2))
                print("wrote chrom array for task:" + str(dataset) +
                      "for index:" + str(start_chunk_index))
    print("closing arrays")
    if cur_array_toread is not None:
        cur_array_toread.close()
    cur_array_towrite.close()
    print('done!')
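
Note: transform_chrom_size_to_indices() (used here and in Example #14) is not shown. The sketch below is consistent with how its result is consumed above -- offset at index 0, size at index 2 of each value, and the total index count as the second return value -- but the actual implementation may differ:

def transform_chrom_size_to_indices(chrom_sizes):
    #flatten all chromosomes onto a single global coordinate axis
    chrom_indices = {}
    offset = 0
    for _, row in chrom_sizes.iterrows():
        chrom, size = row[0], int(row[1])
        chrom_indices[chrom] = (offset, offset + size, size)
        offset += size
    return chrom_indices, offset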
Example #9
 def has_array(self, name):
     a_type = tiledb.object_type(path_join(self.url, name),
                                 ctx=self.tiledb_ctx)
     return a_type == "array"
Example #10
#NOTE: the original snippet was truncated mid-schema and used the legacy
#tiledb-py API that passed `ctx` as the first positional argument. The schema
#below is a reconstruction (dimension names and domains are assumptions), and
#all calls are updated to the current keyword-argument API.
import numpy as np
import tiledb

array_name = "quickstart_sparse"  #assumed; the original definition was truncated


def create_array():
    dom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(1, 10), tile=10, dtype=np.int32),
        tiledb.Dim(name="cols", domain=(1, 10), tile=10, dtype=np.int32))
    schema = tiledb.ArraySchema(
        domain=dom,
        sparse=True,
        attrs=[tiledb.Attr(name="a", dtype=np.int32)])

    tiledb.SparseArray.create(array_name, schema)


def write_array():
    with tiledb.SparseArray(array_name, mode='w') as A:
        I, J = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        data = np.array(([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
        A[I, J] = data


def read_array():
    with tiledb.SparseArray(array_name, mode='r') as A:
        data = A[1:11]
        a_vals = data["a"]
        #current tiledb-py returns coordinates keyed by dimension name
        for i, (r, c) in enumerate(zip(data["rows"], data["cols"])):
            print("Cell (%d, %d) has data %d" % (r, c, a_vals[i]))


if tiledb.object_type(array_name) != "array":
    create_array()
    write_array()

read_array()
Example #11
def ingest(args):
    try:
        if isinstance(args, dict):
            args = args_object_from_args_dict(args)
        overwrite = args.overwrite
        chrom_threads = args.chrom_threads
        batch_size = args.batch_size
        tile_size = args.tile_size
        attribute_config = args.attribute_config
        updating = False

        attribute_info = get_attribute_info(args.attribute_config)
        tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')

        print("loaded tiledb metadata")
        chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
        print("loaded chrom sizes")

        #check if the tiledb_group exists, and if not, create it
        if tiledb.object_type(args.tiledb_group) != 'group':
            group_uri = tiledb.group_create(args.tiledb_group)
            print("created tiledb group")
        else:
            group_uri = args.tiledb_group
            print("tiledb group already exists")
        for task_index, task_row in tiledb_metadata.iterrows():
            dataset = task_row['dataset']
            #read in filenames for bigwigs
            data_dict = open_data_for_parsing(task_row, attribute_info)
            array_outf_prefix = "/".join([args.tiledb_group, dataset])
            pool_inputs = []
            for chrom_index, chrom_row in chrom_sizes.iterrows():
                chrom = chrom_row[0]
                size = chrom_row[1]
                array_out_name = '.'.join([array_outf_prefix, chrom])
                if tiledb.object_type(array_out_name) == "array":
                    if not overwrite:
                        raise Exception(
                            "array: " + str(array_out_name) +
                            " already exists; use the --overwrite flag to overwrite it. Exiting"
                        )
                    else:
                        print(
                            "warning: the array: " + str(array_out_name) +
                            " already exists. You provided the --overwrite flag, so it will be updated/overwritten"
                        )
                        updating = True
                else:
                    #create the array:
                    create_new_array(size=size,
                                     attribute_config=attribute_config,
                                     array_out_name=array_out_name,
                                     tile_size=tile_size)
                    print("created new array:" + str(array_out_name))
                pool_inputs.append((data_dict, attribute_info, chrom, size,
                                    array_out_name, updating, args))
            with Pool(chrom_threads, initializer=init_worker) as pool:
                #with ThreadPool(chrom_threads) as pool:
                print("made pool")
                res = pool.map(process_chrom, pool_inputs)
            #the with-block calls pool.terminate() on exit, so no separate
            #close()/join() is needed here
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            print("wrote chrom array for task:" + str(dataset))
    except KeyboardInterrupt:
        print('detected keyboard interrupt')
        #shutdown the pool
        pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        raise
    except Exception as e:
        print(repr(e))
        #shutdown the pool
        pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        raise e
Example #12
def main():
    #current tiledb-py takes the URI first; ctx is an optional keyword
    #argument (the legacy API took ctx as the first positional argument)
    ctx = tiledb.Ctx()
    print("{!r}".format(tiledb.object_type("my_group", ctx=ctx)))
    print("{!r}".format(tiledb.object_type("my_dense_array", ctx=ctx)))
    print("{!r}".format(tiledb.object_type("my_kv", ctx=ctx)))
    print("{!r}".format(tiledb.object_type("invalid_path", ctx=ctx)))
Example #13
 def guess_can_open(self, filename_or_obj):
     try:
         return tiledb.object_type(filename_or_obj) == "array"
     except tiledb.TileDBError:
         return False
Example #14
def ingest(args):
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)
    if args.write_chunk > max_write_chunk:
        print("WARNING: You have specified a write_chunk size of " + str(args.write_chunk) +
              " but the maximum supported with python serialization is: " + str(max_write_chunk) +
              ". It will be reset to " + str(max_write_chunk))
        args.write_chunk = max_write_chunk

    #create a queue to write the array
    global write_queue
    write_queue=Queue(maxsize=args.max_queue_size)

    #config
    tdb_Config = tiledb.Config(tdb_config_params)
    tdb_write_Context = tiledb.Ctx(config=tdb_Config)
    tdb_read_Context = tiledb.Ctx(config=tdb_Config)

    overwrite = args.overwrite
    coord_tile_size = args.coord_tile_size
    task_tile_size = args.task_tile_size
    attribute_config = args.attribute_config
    attribute_config_file = args.attribute_config_file
    updating = False

    attribute_info = get_attribute_info(args.attribute_config, args.attribute_config_file)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    num_tasks = tiledb_metadata.shape[0]
    print("num_tasks:" + str(num_tasks))
    
    print("loaded tiledb metadata")
    chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t')
    print("loaded chrom sizes")
    chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:"+str(num_indices))
    array_out_name=args.array_name
    if tiledb.object_type(array_out_name) == "array":
        if overwrite==False:
            raise Exception("array:"+str(array_out_name) + "already exists; use the --overwrite flag to overwrite it. Exiting")
        else:
            print("warning: the array: "+str(array_out_name)+" already exists. You provided the --overwrite flag, so it will be updated/overwritten")
            updating=True
    else:
        #create the array:
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices, num_tasks - 1),
                         attribute_config=attribute_config,
                         attribute_config_file=attribute_config_file,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array:" + str(array_out_name))
        #create metadata array
        metadata_dict = {}
        metadata_dict['tasks'] = list(tiledb_metadata['dataset'])
        metadata_dict['chroms'] = list(chrom_indices.keys())
        metadata_dict['sizes'] = [i[2] for i in chrom_indices.values()]
        metadata_dict['offsets'] = [i[0] for i in chrom_indices.values()]
        num_tasks = tiledb_metadata['dataset'].shape[0]
        num_chroms = len(chrom_indices.keys())
        with tiledb.DenseArray(array_out_name, ctx=tdb_write_Context, mode='w') as cur_array:
            cur_array.meta['num_tasks'] = num_tasks
            cur_array.meta['num_chroms'] = num_chroms
            for task_index in range(num_tasks):
                cur_array.meta['_'.join(['task', str(task_index)])] = metadata_dict['tasks'][task_index]
            for chrom_index in range(num_chroms):
                cur_array.meta['_'.join(['chrom', str(chrom_index)])] = metadata_dict['chroms'][chrom_index]
                cur_array.meta['_'.join(['size', str(chrom_index)])] = metadata_dict['sizes'][chrom_index]
                cur_array.meta['_'.join(['offset', str(chrom_index)])] = metadata_dict['offsets'][chrom_index]
        print("created tiledb metadata")
    pool = Pool(processes=args.threads, initializer=init_worker)
    print("made pool")
    pool_inputs = []
    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        #read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        for start_chunk_index in range(0, num_indices, args.write_chunk):
            #clamp the chunk end to the total number of indices
            end_chunk_index = min(num_indices, start_chunk_index + args.write_chunk)
            #convert global indices to chrom+pos indices
            chunk_chrom_coords = transform_indices_to_chrom_coords(start_chunk_index, end_chunk_index, chrom_indices)
            if chunk_chrom_coords is None:
                raise Exception("failed to transform indices:" + str(start_chunk_index) + "-" + str(end_chunk_index) + " to chrom coords;" + str(chrom_indices))
            for coord_set in chunk_chrom_coords:
                pool_inputs.append((task_index, data_dict, attribute_info, coord_set, args))
    pool_feed_chunk_start = 0
    pool_feed_chunk_max = len(pool_inputs)
    chunks_to_process = len(pool_inputs)
    array_writer = Process(target=write_array,
                           args=(args, updating, chunks_to_process))
    try:
        array_writer.start()
    except Exception as e:
        raise e

    try:
        while pool_feed_chunk_start < pool_feed_chunk_max:
            pool_feed_chunk_end = min([pool_feed_chunk_start + queue_feed_chunk_size, pool_feed_chunk_max])
            #only map more work if the queue is not full and total memory
            #consumption is under the limit; the values must be re-read inside
            #the loop below, otherwise a full queue would spin here forever
            print("mapping to pool, queue size:" + str(write_queue.qsize()))
            print("mapping to pool, mem used:" + str(psutil.virtual_memory().used / (10**9)))
            while (write_queue.qsize() >= args.max_queue_size) or \
                  (psutil.virtual_memory().used / (10**9) >= args.max_mem_g):
                time.sleep(10)
            print("sending to pool:" + str(pool_feed_chunk_start) + "-" + str(pool_feed_chunk_end) + "/" + str(chunks_to_process))
            pool.map(process_chunk, pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
            pool_feed_chunk_start += queue_feed_chunk_size
            time.sleep(60)
        pool.close()
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        pool.terminate()
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        raise 
        
    #wait until we're done writing to the tiledb array
    array_writer.join()
    print("array_writer.join() is complete")
    print("shutting down pool")
    pool.join()
    print('done!')
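
Note: the write_array() target that array_writer runs is not shown. A plausible sketch, assuming workers push (task_index, start, end, values) tuples onto write_queue, the array has a single attribute, and a lone writer process performs every TileDB write (all of these are assumptions):

def write_array(args, updating, chunks_to_process):
    ctx = tiledb.Ctx(config=tiledb.Config(tdb_config_params))
    written = 0
    with tiledb.DenseArray(args.array_name, ctx=ctx, mode='w') as cur_array:
        while written < chunks_to_process:
            #block until a worker hands over a finished chunk
            task_index, start, end, values = write_queue.get()
            cur_array[start:end, task_index] = values
            written += 1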