def setup(self):
    """Create the time-series value arrays and the indexed lookup table."""
    make_earray = self.file_.create_earray
    self.x_blob = make_earray(self.node, "ts_x_values",
                              Atom.from_dtype(np.dtype("int64")), (0,),
                              filters=filters)
    self.y_blob = make_earray(self.node, "ts_y_values",
                              Atom.from_dtype(np.dtype("float64")), (0,),
                              filters=filters)
    self.bp = make_earray(self.node, "bp",
                          Atom.from_dtype(np.dtype("int32")), (0,),
                          filters=filters)
    # Row layout of the index table; pos fixes the on-disk column order.
    description = {
        "unique_id": StringCol(itemsize=64, pos=0),
        "index": UInt32Col(pos=1),
        "blank_flags_is_none": BoolCol(pos=2),
        "label": StringCol(itemsize=32, pos=3),
        "start": UInt32Col(pos=4),
        "size": UInt32Col(pos=5),
        "bp_start": UInt32Col(pos=6),
        "bp_size": UInt32Col(pos=7),
    }
    self.ts_index = self.file_.create_table(self.node, "ts_index",
                                            description, filters=None)
    # Every column which appears in a where() call should/must be indexed!
    # This is not only for performance but for correct lookup as well
    # (strange bugs were observed otherwise).
    self.ts_index.cols.unique_id.create_index()
    self.ts_index.cols.index.create_index()
def setup_blobs(self):
    """Create the m/z and intensity blob arrays; existing arrays are kept."""
    for array_name, dtype_name in (("mz_blob", "float64"),
                                   ("ii_blob", "float32")):
        if hasattr(self.node, array_name):
            continue
        self.file_.create_earray(
            self.node,
            array_name,
            Atom.from_dtype(np.dtype(dtype_name)),
            (0,),
            filters=filters,
        )
def setup_blobs(self):
    """Create per-MS-level m/z and intensity arrays for levels 1 and 2.

    Only the presence of ``ms1_mz_blob`` is checked: the four arrays are
    assumed to be created (or missing) together.
    """
    if hasattr(self.node, "ms1_mz_blob"):
        return
    for level in (1, 2):
        self.file_.create_earray(
            self.node,
            "ms%d_mz_blob" % level,
            Atom.from_dtype(np.dtype("float64")),
            (0,),
            filters=filters,
        )
        self.file_.create_earray(
            self.node,
            "ms%d_ii_blob" % level,
            Atom.from_dtype(np.dtype("float32")),
            (0,),
            filters=filters,
        )
def save_hdf(data, fn, complevel=9, key='data'):
    """Write ``data`` to the HDF5 file ``fn`` as one blosc-compressed carray.

    :param data: numpy array to store
    :param fn: output file path (overwritten)
    :param complevel: blosc compression level, 0-9
    :param key: node name under the file root
    """
    compression = Filters(complevel=complevel, complib='blosc')
    with open_file(fn, mode="w") as handle:
        handle.create_carray('/', key, Atom.from_dtype(data.dtype),
                             filters=compression, obj=data)
def setup_blobs(self):
    """Ensure the m/z (float64) and intensity (float32) earrays exist."""
    specs = {"mz_blob": "float64", "ii_blob": "float32"}
    for blob_name, dtype_name in specs.items():
        if not hasattr(self.node, blob_name):
            self.file_.create_earray(
                self.node,
                blob_name,
                Atom.from_dtype(np.dtype(dtype_name)),
                (0, ),
                filters=filters,
            )
def create_store(self, col_index):
    """Create the byte-blob array and its start-offset array for one column.

    Returns the pair ``(blob, starts)``: ``blob`` holds raw uint8 payload
    bytes, ``starts`` holds uint64 offsets into ``blob``.
    """
    byte_atom = Atom.from_dtype(np.dtype("uint8"))
    offset_atom = Atom.from_dtype(np.dtype("uint64"))
    blob = self.file_.create_earray(
        self.node,
        "%s__%s" % (self.blob_name_stem, col_index),
        byte_atom,
        (0,),
        filters=filters,
        chunkshape=(50000,),
    )
    starts = self.file_.create_earray(
        self.node,
        "%s_starts__%s" % (self.blob_name_stem, col_index),
        offset_atom,
        (0,),
        filters=filters,
        chunkshape=(10000,),
    )
    return blob, starts
def __init__(self, file_, name, n_cols, cache_block_size=None):
    """Open (or create) a packed bit-flag array named ``name`` in ``file_``.

    Flags are stored row-wise, eight flags per uint8 byte.
    """
    self.n_cols = n_cols
    # Bytes per row.  NOTE: allocates one spare byte when n_cols is an
    # exact multiple of 8 — kept as-is for compatibility with existing
    # files written with this layout.
    self._n_cols_flags = n_cols // 8 + 1
    if hasattr(file_.root, name):
        # Reuse the existing array; row count derives from its byte length.
        self.data = getattr(file_.root, name)
        self.n_rows = len(self.data) // self._n_cols_flags
    else:
        self.data = file_.create_earray(
            file_.root,
            name,
            Atom.from_dtype(np.dtype("uint8")),
            (0,),
            filters=filters,
        )
        self.n_rows = 0
    # in rows
    self.cache_block_size = 10000 if cache_block_size is None else cache_block_size
    self.cache = LruDict(500)
    # Bit masks 1, 2, 4, ..., 128 as a column vector for unpacking bytes.
    self.test_vec = (1 << np.arange(8, dtype="uint8"))[:, None]
def __init__(
    self,
    filename,
    attrs=None,
    filters=None,
    sim_sources=None,
    sim_attrs=None,
    buffer=1,
):
    """Create a writer that adds a slow-signal simulation group to the file.

    ``filename``/``attrs``/``filters``/``buffer`` are forwarded to the base
    writer.  ``sim_sources``, if given, is an iterable of source objects
    with ``name``/``ra``/``dec``/``vMag`` attributes; one table is created
    per source.  ``sim_attrs``, if given, is a nested mapping
    ``{key: {sub_key: ndarray}}`` stored as carrays named "key.sub_key".
    """
    super().__init__(filename, attrs, filters, buffer)
    self.simgroup = self.file.create_group(
        self.file.root, "SlowSignalSimulation", "Slow signal simulation data"
    )
    self.sim_tables = []
    if sim_sources is not None:
        for i, source in enumerate(sim_sources):
            table = self.file.create_table(
                self.simgroup,
                "sim_source%d" % i,
                SimSSTableDs,
                "Simulation data for %s" % source.name,
            )
            self.sim_tables.append((table, table.row))
            self.tables.append(table)  # For flush optimization
            table.attrs["name"] = source.name
            table.attrs["ra"] = source.ra
            table.attrs["dec"] = source.dec
            table.attrs["vMag"] = source.vMag
            # NOTE(review): this flags the base writer's main table
            # (self.table), not the per-source table; repeated per
            # iteration but idempotent — confirm self.table exists in
            # the parent class.
            self.table.attrs["simulation"] = True
    import warnings

    with warnings.catch_warnings():
        # Dotted dataset names ("{k}.{sub_k}") would otherwise trigger
        # NaturalNameWarning; FlavorWarning comes from array storage.
        warnings.simplefilter("ignore", tables.NaturalNameWarning)
        warnings.simplefilter("ignore", tables.FlavorWarning)
        if sim_attrs is not None:
            for k, sa in sim_attrs.items():
                for sub_k, sub_sa in sa.items():
                    atom = Atom.from_dtype(sub_sa.dtype)
                    ds = self.file.create_carray(
                        self.simgroup, "{}.{}".format(k, sub_k), atom, sub_sa.shape
                    )
                    ds[:] = sub_sa
def main(argv):
    """Convert each configured skin subfolder to an uncompressed HDF5 carray."""
    # Manually change the list we iterate through to select between the data
    # and masks. (Doing both kills the node.)
    for input_path, save_target in SKIN_SUBFOLDERS:
        h5file = save_target
        h5 = open_file(h5file, "w")
        X = unionJackPrep(input_path)
        atom = Atom.from_dtype(X.dtype)
        flt = Filters(complevel=0)
        h5data = h5.create_carray(h5.root, "data", atom, X.shape, filters=flt)
        h5data[:] = X
        h5data.attrs.mean = None
        h5data.attrs.std = None
        h5.flush()
        h5.close()
        # Drop the large intermediates before the next iteration to keep
        # peak memory down.
        del h5
        del X
        del atom
        del flt
        del h5data
        gc.collect()
        print("No mean or std to compute.")
def h5write(filename, varname, data):
    """Writes one data matrix to HDF5 file. Similar to Matlab function.

    :param filename: target HDF5 file name (overwritten)
    :param varname: variable name inside the file; a leading "/" is stripped
    :param data: Numpy array; 1-D input is reshaped to a single column
    """
    assert isinstance(filename, basestring), "file name must be a string"
    assert isinstance(varname, basestring), "variable name must be a string"
    assert isinstance(data, np.ndarray), "data must be a Numpy array"
    if len(data.shape) == 1:
        data = data.reshape(-1, 1)
    # remove leading "/" from variable name
    # (startswith also handles the empty-string case that varname[0] broke on)
    if varname.startswith("/"):
        varname = varname[1:]
    # Open OUTSIDE the try block: if openFile() itself fails there is no
    # handle to close, and the old code raised a NameError from the
    # "finally: h5.close()" that masked the real error.
    h5 = openFile(filename, "w")
    try:
        a = Atom.from_dtype(data.dtype)
        # transpose for Matlab compatibility
        h5.create_array(h5.root, varname, data.T, atom=a)
        h5.flush()
    finally:
        h5.close()
def test_from_dtype_01(self):
    # A multi-dimensional dtype maps to an atom carrying the sub-shape.
    expected = Int16Atom(shape=(2, 2), dflt=0)
    converted = Atom.from_dtype(numpy.dtype((numpy.int16, (2, 2))))
    self.assertEqual(converted, expected)
    self.assertEqual(str(converted), str(expected))
def store_tensor(name, tensor):
    """Persist ``tensor``'s backing data as a compressed carray under ``name``."""
    payload = tensor._data
    atom = Atom.from_dtype(payload.dtype)
    carray = fileh.createCArray(root, name, atom, tensor.shape,
                                filters=filters)
    carray[:] = payload
def make_hdf5(data, h5file, dtype=np.float64, delimiter=" ", skiprows=0, comp_level=0):
    """Makes an HDF5 file from whatever given data.

    :param data: - input data in Numpy.ndarray or filename, or a shape tuple
    :param h5file: - name (and path) of the output HDF5 file
    :param delimiter: - data delimiter for text, csv files
    :param comp_level: - compression level of the HDF5 file

    NOTE(review): Python 2 only (``basestring``/``xrange``; the "rU" open
    mode was removed in Python 3.11).
    """
    assert comp_level < 10, "Compression level must be 0-9 (0 for no compression)"
    # "fill" selects the write strategy below: "" = in-memory array,
    # "iter" = row-by-row streaming from text, "empty" = allocate only.
    fill = ""
    # open data file
    if isinstance(data, np.ndarray):
        X = data
    elif isinstance(data, basestring) and data[-3:] in ['npy']:
        X = np.load(data)
    elif isinstance(data, basestring) and data[-3:] in ['.gz', 'bz2']:
        # compressed text: loadtxt handles .gz/.bz2 transparently
        X = np.loadtxt(data, dtype=dtype, delimiter=delimiter, skiprows=skiprows)
    elif isinstance(data, basestring) and data[-3:] in ['txt', 'csv']:
        # iterative out-of-memory loader for huge .csv/.txt files
        fill = "iter"
        # check data dimensionality: read only the first data row to learn
        # the column count; X holds just that one row here
        with open(data, "rU") as f:
            for _ in xrange(skiprows):
                f.readline()
            reader = csv.reader(f, delimiter=delimiter)
            for line in reader:
                X = np.fromiter(line, dtype=dtype)
                break
    elif isinstance(data, tuple) and len(data) == 2:
        # a (rows, cols) shape tuple: allocate an empty dataset of that shape
        X = np.empty((1, 1))
        fill = "empty"
    else:
        assert False, "Input data must be Numpy ndarray, .npy file, or .txt/.csv text file (compressed .gz/.bz2)"
    # process data
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    assert len(X.shape) == 2, "Data in Numpy ndarray must have 2 dimensions"
    # create hdf5 file
    if comp_level > 0:
        flt = Filters(complevel=comp_level, shuffle=True)
    else:
        flt = Filters(complevel=0)
    h5 = open_file(h5file, "w")
    a = Atom.from_dtype(np.dtype(dtype), dflt=0)
    # write data to hdf5 file
    if fill == "iter":
        # iteratively fill the data; after the newaxis reshape above,
        # X.shape[0] is the number of columns of the text file
        h5data = h5.create_earray(h5.root, "data", a, (0, X.shape[0]), filters=flt)
        with open(data, "rU") as f:
            for _ in xrange(skiprows):
                f.readline()
            reader = csv.reader(f, delimiter=delimiter)
            for line in reader:
                row = np.fromiter(line, dtype=dtype)
                h5data.append(row[np.newaxis, :])
    elif fill == "empty":
        # no fill at all: "data" is the (rows, cols) shape tuple here
        h5data = h5.create_carray(h5.root, "data", a, data, filters=flt)
    else:
        # write whole data matrix
        h5data = h5.create_carray(h5.root, "data", a, X.shape, filters=flt)
        h5data[:] = X
    # close the file
    h5data.attrs.mean = None
    h5data.attrs.std = None
    h5.flush()
    h5.close()
def _add_earray(self, root, name, dtype, shape, **kwargs):
    """Create and return an extendable array of ``dtype`` at ``root``/``name``."""
    return self.h5.createEArray(
        root, name, Atom.from_dtype(np.dtype(dtype)), shape, **kwargs
    )
def generate_patchcoords(self):
    """Compute tissue patch-center coordinates per patch size and store them.

    An Otsu threshold on a blurred, downsampled overview separates tissue
    from background; candidate patch centers falling on tissue are then
    verified tile-by-tile (< 25% whitespace) and written as one
    ``Size<patchsize>`` carray per patch size.

    NOTE(review): inconsistent return types — the early-exit path returns
    the existing patch-coords file, the full path returns True; confirm
    what callers expect.
    """
    if self.has_patchcoords():
        patchcoordsfile = self.get_patchcoordsfile()
        return patchcoordsfile
    logger.debug(f'Generating patches for {self.imageID}')
    filters = Filters(complib='zlib', complevel=5)
    patchcoordsfile = open_file(self.patchcoordsfilepath, mode='w',
                                title=f'{self.imageID} patches',
                                filters=filters)
    atom = Atom.from_dtype(np.dtype('uint16'))
    slide = self.get_slide()
    # Lowest-resolution pyramid level as the thumbnail for masking.
    dslevel = slide.level_count - 1
    dscoord = Coord(*slide.level_dimensions[-1])
    downsample = slide.level_downsamples[-1]
    logger.debug('Reading region')
    # transpose(1, 0, 2) swaps to (x, y, channel) ordering.
    dsregion = np.array(
        slide.read_region((0, 0), dslevel,
                          (dscoord.x, dscoord.y))).transpose(1, 0, 2)
    logger.debug('Performing GaussianBlur')
    blurreddsregion = cv2.GaussianBlur(dsregion, (51, 51), 0)
    blurreddsregion = cv2.cvtColor(blurreddsregion, cv2.COLOR_BGR2GRAY)
    # Otsu threshold: below-threshold (darker) pixels are taken as tissue.
    T_otsu = mahotas.otsu(blurreddsregion)
    mask = np.zeros_like(blurreddsregion)
    mask[blurreddsregion < T_otsu] = 1
    for patchsize in [128, 256, 512, 1024]:
        logger.debug(f'patchsize: {patchsize}')
        # Patch size expressed in downsampled-thumbnail pixels.
        dsps = np.round(patchsize / downsample).astype(int)
        limitcoord = Coord(*dsregion.shape[:2]) / dsps
        logger.debug('Computing downsampled centers')
        # Grid of candidate patch centers over the thumbnail.
        dscentercoords = [
            Coord(int(dsps / 2 + i * dsps), int(dsps / 2 + j * dsps))
            for i in range(limitcoord.x - 1)
            for j in range(limitcoord.y - 1)
        ]
        logger.debug('Computing mask coordinates')
        assert (dscentercoords[-1].x < mask.shape[0]
                and dscentercoords[-1].y < mask.shape[1])
        # Keep only centers that land on the tissue mask.
        mask_centers = list(
            filter(lambda c: mask[c.x, c.y] == 1, dscentercoords))
        tile_generator = DeepZoomGenerator(slide, tile_size=patchsize,
                                           overlap=0, limit_bounds=False)
        level_count = tile_generator.level_count - 1
        logger.debug(
            f'Saving patches for {self.imageID} patchsize: {patchsize}')
        N = len(mask_centers)
        logger.debug('Retrieving tiles')
        assert (mask_centers[-1] * downsample) / patchsize <\
            Coord(*tile_generator.level_tiles[-1])
        valid_coords = []
        for (i, coord) in tqdm(list(enumerate(mask_centers))):
            # Fetch the full-resolution tile at this candidate center and
            # reject it if >= 25% of its pixels exceed the Otsu threshold
            # (i.e. are whitespace/background).
            tile = np.array(
                tile_generator.get_tile(
                    level_count, ((downsample * coord.x) / patchsize,
                                  (downsample * coord.y) / patchsize)))
            if ((tile > T_otsu).sum() / np.prod(tile.shape)) < 0.25:
                valid_coords.append(coord)
        n = len(valid_coords)
        valid_coords = np.array([c.to_array() for c in valid_coords])
        logger.debug((f"Selected {n} tiles out of {N} ({n/N:0.2})"
                      "with percent whitespace < 0.25"))
        if n > 0:
            # One (n, 2) coordinate array per patch size.
            carray = patchcoordsfile.create_carray('/', f'Size{patchsize}',
                                                   atom, (n, 2))
            carray[:, :] = valid_coords
    patchcoordsfile.close()
    return True
def test_from_dtype_04(self):
    # A scalar float64 dtype converts to a plain Float64Atom.
    reference = Float64Atom(shape=(), dflt=0.0)
    converted = Atom.from_dtype(numpy.dtype('float64'))
    self.assertEqual(converted, reference)
    self.assertEqual(str(converted), str(reference))
def test_from_dtype_03(self):
    # Unicode dtypes are unsupported: conversion must warn and fall back
    # to a byte-string atom of the same length.
    with self.assertWarns(Warning):
        converted = Atom.from_dtype(numpy.dtype('U5'), dflt=b'hello')
    reference = StringAtom(itemsize=5, shape=(), dflt=b'hello')
    self.assertEqual(converted, reference)
    self.assertEqual(str(converted), str(reference))
def test_from_dtype_02(self):
    # A bytes dtype of length 5 maps to a StringAtom with itemsize 5.
    reference = StringAtom(itemsize=5, shape=(), dflt=b"hello")
    converted = Atom.from_dtype(numpy.dtype("S5"), dflt=b"hello")
    self.assertEqual(converted, reference)
    self.assertEqual(str(converted), str(reference))
# Go to the files location in the filesystem. shape_input = options.labels_indir foci_input = options.foci_indir cur_dir = getcwd() try: foci_files = read_filenames(foci_input,'.out') shape_files = read_filenames(shape_input,'.label') except: print "Could not read files from one of " + foci_input + ", " + shape_input sys.exit(1) foci_files.sort() shape_files.sort() # iterate over sorted & zipped files for (f,s) in zip(foci_files,shape_files): label_range = f.split('.')[0] foci_array = read_array_from_file(f,foci_input) shape_array = read_array_from_file(s,shape_input) relabeled_array = combine_arrays(foci_array,shape_array) atom = Atom.from_dtype(relabeled_array.dtype) labels = h5file.createCArray(where=labels_group, name=label_range, atom=atom, shape=relabeled_array.shape, filters=zlib_filters) labels[:] = relabeled_array h5file.flush() # Close the h5 file when done. h5file.close()
## then all dynamic arrays for path, rec in h5buffer.h5Dynamic.items(): try: if display and h5buffer.arrayExclude(path): continue dp, dn = os.path.split(path) data = rec.copy() data.shape = ( 1, ) + data.shape ## add integration dimension to data array shape = list(data.shape) shape[0] = 0 # if data.dtype.name.lower().find('string') <> -1: # atom = StringAtom(shape=shape,length=80,flavor=ARRAYFLAVOR) # else: atom = Atom.from_dtype(data.dtype) # atom = Atom(typeTranslate(data.dtype.name),shape=shape,flavor=ARRAYFLAVOR) arr = h5.createEArray(dp, dn, atom, shape) arr.flavor = 'numpy' arr.append(data) self.log.debug('Dynamic array created: %s' % (path)) except Exception, inst: self.log.exception(inst) self.log.error('Failed on group: %s, name: %s' % (dp, dn)) continue ## and finally all attributes for path, data in h5buffer.h5Attributes.items(): try: if display and h5buffer.arrayExclude(path): continue
def test_from_dtype_04(self):
    # Fixed: numpy's capitalized "numeric-style" dtype aliases such as
    # 'Float64' were deprecated and later removed, so numpy.dtype('Float64')
    # raises TypeError on modern numpy.  Use the canonical 'float64' name,
    # which is what the atom conversion is meant to be tested against.
    atom1 = Atom.from_dtype(numpy.dtype('float64'))
    atom2 = Float64Atom(shape=(), dflt=0.0)
    self.assertEqual(atom1, atom2)
    self.assertEqual(str(atom1), str(atom2))
all_plates = [p._v_name for p in h5input.walk_groups("/plates")] all_plates = all_plates[1:] # Create a group for each plate in the output file for plate in all_plates: desc = "plate number " + plate h5output.create_group("/plates/",plate,desc) # Walk the input file, filtering each well we encounter, save to the output file for p in all_plates: plate_group = "/plates/" + p print "processing plate %s " % (p) for w in h5input.walk_nodes(where=plate_group, classname='EArray'): well_name = w._v_name raw_data = w.read() filtered_data = apply_constraints(raw_data, constraints_file=options.filters) atom = Atom.from_dtype(filtered_data.dtype) if (filtered_data.shape[0] > 0): ds = h5output.create_carray(where=plate_group, name=well_name, atom=atom, shape=filtered_data.shape, filters=zlib_filters) ds[:] = filtered_data else: ds = h5output.create_earray(where=plate_group, name=well_name, atom=atom, shape=(0,filtered_data.shape[1]), filters=zlib_filters) h5output.flush() print "done writing to h5 output file" h5output.close() h5input.close()