def _init_table(save_file, trajectories_data, roi_size):
    tot_rows = len(trajectories_data)
    trajectories_data.index = np.arange(tot_rows)

    TABLE_FILTERS = tables.Filters(complevel=5,
                                   complib='zlib',
                                   shuffle=True,
                                   fletcher32=True)

    with tables.File(save_file, 'w') as fid:
        rec_data = trajectories_data.to_records(index=False)
        rec_data['skeleton_id'] = trajectories_data.index  # this is only for the viewer
        rec_data['frame_number'] = trajectories_data.index
        fid.create_table('/',
                         'trajectories_data',
                         obj=rec_data,
                         filters=TABLE_FILTERS)
        fid.create_carray('/',
                          'skeleton',
                          tables.Float32Atom(dflt=np.nan),
                          (tot_rows, 49, 2),
                          filters=TABLE_FILTERS)
        fid.create_carray('/',
                          'mask',
                          tables.Float32Atom(dflt=np.nan),
                          (tot_rows, roi_size, roi_size),
                          filters=TABLE_FILTERS)
def create_data_file(out_file, n_samples, image_shape, channels=4):
    hdf5_file = tables.open_file(out_file, mode='w')
    # complevel - compression level
    # complib - the library used for compression
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape,
                                           filters=filters,
                                           expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'true',
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4),
                                             filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
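# A minimal usage sketch for create_data_file above (not part of the original source).
# The file name, image_shape, and the zero-filled arrays are assumptions chosen only to
# illustrate the append pattern: each EArray grows along its first (0-length) axis, so
# every appended block needs a leading "batch" dimension.
def _example_write_one_case(out_file='example.h5', image_shape=(64, 64, 64)):
    hdf5_file, data_storage, truth_storage, affine_storage = create_data_file(
        out_file, n_samples=1, image_shape=image_shape, channels=4)
    image = np.zeros((4,) + image_shape, dtype=np.float32)   # 4-channel image volume
    label = np.zeros((1,) + image_shape, dtype=np.uint8)     # segmentation mask
    affine = np.eye(4, dtype=np.float32)                     # voxel-to-world transform
    data_storage.append(image[np.newaxis])                   # shape (1, 4, *image_shape)
    truth_storage.append(label[np.newaxis])
    affine_storage.append(affine[np.newaxis])
    hdf5_file.close()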
def merge(out, fnames):
    data = tables.open_file(out, mode='a')
    for fname in fnames:
        f = tables.open_file(fname, mode='r')
        raw_targets = f.root.denseFeat
        if 'denseFeat' in data.root:
            prev_data = data.root.denseFeat
            targets = data.create_carray(data.root, '_y',
                                         atom=tables.Float32Atom(),
                                         shape=(raw_targets.shape[0] + prev_data.shape[0], 436))
            targets[:prev_data.shape[0], :] = prev_data[:, :]
            targets[prev_data.shape[0]:, :] = raw_targets[:, :]
            data.flush()
            data.remove_node(data.root, "denseFeat", 1)
        else:
            targets = data.create_carray(data.root, '_y',
                                         atom=tables.Float32Atom(),
                                         shape=(raw_targets.shape[0], 436))
            targets[:, :] = raw_targets[:, :]
            data.flush()
        # give the merged array its final name again so the next file can extend it
        data.rename_node(data.root, "denseFeat", "_y")
        data.flush()
        f.close()
    data.close()
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    print("DEBUG: Opening HDF5 file")
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    print("DEBUG: Writing data_storage to HDF5 file")
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape,
                                           filters=filters,
                                           expectedrows=n_samples)
    print("DEBUG: Writing truth_storage to HDF5 file")
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    print("DEBUG: Writing affine_storage to HDF5 file")
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4),
                                             filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
def setup_hdf5(h5_filename, expectedrows):
    # Open file
    h5file = tables.open_file(h5_filename, mode="w")

    # A table for the per-segment summary data
    table = h5file.create_table(h5file.root, "summary", WhiskerSeg,
                                "Summary data about each whisker segment",
                                expectedrows=expectedrows)

    # Put the contour here
    xpixels_vlarray = h5file.create_vlarray(
        h5file.root, 'pixels_x',
        tables.Float32Atom(shape=()),
        title='Every pixel of each whisker (x-coordinate)',
        expectedrows=expectedrows)
    ypixels_vlarray = h5file.create_vlarray(
        h5file.root, 'pixels_y',
        tables.Float32Atom(shape=()),
        title='Every pixel of each whisker (y-coordinate)',
        expectedrows=expectedrows)

    h5file.close()
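# A follow-up sketch (the file name and pixel arrays are illustrative assumptions,
# not from the original source): the two VLArrays created by setup_hdf5 take one
# variable-length row per whisker, so each whisker's contour can have a different length.
def append_whisker_pixels(h5_filename, xpix, ypix):
    with tables.open_file(h5_filename, mode="a") as h5file:
        h5file.root.pixels_x.append(xpix)  # one ragged row of x-coordinates
        h5file.root.pixels_y.append(ypix)  # one ragged row of y-coordinates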
def initialize_file(save_name, experiments_data, roi_size, frac_train=0.99):
    # divide data into train and test set
    experiments_data = _add_is_train(experiments_data, frac_train)

    # make sure the exp_data has the correct format for strings
    exp_data_r = experiments_data.to_records(index=False)
    dtypes = []
    for col in experiments_data.columns:
        ss = str(exp_data_r[col].dtype)
        if ss == 'object':
            ss = 'S{}'.format(experiments_data[col].map(lambda x: len(x)).max())
        dtypes.append((col, ss))
    dtypes = np.dtype(dtypes)
    exp_data_r = exp_data_r.astype(dtypes)

    roi_data_dtypes = np.dtype([(x, np.int32) for x in ROI_DATA_COLS])

    # create the new file
    with tables.File(str(save_name), 'w') as fid_samples:
        fid_samples.create_table('/',
                                 "experiments_data",
                                 exp_data_r,
                                 filters=TABLE_FILTERS)
        fid_samples.create_table('/',
                                 "roi_data",
                                 roi_data_dtypes,
                                 filters=TABLE_FILTERS)

        coords_g = fid_samples.create_group('/', 'coordinates')
        fid_samples.create_earray(coords_g,
                                  'skeletons',
                                  atom=tables.Float32Atom(),
                                  shape=(0, 49, 2),
                                  chunkshape=(1, 49, 2),
                                  filters=TABLE_FILTERS)
        fid_samples.create_earray(coords_g,
                                  'widths',
                                  atom=tables.Float32Atom(),
                                  shape=(0, 49),
                                  chunkshape=(1, 49),
                                  filters=TABLE_FILTERS)
        fid_samples.create_earray('/',
                                  'mask',
                                  atom=tables.Float32Atom(),
                                  shape=(0, roi_size, roi_size),
                                  chunkshape=(1, roi_size, roi_size),
                                  filters=TABLE_FILTERS)
        fid_samples.create_earray('/',
                                  'full_data',
                                  atom=tables.Float32Atom(),
                                  shape=(0, roi_size, roi_size),
                                  chunkshape=(1, roi_size, roi_size),
                                  filters=TABLE_FILTERS)
def addToH5File(h5file, clusters, freqs, store_intermediate=False):
    # group the clusters by their number of solutions
    poss_nr_sol = []
    groups = []
    for clusteridx, cluster in enumerate(clusters):
        if not cluster['nrsol'] in poss_nr_sol:
            poss_nr_sol.append(cluster['nrsol'])
            groups.append([])
        idx = poss_nr_sol.index(cluster['nrsol'])
        groups[idx].append(clusteridx)

    if 'sagefreqIdx' in h5file.root:
        h5file.remove_node('/sagefreqIdx')
    h5file.create_array(h5file.root, 'sagefreqIdx', freqs)

    for igrp, grp in enumerate(groups):
        # create arrays:
        if store_intermediate:
            cdata = np.load('tmp_store_cdata_%d.npy' % (grp[0]))
        else:
            cdata = clusters[grp[0]]['cdata']
        arrayshape = cdata.shape[:-1] + (len(grp), 4)
        for name in ['sageradec%d' % igrp,
                     'sagephases%d' % igrp,
                     'sageamplitudes%d' % igrp]:
            if name in h5file.root:
                h5file.remove_node('/' + name)
        srcarray = h5file.create_carray(h5file.root, 'sageradec%d' % igrp,
                                        tab.Float32Atom(), shape=(len(grp), 2))
        pharray = h5file.create_carray(h5file.root, 'sagephases%d' % igrp,
                                       tab.Float32Atom(), shape=arrayshape)
        amparray = h5file.create_carray(h5file.root, 'sageamplitudes%d' % igrp,
                                        tab.Float32Atom(), shape=arrayshape)
        for idx, clusteridx in enumerate(grp):
            if store_intermediate:
                cdata = np.load('tmp_store_cdata_%d.npy' % (clusteridx))
            else:
                cdata = clusters[clusteridx]['cdata']
            pharray[:, :, :, idx, :] = np.angle(cdata)
            amparray[:, :, :, idx, :] = np.absolute(cdata)
            srcarray[idx, :] = np.array(
                [clusters[clusteridx]['Ra'], clusters[clusteridx]['Dec']])
            clusters[clusteridx]['cdata'] = []
            if store_intermediate:
                call("rm tmp_store_cdata_%d.npy" % (clusteridx), shell=True)
        pharray.flush()
        amparray.flush()
        srcarray.flush()
def tables(docompute, dowrite, complib, verbose):
    # Filenames
    ifilename = os.path.join(OUT_DIR, "expression-inputs.h5")
    ofilename = os.path.join(OUT_DIR, "expression-outputs.h5")

    # Filters
    shuffle = True
    if complib == 'blosc':
        filters = tb.Filters(complevel=1, complib='blosc', shuffle=shuffle)
    elif complib == 'lzo':
        filters = tb.Filters(complevel=1, complib='lzo', shuffle=shuffle)
    elif complib == 'zlib':
        filters = tb.Filters(complevel=1, complib='zlib', shuffle=shuffle)
    else:
        filters = tb.Filters(complevel=0, shuffle=False)
    if verbose:
        print("Will use filters:", filters)

    if dowrite:
        f = tb.open_file(ifilename, 'w')

        # Build input arrays
        t0 = time()
        root = f.root
        a = f.create_carray(root, 'a', tb.Float32Atom(), shape, filters=filters)
        b = f.create_carray(root, 'b', tb.Float32Atom(), shape, filters=filters)
        if verbose:
            print("chunkshape:", a.chunkshape)
            print("chunksize:", np.prod(a.chunkshape) * a.dtype.itemsize)
        #row = np.linspace(0, 1, ncols)
        row = np.arange(0, ncols, dtype='float32')
        for i in range(nrows):
            a[i] = row * (i + 1)
            b[i] = row * (i + 1) * 2
        f.close()
        print("[tables.Expr] Time for creating inputs:", round(time() - t0, 3))

    if docompute:
        f = tb.open_file(ifilename, 'r')
        fr = tb.open_file(ofilename, 'w')
        a = f.root.a
        b = f.root.b
        r1 = fr.create_carray(fr.root, 'r1', tb.Float32Atom(), shape,
                              filters=filters)
        # The expression
        e = tb.Expr(expr)
        e.set_output(r1)
        t0 = time()
        e.eval()
        if verbose:
            print("First ten values:", r1[0, :10])
        f.close()
        fr.close()
        print("[tables.Expr] Time for computing & save:", round(time() - t0, 3))
def create_features(file_names, file_path):
    columns = ['train_cat', 'train_dog']

    w = tables.open_file('../window.h5', 'w')
    atom1 = tables.Float32Atom()
    array_w = w.create_earray(w.root, 'data', atom1, (0, 20))

    l = tables.open_file('../label.h5', 'w')
    atom2 = tables.Float32Atom()
    array_l = l.create_earray(l.root, 'data', atom2, (0, 1))

    temp = []
    scaler = StandardScaler()
    for col in columns:
        count = 0
        for file in file_names[col]:
            if file == 0:
                continue
            filefp = file_path + file
            data, fs = soundfile.read(filefp)
            data = data.reshape(-1, 1)
            wn = np.random.randn(len(data)).reshape(-1, 1)
            data_wn = data + 0.0075 * wn
            scaler.fit(data)
            data = scaler.transform(data)
            data_wn = scaler.transform(data_wn)
            # Creating MFCC features for a 50 ms window (800 data points)
            # with 50% overlap (hop_length of 400)
            mfcc = librosa.feature.mfcc(y=data.reshape(data.shape[0], ),
                                        sr=fs, n_fft=800, hop_length=400)
            mfcc_wn = librosa.feature.mfcc(y=data_wn.reshape(data_wn.shape[0], ),
                                           sr=fs, n_fft=800, hop_length=400)
            # Finally, 20 features are extracted for each window
            mfcc = mfcc.reshape(-1, 20)
            mfcc_wn = mfcc_wn.reshape(-1, 20)
            # Creating labels
            if col == 'train_cat':
                x = 0
            else:
                x = 1
            label = np.array([[x] for i in range(mfcc.shape[0])]).reshape(-1, 1)
            label_wn = np.array([[x] for i in range(mfcc_wn.shape[0])]).reshape(-1, 1)
            array_w.append(mfcc)
            array_w.append(mfcc_wn)
            array_l.append(label)
            array_l.append(label_wn)
            count += mfcc.shape[0] + mfcc_wn.shape[0]
        temp.append(count)

    n_cat = temp[0]  # number of windows with class label cat
    n_dog = temp[1]  # number of windows with class label dog

    w.close()
    l.close()
    return n_cat, n_dog
def create_hdf5_file(self, output_filepath, data_group_labels=None, case_list=None):
    if data_group_labels is None:
        data_group_labels = self.data_groups.keys()

    hdf5_file = tables.open_file(output_filepath, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')

    for data_label, data_group in self.data_groups.items():

        num_cases = len(self.cases) * self.multiplier
        if num_cases == 0:
            raise FileNotFoundError('WARNING: No cases found. Cannot write to file.')

        if data_group.output_shape is None:
            output_shape = data_group.get_shape()
        else:
            output_shape = data_group.output_shape

        # Add batch dimension
        data_shape = tuple([0] + list(output_shape))

        data_group.data_storage = hdf5_file.create_earray(
            hdf5_file.root, data_label,
            tables.Float32Atom(),
            shape=data_shape,
            filters=filters,
            expectedrows=num_cases)

        # Naming convention is bad here, TODO, think about this.
        data_group.casename_storage = hdf5_file.create_earray(
            hdf5_file.root, '_'.join([data_label, 'casenames']),
            tables.StringAtom(256),
            shape=(0, 1),
            filters=filters,
            expectedrows=num_cases)

        data_group.affine_storage = hdf5_file.create_earray(
            hdf5_file.root, '_'.join([data_label, 'affines']),
            tables.Float32Atom(),
            shape=(0, 4, 4),
            filters=filters,
            expectedrows=num_cases)

    return hdf5_file
def create_data_file(out_file, n_channels, n_samples, image_shape,
                     storage_names=('data', 'truth', 'affine'),
                     affine_shape=(0, 4, 4), normalize=True,
                     affine_dtype=tables.Float32Atom()):
    hdf5_file = tables.open_file(out_file, mode='w')
    # complib='blosc' removed as suggested in https://github.com/ellisdg/3DUnetCNN/issues/58
    filters = tables.Filters(complevel=5)
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    if not normalize:
        data_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[0],
                                               tables.Int8Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    else:
        data_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[0],
                                               tables.Float32Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[1],
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[2],
                                             affine_dtype,
                                             shape=affine_shape,
                                             filters=filters,
                                             expectedrows=n_samples)
    if len(storage_names) == 4:
        # will hold the mean and std of this case for later normalization
        normalization_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[3],
                                                        tables.Float32Atom(),
                                                        shape=(0, 2),
                                                        filters=filters,
                                                        expectedrows=n_samples)
        return hdf5_file, data_storage, truth_storage, affine_storage, normalization_storage
    return hdf5_file, data_storage, truth_storage, affine_storage
def save(self, filename: str, mode: str = 'h5', **kwargs):
    """
    :param filename: path of the output file
    :param mode: output format, one of 'h5', 'xyz' or 'npy'
    :param kwargs: extra options, e.g. ``skip`` for the 'xyz' mode
    :return:
    """
    if mode == 'h5':
        compression = tables.Filters(complib='zlib', shuffle=True, complevel=1)
        h5handle = tables.open_file(filename, mode="w", title="Test file",
                                    filters=compression)
        h5handle.create_array(
            '/', 'topology',
            np.array(json.dumps(self.dye.dye_definition)).reshape(1),
            shape=(1, ))
        h5handle.create_earray(where='/', name='coordinates',
                               atom=tables.Float32Atom(),
                               shape=(0, self.dye.n_atoms, 3))
        h5handle.create_earray(where='/', name='time',
                               atom=tables.Float32Atom(),
                               shape=(0, ))
        h5handle.create_group(where='/', name='fluorescence')
        h5handle.create_earray(where='/fluorescence/', name='quencher_distance',
                               atom=tables.Float32Atom(),
                               shape=(0, ))
        # set units
        h5handle.root.time.set_attr('units', 'picoseconds')
        h5handle.root.coordinates.set_attr('units', 'angstroms')
        h5handle.root.fluorescence.quencher_distance.set_attr('units', 'angstroms')
        h5handle.root.coordinates.append(self.xyz)
        h5handle.root.time.append(self.time_axis)
        h5handle.close()
    elif mode == 'xyz':
        skip = kwargs.get('skip', 1)
        coordinates = self.xyz[::skip]
        n_frames = coordinates.shape[0]
        coordinates = coordinates.reshape(n_frames, 3)
        chisurf.fio.coordinates.write_xyz(filename, coordinates)
    elif mode == 'npy':
        np.save(filename, self.xyz)
def createHDF5File(self):
    out_file_path = os.path.join(self._output_path, self._output_file_name)
    try:
        hdf5_file = tables.open_file(out_file_path, mode='w')
        filters = tables.Filters(complevel=5, complib='blosc')
        data_shape = tuple([0, self.num_modalities] + list(self._image_shape))
        data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                               tables.Float32Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=self.num_modalities)
        if self.label_format == "nii":
            truth_shape = tuple([0, 1] + list(self._image_shape))
            truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                                    tables.UInt8Atom(),
                                                    shape=truth_shape,
                                                    filters=filters,
                                                    expectedrows=self.num_modalities)
        elif self.label_format == 'csv':
            truth_shape = tuple([0, self._image_shape[-1]])
            truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                                    tables.UInt32Atom(),
                                                    shape=truth_shape,
                                                    filters=filters,
                                                    expectedrows=self.num_modalities)
        else:
            raise ValueError("Failed to recognize label format: %s" % self.label_format)
        affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                                 tables.Float32Atom(),
                                                 shape=(0, 4, 4),
                                                 filters=filters,
                                                 expectedrows=self.num_modalities)
        return hdf5_file, data_storage, truth_storage, affine_storage
    except Exception as e:
        # If something goes wrong, delete the incomplete data file
        os.remove(out_file_path)
        raise e
def main():
    here = os.path.abspath(os.path.dirname(__file__))
    data_dir = os.path.abspath(os.path.join(here, '..', 'data'))
    file_path = os.path.join(data_dir, 'pytables-earray.h5')

    # One (and only one) of the shape dimensions *must* be 0. The dimension
    # being 0 means that the resulting EArray object can be extended along it.
    # Multiple enlargeable dimensions are not supported right now.
    shape = (0, 300)

    # An EArray contains homogeneous data. Every atomic object (i.e. every
    # single element) has the same type and shape.
    atom = tb.Float32Atom()

    # An EArray supports compression
    filters = tb.Filters(complevel=5, complib='zlib')

    with tb.open_file(file_path, 'w') as f:
        # create an empty EArray
        earray = f.create_earray(where='/', name='Array0', atom=atom,
                                 shape=shape, title='My EArray',
                                 filters=filters)

        # number of times that we need to write some data
        num = 100
        for i in range(num):
            rows = np.random.randint(low=10, high=100)
            cols = shape[1]

            # define some data
            sequence = np.random.random((rows, cols)).astype('float32')

            # append the data to the EArray
            earray.append(sequence=sequence)
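# A short read-back sketch for the EArray written by main() above; it is illustrative,
# not part of the original script, and assumes the same file produced by main().
def read_earray(file_path):
    with tb.open_file(file_path, 'r') as f:
        earray = f.root.Array0
        print('rows:', earray.nrows, 'chunkshape:', earray.chunkshape)
        # slicing reads only the requested rows from disk
        return earray[:5]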
def insert_embeddings_pytables(self):
    try:
        self.get_model()
        self.model.init_model()
        self.model.load()

        embeds_file = tables.open_file(
            os.path.join(cnt.DATA_FOLDER, cnt.SIAMESE_EMBEDDINGS_FILE), mode='w')
        atom = tables.Float32Atom()
        embeds_arr = embeds_file.create_earray(
            embeds_file.root, 'data', atom, (0, cnt.SIAMESE_EMBEDDING_SIZE))

        sent_tokens_file = tables.open_file(
            os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='r')
        sent_tokens = sent_tokens_file.root.data

        n, batch_size = len(sent_tokens), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n) / batch_size))

        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            tokens_arr_input = gutils.get_wv_siamese(self.wv_model,
                                                     sent_tokens[start:end, :])
            embeds = self.model.get_embeddings(tokens_arr_input)
            embeds_arr.append(embeds)
    finally:
        sent_tokens_file.close()
        embeds_file.close()
def add_group_hdf5(save_path, group, expected_shapes, where='/', names=None):
    """
    Adds a new group to an HDF5 archive, with the x train, y train, x test, and
    y test separated. expected_shapes should be a list of length 4, one shape
    matching each of these data subsets, unless you override the names parameter.

    Parameters
    ----------
    save_path <string> : The physical path to the archive file
    group <string> : The name for the group you are adding
    expected_shapes <list> : A list of expected shapes for the data that will be
        added here. It doesn't have to be exact
    where <string> : The root hierarchical path that you want to add the group to
    names <list> : A list of strings for the names of the subsets of the groups

    Returns
    -------
    hdf5_file, [data] : The HDF5 file that was opened and a list of arrays for
        each of the datasets that were added.
    """
    names = names if names else ["x_train", "y_train", "x_test", "y_test"]
    hdf5_file = tables.open_file(save_path, mode='a')
    h_comp = tables.Filters(complevel=5, complib='blosc')
    h_group = hdf5_file.create_group(where, group, group)
    h_data = []
    for k, shape in zip(names, expected_shapes):
        h_data.append(hdf5_file.create_earray(h_group, k,
                                              tables.Float32Atom(),
                                              shape=(0, shape[1]),
                                              filters=h_comp,
                                              expectedrows=shape[0]))
    return hdf5_file, h_data
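# A hypothetical usage sketch for add_group_hdf5 (the file name, group name, and array
# shapes below are assumptions, not from the original source): each expected shape
# mirrors one of x_train / y_train / x_test / y_test, and rows are appended in batches.
def example_add_group(save_path='archive.h5'):
    expected_shapes = [(1000, 128), (1000, 1), (200, 128), (200, 1)]
    hdf5_file, (x_train, y_train, x_test, y_test) = add_group_hdf5(
        save_path, 'run_01', expected_shapes)
    x_train.append(np.random.rand(10, 128).astype('float32'))
    y_train.append(np.random.rand(10, 1).astype('float32'))
    hdf5_file.close()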
def add_transformed(save_path, group, buffer=1000, where='/'):
    """
    Takes a group and normalizes it for training. This is done quietly and only
    transformed groups are used for training networks.

    Parameters
    ----------
    save_path <string> : The physical path to the archive file
    group <string> : The name of the group you are transforming
    buffer <int> : The number of data points to load into memory at a time
    where <string> : The root hierarchical path that contains the group
    """
    hdf5_file = tables.open_file(save_path, mode='a')
    parent = hdf5_file.get_node(where + group)
    data = (parent.x_train, parent.y_train, parent.x_test, parent.y_test)
    shapes = [d.shape for d in data]
    h_comp = tables.Filters(complevel=5, complib='blosc')
    h_group = hdf5_file.create_group(where + group, 'transformed',
                                     'Data scaled to Gaussian distribution')
    h_data = []
    for k, shape in zip(["x_train", "y_train", "x_test", "y_test"], shapes):
        h_data.append(hdf5_file.create_carray(h_group, k,
                                              tables.Float32Atom(),
                                              shape=shape,
                                              filters=h_comp))
    scale = tr.get_transform(data[0])
    for i, d in enumerate(data):
        for j in range(int(math.ceil(d.shape[0] / buffer))):
            if i % 2 == 1:
                # y arrays are copied as-is
                h_data[i][j * buffer:(j + 1) * buffer] = d[j * buffer:(j + 1) * buffer]
            else:
                # x arrays are scaled
                h_data[i][j * buffer:(j + 1) * buffer] = scale.transform(
                    d[j * buffer:(j + 1) * buffer])
    hdf5_file.flush()
    hdf5_file.close()
def open_h5_files() -> tuple:
    float_atom = tables.Float32Atom()
    int_atom = tables.Int32Atom()

    fd_m = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all.h5"), mode="w")
    data_m = fd_m.create_earray(fd_m.root, "data", float_atom,
                                (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                expectedrows=600000)
    label_m = fd_m.create_earray(fd_m.root, "labels", int_atom, (0, 1),
                                 expectedrows=600000)

    fd_t = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all.h5"), mode="w")
    data_t = fd_t.create_earray(
        fd_t.root, "data", float_atom,
        (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=600000)
    label_t = fd_t.create_earray(fd_t.root, "labels", int_atom, (0, 1),
                                 expectedrows=600000)

    fd_m_test = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all_test.h5"),
                                 mode="w")
    data_m_test = fd_m_test.create_earray(fd_m_test.root, "data", float_atom,
                                          (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                          expectedrows=60000)
    label_m_test = fd_m_test.create_earray(fd_m_test.root, "labels", int_atom, (0, 1),
                                           expectedrows=60000)

    fd_t_test = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all_test.h5"),
                                 mode="w")
    data_t_test = fd_t_test.create_earray(
        fd_t_test.root, "data", float_atom,
        (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=60000)
    label_t_test = fd_t_test.create_earray(fd_t_test.root, "labels", int_atom, (0, 1),
                                           expectedrows=60000)

    return (fd_m, data_m, label_m), \
           (fd_t, data_t, label_t), \
           (fd_m_test, data_m_test, label_m_test), \
           (fd_t_test, data_t_test, label_t_test)
def init_hdf5(path, shapes):
    """
    Initialize the hdf5 file to be used by the dataset
    """
    x_shape, y_shape = shapes

    # make pytables
    ensure_tables()
    h5file = tables.open_file(path, mode="w", title="SVHN Dataset")
    gcolumns = h5file.create_group(h5file.root, "Data", "Data")
    atom = (tables.Float32Atom() if config.floatX == 'float32'
            else tables.Float64Atom())
    filters = DenseDesignMatrixPyTables.filters
    h5file.create_carray(gcolumns, 'X', atom=atom, shape=x_shape,
                         title="Data values", filters=filters)
    h5file.create_carray(gcolumns, 'y', atom=atom, shape=y_shape,
                         title="Data targets", filters=filters)
    return h5file, gcolumns
def create_data_file(out_file, n_samples, image_shape, modality_names):
    # pdb.set_trace()
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    modality_shape = tuple([0, 1] + list(image_shape))
    brain_width_shape = (0, 2, 3)
    modality_storage_list = [
        hdf5_file.create_earray(hdf5_file.root, modality_name,
                                tables.Float32Atom(),
                                shape=modality_shape,
                                filters=filters,
                                expectedrows=n_samples)
        for modality_name in modality_names
    ]
    brain_width_storage = hdf5_file.create_earray(hdf5_file.root, 'brain_width',
                                                  tables.UInt8Atom(),
                                                  shape=brain_width_shape,
                                                  filters=filters,
                                                  expectedrows=n_samples)
    return hdf5_file, modality_storage_list, brain_width_storage
def init_hdf5(self, path, shapes):
    """
    .. todo::

        WRITEME properly

    Initialize the hdf5 file to be used by the dataset
    """
    x_shape, y_shape = shapes

    # make pytables
    ensure_tables()
    h5file = tables.open_file(path, mode="w", title="SVHN Dataset")
    gcolumns = h5file.create_group(h5file.root, "Data", "Data")
    atom = (tables.Float32Atom() if config.floatX == 'float32'
            else tables.Float64Atom())
    h5file.create_carray(gcolumns, 'X', atom=atom, shape=x_shape,
                         title="Data values", filters=self.filters)
    h5file.create_carray(gcolumns, 'y', atom=atom, shape=y_shape,
                         title="Data targets", filters=self.filters)
    return h5file, gcolumns
def create_VLFloatArray(self, name, array, group):
    """Stores a homogeneous variable length float array in a group"""
    vlarray = self.h5file.create_vlarray(group, name,
                                         tables.Float32Atom(),
                                         "ragged array of floats",
                                         chunkshape=512)
    # store the data as a single variable-length row
    vlarray.append(array)
    return vlarray
def create_image_data():
    try:
        img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='w')
        atom = tables.Float32Atom()
        img_arr = img_arr_file.create_earray(
            img_arr_file.root, 'data', atom,
            (0, cnt.IMAGE_SIZE, cnt.IMAGE_SIZE, 3))

        chunk_size, labels = 5000, []

        for df_chunk in pd.read_csv(cnt.OUTPUT_FILE_PATH, chunksize=chunk_size):
            df = df_chunk[list(df_chunk['image_path'].apply(lambda x: os.path.exists(x)))]
            print(df.shape)
            labels += list(df['age_group'])
            file_paths = list(df['image_path'])
            img_arr.append([
                img_to_array(
                    load_img(image).convert('RGB').resize(
                        (cnt.IMAGE_SIZE, cnt.IMAGE_SIZE))) / 255.0
                for image in file_paths
            ])

        shutils.save_data_pkl(labels, cnt.LABELS_PATH)
    finally:
        img_arr_file.close()
def interpolate(self, facetlistfile):
    """
    """
    #facetdbname = os.path.join(self.globaldb, 'facets')
    #os.system( 'makesourcedb in=%s out=%s append=False' % (facetlistfile, facetdbname) )
    #patch_table = pt.table( os.path.join(facetdbname, 'SOURCES', 'PATCHES' ) )
    #if 'facets' in self.hdf5.root: self.hdf5.root.facets.remove()
    #description = {'name': tables.StringCol(40), 'position':tables.Float64Col(2)}
    #self.facets = self.hdf5.createTable(self.hdf5.root, 'facets', description)
    #facet = self.facets.row
    #for patch in patch_table :
        #facet['name'] = patch['PATCHNAME']
        #facet['position'] = array([patch['RA'], patch['DEC']])
        #facet.append()
    #self.facets.flush()

    self.N_facets = len(self.facets)
    self.facet_names = self.facets[:]['name']
    self.facet_positions = self.facets[:]['position']
    print(self.n_list)
    if 'STEC_facets' in self.hdf5.root:
        self.hdf5.root.STEC_facets.remove()
    self.STEC_facets = self.hdf5.create_carray(
        self.hdf5.root, 'STEC_facets', tables.Float32Atom(),
        shape=(self.N_pol, self.n_list[:].shape[0], self.N_facets, self.N_stations))

    #if 'facet_piercepoints' in self.hdf5.root: self.hdf5.root.facet_piercepoints.remove()
    #description = {'positions':tables.Float64Col((self.N_facets, self.N_stations,2)), \
        #'positions_xyz':tables.Float64Col((self.N_facets, self.N_stations,3)), \
        #'zenith_angles':tables.Float64Col((self.N_facets, self.N_stations))}
    #self.facet_piercepoints = self.hdf5.createTable(self.hdf5.root, 'facet_piercepoints', description)
    #height = self.piercepoints.attrs.height
    #facet_piercepoints_row = self.facet_piercepoints.row
    #print "Calculating facet piercepoints..."
    #for n in self.n_list:
        #piercepoints = PiercePoints( self.times[ n ], self.pointing, self.array_center, self.facet_positions, self.station_positions, height = height )
        #facet_piercepoints_row['positions'] = piercepoints.positions
        #facet_piercepoints_row['positions_xyz'] = piercepoints.positions_xyz
        #facet_piercepoints_row['zenith_angles'] = piercepoints.zenith_angles
        #facet_piercepoints_row.append()
    #self.facet_piercepoints.flush()

    r_0 = self.TECfit_white.attrs.r_0
    beta = self.TECfit_white.attrs.beta

    for facet_idx in range(self.N_facets):
        for station_idx in range(self.N_stations):
            for pol_idx in range(self.N_pol):
                TEC_list = []
                for n in range(len(self.n_list)):
                    p = self.facet_piercepoints[n]['positions_xyz'][facet_idx, station_idx, :]
                    za = self.facet_piercepoints[n]['zenith_angles'][facet_idx, station_idx]
                    Xp_table = reshape(self.piercepoints[n]['positions_xyz'],
                                       (self.N_piercepoints, 3))
                    v = self.TECfit_white[pol_idx, n, :, :].reshape((self.N_piercepoints, 1))
                    D2 = sum((Xp_table - p) ** 2, 1)
                    C = (D2 / (r_0 ** 2)) ** (beta / 2.) / -2.
                    self.STEC_facets[pol_idx, n, facet_idx, station_idx] = dot(C, v) / cos(za)
def get_cnn_features(image_list, split, batch_size, relu=False):
    hdf5_path = "%s-%s" % (split, "cnn_features.hdf5")
    hdf5_file = tables.open_file(hdf5_path, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'feats',
                                           tables.Float32Atom(),
                                           shape=(0, 100352),
                                           filters=filters,
                                           expectedrows=len(image_list))
    for start, end in zip(
            range(0, len(image_list) + batch_size, batch_size),
            range(batch_size, len(image_list) + batch_size, batch_size)):
        print("Processing %s images %d-%d / %d"
              % (split, start, end, len(image_list)))
        batch_list = image_list[start:end]
        feats = cnn.get_features(batch_list, layers='conv5_4',
                                 layer_sizes=[512, 14, 14])
        # transpose and flatten feats to prepare for reshape(14*14, 512)
        feats = np.array([x.T.flatten() for x in feats])
        if relu:
            feats = np.clip(feats, a_min=0., a_max=np.inf, out=feats)  # ReLU
        data_storage.append(feats)
    print("Finished processing %d images" % len(data_storage))
    hdf5_file.close()
def create_hdf5_file(output_filepath, data_groups, data_collection):
    # Investigate hdf5 files.
    hdf5_file = tables.open_file(output_filepath, mode='w')

    # Investigate this line.
    # Compression level = complevel. No compression = 0
    # Compression library = method of compression.
    filters = tables.Filters(complevel=5, complib='blosc')

    data_storages = []

    for data_group_label in data_groups:
        data_group = data_collection.data_groups[data_group_label]
        num_cases, output_shape = data_group.get_augment_num_shape()
        modalities = data_group.get_modalities()

        # Input data has multiple 'channels', i.e. modalities.
        data_shape = tuple([0, modalities] + list(output_shape))
        print(data_group.label, data_shape)

        data_group.data_storage = hdf5_file.create_earray(
            hdf5_file.root, data_group.label,
            tables.Float32Atom(),
            shape=data_shape,
            filters=filters,
            expectedrows=num_cases)

    return hdf5_file
def insert_embeddings_pytables(self, batch_size=25000):
    try:
        self.load()

        embeds_file = tables.open_file('data/w2v_embeddings.h5', mode='w')
        atom = tables.Float32Atom()
        embeds_arr = embeds_file.create_earray(embeds_file.root, 'data', atom,
                                               (0, self.embedding_size))

        tokens_file = tables.open_file('data/sent_tokens.h5', mode='r')
        sent_tokens = tokens_file.root.data
        sent_tokens = [[w.decode('utf-8') for w in tokens] for tokens in sent_tokens]

        n = len(sent_tokens)
        num_batches = int(math.ceil(float(n) / batch_size))

        vocabulary = [word for word, index in self.tfidf_vectorizer.vocabulary_.items()]

        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            matrix = self.tfidf_vectorizer.transform(sent_tokens[start:end])
            embeds = self.get_weighted_sentence_vectors(matrix, vocabulary, end - start)
            embeds_arr.append(embeds)
    finally:
        tokens_file.close()
        embeds_file.close()
def create_hdf5_file(output_filepath, num_cases, output_sizes, preloaded=False):
    """ Creates a multi-tiered HDF5 file with one array per resolution provided in
        'output_sizes'. Also stores string filepaths associated with the data.
        Big credit to https://github.com/ellisdg/3DUnetCNN for bringing HDF5 into my life.
    """
    hdf5_file = tables.open_file(output_filepath, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')

    hdf5_file.create_earray(hdf5_file.root, 'imagenames',
                            tables.StringAtom(256),
                            shape=(0, 1),
                            filters=filters,
                            expectedrows=num_cases)

    for output_size in output_sizes:
        hdf5_file.create_earray(hdf5_file.root, 'data_' + str(output_size[0]),
                                tables.Float32Atom(),
                                shape=(0,) + output_size,
                                filters=filters,
                                expectedrows=num_cases)

    return hdf5_file
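# A minimal usage sketch for the multi-resolution writer above (the file name, sizes,
# and case name below are assumptions, not from the original source): one EArray is
# created per resolution, plus a string EArray of case names.
def example_fill_multires(output_filepath='multires.h5'):
    output_sizes = [(16, 16, 16), (32, 32, 32)]
    hdf5_file = create_hdf5_file(output_filepath, num_cases=1, output_sizes=output_sizes)
    hdf5_file.root.imagenames.append(np.array([['case_000']], dtype='S256'))
    for output_size in output_sizes:
        data_array = getattr(hdf5_file.root, 'data_' + str(output_size[0]))
        data_array.append(np.zeros((1,) + output_size, dtype='float32'))
    hdf5_file.close()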
def dump_test_set(self, h5filepath, nframes, framesize):
    # set rng to a hardcoded state, so we always have the same test set!
    self.numpy_rng.seed(1)
    with tables.open_file(h5filepath, 'w') as h5file:
        h5file.create_array(h5file.root, 'test_targets',
                            self.partitions['test']['targets'])
        vids = h5file.create_carray(h5file.root, 'test_images',
                                    tables.Float32Atom(),
                                    shape=(10000, nframes, framesize, framesize),
                                    filters=tables.Filters(complevel=5, complib='zlib'))
        pos = h5file.create_carray(h5file.root, 'test_pos',
                                   tables.UInt16Atom(),
                                   shape=(10000, nframes, 2),
                                   filters=tables.Filters(complevel=5, complib='zlib'))
        for i in range(100):
            print(i)
            (vids[i * 100:(i + 1) * 100],
             pos[i * 100:(i + 1) * 100],
             _) = self.get_batch('test', 100, nframes, framesize,
                                 idx=np.arange(i * 100, (i + 1) * 100))
        h5file.flush()
def resize(h5file, start, stop):
    ensure_tables()
    # TODO is there any smarter and more efficient way to do this?
    data = h5file.get_node('/', "Data")
    try:
        gcolumns = h5file.create_group('/', "Data_", "Data")
    except tables.exceptions.NodeError:
        h5file.remove_node('/', "Data_", 1)
        gcolumns = h5file.create_group('/', "Data_", "Data")

    start = 0 if start is None else start
    stop = data.X.nrows if stop is None else stop

    atom = (tables.Float32Atom() if config.floatX == 'float32'
            else tables.Float64Atom())
    filters = DenseDesignMatrixPyTables.filters
    x = h5file.create_carray(gcolumns, 'X', atom=atom,
                             shape=(stop - start, data.X.shape[1]),
                             title="Data values", filters=filters)
    y = h5file.create_carray(gcolumns, 'y', atom=atom,
                             shape=(stop - start, 10),
                             title="Data targets", filters=filters)
    x[:] = data.X[start:stop]
    y[:] = data.y[start:stop]

    h5file.remove_node('/', "Data", 1)
    h5file.rename_node('/', "Data", "Data_")
    h5file.flush()
    return h5file, gcolumns