def genAmatrixH(nk, cnk, p_in, p_out, K, filename):
    n = nk[K - 1] + cnk[K - 1]
    f = tb.open_file(filename, 'w')
    filters = tb.Filters(complevel=5, complib='blosc')
    out_indices = f.create_earray(f.root, 'indices', tb.Int32Atom(),
                                  shape=(0,), filters=filters)
    out_indptr = f.create_carray(f.root, 'indptr', tb.Int32Atom(),
                                 shape=(n + 1,), filters=filters)
    out_indptr[0] = 0
    for k in range(0, K):
        for i in range(0, nk[k]):
            con = rand(1, nk[k] - i - 1, density=p_in, format='csr')
            con.indices[:] = con.indices[:] + cnk[k] + i + 1
            out_indices.append(con.indices)
            out_indptr[i + cnk[k] + 1] = out_indptr[i + cnk[k]] + con.getnnz()
            for j in range(k + 1, K):
                con = rand(1, nk[j], density=p_out, format='csr')
                con.indices[:] = con.indices[:] + cnk[j]
                out_indices.append(con.indices)
                out_indptr[i + cnk[k] + 1] += con.getnnz()
    f.close()
def create_hdf5_file(fn, X_shape, y_shape, num_labels):
    h5file = tables.open_file(fn, mode="w", title="Dataset")
    filters = tables.Filters(complib='blosc', complevel=5)
    gcolumns = h5file.create_group(h5file.root, "Data", "Data")
    h5file.create_carray(gcolumns, 'X', atom=tables.Int32Atom(),
                         shape=X_shape, title="Data_X", filters=filters)
    h5file.create_carray(gcolumns, 'y', atom=tables.Int32Atom(),
                         shape=y_shape, title="Data_y", filters=filters)
    h5file.create_carray(gcolumns, 'num_labels', atom=tables.Int32Atom(),
                         shape=(1,), title="num_labels", filters=filters)
    node = h5file.get_node('/', 'Data')
    node.num_labels[0] = num_labels
    h5file.flush()
    return h5file
def savemat(X, filepath):
    X = ss.csc_matrix(X)
    with tb.open_file(filepath, 'w') as f:
        filters = tb.Filters(complevel=5, complib='blosc')
        out_data = f.create_earray(f.root, 'data', tb.Float32Atom(),
                                   shape=(0,), filters=filters)
        out_indices = f.create_earray(f.root, 'indices', tb.Int32Atom(),
                                      shape=(0,), filters=filters)
        out_indptr = f.create_earray(f.root, 'indptr', tb.Int32Atom(),
                                     shape=(0,), filters=filters)
        out_shape = f.create_earray(f.root, 'shape', tb.Int32Atom(),
                                    shape=(0,), filters=filters)
        out_data.append(X.data)
        out_indices.append(X.indices)
        out_indptr.append(X.indptr)
        out_shape.append(np.array([X.shape[0], X.shape[1]]))
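# Hypothetical counterpart to savemat above (not part of the original
# source): a minimal sketch of how the CSC matrix could be read back,
# assuming the data/indices/indptr/shape layout written by savemat.
import scipy.sparse as ss
import tables as tb

def loadmat(filepath):
    # Read the four arrays written by savemat and rebuild the CSC matrix.
    with tb.open_file(filepath, 'r') as f:
        data = f.root.data.read()
        indices = f.root.indices.read()
        indptr = f.root.indptr.read()
        shape = tuple(f.root.shape.read())
    return ss.csc_matrix((data, indices, indptr), shape=shape)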
def write_h5f_csr(h5f, h5fplace, name, atom, csr_mat):
    write_h5f_array(h5f, h5fplace, name + '_data', atom, csr_mat.data)
    write_h5f_array(h5f, h5fplace, name + '_indices', tb.Int32Atom(), csr_mat.indices)
    write_h5f_array(h5f, h5fplace, name + '_indptr', tb.Int32Atom(), csr_mat.indptr)
    write_h5f_array(h5f, h5fplace, name + '_shape', tb.Int32Atom(), np.array(csr_mat.shape))
def hist_writer(file, *,
                group_name:  'options: HIST, HIST2D',
                table_name:  'options: pmt, pmtMAU, sipm, sipmMAU',
                compression= 'ZLIB4',
                n_sensors:   'number of pmts or sipms',
                bin_centres: 'np.array of bin centres'):
    try:
        hist_group = getattr(file.root, group_name)
    except tb.NoSuchNodeError:
        hist_group = file.create_group(file.root, group_name)

    n_bins = len(bin_centres)

    hist_table = file.create_earray(hist_group,
                                    table_name,
                                    atom=tb.Int32Atom(),
                                    shape=(0, n_sensors, n_bins),
                                    filters=tbl.filters(compression))

    ## The bins can be written just once at definition of the writer
    file.create_array(hist_group, table_name + '_bins', bin_centres)

    def write_hist(histo: 'np.array of histograms, one for each sensor'):
        hist_table.append(histo.reshape(1, n_sensors, n_bins))

    return write_hist
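# Hypothetical usage sketch for hist_writer above (not in the original
# source). It assumes an open PyTables file and that the external helper
# `tbl.filters` maps the compression label to a tables.Filters instance;
# the sensor count and bin centres are made-up values.
import numpy as np
import tables as tb

h5out = tb.open_file("hists.h5", "w")
write_hist = hist_writer(h5out,
                         group_name='HIST',
                         table_name='pmt',
                         n_sensors=12,
                         bin_centres=np.arange(50))
write_hist(np.zeros((12, 50), dtype=np.int32))  # one histogram per sensor
h5out.close()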
def __init__(self, shape=None, name="skin_out", format="h5", folder="./../data/"):
    self.name = name
    self.format = format
    self.folder = folder
    self.shape = shape
    # name_base = ''.join([char for char in self.name if not char.isdigit()])
    # num = ''.join([char for char in self.name if char.isdigit()])
    # if len(num) == 0:
    #     num = 0
    # else:
    #     num = int(num)
    # self.filename = "{}{}.{}".format(self.folder, name_base + str(num), self.format)
    self.filename = "{}{}.{}".format(self.folder, self.name, self.format)
    num = 0
    while file_exists(self.filename):
        self.filename = "{}{}-({}).{}".format(self.folder, self.name, num, self.format)
        num += 1
    self.file = tables.open_file(self.filename, mode='w')
    self.file.create_earray(self.file.root, 'data', tables.Int32Atom(), self.shape)
def save_codenn_series(series: Iterable[str],
                       word2code: Dict[str, int],
                       file_path: str,
                       separator: str = '|') -> None:
    """Save series into file using CODEnn hdf5 format.

    Args:
        series (iterable of str): series of `sep` separated string.
        word2code (dict of str to int): word-to-code mapper.
        file_path (str): path to the output file.
        separator (str): separator to separate string into words.
    """
    with tables.open_file(file_path, mode='w') as h5f:
        table = h5f.create_table('/', 'indices', {
            'length': tables.UInt32Col(),
            'pos': tables.UInt32Col()
        }, 'a table of indices and lengths')
        array = h5f.create_earray('/', 'phrases', tables.Int32Atom(), (0,))
        array.flavor = 'numpy'
        pos = 0
        for item in series:
            item = item.split(separator)
            length = len(item)
            index = table.row
            index['length'] = length
            index['pos'] = pos
            index.append()
            array.append(convert_words_to_codes(item, word2code))
            pos += length
def create_VLIntArray(self, name, array, group):
    """Stores a homogenous variable length integer array in a group"""
    vlarray = self.h5file.create_vlarray(group, name, tables.Int32Atom(),
                                         "ragged array of ints",
                                         chunkshape=512)
    # Append each row of the ragged array so the data is actually stored,
    # as the docstring promises.
    for row in array:
        vlarray.append(row)
def open_h5_files() -> tuple:
    float_atom = tables.Float32Atom()
    int_atom = tables.Int32Atom()

    fd_m = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all.h5"), mode="w")
    data_m = fd_m.create_earray(fd_m.root, "data", float_atom,
                                (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                expectedrows=600000)
    label_m = fd_m.create_earray(fd_m.root, "labels", int_atom, (0, 1),
                                 expectedrows=600000)

    fd_t = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all.h5"), mode="w")
    data_t = fd_t.create_earray(fd_t.root, "data", float_atom,
                                (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
                                expectedrows=600000)
    label_t = fd_t.create_earray(fd_t.root, "labels", int_atom, (0, 1),
                                 expectedrows=600000)

    # Each test array is created through its own file handle; a node must be
    # created via the file that owns it.
    fd_m_test = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all_test.h5"), mode="w")
    data_m_test = fd_m_test.create_earray(fd_m_test.root, "data", float_atom,
                                          (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                          expectedrows=60000)
    label_m_test = fd_m_test.create_earray(fd_m_test.root, "labels", int_atom, (0, 1),
                                           expectedrows=60000)

    fd_t_test = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all_test.h5"), mode="w")
    data_t_test = fd_t_test.create_earray(fd_t_test.root, "data", float_atom,
                                          (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
                                          expectedrows=60000)
    label_t_test = fd_t_test.create_earray(fd_t_test.root, "labels", int_atom, (0, 1),
                                           expectedrows=60000)

    return (fd_m, data_m, label_m), \
           (fd_t, data_t, label_t), \
           (fd_m_test, data_m_test, label_m_test), \
           (fd_t_test, data_t_test, label_t_test)
def _make_int_vlarray(h5file: tables.File, name: str,
                      attribute: np.ndarray) -> None:
    vlarray = h5file.create_vlarray(h5file.root, name=name,
                                    atom=tables.Int32Atom(shape=()))
    for a in attribute:
        vlarray.append(a)
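# Hypothetical read-side counterpart (not in the original source): a sketch
# of how the ragged rows written by _make_int_vlarray could be read back.
import tables

def _read_int_vlarray(h5file: tables.File, name: str) -> list:
    # VLArray.read() returns one numpy array per appended row.
    return h5file.get_node(h5file.root, name).read()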
def hdf5_save(matrix, filename, dtype=np.dtype(np.float64)):
    '''
    Helper function for storing scipy matrices as PyTables HDF5 matrices
    see http://www.philippsinger.info/?p=464 for further information

    :param matrix: matrix to store
    :param filename: filename for storage
    :param dtype: dtype
    :return: True
    '''
    #print matrix.shape
    atom = tb.Atom.from_dtype(dtype)
    f = tb.open_file(filename, 'w')
    #print "saving data"
    filters = tb.Filters(complevel=5, complib='blosc')
    out = f.create_carray(f.root, 'data', atom, shape=matrix.data.shape,
                          filters=filters)
    out[:] = matrix.data
    #print "saving indices"
    out = f.create_carray(f.root, 'indices', tb.Int32Atom(),
                          shape=matrix.indices.shape, filters=filters)
    out[:] = matrix.indices
    #print "saving indptr"
    out = f.create_carray(f.root, 'indptr', tb.Int32Atom(),
                          shape=matrix.indptr.shape, filters=filters)
    out[:] = matrix.indptr
    #print "saving done"
    f.close()
    return
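# Hypothetical loader for hdf5_save above (not in the original source). Note
# that hdf5_save writes data/indices/indptr but not the matrix shape, so this
# sketch takes the shape as an argument; whether the matrix was CSR or CSC is
# likewise the caller's responsibility.
import scipy.sparse as ss
import tables as tb

def hdf5_load(filename, shape, matrix_class=ss.csr_matrix):
    with tb.open_file(filename, 'r') as f:
        return matrix_class((f.root.data.read(),
                             f.root.indices.read(),
                             f.root.indptr.read()),
                            shape=shape)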
def write_categorical(source: CategoricalArraySource,
                      hfile: tables.File,
                      n_workers: int,
                      batchrows: Optional[int] = None,
                      maps: Optional[np.ndarray] = None) -> None:
    transform = CategoryMapper(maps, source.missing) if maps else IdWorker()
    n_workers = n_workers if maps else 0
    _write_source(source, hfile, tables.Int32Atom(source.shape[-1]),
                  "categorical_data", transform, n_workers, batchrows)
def __openNextFile(self):
    self.fileName = self.__getNextFileName()
    self.file = tables.open_file(self.fileName, mode='w')
    self.arrays = {}
    self.arrays['I'] = self.file.create_vlarray(self.file.root, 'I',
                                                tables.Int32Atom(shape=()),
                                                'I', filters=tables.Filters(1))
    self.arrays['Q'] = self.file.create_vlarray(self.file.root, 'Q',
                                                tables.Int32Atom(shape=()),
                                                'Q', filters=tables.Filters(1))
    self.fileOpened = True
    self.nWrittenToFile = 0
def create_dataset(filename: str, coefficients: int):
    print("called create_dataset")
    print(filename)
    N = 256
    with tables.open_file(filename, "w") as hdf5file:
        # create array for the object
        hdf5file.create_earray(hdf5file.root, "object_real",
                               tables.Float32Atom(), shape=(0, N * N))
        # create array for the object phase
        hdf5file.create_earray(hdf5file.root, "object_imag",
                               tables.Float32Atom(), shape=(0, N * N))
        # create array for the image
        hdf5file.create_earray(hdf5file.root, "diffraction_noise",
                               tables.Float32Atom(), shape=(0, N * N))
        # create array for the image
        hdf5file.create_earray(hdf5file.root, "diffraction_noisefree",
                               tables.Float32Atom(), shape=(0, N * N))
        # scale
        hdf5file.create_earray(hdf5file.root, "scale",
                               tables.Float32Atom(), shape=(0, 1))
        # zernike coefficients
        hdf5file.create_earray(hdf5file.root, "coefficients",
                               tables.Float32Atom(), shape=(0, coefficients))
        hdf5file.create_earray(hdf5file.root, "N",
                               tables.Int32Atom(), shape=(0, 1))
        # the context manager closes the file on exit

    with tables.open_file(filename, mode='a') as hd5file:
        # save the dimensions of the data
        hd5file.root.N.append(np.array([[N]]))
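# Hypothetical usage sketch for create_dataset above (not in the original
# source): each earray grows along its first axis, so a sample is appended
# with a leading dimension of 1. "train.h5" is an assumed file name.
import numpy as np
import tables

create_dataset("train.h5", coefficients=10)
N = 256
with tables.open_file("train.h5", mode='a') as f:
    f.root.object_real.append(np.random.rand(1, N * N).astype(np.float32))
    f.root.object_imag.append(np.zeros((1, N * N), dtype=np.float32))
    f.root.scale.append(np.array([[1.0]], dtype=np.float32))
    f.root.coefficients.append(np.zeros((1, 10), dtype=np.float32))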
def assign_array(db, name, a, verbose=1):
    if a.dtype == dtype('int32'):
        atom = tables.Int32Atom()
    elif a.dtype == dtype('int64'):
        atom = tables.Int64Atom()
    elif a.dtype == dtype('f') or a.dtype == dtype('d'):
        atom = tables.Float32Atom()
    else:
        raise Exception('unknown array type: %s' % a.dtype)
    if verbose:
        print("[writing", name, a.shape, atom, "]")
    node = db.create_earray(db.root, name, atom,
                            shape=[0] + list(a.shape[1:]),
                            filters=tables.Filters(9))
    node.append(a)
def _create_table_list(self, name, example):
    """
    Create a new table within the HDF file, where the tables shape and its
    datatype are determined by *example*. The modified version for creating
    table with appendList
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == list and type(example[0]) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)

    if type(example) == np.ndarray:
        h5dim = (0,) + example.shape[1:]
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_earray(h5.root, name, h5type, h5dim,
                                             filters=filters)
    elif type(example) == list and type(example[0]) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_vlarray(h5.root, name, h5type,
                                              filters=filters)
    self.types[name] = type(example)
def _create_table(self, name, example):
    """
    Create a new table within the HDF file, where the tables shape and its
    datatype are determined by *example*.
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError(
            "Could not create table %s because of unknown dtype '%s'"
            % (name, example.dtype))  #+ ", of name: " % example.shape)

    if type(example) == np.ndarray:
        h5dim = (0,) + example.shape
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_earray(h5.root, name, h5type, h5dim,
                                             filters=filters)
    elif type(example) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_vlarray(h5.root, name, h5type,
                                              filters=filters)
    self.types[name] = type(example)
def create_tables(from_gdb, tab_name):
    track = from_gdb.create_track(tab_name)
    chrom_list = from_gdb.get_all_chromosomes()
    atom = tables.Int32Atom(dflt=POS_UNDEF)
    for chrom in chrom_list:
        sys.stderr.write("  %s\n" % chrom.name)
        shape = (chrom.length, 3)
        carray = track.h5f.create_carray(track.h5f.root, chrom.name,
                                         atom, shape, filters=ZLIB_FILTER)
        #carray[:,:] = POS_UNDEF
    return track
def save(self, filepath):
    import tables
    fitNPBS = scipy.array([self.NPoints, self.NBind, self.NFix, self.NSmooth])
    atom1 = tables.Float64Atom()
    atom2 = tables.Int32Atom()
    filters = tables.Filters(complevel=5, complib='zlib')
    h5f = tables.open_file(filepath, 'w')
    h5NWMap = h5f.create_carray(h5f.root, 'NWMap', atom2, self.NWMap.shape,
                                filters=filters)
    h5NWMap[:, :] = self.NWMap
    h5Nodes = h5f.create_carray(h5f.root, 'Nodes', atom2, self.Nodes.shape,
                                filters=filters)
    h5Nodes[:] = self.Nodes
    h5Weights = h5f.create_carray(h5f.root, 'Weights', atom1,
                                  self.Weights.shape, filters=filters)
    h5Weights[:] = self.Weights
    invA = h5f.create_carray(h5f.root, 'invA', atom1, self.svd_invA.shape,
                             filters=filters)
    invA[:, :] = self.svd_invA
    b = h5f.create_carray(h5f.root, 'b', atom1, self.b.shape, filters=filters)
    b[:] = self.b
    fixed = h5f.create_carray(h5f.root, 'fixed', atom1, self.fixed.shape,
                              filters=filters)
    fixed[:] = self.fixed
    npbs = h5f.create_carray(h5f.root, 'npbs', atom2, fitNPBS.shape)
    npbs[:] = fitNPBS
    h5f.close()
def _create_table(self, name, example, parent=None):
    """
    Create a new table within the HDF file, where the tables shape and its
    datatype are determined by *example*.
    """
    h5 = self.h5
    filters = tables.Filters(complevel=self.compression_level,
                             complib='zlib', shuffle=True)

    if parent is None:
        parent = h5.root

    if type(example) == str:
        h5type = tables.VLStringAtom()
        h5.create_vlarray(parent, name, h5type, filters=filters)
        return
    if type(example) == dict:
        self.h5.create_group(parent, name)
        return

    # If we get here then we're dealing with numpy arrays
    example = np.asarray(example)

    # MODIFICATION: appended name everywhere and introduced string
    type_map = {
        np.dtype(np.float64).name: tables.Float64Atom(),
        np.dtype(np.float32).name: tables.Float32Atom(),
        np.dtype(int).name: tables.Int64Atom(),
        np.dtype(np.int8).name: tables.Int8Atom(),
        np.dtype(np.uint8).name: tables.UInt8Atom(),
        np.dtype(np.int16).name: tables.Int16Atom(),
        np.dtype(np.uint16).name: tables.UInt16Atom(),
        np.dtype(np.int32).name: tables.Int32Atom(),
        np.dtype(np.uint32).name: tables.UInt32Atom(),
        np.dtype(bool).name: tables.BoolAtom(),
        # Maximal string length of 128 per string - change if needed
        'string32': tables.StringAtom(128)
    }

    try:
        h5type = type_map[example.dtype.name]
        h5dim = (0,) + example.shape
        h5.create_earray(parent, name, h5type, h5dim, filters=filters)
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)
def __init__(self, args):
    super().__init__(args)
    filename = args['mem_location']
    try:
        self._kill_any_open_file()
        load_into_mem = args['load_h5_into_mem'] if 'load_h5_into_mem' in args else False
        if load_into_mem:
            print('Loading h5 file into memory...')
            self.h5file = tb.open_file(filename, mode='a', driver="H5FD_CORE")
            print('Done!')
        else:
            self.h5file = tb.open_file(filename, mode='a')
        self.frame = self.h5file.get_node("/", "frame")
        self.measurements = self.h5file.get_node("/", "measurements")
        self.a_history = self.h5file.get_node("/", "a_history")
        self.aidx = self.h5file.get_node("/", "aidx")
        self.a_taken_prob = self.h5file.get_node("/", "a_taken_prob")
        self.state_value = self.h5file.get_node("/", "state_value")
        self.gae = self.h5file.get_node("/", "gae")
    except:
        print_exc()
        try:
            self.h5file.close()
        except:
            pass
        build_file = input('Unable to load H5 File. Would you like to build a new one? '
                           'This will overwrite any existing file. (y/n): ')
        if build_file == 'y':
            self.h5file = tb.open_file(filename, mode='w', title="Doom Replay Data")
            root = self.h5file.root
            self.frame = self.h5file.create_vlarray(root, 'frame', tb.Float32Atom())
            self.measurements = self.h5file.create_vlarray(root, 'measurements', tb.Float32Atom())
            self.a_history = self.h5file.create_vlarray(root, 'a_history', tb.Float32Atom())
            self.aidx = self.h5file.create_vlarray(root, 'aidx', tb.Int32Atom())
            self.a_taken_prob = self.h5file.create_vlarray(root, 'a_taken_prob', tb.Float32Atom())
            self.state_value = self.h5file.create_vlarray(root, 'state_value', tb.Float32Atom())
            self.gae = self.h5file.create_vlarray(root, 'gae', tb.Float32Atom())
        else:
            raise ValueError("No H5 file loaded. Please load valid h5 file or create a new one")
def fetch_svhn_extra(source_paths, target_path):
    extra_path = source_paths[0]
    print('Converting {} to HDF5 (compressed)...'.format(extra_path))
    f_out = tables.open_file(target_path, mode='w')
    g_out = f_out.create_group(f_out.root, 'svhn', 'SVHN data')
    filters = tables.Filters(complevel=9, complib='blosc')
    X_u8_arr = f_out.create_earray(g_out, 'extra_X_u8', tables.UInt8Atom(),
                                   (0, 3, 32, 32), filters=filters)
    y_arr = f_out.create_earray(g_out, 'extra_y', tables.Int32Atom(), (0,),
                                filters=filters)
    # Load in the extra data Matlab file
    _insert_svhn_matlab_to_h5(X_u8_arr, y_arr, extra_path)
    f_out.close()
    return target_path
def create_sent_tokens_array():
    tokens_file, sents_arr_file = None, None
    try:
        tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        vocab = set()
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n) / batch_size))
        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            batch_items = [items[x] for x in range(start, end)]
            tokens = [gutils.padd_fn(gutils.get_tokens(gutils.get_item_text(item)))
                      for item in batch_items]
            tokens_arr.append(tokens)
            vocab.update([x for token in tokens for x in token])

        vocab = sorted(list(vocab))
        word2idx_map = {w: i + 1 for i, w in enumerate(vocab)}
        gutils.save_data_pkl(word2idx_map, cnt.WORD2IDX_FILE)

        sent_tokens = tokens_file.root.data
        sents_arr_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='w')
        atom = tables.Int32Atom()
        sents_arr = sents_arr_file.create_earray(sents_arr_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n) / batch_size))
        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            tokens = [sent_tokens[x] for x in range(start, end)]
            sent_arrs = [[gutils.word_to_idx(w, word2idx_map) for w in token]
                         for token in tokens]
            sents_arr.append(sent_arrs)
    finally:
        # close each file only if it was successfully opened
        if tokens_file is not None:
            tokens_file.close()
        if sents_arr_file is not None:
            sents_arr_file.close()
def convert_corpus(corpusfile, outfile):
    """Loads the given [corpusfile] (which should be of a type loadable by
    Corpus) as a list of lists of sentence arrays (the output of
    Corpus.build_sentence_document_arrays) and saves it as an HDF5 file at
    [outfile].
    """
    c = Corpus(corpusfile)  #, ndocs=10000)
    print("Building document arrays..")
    docarrays, mvocab = c.build_sentence_document_arrays(vocab)
    print("Opening output file..")
    tf = tables.open_file(outfile, mode="w", title="converted_corpus")
    print("Saving document arrays..")
    darr = tf.create_vlarray(tf.root, "docarrays", tables.Int32Atom(shape=()))
    for da in docarrays:
        darr.append(da)
    print("Saving vocabulary..")
    varr = tf.create_vlarray(tf.root, "vocab",
                             tables.StringAtom(max(map(len, mvocab))))
    for w in mvocab:
        varr.append(w)
    print("Closing output file..")
    tf.close()
root = r'/mnt/DATA/Prob_IR/'
context_dataset_name = r'context_data'
encoded_docs_filename = r'encoded_docs_model'
word_index_filename = r'word_index'
emb_filename = r'embeddings_dim_' + str(opt['dim']) + '_margin_' + str(opt['margin'])
emb_path = os.path.join(root, emb_filename)
context_dataset_path = os.path.join(root, context_dataset_name)

print("Loading data...")
_, words = load_dataset(root, encoded_docs_filename, word_index_filename)
idx_words = np.array(list(range(len(words))))

atom = tables.Int32Atom()
with tables.open_file(context_dataset_path, mode='r') as f:
    train_context = torch.Tensor(np.array(f.root.data[:]))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dim = 20
w = torch.zeros(len(words), dim + dim**2, device=device)
init.xavier_uniform_(w)
w.requires_grad = True

ntot = opt['num_positive'] + 1
opt['tot_batch_size'] = ntot * opt['batch_size']
dataset = TensorDataset(train_context)
loader = DataLoader(dataset=dataset,
def preprocess(csv_files, batch_size, numcep, numcontext, alphabet,
               hdf5_cache_path=None):
    COLUMNS = ('features', 'features_len', 'transcript', 'transcript_len')

    print('Preprocessing', csv_files)

    if hdf5_cache_path and os.path.exists(hdf5_cache_path):
        with tables.open_file(hdf5_cache_path, 'r') as file:
            features = file.root.features[:]
            features_len = file.root.features_len[:]
            transcript = file.root.transcript[:]
            transcript_len = file.root.transcript_len[:]

            # features are stored flattened, so reshape into [n_steps, numcep]
            for i in range(len(features)):
                features[i].shape = [features_len[i] + 2 * numcontext, numcep]

            in_data = list(zip(features, features_len, transcript, transcript_len))
            print('Loaded from cache at', hdf5_cache_path)
            return pandas.DataFrame(data=in_data, columns=COLUMNS)

    source_data = None
    for csv in csv_files:
        file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
        #FIXME: not cross-platform
        csv_dir = os.path.dirname(os.path.abspath(csv))
        file['wav_filename'] = file['wav_filename'].str.replace(
            r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1)))
        if source_data is None:
            source_data = file
        else:
            source_data = source_data.append(file)

    step_fn = partial(process_single_file,
                      numcep=numcep,
                      numcontext=numcontext,
                      alphabet=alphabet)
    out_data = pmap(step_fn, source_data.iterrows())

    if hdf5_cache_path:
        print('Saving to', hdf5_cache_path)

        # list of tuples -> tuple of lists
        features, features_len, transcript, transcript_len = zip(*out_data)

        with tables.open_file(hdf5_cache_path, 'w') as file:
            features_dset = file.create_vlarray(file.root,
                                                'features',
                                                tables.Float32Atom(),
                                                filters=tables.Filters(complevel=1))
            # VLArray atoms need to be 1D, so flatten feature array
            for f in features:
                features_dset.append(np.reshape(f, -1))

            features_len_dset = file.create_array(file.root, 'features_len', features_len)

            transcript_dset = file.create_vlarray(file.root,
                                                  'transcript',
                                                  tables.Int32Atom(),
                                                  filters=tables.Filters(complevel=1))
            for t in transcript:
                transcript_dset.append(t)

            transcript_len_dset = file.create_array(file.root, 'transcript_len', transcript_len)

    print('Preprocessing done')
    return pandas.DataFrame(data=out_data, columns=COLUMNS)
"""Small example that shows how to work with variable length arrays of different types, UNICODE strings and general Python objects included.""" from __future__ import print_function import numpy as np import tables import pickle # Open a new empty HDF5 file fileh = tables.open_file("vlarray2.h5", mode="w") # Get the root group root = fileh.root # A test with VL length arrays: vlarray = fileh.create_vlarray(root, 'vlarray1', tables.Int32Atom(), "ragged array of ints") vlarray.append(np.array([5, 6])) vlarray.append(np.array([5, 6, 7])) vlarray.append([5, 6, 9, 8]) # Test with lists of bidimensional vectors vlarray = fileh.create_vlarray(root, 'vlarray2', tables.Int64Atom(shape=(2,)), "Ragged array of vectors") a = np.array([[1, 2], [1, 2]], dtype=np.int64) vlarray.append(a) vlarray.append(np.array([[1, 2], [3, 4]], dtype=np.int64)) vlarray.append(np.zeros(dtype=np.int64, shape=(0, 2))) vlarray.append(np.array([[5, 6]], dtype=np.int64)) # This makes an error (shape) # vlarray.append(array([[5], [6]], dtype=int64))
# t = f.create_table(f.root, 'table', recarray, "mdim recarray")
# a0 = f.create_array(f.root, 'field0', recarray['f0'], "mdim int32 array")
# a1 = f.create_array(f.root, 'field1', recarray['f1'], "mdim float64 array")
# c0 = f.create_carray(f.root, 'cfield0',
#                      tables.Int32Atom(), (2, 2, 2),
#                      "mdim int32 carray")
# c1 = f.create_carray(f.root, 'cfield1',
#                      tables.Float64Atom(), (2, 3, 3),
#                      "mdim float64 carray")

f1 = tables.open_file("chunkshape1.h5", mode="w")
c1 = f1.create_carray(f1.root, 'cfield1',
                      tables.Int32Atom(), (L, N, M),
                      "scalar int32 carray", tables.Filters(complevel=0))
t1 = time()
c1[:] = numpy.empty(shape=(L, 1, 1), dtype="int32")
print("carray1 populate time:", time() - t1)
f1.close()

f2 = tables.open_file("chunkshape2.h5", mode="w")
c2 = f2.create_carray(f2.root, 'cfield2',
                      tables.Int32Atom(), (L, M, N),
                      "scalar int32 carray", tables.Filters(complevel))
t1 = time()
c2[:] = numpy.empty(shape=(L, 1, 1), dtype="int32")
def _create_table_list(self, name, example):
    """
    Create a new table within the HDF file, where the tables shape and its
    datatype are determined by *example*. The modified version for creating
    table with appendList
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == list and type(example[0]) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)

    if type(example) == np.ndarray:
        h5dim = (0,) + example.shape[1:]
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        nodes = h5.list_nodes(h5.root)
        # Translate a dotted name like 'a.b.c' into the group path /a/b,
        # creating intermediate groups as needed.
        nmpt = name.replace('.', '/\n')
        nmpt = nmpt.split('\n')
        path = '/'
        for kay in range(len(nmpt) - 1):
            #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
            try:
                h5.is_visible_node(path + nmpt[kay][:-1])
            except:
                h5.create_group(path, nmpt[kay][:-1])
            path += nmpt[kay]
        self.tables[name] = h5.create_earray(path, nmpt[-1], h5type, h5dim,
                                             filters=filters)
    elif type(example) == list and type(example[0]) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        nodes = h5.list_nodes(h5.root)
        nmpt = name.replace('.', '/\n')
        nmpt = nmpt.split('\n')
        path = '/'
        for kay in range(len(nmpt) - 1):
            #if not path+nmpt[kay][:-1] in str(nodes): h5.create_group(path,nmpt[kay][:-1])
            try:
                h5.is_visible_node(path + nmpt[kay][:-1])
            except:
                h5.create_group(path, nmpt[kay][:-1])
            path += nmpt[kay]
        self.tables[name] = h5.create_vlarray(path, nmpt[-1], h5type,
                                              filters=filters)
    self.types[name] = type(example)
def _hdf5(self, alphabet_path, hdf5_path, ninput=26, ncontext=9):
    skipped = []
    str_to_label = {}
    alphabet_size = 0
    with codecs.open(alphabet_path, 'r', 'utf-8') as fin:
        for line in fin:
            if line[0:2] == '\\#':
                line = '#\n'
            elif line[0] == '#':
                continue
            str_to_label[line[:-1]] = alphabet_size
            alphabet_size += 1

    def process_sample(sample):
        if len(sample.transcript) == 0:
            skipped.append(sample.original_name)
            return None
        sample.write()
        try:
            samplerate, audio = wav.read(sample.file.filename)
        except:
            skipped.append(sample.original_name)
            return None
        features = mfcc(audio, samplerate=samplerate, numcep=ninput)[::2]
        empty_context = np.zeros((ncontext, ninput), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))
        transcript = np.asarray([str_to_label[c] for c in sample.transcript])
        if (2 * ncontext + len(features)) < len(transcript):
            skipped.append(sample.original_name)
            return None
        return features, len(features), transcript, len(transcript)

    out_data = self._map('Computing MFCC features...', self.samples, process_sample)
    out_data = [s for s in out_data if s is not None]

    if len(skipped) > 0:
        log('WARNING - Skipped %d samples that had no transcription, had been '
            'too short for their transcription or had been missed:' % len(skipped))
        for s in skipped:
            log(' - Sample origin: "%s".' % s)
    if len(out_data) <= 0:
        log('No samples written to feature DB "%s".' % hdf5_path)
        return

    # list of tuples -> tuple of lists
    features, features_len, transcript, transcript_len = zip(*out_data)

    log('Writing feature DB...')
    with tables.open_file(hdf5_path, 'w') as file:
        features_dset = file.create_vlarray(file.root,
                                            'features',
                                            tables.Float32Atom(),
                                            filters=tables.Filters(complevel=1))
        # VLArray atoms need to be 1D, so flatten feature array
        for f in features:
            features_dset.append(np.reshape(f, -1))
        features_len_dset = file.create_array(file.root, 'features_len', features_len)
        transcript_dset = file.create_vlarray(file.root,
                                              'transcript',
                                              tables.Int32Atom(),
                                              filters=tables.Filters(complevel=1))
        for t in transcript:
            transcript_dset.append(t)
        transcript_len_dset = file.create_array(file.root, 'transcript_len', transcript_len)

    log('Wrote features of %d samples to feature DB "%s".' % (len(features), hdf5_path))