def to_hdf5(wrapper, save_as=None):
    if save_as is None:
        save_as = wrapper.dataset_name.replace(' ', '_') + '.h5'
    f = None
    print wrapper.dataset_name
    try:
        f = tables.openFile(save_as, mode='w')
        train_group = f.createGroup('/', 'train', 'train set')
        test_group = f.createGroup('/', 'test', 'test set')
        img_groups = (train_group, None)
        dsets = [range(len(wrapper)), None]
        if wrapper.get_standard_train_test_splits() is not None:
            dsets[0], dsets[1] = wrapper.get_standard_train_test_splits()
            img_groups = (img_groups[0], test_group)
        for some_idx, dset in enumerate(dsets):
            if dset is None:
                break
            img_group = img_groups[some_idx]
            img_table = f.createCArray(img_group, 'img',
                                       tables.StringAtom(itemsize=1),
                                       shape=(len(dset), 96 * 96 * 3))
            label_table = f.createCArray(img_group, 'label',
                                         tables.Float64Atom(),
                                         shape=(len(dset),
                                                len(keypoints_names), 2))
            # i is the row within this split; idx is the dataset-level index.
            for i, idx in enumerate(dset):
                img, label = crop_face(
                    PIL.Image.open(wrapper.get_original_image_path(idx)),
                    wrapper.get_bbox(idx),
                    wrapper.get_eyes_location(idx),
                    wrapper.get_keypoints_location(idx))
                # NOTE: stored as a C string, so it is mandatory to store as
                # strings of size 1 (a \0 would otherwise truncate the string).
                img_table[i, :] = [x for x in img.tostring()]
                for name in keypoints_names:
                    if name in label:
                        point = label[name]
                        label_table[i, keypoints_names.index(name), :] = [
                            point[0], point[1]
                        ]
                    else:
                        label_table[i, keypoints_names.index(name), :] = [-1, -1]
    finally:
        if f is not None:
            f.close()
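# A minimal read-back sketch for the layout above (an illustration, not part
# of the wrapper API; assumes the same 96x96 RGB layout). Each row holds
# 96*96*3 length-1 strings, so joining the cells restores the raw byte
# string that img.tostring() produced.
def row_to_image(img_table, i):
    raw = ''.join(img_table[i])  # length-1 cells -> original byte buffer
    return PIL.Image.fromstring('RGB', (96, 96), raw)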
    eggind = np.random.randint(0, num, int(num * (1.0 - rate)))
    emails[eggind] = 'eggs'
    return emails


# 1. Create an HDF5 file which contains a single emails array, uncompressed.
#    Close this file and use the getsize() function to report the size on
#    disk. Time how long this took and print that out as well.
NUM = 10000000
RATE = 0.75
emails = email_array(NUM, RATE)

t = time()
with tb.openFile('uncompressed.h5', 'w') as f:
    earray = f.createEArray('/', 'emails', tb.StringAtom(4), (0,),
                            expectedrows=NUM)
    earray.append(emails)
tdelta = time() - t

msg = "The uncompressed array is {0} bytes and took {1} ms to write."
print msg.format(getsize('uncompressed.h5'), tdelta * 1000)

# 2. Repeat step 1 but with zlib compression at level 1.
filters = tb.Filters(complib='zlib', complevel=1)
t = time()
with tb.openFile('zlib1.h5', 'w') as f:
    earray = f.createEArray('/', 'emails', tb.StringAtom(4), (0,),
                            expectedrows=NUM, filters=filters)
    earray.append(emails)
def create_all_arrays(h5, expectedrows=1000):
    """
    Utility function used by both create_song_file and create_aggregate_files;
    creates all the EArrays (empty).
    INPUT
       h5 - HDF5 file, open with write or append permissions;
            the metadata and analysis groups must already exist!
    """
    # group metadata arrays
    group = h5.root.metadata
    h5.createEArray(where=group, name='similar_artists',
                    atom=tables.StringAtom(20, shape=()), shape=(0,),
                    title=ARRAY_DESC_SIMILAR_ARTISTS)
    h5.createEArray(group, 'artist_terms', tables.StringAtom(256, shape=()),
                    (0,), ARRAY_DESC_ARTIST_TERMS,
                    expectedrows=expectedrows * 40)
    h5.createEArray(group, 'artist_terms_freq', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_ARTIST_TERMS_FREQ,
                    expectedrows=expectedrows * 40)
    h5.createEArray(group, 'artist_terms_weight',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_ARTIST_TERMS_WEIGHT,
                    expectedrows=expectedrows * 40)
    # group analysis arrays
    group = h5.root.analysis
    h5.createEArray(where=group, name='segments_start',
                    atom=tables.Float64Atom(shape=()), shape=(0,),
                    title=ARRAY_DESC_SEGMENTS_START)
    h5.createEArray(group, 'segments_confidence',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_SEGMENTS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_pitches', tables.Float64Atom(shape=()),
                    (0, 12), ARRAY_DESC_SEGMENTS_PITCHES,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_timbre', tables.Float64Atom(shape=()),
                    (0, 12), ARRAY_DESC_SEGMENTS_TIMBRE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_max',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_max_time',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX_TIME,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_start',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'sections_start', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_SECTIONS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'sections_confidence',
                    tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_SECTIONS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'beats_start', tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_BEATS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'beats_confidence', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_BEATS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'bars_start', tables.Float64Atom(shape=()), (0,),
                    ARRAY_DESC_BARS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'bars_confidence', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_BARS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'tatums_start', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_TATUMS_START,
                    expectedrows=expectedrows * 300)
    h5.createEArray(group, 'tatums_confidence', tables.Float64Atom(shape=()),
                    (0,), ARRAY_DESC_TATUMS_CONFIDENCE,
                    expectedrows=expectedrows * 300)
    # group musicbrainz arrays
    group = h5.root.musicbrainz
    h5.createEArray(where=group, name='artist_mbtags',
                    atom=tables.StringAtom(256, shape=()), shape=(0,),
                    title=ARRAY_DESC_ARTIST_MBTAGS,
                    expectedrows=expectedrows * 5)
    h5.createEArray(group, 'artist_mbtags_count', tables.IntAtom(shape=()),
                    (0,), ARRAY_DESC_ARTIST_MBTAGS_COUNT,
                    expectedrows=expectedrows * 5)
import tables
import numpy

fileh = tables.open_file('earray1.h5', mode='w')
a = tables.StringAtom(itemsize=8)

# Use ``a`` as the object type for the enlargeable array.
array_c = fileh.create_earray(fileh.root, 'array_c', a, (0,), "Chars")
array_c.append(numpy.array(['a' * 2, 'b' * 4], dtype='S8'))
array_c.append(numpy.array(['a' * 6, 'b' * 8, 'c' * 10], dtype='S8'))

# Read the string ``EArray`` we have created on disk.
for s in array_c:
    print 'array_c[%s] => %r' % (array_c.nrow, s)

# Close the file.
fileh.close()
def test_from_dtype_03(self):
    with self.assertWarns(Warning):
        atom1 = tb.Atom.from_dtype(np.dtype('U5'), dflt=b'hello')
    atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello')
    self.assertEqual(atom1, atom2)
    self.assertEqual(str(atom1), str(atom2))
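# The warning asserted above comes from the Unicode dtype: PyTables stores
# fixed-size byte strings, so 'U5' is mapped to an 'S5'-backed StringAtom.
# A minimal standalone sketch (assumes only numpy and tables):
import numpy as np
import tables as tb

atom = tb.Atom.from_dtype(np.dtype('S5'))  # bytes dtype: no warning
print(atom)  # StringAtom(itemsize=5, shape=(), dflt=b'')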
def calc_flow(
        file_list,
        out_rel='optflow_rel.hdf',  # pair-by-pair flow (cannot be None)
        out_total='optflow_total.hdf',  # cumulated flow
        out_res='optflow_res.hdf',  # residual
        complevel=0,
        complevel_res=0,
        complevel_tot=0,
        use_last=True,
        open_func=lambda s: cv2.imread(s, 0),
        # Preset medium
        finest_scale=0,
        gd_iterations=25,
        patch_size=4,
        patch_stride=1,
        alpha=20,
        delta=5,
        gamma=1,
        iterations=20):
    infos = dict()
    infos['start_time'] = str(datetime.datetime.now())
    infos['dir'] = os.getcwd()
    infos['host'] = os.uname()[1]
    infos['algo'] = 'Disflow-rel'
    infos['algo_version'] = version
    infos['finest_scale'] = finest_scale
    infos['gd_iterations'] = gd_iterations
    infos['patch_size'] = patch_size
    infos['patch_stride'] = patch_stride
    infos['alpha'] = alpha
    infos['delta'] = delta
    infos['gamma'] = gamma
    infos['iterations'] = iterations
    infos['opencv'] = cv2.__version__
    infos_s = str(infos).encode('utf-8')
    o_img = open_func(file_list[0])
    height, width = o_img.shape
    size = 8 * height * width * len(file_list) / 2**20
    output_size = 2 * size if out_total else size
    if out_res:
        # Same type as the fields, but only 1 component
        output_size += size / 2
    print("Estimated output size: {:.2f} MB".format(output_size))
    # Opening the main output file
    hrel = tables.open_file(unique_name(out_rel), 'w')
    # Creating the infos node
    hrel.create_array(hrel.root, 'infos', [infos_s])
    # If compression is asked for, create the filter
    filt = tables.Filters(complevel=complevel) if complevel else None
    # Create the array at the node 'table'
    arr = hrel.create_earray(hrel.root, 'table', tables.Float32Atom(),
                             (0, height, width, 2),
                             expectedrows=len(file_list), filters=filt)
    # Create the array of the names of the images
    max_size = max([len(i.encode('utf-8')) for i in file_list])
    names = hrel.create_earray(hrel.root, 'names',
                               tables.StringAtom(max_size), (0, 2),
                               expectedrows=len(file_list))
    res_arr = hrel.create_earray(hrel.root, 'res', tables.Float32Atom(),
                                 (0,), expectedrows=len(file_list) - 1)
    # If asked for, create the file and array for the residual
    if out_res:
        hres = tables.open_file(unique_name(out_res), 'w')
        filt_r = tables.Filters(complevel=complevel_res) if complevel_res \
            else None
        arr_r = hres.create_earray(hres.root, 'table', tables.Float32Atom(),
                                   (0, height, width),
                                   expectedrows=len(file_list),
                                   filters=filt_r)
        names_r = hres.create_earray(hres.root, 'names',
                                     tables.StringAtom(max_size), (0, 2),
                                     expectedrows=len(file_list) - 1)
    # If asked for, create the total (cumulated field)
    if out_total:
        htot = tables.open_file(unique_name(out_total), 'w')
        filt_t = tables.Filters(complevel=complevel_tot) if complevel_tot \
            else None
        arr_t = htot.create_earray(htot.root, 'table', tables.Float32Atom(),
                                   (0, height, width, 2),
                                   expectedrows=len(file_list),
                                   filters=filt_t)
        names_t = htot.create_earray(htot.root, 'names',
                                     tables.StringAtom(max_size), (0, 2),
                                     expectedrows=len(file_list) - 1)
    # Creating the optflow class
    dis = dis_class()
    dis.setFinestScale(finest_scale)
    dis.setGradientDescentIterations(gd_iterations)
    dis.setPatchSize(patch_size)
    dis.setPatchStride(patch_stride)
    dis.setVariationalRefinementAlpha(alpha)
    dis.setVariationalRefinementDelta(delta)
    dis.setVariationalRefinementGamma(gamma)
    dis.setVariationalRefinementIterations(iterations)
    r = None
    t0 = t2 = time()
    total = np.zeros((height, width, 2), dtype=np.float32)
    # Main loop (can catch keyboard interrupt)
    fb = file_list[0]
    imb = open_func(fb)
    ai = Async_iter(file_list[1:], open_func)
    try:
        for i, (f, img) in enumerate(zip(file_list[1:], ai)):
            fa = fb
            ima = imb
            fb = f
            imb = img
            print("Image {}/{}: {}".format(i + 1, len(file_list), fb))
            # Adding the names of the two images
            names.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            print("Computing optflow...")
            # Should we initialize the field?
            r = dis.calc(ima, imb, r if use_last else None)
            # Adding the result to the table
            arr.append(r[None])
            # Computing the residual
            print("Done. Computing residual...")
            res = get_res(ima, imb, r)
            res_arr.append(np.array([scalar_res(res)]))
            if out_res:
                arr_r.append(res[None])
                names_r.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            if out_total:
                total = compose(total, r)
                arr_t.append(total[None])
                names_t.append([[fa.encode('utf-8'), fb.encode('utf-8')]])
            print("Done.")
            t1 = t2
            t2 = time()
            print("Last loop took {}".format(format_time(t2 - t1)))
            print(" ETA1 {}".format(
                format_time((t2 - t1) * (len(file_list) - i - 1))))
            print("Elapsed time: {}".format(format_time(t2 - t0)))
            print(" ETA2 {}".format(
                format_time((t2 - t0) / (i + 1) * (len(file_list) - i - 1))))
    except KeyboardInterrupt:
        print("Interrupted !")
        # Resume support?
    ai.terminate()
    print("Correlation finished !")
    hrel.create_array(hrel.root, 'elapsed', [time() - t0])
    print("Closing main hdf file..")
    hrel.close()
    print("Done.")
    if out_res:
        print("Closing residual hdf file..")
        hres.close()
        print("Done.")
    if out_total:
        print("Closing total flow hdf file..")
        htot.close()
        print("Done.")
h5file.setNodeAttr(group_arr, 'type', 'element')
h5file.setNodeAttr(group_arr, 'entityType', 'face')

# write data
ft_id = h5file.createGroup(h5file.root, 'floatingType')
ars_id = h5file.createGroup(ft_id, 'data3D')
h5file.setNodeAttr(ars_id, 'floatingType', 'arraySet')
h5file.setNodeAttr(ars_id, 'label', 'Data on triangles')
data_arr = h5file.createCArray(ars_id, 'data', tables.Float64Atom(),
                               numpy.shape(elttypes), filters=filters)
for i in range(len(data_arr)):
    data_arr[i] = numpy.float64(i)
h5file.setNodeAttr(data_arr, 'label', 'Current on element')
h5file.setNodeAttr(data_arr, 'physicalNature', 'electricCurrent')
h5file.setNodeAttr(data_arr, 'unit', 'ampere')
ds_id = h5file.createGroup(ars_id, 'ds')
dim1_arr = h5file.createCArray(ds_id, 'dim1', tables.StringAtom(42), (1,),
                               filters=filters)
dim1_arr[0] = '/mesh/trianglesMesh/tmesh/group/triangles'
h5file.setNodeAttr(dim1_arr, 'label', 'mesh elements')
h5file.setNodeAttr(dim1_arr, 'physicalNature', 'meshEntity')
h5file.close()
def test_init_parameters_02(self):
    atom1 = tb.StringAtom(itemsize=12)
    atom2 = atom1.copy(itemsize=100, shape=(2, 2))
    self.assertEqual(atom2,
                     tb.StringAtom(itemsize=100, shape=(2, 2), dflt=b''))
def train(nthreads, maindir, output, testartists, npicks, winsize, finaldim,
          trainsongs=None, typecompress='picks'):
    """
    Main function to do the training.
    Does the main pass with the given number of threads, then reads the tmp
    files, creates the main output, and deletes the tmp files.
    INPUT
      - nthreads     - number of threads to use
      - maindir      - dir of the MSD, where to find song files
      - output       - main model, contains everything to perform KNN
      - testartists  - set of artists to ignore
      - npicks       - number of samples to pick per song
      - winsize      - window size (in beats) of a sample
      - finaldim     - final dimension of the sample, something like 5?
      - trainsongs   - list of songs to use for training
      - typecompress - 'picks', 'corrcoeff' or 'cov'
    RETURN
      - nothing
    """
    # sanity checks
    if os.path.isfile(output):
        print 'ERROR: file', output, 'already exists.'
        return
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_train_main_pass(nthreads, maindir,
                                                testartists, npicks, winsize,
                                                finaldim,
                                                trainsongs=trainsongs,
                                                typecompress=typecompress)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen
    sys.stdout.flush()
    # estimate the total number of rows from the first tmp file
    h5 = tables.openFile(tmpfiles[0], 'r')
    nrows = h5.root.data.year.shape[0] * len(tmpfiles)
    h5.close()
    # create output
    output = tables.openFile(output, mode='a')
    group = output.createGroup("/", 'data',
                               'KNN MODEL FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()),
                        (0, finaldim), 'feats', expectedrows=nrows)
    output.createEArray(group, 'year', tables.IntAtom(shape=()), (0,),
                        'year', expectedrows=nrows)
    output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()),
                        (0,), 'track_id', expectedrows=nrows)
    # aggregate temp files
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        output.root.data.year.append(h5.root.data.year[:])
        output.root.data.track_id.append(h5.root.data.track_id[:])
        output.root.data.feats.append(h5.root.data.feats[:])
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # close output
    output.close()
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole training done after', stimelen
    # done
    return
def process_filelist_train(filelist=None, testartists=None, tmpfilename=None,
                           npicks=None, winsize=None, finaldim=None,
                           typecompress='picks'):
    """
    Main function, processes all files in the list (as long as their artist
    is not in testartists).
    INPUT
       filelist     - a list of song files
       testartists  - set of artist IDs that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoeff'
                      (correlation coefficients), 'cov' (covariance) or
                      'avgcov' (averaged covariance)
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # create output file
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()),
                        (0, finaldim), '', expectedrows=len(filelist))
    output.createEArray(group, 'year', tables.IntAtom(shape=()), (0,), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()),
                        (0,), '', expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre, npicks,
                                                        winsize, finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres, finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres, finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres, finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append(np.array([year] * n_p_feats))
        output.root.data.track_id.append(np.array([track_id] * n_p_feats))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
def saveTo(self, file, where, name):
    """Creates SetArray which is a pickled version of this annotation"""
    atom = tables.StringAtom(flavor='numpy')
    node = file.createVLArray(where, name, atom)
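# The ``flavor`` keyword above belongs to the old PyTables 1.x Atom API; in
# PyTables >= 2, StringAtom requires an itemsize, and pickled object storage
# is usually done with ObjectAtom. A minimal sketch of the same idea against
# the modern API (assumes the annotation object is picklable; names here are
# illustrative, not from the original code):
import tables

def save_to(self, h5file, where, name):
    atom = tables.ObjectAtom()  # pickles arbitrary Python objects per row
    node = h5file.create_vlarray(where, name, atom)
    node.append(self)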
def proc_data(pattern, f_ex, params, l_path, d_path, conv_table, CGN):
    # get list of audio and transcript files
    audio_files = [d_path + "/" + x for x in list_files(d_path)]
    audio_files.sort()
    label_files = [l_path + "/" + x for x in list_files(l_path)]
    label_files.sort()
    # create h5 file for the processed data
    data_file = tables.open_file(params[5] + '.h5', mode='a')
    # create pytable atoms
    # if we want filterbanks, the feature size is #filters + 1 for energy
    if params[4]:
        feature_shape = params[1] + 1
        # features are three times bigger if deltas are used
        if params[6]:
            feature_shape = feature_shape * 3
    # if we make MFCCs we take the first 12 cepstral coefficients and energy
    # + delta and double delta = 39 features
    else:
        feature_shape = 39
    f_atom = tables.Float64Atom()
    # N.B. label size is hard coded. It provides phoneme and 7 articulatory
    # feature labels.
    l_atom = tables.StringAtom(itemsize=5)
    # create a feature and a label group branching off the root node
    features = data_file.create_group("/", 'features')
    labels = data_file.create_group("/", 'labels')
    # create a dictionary from the conv table
    cgndict = phoneme_dict(conv_table)
    # check if the audio and transcript files match
    if check_files(audio_files, label_files, f_ex):
        for x in range(0, len(audio_files)):
            print('processing file ' + str(x))
            # create new leaf nodes in the feature and label groups for
            # every audio file
            f_table = data_file.create_earray(features,
                                              audio_files[x][-12:-4],
                                              f_atom, (0, feature_shape),
                                              expectedrows=100000)
            l_table = data_file.create_earray(labels,
                                              audio_files[x][-12:-4],
                                              l_atom, (0, 8),
                                              expectedrows=100000)
            # read audio samples
            input_data = read(audio_files[x])
            # sampling frequency
            fs = input_data[0]
            # get window and frameshift size in samples
            s_window = int(fs * params[2])
            s_shift = int(fs * params[3])
            # create mfccs
            [mfcc, frame_nrs] = get_mfcc(input_data, params[0], params[1],
                                         s_window, s_shift, params[4],
                                         params[6])
            # read the transcript
            trans = parse_transcript(pattern, label_files[x], CGN)
            # convert phoneme transcript to articulatory feature transcript
            l_trans = label_transcript(trans, fs, cgndict)
            nframes = mfcc.shape[0]
            # label frames using the labelled transcript
            l_data = numpy.array(label_frames(nframes, l_trans, s_shift))
            # append new data to the tables
            f_table.append(mfcc)
            l_table.append(l_data)
    else:
        print('audio and transcript files do not match')
    # close the output file
    data_file.close()
    return (mfcc, l_data)
def test_base_class_factory(self):
    cls_props_target = [('tags', (set,)),
                        ('target_id', (np.str_,)),
                        ('name', (np.str_,)),
                        ('position', (np.ndarray, np.float_)),
                        ('position_error', (np.ndarray, np.float_)),
                        ('description', (np.str_,))]
    _Target = _base_class_factory('_Target', class_type='base',
                                  class_properties=cls_props_target)
    cls_props_dqt = [('tags', (set,)),
                     ('name', (np.str_,)),
                     ('reference', (np.str_,))]
    _DataQualityType = _base_class_factory('_DataQualityType', 'base',
                                           class_properties=cls_props_dqt)
    cls_props_rawdt = [('tags', (set,)),
                       ('inc_angle', (np.ndarray, np.float_)),
                       ('inc_angle_error', (np.ndarray, np.float_)),
                       ('bearing', (np.ndarray, np.float_)),
                       ('bearing_error', (np.ndarray, np.float_)),
                       ('position', (np.ndarray, np.float_)),
                       ('position_error', (np.ndarray, np.float_)),
                       ('path_length', (np.ndarray, np.float_)),
                       ('path_length_error', (np.ndarray, np.float_)),
                       ('d_var', (np.ndarray, np.float_)),
                       ('ind_var', (np.ndarray, np.float_)),
                       ('datetime', (np.ndarray, datetime.datetime)),
                       ('data_quality', (np.ndarray, np.float_)),
                       ('integration_time', (np.ndarray, np.float_)),
                       ('no_averages', (np.float_,)),
                       ('temperature', (np.float_,)),
                       ('user_notes', (np.str_,))]
    cls_refr_rawdt = [('instrument', (_Instrument,)),
                      ('target', (_Target,)),
                      ('type', (_RawDataType,)),
                      ('data_quality_type', (np.ndarray, _DataQualityType))]
    filename = tempfile.mktemp()
    h5f = tables.open_file(filename, 'w')
    h5f.create_earray('/', 'hash', tables.StringAtom(itemsize=28), (0,))
    TargetBuffer = _buffer_class_factory('TargetBuffer',
                                         class_properties=cls_props_target)
    DataQualityTypeBuffer = \
        _buffer_class_factory('DataQualityTypeBuffer',
                              class_properties=cls_props_dqt)
    dtb1 = DataQualityTypeBuffer(name='q-measure 1')
    dtb2 = DataQualityTypeBuffer(name='q-measure 2')
    tb = TargetBuffer(name='White Island', position=(177.2, -37.5, 50))
    group_name = _Target.__name__.strip('_')
    h5f.create_group('/', group_name)
    rid = ResourceIdentifier()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        group = h5f.create_group('/' + group_name, str(rid))
    t = _Target(group, tb)
    rid = ResourceIdentifier()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        group = h5f.create_group('/' + group_name, str(rid))
    dt1 = _DataQualityType(group, dtb1)
    rid = ResourceIdentifier()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        group = h5f.create_group('/' + group_name, str(rid))
    dt2 = _DataQualityType(group, dtb2)
    _RawData = _base_class_factory('_RawData', class_type='base',
                                   class_properties=cls_props_rawdt,
                                   class_references=cls_refr_rawdt)
    RawDataBuffer = _buffer_class_factory('RawDataBuffer',
                                          class_properties=cls_props_rawdt,
                                          class_references=cls_refr_rawdt)
    rdb = RawDataBuffer(d_var=np.zeros((1, 2048)), ind_var=np.arange(2048),
                        datetime=['2017-01-10T15:23:00'], no_averages=23,
                        user_notes='something', target=t,
                        data_quality_type=[dt1, dt2])
    group_name = _RawData.__name__.strip('_')
    h5f.create_group('/', group_name)
    rid = ResourceIdentifier()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        group = h5f.create_group('/' + group_name, str(rid))
    rd = _RawData(group, rdb)
    np.testing.assert_array_equal(rd.d_var[:], np.zeros((1, 2048)))
    with self.assertRaises(AttributeError):
        rd.something = 'something'
    with self.assertRaises(AttributeError):
        rd.d_var = np.ones((1, 2048))
    self.assertEqual(rd.user_notes, 'something')
    self.assertEqual(rd.no_averages, 23)
    np.testing.assert_array_equal(rd.target.position[:],
                                  np.array([177.2, -37.5, 50.]))
    self.assertEqual(rd.target.name, 'White Island')
    self.assertEqual(rd.data_quality_type[1].name, 'q-measure 2')
def put_string(h5, where, name, string, ext_name):
    '''put a string in an HDF5 file'''
    atom = tables.StringAtom(itemsize=len(string))
    ca = h5.createCArray(where, name, atom, (1,), ext_name)
    ca[:] = string[:]
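# Usage sketch for put_string (the node name and title below are
# illustrative). Since itemsize is taken from len(string), each call creates
# a one-element CArray sized exactly for that string:
h5 = tables.openFile('meta.h5', 'w')
put_string(h5, h5.root, 'created_by', 'my-pipeline v2', 'creator tag')
print h5.root.created_by[0]  # -> 'my-pipeline v2'
h5.close()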
root = fileh.root
vlarray = fileh.create_vlarray(root, 'vlarray1', tables.Int32Atom(),
                               "ragged array of ints",
                               filters=tables.Filters(1))
# Append some (variable length) rows:
vlarray.append(numpy.array([5, 6]))
vlarray.append(numpy.array([5, 6, 7]))
vlarray.append([5, 6, 9, 8])

# Now, do the same with native Python strings.
vlarray2 = fileh.create_vlarray(root, 'vlarray2',
                                tables.StringAtom(itemsize=2),
                                "ragged array of strings",
                                filters=tables.Filters(1))
vlarray2.flavor = 'python'
# Append some (variable length) rows:
vlarray2.append(['5', '66'])
vlarray2.append(['5', '6', '77'])
vlarray2.append(['5', '6', '9', '88'])

# Test with lists of bidimensional vectors
vlarray3 = fileh.create_vlarray(root, 'vlarray3',
                                tables.Int64Atom(shape=(2,)),
                                "Ragged array of vectors")
a = numpy.array([[1, 2], [1, 2]], dtype=numpy.int64)
vlarray3.append(a)
vlarray3.append(numpy.array([[1, 2], [3, 4]], dtype=numpy.int64))
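# Read-back sketch (assumes fileh is still open): with the default numpy
# flavor, vlarray rows come back as NumPy arrays, while the 'python' flavor
# set on vlarray2 makes rows come back as plain Python lists of strings.
for row in vlarray:
    print('%s[%d] --> %s' % (vlarray.name, vlarray.nrow, row))
for row in vlarray2:
    print('%s[%d] --> %s' % (vlarray2.name, vlarray2.nrow, row))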
def test_init_parameters_01(self):
    atom1 = tb.StringAtom(itemsize=12)
    atom2 = atom1.copy()
    self.assertEqual(atom1, atom2)
    self.assertEqual(str(atom1), str(atom2))
    self.assertIsNot(atom1, atom2)
import random
import sys

##########################################################################

os.chdir(base_dir + '/' + dataname + '/' + experiment)

# get a random seed so that we can reproducibly do the cross-validation setup
seed = random.randrange(sys.maxsize)
random.seed(seed)  # set the seed
#print(f"random seed (note down for reproducibility): {seed}")

# dtype in which the images will be saved: unsigned 8-bit int, i.e. [0, 255]
img_dtype = tables.UInt8Atom()
# an atom to store the filename of the image, just in case we need it later
filenameAtom = tables.StringAtom(itemsize=255)
# create a list of the files; in this case we're only interested in files
# which have masks, so we can use supervised learning
files = glob.glob('../imgs_json_masks/*.png')

# create training and validation stages and split the files appropriately
# between them
phases = {}
phases["train"], phases["val"] = next(
    iter(model_selection.ShuffleSplit(
        n_splits=1, test_size=test_set_size).split(files)))

print(f"\tDataset: {dataname}")
print(f"\tExperiment: {experiment}")
def test_init_parameters_03(self):
    atom1 = tb.StringAtom(itemsize=12)
    self.assertRaises(TypeError, atom1.copy, foobar=42)
def calc_flow(original_image,
              file_list,
              out_file='optflow.hdf',
              out_res='optflow_res.hdf',
              complevel=0,
              complevel_res=0,
              use_last=True,
              open_func=lambda s: cv2.imread(s, 0),
              # Preset medium
              finest_scale=0,
              gd_iterations=25,
              patch_size=4,
              patch_stride=1,
              alpha=20,
              delta=5,
              gamma=1,
              iterations=20):
    infos = dict()
    infos['start_time'] = str(datetime.datetime.now())
    infos['dir'] = os.getcwd()
    infos['host'] = os.uname()[1]
    infos['algo'] = 'Disflow'
    infos['algo_version'] = version
    infos['finest_scale'] = finest_scale
    infos['gd_iterations'] = gd_iterations
    infos['patch_size'] = patch_size
    infos['patch_stride'] = patch_stride
    infos['alpha'] = alpha
    infos['delta'] = delta
    infos['gamma'] = gamma
    infos['iterations'] = iterations
    infos['opencv'] = cv2.__version__
    infos_s = str(infos).encode('utf-8')
    o_img = open_func(original_image)
    height, width = o_img.shape
    output_size = 8 * height * width * len(file_list) / 2**20
    if out_res:
        # Residual has the same type but half the values
        output_size *= 1.5
    print("Estimated output size: {:.2f} MB".format(output_size))
    # Opening the main output file
    h = tables.open_file(unique_name(out_file), 'w')
    # Creating the infos node
    h.create_array(h.root, 'infos', [infos_s])
    # If compression is asked for, create the filter
    filt = tables.Filters(complevel=complevel) if complevel else None
    # Create the array at the node 'table'
    arr = h.create_earray(h.root, 'table', tables.Float32Atom(),
                          (0, height, width, 2),
                          expectedrows=len(file_list), filters=filt)
    # Create the array of the names of the images
    max_size = max([len(i.encode('utf-8'))
                    for i in file_list + [original_image]])
    names = h.create_earray(h.root, 'names', tables.StringAtom(max_size),
                            (0, 2), expectedrows=len(file_list))
    res_arr = h.create_earray(h.root, 'res', tables.Float32Atom(), (0,),
                              expectedrows=len(file_list))
    # If asked for, create the file and array for the residual
    if out_res:
        h_res = tables.open_file(unique_name(out_res), 'w')
        filt_r = tables.Filters(complevel=complevel_res) if complevel_res \
            else None
        arr_r = h_res.create_earray(h_res.root, 'table',
                                    tables.Float32Atom(),
                                    (0, height, width),
                                    expectedrows=len(file_list),
                                    filters=filt_r)
        names_r = h_res.create_earray(h_res.root, 'names',
                                      tables.StringAtom(max_size), (0, 2),
                                      expectedrows=len(file_list))
    # Creating the optflow class
    dis = dis_class()
    dis.setFinestScale(finest_scale)
    dis.setGradientDescentIterations(gd_iterations)
    dis.setPatchSize(patch_size)
    dis.setPatchStride(patch_stride)
    dis.setVariationalRefinementAlpha(alpha)
    dis.setVariationalRefinementDelta(delta)
    dis.setVariationalRefinementGamma(gamma)
    dis.setVariationalRefinementIterations(iterations)
    r = None
    t0 = t2 = time()
    # Main loop (can catch keyboard interrupt)
    ai = Async_iter(file_list, open_func)
    try:
        for i, (f, img) in enumerate(zip(file_list, ai)):
            print("Image {}/{}: {}".format(i + 1, len(file_list), f))
            # Adding the names of the two images
            names.append([[original_image.encode('utf-8'),
                           f.encode('utf-8')]])
            print("Computing optflow...")
            # Should we initialize the field?
            if use_last:
                r = dis.calc(o_img, img, r)
            else:
                r = dis.calc(o_img, img, None)
            # Adding the result to the table
            arr.append(r[None])
            # Computing the residual
            print("Done. Computing residual...")
            res = get_res(o_img, img, r)
            res_arr.append(np.array([scalar_res(res)]))
            if out_res:
                arr_r.append(res[None])
                names_r.append([[original_image.encode('utf-8'),
                                 f.encode('utf-8')]])
            print("Done.")
            t1 = t2
            t2 = time()
            print("Last loop took {}".format(format_time(t2 - t1)))
            print(" ETA1 {}".format(
                format_time((t2 - t1) * (len(file_list) - i - 1))))
            print("Elapsed time: {}".format(format_time(t2 - t0)))
            print(" ETA2 {}".format(
                format_time((t2 - t0) / (i + 1) * (len(file_list) - i - 1))))
    except KeyboardInterrupt:
        print("Interrupted !")
        # Resume support?
    ai.terminate()
    print("Correlation finished !")
    h.create_array(h.root, 'elapsed', [time() - t0])
    print("Closing main hdf file..")
    h.close()
    print("Done.")
    if out_res:
        print("Closing residual hdf file..")
        h_res.close()
        print("Done.")
def test_from_kind_04(self):
    atom1 = tb.Atom.from_kind('string', itemsize=5, dflt=b'hello')
    atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello')
    self.assertEqual(atom1, atom2)
    self.assertEqual(str(atom1), str(atom2))
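# The three construction routes exercised by these tests yield equal atoms;
# a minimal standalone sketch (assumes only numpy and tables):
import numpy as np
import tables as tb

a1 = tb.StringAtom(itemsize=5)                 # direct constructor
a2 = tb.Atom.from_dtype(np.dtype('S5'))        # dtype-based factory
a3 = tb.Atom.from_kind('string', itemsize=5)   # kind-based factory
assert a1 == a2 == a3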
                  atom=a, shape=(0, CF.NUMLAT, CF.NUMLON),
                  title=CF.DATA_TITLE)

# Create the EArray for grid information, and populate it.
a = tables.Float64Atom()
myh5.create_earray(myh5.root, name="gridInfo", atom=a,
                   shape=(0, len(CF.INFO_LIST)), title=CF.GRIDINFO_TITLE)
mydatum = np.array([CF.INFO_LIST])  # Note extra dimension, for enumeration.
myh5.root.gridInfo.append(mydatum)

# Create the EArray for date-time information.
a = tables.StringAtom(itemsize=CF.DATETIME_SIZE)
myh5.create_earray(myh5.root, name="dateTime", atom=a,
                   shape=(0, CF.DATETIME_NUM), title=CF.DATETIME_TITLE)

if CF.VERBOSE:
    print("HDF5 object before being populated:")
    print(myh5)

##### Loop over the specified directories. #####
idxstart = len(CF.DATAFILE_HEADER)
# Outer loop is over per-date folders.
for d in range(len(DIR_TOREAD)):
def Run(args):
    in_fn_list = args.in_fn
    out_fn = args.out_fn
    platform = args.platform
    pileup = args.pileup
    global param
    float_type = 'int32'
    if pileup:
        import shared.param_p as param
    else:
        import shared.param_f as param
        float_type = 'int8'
    tensor_shape = param.ont_input_shape if platform == 'ont' \
        else param.input_shape
    # select all matching the prefix if the file path does not exist
    tables.set_blosc_max_threads(64)
    int_atom = tables.Atom.from_dtype(np.dtype(float_type))
    string_atom = tables.StringAtom(itemsize=param.no_of_positions + 50)
    long_string_atom = tables.StringAtom(itemsize=5000)  # max alt_info length
    table_file = tables.open_file(out_fn, mode='w', filters=FILTERS)
    table_file.create_earray(where='/', name='position_matrix',
                             atom=int_atom, shape=[0] + tensor_shape,
                             filters=FILTERS)
    table_file.create_earray(where='/', name='position', atom=string_atom,
                             shape=(0, 1), filters=FILTERS)
    table_file.create_earray(where='/', name='label', atom=int_atom,
                             shape=(0, param.label_size), filters=FILTERS)
    table_file.create_earray(where='/', name='alt_info',
                             atom=long_string_atom, shape=(0, 1),
                             filters=FILTERS)

    table_dict = utils.update_table_dict()
    total_compressed = 0
    for f in in_fn_list:
        print("[INFO] Merging file {}".format(f))
        fi = tables.open_file(f, mode='r')
        assert (len(fi.root.label) == len(fi.root.position) ==
                len(fi.root.position_matrix) == len(fi.root.alt_info))
        for index in range(len(fi.root.label)):
            table_dict['label'].append(fi.root.label[index])
            table_dict['position'].append(fi.root.position[index])
            table_dict['position_matrix'].append(
                fi.root.position_matrix[index])
            table_dict['alt_info'].append(fi.root.alt_info[index])
            total_compressed += 1
            if total_compressed % 500 == 0 and total_compressed > 0:
                table_dict = utils.write_table_file(table_file, table_dict,
                                                    tensor_shape,
                                                    param.label_size,
                                                    float_type)
            if total_compressed % 50000 == 0:
                print("[INFO] Compressed %d tensor" % (total_compressed),
                      file=sys.stderr)
        fi.close()

    if total_compressed % 500 != 0 and total_compressed > 0:
        table_dict = utils.write_table_file(table_file, table_dict,
                                            tensor_shape, param.label_size,
                                            float_type)
    print("[INFO] Compressed %d tensor" % (total_compressed),
          file=sys.stderr)
    table_file.close()
def strip_water(allatom_filename, protein_filename, protein_atom_indices,
                min_num_frames=1):
    """Strip water (or other) atoms from a Core17, Core18, or OCore FAH HDF5
    trajectory.

    Parameters
    ----------
    allatom_filename : str
        Path to HDF5 trajectory with all atoms. This trajectory must have
        been generated by concatenate_core17 or concatenate_siegetank--e.g.
        it must include extra metadata that lists the XTC files (bzipped or
        in OCore directories) that have already been processed. This file
        will not be modified.
    protein_filename : str
        Path to HDF5 trajectory with just the protein atoms. This trajectory
        must have been generated by concatenate_core17 or
        concatenate_siegetank--e.g. it must include extra metadata that
        lists the XTC files (bzipped or in OCore directories) that have
        already been processed. This file will be appended to.
    protein_atom_indices : np.ndarray, dtype='int'
        List of atom indices to extract from the all-atom HDF5 file.
    min_num_frames : int, optional, default=1
        Skip if below this number.
    """
    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(allatom_filename)

    if not os.path.exists(allatom_filename):
        print("Skipping, %s not found" % allatom_filename)
        return

    trj_allatom = HDF5TrajectoryFile(allatom_filename, mode='r')
    print('all-atom trajectory %s has %d frames' % (allatom_filename,
                                                    len(trj_allatom)))

    if len(trj_allatom) < min_num_frames:
        print("Must have at least %d frames in %s to proceed!" %
              (min_num_frames, allatom_filename))
        del trj_allatom
        return

    if hasattr(trj_allatom.root, "processed_filenames"):
        key = "processed_filenames"  # Core17, Core18 style data
    elif hasattr(trj_allatom.root, "processed_directories"):
        key = "processed_directories"  # Siegetank style data
    else:
        raise ValueError("Can't find processed files in %s" %
                         allatom_filename)

    # Check integrity of trajectory if it exists.
    delete_trajectory_if_broken(protein_filename)

    # Open the stripped trajectory.
    trj_protein = HDF5TrajectoryFile(protein_filename, mode='a')
    try:
        trj_protein._create_earray(where='/', name=key,
                                   atom=tables.StringAtom(1024), shape=(0,))
        trj_protein.topology = trj_allatom.topology.subset(
            protein_atom_indices)
    except tables.NodeError:
        pass

    n_frames_allatom = len(trj_allatom)
    try:
        n_frames_protein = len(trj_protein)
    except tables.NoSuchNodeError:
        n_frames_protein = 0

    filenames_allatom = getattr(trj_allatom.root, key)
    # Hacky workaround of MDTraj bug #588
    filenames_protein = getattr(trj_protein._handle.root, key)

    n_files_allatom = len(filenames_allatom)
    n_files_protein = len(filenames_protein)

    print("Found %d,%d filenames and %d,%d frames in %s and %s, respectively."
          % (n_files_allatom, n_files_protein, n_frames_allatom,
             n_frames_protein, allatom_filename, protein_filename))

    if n_frames_protein > n_frames_allatom:
        raise ValueError(
            "Found more frames in protein trajectory (%d) than allatom "
            "trajectory (%d)" % (n_frames_protein, n_frames_allatom))

    if n_files_protein > n_files_allatom:
        raise ValueError(
            "Found more filenames in protein trajectory (%d) than allatom "
            "trajectory (%d)" % (n_files_protein, n_files_allatom))

    if n_frames_protein == n_frames_allatom or \
            n_files_allatom == n_files_protein:
        if not (n_frames_protein == n_frames_allatom
                and n_files_allatom == n_files_protein):
            raise ValueError(
                "The trajectories must match in BOTH n_frames and "
                "n_filenames or NEITHER.")
        else:
            print("Same number of frames and filenames found, skipping.")
            del trj_allatom, trj_protein
            return

    # Jump forward past what we've already stripped.
    trj_allatom.seek(n_frames_protein)
    coordinates, time, cell_lengths, cell_angles, velocities, kineticEnergy, \
        potentialEnergy, temperature, alchemicalLambda = trj_allatom.read()
    trj_protein.write(coordinates=coordinates[:, protein_atom_indices],
                      time=time, cell_lengths=cell_lengths,
                      cell_angles=cell_angles)
    # Ignoring the other fields for now, TODO.
    filenames_protein.append(filenames_allatom[n_files_protein:])
    del trj_allatom, trj_protein
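# Usage sketch (the file names and atom indices here are hypothetical):
import numpy as np

protein_atom_indices = np.arange(305)  # e.g. keep the first 305 (protein) atoms
strip_water('trajectories/allatom.h5', 'trajectories/protein.h5',
            protein_atom_indices)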
import tables as tb
import numpy as np

fileh = tb.open_file('earray1.h5', mode='w')
a = tb.StringAtom(itemsize=8)

# Use ``a`` as the object type for the enlargeable array.
array_c = fileh.create_earray(fileh.root, 'array_c', a, (0,), "Chars")
array_c.append(np.array(['a' * 2, 'b' * 4], dtype='S8'))
array_c.append(np.array(['a' * 6, 'b' * 8, 'c' * 10], dtype='S8'))

# Read the string ``EArray`` we have created on disk.
for s in array_c:
    print(f'array_c[{array_c.nrow}] => {s!r}')

# Close the file.
fileh.close()
def save_images_h5(units, stimulus_list, name, frame_log, video_file, append):
    """Assumes each group is three stimuli with image in second position.

    Concatenate second stimuli with first 0.5s of third stimuli."""
    # open first so if there's a problem we don't waste time
    compression_level = 3
    dset_filter = tables.filters.Filters(complevel=compression_level,
                                         complib='blosc:zstd')
    with tables.open_file(name + ".h5", 'w') as h5:
        class_resolver = get_classes_from_stimulus_list(stimulus_list)
        nclasses = len(class_resolver)
        frames, image_classes = glia.get_images_from_vid(stimulus_list,
                                                         frame_log,
                                                         video_file)
        image_class_num = list(
            map(lambda x: class_resolver[str(x)], image_classes))
        idx_sorted_order = np.argsort(image_class_num)

        # save mapping of class_num target to class metadata; this way
        # h5.root.image_classes[n] will give the class metadata string
        logger.info("create class_resolver with max string of 256")
        resolver = h5.create_carray(h5.root, "image_classes",
                                    tables.StringAtom(itemsize=256),
                                    (nclasses,))
        img_class_array = np.array(image_classes,
                                   dtype="S256")[idx_sorted_order]
        for i, image_class in enumerate(img_class_array):
            resolver[i] = image_class

        atom = tables.Atom.from_dtype(frames[0].dtype)
        images = h5.create_carray(h5.root, "images", atom,
                                  (nclasses, *frames[0].shape),
                                  filters=dset_filter)
        frames = np.array(frames)
        nFrames = len(frames)
        for i, idx in enumerate(idx_sorted_order):
            if idx >= nFrames:
                logger.warn(
                    f"skipping class {image_classes[idx]} as no accompanying "
                    "frame. This should only occur if experiment stopped "
                    "early.")
                continue
            images[i] = frames[idx]
        print("finished saving images")

        get_image_responses = glia.compose(
            # returns a list
            partial(glia.create_experiments, stimulus_list=stimulus_list,
                    progress=True, append_lifespan=append),
            partial(glia.group_by, key=lambda x: x["metadata"]["group"]),
            glia.group_dict_to_list,
            glia.f_filter(partial(glia.group_contains, "IMAGE")),
            # truncate to 0.5s
            glia.f_map(lambda x: [x[1], truncate(x[2], 0.5)]),
            glia.f_map(glia.merge_experiments),
            partial(glia.group_by, key=lambda x: x["metadata"]["cohort"]),
            # glia.f_map(f_flatten)
        )
        image_responses = get_image_responses(units)
        ncohorts = len(image_responses)
        ex_cohort = glia.get_value(image_responses)
        images_per_cohort = len(ex_cohort)
        print("images_per_cohort", images_per_cohort)
        duration = ex_cohort[0]["lifespan"]
        logger.info(f"ncohorts: {ncohorts}")
        logger.info(f"nclasses: {nclasses}")
        if nclasses < 256:
            class_dtype = np.dtype('uint8')
        else:
            class_dtype = np.dtype('uint16')
        class_resolver_func = lambda c: class_resolver[str(c)]

        # determine shape
        experiments = glia.flatten_group_dict(image_responses)
        nE = len(experiments)
        d = int(np.ceil(duration * 1000))  # 1ms bins
        data_shape = (nE, d, Unit.nrow, Unit.ncol, Unit.nunit)

        print(f"writing to {name}.h5 with zstd compression...")
        data = h5.create_carray("/", "data",
                                tables.Atom.from_dtype(np.dtype('uint8')),
                                shape=data_shape, filters=dset_filter)
        target = h5.create_carray("/", "target",
                                  tables.Atom.from_dtype(class_dtype),
                                  shape=(nE,), filters=dset_filter)
        glia.experiments_to_h5(experiments, data, target,
                               partial(get_image_class_from_stim,
                                       class_resolver=class_resolver_func),
                               append, class_dtype=class_dtype)
def load_toHDF5(self, hdf5_file=None, verbose=-1):
    # initialize the list of features and labels
    n_samples = len(self.ids)
    filters = tables.Filters(complevel=5, complib='blosc')
    image_storage = hdf5_file.create_earray(hdf5_file.root, 'imdata',
                                            tables.Float32Atom(),
                                            shape=self.image_data_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    if self.problem_type == "Classification":
        truth_storage = hdf5_file.create_earray(
            hdf5_file.root, 'truth', tables.StringAtom(itemsize=15),
            shape=self.truth_data_shape, filters=filters,
            expectedrows=n_samples)
    elif self.problem_type == "Segmentation":
        truth_storage = hdf5_file.create_earray(
            hdf5_file.root, 'truth', tables.UInt8Atom(),
            shape=self.truth_data_shape, filters=filters,
            expectedrows=n_samples)
    # loop over the input images
    for (i, imagePath) in enumerate(self.data_files):
        # load the image and extract the class label, assuming that our path
        # has the following format: /path/to/dataset/{class}/{image}.jpg
        if self.problem_type == "Classification":
            subject_name = imagePath[0].split(os.path.sep)[-2]
            if subject_name in self.ids:
                images = self.get_images(in_files=imagePath,
                                         image_shape=self.input_shape,
                                         label_indices=len(imagePath) - 1)
                label = imagePath[0].split(os.path.sep)[-3]
                subject_data = [image for image in images]
                image_storage.append(np.asarray(subject_data)[np.newaxis])
                truth_storage.append(np.asarray(label)[np.newaxis])
        elif self.problem_type == "Segmentation":
            subject_name = imagePath[0].split(os.path.sep)[-2]
            if subject_name in self.ids:
                images = self.get_images(in_files=imagePath,
                                         image_shape=self.input_shape,
                                         label_indices=len(imagePath) - 1,
                                         slice_number=self.slice_number)
                subject_data = [image for image in images]
                image_storage.append(
                    np.asarray(subject_data[:self.n_channels])[np.newaxis])
                truth_storage.append(
                    np.asarray(subject_data[self.n_channels],
                               dtype=np.uint8)[np.newaxis][np.newaxis])
        # show an update every `verbose` images
        if verbose > 0 and i > 0 and (i + 1) % verbose == 0:
            print("[INFO] processed {}/{}".format(i + 1, len(self.ids)))
    return image_storage
# **1. Create an HDF5 file which contains a single emails array,
# uncompressed. Close this file and use the getsize() function to report the
# size on disk. Time how long this took and print that out as well.**

# In[2]:

NUM = 10000000
RATE = 0.75
emails = email_array(NUM, RATE)

t = time()
with tb.open_file('uncompressed.h5', 'w') as f:
    earray = f.create_earray('/', 'emails', tb.StringAtom(4), (0,),
                             expectedrows=NUM)
    earray.append(emails)
tdelta = time() - t

msg = "The uncompressed array is {0} bytes and took {1} ms to write."
print(msg.format(getsize('uncompressed.h5'), tdelta * 1000))

# **2. Repeat step 1 but with zlib compression at level 1.**

# In[3]:

filters = tb.Filters(complib='zlib', complevel=1)
t = time()
with tb.open_file('zlib1.h5', 'w') as f:
    earray = f.create_earray('/', 'emails', tb.StringAtom(4), (0,),
                             expectedrows=NUM, filters=filters)
    earray.append(emails)