def from_flac_to_tfrecords(train_r=0.8, valid_test_r=0.2):
    """
    Convert the configured LibriSpeech subset into TFRecords files.

    Every .flac file found under config.data_root/config.data_subset is
    shuffled and split into train / valid / test groups. For each group and
    each sex ('M', 'F') a '<group>_<sex>.tfrecords' file is written in the
    current working directory. Each record stores:
        'audio': the float32 samples, resampled to config.fs, as raw bytes
        'key'  : the integer index of the speaker
    The sex of every speaker, ordered by speaker index, is also saved with
    np.save under the name 'genders_index.arr'.

    Inputs:
        train_r: fraction of the files used for the training set
        valid_test_r: fraction shared between validation and test; the
            validation set gets half of it, the test set gets every
            remaining file
    """
    # Imported locally, as create_raw_audio_dataset does, so this function
    # does not rely on a module-level librosa import.
    from librosa.core import resample, load

    # Extract the information about this subset (speakers, chapters)
    # Dictionary with the following shape:
    # {speaker_key: {chapters: [...], sex:'M/F', ... } }
    folder = config.data_root + '/' + config.data_subset
    speakers_info = data_tools.read_metadata(config.data_subset)

    # Speaker key -> integer index, and the sex of each speaker by index
    keys_to_index = {}
    for index, speaker_key in enumerate(speakers_info):
        keys_to_index[speaker_key] = index
    genders = ['M'] * len(speakers_info)
    for speaker_key, info in speakers_info.items():
        genders[keys_to_index[speaker_key]] = info['sex']
    np.save('genders_index.arr', genders)

    allfiles = np.array([os.path.join(root, name)
                         for root, _, names in os.walk(folder)
                         for name in names if name.endswith(".flac")])
    nb_files = len(allfiles)
    np.random.shuffle(allfiles)

    # 2.0 keeps the split fractional even if an int is passed (Python 2)
    train_end = int(nb_files * train_r)
    valid_end = int(nb_files * (train_r + valid_test_r / 2.0))
    train = allfiles[:train_end]
    valid = allfiles[train_end:valid_end]
    test = allfiles[valid_end:]
    print('%d %d %d' % (len(train), len(valid), len(test)))

    for group_name, data_split in [("train", train), ("test", test), ("valid", valid)]:
        for s in ['M', 'F']:
            writer = tf.python_io.TFRecordWriter(group_name + '_' + s + '.tfrecords')
            try:
                for path in data_split:
                    # LibriSpeech layout: .../<speaker>/<chapter>/<file>.flac
                    key = os.path.basename(os.path.dirname(os.path.dirname(path)))
                    if speakers_info[key]['sex'] != s:
                        continue

                    raw_audio, sr = load(path, sr=16000)
                    raw_audio = resample(raw_audio, sr, config.fs)
                    # tobytes() is the non-deprecated name of tostring()
                    raw_audio = raw_audio.astype(np.float32).tobytes()

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'audio': tf.train.Feature(
                                bytes_list=tf.train.BytesList(value=[raw_audio])),
                            'key': tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[keys_to_index[key]])),
                        }))
                    print('%s %s %s %s' % (group_name, s, key, keys_to_index[key]))
                    writer.write(example.SerializeToString())
            finally:
                # Always flush and release the file, even if a decode fails
                writer.close()
def create_h5_dataset(self, output_fn, subset=config.data_subset, data_root=config.data_root):
    """
    Build an H5 file holding the spectrograms of a LibriSpeech subset.

    One H5 group is created per speaker key. Every .flac file found in the
    chapters of that speaker is stored in the group as a dataset named after
    the file, containing the transposed spectrogram cast to complex64.

    Inputs:
        output_fn: filename for the created file
        subset: LibriSpeech subset : 'dev-clean' , ...
        data_root: LibriSpeech folder path
    """
    # Metadata of the subset, shaped as:
    # {speaker_key: {chapters: [...], sex:'M/F', ... } }
    metadata = data_tools.read_metadata(subset)

    with h5py.File(output_fn, 'w') as h5_file:
        for speaker, info in metadata.items():
            # One H5 group per speaker
            if speaker not in h5_file:
                h5_file.create_group(speaker)

            chapters = info['chapters']
            nb_chapters = len(chapters)
            label = 'Speaker ' + speaker + ' :'
            speaker_dir = data_root + '/' + subset + '/' + speaker

            print_progress(0, nb_chapters, prefix=label, suffix='Complete')

            for done, chapter in enumerate(chapters, 1):
                # Walk the chapter folder looking for .flac audio
                for dirpath, _subdirs, filenames in os.walk(speaker_dir + '/' + chapter):
                    for filename in filenames:
                        if not filename.endswith(".flac"):
                            continue

                        samples, rate = sf.read(os.path.join(dirpath, filename))

                        # Only the third value returned is kept as the spectrogram
                        _first, _second, spectrogram = create_spectrogram(samples, rate)

                        h5_file[speaker].create_dataset(
                            filename,
                            data=spectrogram.T.astype(np.complex64),
                            compression="gzip",
                            dtype=np.complex64,
                            compression_opts=0)

                print_progress(done, nb_chapters, prefix=label, suffix='Complete')

    print('Dataset for the subset: ' + subset + ' has been built')
def create_raw_audio_dataset(output_fn, subset=config.data_subset, data_root=config.data_root):
    """
    Build an H5 file holding the raw audio of a LibriSpeech subset.

    One H5 group is created per speaker key. Every .flac file found in the
    chapters of that speaker is loaded at 16000 Hz, resampled to config.fs
    and stored in the group as a gzip-compressed dataset named after the file.

    Inputs:
        output_fn: filename for the created file
        subset: LibriSpeech subset : 'dev-clean' , ...
        data_root: LibriSpeech folder path
    """
    from librosa.core import resample, load

    # Metadata of the subset, shaped as:
    # {speaker_key: {chapters: [...], sex:'M/F', ... } }
    metadata = data_tools.read_metadata(subset)

    with h5py.File(output_fn, 'w') as h5_file:
        speakers = tqdm(metadata.items(), total=len(metadata), desc='Speakers')
        for speaker, info in speakers:
            # One H5 group per speaker
            if speaker not in h5_file:
                h5_file.create_group(speaker)

            speaker_dir = data_root + '/' + subset + '/' + speaker

            for chapter in tqdm(info['chapters'], desc='Chapters'):
                # Walk the chapter folder looking for .flac audio
                for dirpath, _subdirs, filenames in os.walk(speaker_dir + '/' + chapter):
                    for filename in tqdm(filenames, desc='Files'):
                        if not filename.endswith(".flac"):
                            continue

                        signal, rate = load(os.path.join(dirpath, filename), sr=16000)
                        signal = resample(signal, rate, config.fs)

                        # The whole signal is stored as a single fixed-size chunk
                        h5_file[speaker].create_dataset(
                            filename,
                            shape=signal.shape,
                            data=signal,
                            chunks=signal.shape,
                            maxshape=signal.shape,
                            compression="gzip",
                            compression_opts=9)

    print('Dataset for the subset: ' + subset + ' has been built')
def __init__(self, ratio=(0.90, 0.05, 0.05), **kwargs):
    """
    Index every chunk of an H5 dataset and split them into three sets.

    Each file of each selected speaker is cut into as many full chunks of
    `chunk_size` as its first dimension allows. The resulting items
    ('key/file/#chunk') are shuffled (numpy is seeded with config.seed) and
    divided into training, validation and test sets, each exposed as a
    TreeIterator (self.train, self.valid, self.test).

    Inputs:
        ratio: ratio for train / valid / test set; only the first two
            values are read, the test set receives the remaining items
        kwargs: Dataset parameters, all required:
            nb_speakers, sex, batch_size, chunk_size, no_random_picking,
            dataset (path of the H5 file, opened read-only)

    Raises:
        ValueError: if kwargs['sex'] is not ["M","F"], ["F","M"], ["M"]
            or ["F"]
        KeyError: if one of the required keyword arguments is missing
    """
    np.random.seed(config.seed)

    self.nb_speakers = kwargs['nb_speakers']
    self.sex = kwargs['sex']
    self.batch_size = kwargs['batch_size']
    self.chunk_size = kwargs['chunk_size']
    self.no_random_picking = kwargs['no_random_picking']

    # Flags for Training/Validation/Testing sets
    self.TRAIN = 0
    self.VALID = 1
    self.TEST = 2

    # Reject invalid input before any metadata or file is read
    if self.sex not in (['M', 'F'], ['F', 'M'], ['M'], ['F']):
        raise ValueError('Sex must be ["M","F"] | ["F","M"] | ["M"] | ["F"]')

    # TODO
    metadata = data_tools.read_metadata()

    # Create a key to speaker index dictionary
    # And count the numbers of speakers
    # (males are always indexed first, whatever the order given in `sex`)
    self.key_to_index = {}
    self.sex_to_keys = {}
    j = 0
    if 'M' in self.sex:
        M = data_tools.males_keys(metadata)
        self.sex_to_keys['M'] = M
        for k in M:
            self.key_to_index[k] = j
            j += 1
    if 'F' in self.sex:
        F = data_tools.females_keys(metadata)
        self.sex_to_keys['F'] = F
        for k in F:
            self.key_to_index[k] = j
            j += 1
    self.tot_speakers = j

    # Kept open on the instance: items are read from it later on
    self.file = h5py.File(kwargs['dataset'], 'r')

    # Define all the items related to each key/speaker
    self.total_items = []
    for key in self.key_to_index:
        for val in self.file[key]:
            # Get one file related to a speaker and check how many chunks
            # can be obtained with the current chunk size
            chunks = self.file['/'.join([key, val])].shape[0] // self.chunk_size
            # Add each possible chunk in the items with the following form:
            # 'key/file/#chunk'
            self.total_items += ['/'.join([key, val, str(i)]) for i in range(chunks)]

    # Shuffle all the items
    np.random.shuffle(self.total_items)
    L = len(self.total_items)

    # Training / Valid / Test Separation
    train_end = int(L * ratio[0])
    valid_end = int(L * (ratio[0] + ratio[1]))
    train = self.create_tree(self.total_items[:train_end])
    valid = self.create_tree(self.total_items[train_end:valid_end])
    test = self.create_tree(self.total_items[valid_end:])

    self.train = TreeIterator(train, self)
    self.valid = TreeIterator(valid, self)
    self.test = TreeIterator(test, self)
self.dico[int(item)] = i def __iter__(self): return self def get_labels(self): return self.dico from data_tools import read_metadata, males_keys, females_keys if __name__ == "__main__": ### ### TEST ### H5_dic = read_metadata() print H5_dic chunk_size = 512*100 males = H5PY_RW('test_raw.h5py', subset = males_keys(H5_dic)) fem = H5PY_RW('test_raw.h5py', subset = females_keys(H5_dic)) print 'Data with', len(H5_dic), 'male and female speakers' print males.length(), 'elements' print fem.length(), 'elements' mixed_data = Mixer([males, fem], chunk_size= chunk_size, with_mask=False, with_inputs=True, shuffling=True) batch_size = 128 mixed_data.adjust_split_size_to_batchsize(batch_size)