def prepare_batch(self, features, texts):
    """ Zero pad a minibatch of feature arrays and encode its texts
    Params:
        features (list(np.array)): One (timesteps, feature_dim) array per
            sample
        texts (list(str)): List of texts corresponding to the features
    Returns:
        dict: Padded inputs, labels and bookkeeping lists (keys are
            described at the return statement)
    """
    assert len(features) == len(texts),\
        "Inputs and outputs to the network must be of the same number"
    input_lengths = [sample.shape[0] for sample in features]
    longest = max(input_lengths)
    nr_channels = features[0].shape[1]
    # Taken from the data itself, so a smaller final batch is handled too
    batch_size = len(features)
    # Each sample is copied to the start of its row; the tail stays zero
    x = np.zeros((batch_size, longest, nr_channels))
    for row, sample in enumerate(features):
        x[row, :sample.shape[0], :] = sample
    y = text_to_int_sequence(texts)
    batch = {
        'x': x,  # 0-padded features, shape (batch_size, longest, nr_channels)
        'y': y,  # result of text_to_int_sequence(texts)
        'sparse_y': sparse_tensor_feed(y),  # sparse feed built from y
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
    }
    return batch
def get_intseq(trans, max_intseq_length=80, pad_value=27):
    """Encode a transcript as an integer sequence of fixed length.

    Params:
        trans (str): Transcript passed to text_to_int_sequence
        max_intseq_length (int): Exact length of the returned sequence
        pad_value (int): Id appended to sequences that are too short.
            Defaults to 27, the value the original code hard-coded
            (presumably the space character -- confirm against the
            character map).
    Returns:
        list(int): The encoded transcript, right-padded with pad_value or
            truncated so that it holds max_intseq_length entries
    """
    seq = list(text_to_int_sequence(trans))
    missing = max_intseq_length - len(seq)
    if missing > 0:
        # Pad in one step instead of appending element by element
        seq.extend([pad_value] * missing)
    return seq[:max_intseq_length]
def get_batch(self, partition):
    """ Obtain a batch of train, validation, or test data
    Params:
        partition (str): One of 'train', 'valid' or 'test'
    Returns:
        tuple: (inputs, outputs). inputs is a dict with 'the_input'
            (zero-padded features), 'the_labels' (integer labels padded
            with 28), 'input_length' and 'label_length' (one row per
            sample); outputs is {'ctc': zeros(minibatch_size)}
    Raises:
        ValueError: If partition is not one of the names listed above
    """
    if partition == 'train':
        audio_paths = self.train_audio_paths
        cur_index = self.cur_train_index
        texts = self.train_texts
    elif partition == 'valid':
        audio_paths = self.valid_audio_paths
        cur_index = self.cur_valid_index
        texts = self.valid_texts
    elif partition == 'test':
        audio_paths = self.test_audio_paths
        # NOTE(review): the other partitions read cur_<name>_index; confirm
        # that test_valid_index is really the attribute the class defines.
        cur_index = self.test_valid_index
        texts = self.test_texts
    else:
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError("Invalid partition. Must be train/valid/test")

    batch_size = self.minibatch_size
    features = [
        self.normalize(self.featurize(path))
        for path in audio_paths[cur_index:cur_index + batch_size]
    ]

    # calculate necessary sizes
    max_length = max(features[i].shape[0] for i in range(batch_size))
    max_string_length = max(
        len(texts[cur_index + i]) for i in range(batch_size))

    # initialize the arrays
    # The bool-like spectrogram flag selects feat_dim or mfcc_dim as the
    # width of the feature axis
    feature_width = (self.feat_dim * self.spectrogram
                     + self.mfcc_dim * (not self.spectrogram))
    X_data = np.zeros([batch_size, max_length, feature_width])
    labels = np.ones([batch_size, max_string_length]) * 28  # blanks
    input_length = np.zeros([batch_size, 1])
    label_length = np.zeros([batch_size, 1])

    for i in range(batch_size):
        # calculate X_data & input_length
        feat = features[i]
        input_length[i] = feat.shape[0]
        X_data[i, :feat.shape[0], :] = feat

        # calculate labels & label_length
        label = np.array(text_to_int_sequence(texts[cur_index + i]))
        labels[i, :len(label)] = label
        label_length[i] = len(label)

    # return the arrays
    outputs = {'ctc': np.zeros([batch_size])}
    inputs = {
        'the_input': X_data,
        'the_labels': labels,
        'input_length': input_length,
        'label_length': label_length
    }
    return (inputs, outputs)
def get_max_intseq(comb):
    """Return the length of the longest encoded transcript in comb.

    Params:
        comb (iterable): Transcripts, each passed to text_to_int_sequence
    Returns:
        int: Largest sequence length seen; 0 when comb is empty or no
            entry could be encoded
    """
    max_intseq_length = 0
    for text in comb:
        try:
            seq_length = len(text_to_int_sequence(text))
        except Exception:
            # Best effort: report the offending entry and keep scanning.
            # Catching Exception (rather than everything) lets Ctrl-C and
            # interpreter shutdown propagate.
            print("error at:", text)
            continue
        if seq_length > max_intseq_length:
            max_intseq_length = seq_length
    return max_intseq_length
def prepare_minibatch(self, audio_paths, texts, durations, arpabets):
    """ Featurize a minibatch of audio, zero pad them and return a dictionary
    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        durations (list): Returned unchanged under 'durations' when
            self.use_durations is set
        arpabets (list): One entry per sample, encoded with
            arpabet_to_int_sequence when self.use_arpabets is set
    Returns:
        dict: See below for contents
    """
    assert len(audio_paths) == len(texts),\
        "Inputs and outputs to the network must be of the same number"
    # Features is a list of (timesteps, feature_dim) arrays
    features = [self.featurize(a) for a in audio_paths]
    input_lengths = [f.shape[0] for f in features]
    max_length = max(input_lengths)
    feature_dim = features[0].shape[1]
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    x = np.zeros((mb_size, max_length, feature_dim))
    y = []
    label_lengths = []
    for i in range(mb_size):
        feat = self.normalize(features[i])  # Center using means and std
        x[i, :feat.shape[0], :] = feat
        label = text_to_int_sequence(text_normalize(texts[i]))
        y.append(label)
        label_lengths.append(len(label))
    # The pad width used to come from the raw text lengths only. Labels are
    # built from the *normalized* text, so a label longer than every raw
    # text would be cut off by truncating='post' while label_lengths still
    # reported its full length. Never pad narrower than the longest label.
    longest_text = len(max(texts, key=len))
    label_width = max(longest_text, max(label_lengths))
    y = pad_sequences(y, maxlen=label_width, dtype='int32',
                      padding='post', truncating='post', value=-1)
    res = {
        'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
        'y': y,  # labels padded with -1 to a common width by pad_sequences
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths  # list(int) Length of each label
        # 'durations' [if use_durations] the durations argument
        # 'phonemes' [if use_arpabets] arpabet ints padded with -1
        # 'phoneme_lengths' [if use_arpabets] list(int) unpadded lengths
    }
    if self.use_durations:
        res['durations'] = durations
    if self.use_arpabets:
        arpints, arpaint_lengths = [], []
        for i in range(mb_size):
            arpaint_seq = arpabet_to_int_sequence(arpabets[i])
            arpints.append(arpaint_seq)
            arpaint_lengths.append(len(arpaint_seq))
        maxlen = len(max(arpints, key=len))
        res['phonemes'] = pad_sequences(arpints, maxlen=maxlen,
                                        dtype='int32', padding='post',
                                        truncating='post', value=-1)
        res['phoneme_lengths'] = arpaint_lengths
    return res
def get_batch(self, partition):
    """ Obtain a batch of train, validation, or test data
    Params:
        partition (str): One of 'train', 'valid' or 'test'
    Returns:
        tuple: (inputs, outputs) where inputs maps 'the_input',
            'the_labels', 'input_length' and 'label_length' to arrays with
            one row per sample, and outputs is {'ctc': zeros}
    Raises:
        ValueError: If partition is not one of the names listed above
    """
    if partition == 'train':
        audio_paths = self.train_audio_paths
        cur_index = self.cur_train_index
        texts = self.train_texts
    elif partition == 'valid':
        audio_paths = self.valid_audio_paths
        cur_index = self.cur_valid_index
        texts = self.valid_texts
    elif partition == 'test':
        audio_paths = self.test_audio_paths
        # NOTE(review): train/valid use cur_<name>_index; verify that
        # test_valid_index exists on the class and is the intended cursor.
        cur_index = self.test_valid_index
        texts = self.test_texts
    else:
        # The old message named "validation", which is not an accepted value
        raise ValueError("Invalid partition. Must be train/valid/test")

    n = self.minibatch_size
    features = [self.normalize(self.featurize(a))
                for a in audio_paths[cur_index:cur_index + n]]

    # calculate necessary sizes
    max_length = max(features[i].shape[0] for i in range(n))
    max_string_length = max(len(texts[cur_index + i]) for i in range(n))

    # initialize the arrays
    # Feature axis is feat_dim when the spectrogram flag is set, else mfcc_dim
    X_data = np.zeros([n, max_length,
                       self.feat_dim*self.spectrogram
                       + self.mfcc_dim*(not self.spectrogram)])
    # Label rows start out filled with 28 and are overwritten from the left
    labels = np.ones([n, max_string_length]) * 28
    input_length = np.zeros([n, 1])
    label_length = np.zeros([n, 1])

    for i in range(n):
        # calculate X_data & input_length
        feat = features[i]
        input_length[i] = feat.shape[0]
        X_data[i, :feat.shape[0], :] = feat

        # calculate labels & label_length
        label = np.array(text_to_int_sequence(texts[cur_index+i]))
        labels[i, :len(label)] = label
        label_length[i] = len(label)

    # return the arrays
    outputs = {'ctc': np.zeros([n])}
    inputs = {'the_input': X_data,
              'the_labels': labels,
              'input_length': input_length,
              'label_length': label_length
              }
    return (inputs, outputs)
def get_batch(self, partition):
    """Build one padded minibatch for the requested partition.

    Params:
        partition (str): One of 'train', 'valid' or 'test'
    Returns:
        tuple: (inputs, outputs). inputs holds 'the_input' (zero-padded
            features), 'the_labels' (labels padded with 28),
            'input_length' and 'label_length'; outputs holds a zero
            vector under 'ctc'
    Raises:
        ValueError: If partition is not one of the names listed above
    """
    if partition == 'train':
        audio_paths = self.train_audio_paths
        cur_index = self.cur_train_index
        texts = self.train_texts
    elif partition == 'valid':
        audio_paths = self.valid_audio_paths
        cur_index = self.cur_valid_index
        texts = self.valid_texts
    elif partition == 'test':
        audio_paths = self.test_audio_paths
        # NOTE(review): naming differs from cur_train_index/cur_valid_index;
        # confirm test_valid_index is the attribute the class maintains.
        cur_index = self.test_valid_index
        texts = self.test_texts
    else:
        # Message now lists the names this method actually accepts
        raise ValueError("Invalid partition. Must be train/valid/test")

    count = self.minibatch_size
    stop = cur_index + count
    features = [
        self.normalize(self.featurize(a)) for a in audio_paths[cur_index:stop]
    ]

    # Sizes of the padded arrays: longest feature array and longest text
    max_length = max(features[i].shape[0] for i in range(count))
    max_string_length = max(len(texts[cur_index + i]) for i in range(count))

    # The spectrogram flag picks feat_dim or mfcc_dim as feature width
    width = (self.feat_dim * self.spectrogram
             + self.mfcc_dim * (not self.spectrogram))
    X_data = np.zeros([count, max_length, width])
    labels = np.ones([count, max_string_length]) * 28
    input_length = np.zeros([count, 1])
    label_length = np.zeros([count, 1])

    for i in range(count):
        feat = features[i]
        input_length[i] = feat.shape[0]
        X_data[i, :feat.shape[0], :] = feat

        label = np.array(text_to_int_sequence(texts[cur_index + i]))
        labels[i, :len(label)] = label
        label_length[i] = len(label)

    outputs = {'ctc': np.zeros([count])}
    inputs = {
        'the_input': X_data,
        'the_labels': labels,
        'input_length': input_length,
        'label_length': label_length
    }
    return (inputs, outputs)
def prepare_minibatch(self, audio_paths, texts, mode):
    """ Featurize a minibatch of audio, zero pad them and return a dictionary
    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        mode: Forwarded unchanged to self.featurize
    Returns:
        dict: See below for contents
    """
    assert len(audio_paths) == len(texts),\
        "Inputs and outputs to the network must be of the same number"
    # Features is a list of (timesteps, feature_dim) arrays
    features = [self.featurize(a, mode) for a in audio_paths]
    input_lengths = [f.shape[0] for f in features]
    max_length = max(input_lengths)
    feature_dim = features[0].shape[1]
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    x = np.zeros((mb_size, max_length, feature_dim))
    encoded = []
    label_lengths = []
    for i in range(mb_size):
        feat = self.normalize(features[i])  # Center using means and std
        x[i, :feat.shape[0], :] = feat
        label = text_to_int_sequence(texts[i])
        encoded.append(label)
        label_lengths.append(len(label))
    # Pad the labels with -1 so they form a (mb_size, max_label_length) array
    max_label_length = max(label_lengths)
    y = np.full((mb_size, max_label_length), -1)
    for i in range(mb_size):
        y[i, :label_lengths[i]] = encoded[i]
    # The flattened copy of the labels that used to be built here was never
    # returned or read, so it has been dropped.
    return {
        'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
        'y': y,  # int array (mb_size, max_label_length), padded with -1
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths  # list(int) Length of each label
    }
def get_batch(self, index, size, audio_paths, texts):
    """Assemble a padded batch from precomputed features.

    Params:
        index (int): Position of the first sample in self.features / texts
        size (int): Number of consecutive samples to include
        audio_paths: Not used by this method
        texts (list(str)): Transcripts; entries index .. index+size-1 are
            encoded as labels
    Returns:
        tuple: (inputs, outputs). inputs holds 'the_input' (normalized,
            zero-padded features), 'the_labels' (label ids shifted down by
            one, padded with 28), 'input_length' (feature lengths mapped
            through conv_output_length) and 'label_length'; outputs is
            {'ctc': zeros(size)}
    """
    # pull necessary info
    max_length = max(self.features[index + i].shape[0] for i in range(size))
    # Width of the label array must come from the same texts the labels are
    # encoded from; it was previously read from self.train_texts, which is
    # only correct when the caller passes that very list.
    max_string_length = max(len(texts[index + i]) for i in range(size))

    # initialize the arrays
    X_data = np.zeros([size, max_length, self.feat_dim])
    labels = np.ones([size, max_string_length]) * 28
    input_length = np.zeros([size, 1])
    label_length = np.zeros([size, 1])

    # populate the arrays
    for i in range(size):
        # X_data, input_length
        feat = self.features[index + i]
        input_length[i] = feat.shape[0]
        feat = self.normalize(feat)
        X_data[i, :feat.shape[0], :] = feat

        # y, label_length (ids are shifted down by one)
        label = np.array(text_to_int_sequence(texts[index + i])) - 1
        labels[i, :len(label)] = label
        label_length[i] = len(label)

    # Convert each raw feature length with conv_output_length
    # (filter 11, 'valid' border, stride 2)
    input_length = np.array([
        conv_output_length(length, filter_size=11, border_mode='valid',
                           stride=2)
        for length in input_length
    ])

    outputs = {'ctc': np.zeros([size])}
    inputs = {
        'the_input': X_data,  # (size, max_length, self.feat_dim)
        'the_labels': labels,  # (size, max_string_length)
        'input_length': input_length,  # one entry per sample
        'label_length': label_length  # (size, 1)
    }
    return (inputs, outputs)
def get_maxseq_len(trans):
    """Return how many entries text_to_int_sequence produces for trans."""
    return len(text_to_int_sequence(trans))
labels = np.ones([size, max_string_length]) * 28 input_length = np.zeros([size, 1]) label_length = np.zeros([size, 1]) for i in range(0, size): # X_data, input_length feat = audio_gen.features[index + i] feat = audio_gen.normalize(feat) input_length[i] = conv_output_length(max_length, filter_size=11, border_mode='valid', stride=2) X_data[i, :feat.shape[0], :] = feat # y, label_length label = np.array(text_to_int_sequence( audio_gen.train_texts[index + i])) - 1 labels[i, :len(label)] = label label_length[i] = 133 def decode_batch(test_func, audio): out = test_func([audio])[0] ret = [] for j in range(out.shape[0]): out_best = list(np.argmax(out[j, :], 1)) out_best = [k for k, g in itertools.groupby(out_best)] # 26 is space, 27 is CTC blank char outstr = '' for c in out_best: if c >= 0 and c < 26: outstr += chr(c + ord('a'))
def load_metadata_from_desc_file(self, desc_file, partition='train',
                                 max_duration=10.0):
    """ Read metadata from the description file
        (possibly takes long, depending on the filesize)
    Params:
        desc_file (str): Path to a JSON-line file that contains labels and
            paths to the audio files
        partition (str): One of 'train', 'validation' or 'test'
        max_duration (float): In seconds, the maximum duration of
            utterances to train or test on
    Raises:
        ValueError: If partition is not one of the three names above
    """
    # Fail fast: the partition used to be checked only after the whole file
    # (including one spectrogram per line) had been processed.
    if partition not in ('train', 'validation', 'test'):
        raise ValueError("Invalid partition to load metadata. "
                         "Must be train/validation/test")
    logger.info('Reading description file: {} for partition: {}'
                .format(desc_file, partition))
    audio_paths, durations, texts, arpabets = [], [], [], []
    with open(desc_file, encoding='utf-8') as json_line_file:
        for line_num, json_line in enumerate(json_line_file):
            try:
                spec = json.loads(json_line)
                duration = float(spec['duration'])
                if duration > max_duration:
                    continue
                textlen = len(
                    text_to_int_sequence(text_normalize(spec['text'])))
                speclen = len(spectrogram_from_file(spec['key']))
                # Skip samples whose label is longer than their features
                if textlen > speclen:
                    print('label > feats ignore setence')
                    continue
                # Skip samples whose label has fewer than two entries
                if textlen < 2:
                    print('small label ignore setence')
                    continue
                # Read every field before appending anything, so a missing
                # key cannot leave the parallel lists with unequal lengths.
                key, text = spec['key'], spec['text']
                arpabet = spec['arpabet'] if self.use_arpabets else None
            except Exception as e:
                # Best effort: a bad line is logged and skipped.
                # Change to (KeyError, ValueError) or
                # (KeyError,json.decoder.JSONDecodeError), depending on
                # json module version
                logger.warning('Error reading line #{}: {}'
                               .format(line_num, json_line))
                logger.warning(str(e))
                continue
            audio_paths.append(key)
            durations.append(duration)
            texts.append(text)
            if self.use_arpabets:
                arpabets.append(arpabet)

    if not self.use_arpabets:
        # Keep arpabets aligned with the other lists
        arpabets = [''] * len(audio_paths)

    if partition == 'train':
        self.train_audio_paths = audio_paths
        self.train_durations = durations
        self.train_texts = texts
        self.train_arpabets = arpabets
    elif partition == 'validation':
        self.val_audio_paths = audio_paths
        self.val_durations = durations
        self.val_texts = texts
        self.val_arpabets = arpabets
    else:  # 'test' -- the only value left after the check at the top
        self.test_audio_paths = audio_paths
        self.test_durations = durations
        self.test_texts = texts
        self.test_arpabets = arpabets