def __init__(self, config):
    """Load the acoustic model and build the featurizers used for decoding.

    Args:
        config: mapping that is passed to ``AM`` and must contain the
            ``'speech_config'`` and ``'decoder_config'`` entries.
    """
    acoustic_model = AM(config)
    acoustic_model.load_model(False)
    self.am = acoustic_model

    speech_cfg = config['speech_config']
    text_cfg = config['decoder_config']
    self.speech_config = speech_cfg
    self.text_config = text_cfg
    self.speech_feature = SpeechFeaturizer(speech_cfg)
    self.text_featurizer = TextFeaturizer(text_cfg)

    # The decoded sequence starts out holding only the start token.
    start_token = self.text_featurizer.start
    self.decoded = tf.constant([start_token])
def __init__(self, config, training=True):
    """Store the configuration and index the train or eval file list.

    Args:
        config: mapping read for ``'batch_size'``, ``'max_abs_value'`` and
            either ``'train_list'`` or ``'eval_list'``; it is also handed to
            ``SpeechFeaturizer``.
        training: selects ``'train_list'`` when true, ``'eval_list'``
            otherwise, and is forwarded to ``make_file_list``.
    """
    self.config = config
    self.batch = config['batch_size']

    list_key = 'train_list' if training else 'eval_list'
    self.make_file_list(self.config[list_key], training=training)

    self.epochs = 1
    self.steps = 0
    self.speech_featurizer = SpeechFeaturizer(config)

    # Padding value sits 0.1 below the negated maximum absolute value.
    max_abs = self.config['max_abs_value']
    self._target_pad = -(max_abs + 0.1)
def __init__(self, config):
    """Resolve the configuration sections and build both featurizers.

    Args:
        config: mapping holding ``'speech_config'``, ``'model_config'`` and
            the decoder section (``'decoder3_config'`` for the
            ``'MultiTask'`` model type, ``'decoder_config'`` otherwise).
    """
    self.config = config
    self.update_model_type()
    self.speech_config = self.config['speech_config']

    # The multi-task model reads its text settings from the third decoder.
    if self.model_type == 'MultiTask':
        decoder_key = 'decoder3_config'
    else:
        decoder_key = 'decoder_config'
    self.text_config = self.config[decoder_key]

    self.model_config = self.config['model_config']
    self.text_feature = TextFeaturizer(self.text_config, True)
    self.speech_feature = SpeechFeaturizer(self.speech_config)
    self.init_steps = None
def __init__(self, config):
    """Resolve the configuration sections and build both featurizers.

    Args:
        config: mapping holding ``'speech_config'``, ``'model_config'`` and
            either ``'decoder_config'`` or ``'decoder4_config'``.

    Raises:
        KeyError: if neither decoder section (or another required section)
            is present in ``config``.
    """
    self.config = config
    self.update_model_type()
    self.speech_config = self.config['speech_config']

    # Only a missing key should trigger the fallback; a bare ``except`` would
    # also hide unrelated failures and trap KeyboardInterrupt/SystemExit.
    try:
        self.text_config = self.config['decoder_config']
    except KeyError:
        self.text_config = self.config['decoder4_config']

    self.model_config = self.config['model_config']
    self.text_feature = TextFeaturizer(self.text_config)
    self.speech_feature = SpeechFeaturizer(self.speech_config)
    self.init_steps = None
def __init__(self, config_dict, training=True):
    """Read the loader configuration and prepare featurizers and file lists.

    Args:
        config_dict: mapping with ``'speech_config'``, ``'decoder_config'``,
            ``'augments_config'`` and
            ``'learning_config' -> 'running_config' -> 'batch_size'``.
        training: selects ``'train_list'`` (true) or ``'eval_list'`` (false)
            from the speech config and is forwarded to ``make_file_list``.
    """
    speech_cfg = config_dict['speech_config']
    text_cfg = config_dict['decoder_config']
    augment_cfg = config_dict['augments_config']
    running_cfg = config_dict['learning_config']['running_config']

    self.speech_config = speech_cfg
    self.text_config = text_cfg
    self.augment_config = augment_cfg
    self.batch = running_cfg['batch_size']

    self.speech_featurizer = SpeechFeaturizer(self.speech_config)
    self.text_featurizer = TextFeaturizer(self.text_config)

    list_key = 'train_list' if training else 'eval_list'
    self.make_file_list(self.speech_config[list_key], training)

    self.augment = Augmentation(self.augment_config)
    self.init_text_to_vocab()

    self.epochs = 1
    self.LAS = False
    self.steps = 0
def __init__(self, config_dict, training=True):
    """Read the three-decoder loader configuration and prepare its helpers.

    Args:
        config_dict: mapping with ``'speech_config'``, ``'decoder1_config'``
            to ``'decoder3_config'``, ``'augments_config'`` and
            ``'learning_config' -> 'running_config' -> 'batch_size'``; it is
            also passed to ``make_maps``.
        training: selects ``'train_list'`` (true) or ``'eval_list'`` (false)
            from the speech config and is forwarded to ``make_file_list``.
    """
    self.speech_config = config_dict['speech_config']
    self.text1_config = config_dict['decoder1_config']
    self.text2_config = config_dict['decoder2_config']
    self.text3_config = config_dict['decoder3_config']
    self.augment_config = config_dict['augments_config']

    running_cfg = config_dict['learning_config']['running_config']
    self.batch = running_cfg['batch_size']

    self.speech_featurizer = SpeechFeaturizer(self.speech_config)
    # One text featurizer per decoder head.
    self.token1_featurizer = TextFeaturizer(self.text1_config)
    self.token2_featurizer = TextFeaturizer(self.text2_config)
    self.token3_featurizer = TextFeaturizer(self.text3_config)

    list_key = 'train_list' if training else 'eval_list'
    self.make_file_list(self.speech_config[list_key], training)
    self.make_maps(config_dict)

    self.augment = Augmentation(self.augment_config)
    self.epochs = 1
    self.steps = 0
def __init__(self, config=None, vocoder_config=None):
    """Configure the acoustic and/or vocoder side from the given configs.

    At least one of the two configurations must be supplied.

    Args:
        config: optional acoustic configuration; when given, its
            ``'model_name'`` is stored and a ``TextFeaturizer`` is built.
        vocoder_config: optional vocoder configuration; when given, its
            ``'vocoder_model'`` is stored and it is used to build the
            mel-inversion function instead of ``config``.
    """
    assert not (config is None and vocoder_config is None), 'must one config'

    self.config = config
    if config is not None:
        self.acoustic = config['model_name']

    self.vocoder_config = vocoder_config
    if vocoder_config is None:
        # No vocoder settings: derive the mel inversion from the acoustic config.
        self.GL = SpeechFeaturizer(config).inv_mel_spectrogram
        self.vocoder_type = None
    else:
        self.GL = SpeechFeaturizer(vocoder_config).inv_mel_spectrogram
        self.vocoder_type = vocoder_config['vocoder_model']

    if self.config is not None:
        self.text_featurizer = TextFeaturizer(config)
class AM_DataLoader():
    """Batch generator that pairs audio features with pinyin token labels.

    Each entry of the file list is a ``"<wav path>\\t<text>"`` line. Training
    batches are drawn from the least-picked samples (tracked in
    ``pick_index``); evaluation batches are read sequentially through
    ``eval_data_generator`` or sampled at random through ``generate(False)``.
    """

    def __init__(self, config_dict,training=True):
        """Read the configuration and prepare featurizers and file lists.

        Args:
            config_dict: mapping with ``'speech_config'``,
                ``'decoder_config'``, ``'augments_config'`` and
                ``'learning_config' -> 'running_config' -> 'batch_size'``.
            training: selects ``'train_list'`` (true) or ``'eval_list'``
                (false) from the speech config.
        """
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS=False
        self.steps = 0

    def load_state(self,outdir):
        """Restore ``pick_index`` (and the derived epoch) from ``outdir``.

        A missing or unreadable state file is reported on stdout and the
        current state is kept.
        """
        try:
            self.pick_index=np.load(os.path.join(outdir,'dg_state.npy')).flatten().tolist()
            self.epochs=1+int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except Exception:
            # Best effort: any other load problem falls back to the init state
            # without trapping KeyboardInterrupt/SystemExit.
            print('load state falied,use init state')

    def save_state(self,outdir):
        """Write ``pick_index`` to ``<outdir>/dg_state.npy``."""
        np.save(os.path.join(outdir,'dg_state.npy'),np.array(self.pick_index))

    def return_data_types(self):
        """Return the dtypes of the items yielded by ``generator``.

        In LAS mode the generator yields five items (features, input length,
        labels, label length, guide matrix), so five dtypes are returned to
        match ``return_data_shape``.
        """
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        """Return the ``tf.TensorShape`` of each item yielded by ``generator``."""
        f,c=self.speech_featurizer.compute_feature_dim()
        if self.speech_config['use_mel_layer']:
            feature_shape = tf.TensorShape([None, None, 1])
        else:
            feature_shape = tf.TensorShape([None, None, f, c])
        shapes = (
            feature_shape,
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )
        if self.LAS:
            # Extra entry for the guided-attention matrix.
            shapes = shapes + (tf.TensorShape([None, None, None]),)
        return shapes

    def get_per_epoch_steps(self):
        """Number of whole training batches in ``train_list``."""
        return len(self.train_list)//self.batch

    def eval_per_epoch_steps(self):
        """Number of whole evaluation batches in ``test_list``."""
        return len(self.test_list)//self.batch

    def init_text_to_vocab(self):
        """Register custom pinyin phrases and install ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']],
                                    '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            # Keep the first reading that pypinyin returns for every unit.
            return [i[0] for i in pypinyin.pinyin(txt)]

        self.text_to_vocab = text_to_vocab_func

    def augment_data(self, wavs, label, label_length):
        """Augment a batch of waveforms and re-extract their features.

        Samples whose reduced frame count is shorter than their label are
        dropped.

        Returns:
            ``None`` when augmentation is unavailable, otherwise the tuple
            ``(features, wavs, input_length, label, label_length)``.
        """
        if not self.augment.available():
            return None
        reduction = self.speech_config['reduction_factor']
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            frames = speech_feature.shape[0] // reduction
            if frames < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(frames)
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i, mel in enumerate(mels):
            if mel.shape[0] < max_input:
                # Pad the time axis with the feature's own minimum value.
                pad = np.ones([max_input - mel.shape[0], mel.shape[1], mel.shape[2]]) * mel.min()
                mels[i] = np.vstack((mel, pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list,training=True):
        """Read the sample list and split it for training or evaluation.

        Blank lines are dropped after stripping, so they can never reach the
        ``split('\\t')`` performed on every sample later on. When training,
        the first 99% of the lines become the shuffled ``train_list`` and the
        rest the ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = [line.strip() for line in f]
        data = [line for line in data if line]
        num = len(data)
        if training:
            split = int(num * 0.99)
            self.train_list = data[:split]
            self.test_list = data[split:]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list=data
            self.offset=0

    def only_chinese(self, word):
        """Return ``word`` with every character outside U+4E00..U+9FFF removed."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def _build_example(self, wp, txt, augment=False):
        """Turn one (wav path, text) pair into model inputs.

        Args:
            wp: path of the audio file.
            txt: transcript text.
            augment: when true the waveform is passed through
                ``self.augment.process`` and the per-sample skip messages
                (other than the load failure) are suppressed.

        Returns:
            ``(speech_feature, in_len, text_feature_array)`` or ``None`` when
            the sample has to be skipped.
        """
        try:
            data = self.speech_featurizer.load_wav(wp)
        except Exception:
            print('load data failed' if augment else '{} load data failed'.format(wp))
            return None
        if len(data) < 400:
            return None
        max_duration = self.speech_config['wav_max_duration']
        if len(data) > self.speech_featurizer.sample_rate * max_duration:
            if not augment:
                print('{} duration out of wav_max_duration({})'.format(wp, max_duration))
            return None
        if augment:
            data = self.augment.process(data)
        if self.speech_config['only_chinese']:
            txt = self.only_chinese(txt)
        if self.speech_config['use_mel_layer']:
            # Raw, peak-normalised waveform; the frame count is derived from
            # the stride in samples.
            speech_feature = data / np.abs(data).max()
            speech_feature = np.expand_dims(speech_feature, -1)
            in_len = len(speech_feature) // (
                self.speech_config['reduction_factor'] *
                (self.speech_featurizer.sample_rate / 1000) *
                self.speech_config['stride_ms'])
        else:
            speech_feature = self.speech_featurizer.extract(data)
            in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
        py = self.text_to_vocab(txt)
        if not self.check_valid(py, self.text_featurizer.vocab_array):
            if not augment:
                print(' {} txt pinyin {} not all in tokens,continue'.format(txt, py))
            return None
        text_feature = self.text_featurizer.extract(py)
        if in_len < len(text_feature):
            if not augment:
                print('{} feature length < pinyin length,continue'.format(wp))
            return None
        return speech_feature, in_len, np.array(text_feature)

    def _collate(self, examples):
        """Pad the accepted examples to common lengths and stack them."""
        speech_features = [ex[0] for ex in examples]
        input_length = [ex[1] for ex in examples]
        y1 = [ex[2] for ex in examples]
        label_length1 = [len(label) for label in y1]
        # Maxima come from accepted samples only, so rejected samples can
        # never inflate the padded length.
        max_input = max((len(feature) for feature in speech_features), default=0)
        max_label1 = max(label_length1, default=0)
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i, feature in enumerate(speech_features):
                if feature.shape[0] < max_input:
                    pad = np.ones([max_input - feature.shape[0],
                                   feature.shape[1],
                                   feature.shape[2]]) * feature.min()
                    speech_features[i] = np.vstack((feature, pad))
        for i, label in enumerate(y1):
            if label.shape[0] < max_label1:
                pad = np.ones(max_label1 - label.shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((label, pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def eval_data_generator(self):
        """Return the next sequential evaluation batch and advance ``offset``.

        Returns:
            ``(x, input_length, y1, label_length1)`` as numpy arrays.
        """
        sample=self.test_list[self.offset:self.offset+self.batch]
        self.offset+=self.batch
        examples = []
        for line in sample:
            wp, txt = line.strip().split('\t')
            # Evaluation transcripts have their spaces removed before use.
            example = self._build_example(wp, txt.replace(' ',''))
            if example is not None:
                examples.append(example)
        return self._collate(examples)

    def check_valid(self,txt,vocab_list):
        """Return True when ``txt`` is non-empty and every item is in ``vocab_list``."""
        if len(txt)==0:
            return False
        return all(n in vocab_list for n in txt)

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        """Return the (N, T) float32 matrix ``1 - exp(-(t/T - n/N)^2 / (2 g^2))``."""
        n = np.arange(N, dtype=np.float64).reshape(-1, 1)
        t = np.arange(T, dtype=np.float64).reshape(1, -1)
        W = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W.astype(np.float32)

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        """Build one padded guided-attention target per sample.

        Every target starts as a ``[inputs_shape, mel_target_shape]`` array of
        -1; rows past the input length (within the target length) are set to
        1 and the top-left corner receives the guided-attention matrix.
        """
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one training or evaluation batch.

        When training, the least-picked samples are preferred and their pick
        counters are incremented; if augmentation is available each chosen
        sample is additionally added in augmented form.

        Returns:
            ``(x, input_length, y1, label_length1)`` as numpy arrays.
        """
        if train:
            batch=self.batch if self.augment.available() else self.batch*2
            indexs = np.argsort(self.pick_index)[:batch]
            # Never request more samples than exist (random.sample would raise).
            indexs = random.sample(indexs.tolist(), min(batch//2, len(indexs)))
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs =1+ int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, min(self.batch, len(self.test_list)))

        examples = []
        for line in sample:
            wp, txt = line.strip().split('\t')
            example = self._build_example(wp, txt)
            if example is not None:
                examples.append(example)
        if train and self.augment.available():
            for line in sample:
                wp, txt = line.strip().split('\t')
                example = self._build_example(wp, txt, augment=True)
                if example is not None:
                    examples.append(example)
        return self._collate(examples)

    def generator(self,train=True):
        """Yield batches forever, skipping empty ones.

        In LAS mode a guided-attention matrix is appended to every batch.
        """
        while 1:
            x, input_length, labels, label_length=self.generate(train)
            if x.shape[0]==0:
                print('load data length zero,continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length, np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length,guide_matrix
            else:
                yield x, input_length, labels, label_length
class MultiTask_DataLoader():
    """Batch generator producing audio, BERT features and four label streams.

    Each entry of the file list is a ``"<wav path>\\t<text>"`` line. For every
    accepted sample the loader emits speech features, the waveform, BERT
    features of the text, and word / phone / pinyin / character label
    sequences.
    """

    # Longest accepted recording, in seconds.
    MAX_WAV_SECONDS = 7
    # Width of the BERT feature vectors used for padding and shapes.
    BERT_DIM = 768

    def __init__(self, config_dict,training=True):
        """Read the configuration and prepare featurizers, maps and BERT.

        Args:
            config_dict: mapping with ``'speech_config'``,
                ``'decoder1_config'`` to ``'decoder4_config'``,
                ``'augments_config'``, ``'map_path'``, ``'bert'`` and
                ``'learning_config' -> 'running_config' -> 'batch_size'``.
            training: selects ``'train_list'`` (true) or ``'eval_list'``
                (false) from the speech config.
        """
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.text4_config = config_dict['decoder4_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.token4_featurizer = TextFeaturizer(self.text4_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.LAS=True
        self.steps = 0
        self.init_bert(config_dict)

    def load_state(self,outdir):
        """Restore ``pick_index`` (and the derived epoch) from ``outdir``.

        A missing or unreadable state file is reported on stdout and the
        current state is kept.
        """
        try:
            self.pick_index=np.load(os.path.join(outdir,'dg_state.npy')).flatten().tolist()
            self.epochs=1+int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except Exception:
            # Best effort: any other load problem falls back to the init state
            # without trapping KeyboardInterrupt/SystemExit.
            print('load state falied,use init state')

    def save_state(self,outdir):
        """Write ``pick_index`` to ``<outdir>/dg_state.npy``."""
        np.save(os.path.join(outdir,'dg_state.npy'),np.array(self.pick_index))

    def load_bert(self, config, checkpoint):
        """Load a non-trainable BERT model from ``config`` and ``checkpoint``."""
        model = load_trained_model_from_checkpoint(config, checkpoint, trainable=False, seq_len=None)
        return model

    def init_bert(self,config):
        """Create the BERT tokenizer and model from ``config['bert']``."""
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.load_bert(bert_config, bert_checkpoint)

    def bert_decode(self, x):
        """Tokenize every item of ``x`` and return ``(tokens, segments)`` lists."""
        tokens, segs = [], []
        for i in x:
            t, s = self.bert_token.encode(''.join(i))
            tokens.append(t)
            segs.append(s)
        return tokens, segs

    def get_bert_feature(self, bert_t, bert_s):
        """Run BERT on every (tokens, segments) pair.

        Only the features of the first pair are returned, with their first
        position dropped.
        """
        f = []
        for t, s in zip(bert_t, bert_s):
            t = np.expand_dims(np.array(t), 0)
            s = np.expand_dims(np.array(s), 0)
            feature = self.bert.predict([t, s])
            f.append(feature[0])
        return f[0][1:]

    def return_data_types(self):
        """Return the dtypes of the thirteen items yielded by ``generator``."""
        return (tf.float32, tf.float32, tf.float32,tf.int32, tf.int32,tf.int32,tf.int32,tf.int32,tf.int32,tf.int32,tf.int32, tf.int32,tf.float32)

    def return_data_shape(self):
        """Return the ``tf.TensorShape`` of each item yielded by ``generator``."""
        f,c=self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None,None,f,c]),
            tf.TensorShape([None,None,1]),
            tf.TensorShape([None, None, self.BERT_DIM]),
            tf.TensorShape([None,]),
            tf.TensorShape([None,None]),
            tf.TensorShape([None,]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None,None,None])
        )

    def get_per_epoch_steps(self):
        """Number of whole training batches in ``train_list``."""
        return len(self.train_list)//self.batch

    def eval_per_epoch_steps(self):
        """Number of whole evaluation batches in ``test_list``."""
        return len(self.test_list)//self.batch

    def make_maps(self,config):
        """Build ``py_map`` (text -> pinyin) and ``phone_map`` (text -> phones).

        Both map files hold ``"<key>\\t<value>"`` lines; blank lines are
        ignored. Multi-syllable pinyin entries also register each character
        of the key with its own syllable.
        """
        with open(config['map_path']['pinyin'],encoding='utf-8') as f:
            data = [line.strip() for line in f]
        self.py_map={}
        for line in data:
            if not line:
                continue
            key,py=line.split('\t')
            self.py_map[key]=py
            syllables = py.split(' ')
            if len(syllables)>1:
                for i,j in zip(list(key),syllables):
                    self.py_map[i]=j

        with open(config['map_path']['phone'],encoding='utf-8') as f:
            data = [line.strip() for line in f]
        self.phone_map={}
        phone_map={}
        for line in data:
            if not line:
                continue
            key,py=line.split('\t')
            phone_map[key]=py
        for key, key_py in self.py_map.items():
            if len(key)>1:
                self.phone_map[key]=' '.join(phone_map[n] for n in key_py.split(' '))
            else:
                self.phone_map[key]=phone_map[key_py]

    def map(self,txt):
        """Segment ``txt`` and return its ``(pinyins, phones, letters)`` lists.

        Segmented words found in ``py_map`` are mapped as a whole; any other
        word is mapped character by character.
        """
        cut=lcut(txt)
        pys=[]
        phones=[]
        words=[]
        for i in cut:
            word=i.word
            if word in self.py_map:
                py=self.py_map[word]
                phone=self.phone_map[word]
                pys+=py.split(' ')
                phones+=phone.split(' ')
                words+=list(''.join(py.split(' ')))
            else:
                for j in word:
                    pys+=[self.py_map[j]]
                    phones+=self.phone_map[j].split(' ')
                    words+=list(''.join(self.py_map[j]))
        return pys,phones,words

    def augment_data(self, wavs, label, label_length):
        """Augment a batch of waveforms and re-extract their features.

        Samples whose reduced frame count is shorter than their label are
        dropped.

        Returns:
            ``None`` when augmentation is unavailable, otherwise the tuple
            ``(features, wavs, input_length, label, label_length)``.
        """
        if not self.augment.available():
            return None
        reduction = self.speech_config['reduction_factor']
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            frames = speech_feature.shape[0] // reduction
            if frames < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(frames)
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i, mel in enumerate(mels):
            if mel.shape[0] < max_input:
                # Pad the time axis with the feature's own minimum value.
                pad = np.ones([max_input - mel.shape[0], mel.shape[1], mel.shape[2]]) * mel.min()
                mels[i] = np.vstack((mel, pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list,training=True):
        """Read the sample list and split it for training or evaluation.

        Blank lines are dropped after stripping, so they can never reach the
        ``split('\\t')`` performed on every sample later on. When training,
        the first 99% of the lines become the shuffled ``train_list`` and the
        rest the ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = [line.strip() for line in f]
        data = [line for line in data if line]
        num = len(data)
        if training:
            split = int(num * 0.99)
            self.train_list = data[:split]
            self.test_list = data[split:]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list=data
            self.offset=0

    def only_chinese(self, word):
        """Return True when every character of ``word`` is in U+4E00..U+9FFF."""
        return all('\u4e00' <= ch <= '\u9fff' for ch in word)

    def _build_example(self, wp, txt, augment=False, check_all_labels=True):
        """Turn one (wav path, text) pair into model inputs.

        Args:
            wp: path of the audio file.
            txt: transcript text.
            augment: pass the waveform through ``self.augment.process``.
            check_all_labels: when true, empty phone/word mappings and
                phone/word labels longer than the reduced input also reject
                the sample; when false only the pinyin labels are checked
                (the behaviour ``eval_data_generator`` has always had).

        Returns:
            ``(speech_feature, wav, bert_feature, word, phone, py, txt)``
            features, or ``None`` when the sample has to be skipped.
        """
        try:
            data = self.speech_featurizer.load_wav(wp)
        except Exception:
            print('load data failed')
            return None
        if len(data) < 400 or len(data) > self.speech_featurizer.sample_rate * self.MAX_WAV_SECONDS:
            return None
        if not self.only_chinese(txt):
            return None
        if augment:
            data = self.augment.process(data)
        speech_feature = self.speech_featurizer.extract(data)
        py, phone, word = self.map(txt)
        if len(py) == 0:
            return None
        if check_all_labels and (len(phone) == 0 or len(word) == 0):
            return None
        e_bert_t, e_bert_s = self.bert_decode([txt])
        bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
        word_text_feature = self.token1_featurizer.extract(word)
        phone_text_feature = self.token2_featurizer.extract(phone)
        py_text_feature = self.token3_featurizer.extract(py)
        txt_text_feature = self.token4_featurizer.extract(list(txt))
        frames = speech_feature.shape[0] / self.speech_config['reduction_factor']
        too_short = frames < len(py_text_feature)
        if check_all_labels:
            too_short = (too_short or
                         frames < len(word_text_feature) or
                         frames < len(phone_text_feature))
        if too_short:
            return None
        return (speech_feature, data, bert_feature, word_text_feature,
                phone_text_feature, py_text_feature, txt_text_feature)

    def _collate(self, examples):
        """Pad the accepted examples to common lengths and stack them."""
        mels = [ex[0] for ex in examples]
        wavs = [ex[1] for ex in examples]
        bert_features = [ex[2] for ex in examples]
        words_label = [np.array(ex[3]) for ex in examples]
        phone_label = [np.array(ex[4]) for ex in examples]
        py_label = [np.array(ex[5]) for ex in examples]
        txt_label = [np.array(ex[6]) for ex in examples]
        input_length = [ex[0].shape[0] // self.speech_config['reduction_factor'] for ex in examples]
        words_label_length = [len(ex[3]) for ex in examples]
        phone_label_length = [len(ex[4]) for ex in examples]
        py_label_length = [len(ex[5]) for ex in examples]
        txt_label_length = [len(ex[6]) for ex in examples]

        # Maxima come from accepted samples only, so rejected samples can
        # never inflate the padded lengths.
        max_input = max((mel.shape[0] for mel in mels), default=0)
        max_wav = max((len(wav) for wav in wavs), default=0)
        max_label_words = max(words_label_length, default=0)
        max_label_phone = max(phone_label_length, default=0)
        max_label_py = max(py_label_length, default=0)
        max_label_txt = max(txt_label_length, default=0)

        for i, mel in enumerate(mels):
            if mel.shape[0] < max_input:
                pad = np.ones([max_input - mel.shape[0], mel.shape[1], mel.shape[2]]) * mel.min()
                mels[i] = np.vstack((mel, pad))
        for i, feature in enumerate(bert_features):
            if feature.shape[0] < max_label_txt:
                pading = np.ones([max_label_txt - len(feature), self.BERT_DIM]) * -10.
                bert_features[i] = np.vstack((feature, pading))

        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)

        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return x, wavs, bert_features,input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length, txt_label, txt_label_length

    def eval_data_generator(self):
        """Return the next sequential evaluation batch and advance ``offset``.

        Returns:
            The twelve-item tuple ``(x, wavs, bert_features, input_length,
            words_label, words_label_length, phone_label, phone_label_length,
            py_label, py_label_length, txt_label, txt_label_length)``.
        """
        sample=self.test_list[self.offset:self.offset+self.batch]
        self.offset+=self.batch
        examples = []
        for line in sample:
            wp, txt = line.strip().split('\t')
            example = self._build_example(wp, txt, check_all_labels=False)
            if example is not None:
                examples.append(example)
        return self._collate(examples)

    def pad(self,words_label,max_label_words):
        """Right-pad every label array in place to ``max_label_words``.

        NOTE(review): the pad id of ``token1_featurizer`` is used for all four
        label streams; confirm the featurizers share the same pad id.
        """
        for i, label in enumerate(words_label):
            if label.shape[0] < max_label_words:
                pad = np.ones(max_label_words - label.shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((label, pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        """Return the (N, T) float32 matrix ``1 - exp(-(t/T - n/N)^2 / (2 g^2))``."""
        n = np.arange(N, dtype=np.float64).reshape(-1, 1)
        t = np.arange(T, dtype=np.float64).reshape(1, -1)
        W = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W.astype(np.float32)

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        """Build one padded guided-attention target per sample.

        Every target starts as a ``[inputs_shape, mel_target_shape]`` array of
        -1; rows past the input length (within the target length) are set to
        1 and the top-left corner receives the guided-attention matrix.
        """
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one training or evaluation batch.

        When training, the least-picked samples are preferred and their pick
        counters are incremented; if augmentation is available each chosen
        sample is additionally added in augmented form.

        Returns:
            The same twelve-item tuple as ``eval_data_generator``.
        """
        if train:
            batch=self.batch if self.augment.available() else self.batch*2
            indexs = np.argsort(self.pick_index)[:batch]
            # Never request more samples than exist (random.sample would raise).
            indexs = random.sample(indexs.tolist(), min(batch//2, len(indexs)))
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1+int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, min(self.batch, len(self.test_list)))

        examples = []
        for line in sample:
            wp, txt = line.strip().split('\t')
            example = self._build_example(wp, txt)
            if example is not None:
                examples.append(example)
        if train and self.augment.available():
            for line in sample:
                wp, txt = line.strip().split('\t')
                example = self._build_example(wp, txt, augment=True)
                if example is not None:
                    examples.append(example)
        return self._collate(examples)

    def generator(self,train=True):
        """Yield batches forever, each extended with a guided-attention matrix.

        Empty batches are skipped because the maxima needed for the guide
        matrix cannot be computed on them.
        """
        while 1:
            x, wavs,bert_feature, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length, txt_label, txt_label_length=self.generate(train)
            if x.shape[0]==0:
                print('load data length zero,continue')
                continue
            guide_matrix = self.guided_attention(input_length, txt_label_length, np.max(input_length),
                                                 txt_label_length.max())
            yield x, wavs, bert_feature,input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length, txt_label, txt_label_length,guide_matrix
class StreamingASR(object):
    """Frame-by-frame greedy decoding on top of a loaded ``AM`` model.

    The token history lives in ``self.decoded``. It is initialised to the
    start token in ``__init__`` and is *not* reset between calls, so every
    call continues from whatever was decoded before.
    """

    # Sizes (in samples) used by ``predict_stack_buffer`` to slice the wav.
    BUFFER_SAMPLES = 16000
    BUFFER_OVERLAP = 5000

    def __init__(self, config):
        """Load the acoustic model for inference and build the featurizers.

        Args:
            config: mapping providing at least ``speech_config`` and
                ``decoder_config``; it is also handed to ``AM``.
        """
        self.am = AM(config)
        self.am.load_model(False)
        self.speech_config = config['speech_config']
        self.text_config = config['decoder_config']
        self.speech_feature = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.decoded = tf.constant([self.text_featurizer.start])

    def _encode(self, data):
        """Run ``data`` (a loaded waveform) through features and the encoder."""
        model = self.am.model
        if model.mel_layer is None:
            # No in-graph mel layer: extract features on the host.
            mel = self.speech_feature.extract(data)
            x = np.expand_dims(mel, 0)
        else:
            mel = data.reshape([1, -1, 1])
            x = model.mel_layer(mel)
        return model.encoder(x)

    def _greedy_step(self, enc_frame):
        """Decode one encoder frame and extend ``self.decoded`` if needed.

        Returns:
            The emitted token id as ``int``, or ``None`` when the prediction
            was index 0 or the blank token (nothing is appended then).
        """
        model = self.am.model
        enc = tf.reshape(enc_frame, [1, 1, -1])
        y = model.predict_net(inputs=tf.reshape(self.decoded, [1, -1]),
                              p_memory_states=None,
                              training=False)
        y = y[:, -1:]  # only the newest prediction-network output is used
        z = model.joint_net([enc, y], training=False)
        log_probs = tf.squeeze(tf.nn.log_softmax(z))
        pred = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        pred = tf.reshape(pred, [1])
        # Compare plain ints instead of relying on tensor truthiness.
        token_id = int(pred.numpy()[0])
        if token_id == 0 or token_id == self.text_featurizer.blank:
            return None
        self.decoded = tf.concat([self.decoded, pred], axis=0)
        return token_id

    def stream_detect(self, inputs):
        """Encode the whole wav at ``inputs`` and decode it frame by frame."""
        data = self.speech_feature.load_wav(inputs)
        x = self._encode(data)
        for i in range(x.shape[1]):
            self.step_decode(x[:, i])

    def step_decode(self, step_input):
        """Decode a single encoder frame and print the token, if one is emitted."""
        token_id = self._greedy_step(step_input)
        if token_id is not None:
            print("pred: {}".format(
                self.text_featurizer.index_to_token[token_id]))

    def predict_stack_buffer(self, wavfile):
        """Decode ``wavfile`` buffer by buffer, printing every emitted token.

        Only ``len(data) // BUFFER_SAMPLES`` buffers are processed, so a
        trailing remainder shorter than one buffer is never decoded.
        """
        data = self.speech_feature.load_wav(wavfile)
        buffer_step = len(data) // self.BUFFER_SAMPLES
        stride = self.BUFFER_SAMPLES - self.BUFFER_OVERLAP
        for j in range(buffer_step):
            # NOTE(review): the start advances by `stride` while the end
            # advances by BUFFER_SAMPLES, so buffers grow with j. This is
            # kept exactly as before - confirm it is intended.
            buffer = data[j * stride:(j + 1) * self.BUFFER_SAMPLES]
            x = self._encode(buffer)
            for i in range(x.shape[1]):
                token_id = self._greedy_step(x[:, i])
                if token_id is not None:
                    print("buffer_step: {}, "
                          "step: {}, "
                          "pred: {}".format(
                              j, i,
                              self.text_featurizer.index_to_token[token_id]))
block_outputs=self.block(fc_outputs,training=training) outputs+=self.out_cnn(block_outputs,training=training) return outputs if __name__ == '__main__': from utils.user_config import UserConfig from utils.text_featurizers import TextFeaturizer from utils.speech_featurizers import SpeechFeaturizer import os import time os.environ['CUDA_VISIBLE_DEVICES']='1' config=UserConfig(r'D:\TF2-ASR\configs\am_data.yml',r'D:\TF2-ASR\configs\conformer.yml') config['decoder_config'].update({'model_type':'LAS'}) Tfer=TextFeaturizer(config['decoder_config']) SFer=SpeechFeaturizer(config['speech_config']) f,c=SFer.compute_feature_dim() config['model_config']['LAS_decoder'].update({'n_classes': Tfer.num_classes}) config['model_config']['LAS_decoder'].update({'startid': Tfer.start}) ct=ConformerLAS(config['model_config'],training=False) # ct.add_featurizers(Tfer) x=tf.ones([1,300,f,c]) length=tf.constant([300]) out=ct._build([1,300,f,c],training=True) ct.inference(x,length//4) s=time.time() a=ct.inference(x,length//4) e=time.time() print(e-s,a) # ct.summary()
class TacotronDataLoader():
    """Batch provider for Tacotron training/evaluation.

    Every list-file line is ``wav_path<TAB>text<TAB>speaker_id``.
    """

    def __init__(self, config, training=True):
        """Build featurizers and load the file list named in ``config``.

        Args:
            config: flat mapping; the keys read in this class are
                ``batch_size``, ``train_list``/``eval_list``,
                ``max_abs_value``, ``balance_spk_utts``,
                ``outputs_per_step`` and ``num_mels``.
            training: choose ``train_list`` (True) or ``eval_list`` (False).
        """
        self.speech_featurizer = SpeechFeaturizer(config)
        self.text_featurizer = TextFeaturizer(config)
        self.config = config
        self.batch = config['batch_size']
        self.make_file_list(self.config['train_list']
                            if training else self.config['eval_list'],
                            training=training)
        self.min_value = -self.config['max_abs_value']
        self._target_pad = -(self.config['max_abs_value'] + 0.1)
        self._token_pad = 1.
        self.epochs = 1
        self.steps = 0

    def make_file_list(self, wav_list, training=True):
        """Read ``wav_list`` and populate the train/test lists and offsets.

        In training mode the first 95% of lines become ``train_list`` and the
        rest ``test_list``; optionally each speaker's utterances are repeated
        so speakers are roughly balanced. In eval mode all lines go to
        ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            stripped = [line.strip() for line in f]
        # Filter AFTER stripping: raw lines still carry '\n', so comparing the
        # raw line with '' never removed blank lines.
        data = [line for line in stripped if line]
        num = len(data)
        if training:
            split = int(num * 0.95)
            self.train_list = data[:split]
            self.test_list = data[split:]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list{}'.format(
                len(self.train_list), len(self.test_list)))
            # The emptiness guard avoids max() over an empty sequence.
            if self.config['balance_spk_utts'] and self.train_list:
                spk_utt = {}
                for line in self.train_list:
                    _, _, spk = line.strip().split('\t')
                    spk_utt.setdefault(spk, []).append(line)
                maxlen = max(len(utts) for utts in spk_utt.values())
                self.train_list = []
                for utts in spk_utt.values():
                    if len(utts) < maxlen:
                        factor = int(np.rint(maxlen / len(utts)))
                    else:
                        factor = 1
                    self.train_list += utts * factor
                np.random.shuffle(self.train_list)
                logging.info('balance spk utts: train list {}'.format(
                    len(self.train_list)))
        else:
            self.test_list = data
            self.offset = 0
            # generate(train=False) reads test_offset; it used to be missing
            # in eval mode.
            self.test_offset = 0

    def get_per_epoch_steps(self):
        """Number of full training batches in one pass over ``train_list``."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in one pass over ``test_list``."""
        return len(self.test_list) // self.batch

    def return_data_types(self):
        """Dtypes of the tuple yielded by ``generator``."""
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (tf.int32, tf.int32, tf.float32, tf.int32, tf.float32,
                tf.int32, tf.float32)

    def return_data_shape(self):
        """Shapes of the tuple yielded by ``generator``."""
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None, self.config['num_mels']]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, None]),
        )

    def GuidedAttention(self, N, T, g=0.5):
        """Return the ``(N, T)`` float32 guided-attention penalty matrix.

        ``W[n, t] = 1 - exp(-(t/T - n/N)**2 / (2*g*g))``, computed with
        broadcasting instead of a nested Python loop.
        """
        W = np.zeros((N, T), dtype=np.float32)
        if N == 0 or T == 0:
            return W
        n_pos = np.arange(N, dtype=np.float64)[:, None] / float(N)
        t_pos = np.arange(T, dtype=np.float64)[None, :] / float(T)
        W[...] = 1 - np.exp(-(t_pos - n_pos) ** 2 / (2 * g * g))
        return W

    def make_Att_targets(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        """Build per-sample guided-attention targets and masks.

        Args:
            input_length: per-sample token counts.
            targets_length: per-sample mel frame counts.
            inputs_shape: size of the token axis of every output matrix.
            mel_target_shape: mel frame count; it is floor-divided by
                ``outputs_per_step`` to get the decoder-step axis.

        Returns:
            ``(att_targets, att_mask)``, both float32 with shape
            ``(batch, inputs_shape, mel_target_shape // outputs_per_step)``.
        """
        att_targets = []
        att_mask = []
        mel_target_shape //= self.config['outputs_per_step']
        for i, j in zip(input_length, targets_length):
            step = int(j / self.config['outputs_per_step'])
            pad = np.zeros([inputs_shape, mel_target_shape])
            pad[i:, :step] = 1  # rows past the real tokens get the full penalty
            maskpad = np.zeros([inputs_shape, mel_target_shape])
            maskpad[:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.1)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
            att_mask.append(maskpad)
        att_targets = np.array(att_targets)
        att_mask = np.array(att_mask)
        return att_targets.astype('float32'), att_mask.astype('float32')

    def load_state(self, outdir):
        """Restore epoch/offset from ``outdir/dg_state.npz`` (best effort).

        If the saved train list has a different length than the current one,
        the counters are reset instead. Failures are logged, never raised.
        """
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal new load train list ,data loader use init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            # Deliberately best effort, but let KeyboardInterrupt/SystemExit
            # propagate.
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch, train offset and train list to ``outdir/dg_state.npz``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def _load_sample(self, line):
        """Turn one list-file line into features, or ``None`` if it is unusable.

        Returns:
            ``(wav_path, wav, target_mel, text_tokens, speaker_id)`` or
            ``None`` after logging why the line was skipped.
        """
        wav_path, text, spkid = line.strip().split('\t')
        try:
            wav = self.speech_featurizer.load_wav(wav_path)
            target_mel = self.speech_featurizer.melspectrogram(wav)
        except Exception:
            logging.info('{} load data failed , skip'.format(wav_path))
            return None
        try:
            text_tokens = self.text_featurizer.extract(text)
        except Exception:
            logging.info('{} to token failed,skip'.format(text))
            return None
        try:
            speaker_id = self.text_featurizer.spker_map[spkid]
        except Exception:
            logging.info('{} not in spk map,skip'.format(spkid))
            return None
        return wav_path, wav, target_mel, text_tokens, speaker_id

    def extractor(self, ):
        """Yield padded batches over ``train_list + test_list``, once.

        Iteration continues from ``self.train_offset``. Each item is
        ``(charactor, char_length, mel, mel_length, speaker, audios, names)``.
        """
        data = self.train_list + self.test_list
        while self.train_offset < len(data):
            charactor, char_length, mel, mel_length, speaker = [], [], [], [], []
            audios = []
            names = []
            for _ in range(self.batch * 10):
                # Check BEFORE reading so the last entry is processed too.
                if self.train_offset >= len(data):
                    break
                line = data[self.train_offset]
                self.train_offset += 1
                loaded = self._load_sample(line)
                if loaded is None:
                    continue
                wav_path, wav, target_mel, text_tokens, speaker_id = loaded
                audios.append(wav)
                names.append(os.path.split(wav_path)[-1].replace('.wav', ''))
                charactor.append(np.array(text_tokens))
                char_length.append(len(text_tokens))
                mel.append(target_mel)
                mel_length.append(len(target_mel))
                speaker.append([speaker_id])
                if len(charactor) == self.batch:
                    break
            if not charactor:
                # Every candidate in this window was skipped; nothing to pad.
                continue
            output_per_step = self.config['outputs_per_step']
            charactor = self._prepare_inputs(charactor)
            char_length = np.array(char_length, 'int32')
            mel = self._prepare_targets(mel, output_per_step)
            mel_length = np.array(mel_length, 'int32')
            speaker = np.array(speaker, 'int32')
            T = mel.shape[1] * self.speech_featurizer.hop_size
            audios = tf.keras.preprocessing.sequence.pad_sequences(
                audios, T, 'float32', 'post', 'post')
            yield charactor, char_length, mel, mel_length, speaker, audios, names

    def generate(self, train=True):
        """Assemble one padded batch from the train or test list.

        Up to ``batch * 10`` lines are tried; unusable ones are skipped.

        Returns:
            ``(charactor, char_length, mel, mel_length, stop_gts, speaker)``.
            When no line could be loaded, zero-length arrays are returned so
            the caller's ``shape[0] == 0`` check can handle it.
        """
        charactor, char_length, mel, mel_length, stop_gts, speaker = [], [], [], [], [], []
        for _ in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    # End of epoch: restart on a reshuffled list.
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            loaded = self._load_sample(line)
            if loaded is None:
                continue
            _, _, target_mel, text_tokens, speaker_id = loaded
            # Stop target is 0 for every real frame but the last; the padding
            # value (_token_pad) marks "stop" from there on.
            token_target = np.zeros(len(target_mel) - 1)
            charactor.append(np.array(text_tokens))
            char_length.append(len(text_tokens))
            mel.append(target_mel)
            mel_length.append(len(target_mel))
            stop_gts.append(token_target)
            speaker.append([speaker_id])
            if len(charactor) == self.batch:
                break
        if not charactor:
            return (np.zeros((0, 0), 'int32'), np.zeros((0, ), 'int32'),
                    np.zeros((0, 0, 0), 'float32'), np.zeros((0, ), 'int32'),
                    np.zeros((0, 0), 'float32'), np.zeros((0, 1), 'int32'))
        output_per_step = self.config['outputs_per_step']
        charactor = self._prepare_inputs(charactor)
        char_length = np.array(char_length, 'int32')
        mel = self._prepare_targets(mel, output_per_step)
        mel_length = np.array(mel_length, 'int32')
        stop_gts = self._prepare_token_targets(stop_gts, output_per_step)
        speaker = np.array(speaker, 'int32')
        return charactor, char_length, mel, mel_length, stop_gts, speaker

    def _prepare_inputs(self, inputs):
        """Right-pad token sequences to the longest one and stack them."""
        max_len = max(len(x) for x in inputs)
        return np.stack([self._pad_input(x, max_len) for x in inputs])

    def _prepare_targets(self, targets, alignment):
        """Pad mel targets to the longest length rounded up to ``alignment``."""
        padded_len = self._round_up(max(len(t) for t in targets), alignment)
        return np.stack([self._pad_target(t, padded_len) for t in targets])

    def _prepare_token_targets(self, targets, alignment):
        """Pad stop-token targets to ``max_len + 1`` rounded up to ``alignment``."""
        padded_len = self._round_up(
            max(len(t) for t in targets) + 1, alignment)
        return np.stack(
            [self._pad_token_target(t, padded_len) for t in targets])

    def _pad_input(self, x, length):
        """Right-pad a 1-D token array with the text pad id."""
        return np.pad(x, (0, length - x.shape[0]),
                      mode='constant',
                      constant_values=self.text_featurizer.pad)

    def _pad_target(self, t, length):
        """Pad a 2-D mel target along its first axis with ``_target_pad``."""
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
                      mode='constant',
                      constant_values=self._target_pad)

    def _pad_token_target(self, t, length):
        """Right-pad a 1-D stop target with ``_token_pad``."""
        return np.pad(t, (0, length - t.shape[0]),
                      mode='constant',
                      constant_values=self._token_pad)

    def _round_down(self, x, multiple):
        """Largest multiple of ``multiple`` that is <= ``x``."""
        remainder = x % multiple
        return x if remainder == 0 else x - remainder

    def _round_up(self, x, multiple):
        """Smallest multiple of ``multiple`` that is >= ``x``."""
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def generator(self, train=True):
        """Endlessly yield batches plus their guided-attention matrix."""
        while 1:
            charactor, char_length, mel, mel_length, stop_gts, speaker = self.generate(
                train)
            if charactor.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            guide_matrix, _ = self.make_Att_targets(char_length, mel_length,
                                                    np.max(char_length),
                                                    np.max(mel_length))
            yield (charactor.astype('int32'), char_length.astype('int32'),
                   mel.astype('float32'), mel_length.astype('int32'),
                   stop_gts.astype('float32'), speaker.astype('int32'),
                   guide_matrix.astype('float32'))
class AM_DataLoader():
    """Batch provider for acoustic-model training/evaluation.

    Every list-file line is ``wav_path<TAB>transcript``. Transcripts are
    converted to pinyin with ``pypinyin`` and then to ids by the text
    featurizer.
    """

    def __init__(self, config_dict, training=True):
        """Build featurizers, augmentation and the file list.

        Args:
            config_dict: mapping with ``speech_config``, ``decoder_config``,
                ``augments_config`` and
                ``learning_config.running_config.batch_size``.
            training: choose ``train_list`` (True) or ``eval_list`` (False)
                from ``speech_config``.
        """
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.streaming = self.speech_config['streaming']
        # Streaming chunk size in samples.
        self.chunk = self.speech_config['sample_rate'] * self.speech_config[
            'streaming_bucket']
        self.batch = config_dict['learning_config']['running_config'][
            'batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(
            self.speech_config['train_list']
            if training else self.speech_config['eval_list'], training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        """Restore epoch/offset from ``outdir/dg_state.npz`` (best effort).

        If the saved train list has a different length than the current one,
        the counters are reset instead. Failures are logged, never raised.
        """
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal new load train list ,data loader use init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            # Deliberately best effort, but let KeyboardInterrupt/SystemExit
            # propagate.
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch, train offset and train list to ``outdir/dg_state.npz``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        """Dtypes of the tuple yielded by ``generator``."""
        types = (tf.float32, tf.int32, tf.int32, tf.int32)
        if self.LAS:
            types += (tf.float32, )  # guided-attention matrix
        return types

    def return_data_shape(self):
        """Shapes of the tuple yielded by ``generator``."""
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.speech_config['use_mel_layer']:
            speech_shape = tf.TensorShape([None, None, 1])
        else:
            speech_shape = tf.TensorShape([None, None, f, c])
        shapes = (speech_shape, tf.TensorShape([None, ]),
                  tf.TensorShape([None, None]), tf.TensorShape([None, ]))
        if self.LAS:
            shapes += (tf.TensorShape([None, None, None]), )
        return shapes

    def get_per_epoch_steps(self):
        """Number of full training batches in one pass over ``train_list``."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in one pass over ``test_list``."""
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        """Register pronunciation overrides and set ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            # pypinyin returns one candidate list per unit; keep the first.
            return [candidates[0] for candidates in pypinyin.pinyin(txt)]

        self.text_to_vocab = text_to_vocab_func

    def make_file_list(self, wav_list, training=True):
        """Read ``wav_list`` and populate the train/test lists and offsets.

        In training mode the first 99% of lines become ``train_list`` and the
        rest ``test_list``; in eval mode all lines go to ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            stripped = [line.strip() for line in f]
        # Filter AFTER stripping: raw lines still carry '\n', so comparing the
        # raw line with '' never removed blank lines.
        data = [line for line in stripped if line]
        num = len(data)
        if training:
            split = int(num * 0.99)
            self.train_list = data[:split]
            self.test_list = data[split:]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list{}'.format(
                len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0
            # generate(train=False) reads test_offset; it used to be missing
            # in eval mode.
            self.test_offset = 0

    def only_chinese(self, word):
        """Keep only characters in the U+4E00..U+9FFF range."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def check_valid(self, txt, vocab_list):
        """Check that every token of ``txt`` is in ``vocab_list``.

        Returns:
            ``True`` when all tokens are known, ``False`` for empty ``txt``,
            otherwise the first unknown token. Compare the result with
            ``is True``, because an unknown token is itself truthy.
        """
        if len(txt) == 0:
            return False
        for token in txt:
            if token not in vocab_list:
                return token
        return True

    def _samples_per_output_frame(self):
        """Waveform samples consumed per encoder output frame."""
        return self.speech_config['reduction_factor'] * (
            self.speech_featurizer.sample_rate /
            1000) * self.speech_config['stride_ms']

    def _chunk_times(self):
        """Encoder output frames per streaming chunk, rounded up."""
        reduce = self._samples_per_output_frame()
        chunk_times = self.chunk // reduce
        if self.chunk % reduce != 0:
            chunk_times += 1
        return chunk_times

    def _streaming_padded_length(self, max_input):
        """Length the streaming batch is padded to.

        This always adds one whole chunk, even when ``max_input`` is already
        a multiple of the chunk size.
        """
        return max_input // self.chunk * self.chunk + self.chunk

    def _wav_out_of_range(self, data):
        """True when ``data`` exceeds ``wav_max_duration`` seconds."""
        return len(data) > (self.speech_featurizer.sample_rate *
                            self.speech_config['wav_max_duration'])

    def _extract_speech(self, data):
        """Return ``(speech_feature, in_len)`` for one loaded waveform.

        ``in_len`` is the encoder output length implied by the features.
        """
        if not self.speech_config['use_mel_layer']:
            speech_feature = self.speech_featurizer.extract(data)
            in_len = int(speech_feature.shape[0] //
                         self.speech_config['reduction_factor'])
            return speech_feature, in_len
        if not self.streaming:
            peak = np.abs(data).max()
            # Silent audio would divide by zero and fill the batch with NaN.
            speech_feature = data / peak if peak > 0 else data
            speech_feature = np.expand_dims(speech_feature, -1)
            in_len = len(speech_feature) // self._samples_per_output_frame()
        else:
            speech_feature = np.expand_dims(data, -1)
            in_len = len(speech_feature) // self.chunk
            if len(speech_feature) % self.chunk != 0:
                in_len += 1
            in_len *= self._chunk_times()
        return speech_feature, in_len

    def _pad_speech(self, speech_features, max_input):
        """Pad every feature in the batch to a common length."""
        if self.speech_config['use_mel_layer']:
            if self.streaming:
                max_input = self._streaming_padded_length(max_input)
            return self.speech_featurizer.pad_signal(speech_features,
                                                     max_input)
        for idx, feature in enumerate(speech_features):
            if feature.shape[0] < max_input:
                # Pad with the feature's own minimum value.
                pad = np.ones([
                    max_input - feature.shape[0], feature.shape[1],
                    feature.shape[2]
                ]) * feature.min()
                speech_features[idx] = np.vstack((feature, pad))
        return speech_features

    def _pad_labels(self, labels, max_label):
        """Right-pad every label array to ``max_label`` with the pad id."""
        for idx, label in enumerate(labels):
            if label.shape[0] < max_label:
                pad = np.ones(max_label -
                              label.shape[0]) * self.text_featurizer.pad
                labels[idx] = np.hstack((label, pad))
        return labels

    def eval_data_generator(self):
        """Return the next evaluation batch starting at ``self.offset``.

        Returns:
            ``(x, input_length, y1, label_length1)``; samples that fail to
            load or validate are skipped, so the batch may be smaller than
            ``self.batch``.
        """
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for line in sample:
            wp, txt = line.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                logging.info('{} wav too short < 25ms,skip'.format(wp))
                continue
            elif self._wav_out_of_range(data):
                logging.info(
                    '{} duration out of wav_max_duration({}) ,skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._extract_speech(data)
            py = self.text_to_vocab(txt)
            valid = self.check_valid(py, self.text_featurizer.vocab_array)
            if valid is not True:
                logging.info(' {} txt pinyin {} not all in tokens,skip'.format(
                    txt, valid))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length,skip'.format(wp))
                continue
            # max_input is only updated for samples that are actually kept.
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        speech_features = self._pad_speech(speech_features, max_input)
        y1 = self._pad_labels(y1, max_label1)
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        """Return the ``(N, T)`` float32 guided-attention penalty matrix.

        ``W[n, t] = 1 - exp(-(t/T - n/N)**2 / (2*g*g))``, computed with
        broadcasting instead of a nested Python loop.
        """
        W = np.zeros((N, T), dtype=np.float32)
        if N == 0 or T == 0:
            return W
        n_pos = np.arange(N, dtype=np.float64)[:, None] / float(N)
        t_pos = np.arange(T, dtype=np.float64)[None, :] / float(T)
        W[...] = 1 - np.exp(-(t_pos - n_pos) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        """Build per-sample guided-attention targets.

        Each matrix has shape ``(inputs_shape, mel_target_shape)`` and is
        filled with ``-1``; rows past the sample's input length get ``1``
        over its target columns, and the top-left block holds the
        guided-attention matrix itself.
        """
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one padded batch from the train or test list.

        When training with augmentation available, only half a batch of
        clean samples is collected and each of them is added a second time
        after ``self.augment.process``.

        Returns:
            ``(x, input_length, y1, label_length1)``.
        """
        sample = []
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        for _ in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    # End of epoch: restart on a reshuffled list.
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif self._wav_out_of_range(data):
                logging.info(
                    '{} duration out of wav_max_duration({}),skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._extract_speech(data)
            py = self.text_to_vocab(txt)
            valid = self.check_valid(py, self.text_featurizer.vocab_array)
            if valid is not True:
                logging.info(
                    ' {} txt pinyin {} not all in tokens,continue'.format(
                        txt, valid))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length,continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for line in sample:
                wp, txt = line.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    continue
                if len(data) < 400:
                    logging.info('{} wav too short < 25ms,skip'.format(wp))
                    continue
                elif self._wav_out_of_range(data):
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                speech_feature, in_len = self._extract_speech(data)
                py = self.text_to_vocab(txt)
                # check_valid returns the offending token (truthy) on
                # failure, so `not check_valid(...)` never rejected anything.
                if self.check_valid(
                        py, self.text_featurizer.vocab_array) is not True:
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer'] and self.streaming:
            # Clip lengths to what the chunk-padded batch can produce.
            max_in_len = self._streaming_padded_length(max_input) // self.chunk
            max_in_len *= self._chunk_times()
            input_length = np.clip(input_length, 0, max_in_len)
        speech_features = self._pad_speech(speech_features, max_input)
        y1 = self._pad_labels(y1, max_label1)
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        """Endlessly yield batches; adds a guide matrix when ``self.LAS``."""
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length,
                                                     label_length,
                                                     np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
class AM():
    """Builds, loads and runs an acoustic model described by ``config``.

    The concrete network is selected by ``config['model_config']['name']``.
    """

    def __init__(self, config):
        """Resolve the model type and create the text/speech featurizers."""
        self.config = config
        self.update_model_type()
        self.speech_config = self.config['speech_config']
        if self.model_type != 'MultiTask':
            self.text_config = self.config['decoder_config']
        else:
            self.text_config = self.config['decoder3_config']
        self.model_config = self.config['model_config']
        self.text_feature = TextFeaturizer(self.text_config, True)
        self.speech_feature = SpeechFeaturizer(self.speech_config)
        self.init_steps = None

    def update_model_type(self):
        """Derive ``self.model_type`` from the model name.

        Also writes the matching ``model_type`` into the decoder config(s).
        Streaming models must be Conformer models with
        ``speech_config['streaming']`` set to True; all others need it False.
        """
        name = self.config['model_config']['name']
        streaming = self.config['speech_config']['streaming']
        if 'Streaming' in name:
            assert streaming is True, \
                'streaming model requires speech_config.streaming = True'
            assert 'Conformer' in name, \
                'only Conformer models support streaming'
        else:
            assert streaming is False, \
                'non-streaming model requires speech_config.streaming = False'
        if 'CTC' in name and 'Multi' not in name:
            self.config['decoder_config'].update({'model_type': 'CTC'})
            self.model_type = 'CTC'
        elif 'Multi' in name:
            for key in ('decoder1_config', 'decoder2_config',
                        'decoder3_config', 'decoder_config'):
                self.config[key].update({'model_type': 'CTC'})
            self.model_type = 'MultiTask'
        elif 'LAS' in name:
            self.config['decoder_config'].update({'model_type': 'LAS'})
            self.model_type = 'LAS'
        else:
            self.config['decoder_config'].update({'model_type': 'Transducer'})
            self.model_type = 'Transducer'

    def conformer_model(self, training):
        """Instantiate the Conformer variant named in ``model_config``.

        Raises:
            ValueError: if the name is not a supported Conformer model.
        """
        from AMmodel.streaming_conformer import StreamingConformerCTC, StreamingConformerTransducer
        from AMmodel.conformer import ConformerCTC, ConformerLAS, ConformerTransducer
        self.model_config.update(
            {'vocabulary_size': self.text_feature.num_classes})
        name = self.model_config['name']
        if name == 'ConformerTransducer':
            # pop with a default so building twice does not raise KeyError.
            self.model_config.pop('LAS_decoder', None)
            self.model_config.pop('enable_tflite_convertible', None)
            self.model_config.update({'speech_config': self.speech_config})
            self.model = ConformerTransducer(**self.model_config)
        elif name == 'ConformerCTC':
            self.model_config.update({'speech_config': self.speech_config})
            self.model = ConformerCTC(**self.model_config)
        elif name == 'ConformerLAS':
            self.config['model_config']['LAS_decoder'].update(
                {'n_classes': self.text_feature.num_classes})
            self.config['model_config']['LAS_decoder'].update(
                {'startid': self.text_feature.start})
            self.model = ConformerLAS(
                self.config['model_config'],
                training=training,
                enable_tflite_convertible=self.config['model_config']
                ['enable_tflite_convertible'],
                speech_config=self.speech_config)
        elif name == 'StreamingConformerCTC':
            self.model_config.update({'speech_config': self.speech_config})
            self.model = StreamingConformerCTC(**self.model_config)
        elif name == 'StreamingConformerTransducer':
            self.model_config.pop('enable_tflite_convertible', None)
            self.model_config.update({'speech_config': self.speech_config})
            self.model = StreamingConformerTransducer(**self.model_config)
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError('not in supported model list')

    def ds2_model(self, training):
        """Instantiate the DeepSpeech2 variant named in ``model_config``.

        Raises:
            ValueError: if the name is not a supported DeepSpeech2 model.
        """
        from AMmodel.deepspeech2 import DeepSpeech2CTC, DeepSpeech2LAS, DeepSpeech2Transducer
        self.model_config['Transducer_decoder'][
            'vocabulary_size'] = self.text_feature.num_classes
        f, c = self.speech_feature.compute_feature_dim()
        input_shape = [None, f, c]
        self.model_config.update({'input_shape': input_shape})
        self.model_config.update(
            {'dmodel': self.model_config['rnn_conf']['rnn_units']})
        name = self.model_config['name']
        if name == 'DeepSpeech2Transducer':
            self.model_config.pop('LAS_decoder', None)
            self.model_config.pop('enable_tflite_convertible', None)
            self.model = DeepSpeech2Transducer(
                input_shape,
                self.model_config,
                speech_config=self.speech_config)
        elif name == 'DeepSpeech2CTC':
            self.model = DeepSpeech2CTC(input_shape,
                                        self.model_config,
                                        self.text_feature.num_classes,
                                        speech_config=self.speech_config)
        elif name == 'DeepSpeech2LAS':
            self.model_config['LAS_decoder'].update(
                {'n_classes': self.text_feature.num_classes})
            self.model_config['LAS_decoder'].update(
                {'startid': self.text_feature.start})
            self.model = DeepSpeech2LAS(
                self.model_config,
                input_shape,
                training=training,
                enable_tflite_convertible=self.
                model_config['enable_tflite_convertible'],
                speech_config=self.speech_config)
        else:
            raise ValueError('not in supported model list')

    def multi_task_model(self, training):
        """Instantiate the multi-task Conformer CTC model.

        The class counts of the three decoder vocabularies are written into
        ``model_config`` first.
        """
        from AMmodel.MultiConformer import ConformerMultiTaskCTC
        token1_feature = TextFeaturizer(self.config['decoder1_config'])
        token2_feature = TextFeaturizer(self.config['decoder2_config'])
        token3_feature = TextFeaturizer(self.config['decoder3_config'])
        self.model_config.update({
            'classes1': token1_feature.num_classes,
            'classes2': token2_feature.num_classes,
            'classes3': token3_feature.num_classes,
        })
        self.model = ConformerMultiTaskCTC(self.model_config,
                                           training=training,
                                           speech_config=self.speech_config)

    def load_model(self, training=True):
        """Create ``self.model``; for inference also build it and load weights.

        With ``training=False`` the model is built on a dummy input shape,
        its serving function is created and the newest checkpoint is loaded.
        """
        name = self.model_config['name']
        if 'Multi' in name:
            self.multi_task_model(training)
        elif 'Conformer' in name:
            self.conformer_model(training)
        else:
            self.ds2_model(training)
        self.model.add_featurizers(self.text_feature)
        f, c = self.speech_feature.compute_feature_dim()
        if training:
            return
        is_las = self.text_config['model_type'] == 'LAS'
        if self.model.mel_layer is not None:
            # Raw-waveform input: 16000 samples, or two chunks when streaming.
            if self.speech_config['streaming'] is False:
                wav_len = 16000
            else:
                wav_len = self.model.chunk_size * 2
            build_shape = [3, wav_len, 1]
            pb_shape = [None, None, 1]
        else:
            # Feature input; the LAS path builds with batch 2 instead of 3.
            build_shape = [2 if is_las else 3, 80, f, c]
            pb_shape = [None, None, f, c]
        if is_las:
            self.model._build(build_shape, training)
        else:
            self.model._build(build_shape)
        self.model.return_pb_function(pb_shape)
        self.load_checkpoint(self.config)

    def convert_to_pb(self, export_path):
        """Export the model as a SavedModel with ``recognize_pb`` as signature."""
        import tensorflow as tf
        concrete_func = self.model.recognize_pb.get_concrete_function()
        tf.saved_model.save(self.model, export_path, signatures=concrete_func)

    def decode_result(self, word):
        """Map token ids to tokens, stopping at the first stop id."""
        de = []
        for i in word:
            if i == self.text_feature.stop:
                break
            de.append(self.text_feature.index_to_token[int(i)])
        return de

    def predict(self, fp):
        """Run recognition on the audio file ``fp`` and return the raw result.

        Paths containing ``.pcm`` are read as raw int16 samples and scaled by
        1/32768; anything else goes through the speech featurizer's
        ``load_wav``. Streaming models are fed chunk by chunk and print
        per-chunk timings.
        """
        if '.pcm' in fp:
            data = np.fromfile(fp, 'int16')
            data = np.array(data, 'float32')
            data /= 32768
        else:
            data = self.speech_feature.load_wav(fp)
        if self.model.mel_layer is None:
            mel = self.speech_feature.extract(data)
            mel = np.expand_dims(mel, 0)
            input_length = np.array(
                [[mel.shape[1] // self.model.time_reduction_factor]], 'int32')
        else:
            mel = data.reshape([1, -1, 1])
            input_length = np.array([[
                mel.shape[1] // self.model.time_reduction_factor //
                (self.speech_config['sample_rate'] *
                 self.speech_config['stride_ms'] / 1000)
            ]], 'int32')
        if not self.speech_config['streaming']:
            return self.model.recognize_pb(mel, input_length)[0]

        chunk_size = self.model.chunk_size
        if mel.shape[1] % chunk_size != 0:
            # Zero-pad up to a whole number of chunks.
            T = mel.shape[1] // chunk_size * chunk_size + chunk_size
            pad_T = T - mel.shape[1]
            mel = np.hstack((mel, np.zeros([1, pad_T, 1])))
        mel = mel.reshape([1, -1, chunk_size, 1])
        mel = mel.astype('float32')
        enc_outputs = None
        if 'CTC' in self.model_type:
            result = []
            for i in range(mel.shape[1]):
                input_wav = mel[:, i]
                es = time.time()
                enc_output = self.model.extract_feature(input_wav)
                ee = time.time()
                enc_output = enc_output.numpy()
                if enc_outputs is not None:
                    enc_outputs = np.hstack((enc_outputs, enc_output))
                else:
                    enc_outputs = enc_output
                ds = time.time()
                # Re-decode everything encoded so far.
                result_ = self.model.ctc_decode(
                    enc_outputs, np.array([[enc_outputs.shape[1]]], 'int32'))
                de = time.time()
                print('extract cost time:', ee - es, 'ctc decode time:',
                      de - ds)
            # Keep the decode of the full sequence; -1 entries are dropped.
            result_ = result_.numpy()[0]
            for n in result_:
                if n != -1:
                    result.append(n)
            result = np.array(result)
            result = np.expand_dims(result, 0)
        else:
            states, result = self.model.initial_states(mel)
            start_B = 0
            for i in range(mel.shape[1]):
                input_wav = mel[:, i]
                es = time.time()
                enc_output = self.model.extract_feature(input_wav)
                ee = time.time()
                enc_output = enc_output.numpy()
                if enc_outputs is not None:
                    enc_outputs = np.hstack((enc_outputs, enc_output))
                else:
                    enc_outputs = enc_output
                ds = time.time()
                result, states, start_B = self.model.perform_greedy(
                    enc_outputs, states, result, start_B)
                de = time.time()
                print('extract cost time:', ee - es, 'lstm decode time:',
                      de - ds, 'next start T:', int(start_B))
            result = result.numpy()
            result = np.hstack((result, np.ones([1])))
        return result

    def load_checkpoint(self, config):
        """Load the checkpoint with the highest step number.

        Checkpoints live in ``<outdir>/checkpoints`` and are named
        ``<prefix>_<step>[.h5]``. Entries without an integer step suffix are
        ignored.

        Raises:
            FileNotFoundError: if no usable checkpoint file exists.
        """
        self.checkpoint_dir = os.path.join(
            config['learning_config']['running_config']["outdir"],
            "checkpoints")
        candidates = []
        for fname in os.listdir(self.checkpoint_dir):
            suffix = fname.split('_')[-1].replace('.h5', '')
            try:
                candidates.append((int(suffix), fname))
            except ValueError:
                # Stray file (e.g. an editor or OS artefact): not a checkpoint.
                continue
        if not candidates:
            raise FileNotFoundError(
                'no checkpoint found in {}'.format(self.checkpoint_dir))
        candidates.sort(key=lambda item: item[0])
        step, latest = candidates[-1]
        self.model.load_weights(os.path.join(self.checkpoint_dir, latest))
        self.init_steps = step
class AM():
    """Build, load and run the acoustic model selected by ``config``.

    The model family is chosen from ``config['model_config']['name']``:
    names containing 'ESPNet', 'Multi' or 'Conformer' select the matching
    builder, anything else falls through to the DeepSpeech2 builder.
    """

    def __init__(self, config):
        """Store ``config`` and create the text and speech featurizers.

        Args:
            config: nested mapping. This class reads 'speech_config',
                'model_config', 'decoder_config' (falling back to
                'decoder4_config') and, in ``load_checkpoint``,
                'learning_config'.
        """
        self.config = config
        self.update_model_type()
        self.speech_config = self.config['speech_config']
        try:
            self.text_config = self.config['decoder_config']
        except KeyError:
            self.text_config = self.config['decoder4_config']
        self.model_config = self.config['model_config']
        self.text_feature = TextFeaturizer(self.text_config)
        self.speech_feature = SpeechFeaturizer(self.speech_config)
        # Step number of the loaded checkpoint; set by load_checkpoint().
        self.init_steps = None

    def update_model_type(self):
        """Set ``self.model_type`` and tag the decoder configs with it.

        The decision is made by substring tests on the model name, checked
        in the order 'CTC', 'Multi', 'LAS'; everything else is treated as
        'Transducer'. The decoder config dicts are updated in place.
        """
        name = self.config['model_config']['name']
        if 'CTC' in name:
            self.config['decoder_config'].update({'model_type': 'CTC'})
            self.model_type = 'CTC'
        elif 'Multi' in name:
            self.config['decoder1_config'].update({'model_type': 'CTC'})
            self.config['decoder2_config'].update({'model_type': 'CTC'})
            self.config['decoder3_config'].update({'model_type': 'CTC'})
            self.config['decoder4_config'].update({'model_type': 'LAS'})
            self.config['decoder_config'].update({'model_type': 'LAS'})
            self.model_type = 'MultiTask'
        elif 'LAS' in name:
            self.config['decoder_config'].update({'model_type': 'LAS'})
            self.model_type = 'LAS'
        else:
            self.config['decoder_config'].update({'model_type': 'Transducer'})
            self.model_type = 'Transducer'

    def conformer_model(self, training):
        """Instantiate the Conformer variant named in ``model_config``.

        Raises:
            ValueError: if the name is not one of the three Conformer models.
        """
        from AMmodel.conformer import ConformerTransducer, ConformerCTC, ConformerLAS
        self.model_config.update(
            {'vocabulary_size': self.text_feature.num_classes})
        if self.model_config['name'] == 'ConformerTransducer':
            # These two keys are removed before the config is splatted into
            # the ConformerTransducer constructor.
            self.model_config.pop('LAS_decoder')
            self.model_config.pop('enable_tflite_convertible')
            self.model = ConformerTransducer(**self.model_config)
        elif self.model_config['name'] == 'ConformerCTC':
            self.model = ConformerCTC(**self.model_config)
        elif self.model_config['name'] == 'ConformerLAS':
            self.config['model_config']['LAS_decoder'].update(
                {'n_classes': self.text_feature.num_classes})
            self.config['model_config']['LAS_decoder'].update(
                {'startid': self.text_feature.start})
            self.model = ConformerLAS(
                self.config['model_config'],
                training=training,
                enable_tflite_convertible=self.config['model_config']
                ['enable_tflite_convertible'])
        else:
            # The original raised a plain string, which is itself a TypeError.
            raise ValueError('not in supported model list')

    def ds2_model(self, training):
        """Instantiate the DeepSpeech2 variant named in ``model_config``.

        Raises:
            ValueError: if the name is not one of the three DeepSpeech2 models.
        """
        from AMmodel.deepspeech2 import DeepSpeech2CTC, DeepSpeech2LAS, DeepSpeech2Transducer
        f, c = self.speech_feature.compute_feature_dim()
        input_shape = [None, f, c]
        self.model_config.update({'input_shape': input_shape})
        if self.model_config['name'] == 'DeepSpeech2Transducer':
            self.model_config.pop('LAS_decoder')
            self.model_config.pop('enable_tflite_convertible')
            self.model = DeepSpeech2Transducer(input_shape, self.model_config)
        elif self.model_config['name'] == 'DeepSpeech2CTC':
            self.model = DeepSpeech2CTC(input_shape, self.model_config,
                                        self.text_feature.num_classes)
        elif self.model_config['name'] == 'DeepSpeech2LAS':
            self.model_config['LAS_decoder'].update(
                {'n_classes': self.text_feature.num_classes})
            self.model_config['LAS_decoder'].update(
                {'startid': self.text_feature.start})
            self.model = DeepSpeech2LAS(
                self.model_config,
                input_shape,
                training=training,
                enable_tflite_convertible=self.
                model_config['enable_tflite_convertible'])
        else:
            raise ValueError('not in supported model list')

    def multi_task_model(self, training):
        """Instantiate ``ConformerMultiTaskLAS`` using decoder configs 1-4.

        Decoders 1-3 supply the class counts 'classes1'..'classes3';
        decoder 4 supplies the LAS decoder's 'n_classes' and 'startid'.
        """
        from AMmodel.MultiConformer import ConformerMultiTaskLAS
        token1_feature = TextFeaturizer(self.config['decoder1_config'])
        token2_feature = TextFeaturizer(self.config['decoder2_config'])
        token3_feature = TextFeaturizer(self.config['decoder3_config'])
        token4_feature = TextFeaturizer(self.config['decoder4_config'])
        self.model_config.update({
            'classes1': token1_feature.num_classes,
            'classes2': token2_feature.num_classes,
            'classes3': token3_feature.num_classes,
        })
        self.model_config['LAS_decoder'].update(
            {'n_classes': token4_feature.num_classes})
        self.model_config['LAS_decoder'].update(
            {'startid': token4_feature.start})
        self.model = ConformerMultiTaskLAS(
            self.model_config,
            training=training,
            enable_tflite_convertible=self.
            model_config['enable_tflite_convertible'])

    def espnet_model(self, training):
        """Instantiate the ESPNet variant named in ``model_config``.

        Raises:
            ValueError: if the name is not one of the three ESPNet models.
        """
        from AMmodel.espnet import ESPNetCTC, ESPNetLAS, ESPNetTransducer
        # NOTE(review): unlike the other builders, this one reads
        # 'Transducer_decoder', 'LAS_decoder' and 'enable_tflite_convertible'
        # from the top-level config rather than model_config - confirm.
        self.config['Transducer_decoder'].update(
            {'vocabulary_size': self.text_feature.num_classes})
        if self.model_config['name'] == 'ESPNetTransducer':
            self.model = ESPNetTransducer(self.config)
        elif self.model_config['name'] == 'ESPNetCTC':
            self.model = ESPNetCTC(self.model_config,
                                   self.text_feature.num_classes)
        elif self.model_config['name'] == 'ESPNetLAS':
            self.config['LAS_decoder'].update(
                {'n_classes': self.text_feature.num_classes})
            self.config['LAS_decoder'].update(
                {'startid': self.text_feature.start})
            self.model = ESPNetLAS(self.config,
                                   training=training,
                                   enable_tflite_convertible=self.
                                   config['enable_tflite_convertible'])
        else:
            raise ValueError('not in supported model list')

    def load_model(self, training=True):
        """Create ``self.model`` and, for inference, build it and load weights.

        Build and checkpoint failures are reported on stdout and otherwise
        swallowed (best effort), so the caller still gets a model object.

        Args:
            training: forwarded to the model builders; when false the model
                is additionally built and the latest checkpoint is loaded.
        """
        if 'ESPNet' in self.model_config['name']:
            self.espnet_model(training)
        elif 'Multi' in self.model_config['name']:
            self.multi_task_model(training)
        elif 'Conformer' in self.model_config['name']:
            self.conformer_model(training)
        else:
            self.ds2_model(training)
        self.model.add_featurizers(self.text_feature)
        f, c = self.speech_feature.compute_feature_dim()
        try:
            if not training:
                if self.text_config['model_type'] != 'LAS':
                    self.model._build([3, 80, f, c])
                    self.model._build([2, 80, f, c])
                    self.model._build([1, 80, f, c])
                    self.model.return_pb_function(f, c)
                else:
                    self.model._build([3, 80, f, c], training)
                    self.model._build([1, 80, f, c], training)
                    self.model._build([2, 80, f, c], training)
                    self.model.return_pb_function(f, c)
                # NOTE(review): the source layout was lost; checkpoint loading
                # is assumed to belong to the inference-only path - confirm.
                self.load_checkpoint(self.config)
        except Exception as err:
            # Still best effort, but no longer hides the reason or swallows
            # KeyboardInterrupt/SystemExit.
            print('am loading model failed.', err)

    def convert_to_pb(self, export_path):
        """Export the model to ``export_path`` as a TensorFlow SavedModel.

        The serving signature is the concrete function of
        ``self.model.recognize_pb``.
        """
        import tensorflow as tf
        f, c = self.speech_feature.compute_feature_dim()
        self.model.return_pb_function(f, c)
        concrete_func = self.model.recognize_pb.get_concrete_function()
        tf.saved_model.save(self.model, export_path, signatures=concrete_func)

    def decode_result(self, word):
        """Map token ids to tokens, stopping at the first stop id.

        Args:
            word: iterable of token ids.

        Returns:
            List of tokens looked up in ``text_feature.index_to_token``; the
            stop id and everything after it are dropped.
        """
        de = []
        for i in word:
            if i == self.text_feature.stop:
                break
            de.append(self.text_feature.index_to_token[int(i)])
        return de

    def predict(self, fp):
        """Run recognition on one audio file and return the first result.

        Args:
            fp: audio path. Paths containing '.pcm' are read as raw int16
                samples and scaled by 1/32768; anything else goes through
                ``speech_feature.load_wav``.

        Returns:
            Element 0 of ``self.model.recognize_pb(mel, input_length)``.
        """
        if '.pcm' in fp:
            data = np.fromfile(fp, 'int16')
            data = np.array(data, 'float32')
            data /= 32768
        else:
            data = self.speech_feature.load_wav(fp)
        mel = self.speech_feature.extract(data)
        mel = np.expand_dims(mel, 0)
        input_length = np.array(
            [[mel.shape[1] // self.model.time_reduction_factor]], 'int32')
        result = self.model.recognize_pb(mel, input_length)[0]
        return result

    def load_checkpoint(self, config):
        """Load the checkpoint with the highest step number into the model.

        Checkpoints live in ``<outdir>/checkpoints`` and are named
        ``<prefix>_<step>.h5``. Entries whose name carries no integer step
        are ignored instead of aborting the whole load.

        Args:
            config: mapping providing
                ``['learning_config']['running_config']['outdir']``.

        Raises:
            FileNotFoundError: if the directory holds no usable checkpoint.
        """
        self.checkpoint_dir = os.path.join(
            config['learning_config']['running_config']["outdir"],
            "checkpoints")
        checkpoints = []
        for name in os.listdir(self.checkpoint_dir):
            try:
                step = int(name.split('_')[-1].replace('.h5', ''))
            except ValueError:
                # Not a step-numbered checkpoint (e.g. a stray log file).
                continue
            checkpoints.append((step, name))
        if not checkpoints:
            raise FileNotFoundError(
                'no checkpoint found in {}'.format(self.checkpoint_dir))
        # Stable sort on the step only, so ties keep the listing order.
        checkpoints.sort(key=lambda item: item[0])
        step, latest = checkpoints[-1]
        self.model.load_weights(os.path.join(self.checkpoint_dir, latest))
        self.init_steps = step
class MultiTask_DataLoader():
    """Batch generator producing speech inputs with three label streams.

    Every transcript is converted by ``map`` into pinyin syllables, phones
    and characters of the joined pinyin string. The character stream is
    encoded with ``token1_featurizer`` ("words"), the phones with
    ``token2_featurizer`` and the syllables with ``token3_featurizer``.
    """

    def __init__(self, config_dict, training=True):
        """Read the configs, build featurizers and load the file list.

        Args:
            config_dict: nested mapping with 'speech_config',
                'decoder1_config'..'decoder3_config', 'augments_config',
                'learning_config' and 'map_path'.
            training: selects 'train_list' (true) or 'eval_list' (false)
                from the speech config.
        """
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config'][
            'batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.make_file_list(
            self.speech_config['train_list']
            if training else self.speech_config['eval_list'], training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.steps = 0

    def load_state(self, outdir):
        """Restore epoch and train offset from ``<outdir>/dg_state.npz``.

        If the saved train list has a different length than the current
        one, epoch and offset are reset to 0. A missing or unreadable
        state file is logged and otherwise ignored.
        """
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal train list ,data loader use init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch, train offset and train list to ``dg_state.npz``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        """Return the dtypes of the 8-tuple yielded by ``generator``."""
        return (tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.int32)

    def return_data_shape(self):
        """Return the tensor shapes of the 8-tuple yielded by ``generator``.

        The first shape depends on ``speech_config['use_mel_layer']``.
        """
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.speech_config['use_mel_layer']:
            speech_shape = tf.TensorShape([None, None, 1])
        else:
            speech_shape = tf.TensorShape([None, None, f, c])
        return (
            speech_shape,
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )

    def get_per_epoch_steps(self):
        """Number of full training batches in one pass over the train list."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in one pass over the test list."""
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        """Load the pinyin -> phone-list table from ``config['map_path']['phone']``.

        Each usable line is ``<key>\\t<phones separated by spaces>``; lines
        that do not split into exactly two tab-separated fields are skipped.
        """
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            lines = [line.strip() for line in f]
        phone_map = {}
        for line in lines:
            try:
                key, phone = line.split('\t')
            except ValueError:
                # Blank or malformed line.
                continue
            phone_map[key] = phone.split(' ')
        self.phone_map = phone_map

    def map(self, txt):
        """Convert ``txt`` into (pinyin syllables, phones, characters).

        Returns:
            ``pys``: first pinyin candidate per element returned by pypinyin;
            ``phones``: concatenation of ``phone_map[py]`` for each syllable;
            ``words``: the characters of the joined pinyin string.

        Raises:
            KeyError: if a syllable is missing from ``phone_map``.
        """
        pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
        pys = [i[0] for i in pys]
        phones = []
        for i in pys:
            phones += self.phone_map[i]
        words = list(''.join(pys))
        return pys, phones, words

    def make_file_list(self, wav_list, training=True):
        """Read the sample list and split or assign it.

        Blank lines are dropped; the original filter ran before ``strip()``
        and therefore never removed them, which later broke the
        ``wav<TAB>text`` split.

        Args:
            wav_list: path of a UTF-8 text file, one sample per line.
            training: when true, the first 99% of lines become the shuffled
                train list and the rest the test list; otherwise every line
                goes to the test list.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = [line.strip() for line in f]
        data = [line for line in data if line != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('train list : {} test list:{}'.format(
                len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0
            logging.info('eval list: {}'.format(len(self.test_list)))

    def only_chinese(self, word):
        """Return ``word`` reduced to characters in U+4E00..U+9FFF."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def check_valid(self, txt, vocab_list):
        """Check that every element of ``txt`` is in ``vocab_list``.

        Returns:
            ``False`` when ``txt`` is empty, the first element missing from
            ``vocab_list`` when there is one, otherwise ``True``. Callers
            must therefore compare with ``is True`` / ``is not True``: a
            missing token is usually truthy.
        """
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def _load_audio(self, wp, verbose):
        """Load ``wp``; return None if it fails to load or has a bad length."""
        try:
            data = self.speech_featurizer.load_wav(wp)
        except Exception:
            if verbose:
                logging.info('{} load data failed,skip'.format(wp))
            return None
        if len(data) < 400:
            return None
        if len(data) > self.speech_featurizer.sample_rate * self.speech_config[
                'wav_max_duration']:
            if verbose:
                logging.info(
                    '{} duration out of wav_max_duration({}),skip'.format(
                        wp, self.speech_config['wav_max_duration']))
            return None
        return data

    def _speech_input(self, data):
        """Turn a waveform into (model input, reduced input length)."""
        if self.speech_config['use_mel_layer']:
            # Raw waveform path: peak-normalise and add a channel axis.
            speech_feature = data / np.abs(data).max()
            speech_feature = np.expand_dims(speech_feature, -1)
            in_len = len(speech_feature) // (
                self.speech_config['reduction_factor'] *
                (self.speech_featurizer.sample_rate / 1000) *
                self.speech_config['stride_ms'])
        else:
            speech_feature = self.speech_featurizer.extract(data)
            in_len = int(speech_feature.shape[0] //
                         self.speech_config['reduction_factor'])
        return speech_feature, in_len

    def _make_row(self, speech_feature, in_len, py, phone, word):
        """Encode the three label streams; None if the input is too short."""
        word_text_feature = self.token1_featurizer.extract(word)
        phone_text_feature = self.token2_featurizer.extract(phone)
        py_text_feature = self.token3_featurizer.extract(py)
        if in_len < len(word_text_feature):
            return None
        return (speech_feature, in_len,
                np.array(word_text_feature), len(word_text_feature),
                np.array(phone_text_feature), len(phone_text_feature),
                np.array(py_text_feature), len(py_text_feature))

    def _collate(self, rows):
        """Pad the rows from ``_make_row`` and stack them into 8 arrays."""
        if rows:
            columns = [list(column) for column in zip(*rows)]
        else:
            columns = [[] for _ in range(8)]
        (speech_features, input_length, words_label, words_label_length,
         phone_label, phone_label_length, py_label,
         py_label_length) = columns
        max_input = max((len(feat) for feat in speech_features), default=0)
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)
        else:
            for idx, feat in enumerate(speech_features):
                if feat.shape[0] < max_input:
                    # Pad the time axis with the feature's own minimum value.
                    filler = np.ones([
                        max_input - feat.shape[0], feat.shape[1],
                        feat.shape[2]
                    ]) * feat.min()
                    speech_features[idx] = np.vstack((feat, filler))
        words_label = self.pad(words_label,
                               max(words_label_length, default=0))
        phone_label = self.pad(phone_label,
                               max(phone_label_length, default=0))
        py_label = self.pad(py_label, max(py_label_length, default=0))
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        return (speech_features, input_length, words_label,
                words_label_length, phone_label, phone_label_length,
                py_label, py_label_length)

    def eval_data_generator(self):
        """Return the next evaluation batch and advance ``self.offset``.

        Samples that fail to load, have an out-of-range length, contain
        tokens outside the matching vocabulary, or whose reduced input
        length is shorter than the word label are skipped.

        Returns:
            ``(speech_features, input_length, words_label,
            words_label_length, phone_label, phone_label_length, py_label,
            py_label_length)``.
        """
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        rows = []
        for line in sample:
            wp, txt = line.strip().split('\t')
            data = self._load_audio(wp, verbose=True)
            if data is None:
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._speech_input(data)
            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue
            # check_valid returns the offending token on failure, so it has
            # to be compared with True; each stream is checked against the
            # featurizer that will encode it.
            if self.check_valid(
                    word, self.token1_featurizer.vocab_array) is not True:
                logging.info(
                    ' {} txt word {} not all in tokens,continue'.format(
                        txt, py))
                continue
            if self.check_valid(
                    phone, self.token2_featurizer.vocab_array) is not True:
                logging.info(
                    ' {} txt phone {} not all in tokens,continue'.format(
                        txt, py))
                continue
            if self.check_valid(
                    py, self.token3_featurizer.vocab_array) is not True:
                logging.info(
                    ' {} txt pinyin {} not all in tokens,continue'.format(
                        txt, py))
                continue
            row = self._make_row(speech_feature, in_len, py, phone, word)
            if row is not None:
                rows.append(row)
        return self._collate(rows)

    def pad(self, words_label, max_label_words):
        """Right-pad every label array to ``max_label_words`` in place.

        The fill value is ``token1_featurizer.pad`` for whichever label
        stream is passed in. The (mutated) input list is returned.
        """
        for idx, label in enumerate(words_label):
            if label.shape[0] < max_label_words:
                filler = np.ones(max_label_words - label.shape[0]
                                 ) * self.token1_featurizer.pad
                words_label[idx] = np.hstack((label, filler))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        """Return the (N, T) float32 guided-attention weight matrix.

        ``W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``.
        """
        rows = np.arange(N) / float(N)
        cols = np.arange(T) / float(T)
        diff = cols[np.newaxis, :] - rows[:, np.newaxis]
        W = 1 - np.exp(-diff ** 2 / (2 * g * g))
        return W.astype(np.float32)

    def guided_attention(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        """Build one padded guided-attention target per sample.

        Each target is an ``(inputs_shape, mel_target_shape)`` matrix
        filled with -1, set to 1 for rows from ``input_length`` onwards
        within the first ``targets_length`` columns, and overwritten in the
        top-left corner by ``GuidedAttention(input_length, targets_length)``.

        Returns:
            float32 array with one such matrix per
            ``(input_length, targets_length)`` pair.
        """
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one batch from the train or test list.

        Up to ``batch * 10`` lines are tried until ``batch`` usable samples
        are found. When training with augmentation available, only half a
        batch is collected and the accepted samples are then loaded again,
        passed through ``augment.process`` and appended.

        Args:
            train: read from ``train_list`` (reshuffling and incrementing
                ``epochs`` on wrap-around) instead of ``test_list``.

        Returns:
            The same 8-tuple as ``eval_data_generator``.
        """
        rows = []
        sample = []
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        for _ in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            data = self._load_audio(wp, verbose=True)
            if data is None:
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._speech_input(data)
            py, phone, word = self.map(txt)
            if len(py) == 0:
                # The original passed extra positional args without any
                # placeholders, which makes logging report a format error.
                logging.info('py length %s skip', len(py))
                continue
            invalid = self.check_valid(word,
                                       self.token1_featurizer.vocab_array)
            if invalid is not True:
                logging.info(
                    ' {} txt word {} not all in tokens,continue'.format(
                        txt, invalid))
                continue
            invalid = self.check_valid(phone,
                                       self.token2_featurizer.vocab_array)
            if invalid is not True:
                logging.info(
                    ' {} txt phone {} not all in tokens,continue'.format(
                        txt, invalid))
                continue
            invalid = self.check_valid(py, self.token3_featurizer.vocab_array)
            if invalid is not True:
                logging.info(' {} txt py {} not all in tokens,continue'.format(
                    txt, invalid))
                continue
            row = self._make_row(speech_feature, in_len, py, phone, word)
            if row is None:
                continue
            rows.append(row)
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            # Second pass: augmented copies of the samples accepted above.
            for line in sample:
                wp, txt = line.strip().split('\t')
                data = self._load_audio(wp, verbose=False)
                if data is None:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                speech_feature, in_len = self._speech_input(data)
                py, phone, word = self.map(txt)
                if len(py) == 0:
                    continue
                row = self._make_row(speech_feature, in_len, py, phone, word)
                if row is not None:
                    rows.append(row)
        return self._collate(rows)

    def generator(self, train=True):
        """Yield batches from ``generate(train)`` forever."""
        while 1:
            speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length = self.generate(
                train)
            yield speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length
class VocoderDataLoader():
    """Batch generator yielding (mel input, target waveform) pairs.

    Samples come either from ``.npz`` files holding 'audio' and a
    precomputed mel array, or from audio files whose mel-spectrogram is
    computed here, depending on ``config['load_from_npz']``.
    """

    def __init__(self, config, training=True):
        """Read the config, load the file list and create the featurizer.

        Args:
            config: mapping providing 'batch_size', 'train_list' or
                'eval_list', 'max_abs_value', and the keys read by
                ``generate`` and ``return_data_shape``.
            training: selects 'train_list' (true) or 'eval_list' (false).
        """
        self.config = config
        self.batch = config['batch_size']
        self.make_file_list(self.config['train_list'] if training else self.config['eval_list'], training=training)
        self.epochs = 1
        self.steps = 0
        self.speech_featurizer = SpeechFeaturizer(config)
        # Constant used by _pad_target to fill short mel targets.
        self._target_pad = -(self.config['max_abs_value'] + 0.1)

    def make_file_list(self, wav_list, training=True):
        """Read the sample list and split or assign it.

        Blank lines are dropped; the original filter ran before ``strip()``
        and therefore never removed them.

        Args:
            wav_list: path of a UTF-8 text file, one sample path per line.
            training: when true, the first 95% of lines become the shuffled
                train list and the rest the test list; otherwise every line
                goes to the test list.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = [line.strip() for line in f]
        data = [line for line in data if line != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.95)]
            self.test_list = data[int(num * 0.95):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list{}'.format(len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0

    def get_per_epoch_steps(self):
        """Number of full training batches in one pass over the train list."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in one pass over the test list."""
        return len(self.test_list) // self.batch

    def load_state(self, outdir):
        """Restore epoch and train offset from ``<outdir>/dg_state.npz``.

        If the saved train list has a different length than the current
        one, epoch and offset are reset to 0. A missing or unreadable
        state file is logged and otherwise ignored.
        """
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal new load train list ,data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch, train offset and train list to ``dg_state.npz``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'), epoch=self.epochs, train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        """Return the dtypes of the (x, y) pair yielded by ``generator``."""
        return (tf.float32, tf.float32)

    def return_data_shape(self):
        """Return the tensor shapes of the (x, y) pair yielded by ``generator``."""
        return (
            tf.TensorShape([None, None, self.config['num_mels']]),
            tf.TensorShape([None, None, 1]),
        )

    def generate(self, train=True):
        """Assemble one batch of mel inputs and target waveforms.

        Up to ``batch * 10`` list entries are tried until ``batch`` samples
        load. Waveforms longer than ``config['frame_length']`` are cropped
        at a random position (with the matching mel slice in npz mode).

        Args:
            train: read from ``train_list`` (reshuffling and incrementing
                ``epochs`` on wrap-around) instead of ``test_list``.

        Returns:
            ``(x, y)`` float32 arrays; ``y`` has a trailing axis of size 1.
            When nothing could be loaded both arrays have a leading
            dimension of 0, which ``generator`` checks for.
        """
        y = []
        x = []
        maxnum = self.config['frame_length']
        for _ in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            if self.config['load_from_npz']:
                try:
                    data = np.load(line.strip())
                    target_wav = data['audio']
                    if self.config['adjust_type'] == 'tacotron':
                        train_mel = data['tacotron_mel']
                    elif self.config['adjust_type'] == 'fastspeech':
                        train_mel = data['fastspeech_mel']
                    else:
                        train_mel = data['target_mel']
                except Exception:
                    logging.info('{} load data failed , skip'.format(line))
                    continue
                if len(target_wav) > maxnum:
                    pick = np.random.randint(0, len(target_wav) - maxnum, 1)[0]
                    target_wav = target_wav[pick:pick + maxnum]
                    # Crop the mel at the frame matching the waveform crop.
                    pick_mel = pick // self.speech_featurizer.hop_size
                    max_mel = maxnum // self.speech_featurizer.hop_size
                    train_mel = train_mel[pick_mel:pick_mel + max_mel]
                y.append(target_wav)
                x.append(train_mel)
                if len(y) == self.batch:
                    break
            else:
                try:
                    target_wav = self.speech_featurizer.load_wav(line.strip())
                except Exception:
                    logging.info('{} load data failed , skip'.format(line))
                    continue
                if len(target_wav) > maxnum:
                    pick = np.random.randint(0, len(target_wav) - maxnum, 1)[0]
                    target_wav = target_wav[pick:pick + maxnum]
                y.append(target_wav)
                if len(y) == self.batch:
                    break
        if not y:
            # Nothing loaded: np.stack/np.hstack below would raise on an
            # empty list, so return the empty batch generator() expects.
            return (np.zeros([0, 0, self.config['num_mels']], 'float32'),
                    np.zeros([0, 0, 1], 'float32'))
        if maxnum % self.config['num_mels'] != 0:
            # Round the padded length up to a multiple of num_mels.
            maxnum += self.config['num_mels'] - maxnum % self.config['num_mels']
        y = self.speech_featurizer.pad_signal(y, maxnum)
        if self.config['load_from_npz']:
            x = self._prepare_targets(x, maxnum // self.speech_featurizer.hop_size)
        else:
            process_wav = np.hstack(y)
            mel = self.speech_featurizer.melspectrogram(process_wav)
            x = mel.reshape([len(y), -1, self.config['num_mels']])
        x = np.array(x, 'float32')
        y = np.array(y, 'float32')
        return x, y[:, :, np.newaxis]

    def generator(self, train=True):
        """Yield non-empty ``(x, y)`` batches from ``generate`` forever."""
        while 1:
            x, y = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            yield x, y

    def _prepare_targets(self, targets, max_len=None):
        """Crop or pad every target to ``max_len`` rows and stack them.

        Args:
            targets: list of 2-D arrays.
            max_len: row count per target; defaults to the longest target.

        Returns:
            Array of shape ``(len(targets), max_len, width)``.
        """
        if max_len is None:
            max_len = max([len(t) for t in targets])
        new = []
        for t in targets:
            if t.shape[0] >= max_len:
                new.append(t[:max_len])
            else:
                new.append(self._pad_target(t, max_len))
        return np.stack(new)

    def _pad_target(self, t, length):
        """Pad the first axis of ``t`` to ``length`` with ``_target_pad``."""
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad)