def ctc_cost2(self):
    '''Estimate the CTC cost averaged over one batch drawn from the dev set.'''
    #data = self.data
    data = DataSpeech(self.datapath)
    data.LoadDataList('dev')
    # Draw one batch of 32 samples from the data generator
    x = next(data.data_genetator(32, self.AUDIO_LENGTH))
    [test_input_data, y, test_input_length, label_length], labels = x
    xx = [test_input_data, y, test_input_length, label_length]
    # self._model is assumed to output the per-sample CTC loss, so the mean
    # of the predictions is the average CTC cost for this batch
    y_pred2 = self._model.predict(xx)
    _mean = sum(y_pred2) / len(y_pred2)
    print(y_pred2, _mean)
    return _mean
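# A minimal sketch of the CTC loss lambda that self._model is assumed to wrap
# (the name ctc_lambda_func and the argument order are illustrative
# assumptions, not taken from this file; K is the Keras backend already used
# below). It shows how Keras produces the per-sample CTC cost that
# ctc_cost2 averages.
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # Returns a tensor of shape (batch_size, 1): one CTC loss per sample
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)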
def __init__(self, datapath):
    '''
    Initialization.
    The network output dimension is MS_OUTPUT_SIZE = 1417,
    i.e. 1416 pinyin symbols plus 1 CTC blank token.
    '''
    MS_OUTPUT_SIZE = 1417
    self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE  # size of each character vector the network finally outputs
    #self.BATCH_SIZE = BATCH_SIZE  # batch size for one training step
    self.label_max_string_length = 64
    self.AUDIO_LENGTH = 1600
    self.AUDIO_FEATURE_LENGTH = 200
    self.datapath = datapath
    self.data = DataSpeech(datapath)
    self._model = self.CreateModel()
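# A hedged usage sketch. The enclosing class name (ModelSpeech) and the
# dataset path are assumptions for illustration only:
#
#   ms = ModelSpeech(datapath='dataset')  # builds the network via CreateModel()
#   mean_cost = ms.ctc_cost2()            # average CTC cost on the dev set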
def RecognizeSpeech(self, wavsignal, fs):
    '''
    The function ultimately used for speech recognition: recognize the
    speech in one wav signal sequence. Note: this still has bugs.
    '''
    #data = self.data
    #data = DataSpeech('E:\\语音数据集')
    data = DataSpeech(self.datapath)
    data.LoadDataList('dev')
    # Get the input features
    #data_input = data.GetMfccFeature(wavsignal, fs)
    data_input = data.GetFrequencyFeature(wavsignal, fs)
    list_symbol_dic = data.list_symbol  # get the pinyin symbol list
    # Hard-coded reference transcript used while debugging
    labels = ['dong1', 'bei3', 'jun1', 'de5', 'yi4', 'xie1', 'ai4', 'guo2',
              'jiang4', 'shi4', 'ma3', 'zhan4', 'shan1', 'li3', 'du4', 'tang2',
              'ju4', 'wu3', 'su1', 'bing3', 'ai4', 'deng4', 'tie3', 'mei2',
              'deng3', 'ye3', 'fen4', 'qi3', 'kang4', 'zhan4']
    #labels = [ list_symbol_dic[-1] ]
    #while(len(labels) < 32):
    #    labels.append(list_symbol_dic[-1])

    # Convert each pinyin symbol to its numeric index
    feat_out = []
    #print("data index", n_start, filename)
    for i in labels:
        if '' != i:
            n = data.SymbolToNum(i)
            feat_out.append(n)
    print(feat_out)
    labels = feat_out

    # Build one batch and run the model on it
    x = next(self.data_gen(data_input=np.array(data_input),
                           data_labels=np.array(labels),
                           input_length=len(data_input),
                           labels_length=len(labels),
                           batch_size=2))
    [test_input_data, y, test_input_length, label_length], labels = x
    xx = [test_input_data, y, test_input_length, label_length]
    pred = self._model.predict(x=xx)
    print(pred)
    shape = pred[:, :].shape
    print(shape)
    print('test_input_data:', test_input_data)

    # Run the softmax-output backend function on the input batch
    y_p = self.test_func([test_input_data])
    print(type(y_p))
    print('y_p:', y_p)

    # Debug loop, currently disabled (range(0, 0) never iterates)
    for j in range(0, 0):
        mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
        print('max y_p:', max(y_p[0][0][j]), 'min y_p:', min(y_p[0][0][j]),
              'mean y_p:', mean, 'mid y_p:', y_p[0][0][j][100])
        print('argmin:', np.argmin(y_p[0][0][j]), 'argmax:', np.argmax(y_p[0][0][j]))
        count = 0
        for i in y_p[0][0][j]:
            if i < mean:
                count += 1
        print('count:', count)

    print(K.is_sparse(y_p))
    #y_p = K.to_dense(y_p)
    print(K.is_sparse(y_p))

    # Greedy per-frame decoding: take the most probable class per time step
    # (the original code used np.argmin here, i.e. the least probable class,
    # which was one of the known bugs)
    _list = []
    for i in y_p:
        list_i = []
        for j in i:
            list_j = []
            for k in j:
                list_j.append(np.argmax(k))
            list_i.append(list_j)
        _list.append(list_i)
    #y_p = np.array(_list, dtype=np.float)
    y_p = _list
    #print(y_p, type(y_p), y_p.shape)
    #y_p = tf.sparse_to_dense(y_p, (2, 397), 1417, 0)

    print(test_input_length.T)
    test_input_length = test_input_length.reshape(2, 1)
    func_in_len = self.test_func_input_length([test_input_length])
    print(type(func_in_len))
    print(func_in_len)

    #in_len = np.ones(shape[0]) * shape[1]
    # Note: K.ctc_decode expects the softmax output of shape
    # (batch, time_steps, num_classes), so passing argmax indices here
    # is another known bug
    ctc_decoded = K.ctc_decode(y_p[0][0],
                               input_length=tf.squeeze(func_in_len[0][0][0]))
    print(ctc_decoded)
    #ctc_decoded = ctc_decoded[0][0]
    #out = K.get_value(ctc_decoded)[:, :64]
    #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
    return pred[0][0]
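# A minimal sketch of how the decoding step above could be done without the
# known bugs: feed the softmax output directly to K.ctc_decode instead of
# pre-computed argmax indices. The name decode_greedy and its arguments are
# illustrative assumptions; K and the rest follow the imports this file
# already relies on.
def decode_greedy(y_pred_softmax, input_lengths):
    # y_pred_softmax: array of shape (batch, time_steps, num_classes)
    # input_lengths:  1-D array with the valid frame count of each sample
    decoded, log_prob = K.ctc_decode(y_pred_softmax,
                                     input_length=input_lengths,
                                     greedy=True)
    # decoded[0] holds the best path per sample, padded with -1
    return K.get_value(decoded[0])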