print 'acc: %f%%' % acc evaluator = Evaluate() # model.fit_generator(gen(64), samples_per_epoch=512, nb_epoch=15, # callbacks=[evaluator], # ) # 测试模型 characters2 = characters + ' ' [X_test, y_test, _, _], _ = next(gen(1)) y_pred = base_model.predict(X_test) y_pred = y_pred[:, 2:, :] out = K.get_value( K.ctc_decode( y_pred, input_length=np.ones(y_pred.shape[0]) * y_pred.shape[1], )[0][0])[:, :n_len] # out = ''.join([characters[x] for x in out[0]]) # y_true = ''.join([characters[x] for x in y_test[0]]) # # import pylab # plt.imshow(X_test[0].transpose(1, 0, 2)) # plt.title('pred:' + str(out) + '\ntrue: ' + str(y_true)) # pylab.show() argmax = np.argmax(y_pred, axis=2)[0] print list(zip(argmax, ''.join([characters2[x] for x in argmax]))) # 计算模型总体准确率 print evaluate(base_model) model.save('model.h5')
def create_model(params, gpu=False, two_rnns=False):
    """Assemble the CNN + GRU training model and its beam-search decoders.

    The network is two conv / max-pool / dropout stages, a reshape into a
    sequence, a dense bottleneck, one (optionally two) forward/backward GRU
    pairs and a softmax over ``params["output_size"]`` classes. The CTC loss
    is computed inside a ``Lambda`` layer.

    Args:
        params: hyper-parameter mapping. Keys read here: "input_shape",
            "conv_filters", "kernel_size", "act", "pool_size", "img_w",
            "img_h", "num_convs", "time_dense_size", "rnn1_size",
            "rnn2_size" (only when ``two_rnns``), "output_size",
            "max_string_len" and "ctc_cut".
        gpu: build ``CuDNNGRU`` layers instead of ``GRU`` layers.
        two_rnns: add a second forward/backward pair whose outputs are
            concatenated before the output layer.

    Returns:
        ``(train_model, decoder_models)``. ``train_model`` outputs the CTC
        loss; ``decoder_models`` is a 3-tuple of backend functions mapping
        ``[input_data, input_lengths]`` to the 1st, 2nd and 3rd beam-search
        path respectively.
    """
    input_data = Input(name="input", shape=params["input_shape"],
                       dtype="float32")

    # Two identical conv -> max-pool -> dropout stages named conv1/max1 and
    # conv2/max2.
    features = input_data
    for stage in ("1", "2"):
        features = Conv2D(
            params["conv_filters"],
            params["kernel_size"],
            padding="same",
            activation=params["act"],
            kernel_initializer="he_normal",
            name="conv" + stage,
        )(features)
        features = MaxPooling2D(
            pool_size=(params["pool_size"], params["pool_size"]),
            name="max" + stage,
        )(features)
        features = Dropout(0.2)(features)

    # Collapse (height, filters) into a single feature axis so that the
    # width axis becomes the time axis of the recurrent layers.
    downscale = params["pool_size"] ** params["num_convs"]
    sequence_shape = (
        params["img_w"] // downscale,
        (params["img_h"] // downscale) * params["conv_filters"],
    )
    inner = Reshape(target_shape=sequence_shape, name="reshape")(features)

    # Cuts down the input size going into the RNN.
    inner = Dense(params["time_dense_size"], activation=params["act"],
                  name="dense1")(inner)

    if gpu:
        rnn_cls = CuDNNGRU
        rnn_extra = {}
    else:
        rnn_cls = GRU
        rnn_extra = {"reset_after": True, "recurrent_activation": "sigmoid"}

    def recurrent_pair(units, prefix, source):
        # Forward layer "<prefix>" and backward layer "<prefix>_b" fed by
        # the same tensor.
        forward = rnn_cls(
            units,
            return_sequences=True,
            kernel_initializer="he_normal",
            name=prefix,
            **rnn_extra
        )(source)
        backward = rnn_cls(
            units,
            return_sequences=True,
            go_backwards=True,
            kernel_initializer="he_normal",
            name=prefix + "_b",
            **rnn_extra
        )(source)
        return forward, backward

    gru_1, gru_1b = recurrent_pair(params["rnn1_size"], "gru1", inner)
    gru1_merged = add([gru_1, gru_1b])

    if two_rnns:
        gru_2, gru_2b = recurrent_pair(params["rnn2_size"], "gru2",
                                       gru1_merged)

    # Transforms the RNN output to character activations.
    output_layer = Dense(params["output_size"],
                         kernel_initializer="he_normal", name="dense2")
    if two_rnns:
        inner = output_layer(concatenate([gru_2, gru_2b]))
    else:
        inner = output_layer(gru1_merged)
    y_pred = Activation("softmax", name="softmax")(inner)

    output_labels = Input(name="the_labels",
                          shape=[params["max_string_len"]], dtype="float32")
    input_lengths = Input(name="input_length", shape=[1], dtype="int64")
    label_lengths = Input(name="label_length", shape=[1], dtype="int64")

    # The loss needs extra inputs besides y_true / y_pred, so it is
    # evaluated in a Lambda layer instead of being passed as a Keras loss.
    def ctc_lambda_func(args):
        y_pred, labels, input_length, label_length = args
        # The first params["ctc_cut"] RNN outputs tend to be garbage and are
        # dropped before the loss is computed.
        y_pred = y_pred[:, params["ctc_cut"]:, :]
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

    loss_out = Lambda(ctc_lambda_func, output_shape=(1, ), name="ctc")(
        [y_pred, output_labels, input_lengths, label_lengths])

    train_model = Model(
        inputs=[input_data, output_labels, input_lengths, label_lengths],
        outputs=loss_out)

    # Beam search returning the three best paths; one function per rank.
    top_k_dec_list, _ = K.ctc_decode(
        y_pred[:, params["ctc_cut"]:, :],
        K.squeeze(input_lengths, axis=1),
        greedy=False,
        top_paths=3,
    )
    decoder_models = tuple(
        K.function([input_data, input_lengths], [top_k_dec_list[rank]])
        for rank in range(3))

    return train_model, decoder_models
def predict_text(model, recs_all, recs_len, img_all, img_name=None):
    """Recognize the text inside every rectangle cropped from a set of images.

    Each rectangle is cut out of its source image with ``dumpRotateImage``,
    converted to grayscale, scaled to a height of 32 pixels, contrast
    stretched, optionally inverted (when the sampled border pixels are mostly
    dark), padded to the widest crop and run through ``model``. The output is
    decoded with greedy CTC.

    Args:
        model: object whose ``predict`` accepts the padded batch.
        recs_all: rectangles of all images, concatenated in image order.
        recs_len: number of rectangles per image, in the order of
            ``img_all``. It is NOT modified (earlier versions accumulated it
            in place, which corrupted it for any later call).
        img_all: source images, indexed in parallel with ``recs_len``.
        img_name: when not None, every padded crop is also written to
            ``root_recs + '/' + img_name + '_<index>_.jpg'``.

    Returns:
        List of decoded strings, one per rectangle that survived the size
        checks. Rectangles whose scaled width is below one pixel are skipped,
        so the list may be shorter than ``recs_all``. An empty list is
        returned when no rectangle survives.
    """
    texts = []
    img_list = []
    width_list = []

    # Running totals: rectangle i belongs to the first image j for which
    # i < boundaries[j]. Computed on a copy so the argument stays intact.
    boundaries = np.cumsum(recs_len).tolist()

    img_index = 0
    for i, rec in enumerate(recs_all):
        for j, upper in enumerate(boundaries):
            if i < upper:
                img_index = j
                break

        img_rec = dumpRotateImage(img_all[img_index], rec).convert('L')
        scale = img_rec.size[1] * 1.0 / 32
        if not scale > 0:
            continue
        w = int(img_rec.size[0] / scale)
        # The crop would be narrower than one pixel after scaling.
        if not w > 0:
            continue
        img_rec = img_rec.resize((w, 32), Image.BILINEAR)
        width_list.append(w)

        # Stretch the contrast to the full 0..255 range. The array is
        # transposed, so axis 0 is the width and axis 1 the height.
        img_in = np.array(img_rec).T
        img_out = np.zeros(img_in.shape, np.uint8)
        cv2.normalize(img_in, img_out, 255, 0, cv2.NORM_MINMAX, cv2.CV_8U)

        # Sample one edge along the height and up to 64 pixels of one edge
        # along the width; if most samples are dark, invert the image so the
        # text ends up dark on a light background.
        limit = 64 if w >= 64 else w
        black = int(np.count_nonzero(img_out[0, :32] < 100))
        black += int(np.count_nonzero(img_out[:limit, 0] < 100))
        if black > (32 + limit) // 2:
            img_out = 255 - img_out

        # Scale to [-0.5, 0.5].
        img_list.append(img_out.astype(np.float32) / 255.0 - 0.5)

    if not width_list:
        # Nothing to recognize; max() below would raise on an empty list.
        return texts

    width_max = max(width_list)
    # np.float was removed in NumPy 1.24; it was an alias of float64.
    X = np.zeros((len(width_list), width_max, 32, 1), dtype=np.float64)
    for i, (w, img) in enumerate(zip(width_list, img_list)):
        # Pad on the right with 0.5, i.e. white after the shift above.
        img_pad = np.zeros((width_max - w, 32), np.float32) + 0.5
        img_rec = np.concatenate((img, img_pad), axis=0)
        X[i] = np.expand_dims(img_rec, axis=2)

        # Optionally save the padded crop for inspection.
        if img_name is not None:
            img_out = (img_rec + 0.5) * 255
            img_sa = Image.fromarray(img_out.T.astype(np.int32))
            img_sa.convert('L').save(root_recs + '/' + img_name +
                                     '_%d_.jpg' % i)

    y_pred = model.predict(X)
    out = K.get_value(
        K.ctc_decode(y_pred,
                     input_length=np.ones(y_pred.shape[0]) *
                     y_pred.shape[1])[0][0])

    # -1 is the padding value of the dense decode result.
    for row in out:
        texts.append(u''.join([char[x] for x in row if x != -1]))
    return texts
def RecognizeSpeech(self, wavsignal, fs):
    '''
    Recognize the speech contained in one wav signal.

    The original author notes that this function still has a bug.

    Args:
        wavsignal: wav samples, passed to ``DataSpeech.GetFrequencyFeature``.
        fs: sampling rate, passed to ``DataSpeech.GetFrequencyFeature``.

    Returns:
        The first greedy CTC path as a NumPy array of symbol indices.
    '''
    # NOTE(review): the dataset path is hard-coded; presumably it should
    # come from configuration - confirm.
    data = DataSpeech('E:\\语音数据集')
    data.LoadDataList('dev')

    # Input features.
    data_input = data.GetFrequencyFeature(wavsignal, fs)
    input_length = len(data_input) // 4

    # np.float was removed in NumPy 1.24; it was an alias of float64.
    data_input = np.array(data_input, dtype=np.float64)

    in_len = np.zeros((1), dtype=np.int32)
    print(in_len.shape)
    # NOTE(review): two frames are dropped from the prediction below but the
    # length is not reduced by 2 here - confirm which one is intended.
    in_len[0] = input_length

    # The feature matrix is copied into a fixed 1600 x 200 buffer; rows past
    # len(data_input) stay zero.
    batch_size = 1
    x_in = np.zeros((batch_size, 1600, 200), dtype=np.float64)
    for i in range(batch_size):
        x_in[i, 0:len(data_input)] = data_input

    base_pred = self.base_model.predict(x=x_in)
    print('base_pred:\n', base_pred)

    # Drop the first two output frames before decoding.
    base_pred = base_pred[:, 2:, :]
    r = K.ctc_decode(base_pred, in_len, greedy=True, beam_width=64,
                     top_paths=1)
    print('r', r)

    r1 = K.get_value(r[0][0])
    print('r1', r1)
    print('r0', r[1])
    r2 = K.get_value(r[1])
    print(r2)
    print('解码完成')

    return r1
train_data_labels_=train_data_labels, reshape_=False), shuffle=False, steps_per_epoch=data_size) model.save( "/home/tatras/Desktop/github-general/cmu-deep-learning-2018/" "hw3/models/2_layer_lstm_ctc_epoch_{}".format(_)) def testing_(): # Training the data in generators test_data_raw = np.load("/home/kiriteegak/Desktop/github-general/" "cmu-deep-learning-2018/hw3/data/dev.npy") sizes = np.apply_along_axis(len, 0, test_data_raw) test_data_raw = np.apply_along_axis(np.expand_dims, 0, test_data_raw, 1) model = load_model( "/home/kiriteegak/Desktop/github-general/cmu-deep-learning-2018/" "hw3/models/2_layer_lstm_ctc_epoch_0", custom_objects={'tf': tf}) print("here") model_changed = change_network_architecture(model) return model_changed.predict(x=test_data_raw), sizes if __name__ == '__main__': test_data_labels = np.load( "/home/kiriteegak/Desktop/github-general/" "cmu-deep-learning-2018/hw3/data/dev_phonemes.npy") outputs, lengths_ = testing_() print(K.ctc_decode(outputs, lengths_, greedy=False))
import time

# time.clock() was removed in Python 3.8; use it only where it still exists.
# NOTE(review): any later elapsed-time computation must read the same clock.
_timer = time.clock if hasattr(time, "clock") else time.perf_counter
start = _timer()

# Single-image batches that are overwritten on every iteration.
X_test_1 = np.zeros((1, width1, height1, 3), dtype=np.uint8)
X_test_2 = np.zeros((1, width2, height2, 3), dtype=np.uint8)
file = codecs.open("test1.txt", "a", "utf-8")
for i in range(0, 100000):
    result = ""

    # First image of sample i, decoded with model1. The interpolation flag
    # must be passed by keyword: the third positional parameter of
    # cv2.resize is `dst`, not `interpolation`.
    X_test_1[0] = cv2.resize(cv2.imread('test/' + str(i) + '_1.png'),
                             (width1, height1),
                             interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_1 = model1.predict(X_test_1)
    # Drop the first two output frames before decoding.
    y_pred_1 = y_pred_1[:, 2:, :]
    out1 = K.get_value(
        K.ctc_decode(
            y_pred_1,
            input_length=np.ones(y_pred_1.shape[0]) * y_pred_1.shape[1],
        )[0][0])[:, :30]
    out1 = ''.join([characters[x] for x in out1[0]])
    result += out1 + ";"

    # Optional second image of sample i, also decoded with model1.
    if os.path.isfile('test/' + str(i) + '_2.png'):
        X_test_1[0] = cv2.resize(
            cv2.imread('test/' + str(i) + '_2.png'), (width1, height1),
            interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
        y_pred_1 = model1.predict(X_test_1)
        y_pred_1 = y_pred_1[:, 2:, :]
        out1 = K.get_value(
            K.ctc_decode(
                y_pred_1,
                input_length=np.ones(y_pred_1.shape[0]) * y_pred_1.shape[1],
            )[0][0])[:, :30]
        out1 = ''.join([characters[x] for x in out1[0]])
        result += out1 + ";"

    # The "_0" image is decoded with model2 and the characters2 alphabet.
    X_test_2[0] = cv2.resize(cv2.imread('test/' + str(i) + '_0.png'),
                             (width2, height2),
                             interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_2 = model2.predict(X_test_2)
    y_pred_2 = y_pred_2[:, 2:, :]
    out2 = K.get_value(
        K.ctc_decode(
            y_pred_2,
            input_length=np.ones(y_pred_2.shape[0]) * y_pred_2.shape[1],
        )[0][0])[:, :30]
    out2 = ''.join([characters2[x] for x in out2[0]])
def __keras_decode(y_pred: np.ndarray, input_lengths: np.ndarray,
                   greedy: bool, beam_width: int, top_paths: int) -> list:
    """Decode CTC predictions with the Keras backend and evaluate the paths.

    Args:
        y_pred: predictions passed to ``ctc_decode`` as ``y_pred``.
        input_lengths: passed to ``ctc_decode`` as ``input_length``.
        greedy: passed through to ``ctc_decode``.
        beam_width: passed through to ``ctc_decode``.
        top_paths: passed through to ``ctc_decode``.

    Returns:
        One evaluated result per decoded path tensor, in the order the
        backend returned them.
    """
    decoded_paths, _ = k.ctc_decode(y_pred=y_pred,
                                    input_length=input_lengths,
                                    greedy=greedy,
                                    beam_width=beam_width,
                                    top_paths=top_paths)
    evaluated = []
    for path in decoded_paths:
        evaluated.append(path.eval(session=k.get_session()))
    return evaluated
def call(self, y_pred):
    """Greedy-decode ``y_pred`` with CTC and return it reshaped to one column.

    ``self.input_length`` is flattened to one dimension and used as the
    sequence lengths; the decoder's second return value is discarded.
    """
    flat_lengths = K.reshape(self.input_length, (-1, ))
    decode_result = K.ctc_decode(y_pred, flat_lengths, greedy=True)
    top_k_decoded, _ = decode_result
    return K.reshape(top_k_decoded, (-1, 1))
def ctc_pred(model, x, batch_size, input_len,):
    """Run ``model`` on ``x`` and return the greedy CTC decoding.

    Args:
        model: object whose ``predict`` accepts ``x`` and ``batch_size``.
        x: model input.
        batch_size: forwarded to ``model.predict``.
        input_len: a single sequence length, repeated for every prediction.

    Returns:
        The value of the first decoded path.
    """
    predictions = model.predict(x, batch_size=batch_size)
    # Every sample is decoded with the same length.
    lengths = K.constant([input_len] * len(predictions), dtype="int32")
    decoded_paths, _ = K.ctc_decode(predictions,
                                    lengths,
                                    greedy=True,
                                    beam_width=100,
                                    top_paths=1)
    return K.get_value(decoded_paths[0])
sample_weight=sample_weight[i:i + batch_size]) total_ctcloss += ctcloss * inputs_train["the_input"].shape[0] * 1. loss_train[epoch] = total_ctcloss / X_train.shape[0] inputs_train = { 'the_input': X_train, 'the_labels': y_train, 'input_length': np.sum(X_train_mask, axis=1, dtype=np.int32), 'label_length': np.squeeze(y_train_mask), } outputs_train = {'ctc': np.zeros([y_train.shape[0]])} preds = test_func([inputs_train["the_input"]])[0] decode_function = K.ctc_decode(preds[:, 2:, :], inputs_train["input_length"] - 2, greedy=False, top_paths=1) labellings = decode_function[0][0].eval(session=sess) # print labellings, len(labellings), len(labellings[0]), shape(labellings) if labellings.shape[1] == 0: ua_train[epoch] = 0.0 wa_train[epoch] = 0.0 else: ua_train[epoch] = unweighted_accuracy(y_train.ravel(), labellings.T[0].ravel()) wa_train[epoch] = weighted_accuracy(y_train.ravel(), labellings.T[0].ravel()) inputs_test = { 'the_input': X_test,
optimizer=sgd) batch, lab, input_len, lab_len = tt.get_batch() size_training_set = int(.8 * len(batch)) print('The training set is of size {}\n'.format(size_training_set)) [x_train, x_test] = np.split(batch, [size_training_set]) [y_train, y_test] = np.split(lab, [size_training_set]) [input_len_train, input_len_test] = np.split(input_len, [size_training_set]) [lab_len_train, lab_len_test] = np.split(lab_len, [size_training_set]) model.fit([x_train, y_train, input_len_train, lab_len_train], [y_train, x_train], batch_size=100, epochs=1) score = model.evaluate([x_test, y_test, input_len_test, lab_len_test], [y_test, x_test]) print('The final score is {}'.format(score)) batch, lab, input_len, lab_len = tt.get_sound_examples('examples') out = K.ctc_decode( model.predict([batch, lab, input_len, lab_len])[1], input_len) E = K.eval(out[0][0]) for k in range(len(E)): print(tt.int_list_to_text(E[k]))
model.output_length = lambda x: x print(model.summary()) return model model = bidirectional_rnn_model( input_dim=161, # change to 13 if you would like to use MFCC features units=512 + 32) print('load Model') model.load_weights('results/model_20.h5') data_gen = AudioGenerator() print("Load file") audio_path = 'output.wav' data_point = data_gen.normalize(data_gen.featurize(audio_path)) print("Start prediction") #input_to_softmax.load_weights(model_path) prediction = model.predict(np.expand_dims(data_point, axis=0), batch_size=1) output_length = [model.output_length(data_point.shape[0])] pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) + 1).flatten().tolist() print(prediction) print(output_length) print(pred_ints) print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
def __init__(self, learning_rate=0.001):
    """Build the recognition graph and the backend train/predict functions.

    Args:
        learning_rate: learning rate handed to the SGD optimizer.

    Attributes created here include ``self.model`` (image -> per-step
    logits), ``self.out`` (logits function), ``self.train_step`` (one
    optimizer update, returning loss and logits) and ``self.predict_step``
    (greedy CTC decoding).
    """
    # Fixed architecture hyper-parameters.
    conv_filters = 16
    kernel_size = (3, 3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512
    img_h = 32
    act = 'relu'

    # Scalar placeholder; width // 4 is handed to self.res below.
    self.width = K.placeholder(name='width', ndim=0, dtype='int32')
    # The second-to-last input axis is fixed to img_h; the first is None.
    self.input_data = Input(name='the_input', shape=(None, img_h, 1), dtype='float32')

    # Two conv + max-pool stages.
    self.inner = Conv2D(conv_filters, kernel_size, padding='same',
                        activation=act, kernel_initializer='he_normal',
                        name='conv1')(self.input_data)
    self.inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(self.inner)
    self.inner = Conv2D(conv_filters, kernel_size, padding='same',
                        activation=act, kernel_initializer='he_normal',
                        name='conv2')(self.inner)
    self.inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(self.inner)

    # self.res is defined elsewhere in the class; it receives the size of
    # the merged feature axis and the pooled width.
    # NOTE(review): presumably it reshapes the feature map into a
    # sequence - confirm against self.res.
    self.inner = Lambda(self.res, arguments={"last_dim": (img_h // (pool_size ** 2)) * conv_filters \
                                             , "width": self.width // 4})(self.inner)

    # cuts down input size going into RNN:
    self.inp = Dense(time_dense_size, activation=act, name='dense1')(self.inner)
    self.batch_norm = keras.layers.normalization.BatchNormalization()(
        self.inp)

    # Two stacked bidirectional GRUs: the first sums both directions, the
    # second concatenates them.
    self.gru_1 = Bidirectional(GRU(rnn_size, return_sequences=True,
                                   kernel_initializer='he_normal',
                                   name='gru1'),
                               merge_mode="sum")(self.batch_norm)
    self.gru_2 = Bidirectional(GRU(rnn_size, return_sequences=True,
                                   kernel_initializer='he_normal',
                                   name='gru2'),
                               merge_mode="concat")(self.gru_1)

    # Per-step linear output with 63 units; softmax is applied only for
    # decoding further down.
    self.y_pred = TimeDistributed(
        Dense(63, kernel_initializer='he_normal', name='dense2',
              activation='linear'))(self.gru_2)
    self.model = Model(inputs=self.input_data, outputs=self.y_pred)
    self.model.summary()

    # Backend function returning the raw per-step outputs.
    self.out = K.function(
        [self.input_data, self.width, K.learning_phase()], [self.y_pred])

    # Placeholders for the CTC loss.
    self.y_true = K.placeholder(name='y_true', ndim=1, dtype='int32')
    self.input_length = K.placeholder(name='input_length', ndim=1, dtype='int32')
    self.label_length = K.placeholder(name='label_length', ndim=1, dtype='int32')

    # warp-ctc is fed the outputs with their first two axes swapped
    # (perm=[1, 0, 2]); the loss is averaged with K.mean.
    self.loss_out = K.mean(
        warpctc_tensorflow.ctc(tf.transpose(self.y_pred, perm=[1, 0, 2]),
                               self.y_true,
                               self.label_length, self.input_length))

    # self.optimizer = keras.optimizers.Adam(lr = learning_rate)
    self.optimizer = keras.optimizers.SGD(lr=learning_rate, decay=1e-6,
                                          momentum=0.9, nesterov=True,
                                          clipnorm=200)
    self.update = self.optimizer.get_updates(self.model.trainable_weights,
                                             [], loss=self.loss_out)

    # Greedy CTC decoding of the softmax-normalized outputs (the third
    # positional argument of ctc_decode is `greedy`).
    self.network_output = K.ctc_decode(
        Activation('softmax')(self.y_pred), self.input_length, True)[0][0]

    # One training step: returns the loss and raw outputs and applies the
    # optimizer updates.
    self.train_step = K.function([self.input_data, self.width, self.y_true,
                                  self.input_length, self.label_length,
                                  K.learning_phase()], \
                                 [self.loss_out, self.y_pred], updates = self.update)
    # Per-step argmax of the raw outputs.
    self.test = K.argmax(self.y_pred, axis=2)
    # Inference step returning the decoded sequences.
    self.predict_step = K.function([
        self.input_data, self.width, self.input_length, K.learning_phase()
    ], [self.network_output])
def ctc_accuracy(y_true, y_pred, max_len=MAX_LEN):
    """Per-sample exact-match indicator between labels and the CTC decoding.

    ``y_true`` carries the sequence length in column 0 and the labels from
    column 2 onwards. A sample scores 1.0 when the number of positions where
    label and decoded output agree equals ``max_len``, otherwise 0.0.
    """
    labels = y_true[:, 2:]
    input_length = y_true[:, 0]
    decode_result = K.ctc_decode(y_pred, input_length)
    best_path = decode_result[0][0]
    position_hits = K.cast(K.equal(labels, best_path), dtype='float')
    hits_per_sample = K.sum(position_hits, axis=-1)
    all_match = K.equal(hits_per_sample, max_len)
    return K.cast(all_match, dtype='float')
def evaluate2(self, ltm_images_ph, tcng, sess):
    """Compute the mean label error rate (LER) over the samples in ``self.db``.

    At most the first 40001 keys are evaluated. For every sample the image is
    read as grayscale, padded to ``self.hl`` x ``self.wl``, duplicated to form
    a batch of two, run through ``tcng.fc_2`` and greedily CTC-decoded. The
    LER returned by ``compare1`` is accumulated overall and per bank (the
    part of the key after the last '_' and before the first '.').

    Args:
        ltm_images_ph: placeholder fed with the output of
            ``ltm_img_processor``.
        tcng: network object exposing ``fc_2``, ``images_ph``,
            ``heights_ph`` and ``widths_ph``.
        sess: session used to run ``tcng.fc_2``.

    Returns:
        Mean LER over the samples that were actually evaluated, or 0.0 when
        there were none. (Previously the sum was divided by ``len(keys)``
        even when the 40000 cap stopped the loop early.)

    Raises:
        Exception: if an image cannot be read or is larger than
            ``self.hl`` x ``self.wl``.
    """
    db = self.db
    keys = list(db.keys())
    ler_dic = {}
    tler = 0.0
    processed = 0
    for idx, key in enumerate(keys):
        if idx > 40000:
            break
        record = db[key]
        bnk = key.split('/')[-1].split('_')[-1].split('.')[0]
        ler_dic.setdefault(bnk, [])

        image = cv2.imread(record[3], 0)
        if image is None:
            # cv2.imread signals failure by returning None.
            raise Exception("could not read image " + str(record[3]))
        org_shape = image.shape
        if org_shape[0] > self.hl or org_shape[1] > self.wl:
            # str() around the shape tuple: str + tuple raises TypeError.
            raise Exception("height or width is bigger than " +
                            str(self.hl) + " x " + str(self.wl) + " " +
                            str(org_shape))

        # Pad at the bottom and right up to the fixed network input size.
        add_to_bottom = int(self.hl - org_shape[0])
        add_to_right = int(self.wl - org_shape[1])
        padded_image = cv2.copyMakeBorder(image, 0, add_to_bottom, 0,
                                          add_to_right, cv2.BORDER_CONSTANT,
                                          0)
        padded_image = np.array(
            padded_image.reshape(1, self.hl, self.wl, 1))

        # record[2] is a '-'-separated list of integers, sorted and grouped
        # in threes.
        ls = np.array(
            sorted([int(line) for line in record[2].split('-')
                    ])).reshape(-1, 3)
        height = np.array(org_shape[0]).reshape(-1, 1)
        width = np.array(org_shape[1]).reshape(-1, 1)
        label, _ = self.label_processor(record[0])
        label = np.array(label)

        # The sample is duplicated so the network is fed a batch of two.
        image = np.concatenate([padded_image, padded_image], axis=0)
        height = np.concatenate([height, height], axis=0)
        width = np.concatenate([width, width], axis=0)
        ls = np.concatenate([ls, ls], axis=0)

        ltm_images, _ = ltm_img_processor(image, height, width, ls,
                                          double=False)
        y_pred = sess.run(
            [tcng.fc_2],
            feed_dict={
                ltm_images_ph: ltm_images,
                tcng.images_ph: image,
                tcng.heights_ph: height,
                tcng.widths_ph: width
            })
        # Drop the first two output frames before decoding.
        trimmed = y_pred[0][:, 2:, :]
        shape = trimmed.shape
        ctc_decode = bknd.ctc_decode(trimmed,
                                     input_length=np.ones(shape[0]) *
                                     shape[1])[0][0]
        out = bknd.get_value(ctc_decode)[:, :self.maxL]

        ler = compare1(out, label, self.Ivoc, show=2)
        ler_dic[bnk].append(float(ler))
        tler += ler
        processed += 1
        logging.debug("processed %i out of %i", idx, len(keys))

    for bnk in list(ler_dic.keys()):
        ler_dic[bnk] = np.mean(ler_dic[bnk])
        logging.info("ler for bank %i is %f", int(bnk), ler_dic[bnk])

    if not processed:
        return 0.0
    return tler / processed
def Predict(self, batch_size, data_input, in_len):
    '''
    Predict the pinyin symbols spoken in one utterance.

    Args:
        batch_size: ignored; decoding is written for a single utterance and
            the value is forced to 1.
        data_input: feature matrix with at most 1600 rows of 200 values.
        in_len: sequence length of the utterance (a scalar or a one-element
            array). Two frames are subtracted because the first two output
            frames are dropped before decoding.

    Returns:
        List of pinyin symbols of the greedy CTC decoding.
    '''
    batch_size = 1

    # Keep the caller's length: the previous code overwrote the `in_len`
    # argument with a zero array and then decoded with a length of -2.
    decode_len = np.zeros((batch_size), dtype=np.int32)
    print(decode_len.shape)
    decode_len[0] = int(np.ravel(in_len)[0]) - 2

    # np.float was removed in NumPy 1.24; it was an alias of float64.
    x_in = np.zeros((batch_size, 1600, 200), dtype=np.float64)
    for i in range(batch_size):
        x_in[i, 0:len(data_input)] = data_input

    base_pred = self.base_model.predict(x=x_in)
    print('base_pred:\n', base_pred)
    print('base_pred0:\n', base_pred[0][0].shape)

    # Drop the first two output frames before decoding.
    base_pred = base_pred[:, 2:, :]
    r = K.ctc_decode(base_pred, decode_len, greedy=True, beam_width=100,
                     top_paths=1)
    print('r', r)

    r1 = K.get_value(r[0][0])
    print('r1', r1)
    print('r0', r[1])
    r2 = K.get_value(r[1])
    print(r2)
    print('解码完成')

    # Map the symbol indices of the only sequence to pinyin symbols.
    list_symbol_dic = GetSymbolList(self.datapath)
    return [list_symbol_dic[i] for i in r1[0]]
def RecognizeSpeech(self, wavsignal, fs):
    '''
    Debugging variant of the speech recognition entry point.

    Extracts features from one wav signal, pairs them with a hard-coded
    pinyin label sequence, runs the training model and the test function on a
    generated batch of two, and prints intermediate statistics.
    The original author notes that this function still has a bug.

    Returns ``pred[0][0]``, the first element of the training model's
    prediction.
    '''
    # NOTE(review): the nesting of the loops below was reconstructed from a
    # flattened source - confirm against the original file.
    #data = self.data
    data = DataSpeech('E:\\语音数据集')
    data.LoadDataList('dev')
    # Input features.
    #data_input = data.GetMfccFeature(wavsignal, fs)
    data_input = data.GetFrequencyFeature(wavsignal, fs)
    list_symbol_dic = data.list_symbol  # pinyin symbol list (unused below)
    # Hard-coded reference transcription used to build the training batch.
    labels = [
        'dong1', 'bei3', 'jun1', 'de5', 'yi4', 'xie1', 'ai4', 'guo2',
        'jiang4', 'shi4', 'ma3', 'zhan4', 'shan1', 'li3', 'du4', 'tang2',
        'ju4', 'wu3', 'su1', 'bing3', 'ai4', 'deng4', 'tie3', 'mei2',
        'deng3', 'ye3', 'fen4', 'qi3', 'kang4', 'zhan4'
    ]
    # Convert every non-empty symbol to its numeric id.
    feat_out = []
    for i in labels:
        if ('' != i):
            n = data.SymbolToNum(i)
            feat_out.append(n)
    print(feat_out)
    labels = feat_out
    # Take one batch (batch_size=2) from the data generator.
    x = next(
        self.data_gen(data_input=np.array(data_input),
                      data_labels=np.array(labels),
                      input_length=len(data_input),
                      labels_length=len(labels),
                      batch_size=2))
    [test_input_data, y, test_input_length, label_length], labels = x
    xx = [test_input_data, y, test_input_length, label_length]
    pred = self._model.predict(x=xx)
    print(pred)
    shape = pred[:, :].shape
    print(shape)
    y_p = self.test_func([test_input_data])
    print(type(y_p))
    print('y_p:', y_p)
    # Print statistics for the first 200 frames of the first sample.
    for j in range(0, 200):
        mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
        print('max y_p:', max(y_p[0][0][j]), 'min y_p:', min(y_p[0][0][j]),
              'mean y_p:', mean, 'mid y_p:', y_p[0][0][j][100])
        print('argmin:', np.argmin(y_p[0][0][j]), 'argmax:',
              np.argmax(y_p[0][0][j]))
        # Number of values below the frame mean.
        count = 0
        for i in y_p[0][0][j]:
            if (i < mean):
                count += 1
        print('count:', count)
    print(K.is_sparse(y_p))
    y_p = K.to_dense(y_p)
    print(K.is_sparse(y_p))
    #y_p = tf.sparse_to_dense(y_p,(2,397),1417,0)
    print(test_input_length.T)
    # Reshape to (2, 1) to match the batch of two requested above.
    test_input_length = test_input_length.reshape(2, 1)
    func_in_len = self.test_func_input_length([test_input_length])
    print(type(func_in_len))
    #in_len = np.ones(shape[0]) * shape[1]
    # The decode result is only printed; it does not affect the return value.
    ctc_decoded = K.ctc_decode(y_p, input_length=func_in_len)
    print(ctc_decoded)
    #ctc_decoded = ctc_decoded[0][0]
    #out = K.get_value(ctc_decoded)[:,:64]
    return pred[0][0]
    pass
# As our model predicts the probability for each class at each time step, we need to use some transcription function to convert it into actual texts. Here we will use the CTC decoder to get the output text. Let’s see the code: # In[2]: # load the saved best model weights act_model.load_weights('best_model.hdf5') num_val = 15000 # predict outputs on validation images prediction = act_model.predict(valid_img[:num_val]) valid_img = np.array(valid_img) # use CTC decoder out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1], greedy=True)[0][0]) #print(out) out_pred = '' counter = 0 # see the results i = 0 for x in out: print("original_text = ", valid_orig_txt[i]) print("predicted text = ", end = '') for p in x: if int(p) != -1: c = char_list[int(p)] print(char_list[int(p)], end = '') out_pred= out_pred + c if valid_orig_txt[i] == out_pred:
def RecognizeSpeech(self, wavsignal, fs):
    '''
    Recognize the speech contained in one wav signal.

    The original author notes that this function still has a bug.

    Args:
        wavsignal: wav samples, passed to ``GetFrequencyFeature``.
        fs: sampling rate, passed to ``GetFrequencyFeature``.

    Returns:
        List of pinyin symbols of the greedy CTC decoding.
    '''
    # Input features.
    data_input = GetFrequencyFeature(wavsignal, fs)
    input_length = len(data_input) // 4

    # np.float was removed in NumPy 1.24; it was an alias of float64.
    data_input = np.array(data_input, dtype=np.float64)

    in_len = np.zeros((1), dtype=np.int32)
    print(in_len.shape)
    # Two output frames are dropped before decoding, hence the -2.
    in_len[0] = input_length - 2

    # The feature matrix is copied into a fixed 1600 x 200 buffer; rows past
    # len(data_input) stay zero.
    batch_size = 1
    x_in = np.zeros((batch_size, 1600, 200), dtype=np.float64)
    for i in range(batch_size):
        x_in[i, 0:len(data_input)] = data_input

    base_pred = self.base_model.predict(x=x_in)
    print('base_pred:\n', base_pred)
    y_p = base_pred
    print('base_pred0:\n', base_pred[0][0].shape)

    # Debug output: statistics of the first 200 output frames.
    for j in range(200):
        frame = y_p[0][j]
        mean = np.sum(frame) / frame.shape[0]
        print('max y_p:', np.max(frame), 'min y_p:', np.min(frame),
              'mean y_p:', mean, 'mid y_p:', frame[100])
        print('argmin:', np.argmin(frame), 'argmax:', np.argmax(frame))
        # Number of values below the frame mean.
        count = int(np.count_nonzero(frame < mean))
        print('count:', count)

    # Drop the first two output frames before decoding.
    base_pred = base_pred[:, 2:, :]
    r = K.ctc_decode(base_pred, in_len, greedy=True, beam_width=100,
                     top_paths=1)
    print('r', r)

    r1 = K.get_value(r[0][0])
    print('r1', r1)
    print('r0', r[1])
    r2 = K.get_value(r[1])
    print(r2)
    print('解码完成')

    # Map the symbol indices of the only sequence to pinyin symbols.
    list_symbol_dic = GetSymbolList(self.datapath)
    return [list_symbol_dic[i] for i in r1[0]]
# Optionally render the model architecture to model.png.
if (opts.printmodel):
    plot_model(model, to_file="model.png", show_shapes=True)
    Image('model.png')

if (opts.testing == False):
    # Training: early stopping plus the `evaluator` callback, validated on
    # batches produced by gen().
    model.fit_generator(gen(opts.batch_size), steps_per_epoch=opts.steps, epochs=opts.epochs,
                        callbacks=[EarlyStopping(patience=10), evaluator],
                        validation_data=gen(), validation_steps=1280)
else:
    # Testing: decode a single generated sample and print it next to the
    # ground truth.
    print("testing......")
    characters2 = characters + ' '
    [X_test, y_test, _, _], _ = next(gen(1))
    #cv2.imwrite("./save_image/test.jpg" , X_test)
    y_pred = base_model.predict(X_test)
    # Drop the first two output frames before decoding.
    y_pred = y_pred[:,2:,:]
    # Greedy CTC decode, keeping at most 7 symbols.
    out = K.get_value(K.ctc_decode(y_pred, input_length=np.ones(y_pred.shape[0])*y_pred.shape[1], )[0][0])[:, :7]
    out = ''.join([characters[x] for x in out[0]])
    y_true = ''.join([characters[x] for x in y_test[0]])
    print(out)
    print(y_true)

# Save the models: under a timestamped name after training without an
# explicit model name, under opts.modelname otherwise; nothing is saved in
# testing mode.
# NOTE(review): the timestamp contains ':' which is not a valid file-name
# character on Windows - confirm the target platform.
if(opts.modelname == None and opts.testing == False):
    run_name = datetime.datetime.now().strftime('%Y:%m:%d:%H:%M:%S')
    model.save(run_name+".h5")
    base_model.save("base_"+run_name+".h5")
elif(opts.testing ==True):
    print("Please input testing model name")
else:
    model.save(opts.modelname)
    base_model.save("base_"+opts.modelname)
del model
def _dft_ctc_decode(y_pred, input_length, beam_width=100):
    """Beam-search CTC decoding of softmax-normalized ``y_pred``.

    Currently disabled: the leading assertion fails with "fixme" before any
    decoding happens. Past the assertion, the function returns the best path
    of a beam search of width ``beam_width`` over the flattened
    ``input_length``.
    """
    assert False, "fixme"
    probabilities = K.softmax(y_pred)
    flat_lengths = K.flatten(input_length)
    decoded_paths, _ = K.ctc_decode(probabilities,
                                    flat_lengths,
                                    beam_width=beam_width,
                                    greedy=False,
                                    top_paths=1)
    return decoded_paths[0]
def predict(wavs):
    """Transcribe the audio file at ``wavs`` with the bundled ASR model.

    The model (``asr_video_enhance_2.h5``) and the dictionary pickle
    (``dictionary_video_enhance_2.pkl``) are loaded from this module's
    directory on every call. The audio is trimmed to the span whose RMS
    energy is at least one fifth of the maximum, converted to 13 MFCCs,
    normalized with the stored mean/std, run through the model and decoded
    with a CTC beam search of width 10.

    Args:
        wavs: path of the audio file, passed to ``librosa.load``.

    Returns:
        The decoded text.
    """
    # NOTE(review): statement nesting under the `with` blocks was
    # reconstructed from a flattened source - confirm against the original.
    a = join(dirname(__file__), 'asr_video_enhance_2.h5')
    print(type(a))
    # The model is loaded and later run inside the same graph and session.
    graph = tf.compat.v1.get_default_graph()
    session = tf.compat.v1.Session()
    with graph.as_default():
        with session.as_default():
            model = load_model(join(dirname(__file__), 'asr_video_enhance_2.h5'))
    # The pickle holds [<unused>, id2char, mfcc_mean, mfcc_std].
    # NOTE(review): pickle.load must only be used on trusted files; this one
    # is read from the module's own directory.
    pk = join(dirname(__file__), 'dictionary_video_enhance_2.pkl')
    with open(pk, 'rb') as fr:
        [_, id2char, mfcc_mean, mfcc_std] = pickle.load(fr)
    mfcc_dim = 13
    print(wavs)
    audio, sr = librosa.load(wavs)
    # Keep only the span between the first and last frame whose RMS energy
    # reaches one fifth of the maximum; an empty slice if there is none.
    energy = librosa.feature.rms(audio)
    frames = np.nonzero(energy >= np.max(energy) / 5)
    indices = librosa.core.frames_to_samples(frames)[1]
    audio = audio[indices[0]:indices[-1]] if indices.size else audio[0:0]
    X_data = mfcc(audio, sr, numcep=mfcc_dim, nfft=551)
    # Normalize with the stored statistics; the epsilon avoids a division
    # by zero.
    X_data = (X_data - mfcc_mean) / (mfcc_std + 1e-14)
    # print(X_data.shape)
    tf.compat.v1.reset_default_graph()
    with graph.as_default():
        with session.as_default():
            pred = model.predict(np.expand_dims(X_data, axis=0))
    # Beam-search decode of the single utterance using its own frame count
    # as the sequence length.
    pred_ids = K.eval(K.ctc_decode(pred, [X_data.shape[0]], greedy=False, beam_width=10, top_paths=1)[0][0])
    pred_ids = pred_ids.flatten().tolist()
    text = ''.join([id2char[i] for i in pred_ids])
    print(text)
    return text
batch_num = 1#264 batch_acc = 0 true_acc = 0 st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S') print(st) print(datetime.datetime.now()) for i in range(batch_num): # [X_test, y_test, _, _], _ = next(generator) print(X_test[i]) y_pred = sess.run(y, feed_dict={ x:X_test[i][np.newaxis, :] }) shape = y_pred[:, 2:, :].shape out = K.get_value(K.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :8] # if out.shape[1] == 8: # batch_acc += ((y_test[i] == out).sum(axis=1) == 8).mean() # argmax = np.argmax(y_pred, axis=2)[0] out = ''.join([characters[x] for x in out[0]]).replace(' ', '') y_true = ''.join([characters[x] for x in y_test[i]]).replace(' ', '') if out == y_true: true_acc += 1 """ else: print(out) print(y_true) print("-----------") """ # print(true_acc / batch_num*100)
def ctc_decode(softmax):
    """Decode a batch of softmax outputs with ``K.ctc_decode``.

    Every sequence is declared to use all time steps: the length vector is
    the size of axis 1 repeated once per entry of axis 0.

    Args:
        softmax: Backend tensor of per-step class probabilities.

    Returns:
        The first element of ``K.ctc_decode``'s result (the decoded
        sequences); the second element is discarded.
    """
    dims = K.shape(softmax)
    batch_size = dims[0]
    time_steps = dims[1]
    full_lengths = K.tile([time_steps], [batch_size])
    result = K.ctc_decode(softmax, full_lengths)
    return result[0]
mat_ori = np.zeros( (height, width - int(31.0 / img_size[0] * img_size[1]), 3), dtype=np.uint8) out_img = np.concatenate([img_reshape, mat_ori], axis=1).transpose([1, 0, 2]) else: out_img = cv2.resize(img, (width, height), interpolation=cv2.INTER_CUBIC) out_img = np.asarray(out_img).transpose([1, 0, 2]) img_list[ii] = np.asarray(out_img) ii += 1 model = load_model('PATH_TO_WEIGHT_FILE') ''' if you want to load model with STN, please use model = load_model('PATH_TO_WEIGHT_FILE', custom_objects={'SpatialTransformer': SpatialTransformer})''' y_pred = model.predict(img_list) shape = y_pred[:, 2:, :].shape ctc_decode = bknd.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0] out = bknd.get_value(ctc_decode)[:, :label_len] out_list = [] for m in range(len(fileList)): result_str = ''.join([characters[k] for k in out[m]]) out_list.append(result_str) print(out_list)
def ctc_decode(pred):
    """Beam-search decode ``pred`` and print the best path.

    Each sequence is treated as spanning every time step of ``pred``.
    Nothing is returned; the decoded object is only printed.

    Args:
        pred: Array-like with a ``shape`` whose first two entries are the
            batch size and the number of time steps.
    """
    batch_size = pred.shape[0]
    time_steps = pred.shape[1]
    full_lengths = np.ones(batch_size) * time_steps
    decoded = K.ctc_decode(
        pred,
        input_length=full_lengths,
        greedy=False,
        beam_width=10,
    )
    c = decoded[0][0]
    print(c)
}, optimizer=Adam(lr=0.0001)) model_final.fit( x=[train_x, train_y, train_input_len, train_label_len], y=train_output, validation_data=([valid_x, valid_y, valid_input_len, valid_label_len], valid_output), epochs=60, batch_size=128) #Check model performance on validation set preds = model.predict(valid_x) decoded = K.get_value( K.ctc_decode(preds, input_length=np.ones(preds.shape[0]) * preds.shape[1], greedy=True)[0][0]) prediction = [] for i in range(valid_size): prediction.append(num_to_label(decoded[i])) y_true = validation_written_df.loc[0:valid_size, 'IDENTITY'] correct_char = 0 total_char = 0 correct = 0 for i in range(valid_size): pr = prediction[i] tr = y_true[i] total_char += len(tr)
print("predicting for:" + pathAndFilename)

# Load the sample through the project's preprocessing helper (only its first
# return value is used), scale it by 1/255 and add a leading batch axis.
img, _, _, _ = process_data(pathAndFilename, "1_1")
img = np.expand_dims(img / 255., axis=0)

prediction = act_model.predict(img)

# Beam-search CTC decoding; each sequence is declared to use all time steps.
full_lengths = np.ones(prediction.shape[0]) * prediction.shape[1]
decoded = K.ctc_decode(prediction, input_length=full_lengths, greedy=False)
out = K.get_value(decoded[0][0])

# The second '_'-separated field of the file name is printed next to each
# prediction (presumably the ground-truth text -- confirm with the dataset).
head, tail = ntpath.split(pathAndFilename)
txt = tail.split('_')[1]

# Show at most the first 10 decoded labels per row, skipping -1 entries.
i = 0
le = min(10, out.shape[1])
print(out.shape)
for x in out:
    print(txt)
    for label in (int(value) for value in x[:le]):
        if label == -1:
            continue
        print(char_list[label], end='')
    print('\n')
    i += 1
def _print_colored_prediction(split_true, split_pred):
    """Print predicted tokens, green when they match a nearby true token, red otherwise."""
    green = "\033[1;32m"
    red = "\033[1;31m"
    reset = "\033[0m"
    last = len(split_true) - 1

    # str.split(" ") always yields at least one item, so index 0 is safe.
    print(green + split_pred[0] + " ", end='')

    # Interior tokens: compare against the true token at the same position
    # and its two neighbours. The upper bound also respects len(split_pred)
    # so a prediction shorter than the reference cannot raise IndexError.
    for i in range(1, min(last, len(split_pred))):
        nearby = split_true[i - 1:i + 2]
        colour = green if split_pred[i] in nearby else red
        print(colour + split_pred[i] + " ", end='')

    # The token aligned with the final true token carries no colour code of
    # its own. It is skipped when it would repeat token 0 or does not exist.
    if 0 < last < len(split_pred):
        print(split_pred[last] + " ", end='')

    # Restore the terminal colour so later output is not tinted.
    print(reset, end='')


def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        phn (bool): Use the phoneme-level texts/audio paths when true,
            the word-level ones otherwise
    Raises:
        ValueError: If partition is neither 'train' nor 'test'
    """
    # Reject a bad partition before any data is loaded.
    if partition not in ('train', 'test'):
        raise ValueError('Invalid partition!  Must be "train" or "test"')

    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            texts = data_gen.test_phn_texts
            audio_paths = data_gen.test_phn_audio_paths
        else:
            texts = data_gen.test_wrd_texts
            audio_paths = data_gen.test_wrd_audio_paths
    else:
        if phn:
            texts = data_gen.train_phn_texts
            audio_paths = data_gen.train_phn_audio_paths
        else:
            texts = data_gen.train_wrd_texts
            audio_paths = data_gen.train_wrd_audio_paths
    transcr = texts[index]
    audio_path = audio_paths[index]
    data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    # Decoded ids are shifted by one before being mapped back to text.
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) + 1).flatten().tolist()
    predicted_text = ''.join(int_sequence_to_text(pred_ints, phn))

    split_true = transcr.split(" ")
    split_pred = predicted_text.split(" ")

    # display the true and predicted transcriptions
    print('-' * 80)
    # NOTE(review): the Audio object is created but never displayed; in a
    # notebook this probably needs an explicit display(...) call.
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    if not phn:
        print('Predicted transcription:\n' + '\n' + predicted_text)
        print('-' * 80)
    else:
        print('Predicted transcription:\n' + '\n')
        _print_colored_prediction(split_true, split_pred)

    displayAccuracy(split_true, split_pred, phn)
# print(np.shape(X)) X = np.transpose(X, (0, 2, 3, 1)) X = np.array(X) Y = np.array(Y) return X,Y # the actual loss calc occurs here despite it not being # an internal Keras loss function def ctc_lambda_func(args): y_pred, labels, input_length, label_length = args # the 2 is critical here since the first couple outputs of the RNN # tend to be garbage: # y_pred = y_pred[:, 2:, :] 测试感觉没影响 y_pred = y_pred[:, :, :] return K.ctc_batch_cost(labels, y_pred, input_length, label_length) if __name__ == '__main__': height=150 width=50 input_tensor = Input((height, width, 1)) x = input_tensor for i in range(3): x = Convolution2D(32*2**i, (3, 3), activation='relu', padding='same')(x) # x = Convolution2D(32*2**i, (3, 3), activation='relu')(x) x = MaxPooling2D(pool_size=(2, 2))(x) conv_shape = x.get_shape() # print(conv_shape) x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2] * conv_shape[3])))(x) x = Dense(32, activation='relu')(x) gru_1 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru1')(x) gru_1b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(x) gru1_merged = add([gru_1, gru_1b]) ################### gru_2 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged) gru_2b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')( gru1_merged) x = concatenate([gru_2, gru_2b]) ###################### x = Dropout(0.25)(x) x = Dense(label_count, kernel_initializer='he_normal', activation='softmax')(x) base_model = Model(inputs=input_tensor, outputs=x) labels = Input(name='the_labels', shape=[seq_len], dtype='float32') input_length = Input(name='input_length', shape=[1], dtype='int64') label_length = Input(name='label_length', shape=[1], dtype='int64') loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([x, labels, input_length, label_length]) model = Model(inputs=[input_tensor, labels, input_length, 
label_length], outputs=[loss_out]) model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adadelta') model.summary() def test(base_model): file_list = [] X, Y = gen_image_data(r'data\test', file_list) y_pred = base_model.predict(X) shape = y_pred[:, :, :].shape # 2: out = K.get_value(K.ctc_decode(y_pred[:, :, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :seq_len] # 2: print() error_count=0 for i in range(len(X)): print(file_list[i]) str_src = str(os.path.split(file_list[i])[-1]).split('.')[0].split('_')[-1] print(out[i]) str_out = ''.join([str(x) for x in out[i] if x!=-1 ]) print(str_src, str_out) if str_src!=str_out: error_count+=1 print('################################',error_count) # img = cv2.imread(file_list[i]) # cv2.imshow('image', img) # cv2.waitKey() class LossHistory(Callback): def on_train_begin(self, logs={}): self.losses = [] def on_epoch_end(self, epoch, logs=None): model.save_weights('model_1018.w') base_model.save_weights('base_model_1018.w') test(base_model) def on_batch_end(self, batch, logs={}): self.losses.append(logs.get('loss')) # checkpointer = ModelCheckpoint(filepath="keras_seq2seq_1018.hdf5", verbose=1, save_best_only=True, ) history = LossHistory() # base_model.load_weights('base_model_1018.w') # model.load_weights('model_1018.w') X,Y=gen_image_data() maxin=4900 subseq_size = 100 batch_size=10 result=model.fit([X[:maxin], Y[:maxin], np.array(np.ones(len(X))*int(conv_shape[1]))[:maxin], np.array(np.ones(len(X))*seq_len)[:maxin]], Y[:maxin], batch_size=20, epochs=1000, callbacks=[history, plotter, EarlyStopping(patience=10)], #checkpointer, history,
def predict(self, X):
    """Run the wrapped model on ``X`` and CTC-decode its output.

    Every sequence is treated as spanning all time steps of the model
    output.

    Args:
        X: Input batch forwarded unchanged to ``self.model.predict``.

    Returns:
        The evaluated first decoded path from ``K.ctc_decode``.
    """
    probabilities = self.model.predict(X)
    n_samples = probabilities.shape[0]
    n_steps = probabilities.shape[1]
    full_lengths = np.ones(n_samples) * n_steps
    decoded = K.ctc_decode(probabilities, full_lengths)
    best_path = decoded[0][0]
    return K.eval(best_path)