def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    """Load a video from disk, run LipNet on it and decode the prediction.

    Args:
        weight_path: Path of the weights file given to ``model.load_weights``.
        video_path: If this is an existing regular file it is loaded with
            ``Video.from_video``; otherwise it is handed to
            ``Video.from_frames``.
        absolute_max_string_len: Forwarded unchanged to ``LipNet``.
        output_size: Forwarded unchanged to ``LipNet``.

    Returns:
        A ``(video, result)`` tuple: the loaded ``Video`` object and the
        first element of the decoder output.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    # The axis order of video.data depends on the backend image data format.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # The 'ctc' loss simply passes y_pred through; the model is compiled
    # before the weights are loaded.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    # Add a leading batch axis of size 1 and divide pixel values by 255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    # Use the frame count rather than len(video.data): len() returns the
    # first axis, which under 'channels_first' is img_c (see the unpacking
    # above), not the number of frames. Under 'channels_last' both agree.
    input_length = np.array([frames_n])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    return (video, result)
def predict(weight_path, video):
    """Run the cached LipNet model on ``video``, then show and print the text.

    The model, optimizer, spell checker and decoder live in module-level
    globals and are only built when ``lipnet`` is still ``None``; later calls
    reuse them (so ``weight_path`` is only read on that first call).

    Args:
        weight_path: Path of the weights file given to ``model.load_weights``
            during the one-time setup.
        video: Object exposing ``data`` (the frames fed to the network) and
            ``face`` (passed to ``show_video_subtitle``).

    Returns:
        None. The decoded text is passed to ``show_video_subtitle`` and
        printed to stdout.
    """
    global lipnet
    global adam
    global spell
    global decoder

    # One-time lazy setup. NOTE(review): the original indentation was lost;
    # the whole setup through `decoder` is treated as the body of this `if`
    # because adam/spell/decoder are all declared global above.
    if lipnet is None:
        lipnet = LipNet(img_c=3, img_w=100, img_h=50, frames_n=75,
                        absolute_max_string_len=32, output_size=28)

        adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

        # The 'ctc' loss simply passes y_pred through.
        lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                             optimizer=adam)
        lipnet.model.load_weights(weight_path)

        spell = Spell(path=PREDICT_DICTIONARY)
        decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                          postprocessors=[labels_to_text, spell.sentence])

    # Add a leading batch axis of size 1 and divide pixel values by 255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    show_video_subtitle(video.face, result)
    # Function-call form: same output as the old `print result` statement on
    # Python 2, and valid syntax on Python 3.
    print(result)
def predict_videos(video_data, weight_path, absolute_max_string_len=32, output_size=28):
    """Run LipNet on an in-memory frame array and return the decoded text.

    Args:
        video_data: 4-D array of frames. Its axes are read as
            ``(img_c, frames_n, img_w, img_h)`` when the backend image data
            format is ``'channels_first'`` and as
            ``(frames_n, img_w, img_h, img_c)`` otherwise.
        weight_path: Path of the weights file given to ``model.load_weights``.
        absolute_max_string_len: Forwarded unchanged to ``LipNet``.
        output_size: Forwarded unchanged to ``LipNet``.

    Returns:
        The first element of the decoder output for ``video_data``.
    """
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video_data.shape
    else:
        frames_n, img_w, img_h, img_c = video_data.shape

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # The 'ctc' loss simply passes y_pred through; the model is compiled
    # before the weights are loaded.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    # Add a leading batch axis of size 1 and divide pixel values by 255.
    X_data = np.array([video_data]).astype(np.float32) / 255
    # Use the frame count rather than len(video_data): len() returns the
    # first axis, which under 'channels_first' is img_c (see the unpacking
    # above), not the number of frames. Under 'channels_last' both agree.
    input_length = np.array([frames_n])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    return result