def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    """Load a video, build a LipNet sized to it, and decode the prediction.

    Args:
        weight_path: Weights file handed to ``lipnet.model.load_weights``.
        video_path: If this is a regular file it is read with
            ``Video.from_video``; otherwise it is passed to
            ``Video.from_frames``.
        absolute_max_string_len: Forwarded to the ``LipNet`` constructor.
        output_size: Forwarded to the ``LipNet`` constructor.

    Returns:
        A ``(video, result)`` tuple: the loaded ``Video`` object and the first
        entry returned by ``decoder.decode``.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    # The axis order of video.data follows the Keras image data format.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The compiled loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video.data]).astype(np.float32) / 255
    # Use the unpacked frame count: len(video.data) is the size of the leading
    # axis, which is img_c (not frames_n) under 'channels_first'.
    # NOTE(review): assumes the decoder expects the number of frames here.
    input_length = np.array([frames_n])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    return (video, result)
def predict(weight_path, video):
    """Decode ``video`` with a lazily built, module-level LipNet and display it.

    The network, optimizer, spell checker and decoder live in the module-level
    names ``lipnet``, ``adam``, ``spell`` and ``decoder``. They are created on
    the first call (when ``lipnet`` is ``None``) and reused afterwards, so
    ``lipnet`` must already exist at module level before this is called.

    Args:
        weight_path: Weights file loaded into the model on the first call only.
        video: Object exposing ``data`` (fed to the network) and ``face``
            (passed to ``show_video_subtitle``).

    Returns:
        None. The decoded result is shown via ``show_video_subtitle`` and
        printed.
    """
    global lipnet
    global adam
    global spell
    global decoder

    if lipnet is None:
        lipnet = LipNet(img_c=3, img_w=100, img_h=50, frames_n=75,
                        absolute_max_string_len=32, output_size=28)

        adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        # The compiled loss simply passes y_pred through.
        lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
        lipnet.model.load_weights(weight_path)

        spell = Spell(path=PREDICT_DICTIONARY)
        decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                          postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    show_video_subtitle(video.face, result)
    # Function-call form: valid on Python 3 and prints the same on Python 2.
    print(result)
def train(run_name, speaker, start_epoch, stop_epoch, img_c, img_w, img_h, frames_n, absolute_max_string_len, minibatch_size):
    """Train LipNet for one speaker, starting from previously saved weights.

    Datasets, results and logs are read from / written to the ``datasets``,
    ``results`` and ``logs`` folders under ``CURRENT_PATH/<speaker>``.

    Args:
        run_name: Sub-folder / file-name tag used for weights and logs.
        speaker: Speaker folder name under ``CURRENT_PATH``.
        start_epoch: Epoch to start from. When greater than 0 the weights of
            epoch ``start_epoch - 1`` of this run are loaded; when less than 1
            the file ``weightsa.h5`` is loaded instead.
        stop_epoch: Passed to ``fit_generator`` as ``epochs``.
        img_c, img_w, img_h, frames_n, absolute_max_string_len: Forwarded to
            both ``BasicGenerator`` and ``LipNet``.
        minibatch_size: Generator batch size; also used as the number of
            sentences shown by the ``Visualize`` callback.
    """
    dataset_dir = os.path.join(CURRENT_PATH, speaker, 'datasets')
    output_dir = os.path.join(CURRENT_PATH, speaker, 'results')
    log_dir = os.path.join(CURRENT_PATH, speaker, 'logs')
    run_output_dir = os.path.join(output_dir, run_name)

    # Geometry arguments shared by the generator and the network.
    dims = dict(
        img_c=img_c,
        img_w=img_w,
        img_h=img_h,
        frames_n=frames_n,
        absolute_max_string_len=absolute_max_string_len,
    )

    curriculum = Curriculum(curriculum_rules)
    lip_gen = BasicGenerator(
        dataset_path=dataset_dir,
        minibatch_size=minibatch_size,
        curriculum=curriculum,
        start_epoch=start_epoch,
        **dims
    ).build()

    lipnet = LipNet(output_size=lip_gen.get_output_size(), **dims)
    lipnet.summary()

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # The real loss is computed elsewhere; the compiled loss only forwards y_pred.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)

    # Resume from the previous epoch of this run ...
    if start_epoch > 0:
        previous_weights = os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1))
        weight_file = os.path.join(output_dir, previous_weights)
        lipnet.model.load_weights(weight_file)
    # ... or start from the speaker's 'weightsa.h5' file.
    if start_epoch < 1:
        initial_weights = os.path.join(CURRENT_PATH, speaker, 'results', 'weightsa.h5')
        weight_file = os.path.join(output_dir, initial_weights)
        lipnet.model.load_weights(weight_file)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(
        greedy=PREDICT_GREEDY,
        beam_width=PREDICT_BEAM_WIDTH,
        postprocessors=[labels_to_text, spell.sentence],
    )

    # Callbacks.
    statistics = Statistics(lipnet, lip_gen.next_val(), decoder, 256,
                            output_dir=run_output_dir)
    visualize = Visualize(run_output_dir, lipnet, lip_gen.next_val(), decoder,
                          num_display_sentences=minibatch_size)
    tensorboard = TensorBoard(log_dir=os.path.join(log_dir, run_name))
    csv_path = os.path.join(log_dir, "{}-{}.csv".format('training', run_name))
    csv_logger = CSVLogger(csv_path, separator=',', append=True)
    checkpoint = ModelCheckpoint(
        os.path.join(output_dir, run_name, "weights{epoch:02d}.h5"),
        monitor='val_loss',
        save_weights_only=True,
        mode='auto',
        period=1,
    )
    # The generator itself is also registered as a callback.
    callbacks = [checkpoint, statistics, visualize, lip_gen, tensorboard, csv_logger]

    lipnet.model.fit_generator(
        generator=lip_gen.next_train(),
        steps_per_epoch=lip_gen.default_training_steps,
        epochs=stop_epoch,
        validation_data=lip_gen.next_val(),
        validation_steps=lip_gen.default_validation_steps,
        callbacks=callbacks,
        initial_epoch=start_epoch,
        verbose=1,
        max_q_size=5,
        workers=2,
        pickle_safe=True,
    )
def buildEmbeddingModel(self):
    """Build the embedding model on top of a frozen, pretrained LipNet trunk.

    Loads LipNet weights from ``self.weight_path``, keeps the graph from the
    ``the_input`` layer through ``time_distributed_1``, marks every layer of
    that trunk as non-trainable and appends a new bidirectional GRU. The
    resulting model is stored in ``self.lipAuth_embedding``; nothing is
    returned. Summaries of both the LipNet model and the embedding model are
    printed.
    """
    lipnet = LipNet(img_c=self.img_c, img_w=self.img_w, img_h=self.img_h,
                    frames_n=self.frames_n, absolute_max_string_len=32,
                    output_size=28)
    lipnet.model.load_weights(self.weight_path)
    lipnet.model.summary()

    # Freeze all layers up to the bi-directional layer: the original
    # bidirectional part is dropped and replaced with our own below.
    model = Model(lipnet.model.get_layer('the_input').input,
                  lipnet.model.get_layer('time_distributed_1').output)
    for layer in model.layers:
        layer.trainable = False

    x = model.output
    x = Bidirectional(GRU(128, return_sequences=False,
                          kernel_initializer='Orthogonal', name='gru1'),
                      merge_mode='concat')(x)

    self.lipAuth_embedding = Model(model.input, x)
    self.lipAuth_embedding.summary()
def predict_videos(video_data, weight_path, absolute_max_string_len=32, output_size=28):
    """Decode already-loaded video data with a freshly built LipNet.

    Args:
        video_data: Array whose ``shape`` unpacks to four dimensions, ordered
            according to ``K.image_data_format()``.
        weight_path: Weights file handed to ``lipnet.model.load_weights``.
        absolute_max_string_len: Forwarded to the ``LipNet`` constructor.
        output_size: Forwarded to the ``LipNet`` constructor.

    Returns:
        The first entry returned by ``decoder.decode``.
    """
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video_data.shape
    else:
        frames_n, img_w, img_h, img_c = video_data.shape

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The compiled loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video_data]).astype(np.float32) / 255
    # Use the unpacked frame count: len(video_data) is the size of the leading
    # axis, which is img_c (not frames_n) under 'channels_first'.
    # NOTE(review): assumes the decoder expects the number of frames here.
    input_length = np.array([frames_n])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]
    return result
def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    """Decode a video with a cached model, then round-trip the text via ``cognitive``.

    The prebuilt model is cached on ``MODEL.model``. It is created on the
    first call (when ``MODEL.model`` is falsy) from a LipNet sized to that
    first video, and reused on later calls.

    Args:
        weight_path: Passed to ``Prebuilt_model`` when the cache is empty.
        video_path: If this is a regular file it is read with
            ``Video.from_video``; otherwise it is passed to
            ``Video.from_frames``.
        absolute_max_string_len: Forwarded to ``LipNet`` and ``Prebuilt_model``.
        output_size: Forwarded to ``LipNet`` and ``Prebuilt_model``.

    Returns:
        A ``(video, cog_result)`` tuple: the loaded ``Video`` object and the
        text returned by ``speech_to_text(text_to_speech(...))`` applied to
        the first decoded result.
    """
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)

    # The axis order of video.data follows the Keras image data format.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    if not MODEL.model:
        # Build the network only on a cache miss; constructing it on every
        # call would create a model that is immediately thrown away.
        lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                        absolute_max_string_len=absolute_max_string_len,
                        output_size=output_size)
        MODEL.model = Prebuilt_model(weight_path, video_path, lipnet,
                                     absolute_max_string_len, output_size)

    X_data = np.array([video.data]).astype(np.float32) / 255
    # Use the unpacked frame count: len(video.data) is the size of the leading
    # axis, which is img_c (not frames_n) under 'channels_first'.
    # NOTE(review): assumes the decoder expects the number of frames here.
    input_length = np.array([frames_n])

    y_pred = MODEL.model.lipnet.predict(X_data)
    results = MODEL.model.decoder.decode(y_pred, input_length)
    print("Before cognitive services: " + results[0])

    cog = cognitive()
    cog_result = cog.speech_to_text(cog.text_to_speech(results[0]))
    print("after cognitive services: " + cog_result)

    return (video, cog_result)
def stats(weight_path, dataset_path, img_c, img_w, img_h, frames_n, absolute_max_string_len, minibatch_size):
    """Run the ``Statistics`` callback once against a dataset's validation data.

    Builds a generator for ``dataset_path``, a LipNet of matching geometry
    loaded from ``weight_path``, and then triggers a single
    ``statistics.on_epoch_end(0)``.

    Args:
        weight_path: Weights file handed to ``lipnet.model.load_weights``.
        dataset_path: Dataset location handed to ``BasicGenerator``.
        img_c, img_w, img_h, frames_n, absolute_max_string_len: Forwarded to
            both ``BasicGenerator`` and ``LipNet``.
        minibatch_size: Batch size for the generator.
    """
    # Geometry arguments shared by the generator and the network.
    dims = dict(
        img_c=img_c,
        img_w=img_w,
        img_h=img_h,
        frames_n=frames_n,
        absolute_max_string_len=absolute_max_string_len,
    )

    lip_gen = BasicGenerator(
        dataset_path=dataset_path,
        minibatch_size=minibatch_size,
        **dims
    ).build()

    lipnet = LipNet(output_size=lip_gen.get_output_size(), **dims)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The compiled loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(
        greedy=PREDICT_GREEDY,
        beam_width=PREDICT_BEAM_WIDTH,
        postprocessors=[labels_to_text, spell.sentence],
    )

    # No output directory is given to the callback.
    statistics = Statistics(lipnet, lip_gen.next_val(), decoder, 256, output_dir=None)

    # Invoke the generator's train-begin hook, then a single epoch-end pass.
    lip_gen.on_train_begin()
    statistics.on_epoch_end(0)
# Export script: load pretrained LipNet weights and save the base model in
# TensorFlow.js format.
from lipnet.lipreading.videos import Video
from lipnet.lipreading.visualization import show_video_subtitle
from lipnet.core.decoders import Decoder
from lipnet.lipreading.helpers import labels_to_text
from lipnet.utils.spell import Spell
from lipnet.model2 import LipNet
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import numpy as np
import sys
import os
import tensorflowjs as tfjs

# Machine-specific location of the trained weights.
WEIGHTS_FILE = "C:/Projects/lipnet/evaluation/models/unseen-weights178.h5"
# Target passed to the TensorFlow.js converter.
EXPORT_DIR = "tfjsModelbase"

# Positional arguments are kept exactly as given.
# NOTE(review): presumably img_c, img_w, img_h, frames_n,
# absolute_max_string_len, output_size; confirm against lipnet.model2.LipNet.
lipnet = LipNet(3, 100, 50, 75, 32, 28)
lipnet.model.load_weights(WEIGHTS_FILE)

# Only the `baseModel` attribute is exported, not `lipnet.model`.
tfjs.converters.save_keras_model(lipnet.baseModel, EXPORT_DIR)