Exemplo n.º 1
0
def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    print ("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print ("Data loaded.\n")

    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape


    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len, output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    X_data       = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred         = lipnet.predict(X_data)
    result         = decoder.decode(y_pred, input_length)[0]

    return (video, result)
Exemplo n.º 2
0
def predict(weight_path, video):
    global lipnet
    global adam
    global spell
    global decoder

    if lipnet is None:
        lipnet = LipNet(img_c=3,
                        img_w=100,
                        img_h=50,
                        frames_n=75,
                        absolute_max_string_len=32,
                        output_size=28)

        adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

        lipnet.model.compile(loss={
            'ctc': lambda y_true, y_pred: y_pred
        },
                             optimizer=adam)
        lipnet.model.load_weights(weight_path)

        spell = Spell(path=PREDICT_DICTIONARY)
        decoder = Decoder(greedy=PREDICT_GREEDY,
                          beam_width=PREDICT_BEAM_WIDTH,
                          postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    show_video_subtitle(video.face, result)
    print result
Exemplo n.º 3
0
def train(run_name, speaker, start_epoch, stop_epoch, img_c, img_w, img_h, frames_n, absolute_max_string_len, minibatch_size):
    DATASET_DIR = os.path.join(CURRENT_PATH, speaker, 'datasets')
    OUTPUT_DIR = os.path.join(CURRENT_PATH, speaker, 'results')
    LOG_DIR = os.path.join(CURRENT_PATH, speaker, 'logs')

    curriculum = Curriculum(curriculum_rules)
    lip_gen = BasicGenerator(dataset_path=DATASET_DIR,
                                minibatch_size=minibatch_size,
                                img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                                absolute_max_string_len=absolute_max_string_len,
                                curriculum=curriculum, start_epoch=start_epoch).build()

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                            absolute_max_string_len=absolute_max_string_len, output_size=lip_gen.get_output_size())
    lipnet.summary()

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)

    # load weight if necessary
    if start_epoch > 0:
        weight_file = os.path.join(OUTPUT_DIR, os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1)))
        lipnet.model.load_weights(weight_file)

    if start_epoch < 1:
        weight_file = os.path.join(OUTPUT_DIR, os.path.join(CURRENT_PATH,speaker,'results', 'weightsa.h5'))
        lipnet.model.load_weights(weight_file)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    # define callbacks
    statistics  = Statistics(lipnet, lip_gen.next_val(), decoder, 256, output_dir=os.path.join(OUTPUT_DIR, run_name))
    visualize   = Visualize(os.path.join(OUTPUT_DIR, run_name), lipnet, lip_gen.next_val(), decoder, num_display_sentences=minibatch_size)
    tensorboard = TensorBoard(log_dir=os.path.join(LOG_DIR, run_name))
    csv_logger  = CSVLogger(os.path.join(LOG_DIR, "{}-{}.csv".format('training',run_name)), separator=',', append=True)
    checkpoint  = ModelCheckpoint(os.path.join(OUTPUT_DIR, run_name, "weights{epoch:02d}.h5"), monitor='val_loss', save_weights_only=True, mode='auto', period=1)

    lipnet.model.fit_generator(generator=lip_gen.next_train(),
                        steps_per_epoch=lip_gen.default_training_steps, epochs=stop_epoch,
                        validation_data=lip_gen.next_val(), validation_steps=lip_gen.default_validation_steps,
                        callbacks=[checkpoint, statistics, visualize, lip_gen, tensorboard, csv_logger],
                        initial_epoch=start_epoch,
                        verbose=1,
                        max_q_size=5,
                        workers=2,
                        pickle_safe=True)
Exemplo n.º 4
0
    def buildEmbeddingModel(self):
        lipnet = LipNet(img_c= self.img_c, img_w= self.img_w, img_h= self.img_h,\
                        frames_n= self.frames_n, absolute_max_string_len=32, \
                        output_size=28)

        lipnet.model.load_weights(self.weight_path)

        lipnet.model.summary()

        # we want to freeze all layers up to the bi-directional layer
        # removing the bidirectional layer and adding our own
        model = Model(lipnet.model.get_layer('the_input').input, \
                      lipnet.model.get_layer('time_distributed_1').output)

        counter = 0
        for layer in model.layers:
            layer.trainable = False
            counter += 1

        x = model.output
        x = Bidirectional(GRU(128, return_sequences=False, \
                    kernel_initializer='Orthogonal', name='gru1'), merge_mode='concat')(x)

        self.lipAuth_embedding = Model(model.input, x)
        self.lipAuth_embedding.summary()
Exemplo n.º 5
0
def predict_videos(video_data,
                   weight_path,
                   absolute_max_string_len=32,
                   output_size=28):
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video_data.shape
    else:
        frames_n, img_w, img_h, img_c = video_data.shape

    lipnet = LipNet(img_c=img_c,
                    img_w=img_w,
                    img_h=img_h,
                    frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    lipnet.model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY,
                      beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video_data]).astype(np.float32) / 255
    input_length = np.array([len(video_data)])

    y_pred = lipnet.predict(X_data)

    #print(y_pred[0,0])
    #print(y_pred[0,40])

    result = decoder.decode(y_pred, input_length)[0]

    return result
Exemplo n.º 6
0
def predict(weight_path,
            video_path,
            absolute_max_string_len=32,
            output_size=28):
    #print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    #print("Data loaded.\n")

    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    lipnet = LipNet(img_c=img_c,
                    img_w=img_w,
                    img_h=img_h,
                    frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    if not MODEL.model:
        #lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
        #            absolute_max_string_len=absolute_max_string_len, output_size=output_size)
        #adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        #lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
        #lipnet.model.load_weights(weight_path)

        #print("Built Model.")
        #spell = Spell(path=PREDICT_DICTIONARY)
        #decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
        #              postprocessors=[labels_to_text])#, spell.sentence])
        MODEL.model = Prebuilt_model(weight_path, video_path, lipnet,
                                     absolute_max_string_len, output_size)

    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = MODEL.model.lipnet.predict(X_data)
    results = MODEL.model.decoder.decode(y_pred, input_length)
    print("Before cognitive services: " + results[0])
    cog = cognitive()
    cog_result = cog.speech_to_text(cog.text_to_speech(results[0]))
    print("after cognitive services: " + cog_result)

    return (video, cog_result)
Exemplo n.º 7
0
def stats(weight_path, dataset_path, img_c, img_w, img_h, frames_n,
          absolute_max_string_len, minibatch_size):
    lip_gen = BasicGenerator(
        dataset_path=dataset_path,
        minibatch_size=minibatch_size,
        img_c=img_c,
        img_w=img_w,
        img_h=img_h,
        frames_n=frames_n,
        absolute_max_string_len=absolute_max_string_len).build()

    lipnet = LipNet(img_c=img_c,
                    img_w=img_w,
                    img_h=img_h,
                    frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=lip_gen.get_output_size())

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    lipnet.model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY,
                      beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    statistics = Statistics(lipnet,
                            lip_gen.next_val(),
                            decoder,
                            256,
                            output_dir=None)

    lip_gen.on_train_begin()
    statistics.on_epoch_end(0)
Exemplo n.º 8
0
from lipnet.lipreading.videos import Video
from lipnet.lipreading.visualization import show_video_subtitle
from lipnet.core.decoders import Decoder
from lipnet.lipreading.helpers import labels_to_text
from lipnet.utils.spell import Spell
from lipnet.model2 import LipNet
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import numpy as np
import sys
import os
import tensorflowjs as tfjs
lipnet = LipNet(3, 100, 50, 75, 32, 28)
lipnet.model.load_weights(
    "C:/Projects/lipnet/evaluation/models/unseen-weights178.h5")
# lipnet.model.summary()
tfjs.converters.save_keras_model(lipnet.baseModel, "tfjsModelbase")