Example #1
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    # `device` is used throughout but was never defined; derive it from the flag.
    device = torch.device(TRAINING_DEVICE if torch.cuda.is_available() else 'cpu')
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
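    # Standard ImageNet preprocessing: resize to the 224x224 VGG input size
    # and normalize with the ImageNet channel means / standard deviations.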
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_dir = FLAGS.train_dir  #'D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/'
    train_corpus = FLAGS.train_corpus  #'D:/Machine_Learning/datasets/video_corpus/video_corpus.csv'

    print("train_dir is =", train_dir)
    print("train_corpus =", train_corpus)

    utils = Utils()
    all_text = utils.output_text(train_corpus)  # all captions from the corpus
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)  # word -> index mapping

    ### training data preparation
    train_ds = CustomDataset(train_dir,
                             train_corpus,
                             device,
                             dictionary,
                             VOCAB_SIZE,
                             NUMBER_OF_WORDS,
                             INPUT_SIZE,
                             NUMBER_OF_FRAMES,
                             tsfm,
                             model=md.model_vgg)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model = Seq2Seq(encoder, decoder).to(device)

    ### Load pretrained weights if a checkpoint path was given.
    if FLAGS.load_weights:
        print("Loading pretrained weights from", FLAGS.load_weights)

        model.load_state_dict(torch.load(FLAGS.load_weights))

    ### optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    #### Model training

    import time  # local import retained from the original; used for timing
    print_freq = 1      # print statistics every `print_freq` steps
    best_loss = np.inf  # lowest epoch loss seen so far
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0

        for step, (img, label) in enumerate(train_dl):

            time_1 = time.time()  ## timing

            X_1, X_2 = img  # X_1: frame features, X_2: decoder input sequence

            X_1 = X_1.to(device)  # Set device
            X_2 = X_2.to(device)  # Set device

            label = label.to(device)  # Set output device

            ### zero the parameter gradients
            optimizer.zero_grad()

            ### forward
            prediction = model(X_1, X_2)

            ### Prepare prediction and label for the loss computation
            prediction = torch.squeeze(prediction, 0)
            label = torch.squeeze(label, 0)

            # CrossEntropyLoss expects class indices, so convert the one-hot
            # labels to indices (equivalent to the original per-row argmax loop).
            new_label = label.argmax(dim=1).to(device)
            loss = criterion(prediction, new_label.long())

            # Backward prop.
            loss.backward()
            optimizer.step()

            ### print statistics
            epoch_loss += loss.item()
            if step % print_freq == 0:
                print('epoch:', epoch, '\tstep:', step + 1, '/',
                      len(train_dl), '\ttrain loss:',
                      '{:.4f}'.format(loss.item()), '\ttime:',
                      '{:.4f}'.format((time.time() - time_1) * print_freq), 's')

        ### save the model whenever the epoch loss improves
        if epoch_loss < best_loss:
            best_loss = epoch_loss

            model_name = (f'MODEL_SEQ2SEQ_VOCAB_SIZE_{VOCAB_SIZE}'
                          f'_NUMBER_OF_WORDS_{NUMBER_OF_WORDS}'
                          f'_HIDDEN_SIZE_{HIDDEN_SIZE}'
                          f'_INPUT_SIZE_{INPUT_SIZE}'
                          f'_NUMBER_OF_LAYERS_{NUMBER_OF_LAYERS}')
            torch.save(model.state_dict(), model_name + '.pth')

        print("The loss for this epoch is = :", epoch_loss / len(train_dl))
Example #2
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    # As in example 1, define the torch device the rest of the code relies on.
    device = torch.device(TRAINING_DEVICE if torch.cuda.is_available() else 'cpu')
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_corpus = FLAGS.train_corpus
    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model = Seq2Seq(encoder, decoder).to(device)

    ### Load the trained weights (required for inference).
    model.load_state_dict(torch.load(FLAGS.load_weights))

    #### Model testing
    model.eval()

    video_path = FLAGS.video_file

    video_pre_data = utils.video_to_frames(video_path,
                                           frame_number=NUMBER_OF_FRAMES,
                                           device=device,
                                           INPUT_SIZE=INPUT_SIZE,
                                           model=md.model_vgg,
                                           transform=tsfm)
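    # `video_pre_data` is presumably a (NUMBER_OF_FRAMES, INPUT_SIZE) tensor of
    # VGG frame features; the batching below assumes exactly that shape.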

    # Seed the decoder input: one-hot rows over the vocabulary. Index 2 is
    # assumed to be the start-of-sentence token and index 1 a padding /
    # placeholder token, mirroring the training-time encoding.
    X_2 = torch.zeros([NUMBER_OF_WORDS, VOCAB_SIZE])
    X_2[0][2] = 1   # first step: start-of-sentence
    X_2[1:, 1] = 1  # remaining steps: placeholder until predicted
    input_data = video_pre_data.unsqueeze(0)  # add a batch dimension

    final_sentence = []

    X_2 = X_2.unsqueeze(0)  # add a batch dimension
    X_2 = X_2.to(device)
    input_data = input_data.to(device)

    # Greedy decoding: after each forward pass, take the argmax at step i,
    # record the word, and feed its one-hot vector back in as the next input.
    for i in range(NUMBER_OF_WORDS - 1):
        with torch.no_grad():
            predicted = model(input_data, X_2)
            predicted = predicted.squeeze(0)

            idx = torch.argmax(predicted[i]).item()
            # Reverse lookup: find the word whose index matches the prediction.
            final_sentence.append(
                next((word for word, index in dictionary.items()
                      if index == idx), None))
            X_2[0][i + 1][idx] = 1  # one-hot input for the next decoder step
            X_2[0][i + 1][1] = 0    # clear the placeholder bit
    print(final_sentence)
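The reverse lookup in the decoding loop scans the whole vocabulary for every generated word. Since `dictionary` maps words to indices, inverting it once makes each lookup constant-time; a small sketch of that alternative:

# Build the inverse vocabulary once, then decode by direct lookup.
index_to_word = {index: word for word, index in dictionary.items()}

idx = torch.argmax(predicted[i]).item()
final_sentence.append(index_to_word.get(idx))  # None if the index is unknown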