Example #1
def audionet_checker():
    #sanity check for the AudioNet model: push one random batch through it and print the output shape
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                     args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                     args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)
    #dummy input of T time steps, N samples per batch, C audio features per step
    T, N, C = 42, args["BATCH_SIZE"], args["AUDIO_FEATURE_SIZE"]
    inputBatch = torch.rand(T, N, C).to(device)
    model.eval()
    with torch.no_grad():
        outputBatch = model(inputBatch)
    print(outputBatch.shape)
    return
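
All of these examples read hyperparameters from a module-level args dictionary and import AudioNet, the datasets and the decoders from the project's own modules. To run a snippet standalone you would need a stub of that dictionary; a minimal sketch follows, where the key names are taken from the lookups above but every value is an illustrative placeholder, not the project's actual configuration:

#illustrative placeholder values only -- the real project defines these in its config module
args = {
    "SEED": 0,
    "BATCH_SIZE": 32,
    "AUDIO_FEATURE_SIZE": 321,
    "TX_NUM_FEATURES": 512,
    "TX_ATTENTION_HEADS": 8,
    "TX_NUM_LAYERS": 6,
    "PE_MAX_LENGTH": 2500,
    "TX_FEEDFORWARD_DIM": 2048,
    "TX_DROPOUT": 0.1,
    "NUM_CLASSES": 40,
}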
Example #2

np.random.seed(args["SEED"])
torch.manual_seed(args["SEED"])
gpuAvailable = torch.cuda.is_available()
device = torch.device("cuda" if gpuAvailable else "cpu")


if args["TRAINED_MODEL_FILE"] is not None:

    print("\nTrained Model File: %s" %(args["TRAINED_MODEL_FILE"]))
    print("\nDemo Directory: %s" %(args["DEMO_DIRECTORY"]))


    #declaring the model and loading the trained weights
    model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"], map_location=device))
    model.to(device)


    #declaring the language model and loading the trained weights (skipped when the LM is disabled)
    if args["USE_LM"]:
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
    else:
        lm = None


    #reading the noise file
    if args["TEST_DEMO_NOISY"]:
        _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
    else:
        noise = None
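
The noiseSNR entry fixes the signal-to-noise ratio, in dB, at which the noise clip is mixed into a clean waveform. A generic sketch of such SNR-based mixing, assuming the noise clip is at least as long as the signal (this is not the repo's exact routine):

import numpy as np

def add_noise(signal, noise, snrDb):
    #scale the noise so that 10*log10(P_signal / P_noise) equals snrDb, then mix it in
    signal = signal.astype(np.float64)
    noise = noise[:len(signal)].astype(np.float64)
    signalPower = np.sum(signal ** 2) / len(signal)
    noisePower = np.sum(noise ** 2) / len(noise)
    gain = np.sqrt(signalPower / (noisePower * (10 ** (snrDb / 10))))
    return signal + gain * noise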
Example #3
def main():

    matplotlib.use("Agg")
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {
        "num_workers": args["NUM_WORKERS"],
        "pin_memory": True
    } if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the pretrain and the preval datasets and the corresponding dataloaders
    audioParams = {
        "stftWindow": args["STFT_WINDOW"],
        "stftWinLen": args["STFT_WIN_LENGTH"],
        "stftOverlap": args["STFT_OVERLAP"]
    }
    noiseParams = {
        "noiseFile": args["DATA_DIRECTORY"] + "/noise.wav",
        "noiseProb": args["NOISE_PROBABILITY"],
        "noiseSNR": args["NOISE_SNR_DB"]
    }
    pretrainData = LRS2Pretrain("pretrain", args["DATA_DIRECTORY"],
                                args["PRETRAIN_NUM_WORDS"],
                                args["CHAR_TO_INDEX"], args["STEP_SIZE"],
                                audioParams, noiseParams)
    pretrainLoader = DataLoader(pretrainData,
                                batch_size=args["BATCH_SIZE"],
                                collate_fn=collate_fn,
                                shuffle=True,
                                **kwargs)
    noiseParams = {
        "noiseFile": args["DATA_DIRECTORY"] + "/noise.wav",
        "noiseProb": 0,
        "noiseSNR": args["NOISE_SNR_DB"]
    }
    prevalData = LRS2Pretrain("preval", args["DATA_DIRECTORY"],
                              args["PRETRAIN_NUM_WORDS"],
                              args["CHAR_TO_INDEX"], args["STEP_SIZE"],
                              audioParams, noiseParams)
    prevalLoader = DataLoader(prevalData,
                              batch_size=args["BATCH_SIZE"],
                              collate_fn=collate_fn,
                              shuffle=True,
                              **kwargs)

    #declaring the model, optimizer, scheduler and the loss function
    model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                     args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                     args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args["INIT_LR"],
                           betas=(args["MOMENTUM1"], args["MOMENTUM2"]))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode="min",
        factor=args["LR_SCHEDULER_FACTOR"],
        patience=args["LR_SCHEDULER_WAIT"],
        threshold=args["LR_SCHEDULER_THRESH"],
        threshold_mode="abs",
        min_lr=args["FINAL_LR"],
        verbose=True)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

    #removing the checkpoints directory if it exists and remaking it
    if os.path.exists(args["CODE_DIRECTORY"] + "/checkpoints"):
        while True:
            ch = input(
                "Continue and remove the 'checkpoints' directory? y/n: ")
            if ch == "y":
                break
            elif ch == "n":
                exit()
            else:
                print("Invalid input")
        shutil.rmtree(args["CODE_DIRECTORY"] + "/checkpoints")

    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/models")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/plots")

    #loading the pretrained weights
    if args["PRETRAINED_MODEL_FILE"] is not None:
        print("\n\nPre-trained Model File: %s" %
              (args["PRETRAINED_MODEL_FILE"]))
        print("\nLoading the pre-trained model .... \n")
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["PRETRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)
        print("Loading Done.\n")

    trainingLossCurve = list()
    validationLossCurve = list()
    trainingWERCurve = list()
    validationWERCurve = list()

    #printing the total and trainable parameters in the model
    numTotalParams, numTrainableParams = num_params(model)
    print("\nNumber of total parameters in the model = %d" % (numTotalParams))
    print("Number of trainable parameters in the model = %d\n" %
          (numTrainableParams))

    print("Number of Words = %d" % (args["PRETRAIN_NUM_WORDS"]))
    print("\nPretraining the model .... \n")

    trainParams = {
        "spaceIx": args["CHAR_TO_INDEX"][" "],
        "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]
    }
    valParams = {
        "decodeScheme": "greedy",
        "spaceIx": args["CHAR_TO_INDEX"][" "],
        "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]
    }

    for step in range(args["NUM_STEPS"]):

        #train the model for one step
        trainingLoss, trainingCER, trainingWER = train(model, pretrainLoader,
                                                       optimizer,
                                                       loss_function, device,
                                                       trainParams)
        trainingLossCurve.append(trainingLoss)
        trainingWERCurve.append(trainingWER)

        #evaluate the model on validation set
        validationLoss, validationCER, validationWER = evaluate(
            model, prevalLoader, loss_function, device, valParams)
        validationLossCurve.append(validationLoss)
        validationWERCurve.append(validationWER)

        #printing the stats after each step
        print(
            "Step: %03d || Tr.Loss: %.6f  Val.Loss: %.6f || Tr.CER: %.3f  Val.CER: %.3f || Tr.WER: %.3f  Val.WER: %.3f"
            % (step, trainingLoss, validationLoss, trainingCER, validationCER,
               trainingWER, validationWER))

        #make a scheduler step
        scheduler.step(validationWER)

        #saving the model weights and loss/metric curves in the checkpoints directory after every few steps
        if ((step % args["SAVE_FREQUENCY"] == 0) or
            (step == args["NUM_STEPS"] - 1)) and (step != 0):

            savePath = (args["CODE_DIRECTORY"] +
                        "/checkpoints/models/pretrain_{:03d}w-step_{:04d}-wer_{:.3f}.pt".format(
                            args["PRETRAIN_NUM_WORDS"], step, validationWER))
            torch.save(model.state_dict(), savePath)

            plt.figure()
            plt.title("Loss Curves")
            plt.xlabel("Step No.")
            plt.ylabel("Loss value")
            plt.plot(range(1, len(trainingLossCurve) + 1),
                     trainingLossCurve, "blue", label="Train")
            plt.plot(range(1, len(validationLossCurve) + 1),
                     validationLossCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] +
                        "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-loss.png".format(
                            args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

            plt.figure()
            plt.title("WER Curves")
            plt.xlabel("Step No.")
            plt.ylabel("WER")
            plt.plot(range(1, len(trainingWERCurve) + 1),
                     trainingWERCurve, "blue", label="Train")
            plt.plot(range(1, len(validationWERCurve) + 1),
                     validationWERCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] +
                        "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-wer.png".format(
                            args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

    print("\nPretraining Done.\n")

    return
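
The nn.CTCLoss used above is declared with blank=0, which reserves class index 0 for the CTC blank; the loss expects (T, N, C) log-probabilities along with per-sequence input and target lengths. A standalone shape check with toy sizes (not the project's values):

import torch
import torch.nn as nn

T, N, C = 100, 4, 40                                      #input length, batch size, character classes
logProbs = torch.randn(T, N, C).log_softmax(2)            #model outputs as log-probabilities
targets = torch.randint(1, C, (N, 20), dtype=torch.long)  #labels start at 1; 0 is the blank
inputLengths = torch.full((N,), T, dtype=torch.long)
targetLengths = torch.full((N,), 20, dtype=torch.long)
loss = nn.CTCLoss(blank=0, zero_infinity=False)(logProbs, targets, inputLengths, targetLengths)
print(loss.item())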
Example #4
    "noiseFile": args["DATA_DIRECTORY"] + "/noise.wav",
    "noiseProb": 0,
    "noiseSNR": args["NOISE_SNR_DB"]
}
prevalData = LRS2Pretrain("preval", args["DATA_DIRECTORY"],
                          args["PRETRAIN_NUM_WORDS"], args["CHAR_TO_INDEX"],
                          args["STEP_SIZE"], audioParams, noiseParams)
prevalLoader = DataLoader(prevalData,
                          batch_size=args["BATCH_SIZE"],
                          collate_fn=collate_fn,
                          shuffle=True,
                          **kwargs)

#declaring the model, optimizer, scheduler and the loss function
model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                 args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                 args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                 args["TX_DROPOUT"], args["NUM_CLASSES"])
model.to(device)
optimizer = optim.Adam(model.parameters(),
                       lr=args["INIT_LR"],
                       betas=(args["MOMENTUM1"], args["MOMENTUM2"]))
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=args["LR_SCHEDULER_FACTOR"],
    patience=args["LR_SCHEDULER_WAIT"],
    threshold=args["LR_SCHEDULER_THRESH"],
    threshold_mode="abs",
    min_lr=args["FINAL_LR"],
    verbose=True)
loss_function = nn.CTCLoss(blank=0, zero_infinity=False)
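
The scheduler must be fed the validation metric through scheduler.step(metric): the learning rate is multiplied by LR_SCHEDULER_FACTOR once the metric has failed to improve by at least LR_SCHEDULER_THRESH (in absolute terms) for LR_SCHEDULER_WAIT consecutive steps, and it never drops below FINAL_LR. A toy demonstration with hypothetical numbers in place of the config values:

import torch
import torch.optim as optim

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2,
    threshold=0.01, threshold_mode="abs", min_lr=1e-6)
for wer in [0.9, 0.9, 0.9, 0.9, 0.9]:         #the metric stops improving immediately
    scheduler.step(wer)
    print(optimizer.param_groups[0]["lr"])    #the LR halves once patience is exhausted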
Example #5
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                         args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["AUDIO_FEATURE_SIZE"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                         args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the language model and loading the trained weights (skipped when the LM is disabled)
        if args["USE_LM"]:
            lm = LRS2CharLM()
            lm.load_state_dict(
                torch.load(args["TRAINED_LM_FILE"], map_location=device))
            lm.to(device)
        else:
            lm = None

        #reading the noise file
        if args["TEST_DEMO_NOISY"]:
            _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
        else:
            noise = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):
                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    preprocess_sample(sampleFile)

                    #converting the data sample into appropriate tensors for input to the model
                    audioFile = sampleFile + ".wav"
                    audioParams = {
                        "stftWindow": args["STFT_WINDOW"],
                        "stftWinLen": args["STFT_WIN_LENGTH"],
                        "stftOverlap": args["STFT_OVERLAP"]
                    }
                    inp, _, inpLen, _ = prepare_main_input(
                        audioFile, None, noise, args["MAIN_REQ_INPUT_LENGTH"],
                        args["CHAR_TO_INDEX"], args["NOISE_SNR_DB"],
                        audioParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([
                        (inp, None, inpLen, None)
                    ])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch,
                            args["CHAR_TO_INDEX"]["<EOS>"])

                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {
                            "beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]
                        }
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "],
                            args["CHAR_TO_INDEX"]["<EOS>"], lm)

                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters, dropping the trailing <EOS> token
                    pred = predictionBatch[:-1]
                    pred = "".join(
                        [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Prediction: %s" % (pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
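
ctc_greedy_decode and ctc_search_decode come from the project's decoder module. The greedy (best-path) variant names a standard algorithm: take the argmax class per frame, merge consecutive repeats, then drop blanks. A generic sketch of that algorithm (the repo's version additionally handles the <EOS> index and returns prediction lengths):

import torch

def greedy_ctc_decode(outputBatch, blank=0):
    #best-path CTC decoding for a (T, N, C) log-probability tensor
    preds = outputBatch.argmax(dim=2).t()     #(N, T) best class per frame
    decoded = []
    for seq in preds:
        out, prev = [], blank
        for ix in seq.tolist():
            if ix != prev and ix != blank:    #merge repeats, skip blanks
                out.append(ix)
            prev = ix
        decoded.append(out)
    return decoded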
Example #6
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {
        "num_workers": args["NUM_WORKERS"],
        "pin_memory": True
    } if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the test dataset and test dataloader
    audioParams = {
        "stftWindow": args["STFT_WINDOW"],
        "stftWinLen": args["STFT_WIN_LENGTH"],
        "stftOverlap": args["STFT_OVERLAP"]
    }
    if args["TEST_DEMO_NOISY"]:
        noiseParams = {
            "noiseFile": args["DATA_DIRECTORY"] + "/noise.wav",
            "noiseProb": 1,
            "noiseSNR": args["NOISE_SNR_DB"]
        }
    else:
        noiseParams = {
            "noiseFile": args["DATA_DIRECTORY"] + "/noise.wav",
            "noiseProb": 0,
            "noiseSNR": args["NOISE_SNR_DB"]
        }
    testData = LRS2Main("test", args["DATA_DIRECTORY"],
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        args["STEP_SIZE"], audioParams, noiseParams)
    testLoader = DataLoader(testData,
                            batch_size=args["BATCH_SIZE"],
                            collate_fn=collate_fn,
                            shuffle=True,
                            **kwargs)

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))

        #declaring the model, loss function and loading the trained model weights
        model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                         args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["AUDIO_FEATURE_SIZE"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                         args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)
        loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

        #declaring the language model and loading the trained weights (skipped when the LM is disabled)
        if args["USE_LM"]:
            lm = LRS2CharLM()
            lm.load_state_dict(
                torch.load(args["TRAINED_LM_FILE"], map_location=device))
            lm.to(device)
        else:
            lm = None

        print("\nTesting the trained model .... \n")

        beamSearchParams = {
            "beamWidth": args["BEAM_WIDTH"],
            "alpha": args["LM_WEIGHT_ALPHA"],
            "beta": args["LENGTH_PENALTY_BETA"],
            "threshProb": args["THRESH_PROBABILITY"]
        }
        testParams = {
            "decodeScheme": args["TEST_DEMO_DECODING"],
            "beamSearchParams": beamSearchParams,
            "spaceIx": args["CHAR_TO_INDEX"][" "],
            "eosIx": args["CHAR_TO_INDEX"]["<EOS>"],
            "lm": lm
        }

        #evaluating the model over the test set
        testLoss, testCER, testWER = evaluate(model, testLoader, loss_function,
                                              device, testParams)

        #printing the test set loss, CER and WER
        print("Test Loss: %.6f || Test CER: %.3f || Test WER: %.3f" %
              (testLoss, testCER, testWER))
        print("\nTesting Done.\n")

    else:
        print("Path to the trained model file not specified.\n")

    return
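
The CER and WER that evaluate reports are Levenshtein edit distances normalized by the reference length, computed over characters and over words respectively. A generic sketch of the word-level metric (not the repo's metric code):

def wer(reference, hypothesis):
    #word error rate: edit distance between word sequences over the reference word count
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

print(wer("the cat sat", "the cat sat down"))   #one insertion over three reference words: 0.333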