def audionet_checker():

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #declaring the model and moving it to the device
    model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"],
                     args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                     args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)

    #a random input batch of shape (T, N, C) = (timesteps, batch size, feature size)
    T, N, C = 42, args["BATCH_SIZE"], args["AUDIO_FEATURE_SIZE"]
    inputBatch = torch.rand(T, N, C).to(device)

    #running a forward pass and printing the output shape as a sanity check
    model.eval()
    with torch.no_grad():
        outputBatch = model(inputBatch)
    print(outputBatch.shape)
    return
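#For convenience, the checker can be wired up as a script entry point.
#A minimal sketch, assuming args is imported from the repo's config module
#as in the other scripts in this section:
if __name__ == "__main__":
    audionet_checker()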
def main():

    matplotlib.use("Agg")
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the pretrain and the preval datasets and the corresponding dataloaders
    audioParams = {"stftWindow": args["STFT_WINDOW"], "stftWinLen": args["STFT_WIN_LENGTH"],
                   "stftOverlap": args["STFT_OVERLAP"]}
    noiseParams = {"noiseFile": args["DATA_DIRECTORY"] + "/noise.wav", "noiseProb": args["NOISE_PROBABILITY"],
                   "noiseSNR": args["NOISE_SNR_DB"]}
    pretrainData = LRS2Pretrain("pretrain", args["DATA_DIRECTORY"], args["PRETRAIN_NUM_WORDS"],
                                args["CHAR_TO_INDEX"], args["STEP_SIZE"], audioParams, noiseParams)
    pretrainLoader = DataLoader(pretrainData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn,
                                shuffle=True, **kwargs)
    noiseParams = {"noiseFile": args["DATA_DIRECTORY"] + "/noise.wav", "noiseProb": 0,
                   "noiseSNR": args["NOISE_SNR_DB"]}
    prevalData = LRS2Pretrain("preval", args["DATA_DIRECTORY"], args["PRETRAIN_NUM_WORDS"],
                              args["CHAR_TO_INDEX"], args["STEP_SIZE"], audioParams, noiseParams)
    prevalLoader = DataLoader(prevalData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn,
                              shuffle=True, **kwargs)

    #declaring the model, optimizer, scheduler and the loss function
    model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"],
                     args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                     args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args["INIT_LR"],
                           betas=(args["MOMENTUM1"], args["MOMENTUM2"]))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=args["LR_SCHEDULER_FACTOR"],
                                                     patience=args["LR_SCHEDULER_WAIT"],
                                                     threshold=args["LR_SCHEDULER_THRESH"], threshold_mode="abs",
                                                     min_lr=args["FINAL_LR"], verbose=True)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

    #removing the checkpoints directory if it exists and remaking it
    if os.path.exists(args["CODE_DIRECTORY"] + "/checkpoints"):
        while True:
            ch = input("Continue and remove the 'checkpoints' directory? y/n: ")
            if ch == "y":
                break
            elif ch == "n":
                exit()
            else:
                print("Invalid input")
        shutil.rmtree(args["CODE_DIRECTORY"] + "/checkpoints")

    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/models")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/plots")

    #loading the pretrained weights
    if args["PRETRAINED_MODEL_FILE"] is not None:
        print("\n\nPre-trained Model File: %s" % (args["PRETRAINED_MODEL_FILE"]))
        print("\nLoading the pre-trained model .... \n")
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["PRETRAINED_MODEL_FILE"],
                                         map_location=device))
        model.to(device)
        print("Loading Done.\n")

    trainingLossCurve = list()
    validationLossCurve = list()
    trainingWERCurve = list()
    validationWERCurve = list()

    #printing the total and trainable parameters in the model
    numTotalParams, numTrainableParams = num_params(model)
    print("\nNumber of total parameters in the model = %d" % (numTotalParams))
    print("Number of trainable parameters in the model = %d\n" % (numTrainableParams))

    print("Number of Words = %d" % (args["PRETRAIN_NUM_WORDS"]))
    print("\nPretraining the model .... \n")

    trainParams = {"spaceIx": args["CHAR_TO_INDEX"][" "], "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]}
    valParams = {"decodeScheme": "greedy", "spaceIx": args["CHAR_TO_INDEX"][" "],
                 "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]}

    for step in range(args["NUM_STEPS"]):

        #train the model for one step
        trainingLoss, trainingCER, trainingWER = train(model, pretrainLoader, optimizer, loss_function,
                                                       device, trainParams)
        trainingLossCurve.append(trainingLoss)
        trainingWERCurve.append(trainingWER)

        #evaluate the model on the validation set
        validationLoss, validationCER, validationWER = evaluate(model, prevalLoader, loss_function,
                                                                device, valParams)
        validationLossCurve.append(validationLoss)
        validationWERCurve.append(validationWER)

        #printing the stats after each step
        print("Step: %03d || Tr.Loss: %.6f Val.Loss: %.6f || Tr.CER: %.3f Val.CER: %.3f || Tr.WER: %.3f Val.WER: %.3f"
              % (step, trainingLoss, validationLoss, trainingCER, validationCER, trainingWER, validationWER))

        #make a scheduler step
        scheduler.step(validationWER)

        #saving the model weights and loss/metric curves in the checkpoints directory after every few steps
        if ((step % args["SAVE_FREQUENCY"] == 0) or (step == args["NUM_STEPS"] - 1)) and (step != 0):

            savePath = args["CODE_DIRECTORY"] + "/checkpoints/models/pretrain_{:03d}w-step_{:04d}-wer_{:.3f}.pt".format(
                args["PRETRAIN_NUM_WORDS"], step, validationWER)
            torch.save(model.state_dict(), savePath)

            plt.figure()
            plt.title("Loss Curves")
            plt.xlabel("Step No.")
            plt.ylabel("Loss value")
            plt.plot(list(range(1, len(trainingLossCurve) + 1)), trainingLossCurve, "blue", label="Train")
            plt.plot(list(range(1, len(validationLossCurve) + 1)), validationLossCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] + "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-loss.png".format(
                args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

            plt.figure()
            plt.title("WER Curves")
            plt.xlabel("Step No.")
            plt.ylabel("WER")
            plt.plot(list(range(1, len(trainingWERCurve) + 1)), trainingWERCurve, "blue", label="Train")
            plt.plot(list(range(1, len(validationWERCurve) + 1)), validationWERCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] + "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-wer.png".format(
                args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

    print("\nPretraining Done.\n")
    return
"noiseFile": args["DATA_DIRECTORY"] + "/noise.wav", "noiseProb": 0, "noiseSNR": args["NOISE_SNR_DB"] } prevalData = LRS2Pretrain("preval", args["DATA_DIRECTORY"], args["PRETRAIN_NUM_WORDS"], args["CHAR_TO_INDEX"], args["STEP_SIZE"], audioParams, noiseParams) prevalLoader = DataLoader(prevalData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs) #declaring the model, optimizer, scheduler and the loss function model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"]) model.to(device) optimizer = optim.Adam(model.parameters(), lr=args["INIT_LR"], betas=(args["MOMENTUM1"], args["MOMENTUM2"])) scheduler = optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode="min", factor=args["LR_SCHEDULER_FACTOR"], patience=args["LR_SCHEDULER_WAIT"], threshold=args["LR_SCHEDULER_THRESH"], threshold_mode="abs", min_lr=args["FINAL_LR"], verbose=True) loss_function = nn.CTCLoss(blank=0, zero_infinity=False)
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"],
                         args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                         args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                                         map_location=device))
        model.to(device)

        #declaring the language model and loading the trained weights
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        #reading the noise file
        if args["TEST_DEMO_NOISY"]:
            _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
        else:
            noise = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):

                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    preprocess_sample(sampleFile)

                    #converting the data sample into appropriate tensors for input to the model
                    audioFile = os.path.join(root, file[:-4]) + ".wav"
                    audioParams = {"stftWindow": args["STFT_WINDOW"], "stftWinLen": args["STFT_WIN_LENGTH"],
                                   "stftOverlap": args["STFT_OVERLAP"]}
                    inp, _, inpLen, _ = prepare_main_input(audioFile, None, noise, args["MAIN_REQ_INPUT_LENGTH"],
                                                           args["CHAR_TO_INDEX"], args["NOISE_SNR_DB"],
                                                           audioParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(outputBatch, inputLenBatch,
                                                                                args["CHAR_TO_INDEX"]["<EOS>"])
                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"], "alpha": args["LM_WEIGHT_ALPHA"],
                                            "beta": args["LENGTH_PENALTY_BETA"],
                                            "threshProb": args["THRESH_PROBABILITY"]}
                        predictionBatch, predictionLenBatch = ctc_search_decode(outputBatch, inputLenBatch,
                                                                                beamSearchParams,
                                                                                args["CHAR_TO_INDEX"][" "],
                                                                                args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters, dropping the trailing <EOS> token
                    pred = predictionBatch[:-1]
                    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Prediction: %s" % (pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
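#ctc_greedy_decode above is imported from elsewhere in the repo and is not shown in
#this section. For intuition, a minimal single-sample greedy CTC decoder sketch
#(hypothetical helper, assuming blank index 0) would look like:
import torch

def greedy_ctc_decode_single(logProbs, blank=0):
    #logProbs: (T, C) log-probabilities for one sample
    best = torch.argmax(logProbs, dim=1)        #most likely class per timestep
    collapsed = torch.unique_consecutive(best)  #merge consecutive repeats
    return collapsed[collapsed != blank]        #drop blanks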
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the test dataset and test dataloader
    audioParams = {"stftWindow": args["STFT_WINDOW"], "stftWinLen": args["STFT_WIN_LENGTH"],
                   "stftOverlap": args["STFT_OVERLAP"]}
    if args["TEST_DEMO_NOISY"]:
        noiseParams = {"noiseFile": args["DATA_DIRECTORY"] + "/noise.wav", "noiseProb": 1,
                       "noiseSNR": args["NOISE_SNR_DB"]}
    else:
        noiseParams = {"noiseFile": args["DATA_DIRECTORY"] + "/noise.wav", "noiseProb": 0,
                       "noiseSNR": args["NOISE_SNR_DB"]}
    testData = LRS2Main("test", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"],
                        args["CHAR_TO_INDEX"], args["STEP_SIZE"], audioParams, noiseParams)
    testLoader = DataLoader(testData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn,
                            shuffle=True, **kwargs)

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))

        #declaring the model, loss function and loading the trained model weights
        model = AudioNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"],
                         args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                         args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                                         map_location=device))
        model.to(device)
        loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\nTesting the trained model .... \n")

        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"], "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"], "threshProb": args["THRESH_PROBABILITY"]}
        testParams = {"decodeScheme": args["TEST_DEMO_DECODING"], "beamSearchParams": beamSearchParams,
                      "spaceIx": args["CHAR_TO_INDEX"][" "], "eosIx": args["CHAR_TO_INDEX"]["<EOS>"], "lm": lm}

        #evaluating the model over the test set
        testLoss, testCER, testWER = evaluate(model, testLoader, loss_function, device, testParams)

        #printing the test set loss, CER and WER
        print("Test Loss: %.6f || Test CER: %.3f || Test WER: %.3f" % (testLoss, testCER, testWER))
        print("\nTesting Done.\n")

    else:
        print("Path to the trained model file not specified.\n")

    return
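#The noiseProb/noiseSNR settings above control whether noise from noise.wav is mixed
#into the clean audio at a target SNR. The actual mixing lives in the repo's data
#utilities and is not shown here; a minimal sketch of additive mixing at a given SNR
#in dB (hypothetical helper) is:
import numpy as np

def add_noise_at_snr(clean, noise, snrDb):
    noise = noise[:len(clean)].astype(np.float64)  #truncate noise to the signal length
    signalPower = np.mean(clean.astype(np.float64) ** 2)
    noisePower = np.mean(noise ** 2)
    #scale so that 10*log10(signalPower / scaledNoisePower) == snrDb
    scale = np.sqrt(signalPower / (noisePower * (10 ** (snrDb / 10))))
    return clean + scale * noise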