def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the test dataset and test dataloader
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    testData = LRS2Main("test", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        args["STEP_SIZE"], videoParams)
    testLoader = DataLoader(testData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" %(args["TRAINED_MODEL_FILE"]))

        #declaring the model, loss function and loading the trained model weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"], map_location=device))
        model.to(device)
        loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\nTesting the trained model .... \n")

        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"], "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"], "threshProb": args["THRESH_PROBABILITY"]}
        testParams = {"decodeScheme": args["TEST_DEMO_DECODING"], "beamSearchParams": beamSearchParams,
                      "spaceIx": args["CHAR_TO_INDEX"][" "], "eosIx": args["CHAR_TO_INDEX"]["<EOS>"], "lm": lm}

        #evaluating the model over the test set
        testLoss, testCER, testWER = evaluate(model, testLoader, loss_function, device, testParams)

        #printing the test set loss, CER and WER
        print("Test Loss: %.6f || Test CER: %.3f || Test WER: %.3f" %(testLoss, testCER, testWER))
        print("\nTesting Done.\n")

    else:
        print("Path to the trained model file not specified.\n")

    return
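#For reference, a minimal self-contained sketch of how character/word error rates of the kind
#reported above can be computed from a Levenshtein edit distance. This is an illustration only,
#not the evaluate() implementation used in this repo; the helper names below are hypothetical.
def edit_distance(ref, hyp):
    #dynamic-programming edit distance between two sequences
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[len(ref)][len(hyp)]

def cer_sketch(refText, hypText):
    #CER = character-level edit distance normalized by reference length
    return edit_distance(list(refText), list(hypText)) / max(len(refText), 1)

def wer_sketch(refText, hypText):
    #WER = word-level edit distance normalized by reference word count
    refWords, hypWords = refText.split(), hypText.split()
    return edit_distance(refWords, hypWords) / max(len(refWords), 1)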
def videonet_checker():

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)

    #sanity-checking the forward pass on a random input batch of shape (T, N, C)
    T, N, C = 10, args["BATCH_SIZE"], args["TX_NUM_FEATURES"]
    inputBatch = torch.rand(T, N, C).to(device)
    model.eval()
    with torch.no_grad():
        outputBatch = model(inputBatch)
    print(outputBatch.shape)
    return
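#Example invocation of the checker above (a sketch; assumes args is already populated).
#For a transformer CTC model like VideoNet, the printed shape is expected to be
#(T, N, NUM_CLASSES), i.e. per-frame scores over the output character set.
if __name__ == "__main__":
    videonet_checker()   #e.g. prints torch.Size([10, <BATCH_SIZE>, <NUM_CLASSES>])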
def main():

    matplotlib.use("Agg")
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the pretrain and the preval datasets and the corresponding dataloaders
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    pretrainData = LRS2Pretrain("pretrain", args["DATA_DIRECTORY"], args["PRETRAIN_NUM_WORDS"], args["CHAR_TO_INDEX"],
                                args["STEP_SIZE"], videoParams)
    pretrainLoader = DataLoader(pretrainData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)
    prevalData = LRS2Pretrain("preval", args["DATA_DIRECTORY"], args["PRETRAIN_NUM_WORDS"], args["CHAR_TO_INDEX"],
                              args["STEP_SIZE"], videoParams)
    prevalLoader = DataLoader(prevalData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)

    #declaring the model, optimizer, scheduler and the loss function
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args["INIT_LR"], betas=(args["MOMENTUM1"], args["MOMENTUM2"]))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=args["LR_SCHEDULER_FACTOR"],
                                                     patience=args["LR_SCHEDULER_WAIT"], threshold=args["LR_SCHEDULER_THRESH"],
                                                     threshold_mode="abs", min_lr=args["FINAL_LR"], verbose=True)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

    #removing the checkpoints directory if it exists and remaking it
    if os.path.exists(args["CODE_DIRECTORY"] + "/checkpoints"):
        while True:
            ch = input("Continue and remove the 'checkpoints' directory? y/n: ")
            if ch == "y":
                break
            elif ch == "n":
                exit()
            else:
                print("Invalid input")
        shutil.rmtree(args["CODE_DIRECTORY"] + "/checkpoints")

    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/models")
    os.mkdir(args["CODE_DIRECTORY"] + "/checkpoints/plots")

    #loading the pretrained weights
    if args["PRETRAINED_MODEL_FILE"] is not None:
        print("\n\nPre-trained Model File: %s" %(args["PRETRAINED_MODEL_FILE"]))
        print("\nLoading the pre-trained model .... \n")
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["PRETRAINED_MODEL_FILE"], map_location=device))
        model.to(device)
        print("Loading Done.\n")

    trainingLossCurve = list()
    validationLossCurve = list()
    trainingWERCurve = list()
    validationWERCurve = list()

    #printing the total and trainable parameters in the model
    numTotalParams, numTrainableParams = num_params(model)
    print("\nNumber of total parameters in the model = %d" %(numTotalParams))
    print("Number of trainable parameters in the model = %d\n" %(numTrainableParams))

    print("Number of Words = %d" %(args["PRETRAIN_NUM_WORDS"]))
    print("\nPretraining the model .... \n")

    trainParams = {"spaceIx": args["CHAR_TO_INDEX"][" "], "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]}
    valParams = {"decodeScheme": "greedy", "spaceIx": args["CHAR_TO_INDEX"][" "], "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]}

    for step in range(args["NUM_STEPS"]):

        #train the model for one step
        trainingLoss, trainingCER, trainingWER = train(model, pretrainLoader, optimizer, loss_function, device, trainParams)
        trainingLossCurve.append(trainingLoss)
        trainingWERCurve.append(trainingWER)

        #evaluate the model on the validation set
        validationLoss, validationCER, validationWER = evaluate(model, prevalLoader, loss_function, device, valParams)
        validationLossCurve.append(validationLoss)
        validationWERCurve.append(validationWER)

        #printing the stats after each step
        print("Step: %03d || Tr.Loss: %.6f  Val.Loss: %.6f || Tr.CER: %.3f  Val.CER: %.3f || Tr.WER: %.3f  Val.WER: %.3f"
              %(step, trainingLoss, validationLoss, trainingCER, validationCER, trainingWER, validationWER))

        #make a scheduler step
        scheduler.step(validationWER)

        #saving the model weights and loss/metric curves in the checkpoints directory after every few steps
        if ((step % args["SAVE_FREQUENCY"] == 0) or (step == args["NUM_STEPS"] - 1)) and (step != 0):

            savePath = args["CODE_DIRECTORY"] + "/checkpoints/models/pretrain_{:03d}w-step_{:04d}-wer_{:.3f}.pt".format(
                args["PRETRAIN_NUM_WORDS"], step, validationWER)
            torch.save(model.state_dict(), savePath)

            plt.figure()
            plt.title("Loss Curves")
            plt.xlabel("Step No.")
            plt.ylabel("Loss value")
            plt.plot(list(range(1, len(trainingLossCurve) + 1)), trainingLossCurve, "blue", label="Train")
            plt.plot(list(range(1, len(validationLossCurve) + 1)), validationLossCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] + "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-loss.png".format(
                args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

            plt.figure()
            plt.title("WER Curves")
            plt.xlabel("Step No.")
            plt.ylabel("WER")
            plt.plot(list(range(1, len(trainingWERCurve) + 1)), trainingWERCurve, "blue", label="Train")
            plt.plot(list(range(1, len(validationWERCurve) + 1)), validationWERCurve, "red", label="Validation")
            plt.legend()
            plt.savefig(args["CODE_DIRECTORY"] + "/checkpoints/plots/pretrain_{:03d}w-step_{:04d}-wer.png".format(
                args["PRETRAIN_NUM_WORDS"], step))
            plt.close()

    print("\nPretraining Done.\n")
    return
from utils.preprocessing import preprocess_sample
from utils.decoders import ctc_greedy_decode, ctc_search_decode
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")
    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the train and validation datasets and the corresponding dataloaders
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    trainData = LRS2Main("train", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                         args["STEP_SIZE"], videoParams)
    trainLoader = DataLoader(trainData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)
    valData = LRS2Main("val", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                       args["STEP_SIZE"], videoParams)
    valLoader = DataLoader(valData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)

    #declaring the model, optimizer, scheduler and the loss function
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args["INIT_LR"], betas=(args["MOMENTUM1"], args["MOMENTUM2"]))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=args["LR_SCHEDULER_FACTOR"],
                                                     patience=args["LR_SCHEDULER_WAIT"], threshold=args["LR_SCHEDULER_THRESH"],
                                                     threshold_mode="abs", min_lr=args["FINAL_LR"], verbose=True)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)
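#A minimal sketch of the training step that functions like train() above typically perform with
#these objects. This is illustrative only; the actual train() in this repo may differ in details
#such as decoding and metric computation.
def train_step_sketch(model, batch, optimizer, loss_function, device):
    inputBatch, targetBatch, inputLenBatch, targetLenBatch = batch
    inputBatch, targetBatch = inputBatch.float().to(device), targetBatch.long().to(device)
    inputLenBatch, targetLenBatch = inputLenBatch.int().to(device), targetLenBatch.int().to(device)

    optimizer.zero_grad()
    model.train()
    outputBatch = model(inputBatch)     #(T, N, C) log-probabilities, as required by CTC loss
    loss = loss_function(outputBatch, targetBatch, inputLenBatch, targetLenBatch)
    loss.backward()
    optimizer.step()
    return loss.item()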
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" %(args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" %(args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"], map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):

                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {"roiSize": args["ROI_SIZE"], "normMean": args["NORMALIZATION_MEAN"],
                              "normStd": args["NORMALIZATION_STD"], "vf": vf}
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(visualFeaturesFile, None, args["MAIN_REQ_INPUT_LENGTH"],
                                                           args["CHAR_TO_INDEX"], videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using the CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(outputBatch, inputLenBatch,
                                                                                args["CHAR_TO_INDEX"]["<EOS>"])
                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"], "alpha": args["LM_WEIGHT_ALPHA"],
                                            "beta": args["LENGTH_PENALTY_BETA"], "threshProb": args["THRESH_PROBABILITY"]}
                        predictionBatch, predictionLenBatch = ctc_search_decode(outputBatch, inputLenBatch, beamSearchParams,
                                                                                args["CHAR_TO_INDEX"][" "],
                                                                                args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting the character indices back to characters, dropping the trailing <EOS> token
                    pred = predictionBatch[:-1]
                    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" %(file))
                    print("Prediction: %s" %(pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
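#For intuition, a standalone sketch of standard CTC greedy (best-path) decoding: take the argmax
#class per frame, collapse consecutive repeats, then drop blanks. This illustrates the idea behind
#ctc_greedy_decode above but is not the repo's implementation (which also handles batching,
#per-sample lengths and the <EOS> index).
import torch

def greedy_decode_sketch(outputProbs, blank=0):
    #outputProbs: (T, C) scores or log-probabilities for a single sample
    bestPath = torch.argmax(outputProbs, dim=1).tolist()
    decoded, prev = [], blank
    for ix in bestPath:
        if ix != prev and ix != blank:   #skip repeated frames and blank frames
            decoded.append(ix)
        prev = ix
    return decoded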