def lrs2charlm_checker():
    """
    Sanity check for the trained character-level language model.

    Loads the LRS2CharLM weights from args["TRAINED_LM_FILE"], feeds the model
    the space character and then samples 100 characters one at a time: every
    sampled index, together with the state returned by the model, is fed back
    in as the next input. The sampled text is printed to stdout.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = LRS2CharLM()
    model.load_state_dict(
        torch.load(args["TRAINED_LM_FILE"], map_location=device))
    model.to(device)
    #evaluation mode does not change between steps, so it is set only once
    model.eval()

    #the model input is the CHAR_TO_INDEX value shifted down by one
    inp = torch.tensor(args["CHAR_TO_INDEX"][" "] - 1)
    stateBatch = None
    sampledChars = list()

    #autograd is not needed anywhere in the sampling loop
    with torch.no_grad():
        for _ in range(100):
            inputBatch = inp.reshape(1, 1).to(device)
            outputBatch, stateBatch = model(inputBatch, stateBatch)

            #the output is exponentiated before sampling, so it is presumably
            #a vector of log-probabilities - TODO confirm against LRS2CharLM
            probs = np.asarray(torch.exp(outputBatch).squeeze().tolist())
            probs = probs / np.sum(probs)

            ix = np.random.choice(np.arange(len(probs)), p=probs)
            #INDEX_TO_CHAR keys are one higher than the sampled model index
            sampledChars.append(args["INDEX_TO_CHAR"][ix + 1])
            inp = torch.tensor(ix)

    print("".join(sampledChars))
    return
def main():
    """
    Evaluate the trained video-only model on the LRS2 test split.

    Seeds NumPy and PyTorch, builds the test dataloader, loads the VideoNet
    weights from args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"] and the
    character language model from args["TRAINED_LM_FILE"], runs evaluate()
    and prints the test loss, CER and WER. When args["TRAINED_MODEL_FILE"] is
    None only a notice is printed.
    """
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()

    #the second GPU is still preferred when it exists; on single-GPU machines
    #"cuda:1" is an invalid device ordinal, so fall back to the default device
    if gpuAvailable:
        device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda")
    else:
        device = torch.device("cpu")

    kwargs = {"num_workers": args["NUM_WORKERS"], "pin_memory": True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    #declaring the test dataset and test dataloader
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    testData = LRS2Main("test", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"],
                        args["CHAR_TO_INDEX"], args["STEP_SIZE"], videoParams)
    testLoader = DataLoader(testData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn,
                            shuffle=True, **kwargs)

    if args["TRAINED_MODEL_FILE"] is None:
        print("Path to the trained model file not specified.\n")
        return

    print("\nTrained Model File: %s" %(args["TRAINED_MODEL_FILE"]))

    #declaring the model, loss function and loading the trained model weights
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"],
                     args["PE_MAX_LENGTH"], args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                     args["NUM_CLASSES"])
    #the checkpoint path is a plain string concatenation of the two settings
    model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                                     map_location=device))
    model.to(device)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

    #declaring the language model; it is always loaded and then discarded
    #when args["USE_LM"] is falsy
    lm = LRS2CharLM()
    lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    print("\nTesting the trained model .... \n")

    beamSearchParams = {
        "beamWidth": args["BEAM_WIDTH"],
        "alpha": args["LM_WEIGHT_ALPHA"],
        "beta": args["LENGTH_PENALTY_BETA"],
        "threshProb": args["THRESH_PROBABILITY"],
    }
    testParams = {
        "decodeScheme": args["TEST_DEMO_DECODING"],
        "beamSearchParams": beamSearchParams,
        "spaceIx": args["CHAR_TO_INDEX"][" "],
        "eosIx": args["CHAR_TO_INDEX"]["<EOS>"],
        "lm": lm,
    }

    #evaluating the model over the test set
    testLoss, testCER, testWER = evaluate(model, testLoader, loss_function, device, testParams)

    #printing the test set loss, CER and WER
    print("Test Loss: %.6f || Test CER: %.3f || Test WER: %.3f" %(testLoss, testCER, testWER))
    print("\nTesting Done.\n")
    return
def ctc_search_decode_checker():
    """
    Sanity check for ctc_search_decode() on hand-written frame sequences.

    Every string in `outputs` is one batch element, one character per time
    step. A synthetic score tensor is built in which the written character
    gets the value 1.5 and every other class 0.01; "-" is mapped to class
    index 0. The log of that tensor is decoded with the beam search decoder
    (with the language model when args["USE_LM"] is truthy) and the decoded
    strings are printed as a list.
    """
    outputs = [
        "TTTEEEST-IINNNN-G- CC-TTCCC- -DEEE-CO-DD---E -FUU-NCCC--TAA-B-FA--E",
        "ONNE SSSTEEEP ISSS OOOOVVEERA- FDDA-S A FD-AASDF - AD-AFA DF-ADF SF-ADF",
        "EVERYTHING ALRIGHT CHECK DONE SH-SG-GAD-G HS- RA-R H J- J-AM GA-AM GA-GA-AD",
        "SSEEEE-E-- -EEE-VE-NNN ---DDDOOOO-ODDE-E --O-OOOOTTTY AAAASS-SSAAM WORK",
        "---------------------------------------------------------------------------"
    ]
    inpLens = [64, 32, 29, 75, 56]

    #the time axis must hold the longest written sequence and the largest
    #declared input length; sizing it from outputs[0] alone overflows as soon
    #as a later sequence is longer than the first one
    numSteps = max(max(len(seq) for seq in outputs), max(inpLens))
    outputProbs = 0.01 * torch.ones((numSteps, len(inpLens), args["NUM_CLASSES"]))
    inpLens = torch.tensor(inpLens)

    #steps beyond the end of a sequence keep the uniform 0.01 background
    for n, seq in enumerate(outputs):
        for i, char in enumerate(seq):
            ix = 0 if char == "-" else args["CHAR_TO_INDEX"][char]
            outputProbs[i, n, ix] = 1.5
    outputLogProbs = torch.log(outputProbs)

    beamSearchParams = {
        "beamWidth": args["BEAM_WIDTH"],
        "alpha": args["LM_WEIGHT_ALPHA"],
        "beta": args["LENGTH_PENALTY_BETA"],
        "threshProb": args["THRESH_PROBABILITY"]
    }

    #declaring the language model; it is discarded when args["USE_LM"] is falsy
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lm = LRS2CharLM()
    lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    eosIx = args["CHAR_TO_INDEX"]["<EOS>"]
    predictions, predictionLens = ctc_search_decode(
        outputLogProbs, inpLens, beamSearchParams,
        args["CHAR_TO_INDEX"][" "], eosIx, lm)

    #the decoder output is one flat sequence of indices; <EOS> entries are
    #dropped before the sequence is cut back into per-sample strings
    predictions = [
        args["INDEX_TO_CHAR"][ix] for ix in predictions.tolist() if ix != eosIx
    ]

    predictedSequences = list()
    s = 0
    for ln in predictionLens.tolist():
        #each length is reduced by one to account for the removed <EOS>
        #(assumes exactly one <EOS> per prediction - TODO confirm)
        predictedSequences.append("".join(predictions[s:s + ln - 1]))
        s = s + ln - 1

    print(predictedSequences)
    return
print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"])) #declaring the model,loss function and loading the trained model weights model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"], args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"]) model.load_state_dict( torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"], map_location=device)) model.to(device) loss_function = nn.CTCLoss(blank=0, zero_infinity=False) #declaring the language model lm = LRS2CharLM() lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device)) lm.to(device) if not args["USE_LM"]: lm = None print("\nTesting the trained model .... \n") beamSearchParams = { "beamWidth": args["BEAM_WIDTH"], "alpha": args["LM_WEIGHT_ALPHA"], "beta": args["LENGTH_PENALTY_BETA"], "threshProb": args["THRESH_PROBABILITY"] } if args["TEST_DEMO_MODE"] == "AO":
def main():
    """
    Run the trained video-only model on every .mp4 file below the demo directory.

    Seeds NumPy and PyTorch, loads the VideoNet, VisualFrontend and LRS2CharLM
    weights, then walks args["DEMO_DIRECTORY"]. Each video is preprocessed,
    turned into a one-sample batch, passed through the model and decoded with
    the scheme named by args["TEST_DEMO_DECODING"] ("greedy" or "search");
    the file name and the predicted text are printed. When
    args["TRAINED_MODEL_FILE"] is None only a notice is printed.
    """
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if args["TRAINED_MODEL_FILE"] is None:
        print("\nPath to trained model file not specified.\n")
        return

    print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
    print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

    #declaring the model and loading the trained weights
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                     args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                     args["NUM_CLASSES"])
    modelWeights = torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                              map_location=device)
    model.load_state_dict(modelWeights)
    model.to(device)

    #declaring the visual frontend module
    vf = VisualFrontend()
    frontendWeights = torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device)
    vf.load_state_dict(frontendWeights)
    vf.to(device)

    #declaring the language model (dropped again when it is not to be used)
    lm = LRS2CharLM()
    lmWeights = torch.load(args["TRAINED_LM_FILE"], map_location=device)
    lm.load_state_dict(lmWeights)
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    print("\n\nRunning Demo .... \n")

    #walking through the demo directory and running the model on all video files in it
    for root, _, fileNames in os.walk(args["DEMO_DIRECTORY"]):
        for fileName in fileNames:
            if not fileName.endswith(".mp4"):
                continue

            #path of the sample without the ".mp4" extension
            sampleFile = os.path.join(root, fileName[:-4])

            #preprocessing the sample
            preprocess_sample(sampleFile, {
                "roiSize": args["ROI_SIZE"],
                "normMean": args["NORMALIZATION_MEAN"],
                "normStd": args["NORMALIZATION_STD"],
                "vf": vf
            })

            #converting the data sample into appropriate tensors for input to the model
            visualFeaturesFile = sampleFile + ".npy"
            inp, _, inpLen, _ = prepare_main_input(
                visualFeaturesFile, None, args["MAIN_REQ_INPUT_LENGTH"],
                args["CHAR_TO_INDEX"], {"videoFPS": args["VIDEO_FPS"]})
            inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

            #running the model
            inputBatch = inputBatch.float().to(device)
            inputLenBatch = inputLenBatch.int().to(device)
            model.eval()
            with torch.no_grad():
                outputBatch = model(inputBatch)

            #obtaining the prediction using the CTC decoder
            if args["TEST_DEMO_DECODING"] == "greedy":
                predictionBatch, _ = ctc_greedy_decode(
                    outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])
            elif args["TEST_DEMO_DECODING"] == "search":
                predictionBatch, _ = ctc_search_decode(
                    outputBatch, inputLenBatch,
                    {
                        "beamWidth": args["BEAM_WIDTH"],
                        "alpha": args["LM_WEIGHT_ALPHA"],
                        "beta": args["LENGTH_PENALTY_BETA"],
                        "threshProb": args["THRESH_PROBABILITY"]
                    },
                    args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)
            else:
                print("Invalid Decode Scheme")
                exit()

            #converting character indices back to characters; the last index
            #is left out (presumably the <EOS> token)
            charIndices = predictionBatch[:-1].tolist()
            pred = "".join(args["INDEX_TO_CHAR"][ix] for ix in charIndices)

            #printing the predictions
            print("File: %s" % (fileName))
            print("Prediction: %s" % (pred))
            print("\n")

    print("Demo Completed.\n")
    return
def _parse_demo_file_name(fileName):
    """
    Read speaker number, clip number and clip type from fixed character
    positions of a demo video file name.

    One-character slices are used instead of plain indexing, so a name that
    is too short for a position yields an empty field rather than raising
    IndexError. Names long enough for every position give the same values as
    plain indexing.
    """
    sNum = fileName[7:8]

    #the clip number has two digits when position 13 holds a digit
    if "0" <= fileName[13:14] <= "9":
        cNum = fileName[12:14]
    else:
        cNum = fileName[12:13]

    #the second-to-last character before ".mp4" selects the clip type
    marker = fileName[-6:-5]
    if marker == "l":
        cType = "jumble"
    elif marker == "s":
        cType = "base"
    else:
        cType = fileName[-8:-4]
    return sNum, cNum, cType


def main():
    """
    Run the trained audio-visual model on every .mp4 file below the demo
    directory and store the predictions in "predictions.csv".

    Seeds NumPy and PyTorch, loads the AVNet, VisualFrontend and LRS2CharLM
    weights and, when args["TEST_DEMO_NOISY"] is truthy, the noise recording
    args["DATA_DIRECTORY"] + "/noise.wav". Each video is preprocessed, turned
    into a one-sample batch, restricted to the inputs selected by
    args["TEST_DEMO_MODE"] ("AO", "VO" or "AV"), passed through the model and
    decoded with the scheme named by args["TEST_DEMO_DECODING"] ("greedy" or
    "search"). One row [speaker, clip, clip type, prediction] per video is
    written to the CSV file. When args["TRAINED_MODEL_FILE"] is None only a
    notice is printed.
    """
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is None:
        print("\nPath to trained model file not specified.\n")
        return

    print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
    print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

    #declaring the model and loading the trained weights
    model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                  args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                  args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                  args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.load_state_dict(
        torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                   map_location=device))
    model.to(device)
    #evaluation mode does not change between files, so it is set only once
    model.eval()

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #declaring the language model; it is discarded when args["USE_LM"] is falsy
    lm = LRS2CharLM()
    lm.load_state_dict(
        torch.load(args["TRAINED_LM_FILE"], map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    #reading the noise file
    if args["TEST_DEMO_NOISY"]:
        _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
    else:
        noise = None

    print("\n\nRunning Demo .... \n")
    rows = []
    print(args['TEST_DEMO_MODE'])

    #parameter bundles that are identical for every file
    preprocessParams = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }
    audioParams = {
        "stftWindow": args["STFT_WINDOW"],
        "stftWinLen": args["STFT_WIN_LENGTH"],
        "stftOverlap": args["STFT_OVERLAP"]
    }
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    beamSearchParams = {
        "beamWidth": args["BEAM_WIDTH"],
        "alpha": args["LM_WEIGHT_ALPHA"],
        "beta": args["LENGTH_PENALTY_BETA"],
        "threshProb": args["THRESH_PROBABILITY"]
    }

    #walking through the demo directory and running the model on all video files in it
    for root, _, fileNames in os.walk(args["DEMO_DIRECTORY"]):
        for fileName in fileNames:
            if not fileName.endswith(".mp4"):
                continue

            sNum, cNum, cType = _parse_demo_file_name(fileName)

            #path of the sample without the ".mp4" extension
            sampleFile = os.path.join(root, fileName[:-4])

            #preprocessing the sample
            preprocess_sample(sampleFile, preprocessParams)

            #converting the data sample into appropriate tensors for input to the model
            audioFile = sampleFile + ".wav"
            visualFeaturesFile = sampleFile + ".npy"
            inp, _, inpLen, _ = prepare_main_input(
                audioFile, visualFeaturesFile, None, noise,
                args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                args["NOISE_SNR_DB"], audioParams, videoParams)
            inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

            #running the model; the batch is a pair whose first entry is kept
            #for "AO" and whose second entry is kept for "VO"
            inputBatch = (inputBatch[0].float().to(device),
                          inputBatch[1].float().to(device))
            inputLenBatch = inputLenBatch.int().to(device)
            if args["TEST_DEMO_MODE"] == "AO":
                inputBatch = (inputBatch[0], None)
            elif args["TEST_DEMO_MODE"] == "VO":
                inputBatch = (None, inputBatch[1])
            elif args["TEST_DEMO_MODE"] == "AV":
                pass
            else:
                print("Invalid Operation Mode.")
                exit()

            with torch.no_grad():
                outputBatch = model(inputBatch)

            #obtaining the prediction using the CTC decoder
            if args["TEST_DEMO_DECODING"] == "greedy":
                predictionBatch, predictionLenBatch = ctc_greedy_decode(
                    outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])
            elif args["TEST_DEMO_DECODING"] == "search":
                predictionBatch, predictionLenBatch = ctc_search_decode(
                    outputBatch, inputLenBatch, beamSearchParams,
                    args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)
            else:
                print("Invalid Decode Scheme")
                exit()

            #converting character indices back to characters; the last index
            #is left out (presumably the <EOS> token)
            pred = "".join(
                [args["INDEX_TO_CHAR"][ix] for ix in predictionBatch[:-1].tolist()])

            #printing the predictions
            print("File: %s" % (fileName))
            print("Speaker: " + sNum + " Clip: " + cNum + " Clip Type: " + cType)
            print("Prediction: %s" % (pred))
            print("\n")
            rows.append([sNum, cNum, cType, pred])

    print("Demo Completed.\n")

    #storing one row per video in the current working directory
    with open("predictions.csv", "w", newline="") as csvFile:
        csv.writer(csvFile).writerows(rows)
    return