def ctc_search_decode_checker():

    outputs = [
        "TTTEEEST-IINNNN-G- CC-TTCCC- -DEEE-CO-DD---E -FUU-NCCC--TAA-B-FA--E",
        "ONNE SSSTEEEP ISSS OOOOVVEERA- FDDA-S A FD-AASDF - AD-AFA DF-ADF SF-ADF",
        "EVERYTHING ALRIGHT CHECK DONE SH-SG-GAD-G HS- RA-R H J- J-AM GA-AM GA-GA-AD",
        "SSEEEE-E-- -EEE-VE-NNN ---DDDOOOO-ODDE-E --O-OOOOTTTY AAAASS-SSAAM WORK",
        "---------------------------------------------------------------------------"
    ]
    inpLens = [64, 32, 29, 75, 56]

    #building a synthetic (unnormalized) score tensor: every class gets a small constant
    #score and the character at each time step gets a dominant one, so the decoder should
    #recover the strings above (blanks '-' map to index 0); the time dimension must cover
    #both the longest string and the largest input length
    maxLen = max(max(inpLens), max(len(out) for out in outputs))
    outputProbs = 0.01 * torch.ones((maxLen, len(inpLens), args["NUM_CLASSES"]))
    inpLens = torch.tensor(inpLens)
    for n in range(len(outputs)):
        for i in range(len(outputs[n])):
            char = outputs[n][i]
            if char == "-":
                ix = 0
            else:
                ix = args["CHAR_TO_INDEX"][char]
            outputProbs[i, n, ix] = 1.5
    outputLogProbs = torch.log(outputProbs)

    beamSearchParams = {"beamWidth": args["BEAM_WIDTH"],
                        "alpha": args["LM_WEIGHT_ALPHA"],
                        "beta": args["LENGTH_PENALTY_BETA"],
                        "threshProb": args["THRESH_PROBABILITY"]}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lm = LRS2CharLM()
    lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    predictions, predictionLens = ctc_search_decode(
        outputLogProbs, inpLens, beamSearchParams,
        args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)

    #dropping the <EOS> tokens and splitting the flat prediction back into one string
    #per sample (each prediction length includes the trailing <EOS>)
    predictions = [
        args["INDEX_TO_CHAR"][ix] for ix in predictions.tolist()
        if ix != args["CHAR_TO_INDEX"]["<EOS>"]
    ]
    predictedSequences = list()
    s = 0
    for ln in predictionLens.tolist():
        predictedSequences.append("".join(predictions[s:s + ln - 1]))
        s = s + ln - 1
    print(predictedSequences)
    return
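#A companion sketch (not part of the original file): the same synthetic log-probability
#tensor built in ctc_search_decode_checker() can also sanity-check ctc_greedy_decode,
#which needs neither the beam-search parameters nor the language model. It assumes
#ctc_greedy_decode is imported alongside ctc_search_decode and that the decoder appends
#an <EOS> index to each sequence, as the post-processing above implies.
def ctc_greedy_decode_checker(outputLogProbs, inpLens):
    predictions, predictionLens = ctc_greedy_decode(
        outputLogProbs, inpLens, args["CHAR_TO_INDEX"]["<EOS>"])
    #dropping the <EOS> tokens and splitting the flat prediction into one string per sample
    predictions = [
        args["INDEX_TO_CHAR"][ix] for ix in predictions.tolist()
        if ix != args["CHAR_TO_INDEX"]["<EOS>"]
    ]
    predictedSequences = list()
    s = 0
    for ln in predictionLens.tolist():
        predictedSequences.append("".join(predictions[s:s + ln - 1]))
        s = s + ln - 1
    print(predictedSequences)
    return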
def validation_step(self, batch, batch_idx):
    evalParams = {"decodeScheme": "greedy",
                  "spaceIx": args["CHAR_TO_INDEX"][" "],
                  "eosIx": args["CHAR_TO_INDEX"]["<EOS>"]}

    inputBatch, targetBatch, inputLenBatch, targetLenBatch = batch
    inputBatch, targetBatch = inputBatch.float(), targetBatch.int()
    inputLenBatch, targetLenBatch = inputLenBatch.int(), targetLenBatch.int()

    outputBatch = self.model(inputBatch)
    with torch.backends.cudnn.flags(enabled=False):
        loss = self.loss_fn(outputBatch, targetBatch, inputLenBatch, targetLenBatch)
    evalLoss = loss

    if evalParams["decodeScheme"] == "greedy":
        predictionBatch, predictionLenBatch = ctc_greedy_decode(
            outputBatch, inputLenBatch, evalParams["eosIx"])
    elif evalParams["decodeScheme"] == "search":
        predictionBatch, predictionLenBatch = ctc_search_decode(
            outputBatch, inputLenBatch, evalParams["beamSearchParams"],
            evalParams["spaceIx"], evalParams["eosIx"], evalParams["lm"])
    else:
        print("Invalid Decode Scheme")
        exit()

    evalCER = compute_cer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch)
    evalWER = compute_wer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch,
                          evalParams["spaceIx"])

    self.log('val_loss', evalLoss, prog_bar=True)
    self.log('val_wer', evalWER, prog_bar=True)
    self.log('val_cer', evalCER, prog_bar=True)
    return evalLoss
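#A minimal sketch (an assumption, not part of the original module) of the extra keys the
#"search" branch of validation_step() expects in evalParams; the beam-search values mirror
#the beamSearchParams dict used in the demo scripts below, and lm is assumed to be a loaded
#LRS2CharLM instance (or None when args["USE_LM"] is False). The helper name is hypothetical.
def make_search_eval_params(lm):
    return {"decodeScheme": "search",
            "spaceIx": args["CHAR_TO_INDEX"][" "],
            "eosIx": args["CHAR_TO_INDEX"]["<EOS>"],
            "beamSearchParams": {"beamWidth": args["BEAM_WIDTH"],
                                 "alpha": args["LM_WEIGHT_ALPHA"],
                                 "beta": args["LENGTH_PENALTY_BETA"],
                                 "threshProb": args["THRESH_PROBABILITY"]},
            "lm": lm}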
#running the model
inputBatch, targetBatch = (inputBatch.float()).to(device), (targetBatch.int()).to(device)
inputLenBatch, targetLenBatch = (inputLenBatch.int()).to(device), (targetLenBatch.int()).to(device)
model.eval()
with torch.no_grad():
    outputBatch = model(inputBatch)

#obtaining the prediction using the CTC decoder
if args["TEST_DEMO_DECODING"] == "greedy":
    predictionBatch, predictionLenBatch = ctc_greedy_decode(outputBatch, inputLenBatch,
                                                            args["CHAR_TO_INDEX"]["<EOS>"])
elif args["TEST_DEMO_DECODING"] == "search":
    beamSearchParams = {"beamWidth": args["BEAM_WIDTH"],
                        "alpha": args["LM_WEIGHT_ALPHA"],
                        "beta": args["LENGTH_PENALTY_BETA"],
                        "threshProb": args["THRESH_PROBABILITY"]}
    predictionBatch, predictionLenBatch = ctc_search_decode(outputBatch, inputLenBatch,
                                                            beamSearchParams,
                                                            args["CHAR_TO_INDEX"][" "],
                                                            args["CHAR_TO_INDEX"]["<EOS>"], lm)
else:
    print("Invalid Decode Scheme")
    exit()

#computing CER and WER
cer = compute_cer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch)
wer = compute_wer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch,
                  args["CHAR_TO_INDEX"][" "])

#converting character indices back to characters (dropping the trailing <EOS> index)
pred = predictionBatch[:-1]
trgt = targetBatch[:-1]
pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])
trgt = "".join([args["INDEX_TO_CHAR"][ix] for ix in trgt.tolist()])
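#A hedged follow-up (not in the original snippet): the decoded strings and the error rates
#computed above would typically be reported along the same lines as the demo scripts below.
print("Prediction: %s" % (pred))
print("Target: %s" % (trgt))
print("CER: %.3f || WER: %.3f" % (cer, wer))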
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                         args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                         args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):
                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {"roiSize": args["ROI_SIZE"],
                              "normMean": args["NORMALIZATION_MEAN"],
                              "normStd": args["NORMALIZATION_STD"],
                              "vf": vf}
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        visualFeaturesFile, None, args["MAIN_REQ_INPUT_LENGTH"],
                        args["CHAR_TO_INDEX"], videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using the CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])
                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"],
                                            "alpha": args["LM_WEIGHT_ALPHA"],
                                            "beta": args["LENGTH_PENALTY_BETA"],
                                            "threshProb": args["THRESH_PROBABILITY"]}
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters (dropping the trailing <EOS> index)
                    pred = predictionBatch[:-1]
                    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Prediction: %s" % (pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                      args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                      args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                      args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        #reading the noise file
        if args["TEST_DEMO_NOISY"]:
            _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
        else:
            noise = None

        print("\n\nRunning Demo .... \n")
        rows = []
        print(args["TEST_DEMO_MODE"])

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):

                    #parsing the speaker number, clip number and clip type from the file name
                    sNum = file[7]
                    if "0" <= file[13] <= "9":
                        cNum = file[12:14]
                    else:
                        cNum = file[12]
                    if file[-6] == "l":
                        cType = "jumble"
                    elif file[-6] == "s":
                        cType = "base"
                    else:
                        cType = file[-8:-4]

                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {"roiSize": args["ROI_SIZE"],
                              "normMean": args["NORMALIZATION_MEAN"],
                              "normStd": args["NORMALIZATION_STD"],
                              "vf": vf}
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    audioFile = os.path.join(root, file[:-4]) + ".wav"
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    audioParams = {"stftWindow": args["STFT_WINDOW"],
                                   "stftWinLen": args["STFT_WIN_LENGTH"],
                                   "stftOverlap": args["STFT_OVERLAP"]}
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        audioFile, visualFeaturesFile, None, noise,
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        args["NOISE_SNR_DB"], audioParams, videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None, inpLen, None)])

                    #running the model
                    inputBatch = ((inputBatch[0].float()).to(device),
                                  (inputBatch[1].float()).to(device))
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    if args["TEST_DEMO_MODE"] == "AO":
                        inputBatch = (inputBatch[0], None)
                    elif args["TEST_DEMO_MODE"] == "VO":
                        inputBatch = (None, inputBatch[1])
                    elif args["TEST_DEMO_MODE"] == "AV":
                        pass
                    else:
                        print("Invalid Operation Mode.")
                        exit()
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using the CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])
                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"],
                                            "alpha": args["LM_WEIGHT_ALPHA"],
                                            "beta": args["LENGTH_PENALTY_BETA"],
                                            "threshProb": args["THRESH_PROBABILITY"]}
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters (dropping the trailing <EOS> index)
                    pred = predictionBatch[:-1]
                    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Speaker: " + sNum + " Clip: " + cNum + " Clip Type: " + cType)
                    print("Prediction: %s" % (pred))
                    print("\n")
                    row = [sNum, cNum, cType, pred]
                    rows.append(row)

        print("Demo Completed.\n")

        #writing all predictions to a csv file
        with open("predictions.csv", "w", newline="") as csvFile:
            writer = csv.writer(csvFile)
            for row in rows:
                writer.writerow(row)

    else:
        print("\nPath to trained model file not specified.\n")

    return
def inference(sampleFile, targetFile=None):

    print('In Inference')

    #preprocessing the sample
    params = {"roiSize": args["ROI_SIZE"],
              "normMean": args["NORMALIZATION_MEAN"],
              "normStd": args["NORMALIZATION_STD"],
              "vf": vf}
    preprocess_sample(sampleFile, params)

    #converting the data sample into appropriate tensors for input to the model
    visualFeaturesFile = sampleFile + ".npy"
    videoParams = {"videoFPS": args["VIDEO_FPS"]}
    inp, trgt, inpLen, trgtLen = prepare_main_input(
        visualFeaturesFile, targetFile, args["MAIN_REQ_INPUT_LENGTH"],
        args["CHAR_TO_INDEX"], videoParams)
    inputBatch, targetBatch, inputLenBatch, targetLenBatch = collate_fn([(inp, trgt, inpLen, trgtLen)])

    #running the model (target tensors are only available when a target file is given)
    inputBatch = (inputBatch.float()).to(device)
    inputLenBatch = (inputLenBatch.int()).to(device)
    if targetBatch is not None:
        targetBatch = (targetBatch.int()).to(device)
        targetLenBatch = (targetLenBatch.int()).to(device)
    model.eval()
    with torch.no_grad():
        outputBatch = model(inputBatch)

    #obtaining the prediction using the CTC decoder
    if args["TEST_DEMO_DECODING"] == "greedy":
        predictionBatch, predictionLenBatch = ctc_greedy_decode(
            outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])
    elif args["TEST_DEMO_DECODING"] == "search":
        beamSearchParams = {"beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]}
        predictionBatch, predictionLenBatch = ctc_search_decode(
            outputBatch, inputLenBatch, beamSearchParams,
            args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)
    else:
        print("Invalid Decode Scheme")
        exit()

    #computing CER and WER (requires a target file)
    #cer = compute_cer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch)
    #wer = compute_wer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch, args["CHAR_TO_INDEX"][" "])

    #converting character indices back to characters (dropping the trailing <EOS> index)
    pred = predictionBatch[:-1]
    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

    #printing the predictions
    print("Prediction: %s" % (pred))
    print("\n")
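#A hypothetical usage sketch (the paths are placeholders, not from the repository):
#inference() takes the sample path without its ".mp4" extension, since preprocess_sample()
#and the ".npy" visual-features file both derive their names from it; targetFile, if given,
#points to the transcript file used to build the target tensors.
#inference("/path/to/demo/sample", targetFile="/path/to/demo/sample.txt")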