def preprocess_sample_checker():
    """Run the preprocessing step once on the bundled demo sample.

    Loads the trained visual frontend weights onto the GPU when CUDA is
    available (CPU otherwise) and calls ``preprocess_sample`` on
    ``<CODE_DIRECTORY>/demo/00001`` with the ROI size and normalisation
    statistics taken from ``args``.
    """
    #choosing the device on which the visual frontend runs
    deviceName = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(deviceName)

    #declaring the visual frontend module and loading its trained weights
    vf = VisualFrontend()
    frontendWeights = torch.load(args["TRAINED_FRONTEND_FILE"],
                                 map_location=device)
    vf.load_state_dict(frontendWeights)
    vf.to(device)

    #the sample is addressed by its path without the file extension
    sampleFile = args["CODE_DIRECTORY"] + "/demo/00001"
    params = dict(roiSize=args["ROI_SIZE"],
                  normMean=args["NORMALIZATION_MEAN"],
                  normStd=args["NORMALIZATION_STD"],
                  vf=vf)
    preprocess_sample(sampleFile, params)
    return
import torch import os from tqdm import tqdm import numpy as np from config import args from models.visual_frontend import VisualFrontend from utils.preprocessing import preprocess_sample np.random.seed(args["SEED"]) torch.manual_seed(args["SEED"]) gpuAvailable = torch.cuda.is_available() device = torch.device("cuda" if gpuAvailable else "cpu") #declaring the visual frontend module vf = VisualFrontend() vf.load_state_dict( torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device)) vf.to(device) #walking through the data directory and obtaining a list of all files in the dataset filesList = list() for root, dirs, files in os.walk(args["DATA_DIRECTORY"]): for file in files: if file.endswith(".mp4"): filesList.append(os.path.join(root, file[:-4])) #Preprocessing each sample print("\nNumber of data samples to be processed = %d" % (len(filesList))) print("\n\nStarting preprocessing ....\n") params = { "roiSize": args["ROI_SIZE"],
def main():
    """Run the trained VideoNet model on every ``.mp4`` file in the demo directory.

    Seeds NumPy and PyTorch from ``args["SEED"]``, loads the VideoNet and
    visual frontend weights (plus the character language model when
    ``args["USE_LM"]`` is set), then walks ``args["DEMO_DIRECTORY"]``. Each
    video is preprocessed, turned into a model input, run through the model
    and decoded with the scheme named by ``args["TEST_DEMO_DECODING"]``
    (``"greedy"`` or ``"search"``). The prediction is printed per file.

    If ``args["TRAINED_MODEL_FILE"]`` is ``None`` a notice is printed and the
    function returns without doing any work.

    Raises:
        SystemExit: with status 1 when the decoding scheme is neither
            ``"greedy"`` nor ``"search"``. The check runs before any model
            is loaded so a bad configuration fails immediately.
    """
    #local import: sys is only needed for the error exit below
    import sys

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is None:
        print("\nPath to trained model file not specified.\n")
        return

    print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
    print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

    #validating the decoding scheme up front instead of after the first sample has been processed
    decodingScheme = args["TEST_DEMO_DECODING"]
    if decodingScheme not in ("greedy", "search"):
        print("Invalid Decode Scheme")
        #sys.exit with a non-zero status; the bare exit() helper comes from the site module and reports success
        sys.exit(1)

    #declaring the model and loading the trained weights
    model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                     args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                     args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                     args["NUM_CLASSES"])
    model.load_state_dict(
        torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                   map_location=device))
    model.to(device)
    #the model is only used for inference, so it is switched to eval mode once
    model.eval()

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #declaring the language model only when it is going to be used,
    #so that a missing LM checkpoint does not break LM-free runs
    lm = None
    if args["USE_LM"]:
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)

    #preprocessing parameters are the same for every sample
    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }

    print("\n\nRunning Demo ....\n")

    #walking through the demo directory and running the model on all video files in it
    for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
        for file in files:
            if not file.endswith(".mp4"):
                continue

            #sample path without the ".mp4" extension
            sampleFile = os.path.join(root, file[:-4])

            #preprocessing the sample
            preprocess_sample(sampleFile, params)

            #converting the data sample into appropriate tensors for input to the model
            visualFeaturesFile = sampleFile + ".npy"
            videoParams = {"videoFPS": args["VIDEO_FPS"]}
            inp, _, inpLen, _ = prepare_main_input(
                visualFeaturesFile, None, args["MAIN_REQ_INPUT_LENGTH"],
                args["CHAR_TO_INDEX"], videoParams)
            inputBatch, _, inputLenBatch, _ = collate_fn([
                (inp, None, inpLen, None)
            ])

            #running the model
            inputBatch = (inputBatch.float()).to(device)
            inputLenBatch = (inputLenBatch.int()).to(device)
            with torch.no_grad():
                outputBatch = model(inputBatch)

            #obtaining the prediction using CTC decoder
            if decodingScheme == "greedy":
                predictionBatch, _ = ctc_greedy_decode(
                    outputBatch, inputLenBatch,
                    args["CHAR_TO_INDEX"]["<EOS>"])
            else:
                beamSearchParams = {
                    "beamWidth": args["BEAM_WIDTH"],
                    "alpha": args["LM_WEIGHT_ALPHA"],
                    "beta": args["LENGTH_PENALTY_BETA"],
                    "threshProb": args["THRESH_PROBABILITY"]
                }
                predictionBatch, _ = ctc_search_decode(
                    outputBatch, inputLenBatch, beamSearchParams,
                    args["CHAR_TO_INDEX"][" "],
                    args["CHAR_TO_INDEX"]["<EOS>"], lm)

            #converting character indices back to characters (the last index is dropped)
            pred = predictionBatch[:][:-1]
            pred = "".join(
                [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

            #printing the predictions
            print("File: %s" % (file))
            print("Prediction: %s" % (pred))
            print("\n")

    print("Demo Completed.\n")
    return
def _parse_clip_name(file):
    """Extract speaker number, clip number and clip type from a demo file name.

    The fields are read from fixed character positions of ``file``:
    position 7 is the speaker number, positions 12-13 hold the clip number
    (two characters when position 13 is a digit, otherwise one), and the
    character six places from the end selects the clip type (``"l"`` ->
    ``"jumble"``, ``"s"`` -> ``"base"``, anything else -> the four characters
    before the extension).

    Slices are used instead of single-character indexing so that a name too
    short to carry a field yields an empty string rather than an IndexError.

    Returns:
        tuple: ``(sNum, cNum, cType)`` as strings.
    """
    sNum = file[7:8]
    if "0" <= file[13:14] <= "9":
        cNum = file[12:14]
    else:
        cNum = file[12:13]

    typeMarker = file[-6:-5]
    if typeMarker == "l":
        cType = "jumble"
    elif typeMarker == "s":
        cType = "base"
    else:
        cType = file[-8:-4]
    return sNum, cNum, cType


def main():
    """Run the trained AVNet model on every ``.mp4`` file in the demo directory.

    Seeds NumPy and PyTorch from ``args["SEED"]``, loads the AVNet and visual
    frontend weights (plus the character language model when
    ``args["USE_LM"]`` is set) and, when ``args["TEST_DEMO_NOISY"]`` is set,
    the noise clip ``<DATA_DIRECTORY>/noise.wav``. Each video found under
    ``args["DEMO_DIRECTORY"]`` is preprocessed, combined with its ``.wav``
    file into a model input, run through the model in the mode named by
    ``args["TEST_DEMO_MODE"]`` (``"AO"``, ``"VO"`` or ``"AV"``) and decoded
    with the scheme named by ``args["TEST_DEMO_DECODING"]`` (``"greedy"`` or
    ``"search"``). Predictions are printed and finally written to
    ``predictions.csv`` in the working directory, one row per file:
    speaker number, clip number, clip type, prediction.

    If ``args["TRAINED_MODEL_FILE"]`` is ``None`` a notice is printed and the
    function returns without doing any work.

    Raises:
        SystemExit: with status 1 when the operation mode or the decoding
            scheme is not one of the supported values. Both are checked
            before any model is loaded.
    """
    #local import: sys is only needed for the error exits below
    import sys

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is None:
        print("\nPath to trained model file not specified.\n")
        return

    print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
    print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

    #validating the configuration up front instead of after the first sample has been processed
    operationMode = args["TEST_DEMO_MODE"]
    if operationMode not in ("AO", "VO", "AV"):
        print("Invalid Operation Mode.")
        #sys.exit with a non-zero status; the bare exit() helper comes from the site module and reports success
        sys.exit(1)
    decodingScheme = args["TEST_DEMO_DECODING"]
    if decodingScheme not in ("greedy", "search"):
        print("Invalid Decode Scheme")
        sys.exit(1)

    #declaring the model and loading the trained weights
    model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                  args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                  args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                  args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.load_state_dict(
        torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                   map_location=device))
    model.to(device)
    #the model is only used for inference, so it is switched to eval mode once
    model.eval()

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #declaring the language model only when it is going to be used,
    #so that a missing LM checkpoint does not break LM-free runs
    lm = None
    if args["USE_LM"]:
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)

    #reading the noise file
    if args["TEST_DEMO_NOISY"]:
        _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
    else:
        noise = None

    #preprocessing parameters are the same for every sample
    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }

    print("\n\nRunning Demo ....\n")
    rows = []
    print(args['TEST_DEMO_MODE'])

    #walking through the demo directory and running the model on all video files in it
    for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
        for file in files:
            if not file.endswith(".mp4"):
                continue

            sNum, cNum, cType = _parse_clip_name(file)

            #sample path without the ".mp4" extension
            sampleFile = os.path.join(root, file[:-4])

            #preprocessing the sample
            preprocess_sample(sampleFile, params)

            #converting the data sample into appropriate tensors for input to the model
            audioFile = sampleFile + ".wav"
            visualFeaturesFile = sampleFile + ".npy"
            audioParams = {
                "stftWindow": args["STFT_WINDOW"],
                "stftWinLen": args["STFT_WIN_LENGTH"],
                "stftOverlap": args["STFT_OVERLAP"]
            }
            videoParams = {"videoFPS": args["VIDEO_FPS"]}
            inp, _, inpLen, _ = prepare_main_input(
                audioFile, visualFeaturesFile, None, noise,
                args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                args["NOISE_SNR_DB"], audioParams, videoParams)
            inputBatch, _, inputLenBatch, _ = collate_fn([
                (inp, None, inpLen, None)
            ])

            #running the model
            inputBatch = ((inputBatch[0].float()).to(device),
                          (inputBatch[1].float()).to(device))
            inputLenBatch = (inputLenBatch.int()).to(device)
            #"AO" keeps only the first input stream, "VO" only the second, "AV" keeps both
            if operationMode == "AO":
                inputBatch = (inputBatch[0], None)
            elif operationMode == "VO":
                inputBatch = (None, inputBatch[1])

            with torch.no_grad():
                outputBatch = model(inputBatch)

            #obtaining the prediction using CTC decoder
            if decodingScheme == "greedy":
                predictionBatch, _ = ctc_greedy_decode(
                    outputBatch, inputLenBatch,
                    args["CHAR_TO_INDEX"]["<EOS>"])
            else:
                beamSearchParams = {
                    "beamWidth": args["BEAM_WIDTH"],
                    "alpha": args["LM_WEIGHT_ALPHA"],
                    "beta": args["LENGTH_PENALTY_BETA"],
                    "threshProb": args["THRESH_PROBABILITY"]
                }
                predictionBatch, _ = ctc_search_decode(
                    outputBatch, inputLenBatch, beamSearchParams,
                    args["CHAR_TO_INDEX"][" "],
                    args["CHAR_TO_INDEX"]["<EOS>"], lm)

            #converting character indices back to characters (the last index is dropped)
            pred = predictionBatch[:][:-1]
            pred = "".join(
                [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

            #printing the predictions
            print("File: %s" % (file))
            print("Speaker: " + sNum + " Clip: " + cNum + " Clip Type: " +
                  cType)
            print("Prediction: %s" % (pred))
            print("\n")

            rows.append([sNum, cNum, cType, pred])

    print("Demo Completed.\n")

    #saving all predictions; newline="" lets the csv module control line endings
    with open("predictions.csv", "w", newline="") as csvFile:
        csv.writer(csvFile).writerows(rows)
    return
def main():
    """Preprocess the dataset, build the noise file and create the train/val split.

    Three stages, all driven by ``args``:

    1. Every ``.mp4`` file under ``args["DATA_DIRECTORY"]`` is passed to
       ``preprocess_sample`` together with the trained visual frontend.
    2. A one hour, 16 kHz, int16 noise file is written to
       ``<DATA_DIRECTORY>/noise.wav``. It is assembled from parts of at most
       60 seconds, each the sum of peak-normalised excerpts of 20 randomly
       chosen ``.wav`` files; a part is as long as the shortest of its clips.
    3. The lines of ``pretrain.txt`` (merged with an existing ``preval.txt``,
       if any) are randomly split according to ``args["PRETRAIN_VAL_SPLIT"]``
       and written back to ``pretrain.txt`` and ``preval.txt``.

    NumPy and PyTorch are seeded from ``args["SEED"]`` so the random choices
    are repeatable.
    """
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #walking through the data directory and obtaining a list of all files in the dataset
    #(paths are stored without the ".mp4" extension)
    filesList = [
        os.path.join(root, file[:-4])
        for root, dirs, files in os.walk(args["DATA_DIRECTORY"])
        for file in files if file.endswith(".mp4")
    ]

    #Preprocessing each sample
    print("\nNumber of data samples to be processed = %d" % (len(filesList)))
    print("\n\nStarting preprocessing ....\n")
    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }
    for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
        preprocess_sample(file, params)
    print("\nPreprocessing Done.")

    #Generating a 1 hour noise file
    #Fetching audio samples from 20 random files in the dataset and adding them up to generate noise
    #The length of these clips is the shortest audio sample among the 20 samples
    print("\n\nGenerating the noise file ....")
    sampleRate = 16000
    noiseLen = sampleRate * 3600
    noise = np.empty((0))
    while len(noise) < noiseLen:
        noisePart = np.zeros(sampleRate * 60)
        indices = np.random.randint(0, len(filesList), 20)
        for ix in indices:
            _, audio = wavfile.read(filesList[ix] + ".wav")
            if len(audio) == 0:
                #an empty clip has no peak and would shrink the part to nothing
                continue
            #converting to float before taking the absolute value: np.abs on
            #int16 data wraps for -32768 and would understate the peak
            audio = audio.astype(np.float64)
            peak = np.max(np.abs(audio))
            if peak > 0:
                #a silent clip is left as zeros instead of dividing by zero
                audio = audio / peak
            pos = np.random.randint(0, abs(len(audio) - len(noisePart)) + 1)
            if len(audio) > len(noisePart):
                #longer clip: add a random excerpt of the clip
                noisePart = noisePart + audio[pos:pos + len(noisePart)]
            else:
                #shorter clip: cut the part down to the clip length
                noisePart = noisePart[pos:pos + len(audio)] + audio
        noise = np.concatenate([noise, noisePart], axis=0)
    noise = noise[:noiseLen]
    #each part is a sum of 20 clips in [-1, 1]; rescaling to the int16 range
    noise = (noise / 20) * 32767
    noise = np.floor(noise).astype(np.int16)
    wavfile.write(args["DATA_DIRECTORY"] + "/noise.wav", sampleRate, noise)
    print("\nNoise file generated.")

    #Generating preval.txt for splitting the pretrain set into train and validation sets
    print("\n\nGenerating the preval.txt file ....")
    pretrainFile = args["DATA_DIRECTORY"] + "/pretrain.txt"
    prevalFile = args["DATA_DIRECTORY"] + "/preval.txt"

    with open(pretrainFile, "r") as f:
        lines = f.readlines()
    #an existing validation split is merged back in before re-splitting
    if os.path.exists(prevalFile):
        with open(prevalFile, "r") as f:
            lines.extend(f.readlines())
    #a final line without a newline would be fused with the following entry
    #once the lines are reordered and written back with writelines
    lines = [line if line.endswith("\n") else line + "\n" for line in lines]

    indices = np.arange(len(lines))
    np.random.shuffle(indices)
    numVal = int(np.ceil(args["PRETRAIN_VAL_SPLIT"] * len(indices)))
    valIxs = np.sort(indices[:numVal])
    trainIxs = np.sort(indices[numVal:])

    lines = np.sort(np.array(lines))
    with open(pretrainFile, "w") as f:
        f.writelines(list(lines[trainIxs]))
    with open(prevalFile, "w") as f:
        f.writelines(list(lines[valIxs]))
    print("\npreval.txt file generated.\n")
    return
def main():
    """Preprocess the dataset and create the pretrain/validation split.

    Every ``.mp4`` file under ``args["DATA_DIRECTORY"]`` is passed to
    ``preprocess_sample`` together with the trained visual frontend. The
    lines of ``pretrain.txt`` (merged with an existing ``preval.txt``, if
    any) are then randomly split according to ``args["PRETRAIN_VAL_SPLIT"]``
    and written back to ``pretrain.txt`` and ``preval.txt``.

    NumPy and PyTorch are seeded from ``args["SEED"]`` so the split is
    repeatable.
    """
    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #walking through the data directory and obtaining a list of all files in the dataset
    #(paths are stored without the ".mp4" extension)
    filesList = [
        os.path.join(root, file[:-4])
        for root, dirs, files in os.walk(args["DATA_DIRECTORY"])
        for file in files if file.endswith(".mp4")
    ]

    #Preprocessing each sample
    print("\nNumber of data samples to be processed = %d" % (len(filesList)))
    print("\n\nStarting preprocessing ....\n")
    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }
    for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
        preprocess_sample(file, params)
    print("\nPreprocessing Done.")

    #Generating preval.txt for splitting the pretrain set into train and validation sets
    print("\n\nGenerating the preval.txt file ....")
    pretrainFile = args["DATA_DIRECTORY"] + "/pretrain.txt"
    prevalFile = args["DATA_DIRECTORY"] + "/preval.txt"

    with open(pretrainFile, "r") as f:
        lines = f.readlines()
    #an existing validation split is merged back in before re-splitting
    if os.path.exists(prevalFile):
        with open(prevalFile, "r") as f:
            lines.extend(f.readlines())
    #a final line without a newline would be fused with the following entry
    #once the lines are reordered and written back with writelines
    lines = [line if line.endswith("\n") else line + "\n" for line in lines]

    indices = np.arange(len(lines))
    np.random.shuffle(indices)
    numVal = int(np.ceil(args["PRETRAIN_VAL_SPLIT"] * len(indices)))
    valIxs = np.sort(indices[:numVal])
    trainIxs = np.sort(indices[numVal:])

    lines = np.sort(np.array(lines))
    with open(pretrainFile, "w") as f:
        f.writelines(list(lines[trainIxs]))
    with open(prevalFile, "w") as f:
        f.writelines(list(lines[valIxs]))
    print("\npreval.txt file generated.\n")
    return