Example #1
def preprocess_sample_checker():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vf = VisualFrontend()
    vf.load_state_dict(torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)
    file = args["CODE_DIRECTORY"] + "/demo/00001"
    params = {"roiSize":args["ROI_SIZE"], "normMean":args["NORMALIZATION_MEAN"], "normStd":args["NORMALIZATION_STD"], "vf":vf}
    preprocess_sample(file, params)
    return
Example #2
def preprocess_sample_checker():
    file = args["CODE_DIRECTORY"] + "/demo/00001"
    preprocess_sample(file)
    return
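
Neither checker above shows what preprocess_sample actually produces; the later examples read a <file>.wav audio track and a <file>.npy visual-feature file written next to the source .mp4. The snippet below is only a rough sketch of that contract, assuming ffmpeg is on the PATH and using a placeholder feature array (the real function crops the mouth region from every frame and runs the VisualFrontend passed as "vf" in params):

import os
import numpy as np

def preprocess_sample_sketch(file, params=None):
    #toy stand-in for preprocess_sample: given <file>.mp4, write <file>.wav and <file>.npy
    #the ffmpeg call and the feature shape below are assumptions for illustration
    os.system("ffmpeg -y -v quiet -i {0}.mp4 -ac 1 -ar 16000 {0}.wav".format(file))
    np.save(file + ".npy", np.zeros((1, 512), dtype=np.float32))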
Example #3
gpuAvailable = torch.cuda.is_available()
device = torch.device("cuda" if gpuAvailable else "cpu")

#declaring the visual frontend module
vf = VisualFrontend()
vf.load_state_dict(
    torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
vf.to(device)

#walking through the data directory and obtaining a list of all files in the dataset
filesList = list()
for root, dirs, files in os.walk(args["DATA_DIRECTORY"]):
    for file in files:
        if file.endswith(".mp4"):
            filesList.append(os.path.join(root, file[:-4]))

#Preprocessing each sample
print("\nNumber of data samples to be processed = %d" % (len(filesList)))
print("\n\nStarting preprocessing ....\n")

params = {
    "roiSize": args["ROI_SIZE"],
    "normMean": args["NORMALIZATION_MEAN"],
    "normStd": args["NORMALIZATION_STD"],
    "vf": vf
}
for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
    preprocess_sample(file, params)

print("\nPreprocessing Done.")
Example #4
    #reading the noise file
    if args["TEST_DEMO_NOISY"]:
        _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
    else:
        noise = None


    print("\n\nRunning Demo .... \n")

    #walking through the demo directory and running the model on all video files in it
    for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
        for file in files:
            if file.endswith(".mp4"):
                sampleFile = os.path.join(root, file[:-4])
                targetFile = os.path.join(root, file[:-4]) + ".txt"

                #preprocessing the sample
                preprocess_sample(sampleFile)

                #converting the data sample into appropriate tensors for input to the model
                audioFile = os.path.join(root, file[:-4]) + ".wav"
                audioParams = {"stftWindow":args["STFT_WINDOW"], "stftWinLen":args["STFT_WIN_LENGTH"], "stftOverlap":args["STFT_OVERLAP"]}
                inp, trgt, inpLen, trgtLen = prepare_main_input(audioFile, targetFile, noise, args["MAIN_REQ_INPUT_LENGTH"],
                                                                args["CHAR_TO_INDEX"], args["NOISE_SNR_DB"], audioParams)
                inputBatch, targetBatch, inputLenBatch, targetLenBatch = collate_fn([(inp, trgt, inpLen, trgtLen)])

                #running the model
                inputBatch, targetBatch = (inputBatch.float()).to(device), (targetBatch.int()).to(device)
                inputLenBatch, targetLenBatch = (inputLenBatch.int()).to(device), (targetLenBatch.int()).to(device)
                model.eval()
                with torch.no_grad():
                    outputBatch = model(inputBatch)
Example #5
    print("\n\nRunning Demo .... \n")

    #walking through the demo directory and running the model on all video files in it
    for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
        for file in files:
            if file.endswith(".mp4"):
                sampleFile = os.path.join(root, file[:-4])

                #preprocessing the sample
                params = {
                    "roiSize": args["ROI_SIZE"],
                    "normMean": args["NORMALIZATION_MEAN"],
                    "normStd": args["NORMALIZATION_STD"],
                    "vf": vf
                }
                preprocess_sample(sampleFile, params)

                #converting the data sample into appropriate tensors for input to the model
                visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                videoParams = {"videoFPS": args["VIDEO_FPS"]}
                inp, _, inpLen, _ = prepare_main_input(
                    visualFeaturesFile, None, args["MAIN_REQ_INPUT_LENGTH"],
                    args["CHAR_TO_INDEX"], videoParams)
                inputBatch, _, inputLenBatch, _ = collate_fn([(inp, None,
                                                               inpLen, None)])

                #running the model
                inputBatch = (inputBatch.float()).to(device)
                inputLenBatch = (inputLenBatch.int()).to(device)
                model.eval()
                with torch.no_grad():
                    outputBatch = model(inputBatch)
Example #6
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                         args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                         args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(
            torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):
                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {
                        "roiSize": args["ROI_SIZE"],
                        "normMean": args["NORMALIZATION_MEAN"],
                        "normStd": args["NORMALIZATION_STD"],
                        "vf": vf
                    }
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        visualFeaturesFile, None,
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([
                        (inp, None, inpLen, None)
                    ])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch,
                            args["CHAR_TO_INDEX"]["<EOS>"])

                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {
                            "beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]
                        }
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "],
                            args["CHAR_TO_INDEX"]["<EOS>"], lm)

                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters
                    pred = predictionBatch[:][:-1]
                    pred = "".join(
                        [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Prediction: %s" % (pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
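
ctc_greedy_decode above comes from the repository's decoder utilities and is not shown in these examples. As an illustration only, a best-path (greedy) CTC decode over the model's (T, B, C) log-probability output could look roughly like the sketch below; the blank index of 0 and the <EOS>-terminated output convention are assumptions inferred from the calling code:

import torch

def ctc_greedy_decode_sketch(outputBatch, inputLenBatch, eosIx, blank=0):
    #pick the best class per frame, then collapse repeats and drop blanks
    best = outputBatch.argmax(dim=2)    #(T, B)
    preds, predLens = [], []
    for b in range(best.shape[1]):
        seq, prev = [], blank
        for t in range(int(inputLenBatch[b])):
            ix = int(best[t, b])
            if ix != blank and ix != prev:
                seq.append(ix)
            prev = ix
        seq.append(eosIx)    #append <EOS> so callers can strip it with [:-1]
        preds.extend(seq)
        predLens.append(len(seq))
    return (torch.tensor(preds, dtype=torch.int),
            torch.tensor(predLens, dtype=torch.int))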
Example #7
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                      args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                      args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                      args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(
            torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        #reading the noise file
        if args["TEST_DEMO_NOISY"]:
            _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
        else:
            noise = None

        print("\n\nRunning Demo .... \n")

        rows = []

        print(args['TEST_DEMO_MODE'])

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):

                    sNum = file[7]

                    if file[13] <= "9" and file[13] >= "0":
                        cNum = file[12:14]
                    else:
                        cNum = file[12]

                    if file[-6] == "l":
                        cType = "jumble"
                    elif file[-6] == "s":
                        cType = "base"
                    else:
                        cType = file[-8:-4]

                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {
                        "roiSize": args["ROI_SIZE"],
                        "normMean": args["NORMALIZATION_MEAN"],
                        "normStd": args["NORMALIZATION_STD"],
                        "vf": vf
                    }
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    audioFile = os.path.join(root, file[:-4]) + ".wav"
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    audioParams = {
                        "stftWindow": args["STFT_WINDOW"],
                        "stftWinLen": args["STFT_WIN_LENGTH"],
                        "stftOverlap": args["STFT_OVERLAP"]
                    }
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        audioFile, visualFeaturesFile, None, noise,
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        args["NOISE_SNR_DB"], audioParams, videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([
                        (inp, None, inpLen, None)
                    ])

                    #running the model
                    inputBatch = ((inputBatch[0].float()).to(device),
                                  (inputBatch[1].float()).to(device))
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    if args["TEST_DEMO_MODE"] == "AO":
                        inputBatch = (inputBatch[0], None)
                    elif args["TEST_DEMO_MODE"] == "VO":
                        inputBatch = (None, inputBatch[1])
                    elif args["TEST_DEMO_MODE"] == "AV":
                        pass
                    else:
                        print("Invalid Operation Mode.")
                        exit()

                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch,
                            args["CHAR_TO_INDEX"]["<EOS>"])

                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {
                            "beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]
                        }
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "],
                            args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters
                    pred = predictionBatch[:][:-1]
                    pred = "".join(
                        [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Speaker: " + sNum + "   Clip: " + cNum +
                          "   Clip Type: " + cType)
                    print("Prediction: %s" % (pred))
                    print("\n")
                    row = [sNum, cNum, cType, pred]
                    rows.append(row)

        print("Demo Completed.\n")
        with open("predictions.csv", "w", newline="") as file:
            writer = csv.writer(file)
            for x in range(len(rows)):
                writer.writerow(rows[x])

    else:
        print("\nPath to trained model file not specified.\n")

    return
Example #8
def inference(sampleFile, targetFile=None):

    print('In Inference')
    #preprocessing the sample
    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }
    preprocess_sample(sampleFile, params)

    #converting the data sample into appropriate tensors for input to the model
    visualFeaturesFile = sampleFile + ".npy"
    videoParams = {"videoFPS": args["VIDEO_FPS"]}

    inp, trgt, inpLen, trgtLen = prepare_main_input(
        visualFeaturesFile, targetFile, args["MAIN_REQ_INPUT_LENGTH"],
        args["CHAR_TO_INDEX"], videoParams)
    inputBatch, targetBatch, inputLenBatch, targetLenBatch = collate_fn([
        (inp, trgt, inpLen, trgtLen)
    ])

    #running the model
    inputBatch, targetBatch = (inputBatch.float()).to(device), (
        targetBatch.int()).to(device)
    inputLenBatch, targetLenBatch = (inputLenBatch.int()).to(device), (
        targetLenBatch.int()).to(device)
    model.eval()
    with torch.no_grad():
        outputBatch = model(inputBatch)

    #obtaining the prediction using CTC decoder
    if args["TEST_DEMO_DECODING"] == "greedy":
        predictionBatch, predictionLenBatch = ctc_greedy_decode(
            outputBatch, inputLenBatch, args["CHAR_TO_INDEX"]["<EOS>"])

    elif args["TEST_DEMO_DECODING"] == "search":
        beamSearchParams = {
            "beamWidth": args["BEAM_WIDTH"],
            "alpha": args["LM_WEIGHT_ALPHA"],
            "beta": args["LENGTH_PENALTY_BETA"],
            "threshProb": args["THRESH_PROBABILITY"]
        }
        predictionBatch, predictionLenBatch = ctc_search_decode(
            outputBatch, inputLenBatch, beamSearchParams,
            args["CHAR_TO_INDEX"][" "], args["CHAR_TO_INDEX"]["<EOS>"], lm)

    else:
        print("Invalid Decode Scheme")
        exit()

    #computing CER and WER
    #cer = compute_cer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch)
    #wer = compute_wer(predictionBatch, targetBatch, predictionLenBatch, targetLenBatch, args["CHAR_TO_INDEX"][" "])

    #converting character indices back to characters
    pred = predictionBatch[:][:-1]
    pred = "".join([args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

    #printing the predictions
    print("Prediction: %s" % (pred))
    print("\n")
Example #9
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")



    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)


    #walking through the data directory and obtaining a list of all files in the dataset
    filesList = list()
    for root, dirs, files in os.walk(args["DATA_DIRECTORY"]):
        for file in files:
            if file.endswith(".mp4"):
                filesList.append(os.path.join(root, file[:-4]))


    #Preprocessing each sample
    print("\nNumber of data samples to be processed = %d" %(len(filesList)))
    print("\n\nStarting preprocessing ....\n")

    params = {"roiSize":args["ROI_SIZE"], "normMean":args["NORMALIZATION_MEAN"], "normStd":args["NORMALIZATION_STD"], "vf":vf}
    for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
        preprocess_sample(file, params)

    print("\nPreprocessing Done.")



    #Generating a 1 hour noise file
    #Fetching audio samples from 20 random files in the dataset and adding them up to generate noise
    #The length of these clips is the shortest audio sample among the 20 samples
    print("\n\nGenerating the noise file ....")

    noise = np.empty((0))
    while len(noise) < 16000*3600:
        noisePart = np.zeros(16000*60)
        indices = np.random.randint(0, len(filesList), 20)
        for ix in indices:
            sampFreq, audio = wavfile.read(filesList[ix] + ".wav")
            audio = audio/np.max(np.abs(audio))
            pos = np.random.randint(0, abs(len(audio)-len(noisePart))+1)
            if len(audio) > len(noisePart):
                noisePart = noisePart + audio[pos:pos+len(noisePart)]
            else:
                noisePart = noisePart[pos:pos+len(audio)] + audio
        noise = np.concatenate([noise, noisePart], axis=0)
    noise = noise[:16000*3600]
    noise = (noise/20)*32767
    noise = np.floor(noise).astype(np.int16)
    wavfile.write(args["DATA_DIRECTORY"] + "/noise.wav", 16000, noise)

    print("\nNoise file generated.")



    #Generating preval.txt for splitting the pretrain set into train and validation sets
    print("\n\nGenerating the preval.txt file ....")

    with open(args["DATA_DIRECTORY"] + "/pretrain.txt", "r") as f:
        lines = f.readlines()

    if os.path.exists(args["DATA_DIRECTORY"] + "/preval.txt"):
        with open(args["DATA_DIRECTORY"] + "/preval.txt", "r") as f:
            lines.extend(f.readlines())

    indices = np.arange(len(lines))
    np.random.shuffle(indices)
    valIxs = np.sort(indices[:int(np.ceil(args["PRETRAIN_VAL_SPLIT"]*len(indices)))])
    trainIxs = np.sort(indices[int(np.ceil(args["PRETRAIN_VAL_SPLIT"]*len(indices))):])

    lines = np.sort(np.array(lines))
    with open(args["DATA_DIRECTORY"] + "/pretrain.txt", "w") as f:
        f.writelines(list(lines[trainIxs]))
    with open(args["DATA_DIRECTORY"] + "/preval.txt", "w") as f:
        f.writelines(list(lines[valIxs]))

    print("\npreval.txt file generated.\n")

    return
Example #10

#walking through the data directory and obtaining a list of all files in the dataset
filesList = list()
for root, dirs, files in os.walk(args["DATA_DIRECTORY"]):
    for file in files:
        if file.endswith(".mp4"):
            filesList.append(os.path.join(root, file[:-4]))


#Preprocessing each sample
print("\nNumber of data samples to be processed = %d" %(len(filesList)))
print("\n\nStarting preprocessing ....\n")

for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
    preprocess_sample(file)

print("\nPreprocessing Done.")



#Generating a 1 hour noise file
#Fetching audio samples from 20 random files in the dataset and adding them up to generate noise
#The length of these clips is the shortest audio sample among the 20 samples
print("\n\nGenerating the noise file ....")

noise = np.empty((0))
while len(noise) < 16000*3600:
    noisePart = np.zeros(16000*60)
    indices = np.random.randint(0, len(filesList), 20)
    for ix in indices:
Example #11
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    #declaring the visual frontend module
    vf = VisualFrontend()
    vf.load_state_dict(
        torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
    vf.to(device)

    #walking through the data directory and obtaining a list of all files in the dataset
    filesList = list()
    for root, dirs, files in os.walk(args["DATA_DIRECTORY"]):
        for file in files:
            if file.endswith(".mp4"):
                filesList.append(os.path.join(root, file[:-4]))

    #Preprocessing each sample
    print("\nNumber of data samples to be processed = %d" % (len(filesList)))
    print("\n\nStarting preprocessing ....\n")

    params = {
        "roiSize": args["ROI_SIZE"],
        "normMean": args["NORMALIZATION_MEAN"],
        "normStd": args["NORMALIZATION_STD"],
        "vf": vf
    }
    for file in tqdm(filesList, leave=True, desc="Preprocess", ncols=75):
        preprocess_sample(file, params)

    print("\nPreprocessing Done.")

    #Generating preval.txt for splitting the pretrain set into train and validation sets
    print("\n\nGenerating the preval.txt file ....")

    with open(args["DATA_DIRECTORY"] + "/pretrain.txt", "r") as f:
        lines = f.readlines()

    if os.path.exists(args["DATA_DIRECTORY"] + "/preval.txt"):
        with open(args["DATA_DIRECTORY"] + "/preval.txt", "r") as f:
            lines.extend(f.readlines())

    indices = np.arange(len(lines))
    np.random.shuffle(indices)
    valIxs = np.sort(
        indices[:int(np.ceil(args["PRETRAIN_VAL_SPLIT"] * len(indices)))])
    trainIxs = np.sort(
        indices[int(np.ceil(args["PRETRAIN_VAL_SPLIT"] * len(indices))):])

    lines = np.sort(np.array(lines))
    with open(args["DATA_DIRECTORY"] + "/pretrain.txt", "w") as f:
        f.writelines(list(lines[trainIxs]))
    with open(args["DATA_DIRECTORY"] + "/preval.txt", "w") as f:
        f.writelines(list(lines[valIxs]))

    print("\npreval.txt file generated.\n")

    return