Example #1
def lrs2charlm_checker():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LRS2CharLM()
    model.load_state_dict(
        torch.load(args["TRAINED_LM_FILE"], map_location=device))
    model.to(device)

    #seeding the generator with the space character (LM indices are offset by 1 relative to CHAR_TO_INDEX)
    inp = torch.tensor(args["CHAR_TO_INDEX"][" "] - 1)
    initStateBatch = None
    string = list()
    #autoregressively sampling 100 characters from the language model
    for i in range(100):
        inputBatch = inp.reshape(1, 1)
        inputBatch = inputBatch.to(device)
        model.eval()
        with torch.no_grad():
            outputBatch, finalStateBatch = model(inputBatch, initStateBatch)

        #converting log-probabilities to probabilities, renormalizing and sampling the next character
        outputBatch = torch.exp(outputBatch)
        out = outputBatch.squeeze()
        probs = out.tolist()
        ix = np.random.choice(np.arange(len(probs)), p=probs / np.sum(probs))
        char = args["INDEX_TO_CHAR"][ix + 1]
        string.append(char)

        #feeding the sampled index and the LM state back in for the next step
        inp = torch.tensor(ix)
        initStateBatch = finalStateBatch

    print("".join(string))
    return
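Note: the loop above exponentiates the LM's log-probabilities and samples the next character index from the renormalized distribution. Below is a minimal, self-contained sketch of just that sampling step, using a made-up 5-class distribution rather than the project's vocabulary:

import numpy as np
import torch

#hypothetical single-step log-probabilities over a toy 5-character vocabulary
logProbs = torch.log_softmax(torch.randn(5), dim=0)

#same recipe as in the checker: exponentiate, renormalize and sample an index
probs = torch.exp(logProbs).tolist()
ix = np.random.choice(np.arange(len(probs)), p=probs / np.sum(probs))
print("sampled index:", ix)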
Example #2
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda:1" if gpuAvailable else "cpu")
    kwargs = {"num_workers":args["NUM_WORKERS"], "pin_memory":True} if gpuAvailable else {}
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


    #declaring the test dataset and test dataloader
    videoParams = {"videoFPS":args["VIDEO_FPS"]}
    testData = LRS2Main("test", args["DATA_DIRECTORY"], args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"], args["STEP_SIZE"],
                        videoParams)
    testLoader = DataLoader(testData, batch_size=args["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True, **kwargs)


    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" %(args["TRAINED_MODEL_FILE"]))

        #declaring the model, loss function and loading the trained model weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"], args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"], map_location=device))
        model.to(device)
        loss_function = nn.CTCLoss(blank=0, zero_infinity=False)


        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None


        print("\nTesting the trained model .... \n")

        beamSearchParams = {"beamWidth":args["BEAM_WIDTH"], "alpha":args["LM_WEIGHT_ALPHA"], "beta":args["LENGTH_PENALTY_BETA"],
                            "threshProb":args["THRESH_PROBABILITY"]}
        testParams = {"decodeScheme":args["TEST_DEMO_DECODING"], "beamSearchParams":beamSearchParams, "spaceIx":args["CHAR_TO_INDEX"][" "],
                      "eosIx":args["CHAR_TO_INDEX"]["<EOS>"], "lm":lm}

        #evaluating the model over the test set
        testLoss, testCER, testWER = evaluate(model, testLoader, loss_function, device, testParams)

        #printing the test set loss, CER and WER
        print("Test Loss: %.6f || Test CER: %.3f || Test WER: %.3f" %(testLoss, testCER, testWER))
        print("\nTesting Done.\n")


    else:
        print("Path to the trained model file not specified.\n")

    return
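Note: the nn.CTCLoss declared above expects log-probabilities of shape (T, N, C) with the blank at index 0, together with per-sample input and target lengths. Below is a minimal sketch of that call contract with made-up shapes, not the project's actual batch layout:

import torch
import torch.nn as nn

T, N, C = 50, 4, 40   #made-up time steps, batch size and number of classes
loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

logProbs = torch.randn(T, N, C).log_softmax(dim=2)         #dummy model outputs
targets = torch.randint(1, C, (N, 20), dtype=torch.long)   #target indices (no blanks)
inputLengths = torch.full((N,), T, dtype=torch.long)
targetLengths = torch.full((N,), 20, dtype=torch.long)

loss = loss_function(logProbs, targets, inputLengths, targetLengths)
print(loss.item())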
Example #3
def ctc_search_decode_checker():
    #synthetic decoder output strings, with "-" standing for the CTC blank symbol
    outputs = [
        "TTTEEEST-IINNNN-G-   CC-TTCCC- -DEEE-CO-DD---E       -FUU-NCCC--TAA-B-FA--E",
        "ONNE SSSTEEEP    ISSS  OOOOVVEERA- FDDA-S A FD-AASDF - AD-AFA DF-ADF SF-ADF",
        "EVERYTHING ALRIGHT CHECK DONE SH-SG-GAD-G HS- RA-R H J- J-AM GA-AM GA-GA-AD",
        "SSEEEE-E--  -EEE-VE-NNN  ---DDDOOOO-ODDE-E   --O-OOOOTTTY AAAASS-SSAAM WORK",
        "---------------------------------------------------------------------------"
    ]
    inpLens = [64, 32, 29, 75, 56]

    #building a (T, N, numClasses) probability tensor biased towards the synthetic outputs
    outputProbs = 0.01 * torch.ones(
        (len(outputs[0]), len(inpLens), args["NUM_CLASSES"]))
    inpLens = torch.tensor(inpLens)
    for n in range(len(outputs)):
        for i in range(len(outputs[n])):
            char = outputs[n][i]
            if char == "-":
                ix = 0
            else:
                ix = args["CHAR_TO_INDEX"][char]
            outputProbs[i, n, ix] = 1.5
    outputLogProbs = torch.log(outputProbs)

    beamSearchParams = {
        "beamWidth": args["BEAM_WIDTH"],
        "alpha": args["LM_WEIGHT_ALPHA"],
        "beta": args["LENGTH_PENALTY_BETA"],
        "threshProb": args["THRESH_PROBABILITY"]
    }
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lm = LRS2CharLM()
    lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"],
                                  map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    #decoding the synthetic outputs with CTC beam search
    predictions, predictionLens = ctc_search_decode(
        outputLogProbs, inpLens, beamSearchParams, args["CHAR_TO_INDEX"][" "],
        args["CHAR_TO_INDEX"]["<EOS>"], lm)
    #removing the <EOS> indices and mapping the remaining indices back to characters
    predictions = [
        args["INDEX_TO_CHAR"][ix] for ix in predictions.tolist()
        if ix != args["CHAR_TO_INDEX"]["<EOS>"]
    ]
    #splitting the flat prediction sequence back into one string per sample
    predictedSequences = list()
    s = 0
    for ln in predictionLens.tolist():
        predictedSequences.append("".join(predictions[s:s + ln - 1]))
        s = s + ln - 1
    print(predictedSequences)
    return
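Note: for comparison with the beam search exercised above, a greedy CTC decode of the same (T, N, numClasses) log-probability tensor is just a per-step argmax followed by collapsing repeats and dropping blanks. Below is a minimal sketch of that collapse (blank index 0 assumed, as in the checker), not the project's ctc_greedy_decode implementation:

import torch

def greedy_ctc_collapse(outputLogProbs, inpLens, blank=0):
    #outputLogProbs: (T, N, C) tensor, inpLens: per-sample valid lengths
    best = torch.argmax(outputLogProbs, dim=2)   #(T, N) best class per time step
    decoded = list()
    for n in range(best.shape[1]):
        seq, prev = [], None
        for t in range(int(inpLens[n])):
            ix = int(best[t, n])
            if ix != blank and ix != prev:   #drop blanks and collapse repeats
                seq.append(ix)
            prev = ix
        decoded.append(seq)
    return decoded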
Example #4
    print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))

    #declaring the model, loss function and loading the trained model weights
    model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                  args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                  args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                  args["TX_DROPOUT"], args["NUM_CLASSES"])
    model.load_state_dict(
        torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                   map_location=device))
    model.to(device)
    loss_function = nn.CTCLoss(blank=0, zero_infinity=False)

    #declaring the language model
    lm = LRS2CharLM()
    lm.load_state_dict(torch.load(args["TRAINED_LM_FILE"],
                                  map_location=device))
    lm.to(device)
    if not args["USE_LM"]:
        lm = None

    print("\nTesting the trained model .... \n")

    beamSearchParams = {
        "beamWidth": args["BEAM_WIDTH"],
        "alpha": args["LM_WEIGHT_ALPHA"],
        "beta": args["LENGTH_PENALTY_BETA"],
        "threshProb": args["THRESH_PROBABILITY"]
    }
    if args["TEST_DEMO_MODE"] == "AO":
Example #5
File: demo.py Project: zz12375/deep_avsr
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = VideoNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                         args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                         args["TX_FEEDFORWARD_DIM"], args["TX_DROPOUT"],
                         args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(
            torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        print("\n\nRunning Demo .... \n")

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):
                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {
                        "roiSize": args["ROI_SIZE"],
                        "normMean": args["NORMALIZATION_MEAN"],
                        "normStd": args["NORMALIZATION_STD"],
                        "vf": vf
                    }
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        visualFeaturesFile, None,
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([
                        (inp, None, inpLen, None)
                    ])

                    #running the model
                    inputBatch = (inputBatch.float()).to(device)
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using the CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch,
                            args["CHAR_TO_INDEX"]["<EOS>"])

                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {
                            "beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]
                        }
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "],
                            args["CHAR_TO_INDEX"]["<EOS>"], lm)

                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters
                    pred = predictionBatch[:-1]
                    pred = "".join(
                        [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Prediction: %s" % (pred))
                    print("\n")

        print("Demo Completed.\n")

    else:
        print("\nPath to trained model file not specified.\n")

    return
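Note: the final step above drops the trailing <EOS> index and maps the remaining indices back to characters. Below is a toy sketch of that conversion with a made-up index map; the project builds its own INDEX_TO_CHAR in its config:

import torch

#hypothetical mapping, standing in for args["INDEX_TO_CHAR"]
indexToChar = {1: "H", 2: "E", 3: "L", 4: "O", 5: " ", 39: "<EOS>"}

predictionBatch = torch.tensor([1, 2, 3, 3, 4, 39])   #single prediction ending in <EOS>
pred = predictionBatch[:-1]                           #drop the trailing <EOS> index
print("".join(indexToChar[ix] for ix in pred.tolist()))   #prints "HELLO"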
Example #6
def main():

    np.random.seed(args["SEED"])
    torch.manual_seed(args["SEED"])
    gpuAvailable = torch.cuda.is_available()
    device = torch.device("cuda" if gpuAvailable else "cpu")

    if args["TRAINED_MODEL_FILE"] is not None:

        print("\nTrained Model File: %s" % (args["TRAINED_MODEL_FILE"]))
        print("\nDemo Directory: %s" % (args["DEMO_DIRECTORY"]))

        #declaring the model and loading the trained weights
        model = AVNet(args["TX_NUM_FEATURES"], args["TX_ATTENTION_HEADS"],
                      args["TX_NUM_LAYERS"], args["PE_MAX_LENGTH"],
                      args["AUDIO_FEATURE_SIZE"], args["TX_FEEDFORWARD_DIM"],
                      args["TX_DROPOUT"], args["NUM_CLASSES"])
        model.load_state_dict(
            torch.load(args["CODE_DIRECTORY"] + args["TRAINED_MODEL_FILE"],
                       map_location=device))
        model.to(device)

        #declaring the visual frontend module
        vf = VisualFrontend()
        vf.load_state_dict(
            torch.load(args["TRAINED_FRONTEND_FILE"], map_location=device))
        vf.to(device)

        #declaring the language model
        lm = LRS2CharLM()
        lm.load_state_dict(
            torch.load(args["TRAINED_LM_FILE"], map_location=device))
        lm.to(device)
        if not args["USE_LM"]:
            lm = None

        #reading the noise file
        if args["TEST_DEMO_NOISY"]:
            _, noise = wavfile.read(args["DATA_DIRECTORY"] + "/noise.wav")
        else:
            noise = None

        print("\n\nRunning Demo .... \n")

        rows = []

        print(args['TEST_DEMO_MODE'])

        #walking through the demo directory and running the model on all video files in it
        for root, dirs, files in os.walk(args["DEMO_DIRECTORY"]):
            for file in files:
                if file.endswith(".mp4"):

                    #parsing the speaker number, clip number and clip type from the file name
                    sNum = file[7]

                    if file[13] <= "9" and file[13] >= "0":
                        cNum = file[12:14]
                    else:
                        cNum = file[12]

                    if file[-6] == "l":
                        cType = "jumble"
                    elif file[-6] == "s":
                        cType = "base"
                    else:
                        cType = file[-8:-4]

                    sampleFile = os.path.join(root, file[:-4])

                    #preprocessing the sample
                    params = {
                        "roiSize": args["ROI_SIZE"],
                        "normMean": args["NORMALIZATION_MEAN"],
                        "normStd": args["NORMALIZATION_STD"],
                        "vf": vf
                    }
                    preprocess_sample(sampleFile, params)

                    #converting the data sample into appropriate tensors for input to the model
                    audioFile = os.path.join(root, file[:-4]) + ".wav"
                    visualFeaturesFile = os.path.join(root, file[:-4]) + ".npy"
                    audioParams = {
                        "stftWindow": args["STFT_WINDOW"],
                        "stftWinLen": args["STFT_WIN_LENGTH"],
                        "stftOverlap": args["STFT_OVERLAP"]
                    }
                    videoParams = {"videoFPS": args["VIDEO_FPS"]}
                    inp, _, inpLen, _ = prepare_main_input(
                        audioFile, visualFeaturesFile, None, noise,
                        args["MAIN_REQ_INPUT_LENGTH"], args["CHAR_TO_INDEX"],
                        args["NOISE_SNR_DB"], audioParams, videoParams)
                    inputBatch, _, inputLenBatch, _ = collate_fn([
                        (inp, None, inpLen, None)
                    ])

                    #running the model
                    inputBatch = ((inputBatch[0].float()).to(device),
                                  (inputBatch[1].float()).to(device))
                    inputLenBatch = (inputLenBatch.int()).to(device)
                    if args["TEST_DEMO_MODE"] == "AO":
                        inputBatch = (inputBatch[0], None)
                    elif args["TEST_DEMO_MODE"] == "VO":
                        inputBatch = (None, inputBatch[1])
                    elif args["TEST_DEMO_MODE"] == "AV":
                        pass
                    else:
                        print("Invalid Operation Mode.")
                        exit()

                    model.eval()
                    with torch.no_grad():
                        outputBatch = model(inputBatch)

                    #obtaining the prediction using the CTC decoder
                    if args["TEST_DEMO_DECODING"] == "greedy":
                        predictionBatch, predictionLenBatch = ctc_greedy_decode(
                            outputBatch, inputLenBatch,
                            args["CHAR_TO_INDEX"]["<EOS>"])

                    elif args["TEST_DEMO_DECODING"] == "search":
                        beamSearchParams = {
                            "beamWidth": args["BEAM_WIDTH"],
                            "alpha": args["LM_WEIGHT_ALPHA"],
                            "beta": args["LENGTH_PENALTY_BETA"],
                            "threshProb": args["THRESH_PROBABILITY"]
                        }
                        predictionBatch, predictionLenBatch = ctc_search_decode(
                            outputBatch, inputLenBatch, beamSearchParams,
                            args["CHAR_TO_INDEX"][" "],
                            args["CHAR_TO_INDEX"]["<EOS>"], lm)
                    else:
                        print("Invalid Decode Scheme")
                        exit()

                    #converting character indices back to characters
                    pred = predictionBatch[:-1]
                    pred = "".join(
                        [args["INDEX_TO_CHAR"][ix] for ix in pred.tolist()])

                    #printing the predictions
                    print("File: %s" % (file))
                    print("Speaker: " + sNum + "   Clip: " + cNum +
                          "   Clip Type: " + cType)
                    print("Prediction: %s" % (pred))
                    print("\n")
                    row = [sNum, cNum, cType, pred]
                    rows.append(row)

        print("Demo Completed.\n")
        with open("predictions.csv", "w", newline="") as file:
            writer = csv.writer(file)
            for x in range(len(rows)):
                writer.writerow(rows[x])

    else:
        print("\nPath to trained model file not specified.\n")

    return
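Note: the CSV dump at the end writes one row per processed clip. An equivalent sketch using writerows with an explicit header row (the header names here are assumptions, not part of the original script):

import csv

rows = [["1", "3", "base", "HELLO WORLD"]]   #example row: speaker, clip, clip type, prediction

with open("predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["speaker", "clip", "clipType", "prediction"])   #assumed header
    writer.writerows(rows)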