Exemplo n.º 1
0
def parse_librispeech_tester(strRootDir: str,
                             strFileType: str = '.flac',
                             nCountSample: int = 1000,
                             nSamplingRate: int = 16000,
                             nFFTCount: int = 512,
                             nMelOrder: int = 24,
                             nMFCCOrder: int = 13,
                             nContextSize: int = 10,
                             dWindowLength: float = 0.025,
                             dShiftLength: float = 0.01,
                             strFeatureType: str = "mfcc",
                             strMode: str = "dvector"  # dvector, gmm, ctc, las
                             ):
    """Collect the audio files under ``strRootDir`` into a labelled test dataset.

    File names are expected to look like ``<speaker>-...<strFileType>``: the
    first '-'-separated field of the file name is used as the speaker ID.

    Args:
        strRootDir: Directory that is walked recursively.
        strFileType: File extension (including the dot) of the audio files.
        nCountSample: Maximum number of audio files taken from each directory.
        nSamplingRate, nFFTCount, nMelOrder, nMFCCOrder, nContextSize,
        dWindowLength, dShiftLength, strFeatureType: Forwarded unchanged to
            the ``YoonSpeech`` constructor.
        strMode: One of "dvector", "gmm", "ctc" or "las"; selects the
            reported output dimension.

    Returns:
        Tuple ``(nDimOutput, pDataTest)`` where ``nDimOutput`` is the number
        of speakers found ("dvector"/"gmm") or
        ``yoonspeech.DEFAULT_PHONEME_COUNT`` ("ctc"/"las"), and ``pDataTest``
        is a ``YoonDataset`` holding one ``YoonObject`` per audio file.

    Raises:
        ValueError: If ``strMode`` is not one of the supported modes.
    """
    # Validate the mode up front so a typo fails before any file is scanned
    # or loaded, instead of after the whole dataset has been built.
    if strMode not in ("dvector", "gmm", "ctc", "las"):
        raise ValueError("Unsupported parsing mode")
    # Group the audio file paths by speaker ID
    pDicFile = collections.defaultdict(list)
    for strRoot, _, pListFileName in tqdm(os.walk(strRootDir)):
        iCount = 0
        for strFileName in pListFileName:
            # Stop at exactly nCountSample files per directory
            if iCount >= nCountSample:
                break
            strStem, strExtension = splitext(strFileName)
            if strExtension != strFileType:
                continue
            pDicFile[strStem.split('-')[0]].append(
                os.path.join(strRoot, strFileName))
            iCount += 1
    # Label the speakers with consecutive integers (discovery order)
    pDicLabel = {strSpeaker: iLabel
                 for iLabel, strSpeaker in enumerate(pDicFile)}
    nSpeakersCount = len(pDicLabel)
    # Build one dataset object per audio file; every collected file is test data
    pDataTest = YoonDataset()
    for strID, pListPath in pDicFile.items():
        for strFileName in pListPath:
            pSpeech = YoonSpeech(strFileName=strFileName,
                                 nSamplingRate=nSamplingRate,
                                 strFeatureType=strFeatureType,
                                 nContextSize=nContextSize,
                                 nFFTCount=nFFTCount,
                                 nMelOrder=nMelOrder,
                                 nMFCCOrder=nMFCCOrder,
                                 dWindowLength=dWindowLength,
                                 dShiftLength=dShiftLength)
            pObject = YoonObject(nID=pDicLabel[strID], strName=strID,
                                 strType=strFeatureType, pSpeech=pSpeech)
            pDataTest.append(pObject)
    if strMode in ("dvector", "gmm"):
        nDimOutput = nSpeakersCount
    else:  # "ctc" or "las" (validated above)
        nDimOutput = yoonspeech.DEFAULT_PHONEME_COUNT
    return nDimOutput, pDataTest
Exemplo n.º 2
0
def train(pTrainData: YoonDataset, pTestData: YoonDataset, strModelPath: str):
    """Fit one diagonal-covariance GMM per label, evaluate it and pickle the models.

    Args:
        pTrainData: Dataset whose ``to_gmm_dataset()`` yields
            ``(features, label)`` pairs used for fitting.
        pTestData: Dataset whose ``to_gmm_dataset()`` yields
            ``(features, label)`` pairs used for the accuracy report.
        strModelPath: Destination file for the pickled ``{label: GMM}`` dict.

    The per-sample results and the accuracy are printed; nothing is returned.
    """
    # Make dataset
    pTrainSet = pTrainData.to_gmm_dataset()
    pTestSet = pTestData.to_gmm_dataset()
    # Shuffle dataset (in place)
    numpy.random.shuffle(pTrainSet)
    numpy.random.shuffle(pTestSet)
    # GMM training
    nMixture = 4
    # One model per label.
    # The covariance type is "full" matrix if we use "Deltas" data
    pDicGMM = {
        nLabel: sklearn.mixture.GaussianMixture(n_components=nMixture,
                                                random_state=48,
                                                covariance_type='diag')
        for _, nLabel in pTrainSet
    }
    for pInputData, nLabel in tqdm(pTrainSet):
        pDicGMM[nLabel].fit(pInputData)
    # GMM test
    iAccuracy = 0
    for pInputData, nTargetLabel in pTestSet:
        # Calculate likelihood scores for all the trained GMMs.
        pDicCandidateScore = {
            jSpeaker: pGMM.score(pInputData)
            for jSpeaker, pGMM in pDicGMM.items()
        }
        # The label whose model gives the highest score wins
        nLabelEstimated = max(pDicCandidateScore, key=pDicCandidateScore.get)
        print("Estimated: {0}, Score: {1:.2f}, True: {2}".format(
            nLabelEstimated, pDicCandidateScore[nLabelEstimated],
            nTargetLabel),
              end='    ')
        if nTargetLabel == nLabelEstimated:
            print("Correct!")
            iAccuracy += 1
        else:
            print("Incorrect...")
    # An empty test set must not abort the run before the models are saved
    nTestCount = len(pTestSet)
    if nTestCount > 0:
        print("Accuracy: {:.2f}".format(iAccuracy / nTestCount * 100.0))
    else:
        print("Accuracy: n/a (empty test set)")
    # Save GMM modeling
    with open(strModelPath, 'wb') as pFile:
        pickle.dump(pDicGMM, pFile)
        print("Save {} GMM models".format(len(pDicGMM)))
Exemplo n.º 3
0
def recognition(pSpeech: YoonSpeech, strModelPath: str, nCountSpeakers: int):
    """Run a stored DVector model on one speech sample and return the arg-max index.

    Args:
        pSpeech: Speech sample, wrapped into a single-item ``YoonDataset``.
        strModelPath: Checkpoint file readable by ``torch.load``; its
            ``'model'`` entry must hold the DVector state dict.
        nCountSpeakers: Output dimension the model was built with.

    Returns:
        ``numpy.argmax`` over the last axis of the model output
        (the model is called with ``bExtract=True``).
    """
    # Wrap data set
    pTestData = YoonDataset(None, pSpeech)
    # Check if we can use a GPU device
    pDevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Load DVector model
    pModel = DVector(nDimInput=pTestData.get_dimension(),
                     nDimOutput=nCountSpeakers).to(device=pDevice)
    pModel.eval()
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host
    pFile = torch.load(strModelPath, map_location=pDevice)
    pModel.load_state_dict(pFile['model'])
    # Recognition model; inference only, so skip autograd bookkeeping
    with torch.no_grad():
        pTensorInput = torch.from_numpy(
            pTestData[0].buffer).to(pDevice).unsqueeze(0)
        pArrayOutput = pModel(pTensorInput,
                              bExtract=True).detach().cpu().numpy()
    # Index of the maximum along the last axis of the model output
    nLabelEstimated = numpy.argmax(pArrayOutput, -1)
    print("Estimated: {0}, Score : {1:.2f}".format(nLabelEstimated,
                                                   numpy.max(pArrayOutput)))
    return nLabelEstimated
Exemplo n.º 4
0
def test(pTestData: YoonDataset, strModelPath: str, nCountSpeakers: int):
    """Run a stored DVector model over ``pTestData`` and draw a t-SNE plot of its outputs.

    Args:
        pTestData: Dataset to evaluate; wrapped in a ``DvectorDataset``.
        strModelPath: Checkpoint file readable by ``torch.load``; its
            ``'model'`` entry must hold the DVector state dict.
        nCountSpeakers: Output dimension the model was built with.

    The 2-D t-SNE projection and the targets are handed to ``__draw_tSNE``;
    nothing is returned.
    """
    # Check if we can use a GPU device
    pDevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("{} device activation".format(pDevice.__str__()))
    # Load DVector model
    pModel = DVector(nDimInput=pTestData.get_dimension(),
                     nDimOutput=nCountSpeakers).to(pDevice)  # Check train data
    pModel.eval()
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host
    pFile = torch.load(strModelPath, map_location=pDevice)
    pModel.load_state_dict(pFile['model'])
    print("Successfully load the Model in path")
    # Define a data path for plot for test
    pDataSet = DvectorDataset(pTestData)
    pDataLoader = DataLoader(pDataSet,
                             batch_size=1,
                             shuffle=False,
                             collate_fn=collate_dvector,
                             num_workers=0,
                             pin_memory=True)
    pBar = tqdm(pDataLoader)
    print("Length of data = ", len(pBar))
    pListOutput = []
    pListTarget = []
    # Inference only, so skip autograd bookkeeping
    with torch.no_grad():
        for pTensorInput, pTensorTarget in pBar:
            pTensorInput = pTensorInput.type(torch.FloatTensor).to(pDevice)
            pTensorOutput = pModel(pTensorInput, bExtract=False)
            pListOutput.append(pTensorOutput.detach().cpu().numpy())
            # batch_size is 1: (Batch, Label) to (Label)
            pListTarget.append(pTensorTarget.detach().cpu().numpy()[0])
    # Prepare embeddings for plot
    pArrayOutput = numpy.concatenate(pListOutput)
    pArrayTarget = numpy.array(pListTarget)
    # Obtain embedding for the t-SNE plot (one flattened row per sample)
    pTSNE = TSNE(n_components=2)
    pArrayOutput = pArrayOutput.reshape(len(pArrayOutput), -1)
    pArrayTSNE = pTSNE.fit_transform(pArrayOutput)
    # Draw plot
    __draw_tSNE(pArrayTSNE, pArrayTarget)
Exemplo n.º 5
0
def test(pTestData: YoonDataset, strModelPath: str):
    """Evaluate pickled per-label GMMs on ``pTestData`` and print the accuracy.

    Args:
        pTestData: Dataset whose ``to_gmm_dataset()`` yields
            ``(features, label)`` pairs.
        strModelPath: Pickle file holding a ``{label: model}`` dict whose
            models expose ``score(features)``.

    The per-sample results and the accuracy are printed; nothing is returned.
    """
    # Load GMM modeling
    # NOTE: pickle.load can execute arbitrary code -- only open model files
    # that come from a trusted source.
    with open(strModelPath, 'rb') as pFile:
        pDicGMM = pickle.load(pFile)
    pTestSet = pTestData.to_gmm_dataset()
    # GMM test
    iAccuracy = 0
    for pInputData, nTargetLabel in pTestSet:
        # Calculate likelihood scores for all the trained GMMs.
        pDicCandidateScore = {
            jSpeaker: pGMM.score(pInputData)
            for jSpeaker, pGMM in pDicGMM.items()
        }
        # The label whose model gives the highest score wins
        nLabelEstimated = max(pDicCandidateScore, key=pDicCandidateScore.get)
        print("Estimated: {0}, Score: {1:.2f}, True: {2}".format(
            nLabelEstimated, pDicCandidateScore[nLabelEstimated],
            nTargetLabel),
              end='    ')
        if nTargetLabel == nLabelEstimated:
            print("Correct!")
            iAccuracy += 1
        else:
            print("Incorrect...")
    # Avoid a ZeroDivisionError when there is nothing to evaluate
    nTestCount = len(pTestSet)
    if nTestCount > 0:
        print("Accuracy: {:.2f}".format(iAccuracy / nTestCount * 100.0))
    else:
        print("Accuracy: n/a (empty test set)")
Exemplo n.º 6
0
def train(nEpoch: int,
          strModelPath: str,
          pTrainData: YoonDataset,
          pValidationData: YoonDataset,
          nCountSpeakers,
          bInitEpoch=False):
    """Train a DVector model and keep the checkpoint with the lowest validation loss.

    Args:
        nEpoch: Last epoch index to run (the loop is inclusive).
        strModelPath: Checkpoint path. If the file exists and ``bInitEpoch``
            is False, model, optimizer and epoch are restored from it; the
            best model is written back to the same path.
        pTrainData: Training data, wrapped in a ``DvectorDataset``.
        pValidationData: Validation data, wrapped in a ``DvectorDataset``.
        nCountSpeakers: Output dimension of the model.
        bInitEpoch: Pass True to ignore an existing checkpoint and start fresh.

    Nothing is returned; the result is the checkpoint file.
    """
    dLearningRate = 0.01
    # Check if we can use a GPU Device
    pDevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("{} device activation".format(pDevice.__str__()))
    # Define the training and testing data-set
    pTrainSet = DvectorDataset(pTrainData)
    pTrainLoader = DataLoader(pTrainSet,
                              batch_size=8,
                              shuffle=True,
                              collate_fn=collate_dvector,
                              num_workers=0,
                              pin_memory=True)
    pValidationSet = DvectorDataset(pValidationData)
    pValidationLoader = DataLoader(pValidationSet,
                                   batch_size=1,
                                   shuffle=False,
                                   collate_fn=collate_dvector,
                                   num_workers=0,
                                   pin_memory=True)
    # Define a network model
    pModel = DVector(nDimInput=pTrainData.get_dimension(),
                     nDimOutput=nCountSpeakers).to(pDevice)
    # Set the optimizer with adam
    pOptimizer = torch.optim.Adam(pModel.parameters(), lr=dLearningRate)
    # Set the training criterion
    pCriterion = torch.nn.CrossEntropyLoss(reduction='mean')
    # Load pre-trained model
    nStart = 0
    print("Directory of the pre-trained model: {}".format(strModelPath))
    if strModelPath is not None and os.path.exists(
            strModelPath) and bInitEpoch is False:
        # map_location lets a checkpoint saved on a GPU be resumed on a CPU-only host
        pModelData = torch.load(strModelPath, map_location=pDevice)
        nStart = pModelData['epoch']
        pModel.load_state_dict(pModelData['model'])
        pOptimizer.load_state_dict(pModelData['optimizer'])
        print("## Successfully load the model at {} epochs!".format(nStart))
    # Train and Test Repeat
    # Start from +inf so the first evaluated epoch is always saved,
    # however large its loss is.
    dMinLoss = float('inf')
    nCountDecrease = 0  # epochs in a row without a new best validation loss
    for iEpoch in range(nStart, nEpoch + 1):
        # Train the network
        __process_train(iEpoch,
                        pModel=pModel,
                        pDataLoader=pTrainLoader,
                        pCriterion=pCriterion,
                        pOptimizer=pOptimizer)
        # Test the network
        dLoss = __process_evaluate(pModel=pModel,
                                   pDataLoader=pValidationLoader,
                                   pCriterion=pCriterion)
        # Save the optimal model
        if dLoss < dMinLoss:
            dMinLoss = dLoss
            torch.save(
                {
                    'epoch': iEpoch,
                    'model': pModel.state_dict(),
                    'optimizer': pOptimizer.state_dict()
                }, strModelPath)
            nCountDecrease = 0
        else:
            nCountDecrease += 1
            # Halve the learning rate after 3 epochs in a row without improvement
            if nCountDecrease == 3:
                for pDicGroup in pOptimizer.param_groups:
                    pDicGroup['lr'] /= 2
                print('learning rate is divided by 2')
                nCountDecrease = 0
Exemplo n.º 7
0
def parse_librispeech_trainer(root_dir: str,
                              file_type: str = '.flac',
                              sample_len: int = 1000,
                              sample_rate: int = 16000,
                              fft_count: int = 512,
                              mel_order: int = 24,
                              mfcc_order: int = 13,
                              context_size: int = 10,
                              win_len: float = 0.025,
                              shift_len: float = 0.01,
                              feature_type: str = "mfcc",
                              train_rate: float = 0.8,
                              mode: str = "dvector"  # dvector, gmm, ctc, las
                              ):
    """Build train/eval datasets from the audio and transcript files under ``root_dir``.

    Audio files are expected to be named ``<id>-<part>-...<file_type>`` and
    transcript files ``<id>-<part>.trans.txt`` with one
    ``<utterance name> <text>`` entry per line. The first field is used as the
    speaker ID.

    Args:
        root_dir: Directory that is walked recursively.
        file_type: File extension (including the dot) of the audio files.
        sample_len: Maximum number of audio files taken from each directory.
        sample_rate, fft_count, mel_order, mfcc_order, context_size, win_len,
        shift_len: Forwarded unchanged to the ``YoonSpeech`` constructor.
        feature_type: Stored as ``type`` on every ``YoonObject``.
        train_rate: Fraction of each speaker's files that goes to training;
            the remainder goes to evaluation.
        mode: One of "dvector", "gmm", "ctc" or "las"; selects the reported
            output dimension.

    Returns:
        Tuple ``(output_dim, train_dataset, eval_dataset)``. ``output_dim`` is
        the number of speakers found ("dvector"/"gmm") or
        ``yoonspeech.DEFAULT_PHONEME_COUNT`` ("ctc"/"las").

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Validate the mode up front so a typo fails before any file is loaded.
    if mode not in ("dvector", "gmm", "ctc", "las"):
        raise ValueError("Unsupported parsing mode")

    # transcript path -> {utterance name: text}; each file is parsed only once
    transcript_cache = {}

    def get_line_in_trans(file_path, utterance_id):
        """Return the lower-cased transcript of ``utterance_id`` or None if absent."""
        if file_path not in transcript_cache:
            table = {}
            with open(file_path) as trans_file:
                # splitlines() keeps the last entry even without a trailing newline
                for line in trans_file.read().lower().splitlines():
                    key, _, text = line.partition(' ')
                    table.setdefault(key, text)
            transcript_cache[file_path] = table
        return transcript_cache[file_path].get(utterance_id)

    def make_speech_buffer(file_path):
        """Create a ``YoonSpeech`` with the requested settings and load ``file_path``."""
        speech = YoonSpeech(sample_rate=sample_rate, context_size=context_size,
                            fft_count=fft_count, mel_order=mel_order, mfcc_order=mfcc_order,
                            win_len=win_len, shift_len=shift_len)
        speech.load_sound_file(file_path)
        return speech

    def build_dataset(paths):
        """Turn audio paths into a ``YoonDataset`` of labelled ``YoonObject`` items."""
        dataset = YoonDataset()
        for path in paths:
            basename_ = splitext(basename(path))[0]
            fields = basename_.split('-')
            id_, part = fields[0], fields[1]
            word = get_line_in_trans(trans_file_dic[id_][part], basename_)
            speech = make_speech_buffer(path)
            dataset.append(YoonObject(id=speaker_dic[id_], name=id_, word=word,
                                      type=feature_type, speech=speech))
        return dataset

    feature_file_dic = collections.defaultdict(list)
    trans_file_dic = collections.defaultdict(dict)
    trans_files = []
    test_files = []
    # Extract file names
    for root, _, file_names in tqdm(os.walk(root_dir)):
        audio_count = 0
        for name in file_names:
            stem, extension = splitext(name)
            if extension == file_type:
                # Skip surplus audio but keep scanning: the transcript of this
                # directory may be listed after the audio files.
                if audio_count >= sample_len:
                    continue
                feature_file_dic[stem.split('-')[0]].append(os.path.join(root, name))
                audio_count += 1
            elif extension == ".txt":  # Recognition the words
                pieces = stem.split('-')
                if len(pieces) != 2:
                    continue  # not an "<id>-<part>" transcript file
                id_, part = pieces
                trans_file_dic[id_][part.replace(".trans", "")] = os.path.join(root, name)
    # Listing test and train dataset
    for file_paths in feature_file_dic.values():
        cut = int(len(file_paths) * train_rate)
        trans_files.extend(file_paths[:cut])
        test_files.extend(file_paths[cut:])
    # Labeling speakers for Speaker recognition (consecutive integers, discovery order)
    speaker_dic = {speaker: label for label, speaker in enumerate(feature_file_dic)}
    num_speakers = len(speaker_dic)
    # Transform data dictionary
    train_dataset = build_dataset(trans_files)
    eval_dataset = build_dataset(test_files)
    print("Length of Train = {}".format(len(train_dataset)))
    print("Length of Test = {}".format(len(eval_dataset)))
    if mode in ("dvector", "gmm"):
        output_dim = num_speakers
    else:  # "ctc" or "las" (validated above)
        output_dim = yoonspeech.DEFAULT_PHONEME_COUNT
    return output_dim, train_dataset, eval_dataset
Exemplo n.º 8
0
def train(
        nEpoch: int,
        pTrainData: YoonDataset,
        pValidationData: YoonDataset,
        strModelPath="model_ctc.pth",
        nSizeBatch=8,
        bInitTrain=False,
        dLearningRate=0.01,
        nWorker=0,  # 0 = CPU, 4 = CUDA
):
    """Train a CTC model and keep the checkpoint with the lowest validation loss.

    Args:
        nEpoch: Last epoch index to run (the loop is inclusive).
        pTrainData: Training data, wrapped in an ``ASRDataset``.
        pValidationData: Validation data, wrapped in an ``ASRDataset``.
        strModelPath: Checkpoint path. If the file exists and ``bInitTrain``
            is False, model (``'encoder'``), optimizer and epoch are restored
            from it; the best model is written back to the same path.
        nSizeBatch: Batch size of both data loaders.
        bInitTrain: Pass True to ignore an existing checkpoint.
        dLearningRate: Initial Adam learning rate.
        nWorker: ``num_workers`` of both data loaders.

    Nothing is returned; the result is the checkpoint file.
    """
    # Set the device for running the CTC model
    # Check if we can use a GPU device
    pDevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Define a network architecture
    pModel = CTC(nDimInput=pTrainData.get_dimension(),
                 nDimOutput=256,
                 nCountClass=yoonspeech.DEFAULT_PHONEME_COUNT)
    pModel = pModel.to(pDevice)
    # Define an optimizer
    pOptimizer = Adam(pModel.parameters(), lr=dLearningRate)
    # Define a training criterion
    pCriterion = torch.nn.CTCLoss(blank=0)
    # Load the pre-trained model if you resume the training from the model
    nStart = 0
    if os.path.isfile(strModelPath) and bInitTrain is False:
        pModelData = torch.load(strModelPath, map_location=pDevice)
        pModel.load_state_dict(pModelData['encoder'])
        pOptimizer.load_state_dict(pModelData['optimizer'])
        nStart = pModelData['epoch']
        print("## Success to load the CTC model : epoch {}".format(nStart))
    # Define training and test dataset
    pTrainDataset = ASRDataset(pTrainData)
    pTrainLoader = DataLoader(pTrainDataset,
                              batch_size=nSizeBatch,
                              collate_fn=collate_asrdata,
                              shuffle=True,
                              num_workers=nWorker,
                              pin_memory=True)
    pValidDataset = ASRDataset(pValidationData)
    pValidLoader = DataLoader(pValidDataset,
                              batch_size=nSizeBatch,
                              collate_fn=collate_asrdata,
                              shuffle=False,
                              num_workers=nWorker,
                              pin_memory=True)
    # Perform training / validation processing
    # Start from +inf so the first evaluated epoch is always saved,
    # however large its loss is.
    dMinLoss = float('inf')
    nCountDecrease = 0  # epochs in a row without a new best validation loss
    for iEpoch in range(nStart, nEpoch + 1):
        __process_train(iEpoch, pModel, pTrainLoader, pCriterion, pOptimizer)
        # Evaluate on the held-out validation loader (not the training one),
        # otherwise model selection is driven by training loss.
        dValidLoss, _ = __process_test(iEpoch, pModel, pValidLoader,
                                       pCriterion)
        # Save the model whenever the validation loss reaches a new minimum
        if dMinLoss > dValidLoss:
            dMinLoss = dValidLoss
            torch.save(
                {
                    'epoch': iEpoch,
                    'encoder': pModel.state_dict(),
                    'optimizer': pOptimizer.state_dict()
                }, strModelPath)
            nCountDecrease = 0
        else:
            nCountDecrease += 1
            # Halve the learning rate after 5 epochs in a row without improvement
            if nCountDecrease == 5:
                for pDicGroup in pOptimizer.param_groups:
                    pDicGroup['lr'] /= 2
                print('learning rate is divided by 2')
                nCountDecrease = 0