Exemplo n.º 1
0
def _writeTestBatch(sDataFile, featChunks):
    """Write one batch of feature matrices to ``sDataFile``.

    The matrices in ``featChunks`` are stacked row-wise, a single all-zero
    column is appended on the right (the column that ``label_start_index``
    points at in ``createTestSet``), and the result is serialized with
    ``util.WriteProto``.

    Returns the number of rows written.
    """
    featMatBatch = np.vstack(featChunks)
    nRows = featMatBatch.shape[0]
    util.WriteProto(
        sDataFile,
        util.npy2ProtoMat(
            np.hstack([featMatBatch, np.zeros((nRows, 1))])))
    return nRows


def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):
    """Convert a Kaldi ark file into batched test-set data files.

    Feature matrices are read one utterance at a time from ``ark_file`` and
    grouped into batches. Each batch is written to
    ``sDataFileTemplate % <batch index>`` with one extra zero column appended.
    A ``dl.DatabaseInfo`` message describing the dataset is written to
    ``sDataProtoFile`` at the end.

    Args:
        ark_file: Path handed to ``KaldiReadIn``.
        sDataFileTemplate: ``%``-format string taking the integer batch index
            and yielding the output path of that batch.
        sDataDir: Stored as ``path_prefix`` of the database description.
        sDataFilePattern: Stored as ``file_pattern`` of the dataset entry.
        sDataProtoFile: Output path of the database description.

    Returns:
        A tuple ``(uttIDBatch, uttIDLength)``: the utterance IDs in read
        order and the row count of each utterance's feature matrix.
    """
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    # Matrices of the current batch are collected in a list and stacked once
    # per flush; re-stacking on every utterance recopies all accumulated rows.
    featChunks = []
    nBatchRows = 0
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    try:
        uttID, featMat = kaldiIn.next()
        while featMat is not None:
            if batchSz < 0:
                # Row budget per batch, derived from the first matrix's width.
                # Presumably 400 MiB at 4 bytes per value -- TODO confirm.
                # Floor division keeps this an integer on Python 2 and 3.
                batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
                # One extra column is appended at write time; it is the last
                # column, which is where the label index points.
                datasetInfo.dimensions = featMat.shape[1] + 1
                datasetInfo.label_start_index = datasetInfo.dimensions - 1

            featChunks.append(featMat)
            nBatchRows += featMat.shape[0]
            uttIDBatch.append(uttID)
            uttIDLength.append(featMat.shape[0])

            if nBatchRows >= batchSz:
                datasetInfo.size += _writeTestBatch(
                    sDataFileTemplate % iFileIdx, featChunks)
                iFileIdx += 1
                featChunks = []
                nBatchRows = 0
            uttID, featMat = kaldiIn.next()
    finally:
        # Release the reader even if reading or writing a batch fails.
        kaldiIn.close()

    # last batch
    if featChunks:
        datasetInfo.size += _writeTestBatch(
            sDataFileTemplate % iFileIdx, featChunks)
        iFileIdx += 1
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
Exemplo n.º 2
0
    wdir = os.path.abspath(arguments['wdir'])
    output_file_prefix = arguments['output_file_prefix']
    sModelFile = arguments['model_file']
    sDeeplearnPath = arguments['deeplearn_path']

    # paths for output files
    output_scp = output_file_prefix + '.scp'
    output_ark = output_file_prefix + '.ark'
    removeFile(output_scp)
    removeFile(output_ark)

    sDataDir = os.path.join(wdir, 'data')
    if not os.path.exists(sDataDir):
        os.mkdir(sDataDir)

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    kaldiOut = KaldiWriteOut(output_scp, output_ark)
    kaldiOut.open()
    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            batchSz = 300 * 1024 * 1024 / (4 * featMat.shape[1])

        if featMatBatch is None:
            featMatBatch = featMat
        else: