def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern, sDataProtoFile):
    """Convert the features in a Kaldi ark file into batched test-set data files.

    Feature matrices are read one utterance at a time from ``ark_file`` and
    accumulated until the row budget is reached; each batch gets one extra
    all-zero column appended (the label slot, at ``label_start_index``) and is
    written to ``sDataFileTemplate % iFileIdx``. A ``dl.DatabaseInfo``
    describing the dataset is finally written to ``sDataProtoFile``.

    Args:
        ark_file: Path handed to ``KaldiReadIn``.
        sDataFileTemplate: %-style template formatted with the batch file index.
        sDataDir: Stored as the database ``path_prefix``.
        sDataFilePattern: Stored as the dataset ``file_pattern``.
        sDataProtoFile: Destination of the database description.

    Returns:
        Tuple ``(uttIDBatch, uttIDLength)``: the utterance IDs in read order
        and the number of feature rows of each utterance.
    """

    def _writeBatch(sDataFile, featMats):
        """Write the stacked matrices plus a zero label column; return row count."""
        featMatBatch = np.vstack(featMats)
        # Drop the per-utterance matrices before hstack makes another copy,
        # so the peak footprint stays close to the original implementation.
        del featMats[:]
        nRows = featMatBatch.shape[0]
        util.WriteProto(
            sDataFile,
            util.npy2ProtoMat(np.hstack([featMatBatch, np.zeros((nRows, 1))])))
        return nRows

    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir

    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    # Matrices are collected in a list and stacked once per batch; stacking
    # after every utterance would re-copy the whole batch each time.
    pendingMats = []
    nPendingRows = 0
    batchSz = -1  # row budget per file; derived from the first matrix
    iFileIdx = 0

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    try:
        uttID, featMat = kaldiIn.next()
        while featMat is not None:
            if batchSz < 0:
                # Floor division keeps the row budget an integer on both
                # Python 2 and 3 (presumably ~400 MiB of 4-byte values).
                batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
                datasetInfo.dimensions = featMat.shape[1] + 1
                datasetInfo.label_start_index = datasetInfo.dimensions - 1

            pendingMats.append(featMat)
            nPendingRows += featMat.shape[0]
            uttIDBatch.append(uttID)
            uttIDLength.append(featMat.shape[0])

            if nPendingRows >= batchSz:
                datasetInfo.size += _writeBatch(sDataFileTemplate % iFileIdx, pendingMats)
                iFileIdx += 1
                nPendingRows = 0

            uttID, featMat = kaldiIn.next()
    finally:
        # Release the reader even if reading or writing a batch fails.
        kaldiIn.close()

    # last batch
    if pendingMats:
        datasetInfo.size += _writeBatch(sDataFileTemplate % iFileIdx, pendingMats)
        iFileIdx += 1

    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern, sDataProtoFile):
    """Convert the features in a Kaldi ark file into batched test-set data files.

    Feature matrices are read one utterance at a time from ``ark_file`` and
    accumulated until the row budget is reached; each batch gets one extra
    all-zero column appended (the label slot, at ``label_start_index``) and is
    written to ``sDataFileTemplate % iFileIdx``. A ``dl.DatabaseInfo``
    describing the dataset is finally written to ``sDataProtoFile``.

    Args:
        ark_file: Path handed to ``KaldiReadIn``.
        sDataFileTemplate: %-style template formatted with the batch file index.
        sDataDir: Stored as the database ``path_prefix``.
        sDataFilePattern: Stored as the dataset ``file_pattern``.
        sDataProtoFile: Destination of the database description.

    Returns:
        Tuple ``(uttIDBatch, uttIDLength)``: the utterance IDs in read order
        and the number of feature rows of each utterance.
    """

    def _writeBatch(sDataFile, featMats):
        """Write the stacked matrices plus a zero label column; return row count."""
        featMatBatch = np.vstack(featMats)
        # Drop the per-utterance matrices before hstack makes another copy,
        # so the peak footprint stays close to the original implementation.
        del featMats[:]
        nRows = featMatBatch.shape[0]
        util.WriteProto(
            sDataFile,
            util.npy2ProtoMat(np.hstack([featMatBatch, np.zeros((nRows, 1))])))
        return nRows

    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir

    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    # Matrices are collected in a list and stacked once per batch; stacking
    # after every utterance would re-copy the whole batch each time.
    pendingMats = []
    nPendingRows = 0
    batchSz = -1  # row budget per file; derived from the first matrix
    iFileIdx = 0

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    try:
        uttID, featMat = kaldiIn.next()
        while featMat is not None:
            if batchSz < 0:
                # Floor division keeps the row budget an integer on both
                # Python 2 and 3 (presumably ~400 MiB of 4-byte values).
                batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
                datasetInfo.dimensions = featMat.shape[1] + 1
                datasetInfo.label_start_index = datasetInfo.dimensions - 1

            pendingMats.append(featMat)
            nPendingRows += featMat.shape[0]
            uttIDBatch.append(uttID)
            uttIDLength.append(featMat.shape[0])

            if nPendingRows >= batchSz:
                datasetInfo.size += _writeBatch(sDataFileTemplate % iFileIdx, pendingMats)
                iFileIdx += 1
                nPendingRows = 0

            uttID, featMat = kaldiIn.next()
    finally:
        # Release the reader even if reading or writing a batch fails.
        kaldiIn.close()

    # last batch
    if pendingMats:
        datasetInfo.size += _writeBatch(sDataFileTemplate % iFileIdx, pendingMats)
        iFileIdx += 1

    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
removeFile(output_scp) removeFile(output_ark) sDataDir = os.path.join(wdir, 'data') if not os.path.exists(sDataDir): os.mkdir(sDataDir) kaldiIn = KaldiReadIn(ark_file) kaldiIn.open() kaldiOut = KaldiWriteOut(output_scp,output_ark) kaldiOut.open() uttIDBatch = [] uttIDLength = [] featMatBatch = None batchSz = -1 uttID, featMat = kaldiIn.next() while featMat is not None: if batchSz < 0: batchSz = 300*1024*1024 / (4*featMat.shape[1]) if featMatBatch is None: featMatBatch = featMat else: featMatBatch = np.vstack([featMatBatch, featMat]) uttIDBatch.append(uttID) uttIDLength.append(featMat.shape[0]) if featMatBatch.shape[0] >= batchSz: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0 for i, uId in enumerate(uttIDBatch):
removeFile(output_scp) removeFile(output_ark) sDataDir = os.path.join(wdir, 'data') if not os.path.exists(sDataDir): os.mkdir(sDataDir) kaldiIn = KaldiReadIn(ark_file) kaldiIn.open() kaldiOut = KaldiWriteOut(output_scp, output_ark) kaldiOut.open() uttIDBatch = [] uttIDLength = [] featMatBatch = None batchSz = -1 uttID, featMat = kaldiIn.next() while featMat is not None: if batchSz < 0: batchSz = 300 * 1024 * 1024 / (4 * featMat.shape[1]) if featMatBatch is None: featMatBatch = featMat else: featMatBatch = np.vstack([featMatBatch, featMat]) uttIDBatch.append(uttID) uttIDLength.append(featMat.shape[0]) if featMatBatch.shape[0] >= batchSz: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0