import glob
import os
import subprocess

import numpy as np

# dl (the deeplearn protos), util, KaldiReadIn, read_dataset, config and
# createDir are assumed to come from the surrounding project and are not
# shown in this excerpt.


def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):
    # Describe the test set: PBM-format matrices with a sparse label column.
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern
    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Cap each output file at ~400 MB of float32 features.
            batchSz = 400 * 1024 * 1024 / (4 * featMat.shape[1])
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1
        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])
        if featMatBatch.shape[0] >= batchSz:
            # Append a dummy label column and flush the batch to disk.
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch,
                         np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()
    # Flush the last, partially filled batch.
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
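# A minimal usage sketch for createTestSet; the paths below are placeholders,
# not from this repo. It converts a Kaldi ark into PBM shards plus a dataset
# proto, and returns the per-utterance IDs and lengths needed to map
# frame-level predictions back to utterances afterwards.
def _demoCreateTestSet():
    sDataDir = '/tmp/pbm_test'
    uttIDs, uttLens = createTestSet(
        '/tmp/feats.ark',
        os.path.join(sDataDir, 'test_part%05d.pbm'),
        sDataDir,
        'test_part[0-9]*.pbm',
        os.path.join(sDataDir, 'data_test.pbtxt'))
    # One entry per utterance, in ark order.
    assert len(uttIDs) == len(uttLens)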
def pfile2Proto(pfilePath, filePrefix, pbmDir):
    pfile = read_dataset(pfilePath, {'partition': 1024 * 1024 * 400})
    dsInfo = dl.DatasetInfo()
    dsInfo.data_format = dl.DatasetInfo.PBM
    dsInfo.sparse_label = True
    dsInfo.file_pattern = '%s[0-9]*.pbm' % filePrefix
    dim = None
    sz = 0
    for i, (data, label) in enumerate(zip(pfile.feat_mats, pfile.label_vecs)):
        # Pack the features and the label as a trailing column.
        dataset = util.npy2ProtoMat(np.hstack([data, label[:, None]]))
        util.WriteProto(
            os.path.join(pbmDir, '%s%05d.pbm' % (filePrefix, i)), dataset)
        if dim is None:
            dim = data.shape[1] + 1
        if dim != data.shape[1] + 1:
            print dim, sz, data.shape, label.shape
        assert dim == data.shape[1] + 1
        sz += data.shape[0]
    dsInfo.size = sz
    dsInfo.dimensions = dim
    dsInfo.label_start_index = dim - 1
    return dsInfo
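# The PBM shards written above store each example as [features..., label]:
# the class id rides along as the last column and label_start_index points
# at it. A tiny self-contained illustration of that layout with fake data:
def _demoLabelColumnLayout():
    data = np.random.rand(3, 4).astype(np.float32)   # 3 frames, 4 features
    label = np.array([0, 2, 1], dtype=np.float32)    # one class id per frame
    packed = np.hstack([data, label[:, None]])       # shape (3, 5)
    assert packed.shape[1] == data.shape[1] + 1
    assert np.all(packed[:, -1] == label)            # labels in last column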
def createPbmDataset(pfiles, pbmDir, protoFilePath, gpuMem):
    assert gpuMem > 0.1
    dbInfo = dl.DatabaseInfo()
    for (name, sPath) in pfiles:
        assert name in ('train', 'valid', 'test')
        dsInfo = pfile2Proto(sPath, name + '_part', pbmDir)
        if name == 'train':
            dsInfo.type = dl.DatasetInfo.TRAIN_SET
        elif name == 'valid':
            dsInfo.type = dl.DatasetInfo.EVAL_SET
        else:
            dsInfo.type = dl.DatasetInfo.TEST_SET
        dbInfo.data.extend([dsInfo])
    dbInfo.name = 'dataset'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.main_memory = 6.0
    dbInfo.gpu_memory = float(gpuMem)
    dbInfo.path_prefix = pbmDir
    util.WriteProto(protoFilePath, dbInfo)
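# Hypothetical usage sketch for createPbmDataset; the pfile paths are
# placeholders. Each pfile is converted into PBM shards, and one
# DatabaseInfo proto describing all three splits is written, here with
# 2 GB of GPU memory budgeted for the data handler.
def _demoCreatePbmDataset():
    pfiles = [('train', '/tmp/train.pfile'),
              ('valid', '/tmp/valid.pfile'),
              ('test', '/tmp/test.pfile')]
    createPbmDataset(pfiles, '/tmp/pbm', '/tmp/pbm/data.pbtxt', gpuMem=2.0)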
evalOp.randomize = False
evalOp.get_last_piece = True
evalOp.verbose = False

# test on all models
for i in xrange(0, config.PHONES):
    for j in xrange(i + 1, config.PHONES):
        sResultFile = os.path.join(sResultDir, '%d_%d.csv' % (i, j))
        if os.path.exists(sResultFile):
            continue
        print 'Testing for %d-%d, writing results to %s' % (i, j, sResultFile)
        evalOp.result_file = sResultFile
        util.WriteProto(sEvalOpFile, evalOp)
        sModelFile = sModelFiles % (i, j)
        args = [
            sDeeplearnPath, 'eval', sModelFile,
            '--eval-op=%s' % sEvalOpFile
        ]
        pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
        pr.wait()
        if pr.returncode != 0:
            print 'Failed to test %d-%d' % (i, j)
            exit(1)

# run majorityVote, compute "probabilities"
sHardVoteFile = os.path.join(sMajorVoteDir, 'hard.csv')
sSoftVoteFile = os.path.join(sMajorVoteDir, 'soft.csv')
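# The majority vote itself happens elsewhere. As a rough illustration of the
# hard-voting idea (an assumption about what majorityVote computes, not its
# actual implementation): each pairwise model i-j casts one vote per frame
# for either phone i or phone j, and the phone with the most votes wins.
def _demoHardVote(pairwisePreds, nPhones):
    # pairwisePreds maps (i, j) -> array of per-frame winners (ints, i or j)
    nFrames = len(pairwisePreds.values()[0])
    votes = np.zeros((nFrames, nPhones))
    for (i, j), winners in pairwisePreds.items():
        for t, w in enumerate(winners):
            votes[t, int(w)] += 1
    # Per-frame winning phone under hard majority voting.
    return votes.argmax(axis=1)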
if not os.path.exists(sOutputModelFile):
    # modify architecture...
    sModelDir = os.path.join(wdir, 'model/')
    createDir(sModelDir)
    sCurrentDir = os.path.split(os.path.realpath(
        os.path.abspath(__file__)))[0]
    model = util.ReadProto(
        os.path.join(sCurrentDir, 'prototype/conv_timit.pbtxt'),
        dl.ModelData())
    model.name = 'spn_conv'
    for n in model.nodes:
        if n.name == 'output':
            n.dimension = num_outputs
            break
    sModelFile = os.path.join(sModelDir, 'spn_conv.pbtxt')
    util.WriteProto(sModelFile, model)

    trainOp = util.ReadProto(
        os.path.join(sCurrentDir, 'prototype/train.pbtxt'), dl.Operation())
    sCheckpointDir = os.path.join(sModelDir, 'cp')
    trainOp.name = 'train'
    trainOp.data_proto = sDataProtoFile
    trainOp.checkpoint_directory = sCheckpointDir
    trainOp.verbose = False
    sTrainOpFile = os.path.join(sModelDir, 'train.pbtxt')
    util.WriteProto(sTrainOpFile, trainOp)

    evalOp = util.ReadProto(
        os.path.join(sCurrentDir, 'prototype/eval.pbtxt'), dl.Operation())
    evalOp.data_proto = sDataProtoFile
    evalOp.verbose = False
def extractRepresentation(data, wdir, sDeeplearnPath, sModelFile):
    # append a dummy label column so the data matches the expected layout
    data = np.hstack([data, np.zeros((data.shape[0], 1))])
    npyData = util.npy2ProtoMat(data)
    sDataFile = os.path.join(wdir, 'input.pbm')
    util.WriteProto(sDataFile, npyData)
    sDataProtoFile = os.path.join(wdir, 'data.pbtxt')
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_extract'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = wdir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = data.shape[0]
    datasetInfo.dimensions = data.shape[1]
    datasetInfo.label_start_index = datasetInfo.dimensions - 1
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = 'input.pbm'
    util.WriteProto(sDataProtoFile, dbInfo)
    sEvalOpFile = os.path.join(wdir, 'eval.pbtxt')
    sExtractedActs = os.path.join(wdir, 'acts')
    sLayerName = 'conv2'
    if not os.path.exists(sExtractedActs):
        os.mkdir(sExtractedActs)
    evalOp = dl.Operation()
    evalOp.name = 'extract'
    evalOp.stop_condition.all_processed = True
    evalOp.operation_type = dl.Operation.TEST
    evalOp.data_proto = sDataProtoFile
    evalOp.randomize = False
    evalOp.get_last_piece = True
    evalOp.verbose = False
    evalOp.extracted_layers.append(sLayerName)
    evalOp.extracted_output_dir = sExtractedActs
    evalOp.extracted_data_format = dl.DatasetInfo.PBM
    evalOp.extracted_data_sets.append(dl.DatasetInfo.TEST_SET)
    util.WriteProto(sEvalOpFile, evalOp)
    # remove stale activation files from a previous run
    sOutFileTemplate = os.path.join(sExtractedActs, sLayerName, '*.pbm')
    for s in sorted(glob.glob(sOutFileTemplate)):
        try:
            os.remove(s)
        except Exception:
            pass
    # run the network...
    args = [
        sDeeplearnPath, 'extract', sModelFile,
        '--eval-op=%s' % sEvalOpFile
    ]
    pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
    pr.wait()
    if pr.returncode != 0:
        print 'Failed to extract representations'
        exit(1)
    # read back the extracted activations, in file order
    mOutput = None
    for s in sorted(glob.glob(sOutFileTemplate)):
        m = util.proto2Npy(util.ReadProto(s, dl.Matrix()))
        if mOutput is None:
            mOutput = m
        else:
            mOutput = np.vstack([mOutput, m])
    if mOutput.shape[0] != data.shape[0]:
        print 'Invalid results'
        exit(1)
    return mOutput
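# Hypothetical usage sketch: feed a feature matrix through a trained model
# and read back the conv2 activations, row-aligned with the input frames.
# The working directory, binary path and model path are placeholders.
def _demoExtract():
    feats = np.random.rand(100, 39).astype(np.float32)  # 100 frames
    acts = extractRepresentation(
        feats, '/tmp/extract_wdir', '/path/to/deeplearn',
        '/tmp/model/spn_conv.pbtxt')
    # One activation row per input frame.
    assert acts.shape[0] == feats.shape[0]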