def runSeq(opts): fid = open(opts.out_file, 'w') phone_map = get_char_map(opts.dataDir) print phone_map print len(phone_map) alisDir = opts.alisDir if opts.alisDir else opts.dataDir loader = dl.DataLoader(opts.dataDir, opts.rawDim, opts.inputDim, alisDir) hyps = list() refs = list() hypscores = list() refscores = list() numphones = list() subsets = list() alignments = list() if MODEL_TYPE != 'ngram': cfg_file = '/deep/u/zxie/rnnlm/13/cfg.json' params_file = '/deep/u/zxie/rnnlm/13/params.pk' #cfg_file = '/deep/u/zxie/dnn/11/cfg.json' #params_file = '/deep/u/zxie/dnn/11/params.pk' cfg = load_config(cfg_file) model_class, model_hps = get_model_class_and_params(MODEL_TYPE) opt_hps = OptimizerHyperparams() model_hps.set_from_dict(cfg) opt_hps.set_from_dict(cfg) clm = model_class(None, model_hps, opt_hps, train=False, opt='nag') with open(params_file, 'rb') as fin: clm.from_file(fin) else: from srilm import LM from decoder_config import LM_ARPA_FILE print 'Loading %s...' % LM_ARPA_FILE clm = LM(LM_ARPA_FILE) print 'Done.' 
#clm = None for i in range(opts.start_file, opts.start_file + opts.numFiles): data_dict, alis, keys, _ = loader.loadDataFileDict(i) # For later alignments keys = sorted(keys) # For Switchboard filter if DATA_SUBSET == 'eval2000': if SWBD_SUBSET == 'swbd': keys = [k for k in keys if k.startswith('sw')] elif SWBD_SUBSET == 'callhome': keys = [k for k in keys if k.startswith('en')] ll_file = pjoin(LIKELIHOODS_DIR, 'loglikelihoods_%d.pk' % i) ll_fid = open(ll_file, 'rb') probs_dict = pickle.load(ll_fid) # Parallelize decoding over utterances print 'Decoding utterances in parallel, n_jobs=%d, file=%d' % ( NUM_CPUS, i) decoded_utts = Parallel(n_jobs=NUM_CPUS)(delayed(decode_utterance)( k, probs_dict[k], alis[k], phone_map, lm=clm) for k in keys) for k, (hyp, ref, hypscore, refscore, align) in zip(keys, decoded_utts): if refscore is None: refscore = 0.0 if hypscore is None: hypscore = 0.0 hyp = replace_contractions(hyp) fid.write(k + ' ' + ' '.join(hyp) + '\n') hyps.append(hyp) refs.append(ref) hypscores.append(hypscore) refscores.append(refscore) numphones.append(len(alis[k])) subsets.append('callhm' if k.startswith('en') else 'swbd') alignments.append(align) fid.close() # Pickle some values for computeStats.py pkid = open(opts.out_file.replace('.txt', '.pk'), 'wb') pickle.dump(hyps, pkid) pickle.dump(refs, pkid) pickle.dump(hypscores, pkid) pickle.dump(refscores, pkid) pickle.dump(numphones, pkid) pickle.dump(subsets, pkid) pickle.dump(alignments, pkid) pkid.close()
    # NOTE(review): the enclosing function's `def` line precedes this chunk;
    # only its tail is visible here, so indentation is reconstructed.
    # Samples one output index from the distribution `probs` and maps it
    # to its character via the index->char table `chars`.
    w = np.random.choice(range(model.hps.output_size), p=probs)
    char = chars[w]
    return char


if __name__ == '__main__':
    # Script entry point: load a model config and rebuild the network
    # (presumably for character sampling, given SAMPLES/SAMPLE_LENGTH below
    # — TODO confirm against the rest of the file).
    parser = argparse.ArgumentParser()
    parser.add_argument('cfg_file', help='config file with run data for model to use')
    args = parser.parse_args()
    cfg = load_config(args.cfg_file)

    # Instantiate hyperparameter containers and populate them from the config.
    model_class, model_hps = get_model_class_and_params(MODEL_TYPE)
    opt_hps = OptimizerHyperparams()
    model_hps.set_from_dict(cfg)
    opt_hps.set_from_dict(cfg)
    cfg = CfgStruct(**cfg)

    # Sampling settings.
    SAMPLES = 100
    SAMPLE_LENGTH = 100  # PARAM
    ALPHA = 1.0  # FIXME PARAM
    LM_ORDER = CONTEXT + 1

    # Load the char->index vocabulary and invert it to index->char.
    with open(CHAR_CORPUS_VOCAB_FILE, 'rb') as fin:
        char_inds = pickle.load(fin)
    chars = dict((v, k) for k, v in char_inds.iteritems())

    # Construct network
    model = model_class(None, model_hps, opt_hps, train=False)
def runSeq(opts): fid = open(opts.out_file, 'w') phone_map = get_char_map(opts.dataDir) print phone_map print len(phone_map) alisDir = opts.alisDir if opts.alisDir else opts.dataDir loader = dl.DataLoader(opts.dataDir, opts.rawDim, opts.inputDim, alisDir) hyps = list() refs = list() hypscores = list() refscores = list() numphones = list() subsets = list() alignments = list() if MODEL_TYPE != 'ngram': cfg_file = '/deep/u/zxie/rnnlm/13/cfg.json' params_file = '/deep/u/zxie/rnnlm/13/params.pk' #cfg_file = '/deep/u/zxie/dnn/11/cfg.json' #params_file = '/deep/u/zxie/dnn/11/params.pk' cfg = load_config(cfg_file) model_class, model_hps = get_model_class_and_params(MODEL_TYPE) opt_hps = OptimizerHyperparams() model_hps.set_from_dict(cfg) opt_hps.set_from_dict(cfg) clm = model_class(None, model_hps, opt_hps, train=False, opt='nag') with open(params_file, 'rb') as fin: clm.from_file(fin) else: from srilm import LM from decoder_config import LM_ARPA_FILE print 'Loading %s...' % LM_ARPA_FILE clm = LM(LM_ARPA_FILE) print 'Done.' 
#clm = None for i in range(opts.start_file, opts.start_file + opts.numFiles): data_dict, alis, keys, _ = loader.loadDataFileDict(i) # For later alignments keys = sorted(keys) # For Switchboard filter if DATA_SUBSET == 'eval2000': if SWBD_SUBSET == 'swbd': keys = [k for k in keys if k.startswith('sw')] elif SWBD_SUBSET == 'callhome': keys = [k for k in keys if k.startswith('en')] ll_file = pjoin(LIKELIHOODS_DIR, 'loglikelihoods_%d.pk' % i) ll_fid = open(ll_file, 'rb') probs_dict = pickle.load(ll_fid) # Parallelize decoding over utterances print 'Decoding utterances in parallel, n_jobs=%d, file=%d' % (NUM_CPUS, i) decoded_utts = Parallel(n_jobs=NUM_CPUS)(delayed(decode_utterance)(k, probs_dict[k], alis[k], phone_map, lm=clm) for k in keys) for k, (hyp, ref, hypscore, refscore, align) in zip(keys, decoded_utts): if refscore is None: refscore = 0.0 if hypscore is None: hypscore = 0.0 hyp = replace_contractions(hyp) fid.write(k + ' ' + ' '.join(hyp) + '\n') hyps.append(hyp) refs.append(ref) hypscores.append(hypscore) refscores.append(refscore) numphones.append(len(alis[k])) subsets.append('callhm' if k.startswith('en') else 'swbd') alignments.append(align) fid.close() # Pickle some values for computeStats.py pkid = open(opts.out_file.replace('.txt', '.pk'), 'wb') pickle.dump(hyps, pkid) pickle.dump(refs, pkid) pickle.dump(hypscores, pkid) pickle.dump(refscores, pkid) pickle.dump(numphones, pkid) pickle.dump(subsets, pkid) pickle.dump(alignments, pkid) pkid.close()