Example #1
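The excerpt omits its imports. A plausible header, assuming `pjoin` is the usual `os.path.join` alias and that the parallel decoding uses joblib (the project-local modules such as `dl`, `decode_utterance`, and the various config constants are defined elsewhere in the repo), would be:

import pickle
from os.path import join as pjoin

from joblib import Parallel, delayed  # parallel map over utterances
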
def runSeq(opts):
    fid = open(opts.out_file, 'w')
    phone_map = get_char_map(opts.dataDir)
    print(phone_map)
    print(len(phone_map))

    alisDir = opts.alisDir if opts.alisDir else opts.dataDir
    loader = dl.DataLoader(opts.dataDir, opts.rawDim, opts.inputDim, alisDir)

    hyps = list()
    refs = list()
    hypscores = list()
    refscores = list()
    numphones = list()
    subsets = list()
    alignments = list()

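    # Load the language model handed to the decoder: a neural character LM
    # restored from a saved config and parameter file when MODEL_TYPE is not
    # 'ngram', otherwise an SRILM n-gram model read from an ARPA file.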
    if MODEL_TYPE != 'ngram':
        cfg_file = '/deep/u/zxie/rnnlm/13/cfg.json'
        params_file = '/deep/u/zxie/rnnlm/13/params.pk'
        #cfg_file = '/deep/u/zxie/dnn/11/cfg.json'
        #params_file = '/deep/u/zxie/dnn/11/params.pk'

        cfg = load_config(cfg_file)
        model_class, model_hps = get_model_class_and_params(MODEL_TYPE)
        opt_hps = OptimizerHyperparams()
        model_hps.set_from_dict(cfg)
        opt_hps.set_from_dict(cfg)

        clm = model_class(None, model_hps, opt_hps, train=False, opt='nag')
        with open(params_file, 'rb') as fin:
            clm.from_file(fin)
    else:
        from srilm import LM
        from decoder_config import LM_ARPA_FILE
        print('Loading %s...' % LM_ARPA_FILE)
        clm = LM(LM_ARPA_FILE)
        print('Done.')
    #clm = None

    for i in range(opts.start_file, opts.start_file + opts.numFiles):
        data_dict, alis, keys, _ = loader.loadDataFileDict(i)
        # For later alignments
        keys = sorted(keys)

        # For Switchboard filter
        if DATA_SUBSET == 'eval2000':
            if SWBD_SUBSET == 'swbd':
                keys = [k for k in keys if k.startswith('sw')]
            elif SWBD_SUBSET == 'callhome':
                keys = [k for k in keys if k.startswith('en')]

        ll_file = pjoin(LIKELIHOODS_DIR, 'loglikelihoods_%d.pk' % i)
        # Load the per-utterance acoustic log-likelihoods dumped for this file.
        with open(ll_file, 'rb') as ll_fid:
            probs_dict = pickle.load(ll_fid)

        # Parallelize decoding over utterances
        print('Decoding utterances in parallel, n_jobs=%d, file=%d' % (
            NUM_CPUS, i))
        decoded_utts = Parallel(n_jobs=NUM_CPUS)(delayed(decode_utterance)(
            k, probs_dict[k], alis[k], phone_map, lm=clm) for k in keys)

        for k, (hyp, ref, hypscore, refscore,
                align) in zip(keys, decoded_utts):
            if refscore is None:
                refscore = 0.0
            if hypscore is None:
                hypscore = 0.0
            hyp = replace_contractions(hyp)
            fid.write(k + ' ' + ' '.join(hyp) + '\n')

            hyps.append(hyp)
            refs.append(ref)
            hypscores.append(hypscore)
            refscores.append(refscore)
            numphones.append(len(alis[k]))
            subsets.append('callhm' if k.startswith('en') else 'swbd')
            alignments.append(align)

    fid.close()

    # Pickle some values for computeStats.py
    with open(opts.out_file.replace('.txt', '.pk'), 'wb') as pkid:
        pickle.dump(hyps, pkid)
        pickle.dump(refs, pkid)
        pickle.dump(hypscores, pkid)
        pickle.dump(refscores, pkid)
        pickle.dump(numphones, pkid)
        pickle.dump(subsets, pkid)
        pickle.dump(alignments, pkid)
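
Because the stats are written as seven sequential pickle.dump calls, a consumer such as computeStats.py has to read them back with pickle.load in the same order. A minimal sketch of that reading side (the function name load_stats is made up here):

import pickle

def load_stats(pk_file):
    # Values come back in exactly the order they were dumped.
    with open(pk_file, 'rb') as fin:
        hyps = pickle.load(fin)
        refs = pickle.load(fin)
        hypscores = pickle.load(fin)
        refscores = pickle.load(fin)
        numphones = pickle.load(fin)
        subsets = pickle.load(fin)
        alignments = pickle.load(fin)
    return hyps, refs, hypscores, refscores, numphones, subsets, alignments
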
Example #2
    # Sample the next character index from the predicted distribution,
    # then map it back to a character.
    w = np.random.choice(range(model.hps.output_size), p=probs)
    char = chars[w]

    return char


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('cfg_file', help='config file with run data for model to use')
    args = parser.parse_args()

    cfg = load_config(args.cfg_file)
    model_class, model_hps = get_model_class_and_params(MODEL_TYPE)
    opt_hps = OptimizerHyperparams()
    model_hps.set_from_dict(cfg)
    opt_hps.set_from_dict(cfg)
    cfg = CfgStruct(**cfg)
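    # CfgStruct is a project helper; presumably it wraps the raw config
    # dict to allow attribute-style access.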

    SAMPLES = 100
    SAMPLE_LENGTH = 100
    # PARAM
    ALPHA = 1.0
    # FIXME PARAM
    LM_ORDER = CONTEXT + 1

    with open(CHAR_CORPUS_VOCAB_FILE, 'rb') as fin:
        char_inds = pickle.load(fin)
    # Invert the char -> index vocabulary so sampled indices map back to chars.
    chars = dict((v, k) for k, v in char_inds.items())

    # Construct network
    model = model_class(None, model_hps, opt_hps, train=False)
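
The sampling step at the top of this example draws one vocabulary index from the model's output distribution via np.random.choice. A self-contained toy version of that step, with a made-up vocabulary and distribution for illustration:

import numpy as np

# Toy 4-character vocabulary and a normalized output distribution.
chars = {0: 'a', 1: 'b', 2: 'c', 3: 'd'}
probs = np.array([0.1, 0.6, 0.2, 0.1])

w = np.random.choice(range(len(chars)), p=probs)  # sample one index
print(chars[w])  # map the index back to its character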