def main(): parser = argparse.ArgumentParser() parser.add_argument( 'model_prefix', nargs='?', default='model/complete/enfr', help='The prefix of nmt model path, default is "%(default)s"') parser.add_argument( '--start', action="store", metavar="index", dest="start", type=int, default=1, help='The starting index of saved model to test, default is %(default)s' ) parser.add_argument( '--end', action="store", metavar="index", dest="end", type=int, default=10, help='The ending index of saved model to test, default is %(default)s') parser.add_argument( '--gap', action="store", metavar="index", dest="interval", type=int, default=10000, help= 'The interval between two consecutive tested models\' indexes, default is %(default)s' ) parser.add_argument('--result', action='store', metavar='filename', dest='result_file', type=str, default='trans_result.tsv', help='Target small train file, default is %(default)s') parser.add_argument('--beam', action="store", metavar="beam_size", dest="beam_size", type=int, default=4, help='The beam size for translation, default is 4') parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr_bpe', help='Dataset, default is "%(default)s"') args = parser.parse_args() if args.result_file == 'trans_result.tsv': model_file_name = os.path.split(args.model_prefix)[-1] args.result_file = './translated/complete/{}_bs{}.txt'.format( os.path.splitext(model_file_name)[0], args.beam_size) else: model_file_name = os.path.split(args.result_file)[-1] print args bleus = {} train1, train2, small1, small2, dev1, dev2, dev3, test1, test2, dic1, dic2 = Datasets[ args.dataset] for idx in xrange(args.start, args.end + 1): trans_model_file = '%s.iter%d.npz' % (os.path.splitext( args.model_prefix)[0], idx * args.interval) trans_result_file = '%s.iter%d.txt' % (os.path.splitext( args.result_file)[0], idx * args.interval) if not os.path.exists(trans_result_file): exec_str = 'python translate_single.py -b 32 -k {} -p 1 -n {} {} {} {} {}\n'.format( 
args.beam_size, trans_model_file, './data/dic/{}'.format(dic1), './data/dic/{}'.format(dic2), './data/test/{}'.format(test1), trans_result_file) print 'Translate model {} '.format(trans_model_file) print exec_str pl_output = subprocess.Popen(exec_str, shell=True, stdout=subprocess.PIPE).stdout.read() if 'tc' in args.dataset: # first de-truecase, then de-bpe exec_str = 'perl scripts/moses/detruecase.perl < {} > {}.detc'.format( trans_result_file, trans_result_file) pl_output = subprocess.Popen(exec_str, shell=True, stdout=subprocess.PIPE).stdout.read() trans_result_file = '{}.detc'.format(trans_result_file) if 'bpe' in args.dataset: with open('{}.bpe'.format(trans_result_file), 'w') as fout: fout.write(de_bpe(open(trans_result_file, 'r').read())) trans_result_file = '{}.bpe'.format(trans_result_file) bleus[idx] = get_bleu('./data/test/{}'.format(test2), trans_result_file) print 'model %s, bleu %.2f' % (idx * args.interval, bleus[idx]) args.result_file = './translated/complete/{}_s{}_e{}.txt'.format( os.path.splitext(model_file_name)[0], args.start, args.end) bleu_array = sorted(bleus.items(), key=operator.itemgetter(0), reverse=False) with open(args.result_file, 'w') as fout: fout.write('\n'.join( [str(idx) + '\t' + str(score) for (idx, score) in bleu_array]))
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         alpha=0,
         normalize=False,
         chr_level=False,
         batch_size=1,
         zhen=False,
         src_trg_table_path=None,
         search_all_alphas=False,
         ref_file=None,
         dump_all=False,
         args=None):
    """Translate ``source_file`` with a trained NMT model.

    Builds the model's sampler, translates the whole file with beam search
    (beam size ``k``), then either:
      * ``search_all_alphas=True``: re-ranks the stored beam candidates for
        each length-penalty alpha in {0.0, 0.1, ..., 1.0} and prints the BLEU
        against ``ref_file`` for each alpha (nothing is written to disk), or
      * otherwise: writes the chosen translations to ``saveto`` and, when
        ``dump_all`` is set, all beam candidates to ``saveto + '.all_beam<k>'``.

    Args:
        model: path to the saved model file (.npz); rebound below to the
            constructed model object.
        dictionary / dictionary_target: source / target vocabulary files.
        source_file: text file to translate.
        saveto: output path for the selected translations.
        k: beam size.
        alpha: length-penalty alpha used during translation.
        normalize: whether to length-normalize scores.
        chr_level: unused here — presumably character-level flag; verify
            against callers.
        batch_size: must be > 1 (batch mode is asserted below).
        zhen: enables source attention output (Chinese-English specific path).
        src_trg_table_path: optional pickled source-target table for UNK
            replacement.
        search_all_alphas: see above.
        ref_file: reference file, required when ``search_all_alphas`` is set.
        dump_all: dump every beam candidate as well.
        args: parsed CLI namespace; only ``args.trg_attention`` is read.
    """
    batch_mode = batch_size > 1
    # Only the batched sampler path is supported by this entry point.
    assert batch_mode

    # Load the model options saved at training time.
    options = load_options_test(model)

    src_trg_table = None
    if src_trg_table_path:
        with open(src_trg_table_path, 'rb') as f:
            src_trg_table = pkl.load(f)

    # Fixed RNG seed makes sampling reproducible across runs.
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # Select the model class by name; target-attention variant when requested.
    model_type = 'NMTModel'
    if args.trg_attention:
        model_type = 'TrgAttnNMTModel'

    model, _ = build_and_init_model(model,
                                    options=options,
                                    build=False,
                                    model_type=model_type)

    f_init, f_next = model.build_sampler(trng=trng,
                                         use_noise=use_noise,
                                         batch_mode=batch_mode,
                                         dropout=options['use_dropout'],
                                         need_srcattn=zhen)

    # trans: chosen translation strings; all_cand_ids/all_cand_trans/all_scores:
    # full beam candidates per sentence (used for alpha re-ranking below).
    trans, all_cand_ids, all_cand_trans, all_scores, word_idic_tgt = translate_whole(
        model,
        f_init,
        f_next,
        trng,
        dictionary,
        dictionary_target,
        source_file,
        k,
        normalize,
        alpha=alpha,
        src_trg_table=src_trg_table,
        zhen=zhen,
        n_words_src=options['n_words_src'],
        echo=True,
        batch_size=batch_size)

    if search_all_alphas:
        # Sweep alpha in [0.0, 1.0] in steps of 0.1, re-selecting the best
        # beam candidate per sentence under each length penalty.
        all_alpha_values = 0.1 * np.array(xrange(11))
        for alpha_v in all_alpha_values:
            trans_ids = []
            for samples, sample_scores in zip(all_cand_ids, all_scores):
                trans_ids.append(samples[chosen_by_len_alpha(
                    samples, sample_scores, alpha_v)])
            trans_strs = '\n'.join(seqs2words(trans_ids, word_idic_tgt))
            # Post-process to plain text before scoring: de-truecase and/or
            # de-BPE depending on the source file naming convention.
            if 'tc' in source_file:
                trans_strs = de_tc(trans_strs)
            if 'bpe' in source_file:
                trans_strs = de_bpe(trans_strs)
            print 'alpha %.2f, bleu %.2f' % (
                alpha_v, get_bleu(ref_file, trans_strs, type_in='string'))
    else:
        with open(saveto, 'w') as f:
            print >> f, '\n'.join(trans)
        if dump_all:
            # Also dump every beam candidate, one per line.
            saveto_dump_all = '%s.all_beam%d' % (saveto, k)
            with open(saveto_dump_all, 'w') as f:
                print >> f, '\n'.join(all_cand_trans)

    print 'Done'
def main():
    """Replace UNK tokens in a translated file and optionally report BLEU.

    Command-line driven: loads the dataset's dictionaries and test source,
    runs ``replace_unk`` using a source-target alignment table, optionally
    de-BPEs the result, optionally dumps the cleaned translation to
    ``<translated_file>.nounk`` (or ``.bpe.nounk``), and prints BLEU against
    the test reference unless suppressed with ``-B``.
    """
    parser = argparse.ArgumentParser(
        description='Replace UNK in the translated file, and get BLEU.')
    parser.add_argument('model', help='The model path')
    parser.add_argument('translated_file', help='The translated file with UNK')
    parser.add_argument(
        'table',
        nargs='?',
        default='./data/dic/fastAlign_en2fr.pkl',
        help='Source-Target table path, default is %(default)s')
    parser.add_argument('--dataset',
                        action='store',
                        dest='dataset',
                        default='en-fr_bpe',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--nbest',
                        action="store",
                        metavar="N",
                        dest="nbest",
                        type=int,
                        default=1,
                        help='number of best, default is %(default)s')
    # -B is a "turn BLEU off" switch: store_false on a True default.
    parser.add_argument('-B',
                        action='store_false',
                        default=True,
                        dest='bleu',
                        help='Get BLEU, default is True, set to False')
    parser.add_argument(
        '-d',
        '--dump',
        action='store_true',
        default=False,
        dest='dump',
        help='Dump translated file without UNK, default is False, set to True')
    args = parser.parse_args()

    print 'model: {}, translated file: {}'.format(args.model,
                                                  args.translated_file)

    # Dataset entry unpacks to corpora and dictionary filenames; only
    # test1/test2/dic1/dic2 are used below.
    train1, train2, small1, small2, valid1, valid2, test1, test2, dic1, dic2 = Datasets[
        args.dataset]

    # _load_data returns model options, numeric and string forms of the
    # source/translated sentences, and the source-target table.
    options, src_sents_num, trans_sents_num, src_sents_str, trans_sents_str, src_tgt_table = _load_data(
        args,
        './data/dic/{}'.format(dic1),
        './data/dic/{}'.format(dic2),
        './data/test/{}'.format(test1),
    )

    # Mutates trans_sents_str in place, substituting aligned source words
    # for UNK tokens — presumably; verify against replace_unk's definition.
    replace_unk(args, src_sents_num, trans_sents_num, src_sents_str,
                trans_sents_str, src_tgt_table)

    translated_string = '\n'.join(' '.join(w for w in s)
                                  for s in trans_sents_str) + '\n'

    postfix = '.nounk'
    if 'bpe' in args.dataset:
        translated_string = de_bpe(translated_string)
        postfix = '.bpe' + postfix

    if args.dump:
        with open('{}{}'.format(args.translated_file, postfix), 'w') as f:
            # Trailing comma: string already ends with '\n', so suppress
            # print's own newline.
            print >> f, translated_string,

    if args.bleu:
        bleu = get_bleu(
            './data/test/{}'.format(test2),
            translated_string,
            type_in='string',
        )
        print 'BLEU: {:.2f}'.format(bleu)