def __init__(self, model, lang, gpu=False, wx=False):
    """Load a transliteration model and per-language resources.

    Args:
        model: path to the serialized OpenNMT model checkpoint.
        lang: language code; only 'hin' and 'eng' are handled below.
        gpu: when True, pin the run to GPU 0.
        wx: whether the input text is already in WX notation.

    NOTE(review): parser.parse_args() reads sys.argv of the host
    process, so unrecognized CLI flags will make this constructor
    exit — confirm that is intended for library use.
    """
    self.lang = lang
    self.is_ip_wx = wx
    # Build the translate-option namespace from the process command line.
    parser = argparse.ArgumentParser(
        description='transliterate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    self.opt = parser.parse_args()
    # Caches — presumably transliterated words and words that failed;
    # verify against the methods that fill them.
    self.trans_dict = dict()
    self.broken_words = dict()
    # Directory of this source file, used to locate bundled resources.
    file_path = os.path.dirname(os.path.abspath(__file__))
    if self.lang == 'hin':
        # WX -> UTF converter for Hindi output.
        self.to_utf = WXC(order='wx2utf', lang='hin')
        self.non_alpha = re.compile(u'([^a-zA-Z]+)')
        self.alpha_letters = set(string.ascii_letters)
        # Common single-letter chat abbreviations mapped to WX-notation
        # expansion candidates.
        self.com_abbr = {
            'b': ['BI', 'be'],
            'd': ['xI', 'xe'],
            'g': ['jI'],
            'k': ['ke', 'ki', 'kI'],
            'h': ['hE', 'hEM'],
            'ha': ['hE', 'hEM'],
            'n': ['ina', 'ne'],
            'm': ['meM', 'mEM'],
            'p': ['pe'],
            'q': ['kyoM'],
            'r': ['Ora', 'ora'],
            's': ['isa', 'se'],
            'y': ['ye']
        }
    # NOTE(review): if lang is neither 'hin' nor 'eng', non_alpha /
    # com_abbr are never set and later attribute access will raise.
    if self.lang == 'eng':
        self.non_alpha = re.compile(u'([^a-z]+)')
        self.alpha_letters = set(string.ascii_letters[:26])
        # Abbreviation file format: "<abbr> <cand1>|<cand2>|..." per line.
        with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
            self.com_abbr = {}
            for line in fp:
                k, v = line.split()
                self.com_abbr[k] = v.split('|')
    # A train.py option namespace with library defaults; load_test_model
    # needs it to reconstruct the model.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    if gpu:
        self.opt.gpu = 0
    self.opt.cuda = self.opt.gpu > -1
    self.opt.model = model
    self.opt.n_best = 5
    self.opt.lang = lang
    if self.opt.cuda:
        torch.cuda.set_device(self.opt.gpu)
    # Load the model.
    self.fields, self.model, self.model_opt = \
        onmt.ModelConstructor.load_test_model(self.opt, dummy_opt.__dict__)
def main(anno_file_name, col_headers, raw_args=None, verbose=True):
    """Predict a SQL query for an annotated example and execute it.

    Args:
        anno_file_name: path to the annotation JSON for the example(s).
        col_headers: column headers of the target table, used to render
            the final query string.
        raw_args: optional argv-style list forwarded to the option
            parser (None means use sys.argv).
        verbose: print the predicted SQL and echo execution details.

    Returns:
        (query_string, ans_pred): the rendered SQL from the last model
        matched by opt.model_path, and the DB result (None on failure).

    Raises:
        ValueError: if opt.model_path matches no model checkpoint.
    """
    parser = argparse.ArgumentParser(description='evaluate.py')
    opts.translate_opts(parser)
    opt = parser.parse_args(raw_args)
    torch.cuda.set_device(opt.gpu)
    opt.db_file = os.path.join(opt.data_path, '{}.db'.format(opt.split))
    opt.pre_word_vecs = os.path.join(opt.data_path, 'embedding')
    # train.py option defaults are needed by the Translator constructor.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    opts.train_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.anno = anno_file_name
    engine = DBEngine(opt.db_file)
    js_list = table.IO.read_anno_json(opt.anno)
    sql_query = None
    ans_pred = None
    model_files = glob.glob(opt.model_path)
    if not model_files:
        # Fix: previously an empty glob fell through to a confusing
        # AttributeError / NameError at the return statement.
        raise ValueError('no model checkpoint matches %r' % opt.model_path)
    for fn_model in model_files:
        opt.model = fn_model
        translator = Translator(opt, dummy_opt.__dict__)
        data = table.IO.TableDataset(js_list, translator.fields, None, False)
        test_data = table.IO.OrderedIterator(
            dataset=data, device=opt.gpu, batch_size=opt.batch_size,
            train=False, sort=True, sort_within_batch=False)
        # Inference: collect predictions for every batch, then order by
        # example index so js_list[-1] lines up with r_list[-1].
        r_list = []
        for batch in test_data:
            r_list += translator.translate(batch)
        r_list.sort(key=lambda x: x.idx)
        pred = r_list[-1]
        sql_pred = {
            'agg': pred.agg,
            'sel': pred.sel,
            'conds': pred.recover_cond_to_gloss(js_list[-1])
        }
        if verbose:
            print('\n sql_pred: ', sql_pred, '\n')
            print('\n col_headers: ', col_headers, '\n')
        sql_query = Query(sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
        try:
            ans_pred = engine.execute_query(
                js_list[-1]['table_id'], Query.from_dict(sql_pred),
                lower=True, verbose=verbose)
        except Exception:
            # Best-effort execution: a malformed predicted query must not
            # abort evaluation; None signals failure to the caller.
            ans_pred = None
    return sql_query.get_complete_query(col_headers), ans_pred
def __init__(
        self,
        modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt',
        dynamic_dict=True, attn_debug=True, share_vocab=True,
        replace_unk=True, verbose=True):
    """Build the translate-option namespace and load the seq2seq model.

    Options come from the library's translate options parsed off the
    process command line; the keyword arguments and the checkpoint path
    then override selected fields before the model is loaded once.
    """
    print('Loading ' + modelfile)
    arg_parser = argparse.ArgumentParser(
        description='seq2seq_predict',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(arg_parser)
    opts.translate_opts(arg_parser)
    # Tolerate unrelated flags on the host process command line.
    opt, unknown = arg_parser.parse_known_args()
    print('Unknown arguments ', unknown)
    # Apply the constructor keyword overrides in one sweep.
    for attr_name, attr_value in (
            ('dynamic_dict', dynamic_dict),
            ('attn_debug', attn_debug),
            ('share_vocab', share_vocab),
            ('replace_unk', replace_unk),
            ('verbose', verbose)):
        setattr(opt, attr_name, attr_value)
    # train.py defaults are required by load_test_model.
    train_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(train_parser)
    train_defaults = train_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Unique scratch file name for prediction input.
    opt.src = 'temp_seq2seq_pred_%f.txt' % time.time()
    opt.model = modelfile
    print('Loading seq2seq model...')
    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, train_defaults.__dict__)
    self.opt = opt
    self.fields = fields
    self.model = model
    self.model_opt = model_opt
def translate(src, model, output):
    """Translate the sentences in `src` with `model`.

    Writes the n-best predictions to `output` and the detokenized
    source sentences to "gold_<output>".

    Args:
        src: path to the source text file.
        model: path to the model checkpoint.
        output: path of the prediction output file.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.translate_opts(parser)
    # Parse defaults only ([]): the host process argv is ignored here.
    opt = parser.parse_known_args([])[0]
    if opt.batch_size != 1:
        print("WARNING: -batch_size isn't supported currently, "
              "we set it to 1 for now!")
        opt.batch_size = 1
    opt.src = src
    opt.model = model
    opt.output = output
    # train.py defaults required by the Translator constructor.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields,
                               use_filter_pred=False)
    test_data = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu, batch_size=opt.batch_size,
        train=False, sort=False, shuffle=False)
    # Fix: the two output handles were never closed (resource leak);
    # `with` now guarantees closure even on error. (An unused
    # `counter = count(1)` was also removed.)
    with codecs.open(opt.output, 'w', 'utf-8') as out_file, \
            codecs.open("gold_" + opt.output, 'w', 'utf-8') as gold_out_file:
        for batch in test_data:
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
                = translator.translate(batch, data)
            # zip_longest, not zip: gold_batch has length 0 when no
            # target file is given, and predictions must still be written.
            z_batch = zip_longest(
                pred_batch, gold_batch, pred_scores, gold_scores,
                (sent.squeeze(1) for sent in src.split(1, dim=1)))
            for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
                n_best_preds = [" ".join(pred)
                                for pred in pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
                out_file.write('\n')
                out_file.flush()
                # Recover the raw source words for the "gold" file.
                words = get_src_words(
                    src_sent, translator.fields["src"].vocab.itos)
                gold_out_file.write(words)
                gold_out_file.write('\n')
                gold_out_file.flush()
from __future__ import division import os import argparse import torch import codecs import glob import table import table.IO import opts parser = argparse.ArgumentParser(description='evaluate.py') opts.translate_opts(parser) opt = parser.parse_args() torch.cuda.set_device(opt.gpu) opt.anno = os.path.join(opt.root_dir, opt.dataset, '{}.json'.format(opt.split)) opt.bpe_path = os.path.join(opt.root_dir, opt.dataset, 'bpe.pt') opt.pre_word_vecs = os.path.join(opt.root_dir, opt.dataset, 'embedding') if opt.beam_size > 0: opt.batch_size = 1 def get_run_epoch_by_fn(fn_model): tk_list = fn_model.split('/') for tk in tk_list: if tk.startswith('run.'): _run = tk[4:] elif tk.startswith('m_'): _epoch = tk.split('_')[1] return int(_run), int(_epoch)
def main(training=False, fields=None, model=None, opt=None, writer=None,
         step=0, corpus_type="dev", multi_process=False):
    """Translate a corpus and report its metric (BLEU/ROUGE).

    Two entry modes:
      * training=True: called from a training loop; `fields`, `model`
        and `opt` must be supplied, and predictions go to a file named
        from opt.save_model / corpus_type / step.
      * training=False: stand-alone; options are parsed from sys.argv
        and the model is loaded from the checkpoint named there.

    Args:
        training: selects the mode above.
        fields / model / opt: training-mode inputs (required then).
        writer: optional summary writer forwarded to the reporters.
        step: global step used in the output filename and reports.
        corpus_type: corpus label, also part of the output filename.
        multi_process: unused in this body — TODO confirm before removal.

    Returns:
        The last metric computed (0 when opt.tgt is not set).
    """
    time = Time()  # NOTE: shadows any `time` module within this scope.
    if training:
        assert fields is not None
        assert model is not None
        assert opt is not None
        # Disable dropout etc. for evaluation.
        model.eval()
        model.generator.eval()
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        out_file = codecs.open(
            "{}_{}_pred_{}.txt".format(opt.save_model,
                                       corpus_type.replace("/", "_"),
                                       str(step)), "w", "utf-8")
        print("Output file: ", out_file.name)
        copy_attn = opt.copy_attn
        model_opt = opt
    else:
        # Load the model.
        parser = argparse.ArgumentParser(
            description='translate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)
        opt = parser.parse_args()
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
        out_file = codecs.open(opt.output, 'w', 'utf-8')
        # Stand-alone mode never scores against a reference target here.
        assert opt.tgt is None
    data = onmt.io.build_dataset(fields, opt.src, opt.tgt,
                                 use_filter_pred=False,
                                 ngram=model_opt.ngram)
    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.translate_batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)
    output, pred_score_total, pred_words_total = \
        translate_single_process(opt, model, fields, data, data_iter,
                                 f=out_file)
    outfile_name = out_file.name
    if opt.bpe:
        # Undo BPE by stripping the "@@ " continuation markers.
        import subprocess
        subprocess.check_output("sed 's/\@\@ //g' < {} > {}".format(
            outfile_name, outfile_name + ".nonbpe"), shell=True)
        outfile_name = outfile_name + ".nonbpe"
    if opt.new_bpe:
        generate_nonbpe(outfile_name)
        outfile_name = outfile_name + ".nonbpe"
    # if writer is not None:
    #     ratio_stats.log_tensorboard(writer, step)
    # _report_score('PRED', pred_score_total, pred_words_total, writer,
    #               step, corpus_type)
    metric = 0
    if opt.tgt:
        # Only reachable in training mode: the stand-alone branch
        # asserts opt.tgt is None above.
        # _report_score('GOLD', gold_score_total, gold_words_total,
        #               writer, step, corpus_type)
        if opt.report_single_bleu:
            metric = _report_single_source_bleu(opt, outfile_name, writer,
                                                step, corpus_type)
        if opt.report_multi_bleu:
            metric = _report_multi_source_bleu(outfile_name, writer, step,
                                               corpus_type)
        if opt.report_rouge:
            metric = _report_rouge(opt)
    # if opt.dump_beam:
    #     import json
    #     json.dump(translator.beam_accum,
    #               codecs.open(opt.dump_beam, 'w', 'utf-8'))
    time.timeit(task="Translation Testing")
    return metric
def get_model_api():
    """Load the translation model once and return a request handler.

    Returns:
        model_api(input_data) -> dict: takes a request dict with keys
        'id' and 'scg_ink' (handwritten-ink data), renders it to a PNG,
        runs the image-to-LaTeX model, and returns a result dict with
        status, latex/asciimath strings and timing info.
    """
    # --- one-time initialization ---------------------------------------
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    opt = parser.parse_args()
    # train.py defaults required by load_test_model.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(
        model, fields, beam_size=opt.beam_size, n_best=opt.n_best,
        global_scorer=scorer, max_length=opt.max_length,
        copy_attn=model_opt.copy_attn, cuda=opt.cuda,
        beam_trace=opt.dump_beam != "", min_length=opt.min_length)
    # File to write sentences to (kept for its create side effect).
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    def model_api(input_data):
        """Handle one recognition request; returns a JSON-able dict."""
        res = {}
        request_id = str(uuid.uuid4())
        res['id'] = input_data['id']
        scgink = input_data['scg_ink']
        try:
            scgink_data = ScgImage(scgink, request_id)
        except Exception:  # fix: was a bare except (also masked SystemExit)
            res['status'] = 'error'
            res['info'] = 'bad scgink data'
            return res
        # Empty traces in the scgink data.
        if not scgink_data.traces:
            res['info'] = 'wrong scgink data'
            res['status'] = 'error'
            return res
        start_t = current_milli_time()
        img_file_path = outdir + '/' + request_id + '_input.png'
        # Convert to PNG format, then crop/pad/downsample for the model.
        scgink_data.save_image(img_file_path)
        processed_img = outdir + '/' + request_id + '_preprocessed.png'
        preprocess_args = (img_file_path, '.png', processed_img,
                           [600, 60], (8, 8, 8, 8), default_buckets, 2)
        if not preprocess(preprocess_args):
            res['status'] = 'error'
            return res
        # Construct the dataset manifest. Fix: write the file directly
        # instead of `os.system('echo ...')`, which ignored failures.
        src = 'temp/test.txt'
        src_dir = 'temp'
        with open(src, 'w') as manifest:
            manifest.write(request_id + '_preprocessed.png\n')
        data = onmt.io.build_dataset(
            fields, opt.data_type, src, None, src_dir=src_dir,
            sample_rate=opt.sample_rate, window_size=opt.window_size,
            window_stride=opt.window_stride, window=opt.window,
            use_filter_pred=False)
        # Sort batch by decreasing lengths of sentence (pytorch needs it).
        data_iter = onmt.io.OrderedIterator(
            dataset=data, device=opt.gpu, batch_size=opt.batch_size,
            train=False, sort=False, sort_within_batch=True,
            shuffle=False)
        # Inference.
        builder = onmt.translate.TranslationBuilder(
            data, translator.fields, opt.n_best, opt.replace_unk, opt.tgt)
        n_best_preds = []  # fix: was unbound when data_iter was empty
        for batch in data_iter:
            batch_data = translator.translate_batch(batch, data)
            translations = builder.from_batch(batch_data)
            for trans in translations:
                n_best_preds = [" ".join(pred)
                                for pred in trans.pred_sents[:opt.n_best]]
        now_t = current_milli_time()
        if not n_best_preds:
            res['status'] = 'error'
            res['info'] = 'no prediction produced'
            return res
        # Post-process: detokenize to LaTeX, then convert to AsciiMath.
        n_best_latex = [detokenizer(pred) for pred in n_best_preds]
        n_best_ascii = [latex_asciimath(pred) for pred in n_best_latex]
        # NOTE(review): "succuss" is a typo, but API clients may already
        # match on this exact string — do not change unilaterally.
        res['status'] = "succuss"
        res['info'] = now_t - start_t
        res['mathml'] = ''
        res['latex'] = n_best_latex[0]
        res['asciimath'] = n_best_ascii[0]
        res['n_best_latex'] = n_best_latex
        res['n_best_ascii'] = n_best_ascii
        app.logger.debug(request_id + "\t" + n_best_latex[0] + "\n")
        return res

    return model_api
def main():
    """Score unfolded (source, template) pairs and select templates.

    Reads -src/-templates, scores every pair with model.predict_rouge,
    writes one score per line (in dataset index order) to
    <output>.score, then delegates final selection to select_templates.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    group = parser.add_argument_group('Rerank')
    group.add_argument('-templates', required=True,
                       help="Path to the test templates")
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Load the model.
    fields, model, model_opt = \
        model_utils.load_test_model(opt, dummy_opt.__dict__)
    fields["spliter_pos"] = torchtext.data.Field(use_vocab=False,
                                                dtype=torch.long,
                                                sequential=False)
    # Unfold templates: one (source, template) pair per example.
    src_path, tmp_path = txt_utils.unfold_templates(opt.src, opt.templates)
    # Test data.
    data = txt_utils.build_template_dataset(fields, src_path, None,
                                            tmp_path,
                                            use_filter_pred=False,
                                            with_pos=True,
                                            dynamic_dict=False)
    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu, batch_size=opt.batch_size,
        train=False, sort=False, sort_within_batch=True, shuffle=False)
    # Map dataset index -> predicted score (batches arrive out of order).
    score_dict = {}
    batch_count = 0  # renamed from `count` to avoid clashing with itertools.count
    for batch in data_iter:
        src = onmt.io.make_features(batch, 'src', 'text')
        predict_score = model.predict_rouge(src, batch.src[1],
                                            batch.spliter_pos)
        for index, score in zip(batch.indices.data, predict_score.data):
            score_dict[int(index)] = float(score)
        batch_count += 1
        if batch_count % 100 == 0:
            print('score {} batches'.format(batch_count))
    # File to write scores to.
    score_file = opt.output + '.score'
    print('score_file is ' + score_file)
    print('opt.tgt is ' + opt.tgt)
    print(len(score_dict))
    # Fix: use a context manager so the file closes even if a write fails.
    with open(score_file, 'w', encoding='utf-8') as out_file:
        # Indices are assumed contiguous 0..N-1 — TODO confirm upstream.
        for index in range(len(score_dict)):
            print(score_dict[index], file=out_file)
    select_templates(src_path, tmp_path, score_file, opt.output, opt.tgt)
import torch from itertools import count import onmt.io import onmt.translate import onmt import onmt.ModelConstructor import onmt.modules import opts parser = argparse.ArgumentParser( description='translate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.add_md_help_argument(parser) opts.translate_opts(parser) opt = parser.parse_args() def _report_score(name, score_total, words_total): print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( name, score_total / words_total, name, math.exp(-score_total / words_total))) def _report_bleu(): import subprocess path = os.path.split(os.path.realpath(__file__))[0] print() res = subprocess.check_output(