def __init__(self, model, lang, gpu=False, wx=False):
    """Set up a transliterator for ``lang`` and load the translation model.

    Args:
        model: Stored on the parsed options as ``opt.model`` before the
            model is loaded.
        lang: Language code; ``'hin'`` and ``'eng'`` each install their own
            character classes and abbreviation table.
        gpu: When truthy, forces ``opt.gpu`` to device 0.
        wx: Stored as ``self.is_ip_wx``.

    Note:
        The translate options are parsed from the process command line.
    """
    self.lang = lang
    self.is_ip_wx = wx

    cli = argparse.ArgumentParser(
        description='transliterate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(cli)
    opts.translate_opts(cli)
    self.opt = cli.parse_args()

    self.trans_dict = {}
    self.broken_words = {}
    file_path = os.path.dirname(os.path.abspath(__file__))

    if self.lang == 'hin':
        self.to_utf = WXC(order='wx2utf', lang='hin')
        self.non_alpha = re.compile(u'([^a-zA-Z]+)')
        self.alpha_letters = set(string.ascii_letters)
        # Hard-coded table: abbreviation -> candidate expansions.
        self.com_abbr = {
            'b': ['BI', 'be'],
            'd': ['xI', 'xe'],
            'g': ['jI'],
            'k': ['ke', 'ki', 'kI'],
            'h': ['hE', 'hEM'],
            'ha': ['hE', 'hEM'],
            'n': ['ina', 'ne'],
            'm': ['meM', 'mEM'],
            'p': ['pe'],
            'q': ['kyoM'],
            'r': ['Ora', 'ora'],
            's': ['isa', 'se'],
            'y': ['ye'],
        }
    elif self.lang == 'eng':
        self.non_alpha = re.compile(u'([^a-z]+)')
        self.alpha_letters = set(string.ascii_letters[:26])
        abbr_path = '%s/extras/COMMON_ABBR.eng' % file_path
        with open(abbr_path) as handle:
            self.com_abbr = {}
            # Each line: "<abbreviation> <expansion>|<expansion>|..."
            for entry in handle:
                short, expansions = entry.split()
                self.com_abbr[short] = expansions.split('|')

    # A second parser, fed an empty argv, yields the model option defaults.
    defaults_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(defaults_parser)
    model_defaults, _ = defaults_parser.parse_known_args([])

    if gpu:
        self.opt.gpu = 0
    self.opt.cuda = self.opt.gpu > -1
    self.opt.model = model
    self.opt.n_best = 5
    self.opt.lang = lang
    if self.opt.cuda:
        torch.cuda.set_device(self.opt.gpu)

    # Load the model.
    loaded = onmt.ModelConstructor.load_test_model(
        self.opt, vars(model_defaults))
    self.fields, self.model, self.model_opt = loaded
def parse_args():
    """Parse the command line using the options from ``opts.preprocess_opts``.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    arg_parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.preprocess_opts(arg_parser)
    return arg_parser.parse_args()
def parse_args():
    """Parse preprocessing options and seed torch's RNG.

    Returns:
        The parsed namespace; ``torch.manual_seed`` has already been called
        with its ``seed`` attribute.
    """
    arg_parser = argparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.preprocess_opts(arg_parser)

    options = arg_parser.parse_args()
    torch.manual_seed(options.seed)
    return options
def parse_args():
    """Parse model/preprocess/train options and apply global run setup.

    Besides parsing, this:
      * copies ``word_vec_size`` / ``layers`` onto their source/target and
        encoder/decoder counterparts when they are not ``-1``;
      * derives ``brnn`` from ``encoder_type``;
      * seeds ``random`` and ``torch`` (and CUDA when a GPU is selected and
        the seed is positive);
      * selects the first GPU in ``gpuid`` and exits when more than one GPU
        is requested;
      * removes an existing Crayon experiment with the same name when
        ``exp_host`` is set.

    Returns:
        The parsed (and adjusted) option namespace.
    """
    arg_parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.model_opts(arg_parser)
    opts.preprocess_opts(arg_parser)
    opts.train_opts(arg_parser)

    options = arg_parser.parse_args()
    torch.manual_seed(options.seed)

    # A value of -1 means "not given": leave the per-side settings alone.
    if options.word_vec_size != -1:
        options.src_word_vec_size = options.word_vec_size
        options.tgt_word_vec_size = options.word_vec_size
    if options.layers != -1:
        options.enc_layers = options.layers
        options.dec_layers = options.layers

    options.brnn = (options.encoder_type == "brnn")

    # Seeding is unconditional here (the seed > 0 guard is disabled).
    random.seed(options.seed)
    torch.manual_seed(options.seed)

    if torch.cuda.is_available() and not options.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if options.gpuid:
        cuda.set_device(options.gpuid[0])
        if options.seed > 0:
            torch.cuda.manual_seed(options.seed)

    if len(options.gpuid) > 1:
        sys.stderr.write(
            "Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    if options.exp_host != "":
        from pycrayon import CrayonClient
        crayon = CrayonClient(hostname=options.exp_host)
        known_experiments = crayon.get_experiment_names()
        print(known_experiments)
        if options.exp in known_experiments:
            crayon.remove_experiment(options.exp)

    return options
def parse_args():
    """Parse preprocessing options, seed torch and check for existing outputs.

    Returns:
        The parsed namespace, after ``torch.manual_seed(opt.seed)`` and
        ``check_existing_pt_files(opt)`` have run.
    """
    arg_parser = argparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.preprocess_opts(arg_parser)

    options = arg_parser.parse_args()
    torch.manual_seed(options.seed)
    check_existing_pt_files(options)
    return options
def parse_args():
    """Parse preprocessing options through ``configargparse``.

    Registers the markdown-help, config and preprocess option groups, then
    seeds torch and runs ``check_existing_pt_files`` on the result.

    Returns:
        The parsed option namespace.
    """
    arg_parser = configargparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.config_opts(arg_parser)
    opts.preprocess_opts(arg_parser)

    options = arg_parser.parse_args()
    torch.manual_seed(options.seed)
    check_existing_pt_files(options)
    return options
def __init__(
        self,
        modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt',
        dynamic_dict=True,
        attn_debug=True,
        share_vocab=True,
        replace_unk=True,
        verbose=True):
    """Parse translate options, apply the overrides and load the model.

    Args:
        modelfile: Checkpoint path stored as ``opt.model``.
        dynamic_dict: Stored as ``opt.dynamic_dict``.
        attn_debug: Stored as ``opt.attn_debug``.
        share_vocab: Stored as ``opt.share_vocab``.
        replace_unk: Stored as ``opt.replace_unk``.
        verbose: Stored as ``opt.verbose``.

    Note:
        Unrecognised command-line arguments are tolerated and printed.
    """
    print('Loading ' + modelfile)

    cli = argparse.ArgumentParser(
        description='seq2seq_predict',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(cli)
    opts.translate_opts(cli)
    # parse_known_args: the host program may carry arguments of its own.
    options, leftovers = cli.parse_known_args()
    print('Unknown arguments ', leftovers)

    options.dynamic_dict = dynamic_dict
    options.attn_debug = attn_debug
    options.share_vocab = share_vocab
    options.replace_unk = replace_unk
    options.verbose = verbose

    # A second parser, fed an empty argv, yields the model option defaults.
    defaults_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(defaults_parser)
    model_defaults, _ = defaults_parser.parse_known_args([])

    options.cuda = options.gpu > -1
    if options.cuda:
        torch.cuda.set_device(options.gpu)

    # Timestamped source file name, stored on the options.
    options.src = 'temp_seq2seq_pred_%f.txt' % time.time()
    options.model = modelfile

    print('Loading seq2seq model...')
    # Load the model.
    loaded_fields, loaded_model, loaded_model_opt = \
        onmt.ModelConstructor.load_test_model(options, vars(model_defaults))

    self.opt = options
    self.fields = loaded_fields
    self.model = loaded_model
    self.model_opt = loaded_model_opt
def parse_args():
    """Parse preprocessing options plus the two required template paths.

    Adds a ``Template`` argument group with ``-train_template`` and
    ``-valid_template`` (both required), then seeds torch and runs
    ``check_existing_pt_files`` on the parsed options.

    Returns:
        The parsed option namespace.
    """
    arg_parser = argparse.ArgumentParser(
        description='template_preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    opts.add_md_help_argument(arg_parser)
    opts.preprocess_opts(arg_parser)

    template_group = arg_parser.add_argument_group('Template')
    template_flags = (
        ('-train_template', "Path to the training template"),
        ('-valid_template', "Path to the valid template"),
    )
    for flag, description in template_flags:
        template_group.add_argument(flag, required=True, help=description)

    options = arg_parser.parse_args()
    torch.manual_seed(options.seed)
    check_existing_pt_files(options)
    return options
# -*- coding: utf-8 -*- import onmt import onmt.IO import argparse import torch import opts import codecs parser = argparse.ArgumentParser(description='preprocess.py') opts.add_md_help_argument(parser) # **Preprocess Options** parser.add_argument('-config', help="Read options from this file") parser.add_argument('-data_type', default="text", help="Type of the source input. Options are [text|img].") parser.add_argument('-data_img_dir', default=".", help="Location of source images") parser.add_argument('-train_src', required=True, help="Path to the training source data") parser.add_argument('-train_tgt', required=True, help="Path to the training target data") parser.add_argument('-valid_src', required=True, help="Path to the validation source data")
def main(training=False, fields=None, model=None, opt=None, writer=None,
         step=0, corpus_type="dev", multi_process=False):
    """Translate ``opt.src`` and optionally score the output.

    Two modes:
      * ``training=True``: use the ``fields``, ``model`` and ``opt`` supplied
        by the caller, switch the model to eval mode and write predictions
        to a file named from ``opt.save_model``, ``corpus_type`` and ``step``.
      * ``training=False``: parse translate options from the command line,
        load the model from disk and write predictions to ``opt.output``.

    Args:
        training: Selects the mode described above.
        fields, model, opt: Required (asserted non-None) when ``training``.
        writer: Passed through to the BLEU reporting helpers.
        step: Used in the output file name and passed to the reporters.
        corpus_type: Used in the output file name (``/`` replaced by ``_``)
            and passed to the reporters.
        multi_process: Not read anywhere in this function.

    Returns:
        The value of the last enabled reporting helper, or 0 when ``opt.tgt``
        is falsy or no report option is enabled.
    """
    # NOTE: this local shadows any module-level name ``time`` inside main().
    time = Time()
    if training:
        assert fields is not None
        assert model is not None
        assert opt is not None
        model.eval()
        model.generator.eval()

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        out_file = codecs.open(
            "{}_{}_pred_{}.txt".format(opt.save_model,
                                       corpus_type.replace("/", "_"),
                                       str(step)), "w", "utf-8")
        print("Output file: ", out_file.name)
        # copy_attn is assigned but not read again in this function.
        copy_attn = opt.copy_attn
        # In training mode the training options double as the model options.
        model_opt = opt
    else:
        # Load the model.
        parser = argparse.ArgumentParser(
            description='translate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        opt = parser.parse_args()

        # Model option defaults, obtained by parsing an empty argv.
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
        out_file = codecs.open(opt.output, 'w', 'utf-8')
        # NOTE(review): presumably belongs to this stand-alone branch only
        # (otherwise the ``if opt.tgt`` scoring below is unreachable) - confirm.
        assert opt.tgt is None

    data = onmt.io.build_dataset(fields, opt.src, opt.tgt,
                                 use_filter_pred=False,
                                 ngram=model_opt.ngram)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data, device=opt.gpu,
                                        batch_size=opt.translate_batch_size,
                                        train=False, sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    # NOTE(review): out_file is never closed in this function; presumably
    # translate_single_process flushes/closes it before the file is re-read
    # below - TODO confirm.
    output, pred_score_total, pred_words_total = \
        translate_single_process(opt, model, fields, data, data_iter,
                                 f=out_file)

    outfile_name = out_file.name

    if opt.bpe:
        import subprocess
        # Strip "@@ " from the predictions into "<name>.nonbpe".
        # NOTE(review): the file name is interpolated into a shell command
        # (shell=True); confirm the name can never contain shell
        # metacharacters.
        subprocess.check_output("sed 's/\@\@ //g' < {} > {}".format(
            outfile_name, outfile_name + ".nonbpe"), shell=True)
        outfile_name = outfile_name + ".nonbpe"
    if opt.new_bpe:
        generate_nonbpe(outfile_name)
        outfile_name = outfile_name + ".nonbpe"

    # if writer is not None:
    #     ratio_stats.log_tensorboard(writer, step)
    # _report_score('PRED', pred_score_total, pred_words_total, writer, step, corpus_type)
    metric = 0
    if opt.tgt:
        # _report_score('GOLD', gold_score_total, gold_words_total, writer, step, corpus_type)
        # When several report options are enabled, each overwrites ``metric``;
        # the last one enabled wins.
        if opt.report_single_bleu:
            metric = _report_single_source_bleu(opt, outfile_name, writer,
                                                step, corpus_type)
        if opt.report_multi_bleu:
            metric = _report_multi_source_bleu(outfile_name, writer, step,
                                               corpus_type)
        if opt.report_rouge:
            metric = _report_rouge(opt)

    # if opt.dump_beam:
    #     import json
    #     json.dump(translator.beam_accum,
    #               codecs.open(opt.dump_beam, 'w', 'utf-8'))
    time.timeit(task="Translation Testing")

    return metric
def add_cmdline_args(argparser):
    """Register the markdown-help, model and train options, then parse.

    Args:
        argparser: Parser that receives the options; its ``parse_args()`` is
            invoked afterwards and the result is discarded.
    """
    # opts.py
    registrars = (
        opts.add_md_help_argument,
        opts.model_opts,
        opts.train_opts,
    )
    for register in registrars:
        register(argparser)
    argparser.parse_args()
def get_model_api():
    """Load the model once and return the per-request handler.

    Translate options are parsed from the command line, the checkpoint is
    loaded and a ``Translator`` is built a single time. The returned
    ``model_api`` closure reuses them for every request.

    Returns:
        A function ``model_api(input_data) -> dict``.
    """
    # initialize model once and for all

    # initialize config for translate
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    opt = parser.parse_args()

    # initialize config for model
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha,
                                             opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(model, fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "",
                                           min_length=opt.min_length)

    # The output file was opened (created/truncated) but never written to or
    # closed. Keep the create/truncate side effect, release the handle.
    codecs.open(opt.output, 'w', 'utf-8').close()

    def model_api(input_data):
        """Recognise one handwritten-ink request.

        Args:
            input_data: Mapping with at least the keys ``'id'`` and
                ``'scg_ink'``.

        Returns:
            A dict that always carries ``'id'``. On failure it has
            ``status == 'error'`` (and usually ``'info'``); on success it
            has the status, elapsed time in ``'info'``, the best LaTeX /
            AsciiMath strings and the n-best lists.
        """
        res = {}
        request_id = str(uuid.uuid4())
        res['id'] = input_data['id']
        scgink = input_data['scg_ink']

        try:
            scgink_data = ScgImage(scgink, request_id)
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit. Parsing failures are still reported as before.
            res['status'] = 'error'
            res['info'] = 'bad scgink data'
            return res

        # empty traces due to scgink data
        if not scgink_data.traces:
            res['info'] = 'wrong scgink data'
            res['status'] = 'error'
            return res

        start_t = current_milli_time()

        # convert to png format
        img_file_path = outdir + '/' + request_id + '_input.png'
        scgink_data.save_image(img_file_path)

        # preprocess image
        processed_img = outdir + '/' + request_id + '_preprocessed.png'
        preprocess_args = (
            img_file_path,      # filename
            '.png',             # postfix
            processed_img,
            [600, 60],          # crop_blank_default_size
            (8, 8, 8, 8),       # pad_size
            default_buckets,    # buckets
            2,                  # downsample_ratio
        )
        if not preprocess(preprocess_args):
            res['status'] = 'error'
            return res

        # construct data
        src = 'temp/test.txt'
        src_dir = 'temp'
        # Write the one-line file list directly instead of shelling out to
        # ``echo``. NOTE(review): the path is shared by all requests, so
        # concurrent calls can overwrite each other's list - confirm the
        # server handles one request at a time.
        with open(src, 'w') as listing:
            listing.write(request_id + '_preprocessed.png\n')

        data = onmt.io.build_dataset(fields, opt.data_type,
                                     src, None,
                                     src_dir=src_dir,
                                     sample_rate=opt.sample_rate,
                                     window_size=opt.window_size,
                                     window_stride=opt.window_stride,
                                     window=opt.window,
                                     use_filter_pred=False)

        # Sort batch by decreasing lengths of sentence required by pytorch.
        # sort=False means "Use dataset's sortkey instead of iterator's".
        data_iter = onmt.io.OrderedIterator(
            dataset=data, device=opt.gpu,
            batch_size=opt.batch_size, train=False, sort=False,
            sort_within_batch=True, shuffle=False)

        # Inference
        builder = onmt.translate.TranslationBuilder(
            data, translator.fields,
            opt.n_best, opt.replace_unk, opt.tgt)

        n_best_preds = None
        for batch in data_iter:
            batch_data = translator.translate_batch(batch, data)
            translations = builder.from_batch(batch_data)
            for trans in translations:
                n_best_preds = [" ".join(pred)
                                for pred in trans.pred_sents[:opt.n_best]]

        if not n_best_preds:
            # Nothing was decoded: report an error instead of failing on an
            # undefined name or an empty list.
            res['status'] = 'error'
            res['info'] = 'no translation produced'
            return res

        now_t = current_milli_time()

        # process the output
        n_best_latex = [detokenizer(pred) for pred in n_best_preds]
        n_best_ascii = [latex_asciimath(pred) for pred in n_best_latex]

        # return the output for the api
        # NOTE(review): "succuss" is misspelled but is part of the response
        # contract; left unchanged because clients may compare against it.
        res['status'] = "succuss"
        res['info'] = now_t - start_t
        res['mathml'] = ''
        res['latex'] = n_best_latex[0]
        res['asciimath'] = n_best_ascii[0]
        res['n_best_latex'] = n_best_latex
        res['n_best_ascii'] = n_best_ascii
        app.logger.debug(request_id + "\t" + n_best_latex[0] + "\n")

        return res

    return model_api
def main():
    """Score every unfolded (source, template) pair and select templates.

    Steps:
      1. Parse translate options plus the required ``-templates`` path.
      2. Load the model and add the ``spliter_pos`` field.
      3. Unfold the templates, build the dataset and iterate it in order.
      4. Call ``model.predict_rouge`` on each batch and store the scores
         keyed by the example index.
      5. Write the scores, one per line in index order, to
         ``<opt.output>.score`` and call ``select_templates``.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    group = parser.add_argument_group('Rerank')
    group.add_argument('-templates', required=True,
                       help="Path to the test templates")
    opt = parser.parse_args()

    # Model option defaults, obtained by parsing an empty argv.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        model_utils.load_test_model(opt, dummy_opt.__dict__)
    fields["spliter_pos"] = torchtext.data.Field(
        use_vocab=False, dtype=torch.long, sequential=False)

    # Unfold templates
    src_path, tmp_path = txt_utils.unfold_templates(opt.src, opt.templates)

    # Test data
    data = txt_utils.build_template_dataset(fields, src_path, None, tmp_path,
                                            use_filter_pred=False,
                                            with_pos=True,
                                            dynamic_dict=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        sort_within_batch=True, shuffle=False)

    # Batches are length-sorted internally, so scores are keyed by each
    # example's index and written back in index order below.
    score_dict = {}
    for count, batch in enumerate(data_iter, start=1):
        src = onmt.io.make_features(batch, 'src', 'text')
        predict_score = model.predict_rouge(src, batch.src[1],
                                            batch.spliter_pos)
        for index, score in zip(batch.indices.data, predict_score.data):
            score_dict[int(index)] = float(score)
        if count % 100 == 0:
            print('score {} batches'.format(count))

    # File to write sentences to.
    score_file = opt.output + '.score'
    print('score_file is ' + score_file)
    # Formatted rather than concatenated so that an unset -tgt (None) does
    # not crash this purely informational message.
    print('opt.tgt is {}'.format(opt.tgt))
    print(len(score_dict))
    # The context manager closes the file even if an index is missing from
    # score_dict and the lookup raises.
    with open(score_file, 'w', encoding='utf-8') as out_file:
        for index in range(len(score_dict)):
            print(score_dict[index], file=out_file)

    select_templates(src_path, tmp_path, score_file, opt.output, opt.tgt)
import onmt
import onmt.io
import onmt.Models
import onmt.ModelConstructor
import onmt.modules
from onmt.Utils import use_gpu
import opts

# Module-level command-line parsing for the training script: the options
# are parsed at import time and kept in the global ``opt``.
parser = argparse.ArgumentParser(
    description='train.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# opts.py
opts.add_md_help_argument(parser)
opts.model_opts(parser)
opts.train_opts(parser)

opt = parser.parse_args()

# -1 means "not given"; any other value is copied to both the source and
# the target embedding size.
if opt.word_vec_size != -1:
    opt.src_word_vec_size = opt.word_vec_size
    opt.tgt_word_vec_size = opt.word_vec_size

# Same convention for the layer count: one value sets encoder and decoder.
if opt.layers != -1:
    opt.enc_layers = opt.layers
    opt.dec_layers = opt.layers

# Convenience flag derived from the encoder type.
opt.brnn = (opt.encoder_type == "brnn")

# Python's RNG is seeded only for a positive seed.
if opt.seed > 0:
    random.seed(opt.seed)
def main():
    """Load vocabularies, build the HVAE model and run one validation pass.

    The vocabularies are read from ``test_vocabs.pt`` unless the local
    ``rebuild_vocab`` switch is turned on, in which case they are rebuilt
    from the JSON conversation files. A few look-ups are printed as a
    check, the model is built from the parsed training options, and a
    ``VaeTrainer`` validates it on the validation JSON file.
    """
    rebuild_vocab = False
    if not rebuild_vocab:
        print('load vocab from pt file')
        dicts = torch.load('test_vocabs.pt')
        src_vocab = dicts['src_word2id']
        tgt_vocab = dicts['tgt_word2id']
        tgtwords = dicts['tgt_id2word']
    else:
        trainfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-test_v.json'
        train = pd.read_json(trainfile)
        print('Read training data from: {}'.format(trainfile))

        valfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-val_v.json'
        val = pd.read_json(valfile)
        print('Read validation data from: {}'.format(valfile))

        train_srs = train.context.values.tolist()
        train_tgt = train.replies.values.tolist()
        val_srs = val.context.values.tolist()
        val_tgt = val.replies.values.tolist()

        src_vocab, _ = hierdata.buildvocab(train_srs + val_srs)
        tgt_vocab, tgtwords = hierdata.buildvocab(train_tgt + val_tgt)

    # Sanity checks on the vocabularies.
    print('source vocab size: {}'.format(len(src_vocab)))
    print('source vocab test, bill: {} , {}'.format(
        src_vocab['<pad>'], src_vocab['bill']))
    print('target vocab size: {}'.format(len(tgt_vocab)))
    print('target vocab test, bill: {}, {}'.format(
        tgt_vocab['<pad>'], tgt_vocab['bill']))
    print('target vocat testing:')
    # Round trip word -> id -> word through the target tables.
    for probe in ('<pad>', 'bill', 'service'):
        print('word: {} get :{}'.format(probe, tgtwords[tgt_vocab[probe]]))

    arg_parser = argparse.ArgumentParser(description='train.py')

    # opts.py
    opts.add_md_help_argument(arg_parser)
    opts.model_opts(arg_parser)
    opts.train_opts(arg_parser)

    opt = arg_parser.parse_args()
    # Result is unused; the call is kept because parsing may itself fail.
    dummy_opt = arg_parser.parse_known_args([])[0]

    first_gpu = opt.gpuid[0]
    opt.cuda = first_gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpuid[0])

    checkpoint = opt.model
    print('Building model...')
    model = ModelHVAE.make_base_model(
        opt, src_vocab, tgt_vocab, opt.cuda, checkpoint)

    print(model)
    tally_parameters(model)

    testfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-val_v.json'
    test = pd.read_json(testfile)
    print('Test training data from: {}'.format(testfile))

    test_srs = test.context.values.tolist()
    test_tgt = test.replies.values.tolist()

    test_batch_size = 16
    test_iter = data_util.gen_minibatch(
        test_srs, test_tgt, test_batch_size, src_vocab, tgt_vocab)

    optim = Optim.Optim('adam', 1e-3, 5)
    train_loss = Loss.VAELoss(model.generator, tgt_vocab)
    valid_loss = Loss.VAELoss(model.generator, tgt_vocab)

    # The same iterator is passed in both iterator positions.
    trainer = Trainer.VaeTrainer(
        model, test_iter, test_iter, train_loss, valid_loss, optim)
    valid_stats = trainer.validate()
    print('Validation perplexity: %g' % valid_stats.ppl())
    print('Validation accuracy: %g' % valid_stats.accuracy())