Example #1
    def __init__(self, model, lang, gpu=False, wx=False):
        self.lang = lang
        self.is_ip_wx = wx
        parser = argparse.ArgumentParser(
            description='transliterate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        self.opt = parser.parse_args()
        self.trans_dict = dict()
        self.broken_words = dict()
        file_path = os.path.dirname(os.path.abspath(__file__))

        if self.lang == 'hin':
            self.to_utf = WXC(order='wx2utf', lang='hin')
            self.non_alpha = re.compile(u'([^a-zA-Z]+)')
            self.alpha_letters = set(string.ascii_letters)
            self.com_abbr = {
                'b': ['BI', 'be'],
                'd': ['xI', 'xe'],
                'g': ['jI'],
                'k': ['ke', 'ki', 'kI'],
                'h': ['hE', 'hEM'],
                'ha': ['hE', 'hEM'],
                'n': ['ina', 'ne'],
                'm': ['meM', 'mEM'],
                'p': ['pe'],
                'q': ['kyoM'],
                'r': ['Ora', 'ora'],
                's': ['isa', 'se'],
                'y': ['ye']
            }

        if self.lang == 'eng':
            self.non_alpha = re.compile(u'([^a-z]+)')
            self.alpha_letters = set(string.ascii_letters[:26])
            with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
                self.com_abbr = {}
                for line in fp:
                    k, v = line.split()
                    self.com_abbr[k] = v.split('|')

        # A throwaway parser registers the model options so the checkpoint
        # loader receives the defaults it expects.
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]
        if gpu:
            self.opt.gpu = 0

        self.opt.cuda = self.opt.gpu > -1
        self.opt.model = model
        self.opt.n_best = 5
        self.opt.lang = lang
        if self.opt.cuda:
            torch.cuda.set_device(self.opt.gpu)

        # Load the model.
        self.fields, self.model, self.model_opt = onmt.ModelConstructor.load_test_model(
            self.opt, dummy_opt.__dict__)
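
The English branch above fills com_abbr from a flat file of "key v1|v2" lines. A self-contained sketch of that parsing loop, with two made-up sample entries standing in for extras/COMMON_ABBR.eng:

import io

# Sketch of the COMMON_ABBR parsing loop from the constructor above.
# The sample entries are illustrative; the real file ships with the project.
sample = io.StringIO("b be|bee\nr are\n")
com_abbr = {}
for line in sample:
    k, v = line.split()
    com_abbr[k] = v.split('|')
print(com_abbr)  # {'b': ['be', 'bee'], 'r': ['are']}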
Example #2
def parse_args():
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.preprocess_opts(parser)
    opt = parser.parse_args()
    return opt
Example #3
def parse_args():
    parser = argparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.preprocess_opts(parser)

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    return opt
Example #4
def parse_args():
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.preprocess_opts(parser)
    opts.train_opts(parser)

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    # Seed Python's RNG as well; torch was already seeded above.
    random.seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    if opt.exp_host != "":
        from pycrayon import CrayonClient

        cc = CrayonClient(hostname=opt.exp_host)

        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)

    return opt
Example #5
def parse_args():
    parser = argparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.preprocess_opts(parser)

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    check_existing_pt_files(opt)

    return opt
Example #6
def parse_args():
    parser = configargparse.ArgumentParser(
        description='preprocess.py',
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.config_opts(parser)
    opts.preprocess_opts(parser)

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    check_existing_pt_files(opt)

    return opt
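
Unlike the argparse variants in the earlier examples, this one uses configargparse, so the same options can also come from a config file registered via opts.config_opts. A minimal standalone sketch of that pattern, assuming nothing from the opts module (the -seed option here is illustrative):

import configargparse

# Standalone configargparse sketch: values may come from the command line
# or from a file passed via -config. The -seed option is illustrative.
parser = configargparse.ArgumentParser(
    description='sketch.py',
    formatter_class=configargparse.ArgumentDefaultsHelpFormatter)
parser.add('-config', is_config_file=True, help='config file path')
parser.add('-seed', type=int, default=3435, help='random seed')
opt = parser.parse_args([])
print(opt.seed)  # 3435 unless overridden on the CLI or in a config file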
Example #7
    def __init__(
            self,
            modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_97.30_ppl_1.41_e7.pt',
            dynamic_dict=True,
            attn_debug=True,
            share_vocab=True,
            replace_unk=True,
            verbose=True):
        #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_78.18_ppl_9.60_e4.pt'):
        #def __init__(self, modelfile='/data1/struct2text/s2s_models_v3/model_softmax_acc_82.37_ppl_6.28_e8.pt'):
        #def __init__(self, modelfile='/data1/data1/Anirban/structure2text/model_softmax_1_acc_84.10_ppl_2.13_e1.pt'):
        print('Loading ' + modelfile)
        parser = argparse.ArgumentParser(
            description='seq2seq_predict',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)
        #opt = parser.parse_args()
        opt, unknown = parser.parse_known_args()
        print('Unknown arguments ', unknown)
        opt.dynamic_dict = dynamic_dict
        opt.attn_debug = attn_debug
        opt.share_vocab = share_vocab
        opt.replace_unk = replace_unk
        opt.verbose = verbose
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        opt.src = 'temp_seq2seq_pred_%f.txt' % time.time()
        opt.model = modelfile

        print('Loading seq2seq model...')
        # Load the model.
        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

        self.opt = opt
        self.fields = fields
        self.model = model
        self.model_opt = model_opt
Example #8
def parse_args():
    parser = argparse.ArgumentParser(
        description='template_preprocess.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    opts.add_md_help_argument(parser)
    opts.preprocess_opts(parser)
    group = parser.add_argument_group('Template')
    group.add_argument('-train_template', required=True,
                       help="Path to the training template")
    group.add_argument('-valid_template', required=True,
                       help="Path to the valid template")

    opt = parser.parse_args()
    torch.manual_seed(opt.seed)

    check_existing_pt_files(opt)

    return opt
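
The Template group above rides on top of the standard preprocess options, but the grouping itself is plain argparse. A minimal self-contained sketch of the same move, passing argv explicitly so it runs without the opts module:

import argparse

# Sketch of extending a parser with a custom option group, mirroring the
# Template group above; argv is supplied inline so the snippet runs as-is.
parser = argparse.ArgumentParser(description='template_preprocess.py')
group = parser.add_argument_group('Template')
group.add_argument('-train_template', required=True,
                   help="Path to the training template")
opt = parser.parse_args(['-train_template', 'templates/train.txt'])
print(opt.train_template)  # templates/train.txt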
Example #9
# -*- coding: utf-8 -*-

import onmt
import onmt.IO
import argparse
import torch
import opts
import codecs

parser = argparse.ArgumentParser(description='preprocess.py')
opts.add_md_help_argument(parser)

# **Preprocess Options**
parser.add_argument('-config', help="Read options from this file")

parser.add_argument('-data_type',
                    default="text",
                    help="Type of the source input. Options are [text|img].")
parser.add_argument('-data_img_dir',
                    default=".",
                    help="Location of source images")

parser.add_argument('-train_src',
                    required=True,
                    help="Path to the training source data")
parser.add_argument('-train_tgt',
                    required=True,
                    help="Path to the training target data")
parser.add_argument('-valid_src',
                    required=True,
                    help="Path to the validation source data")
Example #10
def main(training=False,
         fields=None,
         model=None,
         opt=None,
         writer=None,
         step=0,
         corpus_type="dev",
         multi_process=False):
    time = Time()
    if training:
        assert fields is not None
        assert model is not None
        assert opt is not None
        model.eval()
        model.generator.eval()
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        out_file = codecs.open(
            "{}_{}_pred_{}.txt".format(opt.save_model,
                                       corpus_type.replace("/", "_"),
                                       str(step)), "w", "utf-8")
        print("Output file: ", out_file.name)
        copy_attn = opt.copy_attn
        model_opt = opt
    else:
        # Load the model.
        parser = argparse.ArgumentParser(
            description='translate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        opt = parser.parse_args()
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]

        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        fields, model, model_opt = \
            onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)

        out_file = codecs.open(opt.output, 'w', 'utf-8')

    assert opt.tgt is None
    data = onmt.io.build_dataset(fields,
                                 opt.src,
                                 opt.tgt,
                                 use_filter_pred=False,
                                 ngram=model_opt.ngram)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".

    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.translate_batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)
    output, pred_score_total, pred_words_total = \
            translate_single_process(opt, model, fields, data, data_iter, f=out_file)
    outfile_name = out_file.name

    if opt.bpe:
        import subprocess
        subprocess.check_output("sed 's/\@\@ //g' < {} > {}".format(
            outfile_name, outfile_name + ".nonbpe"),
                                shell=True)
        outfile_name = outfile_name + ".nonbpe"
    if opt.new_bpe:
        generate_nonbpe(outfile_name)
        outfile_name = outfile_name + ".nonbpe"
    # if writer is not None:
    #     ratio_stats.log_tensorboard(writer, step)
    # _report_score('PRED', pred_score_total, pred_words_total, writer, step, corpus_type)
    metric = 0
    if opt.tgt:
        # _report_score('GOLD', gold_score_total, gold_words_total, writer, step, corpus_type)
        if opt.report_single_bleu:
            metric = _report_single_source_bleu(opt, outfile_name, writer,
                                                step, corpus_type)
        if opt.report_multi_bleu:
            metric = _report_multi_source_bleu(outfile_name, writer, step,
                                               corpus_type)
        if opt.report_rouge:
            metric = _report_rouge(opt)

    # if opt.dump_beam:
    #     import json
    #     json.dump(translator.beam_accum,
    #               codecs.open(opt.dump_beam, 'w', 'utf-8'))

    time.timeit(task="Translation Testing")
    return metric
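
The opt.bpe branch above shells out to sed to strip subword markers; '@@ ' is the continuation marker written by subword-nmt. An equivalent pure-Python sketch:

# Pure-Python equivalent of the sed command in the opt.bpe branch above:
# drop the subword-nmt '@@ ' continuation markers to undo BPE segmentation.
def remove_bpe(line):
    return line.replace('@@ ', '')

print(remove_bpe('tr@@ ans@@ late this'))  # 'translate this'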
Example #11
File: opennmt.py  Project: jsedoc/ParlAI
def add_cmdline_args(argparser):
    # opts.py
    opts.add_md_help_argument(argparser)
    opts.model_opts(argparser)
    opts.train_opts(argparser)
    opt = argparser.parse_args()
Example #12
def get_model_api():
    """Returns lambda function for api"""

    # initialize model once and for all

    # initialize config for translate
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    opt = parser.parse_args()

    # initialize config for model
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translator = onmt.translate.Translator(model,
                                           fields,
                                           beam_size=opt.beam_size,
                                           n_best=opt.n_best,
                                           global_scorer=scorer,
                                           max_length=opt.max_length,
                                           copy_attn=model_opt.copy_attn,
                                           cuda=opt.cuda,
                                           beam_trace=opt.dump_beam != "",
                                           min_length=opt.min_length)

    # File to write sentences to.
    out_file = codecs.open(opt.output, 'w', 'utf-8')

    #    hw_count = 0
    #    start_0 = current_milli_time()

    def model_api(input_data):
        """
        Args:
            input_data: submitted to the API, json string

        Returns:
            output_data: after some transformation, to be
                returned to the API

        """

        # process input
        global hw_count
        global start_0
        res = {}
        request_id = str(uuid.uuid4())
        res['id'] = input_data['id']
        scgink = input_data['scg_ink']
        try:
            scgink_data = ScgImage(scgink, request_id)
        except Exception:
            res['status'] = 'error'
            res['info'] = 'bad scgink data'
            return res
        # empty traces due to scgink data
        if not scgink_data.traces:
            res['info'] = 'wrong scgink data'
            res['status'] = 'error'
            return res

        start_t = current_milli_time()

        img_file_path = outdir + '/' + request_id + '_input.png'
        #convert to png format
        scgink_data.save_image(img_file_path)

        #preprocess image
        # Preprocessing parameters for the rendered handwriting image.
        filename = img_file_path
        postfix = '.png'
        processed_img = outdir + '/' + request_id + '_preprocessed.png'
        crop_blank_default_size = [600, 60]
        pad_size = (8, 8, 8, 8)
        buckets = default_buckets
        downsample_ratio = 2

        args = (filename, postfix, processed_img, crop_blank_default_size,
                pad_size, buckets, downsample_ratio)
        if not preprocess(args):
            res['status'] = 'error'
            return res

        # construct data
        os.system('echo ' + request_id + '_preprocessed.png ' +
                  '>temp/test.txt')
        src = 'temp/test.txt'
        src_dir = 'temp'
        #print "src=", src
        #print "src_dir=", src_dir
        data = onmt.io.build_dataset(fields,
                                     opt.data_type,
                                     src,
                                     None,
                                     src_dir=src_dir,
                                     sample_rate=opt.sample_rate,
                                     window_size=opt.window_size,
                                     window_stride=opt.window_stride,
                                     window=opt.window,
                                     use_filter_pred=False)

        # Sort batch by decreasing lengths of sentence required by pytorch.
        # sort=False means "Use dataset's sortkey instead of iterator's".
        data_iter = onmt.io.OrderedIterator(dataset=data,
                                            device=opt.gpu,
                                            batch_size=opt.batch_size,
                                            train=False,
                                            sort=False,
                                            sort_within_batch=True,
                                            shuffle=False)

        # Inference
        builder = onmt.translate.TranslationBuilder(data, translator.fields,
                                                    opt.n_best,
                                                    opt.replace_unk, opt.tgt)

        cnt = 0
        for batch in data_iter:
            batch_data = translator.translate_batch(batch, data)
            translations = builder.from_batch(batch_data)

            for trans in translations:
                cnt += 1
                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:opt.n_best]
                ]

        now_t = current_milli_time()
        #hw_count = hw_count + 1
        #if hw_count %100 == 0 :
        #    app.logger.debug( "last 100 "+(now_t - start_0 ))
        #    start_0 = now_t
        #    app.logger.debug(  "time spent "+( now_t -start_t))

        # process the output
        n_best_latex = []
        for pred in n_best_preds:
            n_best_latex.append(detokenizer(pred))

        n_best_ascii = []
        for pred in n_best_latex:
            n_best_ascii.append(latex_asciimath(pred))

        # return the output for the api
        res['status'] = "succuss"
        res['info'] = now_t - start_t
        res['mathml'] = ''
        res['latex'] = n_best_latex[0]
        res['asciimath'] = n_best_ascii[0]
        res['n_best_latex'] = n_best_latex
        res['n_best_ascii'] = n_best_ascii
        app.logger.debug(request_id + "\t" + n_best_latex[0] + "\n")

        return res

    return model_api
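
get_model_api is a closure factory: the expensive setup (option parsing, model loading, translator construction) runs once, and the returned model_api reuses that state on every request. A minimal self-contained sketch of the pattern, with a stub dict standing in for the ONMT setup:

# Minimal sketch of the closure-factory pattern behind get_model_api:
# heavy setup runs once; the returned callable reuses it per request.
def get_api():
    model = {"loaded": True}  # stub for the one-time ONMT model setup

    def api(input_data):
        return {"id": input_data["id"], "status": "success",
                "model_ready": model["loaded"]}

    return api

api = get_api()
print(api({"id": "req-1"}))  # {'id': 'req-1', 'status': 'success', 'model_ready': True}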
Example #13
def main():
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    group = parser.add_argument_group('Rerank')
    group.add_argument('-templates',
                       required=True,
                       help="Path to the test templates")
    opt = parser.parse_args()
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the model.
    fields, model, model_opt = \
        model_utils.load_test_model(opt, dummy_opt.__dict__)

    fields["spliter_pos"] = torchtext.data.Field(use_vocab=False,
                                                 dtype=torch.long,
                                                 sequential=False)

    # Unfold templates
    src_path, tmp_path = txt_utils.unfold_templates(opt.src, opt.templates)

    # Test data
    data = txt_utils.build_template_dataset(fields,
                                            src_path,
                                            None,
                                            tmp_path,
                                            use_filter_pred=False,
                                            with_pos=True,
                                            dynamic_dict=False)

    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".
    data_iter = onmt.io.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        sort_within_batch=True,
                                        shuffle=False)

    count = 0
    #offset=0
    #scores=[]
    score_dict = {}
    for batch in data_iter:
        #print(batch.indices)
        #index=batch.indices-offset
        src = onmt.io.make_features(batch, 'src', 'text')
        predict_score = model.predict_rouge(src, batch.src[1],
                                            batch.spliter_pos)
        #ordered_score=predict_score[index].data
        #scores.extend(ordered_score)
        #offset+=index.size(0)
        for index, score in zip(batch.indices.data, predict_score.data):
            score_dict[int(index)] = float(score)
        count += 1
        if count % 100 == 0:
            print('score {} batches'.format(count))
        #if count>10: break

    # File to write sentences to.
    score_file = opt.output + '.score'
    print('score_file is ' + score_file)
    print('opt.tgt is ' + opt.tgt)
    out_file = open(score_file, 'w', encoding='utf-8')
    print(len(score_dict))
    for index in range(len(score_dict)):
        print(score_dict[index], file=out_file)
    out_file.close()
    select_templates(src_path, tmp_path, score_file, opt.output, opt.tgt)
Example #14
import argparse
import random

import onmt
import onmt.io
import onmt.Models
import onmt.ModelConstructor
import onmt.modules
from onmt.Utils import use_gpu
import opts


parser = argparse.ArgumentParser(
    description='train.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# opts.py
opts.add_md_help_argument(parser)
opts.model_opts(parser)
opts.train_opts(parser)

opt = parser.parse_args()
if opt.word_vec_size != -1:
    opt.src_word_vec_size = opt.word_vec_size
    opt.tgt_word_vec_size = opt.word_vec_size

if opt.layers != -1:
    opt.enc_layers = opt.layers
    opt.dec_layers = opt.layers

opt.brnn = (opt.encoder_type == "brnn")
if opt.seed > 0:
    random.seed(opt.seed)
Example #15
def main():
    rebuild_vocab = False
    if rebuild_vocab:
        trainfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-test_v.json'
        train = pd.read_json(trainfile)
        print('Read training data from: {}'.format(trainfile))

        valfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-val_v.json'
        val = pd.read_json(valfile)
        print('Read validation data from: {}'.format(valfile))
        train_srs = train.context.values.tolist()
        train_tgt = train.replies.values.tolist()
        val_srs = val.context.values.tolist()
        val_tgt = val.replies.values.tolist()
        src_vocab, _ = hierdata.buildvocab(train_srs + val_srs)
        tgt_vocab, tgtwords = hierdata.buildvocab(train_tgt + val_tgt)

    else:
        print('load vocab from pt file')
        dicts = torch.load('test_vocabs.pt')
        #tgt = pd.read_json('./tgt.json')
        #src = pd.read_json('./src.json')
        src_vocab = dicts['src_word2id']
        tgt_vocab = dicts['tgt_word2id']
        tgtwords = dicts['tgt_id2word']
        print('source vocab size: {}'.format(len(src_vocab)))
        print('source vocab test, bill: {} , {}'.format(
            src_vocab['<pad>'], src_vocab['bill']))
        print('target vocab size: {}'.format(len(tgt_vocab)))
        print('target vocab test, bill: {}, {}'.format(tgt_vocab['<pad>'],
                                                       tgt_vocab['bill']))
        print('target vocab testing:')
        print('word: <pad> get :{}'.format(tgtwords[tgt_vocab['<pad>']]))
        print('word: bill get :{}'.format(tgtwords[tgt_vocab['bill']]))
        print('word: service get :{}'.format(tgtwords[tgt_vocab['service']]))

    parser = argparse.ArgumentParser(description='train.py')

    # opts.py
    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.train_opts(parser)
    opt = parser.parse_args()

    dummy_opt = parser.parse_known_args([])[0]

    opt.cuda = opt.gpuid[0] > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpuid[0])

    checkpoint = opt.model
    print('Building model...')
    model = ModelHVAE.make_base_model(
        opt, src_vocab, tgt_vocab, opt.cuda, checkpoint
    )  # TODO: integrate the two embedding layers
    print(model)
    tally_parameters(model)

    testfile = '/D/home/lili/mnt/DATA/convaws/convdata/conv-val_v.json'
    test = pd.read_json(testfile)
    print('Test training data from: {}'.format(testfile))

    test_srs = test.context.values.tolist()
    test_tgt = test.replies.values.tolist()

    test_batch_size = 16
    test_iter = data_util.gen_minibatch(test_srs, test_tgt, test_batch_size,
                                        src_vocab, tgt_vocab)

    tgtvocab = tgt_vocab

    optim = Optim.Optim('adam', 1e-3, 5)
    train_loss = Loss.VAELoss(model.generator, tgtvocab)
    valid_loss = Loss.VAELoss(model.generator, tgtvocab)
    trainer = Trainer.VaeTrainer(model, test_iter, test_iter, train_loss,
                                 valid_loss, optim)
    valid_stats = trainer.validate()
    print('Validation perplexity: %g' % valid_stats.ppl())
    print('Validation accuracy: %g' % valid_stats.accuracy())