Exemplo n.º 1
0
    def processLines(self, path_to_file, opt, dummy_opt):
        """Encode a corpus file and collect (encoding, label) samples.

        Each line of *path_to_file* must start with two integers: the index
        of the word whose encoder state is wanted, and a semantic label.
        For every line, the encoder output at that word position is appended
        to ``self.dataset`` as ``{'x': tensor, 'y': label}``.

        Args:
            path_to_file: UTF-8 text file; also fed to ``onmt.IO.ONMTDataset``.
            opt: options namespace providing at least ``gpu`` and
                ``batch_size``.
            dummy_opt: default options forwarded to ``onmt.Translator``.
        """
        # A non-negative gpu index means CUDA; -1 selects the CPU.
        opt.cuda = opt.gpu > -1

        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        translator = onmt.Translator(opt, dummy_opt.__dict__)

        # First pass: read the (word index, semantic label) pair off each line.
        words_idx = []
        sems_idx = []
        with codecs.open(path_to_file, "r", "utf-8") as corpus_file:
            for line in corpus_file:
                wordindex, semindex = int(line.split()[0]), int(
                    line.split()[1])
                words_idx.append(wordindex)
                sems_idx.append(semindex)

        # Second pass: run the encoder over the same file.  sort=False and
        # shuffle=False keep iteration order aligned with the first pass.
        # NOTE(review): indexing words_idx/sems_idx by the batch counter is
        # only correct when opt.batch_size == 1 -- confirm with callers.
        data = onmt.IO.ONMTDataset(path_to_file, None, translator.fields, None)
        train_data = onmt.IO.OrderedIterator(dataset=data,
                                             device=opt.gpu,
                                             batch_size=opt.batch_size,
                                             train=False,
                                             sort=False,
                                             shuffle=False)

        for i, batch in enumerate(train_data):
            word_idx = words_idx[i]
            encodings = translator.encode(batch, data)[word_idx]

            sample = {'x': encodings.data, 'y': sems_idx[i]}
            self.dataset.append(sample)
Exemplo n.º 2
0
    def __init__(self, model_path, gpu_id=1):
        """Load an OpenNMT translator for *model_path* by synthesising a
        ``translate.py``-style command line and parsing it in-process.

        Args:
            model_path: path to the trained ``.pt`` checkpoint.
            gpu_id: CUDA device index passed as ``-gpu``; ``-1`` means CPU.
        """
        parser = argparse.ArgumentParser(description='translate.py')
        parser.add_argument('-model', required=True,
                            help='Path to model .pt file')
        parser.add_argument(
            '-src', required=True,
            help='Source sequence to decode (one line per sequence)')
        parser.add_argument('-src_img_dir',   default="",
                            help='Source image directory')
        parser.add_argument('-tgt',
                            help='True target sequence (optional)')
        parser.add_argument('-output', default='pred.txt',
                            help="""Path to output the predictions (each line will
                            be the decoded sequence""")
        parser.add_argument('-beam_size',  type=int, default=5,
                            help='Beam size')
        parser.add_argument('-batch_size', type=int, default=30,
                            help='Batch size')
        parser.add_argument('-max_sent_length', type=int, default=100,
                            help='Maximum sentence length.')
        parser.add_argument('-replace_unk', action="store_true",
                            help="""Replace the generated UNK tokens with the
                            source token that had highest attention weight. If
                            phrase_table is provided, it will lookup the
                            identified source token and give the corresponding
                            target token. If it is not provided (or the
                            identified source token does not exist in the
                            table) then it will copy the source token""")
        parser.add_argument(
            '-verbose', action="store_true",
            help='Print scores and predictions for each sentence')
        parser.add_argument('-attn_debug', action="store_true",
                            help='Print best attn for each word')
        parser.add_argument('-dump_beam', type=str, default="",
                            help='File to dump beam information to.')

        parser.add_argument('-n_best', type=int, default=1,
                            help="""If verbose is set, will output the n_best
                            decoded sentences""")

        parser.add_argument('-gpu', type=int, default=-1,
                            help="Device to run on")
        # options most relevant to summarization
        parser.add_argument('-dynamic_dict', action='store_true',
                            help="Create dynamic dictionaries")
        parser.add_argument('-share_vocab', action='store_true',
                            help="Share source and target vocabulary")
        # Alpha and Beta values for Google Length + Coverage penalty
        # Described here: https://arxiv.org/pdf/1609.08144.pdf, Section 7
        parser.add_argument('-alpha', type=float, default=0.0,
                            help="""Google NMT length penalty parameter
                            (higher = longer generation)""")
        parser.add_argument('-beta', type=float, default=0.0,
                            help="""Coverage penalty parameter""")

        # Parse a synthetic argv instead of sys.argv.  -src/-tgt/-output are
        # required by the parser but unused here, so /tmp placeholders are fed.
        opt = parser.parse_args(( '-model %s -src /tmp/a -tgt /tmp/b -output /tmp/c -gpu %d -verbose -beam_size 5 -batch_size 1 -n_best 5 -replace_unk' % (model_path, gpu_id)).split()) # noqa
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.translator = onmt.Translator(opt)
Exemplo n.º 3
0
    def __init__(self, opt):
        """Set up BPE segmentation, the translator and the BosonNLP client.

        Expects ``opt`` to provide ``seprator`` (sic), ``bpe_codes``,
        ``cuda`` and ``gpu``.
        """
        self.opt = opt
        # Separator token plus a trailing space, used when joining pieces.
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)

        self.translator = onmt.Translator(opt)

        # NOTE(review): hard-coded third-party API token committed to source;
        # move this credential to configuration or an environment variable.
        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")
Exemplo n.º 4
0
    def __init__(self, opt):
        """Set up BPE segmentation, Moses (de)tokenizers and the translator."""
        self.opt = opt
        # Separator token plus a trailing space, used when joining pieces.
        self.sep = opt.seprator + " "

        # Pin the CUDA device before any model weights are loaded.
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        codes_file = codecs.open(opt.bpe_codes, 'r', encoding="UTF-8")
        self.bpe = BPE(codes_file, opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)
Exemplo n.º 5
0
def translate(model, src, tgt, src_dict, tgt_dict, beam_size=10):
    """Translate *src* with *model*, write 1-best predictions to a temp file
    and return the first output line of ``multi-bleu.perl`` scored against
    *tgt*.

    Mutates the module-level ``opt`` (beam size, n_best, replace_unk) and
    leaves *model* back in training mode with the decoder attention mask
    cleared.
    """
    opt.beam_size = beam_size
    opt.n_best = 1
    opt.replace_unk = True

    def addone(f):
        # Yield every line, then a final None sentinel so the caller can
        # flush the last (possibly short) batch.
        for line in f:
            yield line
        yield None

    translator = onmt.Translator(opt, model, src_dict, tgt_dict)

    srcBatch, tgtBatch = [], []
    pred_list = []
    out_name = 'tmp/' + opt.save_model.split('/')[-1] + '.tmp'

    # Context managers guarantee the handles are closed even if translation
    # raises (the original leaked them on error and never closed `src`).
    with codecs.open(src, 'r', 'utf-8') as srcF, \
            codecs.open(tgt, 'r', 'utf-8') as tgtF, \
            codecs.open(out_name, 'w', 'utf-8') as out:
        for line in addone(srcF):
            if line is not None:
                srcBatch += [line.split()]
                tgtBatch += [tgtF.readline().split()]

                if len(srcBatch) < opt.batch_size:
                    continue
            else:
                # at the end of file, check last batch
                if len(srcBatch) == 0:
                    break

            predBatch, _, _ = translator.translate(srcBatch, tgtBatch)
            pred_list += predBatch

            srcBatch, tgtBatch = [], []

        for pred in pred_list:
            out.write(' '.join(pred[0]) + '\n')

    # Argument-list form avoids shell quoting/injection issues with paths;
    # stdin replaces the original "< out_name" shell redirection.
    with open(out_name, 'rb') as scored:
        bleu_results = subprocess.Popen(
            ['perl', '-X', 'multi-bleu.perl', tgt],
            stdin=scored,
            stdout=subprocess.PIPE).stdout.readline()

    model.train()
    model.decoder.attn.applyMask(None)

    return str(bleu_results)
def translate():
    """Translate every line of ``opt.src`` in batches and write the 1-best
    hypotheses to ``opt.output``, logging progress every 1024 sentences.

    Relies on the module-level ``parser``, ``addone`` and ``onmt``.
    """
    logging.info('Translating ...')
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)
    outF = codecs.open(opt.output, 'w', encoding='utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    for line in addone(codecs.open(opt.src, encoding='utf-8')):
        if line is not None:
            # The original also incremented `count` here, so every sentence
            # was counted twice (once when read, once when written); only
            # completed translations are counted now, in the write loop.
            srcTokens = line.split()
            srcBatch += [srcTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)

        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

        # Progress log; fires only when the batch lands exactly on a
        # multiple of 1024.
        if count % 1024 == 0:
            logging.info('Translated {} sentences'.format(count))
        srcBatch, tgtBatch = [], []
Exemplo n.º 7
0
    def __init__(self, generator, tgt_vocab, smoothing_epsilon, aux_checkpoint):
        """Loss mixing standard NLL with a term from an auxiliary model.

        Args:
            generator: output projection forwarded to the parent loss class.
            tgt_vocab: target vocabulary; its size fixes the class-weight
                vector below.
            smoothing_epsilon: mixing ratio between the normal cross entropy
                and the auxiliary-model (LM) cross entropy.
            aux_checkpoint: checkpoint object used to build the auxiliary
                translator.
        """
        super(NMTKLDivNMTLossCompute, self).__init__(generator, tgt_vocab)
        self.copy_attn = False
        # Zero weight on the padding index so it never contributes to the loss.
        weight = torch.ones(len(tgt_vocab))
        weight[self.padding_idx] = 0

        # standard NLL loss term:
        self.criterion0 = nn.NLLLoss(weight, size_average=False)

        # ratio between normal cross entropy and LM cross entropy
        self.smoothing_epsilon = smoothing_epsilon

        # Initialise the auxiliary model.
        # The first argument of onmt.Translator just needs *.model and *.gpu
        OptModel = namedtuple('OptModel', ['model', 'gpu'])
        opt_model = OptModel(model=aux_checkpoint, gpu=0)  # TODO how do we know it is gpu 0 ???????????
        self.translator = onmt.Translator(opt_model, dict())

        # # ONLY USED FOR DEBUG
        self.debugLangModelNLL = nn.NLLLoss(weight, size_average=False)
def main():
    """Sample ``opt.num_pts`` sequences from the model and write the best
    hypothesis per sample to ``opt.output``.

    Relies on the module-level ``parser`` and ``onmt``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    count = 0

    # Draw samples from the model instead of translating a source file.
    predBatch, predScore = translator.sample(opt.num_pts)

    #predScoreTotal += sum(score[0] for score in predScore)
    predWordsTotal += sum(len(x[0]) for x in predBatch)
    #if tgtF is not None:
    #    goldScoreTotal += sum(goldScore)
    #    goldWordsTotal += sum(len(x) for x in tgtBatch)

    for b in range(len(predBatch)):
        count += 1
        outF.write(" ".join(predBatch[b][0]) + '\n')
        outF.flush()

        if opt.verbose:
            #srcSent = ' '.join(srcBatch[b])
            #if translator.tgt_dict.lower:
            #    srcSent = srcSent.lower()
            #print('SENT %d: %s' % (count, srcSent))
            print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
            print("PRED SCORE: %.4f" % predScore[b][0])

            if opt.n_best > 1:
                print('\nBEST HYP:')
                for n in range(opt.n_best):
                    print("[%.4f] %s" % (predScore[b][n], " ".join(predBatch[b][n])))

            print('')
def main():
    """Load the responder model and serve it behind Gunicorn.

    Populates the module-level ``translator`` and ``args`` globals that the
    request handlers use.
    """
    global translator
    global args

    args = parse_arguments()

    print("Loading responder model.")
    print(args.model)
    # The existence check is advisory only; loading proceeds either way.
    if os.path.isfile(args.model):
        print("model file exists")
    else:
        print("OH NO MODEL FILE DOESN'T EXIST")

    args.cuda = args.gpu > -1
    if args.cuda:
        torch.cuda.set_device(args.gpu)

    translator = onmt.Translator(args)

    print("Starting service.")
    # listen_host may be "host" or "host:port"; the default port is 5000.
    hostport = args.listen_host.split(':')
    if len(hostport) == 2:
        port = int(hostport[1])
    else:
        port = 5000
    options = {
        'bind': '{}:{}'.format(hostport[0], port),
        'threads': args.threads,
        'workers': args.workers
    }
    # Optional statsd / debug settings are only added when configured.
    if args.statsd_host:
        options['statsd_host'] = args.statsd_host
    if args.prefix_statsd:
        options['statsd_prefix'] = args.prefix_statsd
    if args.debug:
        options['log_level'] = 'DEBUG'
    GunicornApplication(app, options).run()
Exemplo n.º 10
0
    def ask(self):
        """Interactive loop: read a sentence, translate it, and print the
        n-best hypotheses until an empty line is entered.
        """
        opt = parser.parse_args()
        opt.cuda = opt.gpu > -1
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)

        translator = onmt.Translator(opt)
        predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
        srcBatch, tgtBatch = [], []

        if opt.dump_beam != "":
            translator.initBeamAccum()

        while True:
            src = input('\n>')
            # self.encode presumably maps rare words to UNK placeholders
            # (see the UNK print below) -- TODO confirm against its definition.
            src_unk = self.encode(src)
            srcTokens = src_unk.split()
            srcBatch += [srcTokens]

            # Empty input terminates the loop; the append above is harmless
            # because we break before translating.
            if not src:
                break
            predBatch, predScore, goldScore = translator.translate(srcBatch,
                                                                   tgtBatch)

            if opt.dump_beam:
                # Reset the beam accumulator every turn.
                translator.initBeamAccum()

            pred_unks = [" ".join(predBatch[0][n]) for n in range(opt.n_best)]
            preds = [self.decode(src, src_unk, pred_unks[n]) for n in range(opt.n_best)]
            print('\nUNK: {}'.format(src_unk))
            for n in range(opt.n_best):
                print('BEST {}: \n {} \n {}'.format(n+1, pred_unks[n], preds[n]))
            srcBatch, tgtBatch = [], []
def init_translate_model(opt, dummy_opt):
    """Construct and return an ``onmt.Translator`` for the given options."""
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    return translator
def main():
    """Translate ``opt.src`` batch-by-batch, writing 1-best output to
    ``opt.output``; verbose diagnostics go to fd 1 as raw UTF-8 bytes.

    Relies on the module-level ``parser``, ``onmt``, ``addone`` and
    ``reportScore``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = codecs.open(opt.output, 'w', 'utf-8')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = codecs.open(opt.tgt, 'r', 'utf-8') if opt.tgt else None

    # json is imported lazily; it is only needed again at the final dump,
    # which is guarded by the same opt.dump_beam condition.
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    for line in addone(codecs.open(opt.src, 'r', 'utf-8')):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, goldScore, attn, src \
            = translator.translate(srcBatch, tgtBatch)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            try:
                # python2
                outF.write(
                    " ".join([i.decode('utf-8')
                              for i in predBatch[b][0]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                # os.write with pre-encoded bytes avoids console-encoding
                # errors that print() can hit on non-ASCII output.
                os.write(1, bytes('SENT %d: %s\n' % (count, srcSent), 'UTF-8'))
                os.write(
                    1,
                    bytes('PRED %d: %s\n' % (count, " ".join(predBatch[b][0])),
                          'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    os.write(
                        1, bytes('GOLD %d: %s\n' % (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(
                            1,
                            bytes(
                                "[%.4f] %s\n" %
                                (predScore[b][n], " ".join(predBatch[b][n])),
                                'UTF-8'))

                if opt.attn_debug:
                    print('')
                    # NOTE(review): srcTokens holds the LAST line read, not
                    # row b of the batch, so these tokens are wrong whenever
                    # batch_size > 1 -- should be srcBatch[b].
                    for i, w in enumerate(predBatch[b][0]):
                        print(w)
                        _, ids = attn[b][0][i].sort(0, descending=True)
                        for j in ids[:5].tolist():
                            print("\t%s\t%d\t%3f" %
                                  (srcTokens[j], j, attn[b][0][i][j]))

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
def main():
    """Translate ``opt.src`` (with optional context ``opt.cxt`` and gold
    ``opt.tgt`` files) in batches and write 1-best output to ``opt.output``.

    Relies on the module-level ``parser``, ``onmt``, ``addone`` and
    ``reportScore``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = codecs.open(opt.output, 'w', 'utf-8')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, cxtBatch, tgtBatch = [], [], []

    count = 0

    cxtF = codecs.open(opt.cxt, "r", "utf-8") if opt.cxt else None
    tgtF = codecs.open(opt.tgt, "r", "utf-8") if opt.tgt else None
    for line in addone(codecs.open(opt.src, "r", "utf-8")):

        if line is not None:
            srcTokens = line.split()
            if cxtF:
                cline = cxtF.readline()
                # An exhausted context file drops this source line as well.
                # NOTE(review): this also skips the tgt read, so a short
                # context file desynchronises src/tgt -- confirm the three
                # inputs are aligned line-for-line.
                if cline == '':
                    continue
                # Only collect context when a context file exists; the
                # original appended cxtTokens unconditionally, raising
                # NameError whenever -cxt was not given.
                cxtBatch += [cline.split()]
            if tgtF:
                tgtTokens = tgtF.readline().split()
                tgtBatch += [tgtTokens]
            srcBatch += [srcTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, goldScore = translator.translate(
            srcBatch, cxtBatch, tgtBatch)

        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        print("[%.4f] %s" %
                              (predScore[b][n], " ".join(predBatch[b][n])))

                print('')

        srcBatch, cxtBatch, tgtBatch = [], [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if cxtF:
        cxtF.close()
    if tgtF:
        tgtF.close()
Exemplo n.º 14
0
 def __init__(self):
     """Load the content-transfer translator and its SentencePiece tokenizer."""
     # Local import keeps sentencepiece optional until this class is used.
     import sentencepiece as spm
     opt = OptionContentTransfer()
     self.model = onmt.Translator(opt)
     self.tokenizer = spm.SentencePieceProcessor()
     self.tokenizer.load(opt.path_tokenizer)
Exemplo n.º 15
0
def main():
    """Translate ``opt.src`` batch-by-batch, or -- with -dump_input_encoding
    -- write each sentence's encoder representation instead of a translation.

    Relies on the module-level ``parser``, ``onmt``, ``addone`` and
    ``reportScore``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None
    for line in addone(open(opt.src)):

        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        if opt.dump_input_encoding:
            # predBatch is a batch_size x rnn_size torch FloatTensors
            predBatch = translator.dump_input_encoding(srcBatch, tgtBatch)
            for b in range(len(predBatch)):
                count += 1
                outF.write(
                    '%d ' % count +
                    " ".join([str(fl)
                              for fl in predBatch[b].data.tolist()]) + '\n')
                outF.flush()
        else:
            predBatch, predScore, goldScore = translator.translate(
                srcBatch, tgtBatch)

            predScoreTotal += sum(score[0] for score in predScore)
            predWordsTotal += sum(len(x[0]) for x in predBatch)
            if tgtF is not None:
                goldScoreTotal += sum(goldScore)
                goldWordsTotal += sum(len(x) for x in tgtBatch)

            for b in range(len(predBatch)):
                count += 1
                outF.write(" ".join(predBatch[b][0]) + '\n')
                outF.flush()

                if opt.verbose:
                    srcSent = ' '.join(srcBatch[b])
                    if translator.tgt_dict.lower:
                        srcSent = srcSent.lower()
                    print('SENT %d: %s' % (count, srcSent))
                    print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                    print("PRED SCORE: %.4f" % predScore[b][0])

                    if tgtF is not None:
                        tgtSent = ' '.join(tgtBatch[b])
                        if translator.tgt_dict.lower:
                            tgtSent = tgtSent.lower()
                        print('GOLD %d: %s ' % (count, tgtSent))
                        print("GOLD SCORE: %.4f" % goldScore[b])

                    if opt.n_best > 1:
                        print('\nBEST HYP:')
                        for n in range(opt.n_best):
                            print("[%.4f] %s" %
                                  (predScore[b][n], " ".join(predBatch[b][n])))

                    print('')
            # Running PRED score, reported once per translated batch.
            reportScore('PRED', predScoreTotal, predWordsTotal)

        srcBatch, tgtBatch = [], []
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()
Exemplo n.º 16
0
    srcTokens = []
    org_input = list(raw_input("Input sentence:").decode('utf8'))
    clean_input = replace_wide_chars(org_input)
    srcTokens.append(clean_input)

    while len(srcTokens[0]) > 0:
        predBatch, _, _ = translator.translate(srcTokens, [])
        predicted_words = predBatch[0][0]
        print(''.join(predicted_words))

        srcTokens = []
        org_input = list(raw_input("Input sentence:").decode('utf8'))
        clean_input = replace_wide_chars(org_input)
        srcTokens.append(clean_input)


if __name__ == "__main__":
    # Module-level `opt` is expected to be populated before this point.
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    sys.stdout.write("Loading model file... ")
    sys.stdout.flush()
    translator = onmt.Translator(opt)
    print("Done.")
    # With -src, decode a file; otherwise run the interactive prompt.
    if opt.src:
        decode_file(translator)
    else:
        print("Decode from prompt, input empty string to terminate. ")
        decode_stream(translator)
Exemplo n.º 17
0
def main():
    """Translate ``opt.src`` in batches of ``opt.batch_size`` and write best
    hypotheses to ``opt.output``; optionally scores gold targets from
    ``opt.tgt``.

    Relies on the module-level ``parser``, ``onmt`` and ``reportScore``.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    # Guard the device selection: set_device(-1) raises on CPU-only runs
    # (the original called it unconditionally).
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    # Gather fixed-size batches up front; the trailing partial batch is kept
    # as well (the original loop silently dropped it).
    batches = []
    srcBatch, tgtBatch = [], []
    for line in open(opt.src):
        srcBatch += [line.split()]
        if tgtF:
            tgtBatch += [tgtF.readline().split()]
        if len(srcBatch) == opt.batch_size:
            batches.append((srcBatch, tgtBatch))
            srcBatch, tgtBatch = [], []
    if srcBatch:
        batches.append((srcBatch, tgtBatch))

    for srcBatch, tgtBatch in batches:
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)

        predScoreTotal += sum(score[0] for score in predScore)
        # Count only the 1-best hypothesis words, matching how
        # predScoreTotal uses score[0] (the original counted n-best lists).
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            outF.write(" ".join(predBatch[b][0]) + '\n')

            if opt.verbose:
                print('SENT %d: %s' % (count, " ".join(srcBatch[b])))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b])))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        # Print the n-th hypothesis (the original always
                        # repeated hypothesis 0 here).
                        print("[%.4f] %s" %
                              (predScore[b][n], " ".join(predBatch[b][n])))

                print('')

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)
        tgtF.close()
Exemplo n.º 18
0
def main():
    """Translate stdin or ``opt.src`` line-by-line with either a v1 ensemble
    or a v2 translator, optionally re-ranking n-best lists by a length
    penalty, and write the best hypothesis (or the full n-best list in
    verbose mode) to ``opt.output`` or stdout.

    Relies on the module-level ``parser``, ``onmt``, ``numpy``, ``addone``,
    ``lenPenalty``, ``getSentenceFromTokens`` and ``reportScore``.
    """
    opt = parser.parse_args()
    print(opt)
    opt.cuda = opt.gpu > -1
    onmt.Constants.cudaActivated = opt.cuda

    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Always pick n_best
    opt.n_best = opt.beam_size

    if opt.output == "stdout":
        outF = sys.stdout
    else:
        outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    if opt.src == "stdin":
        inFile = sys.stdin
        opt.batch_size = 1
    else:
        inFile = open(opt.src)

    # Build the translator BEFORE anything touches it: the original called
    # translator.initBeamAccum() earlier, before `translator` existed, which
    # raised NameError whenever -dump_beam was set.
    if opt.version == 1.0:
        translator = onmt.EnsembleTranslator(opt)
    elif opt.version == 2.0:
        translator = onmt.Translator(opt)
    else:
        # The original fell through silently and crashed later with an
        # unbound `translator`.
        raise ValueError("unsupported opt.version: %s" % opt.version)

    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    for line in addone(inFile):
        if line is not None:
            if opt.input_type == 'word':
                srcTokens = line.split()
                srcBatch += [srcTokens]
                if tgtF:
                    tgtTokens = tgtF.readline().split() if tgtF else None
                    tgtBatch += [tgtTokens]
            elif opt.input_type == 'char':
                srcTokens = list(line.strip())
                srcBatch += [srcTokens]
                if tgtF:
                    tgtTokens = list(tgtF.readline().strip()) if tgtF else None
                    tgtBatch += [tgtTokens]
            else:
                raise NotImplementedError("Input type unknown")
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, predLength, goldScore, numGoldWords = \
            translator.translate(srcBatch, tgtBatch)

        if opt.normalize and opt.version == 1.0:
            # Re-rank each n-best list by its length-penalised score.
            predBatch_ = []
            predScore_ = []
            for bb, ss, ll in zip(predBatch, predScore, predLength):
                ss_ = [lenPenalty(s_, l_, opt.alpha)
                       for b_, s_, l_ in zip(bb, ss, ll)]
                sidx = numpy.argsort(ss_)[::-1]
                predBatch_.append([bb[s] for s in sidx])
                predScore_.append([ss_[s] for s in sidx])
            predBatch = predBatch_
            predScore = predScore_

            if opt.preferLongestOutputs:
                # Reorder the n-best list so longer hypotheses come first.
                sortedPredictions = []
                for index, prediction in enumerate(predBatch[0]):
                    sortedPredictions.append((index, len(prediction)))
                sortedPredictions.sort(key=lambda x: x[1], reverse=True)

                # Snapshot the lists before overwriting: the original
                # aliased them (predBatchCopy = predBatch) and therefore
                # read entries it had already clobbered.
                predBatchCopy = list(predBatch[0])
                predScoreCopy = list(predScore[0])

                for index, (src_idx, _) in enumerate(sortedPredictions):
                    predBatch[0][index] = predBatchCopy[src_idx]
                    predScore[0][index] = predScoreCopy[src_idx]

        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore).item()
            goldWordsTotal += numGoldWords

        for b in range(len(predBatch)):
            count += 1

            bestHyp = getSentenceFromTokens(predBatch[b][0], opt.input_type)
            if not opt.print_nbest:
                outF.write(bestHyp + '\n')
                outF.flush()

            if opt.verbose:
                srcSent = getSentenceFromTokens(srcBatch[b], opt.input_type)
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))

                print('PRED %d: %s' % (count, bestHyp))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = getSentenceFromTokens(tgtBatch[b], opt.input_type)
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.print_nbest:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        sent = getSentenceFromTokens(predBatch[b][n],
                                                     opt.input_type)
                        print("[%.4f] %s" % (predScore[b][n], sent))

                print('')

        srcBatch, tgtBatch = [], []

    if opt.verbose:
        reportScore('PRED', predScoreTotal, predWordsTotal)
        if tgtF:
            reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
Exemplo n.º 19
0
def main():
    """Batch-translate ``opt.src`` with a pretrained model and report timing.

    Sentences are accumulated into batches of ``opt.batch_size``, decoded
    with beam search, and the best hypothesis of each sentence is written to
    ``opt.output``.  When ``opt.tgt`` is given, gold perplexity is reported
    alongside prediction perplexity.  Finally prints throughput statistics.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    print(opt)
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = codecs.open(opt.output, 'w', 'utf-8')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    total_time = 0.0
    nsamples = 0.0

    tgtF = open(opt.tgt) if opt.tgt else None
    # addone() appends a trailing None sentinel so the final partial batch
    # still gets flushed through the translation path below.
    for line in addone(codecs.open(opt.src, "r", "utf-8")):

        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                # (the old "if tgtF else None" here was redundant: we are
                # already inside the tgtF branch)
                tgtTokens = tgtF.readline().split()
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        start_time = time.time()
        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)
        total_time += (time.time() - start_time)
        nsamples += len(predBatch)

        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            # predBatch[b][0] is the best hypothesis for sentence b.
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        print("[%.4f] %s" %
                              (predScore[b][n], " ".join(predBatch[b][n])))

                print('')

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()
    # Close the output file (it was previously leaked).
    outF.close()

    # Guard against ZeroDivisionError when the source file was empty or
    # translation was instantaneous.
    if nsamples > 0 and total_time > 0:
        samples_per_sec = nsamples / total_time
        print("Average samples per second: %f, %f, %f" %
              (nsamples, total_time, samples_per_sec))
        print("Time per sample %f" % (total_time / nsamples))
    else:
        print("No samples were translated.")
Exemplo n.º 20
0
def main():
    """Translate ``opt.src``, optionally length-normalizing the n-best list.

    Always decodes ``opt.beam_size`` hypotheses per sentence.  With
    ``opt.print_nbest`` the full n-best list is emitted in
    ``id ||| hypothesis ||| score`` format; otherwise only the single best
    hypothesis is written to ``opt.output``.  When ``opt.normalize`` is set,
    each n-best list is re-ranked by length-normalized log-probability.
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Always pick n_best
    opt.n_best = opt.beam_size

    translator = onmt.Translator(opt)

    outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    # addone() appends a trailing None sentinel so the final partial batch
    # still gets flushed through the translation path below.
    for line in addone(open(opt.src)):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split()
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        predBatch, predScore, goldScore = translator.translate(
            srcBatch, tgtBatch)

        if opt.normalize:
            # Re-rank each sentence's hypotheses by score divided by length
            # (at least 1) so long hypotheses are not unfairly penalized.
            predBatch_ = []
            predScore_ = []
            for bb, ss in zip(predBatch, predScore):
                ss_ = [
                    s_ / numpy.maximum(1., len(b_)) for b_, s_ in zip(bb, ss)
                ]
                sidx = numpy.argsort(ss_)[::-1]
                predBatch_.append([bb[s] for s in sidx])
                predScore_.append([ss_[s] for s in sidx])
            predBatch = predBatch_
            predScore = predScore_

        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            # Hypotheses are sorted by score, so index 0 is the best one.
            if not opt.print_nbest:
                outF.write(" ".join(predBatch[b][0]) + '\n')
                outF.flush()
            else:
                for n in range(opt.n_best):
                    print("%d ||| %s ||| %.6f" % (count - 1, " ".join(
                        predBatch[b][n]), predScore[b][n]))
                    outF.write("%d ||| %s ||| %.6f\n" % (count - 1, " ".join(
                        predBatch[b][n]), predScore[b][n]))
                    outF.flush()

            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                print('SENT %d: %s' % (count, srcSent))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    print('GOLD %d: %s ' % (count, tgtSent))
                    print("GOLD SCORE: %.4f" % goldScore[b])
                print('')

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()
    # Close the output file (it was previously leaked).
    outF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
Exemplo n.º 21
0
    return predictions


@app.route('/translate', methods=['POST'])
def config():
    """Translate every sentence in the POSTed JSON list.

    Returns a single flattened JSON list of the per-sentence results.
    """
    payload = request.get_json()
    translated = [translate(sentence) for sentence in payload]
    # Flatten the list of per-sentence result lists into one list.
    return jsonify(sum(translated, []))


if __name__ == '__main__':
    opt = parser.parse_args()

    # Dummy training-option parser: supplies the model-construction defaults
    # that onmt.Translator expects in addition to the translate options.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load one Translator per model path, keyed by the model's hash so the
    # HTTP API can address a specific model.
    for model in opt.model:
        print("Loading model... " + model)
        modelopt = copy.copy(opt)
        modelopt.model = model
        # Renamed from `hash`, which shadowed the builtin of the same name.
        model_hash = hash_byname(model)
        translators[model_hash] = onmt.Translator(modelopt, dummy_opt.__dict__)

    app.run(debug=False,  host='0.0.0.0', port=8092)
Exemplo n.º 22
0
def main(arg_list=None):
    """Translate a task-specific test set and optionally dump encoder states.

    Source/target paths are derived from ``opt.task`` ("simp", "MT" or
    "Multi-MT") and ``opt.data``.  Predictions are written to ``opt.output``
    (default: ``test.txt`` next to the model); per-batch encoder
    representations are saved to ``opt.output_representation`` if requested.

    Args:
        arg_list: optional argv-style list forwarded to ``parse_arg``.
    """
    opt = parse_arg(arg_list)
    if opt.task == "simp":
        opt.src = "../../data_%s/%s/test/test.normal" % (opt.task, opt.data)
        opt.tgt = "../../data_%s/%s/test/test.simple.0" % (opt.task, opt.data)
    elif opt.task == "MT":
        opt.src = "../../data_%s/%s/test.en-zh.en" % (opt.task, opt.data)
        opt.tgt = "../../data_%s/%s/test.en-zh.zh" % (opt.task, opt.data)
    elif opt.task == "Multi-MT":
        line = opt.language_pair.split("-")
        S_lang = line[0]
        T_lang = line[1]
        opt.src = "../../data_%s/%s/%s.%s.%s" % (
            opt.task, opt.data, opt.test_set, opt.language_pair, S_lang)
        opt.tgt = "../../data_%s/%s/%s.%s.%s" % (
            opt.task, opt.data, opt.test_set, opt.language_pair, T_lang)
    else:
        assert False
    if opt.output is None:
        opt.output = os.path.dirname(opt.model) + "/" + "test.txt"
    opt.gpu = opt.gpus
    opt.cuda = opt.gpu > -1
    # Only bind a CUDA device when one was requested; the unconditional call
    # previously crashed for gpu == -1 (CPU mode).
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = open(opt.output, 'w')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None
    src_contents = open(opt.src).readlines()
    representations = []
    src_rb_list = []
    for line_num, line in enumerate(src_contents):

        srcTokens = line.split()
        srcBatch += [srcTokens]
        if tgtF:
            tgtTokens = tgtF.readline().split()
            tgtBatch += [tgtTokens]

        # Keep filling the batch unless this is the last source line.
        if line_num < len(src_contents) - 1 and len(srcBatch) < opt.batch_size:
            continue

        predBatch, predScore, goldScore, rep, src_rb = translator.translate(
            srcBatch, tgtBatch)
        representations.append(rep)
        src_rb_list.append(src_rb)

        for b in range(len(predBatch)):
            count += 1
            pred_sent = " ".join(predBatch[b][0])
            if opt.bpe:
                # Undo BPE segmentation for the written output.
                pred_sent = pred_sent.replace("@@ ", "")
            outF.write(pred_sent + '\n')

            if opt.verbose:
                print('SENT %d: %s' % (count, " ".join(srcBatch[b])))
                print('PRED %d: %s' % (count, " ".join(predBatch[b][0])))
                if opt.bpe:
                    print('PRED CON %d: %s' % (count, pred_sent))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    print('GOLD %d: %s ' % (count, " ".join(tgtBatch[b])))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        # BUGFIX: previously printed predBatch[b][0] for
                        # every n, so all "BEST HYP" lines were identical.
                        print("[%.4f] %s" %
                              (predScore[b][n], " ".join(predBatch[b][n])))
                print('')
            predScoreTotal += predScore[b][0]
            predWordsTotal += len(predBatch[b][0])
            if tgtF is not None:
                goldScoreTotal += goldScore[b]
                goldWordsTotal += len(tgtBatch[b])

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()
    outF.close()
    if opt.output_representation:
        save_data = {
            "representations": torch.cat(representations),
            "src_rb": torch.cat(src_rb_list)
        }
        torch.save(save_data, opt.output_representation)
Exemplo n.º 23
0
def main():
    """Batch-translate opt.src, echoing verbose output via os.write.

    Identical in shape to the other translate mains, but also receives the
    encoder states from translator.translate and mixes raw ``os.write``
    byte output with ``print`` for the verbose report (presumably to avoid
    console encoding errors — TODO confirm).
    """
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt)

    outF = codecs.open(opt.output, 'w', 'utf-8')

    # Running totals for perplexity reporting at the end.
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0

    srcBatch, tgtBatch = [], []

    count = 0

    tgtF = codecs.open(opt.tgt, 'r', 'utf-8') if opt.tgt else None

    # json is only needed when beam dumping is requested, hence the lazy
    # import.
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    # addone() appends a trailing None sentinel so the final partial batch
    # is still translated below.
    for line in addone(codecs.open(opt.src, 'r', 'utf-8')):
        if line is not None:
            srcTokens = line.split()
            srcBatch += [srcTokens]
            if tgtF:
                tgtTokens = tgtF.readline().split() if tgtF else None
                tgtBatch += [tgtTokens]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check last batch
            if len(srcBatch) == 0:
                break

        # NOTE(review): debug prints left in place; they dump every batch to
        # stdout regardless of opt.verbose.
        print(srcBatch)
        print(tgtBatch)
        # This translate variant also returns encoder states (unused here).
        predBatch, predScore, goldScore, encStates = translator.translate(
            srcBatch, tgtBatch)
        print(predBatch[0][0][1:-1])
        # print(encStates[-1][0])
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if tgtF is not None:
            goldScoreTotal += sum(goldScore)
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            # predBatch[b][0] is the best hypothesis for sentence b.
            outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                srcSent = ' '.join(srcBatch[b])
                if translator.tgt_dict.lower:
                    srcSent = srcSent.lower()
                # os.write(1, ...) writes UTF-8 bytes directly to stdout.
                os.write(1, bytes('SENT %d: %s\n' % (count, srcSent), 'UTF-8'))
                os.write(
                    1,
                    bytes('PRED %d: %s\n' % (count, " ".join(predBatch[b][0])),
                          'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if tgtF is not None:
                    tgtSent = ' '.join(tgtBatch[b])
                    if translator.tgt_dict.lower:
                        tgtSent = tgtSent.lower()
                    os.write(
                        1, bytes('GOLD %d: %s\n' % (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(
                            1,
                            bytes(
                                "[%.4f] %s\n" %
                                (predScore[b][n], " ".join(predBatch[b][n])),
                                'UTF-8'))

                print('')

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if tgtF:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if tgtF:
        tgtF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Exemplo n.º 24
0
def main():
    """Translate text or audio input (h5 / kaldi-scp / stdin) with a model.

    Audio sources are read one feature matrix at a time, optionally strided,
    frame-concatenated, and prefixed with previous-utterance context before
    translation; text sources are batched line by line.  Per-batch decoding
    output and score accumulation is delegated to ``translateBatch``.
    """
    opt = parser.parse_args()
    # (kept as upstream default; a BERT-style BOS override was removed here)

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Always pick n_best
    opt.n_best = opt.beam_size

    if opt.output == "stdout":
        outF = sys.stdout
    else:
        outF = open(opt.output, 'w')

    pred_score_total, pred_words_total, gold_score_total, gold_words_total = 0, 0, 0, 0

    src_batch, tgt_batch = [], []

    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    in_file = None

    if opt.src == "stdin":
        in_file = sys.stdin
        opt.batch_size = 1
    elif opt.encoder_type == "audio" and opt.asr_format == "h5":
        in_file = h5.File(opt.src, 'r')
    elif opt.encoder_type == "audio" and opt.asr_format == "scp":
        import kaldiio
        from kaldiio import ReadHelper
        audio_data = iter(ReadHelper('scp:' + opt.src))
    else:
        in_file = open(opt.src)

    if not opt.fast_translate:
        translator = onmt.Translator(opt)
    else:
        from onmt.inference.FastTranslator import FastTranslator
        translator = FastTranslator(opt)

    # Audio processing for the source batch
    if opt.encoder_type == "audio":

        s_prev_context = []
        t_prev_context = []

        i = 0  # sample index into the h5 file (h5 datasets are keyed "0","1",...)
        while True:
            if opt.asr_format == "h5":
                if i == len(in_file):
                    break
                line = np.array(in_file[str(i)])
                i += 1
            elif opt.asr_format == "scp":
                try:
                    _, line = next(audio_data)
                except StopIteration:
                    break

            if opt.stride != 1:
                line = line[0::opt.stride]
            line = torch.from_numpy(line)
            if opt.concat != 1:
                # Zero-pad so the frame count is divisible by opt.concat,
                # then stack every `concat` consecutive frames into one.
                add = (opt.concat - line.size()[0] % opt.concat) % opt.concat
                z = torch.FloatTensor(add, line.size()[1]).zero_()
                line = torch.cat((line, z), 0)
                line = line.reshape((line.size()[0] // opt.concat,
                                     line.size()[1] * opt.concat))

            if opt.previous_context > 0:
                s_prev_context.append(line)
                # BUGFIX: this loop previously reused `i`, clobbering the h5
                # sample index above and corrupting iteration when
                # previous_context > 0 with h5 input.
                for ctx in range(1, opt.previous_context + 1):
                    if ctx < len(s_prev_context):
                        # Prepend the earlier utterance plus one zero frame
                        # as a separator.
                        line = torch.cat((torch.cat(
                            (s_prev_context[-ctx - 1],
                             torch.zeros(1,
                                         line.size()[1]))), line))
                if len(s_prev_context) > opt.previous_context:
                    s_prev_context = s_prev_context[-1 * opt.previous_context:]
            src_batch += [line]

            if tgtF:
                tline = tgtF.readline().strip()
                if opt.previous_context > 0:
                    t_prev_context.append(tline)
                    # NOTE(review): the bound checks s_prev_context here, not
                    # t_prev_context — preserved as-is, but verify intent.
                    for ctx in range(1, opt.previous_context + 1):
                        if ctx < len(s_prev_context):
                            tline = t_prev_context[-ctx - 1] + " # " + tline
                    if len(t_prev_context) > opt.previous_context:
                        t_prev_context = t_prev_context[-1 *
                                                        opt.previous_context:]

                if opt.input_type == 'word':
                    tgt_tokens = tline.split() if tgtF else None
                elif opt.input_type == 'char':
                    tgt_tokens = list(tline.strip()) if tgtF else None
                else:
                    raise NotImplementedError("Input type unknown")

                tgt_batch += [tgt_tokens]

            if len(src_batch) < opt.batch_size:
                continue

            print("Batch size:", len(src_batch), len(tgt_batch))
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch, type='asr')

            print("Result:", len(pred_batch))
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []

        # catch the last batch
        if len(src_batch) != 0:
            print("Batch size:", len(src_batch), len(tgt_batch))
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch, type='asr')
            print("Result:", len(pred_batch))
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []
    # Text processing
    else:
        # addone() appends a trailing None sentinel so the final partial
        # batch is still translated below.
        for line in addone(in_file):
            if line is not None:
                if opt.input_type == 'word':
                    src_tokens = line.split()
                elif opt.input_type == 'char':
                    src_tokens = list(line.strip())
                else:
                    raise NotImplementedError("Input type unknown")
                src_batch += [src_tokens]
                if tgtF:
                    if opt.input_type == 'word':
                        tgt_tokens = tgtF.readline().split() if tgtF else None
                    elif opt.input_type == 'char':
                        tgt_tokens = list(
                            tgtF.readline().strip()) if tgtF else None
                    else:
                        raise NotImplementedError("Input type unknown")
                    tgt_batch += [tgt_tokens]
                if len(src_batch) < opt.batch_size:
                    continue
            else:
                # at the end of file, check last batch
                if len(src_batch) == 0:
                    break

            # actually done beam search from the model
            pred_batch, pred_score, pred_length, gold_score, num_gold_words, all_gold_scores = translator.translate(
                src_batch, tgt_batch)

            # convert output tensor to words
            count, pred_score, pred_words, gold_score, goldWords = translateBatch(
                opt, tgtF, count, outF, translator, src_batch, tgt_batch,
                pred_batch, pred_score, pred_length, gold_score,
                num_gold_words, all_gold_scores, opt.input_type)
            pred_score_total += pred_score
            pred_words_total += pred_words
            gold_score_total += gold_score
            gold_words_total += goldWords
            src_batch, tgt_batch = [], []

    if opt.verbose:
        reportScore('PRED', pred_score_total, pred_words_total)
        if tgtF: reportScore('GOLD', gold_score_total, gold_words_total)

    if tgtF:
        tgtF.close()

    if opt.dump_beam:
        json.dump(translator.beam_accum, open(opt.dump_beam, 'w'))
Exemplo n.º 25
0
def main():
    """Translate opt.src with onmt.IO datasets; experimental BLEU probing.

    NOTE(review): this variant relies on a module-level ``opt`` (it is never
    parsed inside the function) — confirm it is set before main() is called.
    In verbose mode it computes sentence BLEU between *consecutive* source
    sentences (``previous_words``), apparently for a similarity experiment.
    """
    previous_words = None  # previous iteration's source sentence, for BLEU
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    print('dummy_opt: ', dummy_opt)

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    # use_filter_pred=False keeps sentences that training-time filters would
    # have dropped (e.g. over-long ones).
    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    counter = count(1)
    for batch in test_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
            = translator.translate(batch, data)
        pred_score_total += sum(score[0] for score in pred_scores)
        pred_words_total += sum(len(x[0]) for x in pred_batch)
        if opt.tgt:
            gold_score_total += sum(gold_scores)
            gold_words_total += sum(len(x) for x in batch.tgt[1:])

        #davidstap
        #_, src_lengths = batch.src
        #encStates, context = translator.model.encoder(src, src_lengths)

        # z_batch: an iterator over the predictions, their scores,
        # the gold sentence, its score, and the source sentence for each
        # sentence in the batch. It has to be zip_longest instead of
        # plain-old zip because the gold_batch has length 0 if the target
        # is not included.
        z_batch = zip_longest(pred_batch, gold_batch, pred_scores, gold_scores,
                              (sent.squeeze(1)
                               for sent in src.split(1, dim=1)))

        for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
            # src_sent is torch.LongTensor
            #print('type src_sent:',type(src_sent))
            n_best_preds = [" ".join(pred) for pred in pred_sents[:opt.n_best]]
            out_file.write('\n'.join(n_best_preds))
            out_file.write('\n')
            out_file.flush()

            if opt.verbose:
                sent_number = next(counter)
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)

                # BLEU between this source sentence and the previous one —
                # a consecutive-sentence similarity probe, not model eval.
                if previous_words is not None:

                    print('BLEU: ', sentence_bleu([words], previous_words))
                    print()
                    print('S1:', words)
                    print('S2:', previous_words)

                #os.write(1, bytes('\nSENT %d: %s\n' %
            #                      (sent_number, words), 'UTF-8'))

                previous_words = words

                best_pred = n_best_preds[0]

                #TODO: calculate BLEU score reference (best_pred) and hypothesis (words)
                #TODO: calculate cosine_similarity (best_pred) and hypothesis (words)
                #bleu_score = sentence_bleu(best_pred, words)
                #print('BLEU: ',bleu_score)

                best_score = pred_score[0]
                #os.write(1, bytes('PRED %d: %s\n' %
                #                      (sent_number, best_pred), 'UTF-8'))
                #print("PRED SCORE: %.4f" % best_score)

                # 'words' = input sentence
                # 'best_pred' = prediction

                # put source sentence in translator.model.encoder to find context
                # maybe change data type src? torchtext datatype?

                #model = NMTModel(encoder, decoder) (see ModelConstructor)
                src_lengths = len(words.split())

                # src(FloatTensor): a sequence of source tensors with
                #         optional feature tensors of size (len x batch).
                # tgt(FloatTensor): a sequence of target tensors with
                #         optional feature tensors of size (len x batch).
                # lengths([int]): an array of the src length.
                # dec_state: A decoder state object

                #hidden, context = translator.model.encoder(src_sent, src_lengths)

                #euc_dist(context_r, context_pred)

                if opt.tgt:
                    tgt_sent = ' '.join(gold_sent)
                    os.write(
                        1,
                        bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                              'UTF-8'))
                    print("GOLD SCORE: %.4f" % gold_score)

                if len(n_best_preds) > 1:
                    print('\nBEST HYP:')
                    for score, sent in zip(pred_score, n_best_preds):
                        os.write(1,
                                 bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Exemplo n.º 26
0
def main():
    """Translate opt.src with onmt.IO datasets; supports attention debug.

    NOTE(review): ``tgtBatch`` and ``srcBatch`` are declared but never
    populated in this variant (sentences come from the iterator's batches
    instead), yet they are still indexed below — the GOLD verbose path and
    the attn_debug path would raise IndexError, and the gold word count is
    always 0.  Flagged rather than changed: a correct fix needs the target
    tokens from ``batch``, which this view does not show how to extract.
    """
    opt = parser.parse_args()

    # Dummy training-option parser: supplies the model-construction defaults
    # that onmt.Translator expects in addition to the translate options.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None)

    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)

    index = 0
    for batch in testData:
        predBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            # NOTE(review): tgtBatch is always empty here, so this adds 0.
            goldWordsTotal += sum(len(x) for x in tgtBatch)

        for b in range(len(predBatch)):
            count += 1
            try:
                # python2
                outF.write(" ".join([i.decode('utf-8')
                           for i in predBatch[b][0]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                outF.write(" ".join(predBatch[b][0]) + '\n')
            outF.flush()

            if opt.verbose:
                # Reconstruct the source sentence from the vocab up to the
                # first padding token.
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)

                os.write(1, bytes('SENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))

                index += 1
                print(len(predBatch[b][0]))
                os.write(1, bytes('\n PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if opt.tgt:
                    # NOTE(review): tgtBatch[b] will IndexError (see above).
                    tgtSent = ' '.join(tgtBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' %
                             (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n],
                                 " ".join(predBatch[b][n])),
                            'UTF-8'))

                if opt.attn_debug:
                    print('')
                    # For each predicted word, show the 5 most-attended
                    # source positions.
                    for i, w in enumerate(predBatch[b][0]):
                        print(w)
                        _, ids = attn[b][0][i].sort(0, descending=True)
                        for j in ids[:5].tolist():
                            # NOTE(review): srcBatch is empty here, so this
                            # indexing would fail if attn_debug is enabled.
                            print("\t%s\t%d\t%3f" % (srcBatch[b][j], j,
                                                     attn[b][0][i][j]))

        srcBatch, tgtBatch = [], []

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))
Exemplo n.º 27
0
def main():
    """Translate `opt.src` with a pretrained model and write the n-best
    hypotheses to `opt.output`.

    Relies on a module-level `opt` namespace (translate options parsed
    elsewhere) plus the file-level imports (`onmt`, `opts`, `torch`,
    `codecs`, `os`, `itertools.count`, `itertools.zip_longest`,
    `get_src_words`, `report_score`). Side effects: writes the prediction
    file, prints verbose/score reports to stdout, and optionally dumps the
    beam accumulator as JSON to `opt.dump_beam`.
    """
    # Throw-away parser: the Translator constructor also needs the default
    # model options, which live in a separate option group.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    out_file = codecs.open(opt.output, 'w', 'utf-8')
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()
    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    counter = count(1)
    try:
        for batch in test_data:
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
                = translator.translate(batch, data)
            pred_score_total += sum(score[0] for score in pred_scores)
            pred_words_total += sum(len(x[0]) for x in pred_batch)
            if opt.tgt:
                gold_score_total += sum(gold_scores)
                gold_words_total += sum(len(x) for x in batch.tgt[1:])

            # z_batch: an iterator over the predictions, their scores,
            # the gold sentence, its score, and the source sentence for each
            # sentence in the batch. It has to be zip_longest instead of
            # plain-old zip because the gold_batch has length 0 if the target
            # is not included.
            z_batch = zip_longest(pred_batch, gold_batch,
                                  pred_scores, gold_scores,
                                  (sent.squeeze(1)
                                   for sent in src.split(1, dim=1)))

            for pred_sents, gold_sent, pred_score, gold_score, src_sent \
                    in z_batch:
                n_best_preds = [" ".join(pred)
                                for pred in pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
                out_file.write('\n')
                out_file.flush()

                if opt.verbose:
                    sent_number = next(counter)
                    words = get_src_words(
                        src_sent, translator.fields["src"].vocab.itos)

                    # os.write keeps the output unbuffered / byte-exact.
                    os.write(
                        1, bytes('\nSENT %d: %s\n' % (sent_number, words),
                                 'UTF-8'))

                    best_pred = n_best_preds[0]
                    best_score = pred_score[0]
                    os.write(
                        1,
                        bytes('PRED %d: %s\n' % (sent_number, best_pred),
                              'UTF-8'))
                    print("PRED SCORE: %.4f" % best_score)

                    if opt.tgt:
                        tgt_sent = ' '.join(gold_sent)
                        os.write(
                            1,
                            bytes('GOLD %d: %s\n' % (sent_number, tgt_sent),
                                  'UTF-8'))
                        print("GOLD SCORE: %.4f" % gold_score)

                    if len(n_best_preds) > 1:
                        print('\nBEST HYP:')
                        for score, sent in zip(pred_score, n_best_preds):
                            os.write(
                                1,
                                bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))
    finally:
        # Fix: the output handle was previously leaked; close it
        # deterministically even if translation raises.
        out_file.close()

    report_score('PRED', pred_score_total, pred_words_total)
    if opt.tgt:
        report_score('GOLD', gold_score_total, gold_words_total)

    if opt.dump_beam:
        # Fix: the dump file was opened but never closed, so the buffered
        # JSON could be truncated; a with-block guarantees flush + close.
        with codecs.open(opt.dump_beam, 'w', 'utf-8') as beam_file:
            json.dump(translator.beam_accum, beam_file)
Exemplo n.º 28
0
 def __init__(self, model):
     """Wrap the given model spec in translator options and build the
     underlying ``onmt.Translator``.
     """
     self.translator = onmt.Translator(TranslatorParameter(model))
Exemplo n.º 29
0
def translate(src, model, output):
    """Translate the file `src` with the checkpoint `model`.

    Writes the n-best predictions to `output` and the detokenised source
    sentences (as recovered from the vocab) to ``'gold_' + output``.
    Forces batch_size to 1 because larger batches are unsupported here.
    Relies on file-level imports: `opts`, `onmt`, `torch`, `codecs`,
    `argparse`, `zip_longest`, `get_src_words`.
    """
    parser = argparse.ArgumentParser(
        description='translate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.translate_opts(parser)

    opt = parser.parse_known_args([])[0]
    if opt.batch_size != 1:
        print("WARNING: -batch_size isn't supported currently, "
              "we set it to 1 for now!")
        opt.batch_size = 1

    # Override the defaults with the caller-supplied paths.
    opt.src = src
    opt.model = model
    opt.output = output

    # Throw-away parser to pick up the default model options the
    # Translator constructor also expects.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator(opt, dummy_opt.__dict__)

    data = onmt.IO.ONMTDataset(opt.src,
                               opt.tgt,
                               translator.fields,
                               use_filter_pred=False)

    test_data = onmt.IO.OrderedIterator(dataset=data,
                                        device=opt.gpu,
                                        batch_size=opt.batch_size,
                                        train=False,
                                        sort=False,
                                        shuffle=False)

    # Fix: both file handles were previously opened and never closed;
    # the with-block guarantees they are flushed and released.
    with codecs.open(opt.output, 'w', 'utf-8') as out_file, \
            codecs.open("gold_" + opt.output, 'w', 'utf-8') as gold_out_file:
        for batch in test_data:
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src \
                = translator.translate(batch, data)

            # z_batch: an iterator over the predictions, their scores,
            # the gold sentence, its score, and the source sentence for each
            # sentence in the batch. It has to be zip_longest instead of
            # plain-old zip because the gold_batch has length 0 if the
            # target is not included.
            z_batch = zip_longest(pred_batch, gold_batch,
                                  pred_scores, gold_scores,
                                  (sent.squeeze(1)
                                   for sent in src.split(1, dim=1)))

            for pred_sents, gold_sent, pred_score, gold_score, src_sent \
                    in z_batch:
                n_best_preds = [" ".join(pred)
                                for pred in pred_sents[:opt.n_best]]
                out_file.write('\n'.join(n_best_preds))
                out_file.write('\n')
                out_file.flush()

                # Recover the source sentence text and mirror it to the
                # companion "gold_" file, one sentence per line.
                words = get_src_words(src_sent,
                                      translator.fields["src"].vocab.itos)
                gold_out_file.write(words)
                gold_out_file.write('\n')
                gold_out_file.flush()
def main():
    """Translate `opt.src` with a pretrained model and write n-best
    hypotheses to `opt.output`.

    Parses options from a module-level ``parser``. Optionally scores gold
    targets, dumps the first batch's attention matrix via pickle
    (`opt.save_attention`), and dumps the beam accumulator as JSON
    (`opt.dump_beam`). `opt.inter` enables a fork-specific translation mode
    -- presumably some form of interactive/intervention decoding; TODO
    confirm against this fork's Translator/ONMTDataset.
    """
    opt = parser.parse_args()

    # Throw-away parser: the Translator constructor also needs the default
    # model options, which live in a separate option group.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    # Build the translator from the parsed options.
    translator = onmt.Translator(opt, dummy_opt.__dict__)
    outF = codecs.open(opt.output, 'w', 'utf-8')
    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    # Running sentence counter (a plain int; note it shadows any
    # itertools.count imported at file top).
    count = 0
    if opt.dump_beam != "":
        import json
        translator.initBeamAccum()

    # Build the test dataset; this fork's ONMTDataset takes two extra
    # positional args (None, opt.inter) compared to upstream.
    data = onmt.IO.ONMTDataset(opt.src, opt.tgt, translator.fields, None, opt.inter)
    testData = onmt.IO.OrderedIterator(
        dataset=data, device=opt.gpu,
        batch_size=opt.batch_size, train=False, sort=False,
        shuffle=False)

    # inter_act flags whether the fork-specific mode is active.
    if opt.inter != None:
        inter_act=True
    else:
        inter_act=False

    index = 0
    for batch in testData:
        # Fork-specific signature: translate() takes the inter_act flag.
        predBatch, goldBatch, predScore, goldScore, attn, src \
            = translator.translate(batch, data, inter_act)
        if opt.save_attention:
            # Persist the attention matrix of the first sentence of this
            # batch (transposed); overwritten every batch.
            attn_numpy=attn[0][0].numpy()
            pickle.dump(attn_numpy.T,open('attention_matrix.pkl','wb'))
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)
        if opt.tgt:
            goldScoreTotal += sum(goldScore)
            # batch.tgt[1:] skips the BOS row when counting gold words.
            goldWordsTotal += sum(len(x) for x in batch.tgt[1:])

        for b in range(len(predBatch)):
            count += 1
            try:
                # python2 path (behaves the same as the python3 branch).
                for n in range(opt.n_best):
                    outF.write(" ".join([i
                               for i in predBatch[b][n]]) + '\n')
            except AttributeError:
                # python3: can't do .decode on a str object
                for n in range(opt.n_best):
                    outF.write(" ".join(predBatch[b][n]) + '\n')
            outF.flush()

            if opt.verbose:
                # Reconstruct the source sentence from vocab ids, stopping
                # at the first padding token.
                words = []
                for f in src[:, b]:
                    word = translator.fields["src"].vocab.itos[f]
                    if word == onmt.IO.PAD_WORD:
                        break
                    words.append(word)

                # os.write keeps the output unbuffered / byte-exact.
                os.write(1, bytes('\nSENT %d: %s\n' %
                                  (count, " ".join(words)), 'UTF-8'))

                index += 1
                os.write(1, bytes('PRED %d: %s\n' %
                                  (count, " ".join(predBatch[b][0])), 'UTF-8'))
                print("PRED SCORE: %.4f" % predScore[b][0])

                if opt.tgt:
                    tgtSent = ' '.join(goldBatch[b])
                    os.write(1, bytes('GOLD %d: %s\n' %
                             (count, tgtSent), 'UTF-8'))
                    print("GOLD SCORE: %.4f" % goldScore[b])

                if opt.n_best > 1:
                    print('\nBEST HYP:')
                    for n in range(opt.n_best):
                        os.write(1, bytes("[%.4f] %s\n" % (predScore[b][n],
                                 " ".join(predBatch[b][n])),
                            'UTF-8'))

    reportScore('PRED', predScoreTotal, predWordsTotal)
    if opt.tgt:
        reportScore('GOLD', goldScoreTotal, goldWordsTotal)

    if opt.dump_beam:
        json.dump(translator.beam_accum,
                  codecs.open(opt.dump_beam, 'w', 'utf-8'))