Example No. 1
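All of the snippets on this page are excerpts from scripts built on the GroundHog NMT codebase, so the module-level imports they rely on are not shown. A minimal sketch of the assumed imports (Python 2; the module paths follow GroundHog's experiments/nmt layout and may differ in other forks):

# Assumed imports for the excerpts below; not part of the original snippets,
# and the exact module paths may differ in your checkout.
import cPickle
import logging
import numpy

from experiments.nmt import RNNEncoderDecoder, prototype_state, \
    parse_input, get_batch_iterator
from experiments.nmt.sample import BeamSearch, sample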
def get_models():
    args = parse_args()

    state_en2fr = prototype_state()
    if hasattr(args, 'state_en2fr') and args.state_en2fr is not None:
        with open(args.state_en2fr) as src:
            state_en2fr.update(cPickle.load(src))
    state_en2fr.update(eval("dict({})".format(args.changes)))

    state_fr2en = prototype_state()
    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        with open(args.state_fr2en) as src:
            state_fr2en.update(cPickle.load(src))
    state_fr2en.update(eval("dict({})".format(args.changes)))

    rng = numpy.random.RandomState(state_en2fr['seed'])
    enc_dec_en_2_fr = RNNEncoderDecoder(state_en2fr, rng, skip_init=True)
    enc_dec_en_2_fr.build()
    lm_model_en_2_fr = enc_dec_en_2_fr.create_lm_model()
    lm_model_en_2_fr.load(args.model_path_en2fr)
    indx_word_src = cPickle.load(open(state_en2fr['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state_en2fr['word_indx_trgt'], 'rb'))

    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        rng = numpy.random.RandomState(state_fr2en['seed'])
        enc_dec_fr_2_en = RNNEncoderDecoder(state_fr2en, rng, skip_init=True)
        enc_dec_fr_2_en.build()
        lm_model_fr_2_en = enc_dec_fr_2_en.create_lm_model()
        lm_model_fr_2_en.load(args.model_path_fr2en)

        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src, indx_word_trgt, state_en2fr,
                lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    else:
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src, indx_word_trgt, state_en2fr,
                None, None, None]
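The returned list can be unpacked positionally; a hypothetical caller (variable names chosen here for illustration):

lm_en2fr, enc_dec_en2fr, w2i_src, w2i_trg, st_en2fr, \
    lm_fr2en, enc_dec_fr2en, st_fr2en = get_models()
if lm_fr2en is None:
    print "Only the en->fr model was loaded"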
Example No. 2
    def __init__(self, args):
        self.args = args
        self.state = prototype_state()
        with open(self.args.state) as src:
            self.state.update(cPickle.load(src))
        self.state.update(eval("dict({})".format(self.args.changes)))

        logging.basicConfig(level=getattr(logging, self.state['level']),
                            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

        rng = numpy.random.RandomState(self.state['seed'])
        enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
        enc_dec.build()
        self.lm_model = enc_dec.create_lm_model()
        self.lm_model.load(self.args.model_path)
        self.indx_word = cPickle.load(open(self.state['word_indx'], 'rb'))

        self.sampler = None
        self.beam_search = None
        if self.args.beam_search:
            self.beam_search = BeamSearch(enc_dec)
            self.beam_search.compile()
        else:
            self.sampler = enc_dec.create_sampler(many_samples=True)

        self.idict_src = cPickle.load(open(self.state['indx_word'], 'r'))
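This __init__ belongs to a translator-style class whose name the excerpt omits; a hypothetical usage, assuming the class is named Translator and that parse_args supplies the fields referenced above:

args = parse_args()  # must provide state, changes, model_path, beam_search
translator = Translator(args)  # 'Translator' is a hypothetical name for the class above
print "beam search ready:", translator.beam_search is not None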
Example No. 3
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    alignment_fun = enc_dec.create_probs_computer(return_alignment=True)

    word_indx_src = cPickle.load(open(state['word_indx'], 'rb'))
    word_indx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    source_file = args.source
    target_file = args.target
    output_file = args.output

    comput_alignment(source_file, target_file, output_file, alignment_fun,
                     word_indx_src, word_indx_trg, state)
Example No. 4
    def __init__(self):
        # parameter settings
        self.arg_state = 'search_state.pkl'
        self.arg_changes = ""
        self.arg_model_path = 'search_model.npz'
        self.arg_beam_search = True
        self.arg_ignore_unk = False
        self.arg_normalize = False

        self.state = prototype_state()
        with open(self.arg_state) as src:
            self.state.update(cPickle.load(src))
        self.state.update(eval("dict({})".format(self.arg_changes)))

        logging.basicConfig(
            level=getattr(logging, self.state['level']),
            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

        rng = numpy.random.RandomState(self.state['seed'])
        self.enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
        self.enc_dec.build()
        self.lm_model = self.enc_dec.create_lm_model()
        self.lm_model.load(self.arg_model_path)
        self.indx_word = cPickle.load(open(self.state['word_indx'], 'rb'))

        self.beam_search = BeamSearch(self.enc_dec)
        self.beam_search.compile()

        self.idict_src = cPickle.load(open(self.state['indx_word'], 'r'))
Example No. 5
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state["level"]), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s"
    )

    server_address = ("", args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state["seed"])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state["word_indx"], "rb"))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state["indx_word"], "r"))

    tokenizer_cmd = [os.getcwd() + "/tokenizer.perl", "-l", "en", "-q", "-"]
    detokenizer_cmd = [os.getcwd() + "/detokenizer.perl", "-l", "fr", "-q", "-"]
    sampler = Sampler(
        state,
        lm_model,
        indx_word,
        idict_src,
        beam_search=beam_search,
        tokenizer_cmd=tokenizer_cmd,
        detokenizer_cmd=detokenizer_cmd,
    )
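    # MTReqHandler (defined elsewhere in this script) presumably reads this
    # sampler attribute off the server instance when handling requests.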
    httpd.sampler = sampler

    print "Server starting.."
    httpd.serve_forever()
Example No. 6
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    server_address = ('', args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-']
    detokenizer_cmd = [
        os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-'
    ]
    sampler = Sampler(state,
                      lm_model,
                      indx_word,
                      idict_src,
                      beam_search=beam_search,
                      tokenizer_cmd=tokenizer_cmd,
                      detokenizer_cmd=detokenizer_cmd)
    httpd.sampler = sampler

    print 'Server starting..'
    httpd.serve_forever()
Example No. 7
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
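The excerpt ends after loading the model weights; a typical continuation builds a scorer or probability computer from the same enc_dec, as in Example No. 10 (a sketch):

compute_probs = enc_dec.create_probs_computer()
# probs = compute_probs(src_seq, trgt_seq), with sequences built by parse_input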
Example No. 8
def main():
    args = parse_args()
    state = prototype_state()

    with open(args.state) as src:
        state.update(cPickle.load(src))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    scoreMaker = ScoreMaker(enc_dec)
    scoreMaker.compile()

    indx_word_src = cPickle.load(open(state['word_indx'],'rb'))
    indx_word_trg = cPickle.load(open(state['word_indx_trgt'],'rb'))

    idict_src = cPickle.load(open(state['indx_word'],'r'))
    idict_trg = cPickle.load(open(state['indx_word_target'],'r'))

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state, 
                                             indx_word_src, 
                                             src_seqin, 
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state, 
                                             indx_word_trg, 
                                             trg_seqin, 
                                             idx2word=idict_trg)
        print "Parsed Input:", src_parsed_in

        scoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg)

    fsrc.close()
    ftrg.close()
Example No. 9
def main():
    args = parse_args()
    state = prototype_state()

    with open(args.state) as src:
        state.update(cPickle.load(src))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    score_maker = ScoreMaker(enc_dec)
    score_maker.compile()

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state,
                                             indx_word_src,
                                             src_seqin,
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state,
                                             indx_word_trg,
                                             trg_seqin,
                                             idx2word=idict_trg)
        print "Parsed Input:", src_parsed_in

        score_maker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg)

    fsrc.close()
    ftrg.close()
Example No. 10
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1  # which means don't sort
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5")
                                  and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                                           args.src,
                                           indx_word_src,
                                           args.trg,
                                           indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout

        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
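                # Corrupt the target: each token becomes a random id below 100
                # with probability args.y_noise.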
                y = batch['y']
                random_words = numpy.random.randint(0, 100,
                                                    y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise,
                                                    y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'], batch['x_mask'],
                              batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >> score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug(
                "{} batches, {} samples, {} per sample; example scores: {}".
                format(count, n_samples, up_time / scores.shape[0],
                       scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state,
                                      indx_word_src,
                                      src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state,
                                       indx_word_trgt,
                                       trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(
                    state,
                    indx_word_src,
                    src_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_source'],
                    null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(
                    state,
                    indx_word_trgt,
                    trgt_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_target'],
                    null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20), alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(
                                word, src_words[j]
                                if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
Example No. 11
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))

    if args.config:
        state.update(eval(open(args.config).read()))

    if args.weights: state['weights'] = args.weights
    if args.lm_file: state['lm_file'] = args.lm_file
    if args.lm_vocab: state['lm_vocab'] = args.lm_vocab
    if args.pt_file: state['phrase_table'] = args.pt_file
    if args.lm_ngram: state['lm_ngram'] = args.lm_ngram

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'], 'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'], 'r'))

    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])

    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')

    start_time = time.time()

    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state,
                                     indx_word,
                                     seqin,
                                     idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(
            lm_model,
            seqin,
            seq,
            n_samples,
            beam_search=beam_search,
            ignore_unk=args.ignore_unk,
            normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))

        if args.nbest:
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            order = numpy.argsort(nbest_costs)
            nbest_trans = numpy.array(nbest_trans)[order]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[order]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[order]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[order]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[order]
            nbest_costs = numpy.array(nbest_costs)[order]

            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
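                # One n-best entry per hypothesis: translation ||| negated feature costs ||| 0.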
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (
                    t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >> sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (
                        t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,
                        -rnn_cost * beam_search.weight_rnn,
                        -sum_lm * beam_search.weight_lm,
                        -pure_tm * beam_search.weight_tm,
                        -sum_tm * beam_search.weight_tm,
                        -sum_wp * beam_search.weight_wp, c)
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))

            print >> ftrans, out_str

        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best],
                                                  ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)

    fsrc.close()
    ftrans.close()
Example No. 12
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    ###########################################################

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in

            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []

            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                    #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)

            if args.beam_search and args.nbest:
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(
                    nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join(
                    "%s | %f" % (t, c)
                    for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str

            print >> ftrans, out_str

            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
Example No. 13
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))

    if args.config:
        state.update(eval(open(args.config).read()))

    if args.weights: state['weights'] = args.weights
    if args.lm_file: state['lm_file'] = args.lm_file
    if args.lm_vocab: state['lm_vocab'] = args.lm_vocab
    if args.pt_file: state['phrase_table'] = args.pt_file
    if args.lm_ngram: state['lm_ngram'] = args.lm_ngram

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))
    idict_src = cPickle.load(open(state['indx_word'],'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'],'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'],'r'))

    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])

    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')

    start_time = time.time()

    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(lm_model, seqin, seq, n_samples,
                beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))

        if args.nbest:
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            order = numpy.argsort(nbest_costs)
            nbest_trans = numpy.array(nbest_trans)[order]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[order]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[order]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[order]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[order]
            nbest_costs = numpy.array(nbest_costs)[order]

            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs, nbest_tm_costs, nbest_costs, nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >> sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (
                        t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,
                        -rnn_cost * beam_search.weight_rnn,
                        -sum_lm * beam_search.weight_lm,
                        -pure_tm * beam_search.weight_tm,
                        -sum_tm * beam_search.weight_tm,
                        -sum_wp * beam_search.weight_wp, c)
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))

            print >>ftrans, out_str

        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best], ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".
                    format((time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)

    fsrc.close()
    ftrans.close()
Example No. 14
            sen = indices_to_words(lm_model.word_indxs, trans[i])
            # sentences.append(" ".join(sen))
            sentences.append(sen)
        for i in range(len(costs)):
            if verbose:
                print "{}: {}".format(costs[i], sentences[i])
        return sentences, costs, alignment

model_path = "path/to/search_model.npz"
state_file = "path/to/search_state.pkl"
states = prototype_state()
with open(state_file) as src:
    states.update(cPickle.load(src))
states.update(eval("dict({})".format("")))

logging.basicConfig(level=getattr(logging, states['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

rng = numpy.random.RandomState(states['seed'])
enc_dec = RNNEncoderDecoder(states, rng, skip_init=True)
enc_dec.build()

lm_models = enc_dec.create_lm_model()
lm_models.load(model_path)
indx_word = cPickle.load(open(states['word_indx'],'rb'))
beam_search = BeamSearch(enc_dec)
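To translate with the objects built above, one would compile the beam search and call the codebase's sample helper; a sketch whose call shapes follow Example No. 16, with a made-up input sentence:

beam_search.compile()

idict_src = cPickle.load(open(states['indx_word'], 'r'))
seqin = "this is a test ."  # hypothetical input
seq, parsed_in = parse_input(states, indx_word, seqin, idx2word=idict_src)
trans, costs, _ = sample(lm_models, seq, 12,
                         beam_search=beam_search,
                         ignore_unk=False, normalize=True)
print trans[numpy.argmin(costs)]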
Example No. 15
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'],'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5") and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                    args.src, indx_word_src, args.trg, indx_word_trgt,
                    state['bs'], raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout

        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
                y = batch['y']
                random_words = numpy.random.randint(0, 100, y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise, y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                    batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >>score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format(
                count, n_samples, up_time/scores.shape[0], scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line, raise_unk=not args.allow_unk, 
                                      unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line, raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(state,
                        indx_word_src, src_line, raise_unk=not args.allow_unk,
                        unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(state,
                        indx_word_trgt, trgt_line, raise_unk=not args.allow_unk,
                        unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20), alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(word,
                                    src_words[j] if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
Example No. 16
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            try:
                best = numpy.argmin(costs)
                print >>ftrans, trans[best]
                total_cost += costs[best]
                if args.verbose:
                    print "Translation:", trans[best]
            except Exception:
                print >>ftrans, "FAIL"
            if (i + 1)  % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)
Example No. 17
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if lm_model.maintain_coverage:
                trans, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            else:
                trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            
            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]

                if lm_model.maintain_coverage: