Example No. 1
def sample_targets(input_phrase, model, n_samples, reverse_score, normalize):
    """Sample candidate translations for input_phrase, keep the 10 best,
    optionally rescore them with the reverse (fr->en) model, and return
    the best translation with its (optionally length-normalized) score."""

    (lm_model, enc_dec, indx_word_src, indx_word_trgt, state,
     lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en) = model

    beam_search = BeamSearch(enc_dec)
    beam_search.compile()
    sampler = enc_dec.create_sampler(many_samples=True)

    # sample_func can also take a normalize (bool) argument
    trans, scores, trans_bin = cached_sample_func(lm_model,
                                                  input_phrase,
                                                  n_samples,
                                                  sampler=sampler,
                                                  beam_search=beam_search)

    # Reorder translations by ascending score
    # Warning: the number of phrases to rescore (10) is hard-coded
    trans = [tra for (sco, tra) in sorted(zip(scores, trans))][0:10]
    trans_bin = [tra_bin
                 for (sco, tra_bin) in sorted(zip(scores, trans_bin))][0:10]
    scores = sorted(scores)[0:10]

    # Reverse-score the selected phrases with the fr->en model
    if reverse_score:
        reverse_scorer = enc_dec_fr_2_en.create_scorer(batch=True)

        source_phrases_to_reverse_score = []
        target_phrases_to_reverse_score = []
        for tra_bin in trans_bin:
            source_phrases_to_reverse_score.append(input_phrase)
            target_phrases_to_reverse_score.append(tra_bin)

        state_fr2en['seqlen'] = 1000
        # Source and target are swapped here: the fr->en scorer treats
        # the forward model's outputs as its inputs.
        x, x_mask, y, y_mask = create_padded_batch(
            state_fr2en, [numpy.asarray(target_phrases_to_reverse_score)],
            [numpy.asarray(source_phrases_to_reverse_score)])

        reverse_scores = -reverse_scorer(
            numpy.atleast_2d(x), numpy.atleast_2d(y), numpy.atleast_2d(x_mask),
            numpy.atleast_2d(y_mask))[0]

        for index in xrange(len(scores)):
            scores[index] = (scores[index] + reverse_scores[index]) / 2.

    # Without reverse scoring, the forward scores are used as-is.

    trans = trans[numpy.argmin(scores)]
    score = numpy.min(scores)

    if normalize:
        # Length-normalize the score by the log length of the source
        final_score = score / numpy.log(len(input_phrase) + 1)
    else:
        final_score = score

    return trans, final_score
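
A minimal sketch of how sample_targets might be driven, assuming both translation directions have already been built and loaded (for instance as in the main() examples below). Every name here comes from the surrounding snippets, and the model list must follow the exact ordering unpacked at the top of the function:

# Hypothetical driver; lm_model/enc_dec (en->fr) and the *_fr_2_en
# counterparts are assumed to be trained models loaded beforehand.
model = [lm_model, enc_dec, indx_word_src, indx_word_trgt, state,
         lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]

# input_phrase is the binarized (word-index) source sentence.
trans, final_score = sample_targets(input_phrase, model,
                                    n_samples=20,
                                    reverse_score=True,
                                    normalize=True)
print trans, final_score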
Example No. 2
def main():
    args = parse_args()

    # this loads the state specified in the prototype
    state = getattr(experiments.nmt, args.proto)()
    # this is based on the suggestion in the README.md in this folder
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    # If we are going to use validation with the bleu script, we
    # will need early stopping
    bleu_validator = None
    if state['bleu_script'] is not None and state['validation_set'] is not None\
        and state['validation_set_grndtruth'] is not None:
        # make beam search
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
        bleu_validator = BleuValidator(state, lm_model, beam_search, verbose=state['output_validation_set'])

    logger.debug("Load data")
    train_data = get_batch_iterator(state)
    logger.debug("Compile trainer")

    algo = eval(state['algo'])(lm_model, state, train_data)
    logger.debug("Run training")

    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    bleu_val_fn=bleu_validator,
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                        if state['hookFreq'] >= 0  # and state['validation_set'] is not None
                        else None)

    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
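
For reference, a hypothetical override file for the .py branch above: since it is consumed with eval(open(args.state).read()), the whole file must evaluate to a single dict expression. The keys shown are ones this example itself reads; they are illustrative, not a complete list:

dict(  # my_overrides.py: the entire file is one dict expression
    seed=4321,
    bleu_script=None,       # disables the BLEU early-stopping branch
    validation_set=None,
    hookFreq=-1,            # disables the RandomSamplePrinter hook
)

The same key=value pairs can be supplied through args.changes, each of which is wrapped in dict(...) and evaluated the same way.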
Example No. 3
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    server_address = ('', args.port)
    httpd = ThreadedHTTPServer(server_address, MTReqHandler)
    #httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-']
    detokenizer_cmd = [
        os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-'
    ]
    # Note: this rebinds `sampler`, shadowing the low-level sampler
    # created above when beam search is disabled.
    sampler = Sampler(state,
                      lm_model,
                      indx_word,
                      idict_src,
                      beam_search=beam_search,
                      tokenizer_cmd=tokenizer_cmd,
                      detokenizer_cmd=detokenizer_cmd)
    httpd.sampler = sampler

    print 'Server starting..'
    httpd.serve_forever()
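
ThreadedHTTPServer is not a standard-library class, so this example assumes it is defined elsewhere in the script; a minimal Python 2 definition, using the usual ThreadingMixIn pattern, would be:

import BaseHTTPServer
from SocketServer import ThreadingMixIn

class ThreadedHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """Serve each request in its own thread, so one slow translation
    request does not block other clients."""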
Example No. 4
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state["level"]), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s"
    )

    server_address = ("", args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state["seed"])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state["word_indx"], "rb"))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state["indx_word"], "r"))

    tokenizer_cmd = [os.getcwd() + "/tokenizer.perl", "-l", "en", "-q", "-"]
    detokenizer_cmd = [os.getcwd() + "/detokenizer.perl", "-l", "fr", "-q", "-"]
    sampler = Sampler(
        state,
        lm_model,
        indx_word,
        idict_src,
        beam_search=beam_search,
        tokenizer_cmd=tokenizer_cmd,
        detokenizer_cmd=detokenizer_cmd,
    )
    httpd.sampler = sampler

    print "Server starting.."
    httpd.serve_forever()
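
Once either server variant is running, it can be exercised with a plain HTTP GET. The /translate path and the source parameter below are placeholders, since the real route and query format are defined by MTReqHandler:

# Hypothetical Python 2 client -- adjust the port, path and parameter
# name to whatever MTReqHandler actually serves.
import urllib
import urllib2

params = urllib.urlencode({'source': 'I have seen the cat .'})
print urllib2.urlopen('http://localhost:8888/translate?' + params).read()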

    """