Exemplo n.º 1
0
def load_scorer(model, option, alignweights=None):
    """Compile a Theano scoring function for one saved model.

    Args:
        model: Path of the saved parameter archive; it is passed to
            ``numpy.load`` and to ``load_params``.
        option: Model options forwarded to ``build_model``.
        alignweights: When truthy, the compiled function additionally
            returns ``opt_ret['dec_alphas']``.

    Returns:
        A ``theano.function`` taking ``[x, x_mask, y, y_mask]`` and returning
        ``cost``, or ``[cost, dec_alphas]`` when ``alignweights`` is truthy.
    """
    # Only the names of the stored arrays are needed here, so the archive is
    # closed as soon as they have been read instead of leaking the handle.
    with numpy.load(model) as archive:
        stored_names = archive.files

    # Entries whose name starts with 'adam_' are skipped; the 0 values are
    # placeholders handed to load_params.
    param_list = dict.fromkeys(
        [key for key in stored_names if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, option)
    inps = [x, x_mask, y, y_mask]
    use_noise.set_value(0.)

    if alignweights:
        logging.debug("Save weight mode ON, alignment matrix will be saved.")
        outputs = [cost, opt_ret['dec_alphas']]
        f_log_probs = theano.function(inps, outputs)
    else:
        f_log_probs = theano.function(inps, cost)

    return f_log_probs
Exemplo n.º 2
0
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose):
    """Append one score per model to every line of an n-best list.

    Each n-best line is expected to look like ``'<idx> ||| <hypothesis> ...'``
    where ``<idx>`` selects a line of ``source_file``.

    Args:
        source_file: Open file with the source sentences.
        nbest_file: Open file with the n-best list.
        saveto: Writable file object receiving the rescored lines.
        models: Model paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalize: Forwarded to ``pred_probs``.
        verbose: Not used by this function.
    """
    trng = RandomStreams(1234)

    def _compile_scorer(model, option):
        # allocate the parameters, overwrite them with the saved values and
        # wrap them in theano shared variables
        tparams = init_tparams(load_params(model, init_params(option)))

        trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost = \
            build_model(tparams, option)
        use_noise.set_value(0.)

        return theano.function([x, x_mask, y, y_mask], cost)

    fs_log_probs = [_compile_scorer(model, option)
                    for model, option in zip(models, options)]

    def _score(pairs):
        # one score collection per model, in model order
        return [pred_probs(scorer, prepare_data, options[k], pairs,
                           normalize=normalize)
                for k, scorer in enumerate(fs_log_probs)]

    source_sentences = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        # Expand the n-best list into two line-aligned temporary files:
        # the source sentence is repeated once per hypothesis.
        for entry in nbest_lines:
            fields = entry.split(' ||| ')
            tmp_in.write(source_sentences[int(fields[0])])
            tmp_out.write(fields[1] + '\n')
        tmp_in.seek(0)
        tmp_out.seek(0)

        first_option = options[0]
        # TODO: sorting by length could be more efficient, but we'd have to
        # synchronize scores with n-best list after
        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             first_option['dictionaries'][0],
                             first_option['dictionaries'][1],
                             n_words_source=first_option['n_words_src'],
                             n_words_target=first_option['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)

        scores = _score(pairs)
        for row, entry in enumerate(nbest_lines):
            joined = ' '.join(str(per_model[row]) for per_model in scores)
            saveto.write('{0} {1}\n'.format(entry.strip(), joined))
Exemplo n.º 3
0
def get_error(model, test_src, test_target):
    """Log the mean validation error of a saved model on a parallel test set.

    Args:
        model: Path of the saved model; the options are read from
            ``'<model>.pkl'`` and the same value is passed to ``load_params``.
        test_src: Source-side data handed to ``TextIterator``.
        test_target: Target-side data handed to ``TextIterator``.

    Returns:
        None. The result is only reported through ``logging.info``.
    """
    profile = False

    # reload options; the context manager guarantees the file is closed
    # NOTE: unpickling runs arbitrary code, so only load trusted model files.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # Dictionary paths are stored relative to model_options['baseDir'].
    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    if len(model_options['dictionaries']) == 1:
        # a single dictionary entry means no separate target dictionary
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src,
                         test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
Exemplo n.º 4
0
def get_error(model, test_src, test_target):
    """Compute the mean validation error of a saved model and log it.

    Args:
        model: Path of the saved model; ``'<model>.pkl'`` holds the pickled
            options and ``model`` itself is passed to ``load_params``.
        test_src: Source-side data handed to ``TextIterator``.
        test_target: Target-side data handed to ``TextIterator``.

    Returns:
        None. The mean error is written with ``logging.info``.
    """
    profile = False

    # reload options (file is closed as soon as the pickle has been read)
    # NOTE: unpickling runs arbitrary code, so only load trusted model files.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # Dictionary paths are resolved against model_options['baseDir'].
    dict_src = os.path.join(model_options['baseDir'], model_options['dictionaries'][0])
    if len(model_options['dictionaries']) == 1:
        # only one dictionary listed: no separate target dictionary
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'], model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data,
                            model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s'% (str(valid_err)))
Exemplo n.º 5
0
def build_alignment_cg(model, options):
    """Compile a Theano function that outputs the decoder alignment weights.

    Args:
        model: Saved model path passed to ``load_params``.
        options: Model options passed to ``init_params`` and ``build_model``.

    Returns:
        A ``theano.function`` taking ``[x, x_mask, y, y_mask]`` and returning
        ``opt_ret['dec_alphas']``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # Allocate the parameters, fill them from the saved model and turn them
    # into theano shared variables.
    initial_params = init_params(options)
    tparams = init_tparams(load_params(model, initial_params))

    # Build the computation graph; build_model hands back its own trng.
    graph = build_model(tparams, options)
    trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost = graph

    alignment_inputs = [x, x_mask, y, y_mask]
    alignment_fn = theano.function(alignment_inputs, opt_ret['dec_alphas'])
    return alignment_fn
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    """Append one score per model to every line of an n-best list.

    Each n-best line is expected to look like ``'<idx> ||| <hypothesis> ...'``
    where ``<idx>`` is the 0-based line number in ``source_file``.

    Args:
        source_file: Open file with the source sentences.
        nbest_file: Open file with the n-best list.
        saveto: Writable file object for the rescored lines; its ``name`` is
            also used to derive the alignment output name.
        models: Model archive paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalize: Forwarded to ``pred_probs``.
        verbose: Not used by this function.
        alignweights: When truthy, alignment weights are computed as well and
            passed on to ``combine_source_target_text``.
    """
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        # Only the array names are needed, so close the archive right away.
        with numpy.load(model) as archive:
            stored_names = archive.files
        # entries whose name starts with 'adam_' are not loaded
        param_list = dict.fromkeys(
            [key for key in stored_names if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    # Temporary file receiving the alignments; only opened in alignment mode.
    align_OUT = None
    if alignweights:
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    try:
        with tempfile.NamedTemporaryFile(
                prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(
                    prefix='rescore-tmpout') as tmp_out:
            # Expand the n-best list into two line-aligned temporary files.
            for line in nbest_lines:
                linesplit = line.split(' ||| ')
                # index from the source file. Starting from 0.
                idx = int(linesplit[0])
                tmp_in.write(lines[idx])
                tmp_out.write(linesplit[1] + '\n')

            tmp_in.seek(0)
            tmp_out.seek(0)
            # The source side gets every dictionary but the last, so the
            # target dictionary must be the last one ([-1]); a fixed [1]
            # would select a source dictionary when more than two are listed.
            # TODO: sorting by length could be more efficient, but we'd have
            # to synchronize scores with n-best list after
            pairs = TextIterator(
                tmp_in.name,
                tmp_out.name,
                options[0]['dictionaries'][:-1],
                options[0]['dictionaries'][-1],
                n_words_source=options[0]['n_words_src'],
                n_words_target=options[0]['n_words'],
                batch_size=b,
                maxlen=float('inf'),
                sort_by_length=False)

            scores, alignments = _score(pairs, alignweights)

            for i, line in enumerate(nbest_lines):
                score_str = ' '.join(map(str, [s[i] for s in scores]))
                saveto.write('{0} {1}\n'.format(line.strip(), score_str))

            # optional save weights mode.
            if alignweights:
                # NOTE(review): `alignments` holds one entry per model and
                # each entry is concatenated with "\n", so pred_probs must
                # return a string here - confirm against pred_probs.
                for line in alignments:
                    align_OUT.write(line + "\n")

        if alignweights:
            combine_source_target_text(source_file, nbest_file, saveto.name,
                                       align_OUT)
    finally:
        # Close (and thereby delete) the temporary file even when scoring
        # or combining raised.
        if align_OUT is not None:
            align_OUT.close()
Exemplo n.º 7
0
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):
    """Score every target line against its source line with each model.

    Args:
        source_file: Open source-side file; its ``name`` is given to
            ``TextIterator`` and the object is re-read afterwards.
        target_file: Open target-side file, used the same way.
        saveto: Writable file object for the scores; its ``name`` is also
            used to derive the alignment output name.
        models: Model archive paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalization_alpha: Forwarded to ``pred_probs``.
        verbose: When truthy, each output line starts with the target
            sentence followed by the scores; otherwise only the scores are
            written.
        alignweights: When truthy, alignment weights are computed as well and
            passed on to ``combine_source_target_text_1to1``.
    """
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        # Only the array names are needed, so close the archive right away.
        with numpy.load(model) as archive:
            stored_names = archive.files
        # entries whose name starts with 'adam_' are not loaded
        param_list = dict.fromkeys(
            [key for key in stored_names if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug(
                "Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    # Source side uses every dictionary but the last; the last one is the
    # target dictionary.
    # TODO: sorting by length could be more efficient, but we'd want to
    # resort after
    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False)

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        # one score per model for target line i, space separated
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    # optional save weights mode.
    if alignweights:
        # writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # The original iterated over `all_alignments`, a name that is
            # never bound in this function (NameError); `alignments` is the
            # value actually returned by _score.
            # NOTE(review): each entry is concatenated with "\n", so
            # pred_probs must return a string per model - confirm.
            for line in alignments:
                align_OUT.write(line + "\n")
            # combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Exemplo n.º 8
0
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights):
    """Append one score per model to every line of an n-best list.

    Each n-best line is expected to look like ``'<idx> ||| <hypothesis> ...'``
    where ``<idx>`` is the 0-based line number in ``source_file``.

    Args:
        source_file: Open file with the source sentences.
        nbest_file: Open file with the n-best list.
        saveto: Writable file object for the rescored lines; its ``name`` is
            also used to derive the alignment output name.
        models: Model paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalize: Forwarded to ``pred_probs``.
        verbose: Not used by this function.
        alignweights: When truthy, alignment weights are computed as well and
            passed on to ``combine_source_target_text``.
    """
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    # Temporary file receiving the alignments; only opened in alignment mode.
    align_OUT = None
    if alignweights:
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    try:
        with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
            # Expand the n-best list into two line-aligned temporary files.
            for line in nbest_lines:
                linesplit = line.split(' ||| ')
                idx = int(linesplit[0])   # index from the source file. Starting from 0.
                tmp_in.write(lines[idx])
                tmp_out.write(linesplit[1] + '\n')

            tmp_in.seek(0)
            tmp_out.seek(0)
            # The source side gets every dictionary but the last, so the
            # target dictionary must be the last one ([-1]); a fixed [1]
            # would select a source dictionary when more than two are listed.
            # TODO: sorting by length could be more efficient, but we'd have
            # to synchronize scores with n-best list after
            pairs = TextIterator(tmp_in.name, tmp_out.name,
                                 options[0]['dictionaries'][:-1], options[0]['dictionaries'][-1],
                                 n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'],
                                 batch_size=b,
                                 maxlen=float('inf'),
                                 sort_by_length=False)

            scores, alignments = _score(pairs, alignweights)

            for i, line in enumerate(nbest_lines):
                score_str = ' '.join(map(str, [s[i] for s in scores]))
                saveto.write('{0} {1}\n'.format(line.strip(), score_str))

            # optional save weights mode.
            if alignweights:
                # NOTE(review): `alignments` holds one entry per model and
                # each entry is concatenated with "\n", so pred_probs must
                # return a string here - confirm against pred_probs.
                for line in alignments:
                    align_OUT.write(line + "\n")

        if alignweights:
            combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT)
    finally:
        # Close (and thereby delete) the temporary file even when scoring
        # or combining raised.
        if align_OUT is not None:
            align_OUT.close()
Exemplo n.º 9
0
def rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights):
    """Score every target line against its source line with each model.

    Args:
        source_file: Open source-side file; its ``name`` is given to
            ``TextIterator`` and the object is re-read afterwards.
        target_file: Open target-side file, used the same way.
        saveto: Writable file object for the scored lines; its ``name`` is
            also used to derive the alignment output name.
        models: Model paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalize: Forwarded to ``pred_probs``.
        verbose: Not used by this function.
        alignweights: When truthy, alignment weights are computed as well and
            passed on to ``combine_source_target_text_1to1``.
    """
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    # The source side gets every dictionary but the last, so the target
    # dictionary must be the last one ([-1]); a fixed [1] would select a
    # source dictionary when more than two are listed.
    # TODO: sorting by length could be more efficient, but we'd want to
    # resort after
    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][:-1], options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        # one score per model for target line i, space separated
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    # optional save weights mode.
    if alignweights:
        # writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # The original iterated over `all_alignments`, a name that is
            # never bound in this function (NameError); `alignments` is the
            # value actually returned by _score.
            # NOTE(review): each entry is concatenated with "\n", so
            # pred_probs must return a string per model - confirm.
            for line in alignments:
                align_OUT.write(line + "\n")
            # combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)
Exemplo n.º 10
0
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    """Score every target line against its source line with each model.

    Args:
        source_file: Open source-side file; its ``name`` is given to
            ``TextIterator`` and the object is re-read afterwards.
        target_file: Open target-side file, used the same way.
        saveto: Writable file object for the scored lines; its ``name`` is
            also used to derive the alignment output name.
        models: Model paths, paired element-wise with ``options``.
        options: One option dict per model; the first one supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: Batch size handed to ``TextIterator``.
        normalize: Forwarded to ``pred_probs``.
        verbose: Not used by this function.
        alignweights: When truthy, the compiled functions also output
            ``opt_ret['dec_alphas']`` and the alignment branch at the end of
            the function is taken.
    """

    trng = RandomStreams(1234)

    # one compiled scoring function per model, in model order
    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            print "\t*** Save weight mode ON, alignment matrix will be saved."
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            print "\t*** Save weight mode OFF, alignment matrix will not be saved."
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        # NOTE(review): only the scores are returned; whatever pred_probs
        # produces for alignweights=True ends up inside `scores` unchanged -
        # confirm the return shape of pred_probs.
        scores = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score_this_batch = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score_this_batch)

        return scores

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][0],
        options[0]['dictionaries'][1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after

    scores = _score(pairs, alignweights)

    # rewind both files so they can be read again from the start
    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        # one score per model for target line i, space separated
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # NOTE(review): `all_alignments` is not assigned anywhere in this
            # function; unless it is a module-level name this branch raises
            # NameError - verify where the alignments should come from.
            for line in all_alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Exemplo n.º 11
0
def main(model,
         pklmodel,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
         dictionaries=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         result_file='./cost.result'):
    """Evaluate a saved model on a validation set and write both mean costs.

    Args:
        model: Saved parameter file passed to ``load_params``.
        pklmodel: Path of the pickled model options.
        valid_datasets: Two paths (source, target) of the validation data.
        dictionaries: Pickled word dictionaries; the first two are handed to
            the data iterator.
        dictionary_chunk: Pickled dictionary of chunk labels.
        result_file: Path of the output file; it receives one line holding
            the mean of ``cost`` and the mean of ``cost_cw``.
    """
    # Load every word dictionary together with its inverse mapping.
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for position, dict_path in enumerate(dictionaries):
        with open(dict_path, 'rb') as handle:
            worddicts[position] = pkl.load(handle)
        worddicts_r[position] = dict(
            (index, word)
            for word, index in worddicts[position].iteritems())

    # Same for the chunk label dictionary.
    with open(dictionary_chunk, 'rb') as handle:
        worddict_chunk = pkl.load(handle)
    worddict_r_chunk = dict(
        (index, label) for label, index in worddict_chunk.iteritems())
    print(worddict_chunk)

    print('load model model_options')
    with open('%s' % pklmodel, 'rb') as handle:
        options = pkl.load(handle)

    # Validation data iterator.
    source_path, target_path = valid_datasets[0], valid_datasets[1]
    valid = TrainingTextIterator(source_path, target_path,
                                 dictionaries[0], dictionaries[1],
                                 dictionary_chunk,
                                 n_words_source=options['n_words_src'],
                                 n_words_target=options['n_words'],
                                 batch_size=options['batch_size'],
                                 max_chunk_len=options['maxlen_chunk'],
                                 max_word_len=options['maxlen_chunk_words'])

    # Allocate the parameters, fill them from the saved model and wrap them
    # in theano shared variables.
    tparams = init_tparams(load_params(model, init_params(options)))

    (trng, use_noise,
     x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator,
     opt_ret,
     cost, cost_cw) = build_model(tparams, options)

    inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator]

    # before any regularizer
    print('Building f_log_probs...'),
    f_log_probs = theano.function(inps, cost, profile=False)
    f_log_probs_cw = theano.function(inps, cost_cw, profile=False)
    print('Done')

    valid_errs, valid_errs_cw = pred_probs(
        f_log_probs, f_log_probs_cw, prepare_training_data, options, valid)

    valid_err = valid_errs.mean()
    valid_err_cw = valid_errs_cw.mean()

    with open(result_file, 'w') as report:
        report.write('%s %s\n' % (valid_err, valid_err_cw))
Exemplo n.º 12
0
def main(model, dictionary, dictionary_target, source, target, outfile,
         wordbyword):
    """Score a parallel corpus with a saved model and store the costs.

    Args:
        model: Saved model path; options are read from ``'<model>.pkl'`` and
            the same value is passed to ``load_params``.
        dictionary: Source dictionary handed to ``TextIterator``.
        dictionary_target: Target dictionary handed to ``TextIterator``.
        source: Source-side data handed to ``TextIterator``.
        target: Target-side data handed to ``TextIterator``.
        outfile: Destination of the results.
        wordbyword: When truthy, the second cost returned by ``build_model``
            is scored with ``as_list=True`` and the result is pickled to
            ``outfile``; otherwise the first cost is scored and saved with
            ``numpy.save``.
    """
    # load model model_options
    with open('%s.pkl' % model, 'rb') as options_file:
        options = pkl.load(options_file)

    # The inverted source/target word dictionaries are not needed for
    # scoring, so nothing is loaded from `dictionary`/`dictionary_target`
    # here; the paths go straight to the iterator.
    valid_noshuf = TextIterator(
        source, target,
        dictionary, dictionary_target,
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words'],
        batch_size=options['valid_batch_size'],
        maxlen=2000,
        shuffle=False)

    # Allocate the parameters, fill them from the saved model and wrap them
    # in theano shared variables.
    tparams = init_tparams(load_params(model, init_params(options)))

    (trng, use_noise,
     x, x_mask, y, y_mask,
     opt_ret,
     cost, cost_) = build_model(tparams, options)

    inps = [x, x_mask, y, y_mask]

    # `profile` is not defined in this function; it is taken from the
    # enclosing module scope.
    if not wordbyword:
        f_log_probs = theano.function(inps, cost, profile=profile)
        valid_errs = pred_probs(f_log_probs, prepare_data, options,
                                valid_noshuf, verbose=True)
        numpy.save(outfile, valid_errs)
        return

    f_log_probs = theano.function(inps, cost_, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data, options,
                            valid_noshuf, verbose=True, as_list=True)
    with open(outfile, 'wb') as dump_file:
        pkl.dump(valid_errs, dump_file, pkl.HIGHEST_PROTOCOL)