Example #1
File: score.py  Project: tomekd/nematus
# Imports used directly by this excerpt; the remaining helpers (load_scorer,
# pred_probs, prepare_data, TextIterator, combine_source_target_text_1to1)
# come from the surrounding nematus code base.
import tempfile

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams


def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # score each sentence pair with every model in the ensemble
        scores = []
        alignments = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model, options[i], alignweights=alignweights)
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalization_alpha=normalization_alpha,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to re-sort afterwards

    scores, alignments = _score(pairs, alignweights)

    # rewind the files so they can be re-read below
    source_file.seek(0)
    target_file.seek(0)
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    # optional save weights mode.
    if alignweights:
        # write out the alignments (one JSON line per sentence pair, per model).
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for model_alignments in alignments:
                for line in model_alignments:
                    align_OUT.write(line + "\n")
            # combine the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
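
How this might be invoked: the sketch below is not part of the source. It assumes, as the nematus CLI does, that each model `m` keeps its training options in a sidecar JSON file `m + '.json'`, and that `saveto` is a writable file object; the file names are illustrative only.

import json

models = ['model.npz']
options = []
for m in models:
    with open(m + '.json') as f:
        options.append(json.load(f))

with open('test.src') as source_file, \
     open('test.trg') as target_file, \
     open('test.scores', 'w') as saveto:
    rescore_model(source_file, target_file, saveto, models, options,
                  b=80, normalization_alpha=0.0, verbose=False,
                  alignweights=False)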
Example #2
def multi_rescore_model(source_file,
                        target_file,
                        savetos,
                        models,
                        options,
                        b,
                        normalization_alpha,
                        verbose,
                        alignweights,
                        extra_sources=None,
                        per_word=False):

    # avoid the mutable-default-argument pitfall
    if extra_sources is None:
        extra_sources = []

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # score each sentence tuple with every model in the ensemble
        scores = []
        costs_per_word = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model,
                                      options[i],
                                      alignweights=alignweights)
            score, all_alignments, cost_per_word = multi_pred_probs(
                f_log_probs,
                prepare_multi_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)

            scores.append(score)
            costs_per_word.append(cost_per_word)

        # note: all_alignments holds only the alignments of the last model
        # in the ensemble
        return scores, tuple(all_alignments), costs_per_word

    # list of sources + target sentences (target sentences are the final list)
    # TODO: make TextIterator generic
    sents = TextIterator(source_file.name,
                         target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False,
                         extra_sources=[ss.name for ss in extra_sources])
    # TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, all_alignments, costs_per_word = _score(sents, alignweights)

    # rewind the input files so they can be re-read below
    source_file.seek(0)
    for ss in extra_sources:
        ss.seek(0)

    target_file.seek(0)
    target_lines = target_file.readlines()

    # print out scores for each translation
    for i, line in enumerate(target_lines):
        if per_word:
            # per-token costs of the first model, truncated to the number of
            # target tokens (+1, presumably for the end-of-sentence symbol)
            n_tokens = len(line.split(" ")) + 1
            score_str = ' '.join(map(str, costs_per_word[0][i][:n_tokens]))
        else:
            score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            savetos[0].write('{0} '.format(line.strip()))
        savetos[0].write('{0}\n'.format(score_str))

    # optional save weights mode.
    if alignweights:
        for i, alignments in enumerate(all_alignments):
            # write out the alignments for input stream i.
            temp_name = savetos[i].name + str(i) + ".json"
            with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
                for line in alignments:
                    align_OUT.write(line + "\n")
                # combine the actual source and target words.
                if i == 0:
                    tmp_srcfile = source_file
                else:
                    tmp_srcfile = extra_sources[i - 1]
                combine_source_target_text_1to1(tmp_srcfile,
                                                target_file,
                                                savetos[i].name,
                                                align_OUT,
                                                suffix=str(i))
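
Example #2 generalizes example #1 to several input streams (a source file plus auxiliary context files) and can emit one cost per target word. A hedged invocation sketch follows; the file names are illustrative, and `models`/`options` are assumed to be set up as in the sketch after example #1.

with open('test.src') as src, \
     open('test.ctx') as ctx, \
     open('test.trg') as trg, \
     open('test.scores', 'w') as out:
    multi_rescore_model(src, trg, [out], models, options,
                        b=80, normalization_alpha=0.0, verbose=False,
                        alignweights=False,
                        extra_sources=[ctx],
                        per_word=True)  # one cost per target token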
Example #3
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug(
                "Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # score each sentence pair with every model in the ensemble
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to re-sort afterwards

    scores, alignments = _score(pairs, alignweights)

    # rewind the files so they can be re-read below
    source_file.seek(0)
    target_file.seek(0)
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    # optional save weights mode.
    if alignweights:
        # write out the alignments (one JSON line per sentence pair, per model).
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for model_alignments in alignments:
                for line in model_alignments:
                    align_OUT.write(line + "\n")
            # combine the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
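
Example #3 builds the Theano scoring functions inline; that is exactly the work `load_scorer` hides in examples #1 and #2. Below is one possible factoring of that helper, reconstructed from the inline code above. It is a sketch, not necessarily the project's actual definition.

def load_scorer(model, option, alignweights=False):
    # load model parameters, dropping optimizer state, and set up
    # theano shared variables
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    # rebuild the computation graph and disable dropout noise
    trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost = \
        build_model(tparams, option)
    inps = [x, x_mask, y, y_mask]
    use_noise.set_value(0.)

    if alignweights:
        # also return the decoder attention weights
        return theano.function(inps, [cost, opt_ret['dec_alphas']])
    return theano.function(inps, cost)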