Example #1
    def sample(self, sentence, ignore_unk=False):
        if self.tokenizer_cmd:
            tokenizer = Popen(self.tokenizer_cmd,
                              stdin=PIPE,
                              stdout=PIPE,
                              shell=True)
            sentence, _ = tokenizer.communicate(sentence)
        seq, parsed_in = parse_input(self.state,
                                     self.indx_word,
                                     sentence,
                                     idx2word=self.idict_src)
        # Sample a translation and detokenize it
        trans, cost, _ = sample(self.lm_model,
                                seq,
                                10,
                                beam_search=self.beam_search,
                                normalize=True,
                                ignore_unk=ignore_unk)
        if self.detokenizer_cmd:
            detokenizer = Popen(self.detokenizer_cmd,
                                stdin=PIPE,
                                stdout=PIPE,
                                shell=True)
            detokenized_sentence, _ = detokenizer.communicate(trans[0])
        else:
            detokenized_sentence = trans[0]

        unknown_words = [
            word for word, index in zip(sentence.split(), seq) if index == 1
        ]
        return detokenized_sentence, unknown_words
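The wrapper above just chains GroundHog's parse_input and sample helpers around external (de)tokenizer subprocesses. A minimal sketch of the same translate-one-sentence flow without the subprocess plumbing, assuming parse_input and sample are imported from GroundHog's sampling utilities and that state, lm_model, beam_search, indx_word and idict_src have been built as in the later examples (the function name and the beam width of 10 simply mirror the snippet above):

import numpy

def translate_sentence(sentence, state, lm_model, beam_search, indx_word, idict_src):
    # Map the tokenized source sentence to word indices; index 1 marks OOV words.
    seq, parsed_in = parse_input(state, indx_word, sentence, idx2word=idict_src)
    # Beam-search translations; trans and cost are parallel lists of hypotheses.
    trans, cost, _ = sample(lm_model, seq, 10,
                            beam_search=beam_search,
                            normalize=True, ignore_unk=False)
    best = numpy.argmin(cost)
    # Source words that were mapped to index 1 are unknown to the vocabulary.
    unknown = [w for w, idx in zip(sentence.split(), seq) if idx == 1]
    return trans[best], unknown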
Example #2
def main():
    args = parse_args()
    state = prototype_state()

    with open(args.state) as src:
        state.update(cPickle.load(src))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    scoreMaker = ScoreMaker(enc_dec)
    scoreMaker.compile()

    indx_word_src = cPickle.load(open(state['word_indx'],'rb'))
    indx_word_trg = cPickle.load(open(state['word_indx_trgt'],'rb'))

    idict_src = cPickle.load(open(state['indx_word'],'r'))
    idict_trg = cPickle.load(open(state['indx_word_target'],'r'))

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state, 
                                             indx_word_src, 
                                             src_seqin, 
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state, 
                                             indx_word_trg, 
                                             trg_seqin, 
                                             idx2word=idict_trg)
        print "Parsed Input:", src_parsed_in

        scoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg)

    fsrc.close()
    ftrg.close()
Example #3
def main():
    args = parse_args()
    state = prototype_state()

    with open(args.state) as src:
        state.update(cPickle.load(src))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    score_maker = ScoreMaker(enc_dec)
    score_maker.compile()

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state,
                                             indx_word_src,
                                             src_seqin,
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state,
                                             indx_word_trg,
                                             trg_seqin,
                                             idx2word=idict_trg)
        print "Parsed Input:", src_parsed_in

        score_maker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg)

    fsrc.close()
    ftrg.close()
Example #4
File: score.py Project: chagge/GroundHog
 def next(self):
     seqs = []
     try:
         while len(seqs) < self.batch_size:
             line = next(self.txt_file).strip()
             seq, _ = parse_input(self.state, self.indx, line, raise_unk=self.raise_unk)
             seqs.append(seq)
         return self._pack(seqs)
     except StopIteration:
         if not seqs:
             raise StopIteration()
         return self._pack(seqs)
Example #5
 def next(self):
     seqs = []
     try:
         while len(seqs) < self.batch_size:
             line = next(self.txt_file).strip()
             seq, _ = parse_input(self.state, self.indx, line, raise_unk=self.raise_unk)
             seqs.append(seq)
         return self._pack(seqs)
     except StopIteration:
         if not seqs:
             raise StopIteration()
         return self._pack(seqs)
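Both copies of next() above implement the same batching contract: accumulate up to batch_size parsed sequences, return them packed, and let StopIteration escape only when the text file is exhausted. A minimal sketch of consuming such an iterator (the it and score_batch names are illustrative only, not part of GroundHog):

while True:
    try:
        batch = it.next()      # packed batch of up to batch_size index sequences
    except StopIteration:
        break                  # only raised once no sequences remain
    score_batch(batch)         # placeholder for the per-batch scoring work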
Example #6
    def sample(self, seqin, n_samples, alpha=None):
        try:
            seq, parsed_in = parse_input(self.state, self.indx_word, seqin, idx2word=self.idict_src)
            print "Parsed Input:", parsed_in

            (sentences, cost, _) = sample(self.lm_model, seq, n_samples, sampler=self.sampler,
                beam_search=self.beam_search,
                ignore_unk=self.args.ignore_unk, normalize=self.args.normalize,
                alpha=alpha, verbose=True)
            return sentences, cost
        except Exception:
            print "Exception while parsing your input:"
            traceback.print_exc()
            return None, None
Example #7
    def sample(self, seqin, n_samples, alpha=None):
        try:
            seq, parsed_in = parse_input(self.state, self.indx_word, seqin, idx2word=self.idict_src)
            print "Parsed Input:", parsed_in

            (sentences, cost, _) = sample(self.lm_model, seq, n_samples, sampler=self.sampler,
                beam_search=self.beam_search,
                ignore_unk=self.args.ignore_unk, normalize=self.args.normalize,
                alpha=alpha, verbose=True)
            return sentences, cost
        except Exception:
            print "Exception while parsing your input:"
            traceback.print_exc()
            return None, None
Example #8
def translate(source):
    # source = "哈 哈 哈"
    # languages = "zh-en"
    start_time = time.time()

    # n_samples = args.beam_size
    n_samples = 12
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    seqin = source.strip()
    print source
    seq, parsed_in = parse_input(states, indx_word, seqin, idx2word=idict_src)
    trans, costs, alignment = sample(lm_models, seq, n_samples, beam_search=beam_search, ignore_unk=False, normalize=False)
    best = numpy.argmin(costs)
    print type(trans[best])
    return trans[best], alignment
Example #9
    def sample(self, sentence, ignore_unk=False, beamwidth=10):
        if self.tokenizer_cmd:
            tokenizer = Popen(self.tokenizer_cmd, stdin=PIPE, stdout=PIPE)
            sentence, _ = tokenizer.communicate(sentence)
        seq, parsed_in = parse_input(self.state, self.indx_word, sentence, idx2word=self.idict_src)
        # Sample a translation and detokenize it
        trans, cost, _ = sample(
            self.lm_model, seq, beamwidth, beam_search=self.beam_search, normalize=True, ignore_unk=ignore_unk
        )
        if self.detokenizer_cmd:
            detokenizer = Popen(self.detokenizer_cmd, stdin=PIPE, stdout=PIPE)
            detokenized_sentence, _ = detokenizer.communicate(trans[0])
        else:
            detokenized_sentence = trans[0]

        unknown_words = [word for word, index in zip(sentence.split(), seq) if index == 1]
        return detokenized_sentence, unknown_words
Example #10
    def getSamples(self, seqori, k):
        # split the sentence
        seqin = ""
        for i in range(0, len(seqori), 3):
            w = seqori[i:i + 3]
            seqin = seqin + w + " "

        print "split seq:#%s#" % (seqin)
        #return

        seq, parsed_in = parse_input(self.state,
                                     self.indx_word,
                                     seqin,
                                     idx2word=self.idict_src)

        ans, align, rester, updater = self.sample(seq, k)

        return ans, align, rester, updater
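The loop above rebuilds seqori as space-separated 3-character chunks before handing it to parse_input. An equivalent, more compact form of that split, shown purely as an illustration (it omits the trailing space, which parse_input does not need):

seqin = " ".join(seqori[i:i + 3] for i in range(0, len(seqori), 3))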
Example #11
    def getRep(self, seqori):
        seqin = ""
        for i in range(0, len(seqori), 3):
            w = seqori[i:i + 3]
            seqin = seqin + w + " "

        print "split seq:#%s#" % (seqin)

        seq, parsed_in = parse_input(self.state,
                                     self.indx_word,
                                     seqin,
                                     idx2word=self.idict_src)
        rep = self.beam_search.search(seq,
                                      20,
                                      ignore_unk=self.arg_ignore_unk,
                                      minlen=len(seq) / 2,
                                      getRep=True)

        return rep
Example #12
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))

    if args.config:
        state.update(eval(open(args.config).read()))

    if args.weights: state['weights'] = args.weights
    if args.lm_file: state['lm_file'] = args.lm_file
    if args.lm_vocab: state['lm_vocab'] = args.lm_vocab
    if args.pt_file: state['phrase_table'] = args.pt_file
    if args.lm_ngram: state['lm_ngram'] = args.lm_ngram

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'], 'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'], 'r'))

    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])

    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')

    start_time = time.time()

    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state,
                                     indx_word,
                                     seqin,
                                     idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(
            lm_model,
            seqin,
            seq,
            n_samples,
            beam_search=beam_search,
            ignore_unk=args.ignore_unk,
            normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))

        if args.nbest:
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(
                nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(
                nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))

            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (
                    t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\
                                                            -rnn_cost * beam_search.weight_rnn, \
                                                            -sum_lm * beam_search.weight_lm, \
                                                            -pure_tm * beam_search.weight_tm, \
                                                            -sum_tm * beam_search.weight_tm, \
                                                            -sum_wp * beam_search.weight_wp, c)
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))

            print >> ftrans, out_str

        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best],
                                                  ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)

    fsrc.close()
    ftrans.close()
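In the n-best branch above, numpy.argsort(nbest_costs) is recomputed for every array being reordered. A sketch of the same reordering with the permutation computed once; the result is identical, so this is only a tidiness note:

order = numpy.argsort(nbest_costs)
nbest_trans = numpy.array(nbest_trans)[order]
nbest_lm_costs = numpy.array(nbest_lm_costs)[order]
nbest_tm_costs = numpy.array(nbest_tm_costs)[order]
nbest_unk_nums = numpy.array(nbest_unk_nums)[order]
nbest_rnn_costs = numpy.array(nbest_rnn_costs)[order]
nbest_costs = numpy.array(nbest_costs)[order]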
Example #13
File: train.py Project: Blues5/GroundHog
    def __call__(self):
        """
        Opens the file for the validation set and creates a subprocess 
        for the multi-bleu script. 

        Returns a boolean indicating whether the current model should
        be saved. 
        """

        print "Started Validation: "
        val_start_time = time.time()
        fsrc = open(self.state['validation_set'], 'r')
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        if self.verbose:
            ftrans = open(self.state['validation_set_out'], 'w')

        for i, line in enumerate(fsrc):
            """
            Load the sentence, retrieve the sample, write to file
            """
            if self.state['source_encoding'] == 'utf8':
                seqin = line.strip().decode('utf-8')
            else:
                seqin = line.strip()
            seq, parsed_in = parse_input(self.state,
                                         self.indx_word,
                                         seqin,
                                         idx2word=self.idict_src)

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, _ = sample(self.lm_model,
                                     seq,
                                     self.n_samples,
                                     beam_search=self.beam_search,
                                     ignore_unk=self.ignore_unk,
                                     normalize=self.normalize)
            try:
                best = numpy.argmin(costs)
                total_cost += costs[best]
                trans_out = trans[best]
            except ValueError:
                print "Could not fine a translation for line: {}".format(i + 1)
                trans_out = u'UNK' if self.state[
                    'target_encoding'] == 'utf8' else 'UNK'

            # Write to subprocess and file if it exists
            if self.state['target_encoding'] == 'utf8':
                print >> mb_subprocess.stdin, trans_out.encode('utf8').replace(
                    " ", "")
                if self.verbose:
                    print >> ftrans, trans_out.encode('utf8').replace(" ", "")
            else:
                print >> mb_subprocess.stdin, trans_out
                if self.verbose:
                    print >> ftrans, trans_out

            if i != 0 and i % 50 == 0:
                print "Translated {} lines of validation set...".format(i)
            mb_subprocess.stdin.flush()

        print "Total cost of the validation: {}".format(total_cost)
        fsrc.close()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        out_parse = re.match(r'BLEU = [-.0-9]+',
                             mb_subprocess.stdout.readline())
        print "Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.)
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        print bleu_score
        mb_subprocess.terminate()

        # Determine whether or not we should save
        if self.best_bleu < bleu_score:
            self.best_bleu = bleu_score
            return True
        return False
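The validation hook above pipes one translation per line into the multi-bleu script and reads the leading "BLEU = ..." figure back from its output. A stripped-down sketch of that scoring pattern on its own; the command and reference path are placeholders (in the hook the full command arrives pre-built as self.multibleu_cmd):

import re
from subprocess import Popen, PIPE

def bleu_of(translations, ref_path, multibleu_cmd=['perl', 'multi-bleu.perl']):
    # Feed one hypothesis per line to multi-bleu and parse its report line.
    proc = Popen(multibleu_cmd + [ref_path], stdin=PIPE, stdout=PIPE)
    for line in translations:
        print >> proc.stdin, line
    proc.stdin.close()
    report = proc.stdout.readline()
    proc.terminate()
    match = re.match(r'BLEU = [-.0-9]+', report)
    return float(match.group()[6:]) if match else None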
Example #14
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f) # Load dictionary (source word index : list of target word indices)
    if args.less_transfer:
        for elt in topn:
            topn[elt] = topn[elt][:args.num_ttables] # Take the first args.num_ttables only
    else:
        for elt in topn:
            topn[elt] = set(topn[elt][:args.num_ttables]) # Take the first args.num_ttables only and convert list to set

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value())

        # On GPU, this will free memory for the next models
        # Additional gains could be made by rolling the source vocab
        lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1), dtype=numpy.float32))

    indx_word = cPickle.load(open(state['word_indx'],'rb')) #Source w2i

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError
        #sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r')) #Source i2w
    
    original_target_i2w = lm_models[0].word_indxs.copy()
    # I don't think that we need target_word2index

    max_words = len(original_b_dec_deep_softmax[0])
    
    if args.less_transfer:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict() # Up to now
        D = OrderedDict() # Full
        C = OrderedDict() # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False

        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'],state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]: # Exclude the EOL token
                    if elt != 1: # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt]) # Add topn best unigram translations for each source word
                output = update_dicts(indices, d, D, C, args.num_common)
                if args.change_every > 0 and (i % args.change_every) == 0 and i > 0:
                    output = True
                if output:
                    D_dict[prev_line] = D.copy() # Save dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict() # Full
                        C = OrderedDict() # Allowed to reject
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common) # Assumes you cannot fill d with only 1 line
            D_dict[prev_line] = D.copy()

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) # seq is the ndarray of indices
            # For now, keep all input words in the model.
            # In the future, we may want to filter them to save on memory, but this isn't really much of an issue now
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
                if i in D_dict:
                    indices = D_dict[i].keys()
                    eos_id = indices.index(state['null_sym_target']) # Find new eos and unk positions
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([(k, original_target_i2w[index]) for k, index in enumerate(indices)]) # target index2word
                trans, costs, _ = sample(lm_models[0], seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize,
                        normalize_p=args.normalize_p, eos_id=eos_id, unk_id=unk_id, final=True, wp=args.wp)
            else:
                # Extract the indices you need
                indices = set()
                for elt in seq[:-1]: # Exclude the EOL token
                    if elt != 1: # Exclude OOV (1 will not be a key of topn)
                        indices = indices.union(topn[elt]) # Add topn best unigram translations for each source word
                num_common_words = args.num_common
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False

                    if args.final: # No matter the number of words
                        final = True
                    indices = indices.union(set(xrange(num_common_words))) # Add common words
                    indices = list(indices) # Convert back to list for advanced indexing
                    eos_id = indices.index(state['null_sym_target']) # Find new eos and unk positions
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([(k, original_target_i2w[index]) for k, index in enumerate(indices)]) # target index2word

                    try:
                        trans, costs, _ = sample(lm_models[0], seq, n_samples, sampler=sampler,
                                beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize,
                                normalize_p=args.normalize_p, eos_id=eos_id, unk_id=unk_id, final=final)
                        break # Breaks only if it succeeded (If final=True, will always succeed)
                    except RuntimeError:
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >>ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >>ftrans, str(i+args.start) + ' ||| ' + trans[elt] + ' ||| ' + str(costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1)  % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
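The key trick in the example above is "rolling" the target vocabulary: the full decoder embedding and softmax parameters are stashed once, and before each chunk of input only the rows and columns for the currently allowed target indices are loaded back into the shared variables. A toy numpy sketch of that selection step; the shapes and names below are illustrative stand-ins, not GroundHog code:

import numpy

vocab_size, emb_dim, hid_dim = 30000, 620, 1000
W_emb_full = numpy.random.randn(vocab_size, emb_dim).astype('float32')   # stand-in for W_0_dec_approx_embdr
W_out_full = numpy.random.randn(hid_dim, vocab_size).astype('float32')   # stand-in for W2_dec_deep_softmax
b_out_full = numpy.zeros(vocab_size, dtype='float32')                    # stand-in for b_dec_deep_softmax

indices = [0, 1, 57, 203, 4096]        # allowed target word ids for this chunk (includes <eos> and UNK)
W_emb_small = W_emb_full[indices]      # select embedding rows
W_out_small = W_out_full[:, indices]   # select softmax columns
b_out_small = b_out_full[indices]
eos_id = indices.index(0)              # positions of <eos> and UNK inside the reduced vocabulary
unk_id = indices.index(1)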
Example #15
def process_sentence(source_sentence, model, max_phrase_length, n_samples,
                     copy_UNK_words, add_period, normalize, reverse_score):

    #Setting up comp_score function
    logger.debug("setting up comp_score function")
    [lm_model, enc_dec, indx_word_src, indx_word_trgt, state, \
            lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en] = model

    eol_src = state['null_sym_source']
    src_seq, _ = parse_input(state, indx_word_src, source_sentence)
    if src_seq[-1] == eol_src:
        src_seq = src_seq[:-1]
    n_s = len(src_seq)

    #Create sorted phrase lists
    tiled_source_phrase_list = []
    index_order_list = []
    for i in xrange(n_s):
        for j in numpy.arange(i, min(i + max_phrase_length, n_s)):
            index_order_list.append([i, j])

    logger.debug("sorting list")
    index_order_list.sort(key=lambda (i, j): (j - i))

    logger.debug("creating phrase lists")
    if add_period:
        period_src = indx_word_src['.']
        for i, j in index_order_list:
            tiled_source_phrase_list.append(
                numpy.hstack((src_seq[i:j + 1], period_src, eol_src)))
    else:
        for i, j in index_order_list:
            tiled_source_phrase_list.append(
                numpy.hstack((src_seq[i:j + 1], eol_src)))

    #Compute nested score dictionary
    logger.debug("computing nested score dictionary")
    score_dict = {}
    trans = {}

    for phrase_idx in xrange(0, len(index_order_list)):
        logger.debug("{0} out of {1}".format(phrase_idx,
                                             len(index_order_list)))
        i, j = index_order_list[phrase_idx]
        logger.debug("Translating phrase : {}".format(" ".join(
            source_sentence.strip().split()[i:j + 1])))

        if copy_UNK_words:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            n_UNK_words = numpy.sum(
                [word == 1 for word in phrase_to_translate])
            if n_UNK_words >= 1 and n_UNK_words == len(
                    phrase_to_translate) - 1:
                suggested_translation = " ".join(
                    source_sentence.strip().split()[i:j + 1])
                trans[i, j] = suggested_translation
                score = .0001
                score_dict[i, j] = score
            if n_UNK_words >= 1 and n_UNK_words != len(phrase_to_translate) - 1:
                suggested_translation = "WILL NOT BE USED"
                trans[i, j] = suggested_translation
                score = 1e9
                score_dict[i, j] = score
            if n_UNK_words == 0:
                suggested_translation, score = sample_targets(
                                                    input_phrase= \
                                                           tiled_source_phrase_list[phrase_idx],
                                                    model=model,
                                                    n_samples=n_samples,
                                                    reverse_score=reverse_score,
                                                    normalize=normalize
                )
                trans[i, j] = suggested_translation
                score_dict[i, j] = score

        else:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            suggested_translation, score = sample_targets(
                input_phrase=phrase_to_translate,
                model=model,
                n_samples=n_samples,
                reverse_score=reverse_score,
                normalize=normalize)
            trans[i, j] = suggested_translation
            score_dict[i, j] = score

    #Remove the period at the end if not last word
    #Lower case first word if not first word
    if add_period:
        for phrase_idx in xrange(0, len(index_order_list)):
            i, j = index_order_list[phrase_idx]
            if i != 0:
                trans[i, j] = " ".join([trans[i, j][0].lower()] +
                                       [trans[i, j][1:]])
            if j != len(src_seq) - 1:
                last_word = trans[i, j].strip().split()[-1]
                if last_word == '.':
                    trans[i, j] = " ".join(trans[i, j].strip().split()[:-1])

    #Translation of full sentence without segmentation
    logger.debug("Translating full sentence")
    phrase_to_translate = numpy.hstack((src_seq, eol_src))
    full_translation, __ = sample_targets(input_phrase=phrase_to_translate,
                                          model=model,
                                          n_samples=n_samples,
                                          reverse_score=reverse_score,
                                          normalize=normalize)
    logger.debug("Translation output:".format(full_translation))

    return trans, score_dict, full_translation
Example #16
File: sample.py Project: ucaslyc/NMT-1
def main():
    args = parse_args()

    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(
                        lm_model,
                        seq,
                        n_samples,
                        sampler=sampler,
                        beam_search=beam_search,
                        ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(
                        lm_model,
                        seq,
                        n_samples,
                        sampler=sampler,
                        beam_search=beam_search,
                        ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(lm_model,
                                                 seq,
                                                 n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)

            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >> ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape:  (target_len, source_len)
                # we transpose it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()

                if lm_model.maintain_coverage:
                    # since <eos> was filtered from trans[best], the index is shifted by 1
                    coverage = coverages[best]
                    print "Coverage:",
                    words = parsed_in.split()
                    for k in xrange(len(words)):
                        print '%s/%.2f' % (words[k], coverage[k]),
                    print ''
                    if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                        print 'Fertility:  ',
                        for k in xrange(len(words)):
                            print '%s/%.2f' % (words[k], fertility[k]),
                        print ''
                print

            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
Example #17
def main():
    args = parse_args()

    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                            beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                            beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            
            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape:  (target_len, source_len)
                # we transpose it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()

                if lm_model.maintain_coverage:
                    # since <eos> was filtered from trans[best], the index is shifted by 1
                    coverage = coverages[best]
                    print "Coverage:", 
                    words = parsed_in.split()
                    for k in xrange(len(words)):
                        print '%s/%.2f'%(words[k], coverage[k]),
                    print ''
                    if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                        print 'Fertility:  ',
                        for k in xrange(len(words)):
                            print '%s/%.2f'%(words[k], fertility[k]),
                        print ''
                print 

            total_cost += costs[best]
            if (i + 1)  % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)
Example #18
def main():
    args = parse_args()

    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    t_indx_word = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    sampler = None
    beam_search = BeamSearch(enc_dec)
    beam_search.compile()

    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    t_idict_src = cPickle.load(open(state['indx_word_target'], 'r'))

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')

    start_time = time.time()

    total_cost = 0.0
    # for i, line in enumerate(fsrc):
    i = 0
    while 1:
        try:
            seqin = fsrc.next().strip()
            seqout = ftrg.next().strip()
        except StopIteration:
            break

        seq, parsed_in = parse_input(state,
                                     indx_word,
                                     seqin,
                                     idx2word=idict_src)
        out, parsed_out = parse_target(state,
                                       t_indx_word,
                                       seqout,
                                       idx2word=t_idict_src)

        if lm_model.maintain_coverage:
            if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                aligns, costs, coverage, fertility = force_decoding(
                    lm_model,
                    seq,
                    out,
                    sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
            else:
                aligns, costs, coverage = force_decoding(
                    lm_model,
                    seq,
                    out,
                    sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
        else:
            aligns, costs = force_decoding(lm_model,
                                           seq,
                                           out,
                                           sampler=sampler,
                                           beam_search=beam_search,
                                           ignore_unk=args.ignore_unk,
                                           normalize=args.normalize)

        print "Parsed Input:", parsed_in
        print "Parsed Target:", parsed_out
        print 'Aligns:'
        print aligns.tolist()

        if lm_model.maintain_coverage:
            # since <eos> was filtered out, the index is shifted by 1
            print "Coverage:",
            words = parsed_in.split()
            for k in xrange(len(words)):
                print '%s/%.2f' % (words[k], coverage[k]),
            print ''

            if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                print 'Fertility:  ',
                for k in xrange(len(words)):
                    print '%s/%.2f' % (words[k], fertility[k]),
                print ''
        print

        total_cost += costs[0]
        if (i + 1) % 100 == 0:
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
        i += 1
    print "Total cost of the translations: {}".format(total_cost)

    fsrc.close()
    ftrg.close()
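Example #18 steps through the source and target files with explicit next() calls and tracks its line counter by hand. The same pairing can be written with enumerate over itertools.izip, which handles both; this is a side note on loop structure only, the force-decoding calls stay exactly as above:

from itertools import izip  # lazy pairwise iteration in Python 2; plain zip() in Python 3

for i, (seqin, seqout) in enumerate(izip(fsrc, ftrg)):
    seqin, seqout = seqin.strip(), seqout.strip()
    # ... parse_input / parse_target / force_decoding exactly as in the example ...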
Example #19
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    if args.num_common and args.num_ttables and args.topn_file:
        with open(args.topn_file, 'rb') as f:
            topn = cPickle.load(f) # Load dictionary (source word index : list of target word indices)
            for elt in topn:
                topn[elt] = topn[elt][:args.num_ttables] # Take the first args.num_ttables only

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    alignment_fns = []
    if args.num_common and args.num_ttables and args.topn_file:
        original_W_0_dec_approx_embdr = []
        original_W2_dec_deep_softmax = []
        original_b_dec_deep_softmax = []

    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        alignment_fns.append(theano.function(inputs=enc_decs[i].inputs, outputs=[enc_decs[i].alignment], name="alignment_fn"))

        if args.num_common and args.num_ttables and args.topn_file:
            original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
            original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
            original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value())

            lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1), dtype=numpy.float32))

    if args.mapping:
        with open(args.mapping, 'rb') as f:
            mapping = cPickle.load(f)
        heuristic = args.heuristic
    else:
        heuristic = 0
        mapping = None


    word2idx_src = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))

    word2idx_trg['<eos>'] = state['null_sym_target']
    word2idx_trg[state['oov']] = state['unk_sym_target'] # 'UNK' may be in the vocabulary. Now points to the right index.
    idict_trg[state['null_sym_target']] = '<eos>'
    idict_trg[state['unk_sym_target']] = state['oov']

    if args.num_common and args.num_ttables and args.topn_file:

        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict() # Up to now
        D = OrderedDict() # Full
        C = OrderedDict() # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False

        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'],state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, _ = parse_input(state, word2idx_src, seqin) # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]: # Exclude the EOL token
                    if elt != 1 and elt in topn: # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt]) # Add topn best unigram translations for each source word
                update_dicts(indices, d, D, C, args.num_common)
                if args.change_every > 0 and (i % args.change_every) == 0 and i > 0:
                    D_dict[prev_line] = D.copy() # Save dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict() # Full
                        C = OrderedDict() # Allowed to reject
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common) # Assumes you cannot fill d with only 1 line
            D_dict[prev_line] = D.copy()

    start_time = time.time()

    if args.source and args.trans and args.new_trans:
        with open(args.source, 'r') as src_file:
            with open(args.trans, 'r') as trans_file:
                with open(args.new_trans, 'w') as new_trans_file:
                    if not (args.num_common and args.num_ttables and args.topn_file):
                        eos_id = state['null_sym_target']
                        unk_id = state['unk_sym_target']
                        new_word2idx_trg = word2idx_trg

                    prev_i = -1
                    if args.n_best:
                        full_trans_line = trans_file.readline()
                        if full_trans_line == '':
                            raise IOError("File is empty")
                        full_trans_line = full_trans_line.split('|||')
                        n_best_start = int(full_trans_line[0].strip())
                        trans_file.seek(0)
                    while True:
                        if args.n_best:
                            full_trans_line = trans_file.readline()
                            if full_trans_line == '':
                                break
                            full_trans_line = full_trans_line.split('|||')
                            i = int(full_trans_line[0].strip()) - n_best_start
                            trans_line = full_trans_line[1].strip()
                        else:
                            trans_line = trans_file.readline()
                            if trans_line == '':
                                break
                            i = prev_i + 1

                        if i == (prev_i + 1):
                            prev_i = i

                            if (i % args.change_every) == 0 and i > 0:
                                hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize)
                                replace_unknown_words(
                                    src_word_seqs, trg_seqs, trg_word_seqs,
                                    hard_alignments, heuristic, mapping, unk_id,
                                    new_trans_file, args.n_best, full_trans_lines)

                            if (i % 100 == 0) and i > 0:
                                new_trans_file.flush()
                                logger.debug("Current speed is {} per sentence".
                                        format((time.time() - start_time) / i))

                            src_line = src_file.readline()
                            src_seq, src_words = parse_input(state, word2idx_src, src_line.strip())
                            src_words.append('<eos>')

                            if (i % args.change_every) == 0:
                                src_seqs = []
                                src_word_seqs = []
                                trg_seqs = []
                                trg_word_seqs = []
                                full_trans_lines = [] # Only used with n-best lists
                                if args.num_common and args.num_ttables and args.topn_file:
                                    indices = D_dict[i].keys()
                                    eos_id = indices.index(state['null_sym_target']) # Find new eos and unk positions
                                    unk_id = indices.index(state['unk_sym_target'])
                                    for j in xrange(num_models):
                                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                                    new_word2idx_trg = dict([(idict_trg[index], k) for k, index in enumerate(indices)])
                        elif i != prev_i:
                            raise ValueError("prev_i: %d, i: %d" % (prev_i, i))

                        trans_seq, trans_words = parse_output(new_word2idx_trg, trans_line.strip(), eos_id=eos_id, unk_id=unk_id)
                        trans_words.append('<eos>')

                        src_seqs.append(src_seq)
                        src_word_seqs.append(src_words)
                        trg_seqs.append(trans_seq)
                        trg_word_seqs.append(trans_words)
                        if args.n_best:
                            full_trans_lines.append(full_trans_line)

                    # Out of loop
                    hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize)
                    replace_unknown_words(src_word_seqs, trg_seqs, trg_word_seqs,
                                          hard_alignments, heuristic, mapping, unk_id,
                                          new_trans_file, args.n_best, full_trans_lines)
    else:
        raise NotImplementedError
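
The block above swaps the decoder's output parameters for each group of lines: W_0_dec_approx_embdr, W2_dec_deep_softmax and b_dec_deep_softmax are restricted to the target indices stored in D_dict, and the eos/UNK symbols are re-located by their position inside that reduced list. A minimal sketch of the same remapping on toy numpy arrays (all array names and sizes here are illustrative, not part of the toolkit):

import numpy

full_emb = numpy.arange(20, dtype='float32').reshape(10, 2)        # 10-word full vocabulary, dim 2
full_softmax_W = numpy.arange(30, dtype='float32').reshape(3, 10)  # hidden dim 3 -> 10 output words
full_softmax_b = numpy.zeros(10, dtype='float32')

indices = [0, 1, 7, 4]        # reduced target vocabulary, e.g. D_dict[i].keys()
eos_full, unk_full = 0, 1     # symbol ids in the full vocabulary

reduced_emb = full_emb[indices]          # keep one embedding row per retained word
reduced_W = full_softmax_W[:, indices]   # keep one softmax column per retained word
reduced_b = full_softmax_b[indices]

eos_id = indices.index(eos_full)         # new positions inside the reduced softmax
unk_id = indices.index(unk_full)
print("shapes: %s %s %s, eos_id=%d, unk_id=%d"
      % (reduced_emb.shape, reduced_W.shape, reduced_b.shape, eos_id, unk_id))
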
Example #20
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if lm_model.maintain_coverage:
                trans, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            else:
                trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            
            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]

                if lm_model.maintain_coverage:
Example #21
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))

    if args.config:
        state.update(eval(open(args.config).read()))

    if args.weights: state['weights'] = args.weights
    if args.lm_file: state['lm_file'] = args.lm_file
    if args.lm_vocab: state['lm_vocab'] = args.lm_vocab
    if args.pt_file: state['phrase_table'] = args.pt_file
    if args.lm_ngram: state['lm_ngram'] = args.lm_ngram

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))
    idict_src = cPickle.load(open(state['indx_word'],'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'],'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'],'r'))

    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])

    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')

    start_time = time.time()

    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(lm_model, seqin, seq, n_samples,
                beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))

        if args.nbest:
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))

            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs, nbest_tm_costs, nbest_costs, nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\
                                                            -rnn_cost * beam_search.weight_rnn, \
                                                            -sum_lm * beam_search.weight_lm, \
                                                            -pure_tm * beam_search.weight_tm, \
                                                            -sum_tm * beam_search.weight_tm, \
                                                            -sum_wp * beam_search.weight_wp, c)
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))

            print >>ftrans, out_str

        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best], ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1)  % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".
                    format((time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)

    fsrc.close()
    ftrans.close()
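
The n-best branch above writes one hypothesis per line in the form "<hypothesis> ||| <negated feature costs> ||| 0". A small sketch of reading such a line back; parse_nbest_line and the sample values are illustrative only:

def parse_nbest_line(line):
    # Split "<hypothesis> ||| <feature scores> ||| <total>" into its three fields.
    hyp, feats, total = [part.strip() for part in line.split('|||')]
    return hyp, [float(x) for x in feats.split()], float(total)

hyp, feats, total = parse_nbest_line("das ist ein test ||| -1.2 -3.4 -0.5 -4.0 ||| 0")
print(hyp)     # 'das ist ein test'
print(feats)   # [-1.2, -3.4, -0.5, -4.0]
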
Example #22
def main():
    args = parse_args()

    state = getattr(experiments.nmt, args.state_fn)()
    if hasattr(args, 'state') and args.state:
        with open(args.state) as src:
            state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    assert state['enc_rec_layer'] == "RecursiveConvolutionalLayer", "Only works with gated recursive convolutional encoder"

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word = cPickle.load(open(state['word_indx'],'rb'))
    idict_src = cPickle.load(open(state['indx_word'],'r'))

    x = TT.lvector()
    h = TT.tensor3()

    proj_x = theano.function([x], enc_dec.encoder.input_embedders[0](
        enc_dec.encoder.approx_embedder(x)).out, name='proj_x')
    new_h, gater = enc_dec.encoder.transitions[0].step_fprop(
        None, h, return_gates = True)
    step_up = theano.function([h], [new_h, gater], name='gater_step')

    while True:
        try:
            seqin = raw_input('Input Sequence: ')
            seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            print "Parsed Input:", parsed_in
        except Exception:
            print "Exception while parsing your input:"
            traceback.print_exc()
            continue

        # get the initial embedding
        new_h = proj_x(seq)
        new_h = new_h.reshape(new_h.shape[0], 1, new_h.shape[1])

        nodes = numpy.arange(len(seq)).tolist()
        node_idx = len(seq)-1
        rules = []
        nodes_level = copy.deepcopy(nodes)

        G = nx.DiGraph()

        input_nodes = []
        merge_nodes = []
        aggregate_nodes = []

        nidx = 0 
        vpos = 0
        nodes_pos = {}
        nodes_labels = {}
        # input nodes
        for nn in nodes[:-1]:
            nidx += 1
            G.add_node(nn, pos=(nidx, 0), ndcolor="blue", label="%d"%nn)
            nodes_pos[nn] = (nidx, vpos)
            nodes_labels[nn] = idict_src[seq[nidx-1]]
            input_nodes.append(nn)
        node_idx = len(seq) - 1

        vpos += 6
        for dd in xrange(len(seq)-1):
            new_h, gater = step_up(new_h)
            decisions = numpy.argmax(gater, -1)
            new_nodes_level = numpy.zeros(len(seq) - (dd+1))
            hpos = float(len(seq)+1) - 0.5 * (dd+1)
            last_node = True
            for nn in xrange(len(seq)-(dd+1)):
                hpos -= 1
                if not last_node:
                    # merge nodes
                    node_idx += 1
                    G.add_node(node_idx, ndcolor="red", label="m")
                    nodes_labels[node_idx] = ""
                    nodes_pos[node_idx] = (hpos, vpos)
                    G.add_edge(nodes_level[-(nn+1)], node_idx, weight=gater[-(nn+1),0,0])
                    G.add_edge(nodes_level[-(nn+2)], node_idx, weight=gater[-(nn+1),0,0])
                    merge_nodes.append(node_idx)

                    merge_node = node_idx
                    # linear aggregation nodes
                    node_idx += 1
                    G.add_node(node_idx, ndcolor="red", label="")
                    nodes_labels[node_idx] = "$+$"
                    nodes_pos[node_idx] = (hpos, vpos+6)
                    G.add_edge(merge_node, node_idx, weight=gater[-(nn+1),0,0])
                    G.add_edge(nodes_level[-(nn+2)], node_idx, weight=gater[-(nn+1),0,1])
                    G.add_edge(nodes_level[-(nn+1)], node_idx, weight=gater[-(nn+1),0,2])
                    aggregate_nodes.append(node_idx)

                    new_nodes_level[-(nn+1)] = node_idx
                last_node = False
            nodes_level = copy.deepcopy(new_nodes_level)
            vpos += 12

        # TODO: Show only strong edges.
        threshold = float(raw_input('Threshold: '))
        edges = [(u,v,d) for (u,v,d) in G.edges(data=True) if d['weight'] > threshold]
        #edges = G.edges(data=True)

        use_weighting = raw_input('Color according to weight [Y/N]: ')
        if use_weighting == 'Y':
            cm = plt.get_cmap('binary') 
            cNorm  = colors.Normalize(vmin=0., vmax=1.)
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
            colorList = [scalarMap.to_rgba(d['weight']) for (u,v,d) in edges]
        else:
            colorList = 'k'

        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=input_nodes, node_color='white', alpha=1., edge_color='white')
        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=merge_nodes, node_color='blue', alpha=0.8, node_size=20)
        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=aggregate_nodes, node_color='red', alpha=0.8, node_size=80)
        nx.draw_networkx_edges(G, pos=nodes_pos, edge_color=colorList, edgelist=edges)
        nx.draw_networkx_labels(G,pos=nodes_pos,labels=nodes_labels,font_family='sans-serif')
        plt.axis('off')
        figname = raw_input('Save to: ')
        if figname[-3:] == "pdf":
            plt.savefig(figname, format='pdf')
        else:
            plt.savefig(figname)
        plt.close()
        G.clear()
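
step_fprop returns three gating coefficients for every candidate node, and the plot above draws them as edge weights towards the merge and '+' nodes. Assuming the three columns weight the merged, left and right candidates (the actual ordering is defined by the encoder, not by this sketch), the linear aggregation looks like this on toy arrays:

import numpy

# Toy gate activations for three adjacent pairs; the column meaning is assumed.
gater = numpy.array([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1],
                     [0.2, 0.1, 0.7]])
merged = numpy.array([[1.0], [2.0], [3.0]])
left = numpy.array([[10.0], [20.0], [30.0]])
right = numpy.array([[100.0], [200.0], [300.0]])

# Linear aggregation corresponding to the '+' nodes in the plot above.
new_h = gater[:, 0:1] * merged + gater[:, 1:2] * left + gater[:, 2:3] * right
print(new_h)
print(numpy.argmax(gater, axis=-1))   # dominant parent of each new node
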
Example #23
def process_sentence(source_sentence, model, max_phrase_length, n_samples,
                     copy_UNK_words, add_period, normalize, reverse_score):
    # Setting up comp_score function
    logger.debug("setting up comp_score function")
    [lm_model, enc_dec, indx_word_src, indx_word_trgt, state, \
     lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en] = model

    eol_src = state['null_sym_source']
    src_seq = parse_input(state, indx_word_src, source_sentence)
    if src_seq[-1] == eol_src:
        src_seq = src_seq[:-1]
    n_s = len(src_seq)

    # Create sorted phrase lists
    tiled_source_phrase_list = []
    index_order_list = []
    for i in xrange(n_s):
        for j in numpy.arange(i, min(i + max_phrase_length, n_s)):
            index_order_list.append([i, j])

    logger.debug("sorting list")
    index_order_list.sort(key=lambda (i, j): (j - i))

    logger.debug("creating phrase lists")
    if add_period:
        period_src = indx_word_src['.']
        for i, j in index_order_list:
            tiled_source_phrase_list.append(numpy.hstack((src_seq[i:j + 1], period_src, eol_src)))
    else:
        for i, j in index_order_list:
            tiled_source_phrase_list.append(numpy.hstack((src_seq[i:j + 1], eol_src)))

    # Compute nested score dictionary
    logger.debug("computing nested score dictionary")
    score_dict = {}
    trans = {}

    for phrase_idx in xrange(0, len(index_order_list)):
        logger.debug("{0} out of {1}".format(phrase_idx, len(index_order_list)))
        i, j = index_order_list[phrase_idx]
        logger.debug("Translating phrase : {}".format(" ".join(source_sentence.strip().split()[i:j + 1])))

        if copy_UNK_words == True:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            n_UNK_words = numpy.sum([word == 1 for word in phrase_to_translate])
            if n_UNK_words >= 1 and n_UNK_words == len(phrase_to_translate) - 1:
                suggested_translation = " ".join(source_sentence.strip().split()[i:j + 1])
                trans[i, j] = suggested_translation
                score = .0001
                score_dict[i, j] = score
            if n_UNK_words >= 1 and n_UNK_words != len(phrase_to_translate) - 1:
                suggested_translation = "WILL NOT BE USED"
                trans[i, j] = suggested_translation
                score = 1e9
                score_dict[i, j] = score
            if n_UNK_words == 0:
                suggested_translation, score = sample_targets(
                    input_phrase= \
                        tiled_source_phrase_list[phrase_idx],
                    model=model,
                    n_samples=n_samples,
                    reverse_score=reverse_score,
                    normalize=normalize
                )
                trans[i, j] = suggested_translation
                score_dict[i, j] = score

        else:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            suggested_translation, score = sample_targets(
                input_phrase=phrase_to_translate,
                model=model, n_samples=n_samples,
                reverse_score=reverse_score,
                normalize=normalize
            )
            trans[i, j] = suggested_translation
            score_dict[i, j] = score

    # Remove the period at the end if not last word
    # Lower case first word if not first word
    if add_period:
        for phrase_idx in xrange(0, len(index_order_list)):
            i, j = index_order_list[phrase_idx]
            if i != 0:
                trans[i, j] = " ".join([trans[i, j][0].lower()] + [trans[i, j][1:]])
            if j != len(src_seq) - 1:
                last_word = trans[i, j].strip().split()[-1]
                if last_word == '.':
                    trans[i, j] = " ".join(trans[i, j].strip().split()[:-1])

    # Translation of full sentence without segmentation
    logger.debug("Translating full sentence")
    phrase_to_translate = numpy.hstack((src_seq, eol_src))
    full_translation, __ = sample_targets(input_phrase=phrase_to_translate,
                                          model=model,
                                          n_samples=n_samples,
                                          reverse_score=reverse_score,
                                          normalize=normalize
                                          )
    logger.debug("Translation output:".format(full_translation))

    return trans, score_dict, full_translation
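
process_sentence translates every source span of at most max_phrase_length words, shortest spans first, and only afterwards the whole sentence. The span enumeration and ordering in isolation; phrase_spans is an illustrative helper, not part of the toolkit:

def phrase_spans(n_words, max_phrase_length):
    # Enumerate all (i, j) spans of at most max_phrase_length words,
    # shortest spans first, as in index_order_list above.
    spans = [(i, j) for i in range(n_words)
                    for j in range(i, min(i + max_phrase_length, n_words))]
    spans.sort(key=lambda span: span[1] - span[0])
    return spans

print(phrase_spans(4, 3))
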
Example #24
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    ###########################################################

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in

            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []

            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                    #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)

            if args.beam_search and args.nbest:
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(
                    nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join(
                    "%s | %f" % (t, c)
                    for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str

            print >> ftrans, out_str

            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
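
With --alignment, the script above prints one bracketed group of attention weights per target word. A common way to reduce such soft weights to a single source position per target word is an argmax over the source axis, sketched on a toy matrix (the exact structure of aligns depends on the sampler):

import numpy

# Toy soft alignment: one row per target word, one column per source word.
soft_alignment = numpy.array([[0.8, 0.1, 0.1],
                              [0.2, 0.7, 0.1],
                              [0.1, 0.2, 0.7]])
hard_alignment = numpy.argmax(soft_alignment, axis=1)  # most attended source position
for t, s in enumerate(hard_alignment):
    print("target %d <- source %d" % (t, s))
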
Example #25
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1  # which means don't sort
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5")
                                  and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                                           args.src,
                                           indx_word_src,
                                           args.trg,
                                           indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout

        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
                y = batch['y']
                random_words = numpy.random.randint(0, 100,
                                                    y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise,
                                                    y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'], batch['x_mask'],
                              batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >> score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug(
                "{} batches, {} samples, {} per sample; example scores: {}".
                format(count, n_samples, up_time / scores.shape[0],
                       scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state,
                                      indx_word_src,
                                      src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state,
                                       indx_word_trgt,
                                       trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(
                    state,
                    indx_word_src,
                    src_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_source'],
                    null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(
                    state,
                    indx_word_trgt,
                    trgt_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_target'],
                    null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20), alignment[i, :,
                                                                          0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(
                                word, src_words[j]
                                if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
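
Both the batch mode and the txt mode above report a sentence cost as the negative sum of log-probabilities (and --print-probs exponentiates the scores). The relationship between per-word probabilities, total cost and per-word perplexity on toy numbers:

import numpy

probs = numpy.array([0.4, 0.9, 0.2, 0.95])   # per-word probabilities of one target sentence
cost = -numpy.sum(numpy.log(probs))          # the cost printed by the script above
perplexity = numpy.exp(cost / len(probs))    # per-word perplexity implied by that cost
print("cost %.3f, per-word perplexity %.3f" % (cost, perplexity))
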
Example #26
    def __call__(self): 
        """
        Opens the file for the validation set and creates a subprocess 
        for the multi-bleu script. 

        Returns a boolean indicating whether the current model should
        be saved. 
        """ 

        print "Started Validation: "
        val_start_time = time.time()
        fsrc = open(self.state['validation_set'], 'r')
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE) 
        total_cost = 0.0

        if self.verbose:
            ftrans = open(self.state['validation_set_out'], 'w')
        
        for i, line in enumerate(fsrc):
            """
            Load the sentence, retrieve the sample, write to file
            """
            if self.state['source_encoding'] == 'utf8':
                seqin = line.strip().decode('utf-8')
            else:
                seqin = line.strip()
            seq, parsed_in = parse_input(self.state, self.indx_word, seqin, idx2word=self.idict_src)

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, _ = sample(self.lm_model, seq, self.n_samples,
                    beam_search=self.beam_search, ignore_unk=self.ignore_unk, normalize=self.normalize)
            try:
                best = numpy.argmin(costs)
                total_cost += costs[best]
                trans_out = trans[best]
            except ValueError:  
                print "Could not fine a translation for line: {}".format(i+1)
                trans_out = u'UNK' if self.state['target_encoding'] == 'utf8' else 'UNK'
            
            # Write to subprocess and file if it exists
            if self.state['target_encoding'] == 'utf8':
                print >> mb_subprocess.stdin, trans_out.encode('utf8').replace(" ","")
                if self.verbose:
                    print  >> ftrans, trans_out.encode('utf8').replace(" ","")
            else:
                print >> mb_subprocess.stdin, trans_out
                if self.verbose:
                    print >> ftrans, trans_out
         
            if i != 0 and i % 50 == 0:
                print "Translated {} lines of validation set...".format(i)
            mb_subprocess.stdin.flush()

        print "Total cost of the validation: {}".format(total_cost)
        fsrc.close()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.        
        mb_subprocess.stdin.close()
        out_parse = re.match(r'BLEU = [-.0-9]+', mb_subprocess.stdout.readline())
        print "Validation Took: {} minutes".format(float(time.time() - val_start_time)/60.) 
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)                
        print bleu_score     
        mb_subprocess.terminate()

        # Determine whether or not we should save
        if self.best_bleu < bleu_score:
            self.best_bleu = bleu_score
            return True
        return False
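
The callback above pipes the translations into multi-bleu and extracts the score from the first output line with a regular expression. The same regex and slice applied to a typical multi-bleu.perl line (the sample line is illustrative):

import re

line = "BLEU = 27.31, 60.5/33.2/20.8/13.6 (BP=1.000, ratio=1.012, hyp_len=62393, ref_len=61640)"
out_parse = re.match(r'BLEU = [-.0-9]+', line)
bleu_score = float(out_parse.group()[6:])   # same slicing as in __call__ above
print(bleu_score)                           # 27.31
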
Example #27
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'],'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5") and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                    args.src, indx_word_src, args.trg, indx_word_trgt,
                    state['bs'], raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout

        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
                y = batch['y']
                random_words = numpy.random.randint(0, 100, y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise, y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                    batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >>score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format(
                count, n_samples, up_time/scores.shape[0], scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line, raise_unk=not args.allow_unk, 
                                      unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line, raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(state,
                        indx_word_src, src_line, raise_unk=not args.allow_unk,
                        unk_sym=state['unk_sym_source'], null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(state,
                        indx_word_trgt, trgt_line, raise_unk=not args.allow_unk,
                        unk_sym=state['unk_sym_target'], null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20), alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(word,
                                    src_words[j] if j < len(src_words) else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
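
The --y-noise option above corrupts a fraction of the target tokens before scoring: a Bernoulli mask selects positions, and those positions are overwritten with random word indices (the cap of 100 is taken from the script). The same three lines on a toy batch:

import numpy

y = numpy.arange(12, dtype='int64').reshape(6, 2)   # toy target batch (time x batch)
noise_rate = 0.3
random_words = numpy.random.randint(0, 100, y.shape).astype("int64")
change_mask = numpy.random.binomial(1, noise_rate, y.shape).astype("int64")
y_noisy = change_mask * random_words + (1 - change_mask) * y   # masked positions get random ids
print(y_noisy)
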
Example #28
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    if args.num_common and args.num_ttables and args.topn_file:
        with open(args.topn_file, 'rb') as f:
            topn = cPickle.load(f) # Load dictionary (source word index : list of target word indices)
            for elt in topn:
                topn[elt] = topn[elt][:args.num_ttables] # Take the first args.num_ttables only

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    alignment_fns = []
    if args.num_common and args.num_ttables and args.topn_file:
        original_W_0_dec_approx_embdr = []
        original_W2_dec_deep_softmax = []
        original_b_dec_deep_softmax = []

    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        alignment_fns.append(theano.function(inputs=enc_decs[i].inputs, outputs=[enc_decs[i].alignment], name="alignment_fn"))

        if args.num_common and args.num_ttables and args.topn_file:
            original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
            original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
            original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value())

            lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1,1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1), dtype=numpy.float32))

    if args.mapping:
        with open(args.mapping, 'rb') as f:
            mapping = cPickle.load(f)
        heuristic = args.heuristic
    else:
        heuristic = 0
        mapping = None


    word2idx_src = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))

    word2idx_trg['<eos>'] = state['null_sym_target']
    word2idx_trg[state['oov']] = state['unk_sym_target'] # 'UNK' may be in the vocabulary. Now points to the right index.
    idict_trg[state['null_sym_target']] = '<eos>'
    idict_trg[state['unk_sym_target']] = state['oov']

    if args.num_common and args.num_ttables and args.topn_file:

        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict() # Up to now
        D = OrderedDict() # Full
        C = OrderedDict() # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False

        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'],state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                seq, _ = parse_input(state, word2idx_src, seqin) # seq is the ndarray of indices
                indices = []
                for elt in seq[:-1]: # Exclude the EOL token
                    if elt != 1: # Exclude OOV (1 will not be a key of topn)
                        indices.extend(topn[elt]) # Add topn best unigram translations for each source word
                update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    D_dict[prev_line] = D.copy() # Save dictionary for the lines preceding this one
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict() # Full
                        C = OrderedDict() # Allowed to reject
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(indices, d, D, C, args.num_common) # Assumes you cannot fill d with only 1 line
            D_dict[prev_line] = D.copy()

    start_time = time.time()

    if args.source and args.trans and args.new_trans:
        with open(args.source, 'r') as src_file:
            with open(args.trans, 'r') as trans_file:
                with open(args.new_trans, 'w') as new_trans_file:
                    if not (args.num_common and args.num_ttables and args.topn_file):
                        eos_id = state['null_sym_target']
                        unk_id = state['unk_sym_target']
                        new_word2idx_trg = word2idx_trg

                    prev_i = -1
                    if args.n_best:
                        full_trans_line = trans_file.readline()
                        if full_trans_line == '':
                            raise IOError("File is empty")
                        full_trans_line = full_trans_line.split('|||')
                        n_best_start = int(full_trans_line[0].strip())
                        trans_file.seek(0)
                    while True:
                        if args.n_best:
                            full_trans_line = trans_file.readline()
                            if full_trans_line == '':
                                break
                            full_trans_line = full_trans_line.split('|||')
                            i = int(full_trans_line[0].strip()) - n_best_start
                            trans_line = full_trans_line[1].strip()
                        else:
                            trans_line = trans_file.readline()
                            if trans_line == '':
                                break
                            i = prev_i + 1

                        if i == (prev_i + 1):
                            prev_i = i

                            if (i % args.change_every) == 0 and i > 0:
                                hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize)
                                replace_unknown_words(
                                    src_word_seqs, trg_seqs, trg_word_seqs,
                                    hard_alignments, heuristic, mapping, unk_id,
                                    new_trans_file, args.n_best, full_trans_lines)

                            if (i % 100 == 0) and i > 0:
                                new_trans_file.flush()
                                logger.debug("Current speed is {} per sentence".
                                        format((time.time() - start_time) / i))

                            src_line = src_file.readline()
                            src_seq, src_words = parse_input(state, word2idx_src, src_line.strip())
                            src_words.append('<eos>')

                            if (i % args.change_every) == 0:
                                src_seqs = []
                                src_word_seqs = []
                                trg_seqs = []
                                trg_word_seqs = []
                                full_trans_lines = [] # Only used with n-best lists
                                if args.num_common and args.num_ttables and args.topn_file:
                                    indices = D_dict[i].keys()
                                    eos_id = indices.index(state['null_sym_target']) # Find new eos and unk positions
                                    unk_id = indices.index(state['unk_sym_target'])
                                    for j in xrange(num_models):
                                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                                    new_word2idx_trg = dict([(idict_trg[index], k) for k, index in enumerate(indices)])
                        elif i != prev_i:
                            raise ValueError("prev_i: %d, i: %d" % (prev_i, i))

                        trans_seq, trans_words = parse_output(new_word2idx_trg, trans_line.strip(), eos_id=eos_id, unk_id=unk_id)
                        trans_words.append('<eos>')

                        src_seqs.append(src_seq)
                        src_word_seqs.append(src_words)
                        trg_seqs.append(trans_seq)
                        trg_word_seqs.append(trans_words)
                        if args.n_best:
                            full_trans_lines.append(full_trans_line)

                    # Out of loop
                    hard_alignments = compute_alignment(src_seqs, trg_seqs, alignment_fns, args.batchsize)
                    replace_unknown_words(src_word_seqs, trg_seqs, trg_word_seqs,
                                          hard_alignments, heuristic, mapping, unk_id,
                                          new_trans_file, args.n_best, full_trans_lines)
    else:
        raise NotImplementedError
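
replace_unknown_words itself is not shown above; a minimal sketch of the underlying idea, assuming the simplest behaviour of copying the aligned source word for each UNK and consulting the mapping dictionary when it has an entry (the helper name, this heuristic and the sample words are assumptions; the real function also handles n-best output and the other heuristic settings):

def replace_unks(trg_words, src_words, hard_alignment, mapping, unk_token='UNK'):
    # Illustrative only: substitute each UNK by its aligned source word,
    # translated through the mapping dictionary when possible.
    out = []
    for t, word in enumerate(trg_words):
        if word == unk_token:
            src_word = src_words[hard_alignment[t]]
            out.append(mapping.get(src_word, src_word) if mapping else src_word)
        else:
            out.append(word)
    return out

print(replace_unks(['this', 'is', 'UNK'],
                   ['das', 'ist', 'Zuerich', '<eos>'],
                   [0, 1, 2],
                   {'Zuerich': 'Zurich'}))   # -> ['this', 'is', 'Zurich']
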
Example #29
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f)  # Load dictionary (source word index : list of target word indices)
    if args.less_transfer:
        for elt in topn:
            topn[elt] = topn[elt][:args.num_ttables]  # Take the first args.num_ttables only
    else:
        for elt in topn:
            topn[elt] = set(topn[elt][:args.num_ttables])  # Take the first args.num_ttables only and convert list to set

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        original_W_0_dec_approx_embdr.append(lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].get_value())

        # On GPU, this will free memory for the next models
        # Additional gains could be made by rolling the source vocab
        lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].set_value(
                numpy.zeros((1), dtype=numpy.float32))

    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  #Source w2i

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError
        #sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))  #Source i2w

    original_target_i2w = lm_models[0].word_indxs.copy()
    # I don't think that we need target_word2index

    max_words = len(original_b_dec_deep_softmax[0])

    if args.less_transfer:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Up to now
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False

        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                # seq is the ndarray of indices
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add the topn best unigram translations for each source word
                        indices.extend(topn[elt])
                output = update_dicts(indices, d, D, C, args.num_common)
                if args.change_every > 0 and i > 0 and i % args.change_every == 0:
                    output = True
                if output:
                    # Save dictionary for the lines preceding this one
                    D_dict[prev_line] = D.copy()
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for k in xrange(args.num_common):  # do not shadow the line counter i
                            D[k] = 0
                            C[k] = 0
                    null_unk_indices = [
                        state['null_sym_target'], state['unk_sym_target']
                    ]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    update_dicts(
                        indices, d, D, C, args.num_common
                    )  # Assumes you cannot fill d with only 1 line
            D_dict[prev_line] = D.copy()
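        # D_dict maps the first line number of each segment to the vocabulary used
        # for that segment; its keys are the target word indices kept in the softmax.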

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            # seq is the ndarray of indices
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            # For now, keep all input words in the model. In the future, we may want
            # to filter them to save memory, but this isn't really an issue for now.
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
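                # The restricted target vocabulary changes at the lines stored in D_dict:
                # re-slice every model's target embeddings and softmax to the new candidates.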
                if i in D_dict:
                    indices = D_dict[i].keys()
                    # Find new eos and unk positions in the restricted vocabulary
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices)
                    ])  # target index2word
                trans, costs, _ = sample(lm_models[0],
                                         seq,
                                         n_samples,
                                         sampler=sampler,
                                         beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize,
                                         normalize_p=args.normalize_p,
                                         eos_id=eos_id,
                                         unk_id=unk_id,
                                         final=True,
                                         wp=args.wp)
            else:
                # Extract the indices you need
                indices = set()
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add the topn best unigram translations for each source word
                        indices = indices.union(topn[elt])
                num_common_words = args.num_common
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False

                    if args.final:  # No matter the number of words
                        final = True
                    # Add common words
                    indices = indices.union(set(xrange(num_common_words)))
                    # Convert back to list for advanced indexing
                    indices = list(indices)
                    # Find new eos and unk positions in the restricted vocabulary
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices)
                    ])  # target index2word

                    try:
                        trans, costs, _ = sample(lm_models[0],
                                                 seq,
                                                 n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize,
                                                 normalize_p=args.normalize_p,
                                                 eos_id=eos_id,
                                                 unk_id=unk_id,
                                                 final=final)
                        break  # Breaks only if it succeeded (If final=True, will always succeed)
                    except RuntimeError:
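                        # sample() could not produce a translation with the restricted
                        # vocabulary: enlarge the common-word pool and retry.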
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >> ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >> ftrans, "{} ||| {} ||| {}".format(
                        i + args.start, trans[elt], costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
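The core trick in the example above is that decoding never uses the full target softmax: for each segment of the source file, only the most common target words plus the top unigram translations of the source words are kept, the decoder's target embedding matrix, softmax weight matrix and bias are re-sliced to that candidate list, and the EOS/UNK symbols are remapped to their positions inside it. The following minimal numpy sketch illustrates just that re-slicing; the array names and shapes are illustrative assumptions, not GroundHog's actual parameters.

# Minimal, self-contained numpy sketch of the restricted-softmax trick used above.
# Names (W_emb, W_soft, b_soft, candidate_indices) are illustrative, not GroundHog's.
import numpy

vocab_size, emb_dim, hid_dim = 10, 4, 3
W_emb = numpy.random.rand(vocab_size, emb_dim).astype('float32')   # target embeddings
W_soft = numpy.random.rand(hid_dim, vocab_size).astype('float32')  # softmax weights
b_soft = numpy.random.rand(vocab_size).astype('float32')           # softmax bias

null_sym_target, unk_sym_target = 0, 1
# Candidate list: common words plus per-sentence unigram translations (hard-coded here).
candidate_indices = [null_sym_target, unk_sym_target, 5, 7, 9]

# Re-slice the target-side parameters to the candidate words only.
W_emb_small = W_emb[candidate_indices]        # select rows
W_soft_small = W_soft[:, candidate_indices]   # select columns
b_soft_small = b_soft[candidate_indices]

# EOS and UNK must be remapped to their positions inside the restricted vocabulary.
eos_id = candidate_indices.index(null_sym_target)
unk_id = candidate_indices.index(unk_sym_target)

# Decoding now scores only len(candidate_indices) words instead of vocab_size;
# indices produced by the decoder are mapped back through candidate_indices.
assert W_soft_small.shape == (hid_dim, len(candidate_indices))
assert b_soft_small.shape == (len(candidate_indices),)

In the script above, the same slicing is applied to W_0_dec_approx_embdr, W2_dec_deep_softmax and b_dec_deep_softmax for every ensemble member, always starting from the original_* copies saved at load time, and word_indxs is rebuilt so the beam-search output maps back to the original target vocabulary.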
Example #30
0
    # assert beam_search
    # assert args.beam_size

    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')

    start_time = time.time()

    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state,
                                             indx_word_src,
                                             src_seqin,
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state,
                                             indx_word_trg,
                                             trg_seqin,
                                             idx2word=idict_trg)
        if args.verbose:
            print "Parsed Input:", parsed_in
        trans, costs, _ = sample(lm_model,
                                 src_seq,
                                 trg_seq,
                                 n_samples,
                                 sampler=sampler,
                                 beam_search=beam_search,
                                 ignore_unk=args.ignore_unk,
                                 normalize=args.normalize,
Example #31
0
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            try:
                best = numpy.argmin(costs)
                print >>ftrans, trans[best]
                total_cost += costs[best]
                if args.verbose:
                    print "Translation:", trans[best]
            except Exception:
                # best may be undefined if no translation was produced, so report a failure
                print >> ftrans, "FAIL"
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
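        # Interactive mode: read sentences from stdin and translate/sample them one by one.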
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)
Example #32
0
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)

if __name__ == "__main__":
    main()