def get_models():
    """Load one or two trained RNN encoder-decoder translation models.

    Always builds the en->fr model from ``args.state_en2fr`` /
    ``args.model_path_en2fr``.  If ``args.state_fr2en`` is given, also
    builds the reverse fr->en model.

    Returns an 8-element list:
        [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src, indx_word_trgt,
         state_en2fr, lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    where the last three entries are ``None`` when no fr->en state was
    supplied.
    """
    args = parse_args()
    state_en2fr = prototype_state()
    if hasattr(args, 'state_en2fr'):
        with open(args.state_en2fr) as src:
            state_en2fr.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state_en2fr.update(eval("dict({})".format(args.changes)))
    state_fr2en = prototype_state()
    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        with open(args.state_fr2en) as src:
            state_fr2en.update(cPickle.load(src))
        state_fr2en.update(eval("dict({})".format(args.changes)))
    rng = numpy.random.RandomState(state_en2fr['seed'])
    enc_dec_en_2_fr = RNNEncoderDecoder(state_en2fr, rng, skip_init=True)
    enc_dec_en_2_fr.build()
    lm_model_en_2_fr = enc_dec_en_2_fr.create_lm_model()
    lm_model_en_2_fr.load(args.model_path_en2fr)
    # Pickled vocabularies for the en->fr direction (word -> index maps).
    indx_word_src = cPickle.load(open(state_en2fr['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state_en2fr['word_indx_trgt'], 'rb'))
    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        # Build the reverse-direction model with its own seed/state.
        rng = numpy.random.RandomState(state_fr2en['seed'])
        enc_dec_fr_2_en = RNNEncoderDecoder(state_fr2en, rng, skip_init=True)
        enc_dec_fr_2_en.build()
        lm_model_fr_2_en = enc_dec_fr_2_en.create_lm_model()
        lm_model_fr_2_en.load(args.model_path_fr2en)
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr, \
                lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    else:
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr, \
                None, None, None]
def get_models():
    """Load one or two trained RNN encoder-decoder translation models.

    NOTE(review): this is a byte-for-byte duplicate of the ``get_models``
    defined immediately above (so this later definition wins at import
    time); confirm whether one copy can be removed.

    Returns an 8-element list:
        [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src, indx_word_trgt,
         state_en2fr, lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    with the last three entries ``None`` when no fr->en state is given.
    """
    args = parse_args()
    state_en2fr = prototype_state()
    if hasattr(args, 'state_en2fr'):
        with open(args.state_en2fr) as src:
            state_en2fr.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state_en2fr.update(eval("dict({})".format(args.changes)))
    state_fr2en = prototype_state()
    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        with open(args.state_fr2en) as src:
            state_fr2en.update(cPickle.load(src))
        state_fr2en.update(eval("dict({})".format(args.changes)))
    rng = numpy.random.RandomState(state_en2fr['seed'])
    enc_dec_en_2_fr = RNNEncoderDecoder(state_en2fr, rng, skip_init=True)
    enc_dec_en_2_fr.build()
    lm_model_en_2_fr = enc_dec_en_2_fr.create_lm_model()
    lm_model_en_2_fr.load(args.model_path_en2fr)
    # Pickled vocabularies for the en->fr direction.
    indx_word_src = cPickle.load(open(state_en2fr['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state_en2fr['word_indx_trgt'], 'rb'))
    if hasattr(args, 'state_fr2en') and args.state_fr2en is not None:
        rng = numpy.random.RandomState(state_fr2en['seed'])
        enc_dec_fr_2_en = RNNEncoderDecoder(state_fr2en, rng, skip_init=True)
        enc_dec_fr_2_en.build()
        lm_model_fr_2_en = enc_dec_fr_2_en.create_lm_model()
        lm_model_fr_2_en.load(args.model_path_fr2en)
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr, \
                lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en]
    else:
        return [lm_model_en_2_fr, enc_dec_en_2_fr, indx_word_src,
                indx_word_trgt, state_en2fr,\
                None, None, None]
def __init__(self, args):
    """Build a translator object from parsed command-line ``args``.

    Loads the pickled model state, builds the encoder-decoder, restores
    the trained weights, and prepares either a compiled beam search
    (``args.beam_search`` truthy) or a multi-sample sampler.
    """
    self.args = args
    self.state = prototype_state()
    with open(self.args.state) as src:
        self.state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    self.state.update(eval("dict({})".format(self.args.changes)))
    logging.basicConfig(level=getattr(logging, self.state['level']),
                        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(self.state['seed'])
    enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
    enc_dec.build()
    self.lm_model = enc_dec.create_lm_model()
    self.lm_model.load(self.args.model_path)
    # word -> index map used to binarize source sentences.
    self.indx_word = cPickle.load(open(self.state['word_indx'], 'rb'))
    self.sampler = None
    self.beam_search = None
    if self.args.beam_search:
        self.beam_search = BeamSearch(enc_dec)
        self.beam_search.compile()
    else:
        self.sampler = enc_dec.create_sampler(many_samples=True)
    # index -> word map for echoing parsed input back to the user.
    self.idict_src = cPickle.load(open(self.state['indx_word'], 'r'))
def main():
    """Compute word alignments for a parallel corpus with a trained model.

    Builds the encoder-decoder with ``compute_alignment=True``, creates a
    probability computer that also returns alignments, and hands the
    source/target/output paths off to ``comput_alignment``.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    alignment_fun = enc_dec.create_probs_computer(return_alignment=True)
    word_indx_src = cPickle.load(open(state['word_indx'], 'rb'))
    word_indx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    source_file = args.source
    target_file = args.target
    output_file = args.output
    # NOTE(review): "comput_alignment" (missing 'e') is the helper's name
    # as defined elsewhere in the project — do not "correct" it here.
    comput_alignment(source_file, target_file, output_file, alignment_fun,
                     word_indx_src, word_indx_trg, state)
def __init__(self):
    """Build a beam-search translator with hard-coded file paths.

    Unlike the CLI-driven variant, all settings are fixed constants:
    ``search_state.pkl`` / ``search_model.npz`` in the working directory,
    beam search always on, no UNK filtering, no length normalization.
    """
    # para setting
    self.arg_state = 'search_state.pkl'
    self.arg_changes = ""
    self.arg_model_path = 'search_model.npz'
    self.arg_beam_search = True
    self.arg_ignore_unk = False
    self.arg_normalize = False
    self.state = prototype_state()
    with open(self.arg_state) as src:
        self.state.update(cPickle.load(src))
    # arg_changes is always "" here, so this is effectively a no-op.
    self.state.update(eval("dict({})".format(self.arg_changes)))
    logging.basicConfig(
        level=getattr(logging, self.state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(self.state['seed'])
    self.enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
    self.enc_dec.build()
    self.lm_model = self.enc_dec.create_lm_model()
    self.lm_model.load(self.arg_model_path)
    # word -> index map used to binarize source sentences.
    self.indx_word = cPickle.load(open(self.state['word_indx'], 'rb'))
    self.beam_search = None
    self.beam_search = BeamSearch(self.enc_dec)
    self.beam_search.compile()
    # index -> word map for displaying parsed input.
    self.idict_src = cPickle.load(open(self.state['indx_word'], 'r'))
def main():
    """Serve translations over HTTP.

    Loads the model, prepares beam search or a sampler, wires Moses
    tokenizer/detokenizer subprocess commands (en -> fr), wraps everything
    in a ``Sampler`` attached to the ``HTTPServer``, and blocks in
    ``serve_forever``.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state["level"]),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s"
    )
    server_address = ("", args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)
    rng = numpy.random.RandomState(state["seed"])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state["word_indx"], "rb"))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state["indx_word"], "r"))
    # Moses perl scripts expected next to the working directory.
    tokenizer_cmd = [os.getcwd() + "/tokenizer.perl", "-l", "en", "-q", "-"]
    detokenizer_cmd = [os.getcwd() + "/detokenizer.perl", "-l", "fr", "-q", "-"]
    # Rebinds ``sampler`` from the raw theano sampler to the request-level
    # Sampler wrapper handed to the HTTP handler.
    sampler = Sampler(
        state,
        lm_model,
        indx_word,
        idict_src,
        beam_search=beam_search,
        tokenizer_cmd=tokenizer_cmd,
        detokenizer_cmd=detokenizer_cmd,
    )
    httpd.sampler = sampler
    print "Server starting.."
    httpd.serve_forever()

# NOTE(review): stray triple-quote below survives from the collapsed
# source — confirm against the full file what region it delimits.
"""
def main():
    """Serve translations over HTTP (single-quote-style duplicate).

    NOTE(review): functionally identical to the preceding server ``main``;
    only quoting/formatting differs — confirm whether one copy can go.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    server_address = ('', args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    # Moses perl scripts expected in the working directory.
    tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-']
    detokenizer_cmd = [
        os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-'
    ]
    # Rebinds ``sampler`` to the request-level Sampler wrapper.
    sampler = Sampler(state,
                      lm_model,
                      indx_word,
                      idict_src,
                      beam_search=beam_search,
                      tokenizer_cmd=tokenizer_cmd,
                      detokenizer_cmd=detokenizer_cmd)
    httpd.sampler = sampler
    print 'Server starting..'
    httpd.serve_forever()

# NOTE(review): stray triple-quote below survives from the collapsed
# source — confirm against the full file what region it delimits.
'''
def main():
    """Restore a trained RNN encoder-decoder model from disk.

    Reads the pickled state named on the command line, applies any
    ``--changes`` overrides, configures logging, then builds the network
    and loads the saved weights.
    """
    cli = parse_args()
    model_state = prototype_state()
    with open(cli.state) as handle:
        model_state.update(cPickle.load(handle))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    model_state.update(eval("dict({})".format(cli.changes)))
    logging.basicConfig(
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s",
        level=getattr(logging, model_state['level']))
    seed_rng = numpy.random.RandomState(model_state['seed'])
    encoder_decoder = RNNEncoderDecoder(model_state, seed_rng, skip_init=True)
    encoder_decoder.build()
    language_model = encoder_decoder.create_lm_model()
    language_model.load(cli.model_path)
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) scoreMaker = ScoreMaker(enc_dec) ScoreMaker.compile() indx_word_src = cPickle.load(open(state['word_indx'],'rb')) indx_word_trg = cPickle.load(open(state['word_indx_trgt'],'rb')) idict_src = cPickle.load(open(state['indx_word'],'r')) idict_trg = cPickle.load(open(state['indx_word_target'],'r')) fsrc = open(args.source, 'r') ftrg = open(args.target, 'r') for srcline, trgline in zip(fsrc, ftrg): src_seqin = srcline.strip() trg_seqin = trgline.strip() src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin, idx2word=idict_src) trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin, idx2word=idict_trg) print "Parsed Input:", src_parsed_in ScoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg) fsrc.close() ftrg.close()
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) ScoreMaker = ScoreMaker(enc_dec) ScoreMaker.compile() indx_word_src = cPickle.load(open(state['word_indx'], 'rb')) indx_word_trg = cPickle.load(open(state['word_indx_trgt'], 'rb')) idict_src = cPickle.load(open(state['indx_word'], 'r')) idict_trg = cPickle.load(open(state['indx_word_target'], 'r')) fsrc = open(args.source, 'r') ftrg = open(args.target, 'r') for srcline, trgline in zip(fsrc, ftrg): src_seqin = srcline.strip() trg_seqin = trgline.strip() src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin, idx2word=idict_src) trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin, idx2word=idict_trg) print "Parsed Input:", src_parsed_in ScoreMaker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg) fsrc.close() ftrg.close()
def main():
    """Score translations with a trained model in one of three modes.

    ``batch``    — iterate a (possibly HDF5) parallel corpus in batches
                   and print one score per sentence pair.
    ``interact`` — read source/target pairs from stdin and print
                   per-word probabilities and total cost.
    ``txt``      — walk two text files, printing probabilities and (when
                   verbose) the soft-alignment matrix per target word.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    state['sort_k_batches'] = 1  # which means don't sort
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    if args.mode == "batch":
        # Text input unless BOTH files are .h5; no files means train data.
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5") and
                                  args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state, args.src, indx_word_src,
                                           args.trg, indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)
        score_file = open(args.scores, "w") if args.scores else sys.stdout
        scorer = enc_dec.create_scorer(batch=True)
        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch == None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break
            if args.y_noise:
                # Corrupt targets: replace each token with a random word
                # (ids 0..99) with probability args.y_noise.
                y = batch['y']
                random_words = numpy.random.randint(0, 100,
                                                    y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise,
                                                    y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y
            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                              batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >> score_file, "{:.5e}".format(float(s))
            n_samples += batch['x'].shape[1]
            count += 1
            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug(
                "{} batches, {} samples, {} per sample; example scores: {}".
                format(count, n_samples, up_time / scores.shape[0],
                       scores[:5]))
        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state,
                                      indx_word_src,
                                      src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state,
                                       indx_word_trgt,
                                       trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                # Best-effort REPL: report the error and keep prompting.
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(
                    state,
                    indx_word_src,
                    src_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_source'],
                    null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(
                    state,
                    indx_word_trgt,
                    trgt_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_target'],
                    null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        # NOTE: the loop index ``i`` below shadows the
                        # sentence counter; the counter is clobbered here.
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20),
                                                alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(
                                word,
                                src_words[j] if j < len(src_words)
                                else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            # End of either input file terminates the loop.
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
def main():
    """Translate a source file with beam search plus extra feature scores.

    Feature weights come from ``state['weights']`` as a comma-separated
    list (order documented in the inline comment below).  With ``--nbest``
    the full n-best list is written with per-feature costs in a
    Moses-style ``|||`` format; otherwise only the best translation (plus
    optional alignment / UNK positions) is written.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # CLI overrides for the pickled state.
    # NOTE: eval() of the config file executes arbitrary code.
    if args.config:
        state.update(eval(open(args.config).read()))
    if args.weights:
        state['weights'] = args.weights
    if args.lm_file:
        state['lm_file'] = args.lm_file
    if args.lm_vocab:
        state['lm_vocab'] = args.lm_vocab
    if args.pt_file:
        state['phrase_table'] = args.pt_file
    if args.lm_ngram:
        state['lm_ngram'] = args.lm_ngram
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'], 'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'], 'r'))
    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])
    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')
    start_time = time.time()
    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin,
                                     idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        # sample() here returns per-hypothesis feature breakdowns in
        # addition to translations and total costs.
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(
            lm_model, seqin, seq, n_samples,
            beam_search=beam_search,
            ignore_unk=args.ignore_unk,
            normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))
        if args.nbest:
            # Sort every feature array by total cost (ascending).
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(
                nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(
                nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(
                nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))
            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums,
                                            nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                # Word penalty: token count plus end-of-sentence.
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (
                    t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\
                        -rnn_cost * beam_search.weight_rnn, \
                        -sum_lm * beam_search.weight_lm, \
                        -pure_tm * beam_search.weight_tm, \
                        -sum_tm * beam_search.weight_tm, \
                        -sum_wp * beam_search.weight_wp, c)
            # Blank line separates n-best blocks per source sentence.
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                # Append the positions of UNK tokens in the best hypothesis.
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))
            print >> ftrans, out_str
        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best],
                                                  ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)
    fsrc.close()
    ftrans.close()
def main():
    """Translate a file (or interactive input) and optionally emit alignments.

    With ``--source``/``--trans`` given, translates every line of the
    source file via beam search (or the sampler), writing the best
    hypothesis plus optional alignment and n-best columns.  Otherwise
    drops into an interactive sampling loop on stdin.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    ###########################################################
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model, seq,
                                                 args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                # Sampler path draws a single sample per sentence.
                trans, costs, _, aligns = sample(lm_model, seq, 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []
            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)
            if args.beam_search and args.nbest:
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(
                    nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join(
                    "%s | %f" % (t, c)
                    for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str
            print >> ftrans, out_str
            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)
        fsrc.close()
        ftrans.close()
    else:
        # Interactive loop: prompt for a sentence and sampling parameters.
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha, verbose=True)
def main():
    """Translate with beam search plus extra feature scores.

    NOTE(review): byte-for-byte duplicate of the feature-weighted
    translation ``main`` defined earlier in the file (only formatting
    differs); confirm whether one copy can be removed.

    Feature weights come from ``state['weights']`` (comma-separated;
    order in the inline comment below).  ``--nbest`` writes a Moses-style
    ``|||`` n-best list with per-feature costs.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # CLI overrides for the pickled state; eval() of the config file
    # executes arbitrary code — trusted input only.
    if args.config:
        state.update(eval(open(args.config).read()))
    if args.weights:
        state['weights'] = args.weights
    if args.lm_file:
        state['lm_file'] = args.lm_file
    if args.lm_vocab:
        state['lm_vocab'] = args.lm_vocab
    if args.pt_file:
        state['phrase_table'] = args.pt_file
    if args.lm_ngram:
        state['lm_ngram'] = args.lm_ngram
    logging.basicConfig(level=getattr(logging, state['level']),
                        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))
    idict_src = cPickle.load(open(state['indx_word'],'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'],'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'],'r'))
    #0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])
    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')
    start_time = time.time()
    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin,
                                     idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        # sample() returns per-hypothesis feature breakdowns as well.
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = sample(lm_model,
                seqin, seq, n_samples,
                beam_search=beam_search,
                ignore_unk=args.ignore_unk,
                normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))
        if args.nbest:
            # Sort every feature array by total cost (ascending).
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))
            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                # Word penalty: token count plus end-of-sentence.
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (t, -rnn_cost,
                                                               -sum_lm,
                                                               -sum_tm,
                                                               -sum_wp)
                if args.verbose:
                    print >>sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,\
                        -rnn_cost * beam_search.weight_rnn, \
                        -sum_lm * beam_search.weight_lm, \
                        -pure_tm * beam_search.weight_tm, \
                        -sum_tm * beam_search.weight_tm, \
                        -sum_wp * beam_search.weight_wp, c)
            # Blank line separates n-best blocks per source sentence.
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                # Append positions of UNK tokens in the best hypothesis.
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))
            print >>ftrans, out_str
        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best],
                                                  ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".
                         format((time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)
    fsrc.close()
    ftrans.close()
sen = indices_to_words(lm_model.word_indxs, trans[i]) # sentences.append(" ".join(sen)) sentences.append(sen) for i in range(len(costs)): if verbose: print "{}: {}".format(costs[i], sentences[i]) return sentences, costs, alignment model_path = "path/to/search_model.npz" state_file = "path/to/search_state.pkl" states = prototype_state() with open(state_file) as src: states.update(cPickle.load(src)) states.update(eval("dict({})".format(""))) logging.basicConfig(level=getattr(logging, states['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(states['seed']) enc_dec = RNNEncoderDecoder(states, rng, skip_init=True) enc_dec.build() lm_models = enc_dec.create_lm_model() lm_models.load(model_path) indx_word = cPickle.load(open(states['word_indx'],'rb')) beam_search = None beam_search = BeamSearch(enc_dec)
def main():
    """Score translations in one of three modes (batch/interact/txt).

    NOTE(review): duplicate of the scoring ``main`` defined earlier in the
    file (only formatting differs); confirm whether one copy can go.
    """
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    # NOTE: eval() of --changes executes arbitrary code; trusted CLI only.
    state.update(eval("dict({})".format(args.changes)))
    state['sort_k_batches'] = 1
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False
    logging.basicConfig(level=getattr(logging, state['level']),
                        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word_src = cPickle.load(open(state['word_indx'],'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    if args.mode == "batch":
        # Text input unless BOTH files are .h5; no files means train data.
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5") and
                                  args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(state,
                                           args.src, indx_word_src,
                                           args.trg, indx_word_trgt,
                                           state['bs'],
                                           raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)
        score_file = open(args.scores, "w") if args.scores else sys.stdout
        scorer = enc_dec.create_scorer(batch=True)
        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch == None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break
            if args.y_noise:
                # Corrupt targets: random word (ids 0..99) with
                # probability args.y_noise per token.
                y = batch['y']
                random_words = numpy.random.randint(0, 100,
                                                    y.shape).astype("int64")
                change_mask = numpy.random.binomial(1, args.y_noise,
                                                    y.shape).astype("int64")
                y = change_mask * random_words + (1 - change_mask) * y
                batch['y'] = y
            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                              batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >>score_file, "{:.5e}".format(float(s))
            n_samples += batch['x'].shape[1]
            count += 1
            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format(
                count, n_samples, up_time/scores.shape[0], scores[:5]))
        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        while True:
            try:
                compute_probs = enc_dec.create_probs_computer()
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                # Best-effort REPL: report and keep prompting.
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            i = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(state, indx_word_src,
                                                 src_line,
                                                 raise_unk=not args.allow_unk,
                                                 unk_sym=state['unk_sym_source'],
                                                 null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(state, indx_word_trgt,
                                                   trgt_line,
                                                   raise_unk=not args.allow_unk,
                                                   unk_sym=state['unk_sym_target'],
                                                   null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        # NOTE: loop index ``i`` below shadows and clobbers
                        # the sentence counter.
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for i, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20),
                                                alignment[i, :, 0])
                        print "Generated by:"
                        for i, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[i, :, 0])
                            print "{} <--- {}".format(
                                word,
                                src_words[j] if j < len(src_words)
                                else "<eos>")
                i += 1
                if i % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(i)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            # End of either input file terminates the loop.
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'],'r')) if args.source and args.trans: # Actually only beam search is currently supported here assert beam_search assert args.beam_size fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if args.verbose: print "Parsed Input:", parsed_in trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) try: best = numpy.argmin(costs) print >>ftrans, trans[best] total_cost += costs[best] except: print >> ftrans, "FAIL" if args.verbose: print "Translation:", trans[best] if (i + 1) % 100 == 0: ftrans.flush() logger.debug("Current speed is {} per sentence". format((time.time() - start_time) / (i + 1))) print "Total cost of the translations: {}".format(total_cost) fsrc.close() ftrans.close() else: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = None if not args.beam_search: alpha = float(raw_input('Inverse Temperature? 
')) seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) print "Parsed Input:", parsed_in except Exception: print "Exception while parsing your input:" traceback.print_exc() continue sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize, alpha=alpha, verbose=True)
def main(): args = parse_args() state = prototype_state() with open(args.state) as src: state.update(cPickle.load(src)) state.update(eval("dict({})".format(args.changes))) logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") rng = numpy.random.RandomState(state['seed']) enc_dec = RNNEncoderDecoder(state, rng, skip_init=True) enc_dec.build() lm_model = enc_dec.create_lm_model() lm_model.load(args.model_path) indx_word = cPickle.load(open(state['word_indx'],'rb')) sampler = None beam_search = None if args.beam_search: beam_search = BeamSearch(enc_dec) beam_search.compile() else: sampler = enc_dec.create_sampler(many_samples=True) idict_src = cPickle.load(open(state['indx_word'],'r')) if args.source and args.trans: # Actually only beam search is currently supported here assert beam_search assert args.beam_size fsrc = open(args.source, 'r') ftrans = open(args.trans, 'w') start_time = time.time() n_samples = args.beam_size total_cost = 0.0 logging.debug("Beam size: {}".format(n_samples)) for i, line in enumerate(fsrc): seqin = line.strip() seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src) if lm_model.maintain_coverage: trans, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) else: trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler, beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize) if args.verbose: print "Parsed Input:", parsed_in if len(trans) == 0: trans = ['Failed'] costs = [0.0] best = numpy.argmin(costs) print >>ftrans, trans[best] if args.verbose: print "Translation:", trans[best] if lm_model.maintain_coverage: