def sample(self, sentence, ignore_unk=False):
    if self.tokenizer_cmd:
        tokenizer = Popen(self.tokenizer_cmd, stdin=PIPE, stdout=PIPE, shell=True)
        sentence, _ = tokenizer.communicate(sentence)
    seq, parsed_in = parse_input(self.state, self.indx_word, sentence,
                                 idx2word=self.idict_src)
    # Sample a translation and detokenize it
    trans, cost, _ = sample(self.lm_model, seq, 10,
                            beam_search=self.beam_search,
                            normalize=True, ignore_unk=ignore_unk)
    if self.detokenizer_cmd:
        detokenizer = Popen(self.detokenizer_cmd, stdin=PIPE, stdout=PIPE, shell=True)
        detokenized_sentence, _ = detokenizer.communicate(trans[0])
    else:
        detokenized_sentence = trans[0]
    unknown_words = [word for word, index in zip(sentence.split(), seq)
                     if index == 1]
    return detokenized_sentence, unknown_words
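# A minimal usage sketch for the wrapper above. The class name, its
# constructor arguments, and the Moses tokenizer/detokenizer scripts are
# assumptions for illustration, not part of the original code. The commands
# are plain shell strings because Popen is invoked with shell=True.
translator = TranslationServer(state, lm_model, beam_search, indx_word, idict_src)
translator.tokenizer_cmd = 'perl tokenizer.perl -l en'
translator.detokenizer_cmd = 'perl detokenizer.perl -l fr'
text, unknown = translator.sample("This is a test sentence .", ignore_unk=True)
print text
print "Unknown source words:", unknown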
def main():
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    # Call compile/score on the instance, not on the ScoreMaker class
    score_maker = ScoreMaker(enc_dec)
    score_maker.compile()
    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))
    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    for srcline, trgline in zip(fsrc, ftrg):
        src_seqin = srcline.strip()
        trg_seqin = trgline.strip()
        src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin,
                                             idx2word=idict_src)
        trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin,
                                             idx2word=idict_trg)
        print "Parsed Input:", src_parsed_in
        score_maker.score(lm_model, src_seq, trg_seq, idict_src, idict_trg)
    fsrc.close()
    ftrg.close()
def next(self):
    seqs = []
    try:
        while len(seqs) < self.batch_size:
            line = next(self.txt_file).strip()
            seq, _ = parse_input(self.state, self.indx, line,
                                 raise_unk=self.raise_unk)
            seqs.append(seq)
        return self._pack(seqs)
    except StopIteration:
        if not seqs:
            raise StopIteration()
        return self._pack(seqs)
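# A sketch of how this next() method is typically driven (the class name and
# constructor are assumptions; only next() is shown above). Under the
# Python 2 iterator protocol, next() is called until it raises StopIteration,
# so the final, possibly short batch is still returned once.
it = BatchTxtIterator(state, txt_file=open('source.txt'), indx=indx_word,
                      batch_size=64, raise_unk=False)
while True:
    try:
        batch = it.next()
    except StopIteration:
        break
    process_batch(batch)  # hypothetical consumer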
def sample(self, seqin, n_samples, alpha=None):
    try:
        seq, parsed_in = parse_input(self.state, self.indx_word, seqin,
                                     idx2word=self.idict_src)
        print "Parsed Input:", parsed_in
        (sentences, cost, _) = sample(self.lm_model, seq, n_samples,
                                      sampler=self.sampler,
                                      beam_search=self.beam_search,
                                      ignore_unk=self.args.ignore_unk,
                                      normalize=self.args.normalize,
                                      alpha=alpha, verbose=True)
        return sentences, cost
    except Exception:
        print "Exception while parsing your input:"
        traceback.print_exc()
        return None, None
def translate(source):
    # source = "哈 哈 哈"
    # languages = "zh-en"
    start_time = time.time()
    # n_samples = args.beam_size
    n_samples = 12
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    seqin = source.strip()
    print source
    seq, parsed_in = parse_input(states, indx_word, seqin, idx2word=idict_src)
    trans, costs, alignment = sample(lm_models, seq, n_samples,
                                     beam_search=beam_search,
                                     ignore_unk=False, normalize=False)
    best = numpy.argmin(costs)
    print type(trans[best])
    return trans[best], alignment
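# Hypothetical call for illustration (not in the original): translate() reads
# the module-level states, indx_word, idict_src, lm_models and beam_search,
# so those globals must already be set up by loading code elsewhere.
best_translation, alignment = translate("今天 天气 很 好")
print best_translation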
def sample(self, sentence, ignore_unk=False, beamwidth=10):
    if self.tokenizer_cmd:
        tokenizer = Popen(self.tokenizer_cmd, stdin=PIPE, stdout=PIPE)
        sentence, _ = tokenizer.communicate(sentence)
    seq, parsed_in = parse_input(self.state, self.indx_word, sentence,
                                 idx2word=self.idict_src)
    # Sample a translation and detokenize it
    trans, cost, _ = sample(self.lm_model, seq, beamwidth,
                            beam_search=self.beam_search,
                            normalize=True, ignore_unk=ignore_unk)
    if self.detokenizer_cmd:
        detokenizer = Popen(self.detokenizer_cmd, stdin=PIPE, stdout=PIPE)
        detokenized_sentence, _ = detokenizer.communicate(trans[0])
    else:
        detokenized_sentence = trans[0]
    unknown_words = [word for word, index in zip(sentence.split(), seq)
                     if index == 1]
    return detokenized_sentence, unknown_words
def getSamples(self, seqori, k):
    # Split the sentence into 3-byte chunks separated by spaces
    # (one UTF-8 CJK character each)
    seqin = ""
    for i in range(0, len(seqori), 3):
        w = seqori[i:i + 3]
        seqin = seqin + w + " "
    print "split seq:#%s#" % (seqin)
    #return
    seq, parsed_in = parse_input(self.state, self.indx_word, seqin,
                                 idx2word=self.idict_src)
    ans, align, rester, updater = self.sample(seq, k)
    return ans, align, rester, updater
def getRep(self, seqori):
    # Split the sentence into 3-byte chunks separated by spaces
    # (one UTF-8 CJK character each)
    seqin = ""
    for i in range(0, len(seqori), 3):
        w = seqori[i:i + 3]
        seqin = seqin + w + " "
    print "split seq:#%s#" % (seqin)
    seq, parsed_in = parse_input(self.state, self.indx_word, seqin,
                                 idx2word=self.idict_src)
    rep = self.beam_search.search(seq, 20,
                                  ignore_unk=self.arg_ignore_unk,
                                  minlen=len(seq) / 2, getRep=True)
    return rep
def main():
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    if args.config:
        state.update(eval(open(args.config).read()))
    if args.weights:
        state['weights'] = args.weights
    if args.lm_file:
        state['lm_file'] = args.lm_file
    if args.lm_vocab:
        state['lm_vocab'] = args.lm_vocab
    if args.pt_file:
        state['phrase_table'] = args.pt_file
    if args.lm_ngram:
        state['lm_ngram'] = args.lm_ngram
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    trg_idx2word = cPickle.load(open(state['indx_word_target'], 'r'))
    trg_word2idx = cPickle.load(open(state['word_indx_trgt'], 'r'))
    # 0:UNK_tm_value 1:rnn_weight 2:lm_weight 3:tm_weight 4:word_penalty_weight
    fea_weights = map(float, state['weights'].split(','))
    beam_search = BeamSearch(enc_dec, trg_idx2word, trg_word2idx, indx_word)
    beam_search.compile()
    beam_search.init_features(state, fea_weights)
    #beam_search.init_lm(state['lm_vocab'], state['lm_file'], ngram=int(state['lm_ngram']), weight=fea_weights[2])
    #beam_search.init_tm(state['phrase_table'], weights=fea_weights[3:])
    fsrc = open(args.source, 'r')
    ftrans = open(args.trans, 'w')
    start_time = time.time()
    n_samples = args.beam_size
    total_cost = 0.0
    logging.debug("Beam size: {}".format(n_samples))
    for i, line in enumerate(fsrc):
        seqin = line.strip()
        seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
        if args.verbose:
            print >> sys.stderr, "Parsed Input:", parsed_in
        trans, costs, trans_ids, aligns, lm_costs, tm_costs, unk_nums, rnn_costs = \
            sample(lm_model, seqin, seq, n_samples, beam_search=beam_search,
                   ignore_unk=args.ignore_unk, normalize=args.normalize)
        #for (i, t) in enumerate(trans):
        #    costs[i] = costs[i] / len(t)
        best = numpy.argmin(costs)
        align_str = []
        for (idx, _a) in enumerate(aligns[best]):
            align_str.append("[%s]" % ' '.join(map(str, _a)))
        if args.nbest:
            nbest_trans = trans
            nbest_costs = costs
            nbest_lm_costs = lm_costs
            nbest_tm_costs = tm_costs
            nbest_unk_nums = unk_nums
            nbest_rnn_costs = rnn_costs
            nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
            nbest_lm_costs = numpy.array(nbest_lm_costs)[numpy.argsort(nbest_costs)]
            nbest_tm_costs = numpy.array(nbest_tm_costs)[numpy.argsort(nbest_costs)]
            nbest_unk_nums = numpy.array(nbest_unk_nums)[numpy.argsort(nbest_costs)]
            nbest_rnn_costs = numpy.array(nbest_rnn_costs)[numpy.argsort(nbest_costs)]
            nbest_costs = numpy.array(sorted(nbest_costs))
            for (t, lm, tm, c, u, r) in zip(nbest_trans, nbest_lm_costs,
                                            nbest_tm_costs, nbest_costs,
                                            nbest_unk_nums, nbest_rnn_costs):
                sum_lm = numpy.sum(lm)
                sum_unk = numpy.sum(u)
                sum_tm = numpy.sum(tm)
                rnn_cost = numpy.sum(r)
                sum_wp = len(t.split(' ')) + 1
                #rnn_cost = c - sum_lm * beam_search.weight_lm - sum_tm * beam_search.weight_tm - sum_wp * beam_search.weight_wp
                pure_tm = sum_tm + sum_unk * beam_search.unk_tm_value
                #rnn_cost = sum_rnn / beam_search.weight_rnn
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, c, rnn_cost, sum_lm, sum_tm, sum_wp)
                #print >> ftrans, "%s ||| %f %f %f %f %f ||| 0" % (t, sum_unk * beam_search.weight_tm, -rnn_cost, -sum_lm, -pure_tm, -sum_wp)
                print >> ftrans, "%s ||| %f %f %f %f ||| 0" % (
                    t, -rnn_cost, -sum_lm, -sum_tm, -sum_wp)
                if args.verbose:
                    print >> sys.stderr, "%s ||| %f %f %f %f %f %f %f ||| 0" % (
                        t, sum_unk * beam_search.unk_tm_value * beam_search.weight_tm,
                        -rnn_cost * beam_search.weight_rnn,
                        -sum_lm * beam_search.weight_lm,
                        -pure_tm * beam_search.weight_tm,
                        -sum_tm * beam_search.weight_tm,
                        -sum_wp * beam_search.weight_wp, c)
            print >> ftrans, ''
            #nbest_str = ' ||| '.join("%s | %f" % (t, c) for (t, c) in zip(nbest_trans, nbest_costs))
            #out_str += "\t" + nbest_str
        else:
            out_str = trans[best]
            if args.alignment:
                out_str += "\t" + ' '.join(align_str)
            if args.show_unk:
                best_ids = trans_ids[best]
                unk_ids = []
                for (i, idx) in enumerate(best_ids):
                    if idx == beam_search.unk_id:
                        unk_ids.append(i)
                out_str += "\t" + ' '.join(map(str, unk_ids))
            print >> ftrans, out_str
        if args.verbose:
            print "[Translation]%s\t[Align]%s" % (trans[best], ' '.join(align_str))
        total_cost += costs[best]
        if (i + 1) % 100 == 0:
            ftrans.flush()
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
    print "Total cost of the translations: {}".format(total_cost)
    print "Total used time: {}".format(time.time() - start_time)
    fsrc.close()
    ftrans.close()
def __call__(self):
    """
    Opens the file for the validation set and creates a subprocess
    for the multi-bleu script.

    Returns a boolean indicating whether the current model should be saved.
    """
    print "Started Validation: "
    val_start_time = time.time()
    fsrc = open(self.state['validation_set'], 'r')
    mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
    total_cost = 0.0
    if self.verbose:
        ftrans = open(self.state['validation_set_out'], 'w')
    for i, line in enumerate(fsrc):
        # Load the sentence, retrieve the sample, write to file
        if self.state['source_encoding'] == 'utf8':
            seqin = line.strip().decode('utf-8')
        else:
            seqin = line.strip()
        seq, parsed_in = parse_input(self.state, self.indx_word, seqin,
                                     idx2word=self.idict_src)
        # Draw a sample, checking to ensure we don't get an empty string back
        trans, costs, _ = sample(self.lm_model, seq, self.n_samples,
                                 beam_search=self.beam_search,
                                 ignore_unk=self.ignore_unk,
                                 normalize=self.normalize)
        try:
            best = numpy.argmin(costs)
            total_cost += costs[best]
            trans_out = trans[best]
        except ValueError:
            print "Could not find a translation for line: {}".format(i + 1)
            trans_out = u'UNK' if self.state['target_encoding'] == 'utf8' else 'UNK'
        # Write to the subprocess, and to the file if it exists
        if self.state['target_encoding'] == 'utf8':
            print >> mb_subprocess.stdin, trans_out.encode('utf8').replace(" ", "")
            if self.verbose:
                print >> ftrans, trans_out.encode('utf8').replace(" ", "")
        else:
            print >> mb_subprocess.stdin, trans_out
            if self.verbose:
                print >> ftrans, trans_out
        if i != 0 and i % 50 == 0:
            print "Translated {} lines of validation set...".format(i)
        mb_subprocess.stdin.flush()
    print "Total cost of the validation: {}".format(total_cost)
    fsrc.close()
    if self.verbose:
        ftrans.close()
    # Send end of file, read output
    mb_subprocess.stdin.close()
    out_parse = re.match(r'BLEU = [-.0-9]+', mb_subprocess.stdout.readline())
    print "Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.)
    assert out_parse is not None
    # Extract the score
    bleu_score = float(out_parse.group()[6:])
    self.val_bleu_curve.append(bleu_score)
    print bleu_score
    mb_subprocess.terminate()
    # Determine whether or not we should save
    if self.best_bleu < bleu_score:
        self.best_bleu = bleu_score
        return True
    return False
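# A plausible way to construct multibleu_cmd for the validator above (an
# assumption: the script location and the 'validation_set_grndtruth' key are
# illustrative). Moses' multi-bleu.perl takes the reference file as an
# argument, reads hypotheses on stdin, and prints a line starting with
# "BLEU = ...", which the regex above extracts.
multibleu_cmd = ['perl', 'multi-bleu.perl',
                 state['validation_set_grndtruth']]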
def main():
    args = parse_args()
    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    # Load dictionary (source word index : list of target word indices)
    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f)
    if args.less_transfer:
        for elt in topn:
            # Take the first args.num_ttables only
            topn[elt] = topn[elt][:args.num_ttables]
    else:
        for elt in topn:
            # Take the first args.num_ttables only and convert the list to a set
            topn[elt] = set(topn[elt][:args.num_ttables])
    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])
        original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
        # On GPU, this will free memory for the next models
        # Additional gains could be made by rolling the source vocab
        lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1,), dtype=numpy.float32))
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  # Source w2i
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError
        #sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))  # Source i2w
    original_target_i2w = lm_models[0].word_indxs.copy()
    # I don't think that we need target_word2index
    max_words = len(original_b_dec_deep_softmax[0])
    if args.less_transfer:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Up to now
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False
        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                # seq is the ndarray of indices
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations for each source word
                        indices.extend(topn[elt])
                output = update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    output = True
                if output:
                    # Save dictionary for the lines preceding this one
                    D_dict[prev_line] = D.copy()
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [state['null_sym_target'],
                                        state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    # Assumes you cannot fill d with only 1 line
                    update_dicts(indices, d, D, C, args.num_common)
        D_dict[prev_line] = D.copy()
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            # seq is the ndarray of indices
            # For now, keep all input words in the model.
            # In the future, we may want to filter them to save on memory,
            # but this isn't really much of an issue now
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
                if i in D_dict:
                    indices = D_dict[i].keys()
                    # Find the new eos and unk positions
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([(k, original_target_i2w[index])
                                                    for k, index in enumerate(indices)])  # target index2word
                trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                         sampler=sampler,
                                         beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize,
                                         normalize_p=args.normalize_p,
                                         eos_id=eos_id, unk_id=unk_id,
                                         final=True, wp=args.wp)
            else:
                # Extract the indices you need
                indices = set()
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations for each source word
                        indices = indices.union(topn[elt])
                num_common_words = args.num_common
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False
                    if args.final:  # No matter the number of words
                        final = True
                    # Add the common words
                    indices = indices.union(set(xrange(num_common_words)))
                    # Convert back to a list for advanced indexing
                    indices = list(indices)
                    # Find the new eos and unk positions
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                    lm_models[0].word_indxs = dict([(k, original_target_i2w[index])
                                                    for k, index in enumerate(indices)])  # target index2word
                    try:
                        trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize,
                                                 normalize_p=args.normalize_p,
                                                 eos_id=eos_id, unk_id=unk_id,
                                                 final=final)
                        # Breaks only if it succeeded (if final=True, will always succeed)
                        break
                    except RuntimeError:
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >> ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >> ftrans, str(i + args.start) + ' ||| ' + \
                        trans[elt] + ' ||| ' + str(costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
def process_sentence(source_sentence, model, max_phrase_length, n_samples,
                     copy_UNK_words, add_period, normalize, reverse_score):
    # Setting up comp_score function
    logger.debug("setting up comp_score function")
    [lm_model, enc_dec, indx_word_src, indx_word_trgt, state,
     lm_model_fr_2_en, enc_dec_fr_2_en, state_fr2en] = model
    eol_src = state['null_sym_source']
    src_seq = parse_input(state, indx_word_src, source_sentence)
    if src_seq[-1] == eol_src:
        src_seq = src_seq[:-1]
    n_s = len(src_seq)
    # Create sorted phrase lists
    tiled_source_phrase_list = []
    index_order_list = []
    for i in xrange(n_s):
        for j in numpy.arange(i, min(i + max_phrase_length, n_s)):
            index_order_list.append([i, j])
    logger.debug("sorting list")
    index_order_list.sort(key=lambda (i, j): (j - i))
    logger.debug("creating phrase lists")
    if add_period:
        period_src = indx_word_src['.']
        for i, j in index_order_list:
            tiled_source_phrase_list.append(
                numpy.hstack((src_seq[i:j + 1], period_src, eol_src)))
    else:
        for i, j in index_order_list:
            tiled_source_phrase_list.append(
                numpy.hstack((src_seq[i:j + 1], eol_src)))
    # Compute nested score dictionary
    logger.debug("computing nested score dictionary")
    score_dict = {}
    trans = {}
    for phrase_idx in xrange(0, len(index_order_list)):
        logger.debug("{0} out of {1}".format(phrase_idx, len(index_order_list)))
        i, j = index_order_list[phrase_idx]
        logger.debug("Translating phrase : {}".format(
            " ".join(source_sentence.strip().split()[i:j + 1])))
        if copy_UNK_words:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            # Index 1 is the UNK token
            n_UNK_words = numpy.sum([word == 1 for word in phrase_to_translate])
            if n_UNK_words >= 1 and n_UNK_words == len(phrase_to_translate) - 1:
                # The phrase is all UNK words: copy the source through
                suggested_translation = " ".join(
                    source_sentence.strip().split()[i:j + 1])
                trans[i, j] = suggested_translation
                score = .0001
                score_dict[i, j] = score
            if n_UNK_words >= 1 and n_UNK_words != len(phrase_to_translate) - 1:
                suggested_translation = "WILL NOT BE USED"
                trans[i, j] = suggested_translation
                score = 1e9
                score_dict[i, j] = score
            if n_UNK_words == 0:
                suggested_translation, score = sample_targets(
                    input_phrase=tiled_source_phrase_list[phrase_idx],
                    model=model,
                    n_samples=n_samples,
                    reverse_score=reverse_score,
                    normalize=normalize)
                trans[i, j] = suggested_translation
                score_dict[i, j] = score
        else:
            phrase_to_translate = tiled_source_phrase_list[phrase_idx]
            suggested_translation, score = sample_targets(
                input_phrase=phrase_to_translate,
                model=model,
                n_samples=n_samples,
                reverse_score=reverse_score,
                normalize=normalize)
            trans[i, j] = suggested_translation
            score_dict[i, j] = score
    # Remove the period at the end if not the last word
    # Lower-case the first word if not the first word
    if add_period:
        for phrase_idx in xrange(0, len(index_order_list)):
            i, j = index_order_list[phrase_idx]
            if i != 0:
                trans[i, j] = " ".join([trans[i, j][0].lower()] + [trans[i, j][1:]])
            if j != len(src_seq) - 1:
                last_word = trans[i, j].strip().split()[-1]
                if last_word == '.':
                    trans[i, j] = " ".join(trans[i, j].strip().split()[:-1])
    # Translation of the full sentence without segmentation
    logger.debug("Translating full sentence")
    phrase_to_translate = numpy.hstack((src_seq, eol_src))
    full_translation, __ = sample_targets(input_phrase=phrase_to_translate,
                                          model=model,
                                          n_samples=n_samples,
                                          reverse_score=reverse_score,
                                          normalize=normalize)
    logger.debug("Translation output: {}".format(full_translation))
    return trans, score_dict, full_translation
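# One plausible consumer of the (trans, score_dict) pair returned above (an
# assumption, not part of the original code): a dynamic program that picks
# the segmentation of the source into phrases minimizing the total score.
# Phrases marked "WILL NOT BE USED" carry score 1e9, so they are avoided.
def best_segmentation(trans, score_dict, n_s):
    INF = float('inf')
    # best[k] holds (cost of best segmentation of words 0..k-1, backpointer)
    best = [(0.0, None)] + [(INF, None)] * n_s
    for j in xrange(n_s):
        for i in xrange(j + 1):
            if (i, j) in score_dict:
                cand = best[i][0] + score_dict[i, j]
                if cand < best[j + 1][0]:
                    best[j + 1] = (cand, i)
    # Backtrack from the end of the sentence
    pieces, k = [], n_s
    while k > 0:
        i = best[k][1]
        pieces.append(trans[i, k - 1])
        k = i
    return " ".join(reversed(pieces))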
def main():
    args = parse_args()
    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(
                        lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(
                        lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(
                    lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]
            best = numpy.argmin(costs)
            print >> ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape: (target_len, source_len)
                # we reverse it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()
            if lm_model.maintain_coverage:
                # since we filtered <eos> from trans[best], thus the index adds 1
                coverage = coverages[best]
                print "Coverage:",
                words = parsed_in.split()
                for k in xrange(len(words)):
                    print '%s/%.2f' % (words[k], coverage[k]),
                print ''
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    print 'Fertility: ',
                    for k in xrange(len(words)):
                        print '%s/%.2f' % (words[k], fertility[k]),
                    print ''
                print
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples, sampler=sampler,
                   beam_search=beam_search, ignore_unk=args.ignore_unk,
                   normalize=args.normalize, alpha=alpha, verbose=True)
def main():
    args = parse_args()
    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    t_indx_word = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    sampler = None
    beam_search = BeamSearch(enc_dec)
    beam_search.compile()
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    t_idict_src = cPickle.load(open(state['indx_word_target'], 'r'))
    fsrc = open(args.source, 'r')
    ftrg = open(args.target, 'r')
    start_time = time.time()
    total_cost = 0.0
    i = 0
    while True:
        try:
            seqin = fsrc.next().strip()
            seqout = ftrg.next().strip()
        except StopIteration:
            break
        seq, parsed_in = parse_input(state, indx_word, seqin,
                                     idx2word=idict_src)
        out, parsed_out = parse_target(state, t_indx_word, seqout,
                                       idx2word=t_idict_src)
        if lm_model.maintain_coverage:
            if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                aligns, costs, coverage, fertility = force_decoding(
                    lm_model, seq, out, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
            else:
                aligns, costs, coverage = force_decoding(
                    lm_model, seq, out, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk,
                    normalize=args.normalize)
        else:
            aligns, costs = force_decoding(lm_model, seq, out,
                                           sampler=sampler,
                                           beam_search=beam_search,
                                           ignore_unk=args.ignore_unk,
                                           normalize=args.normalize)
        print "Parsed Input:", parsed_in
        print "Parsed Target:", parsed_out
        print 'Aligns:'
        print aligns.tolist()
        if lm_model.maintain_coverage:
            # since we filtered <eos> from trans[best], thus the index adds 1
            print "Coverage:",
            words = parsed_in.split()
            for k in xrange(len(words)):
                print '%s/%.2f' % (words[k], coverage[k]),
            print ''
            if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                print 'Fertility: ',
                for k in xrange(len(words)):
                    print '%s/%.2f' % (words[k], fertility[k]),
                print ''
            print
        total_cost += costs[0]
        if (i + 1) % 100 == 0:
            logger.debug("Current speed is {} per sentence".format(
                (time.time() - start_time) / (i + 1)))
        # Advance the sentence counter (missing in the original loop)
        i += 1
    print "Total cost of the translations: {}".format(total_cost)
    fsrc.close()
    ftrg.close()
def main():
    args = parse_args()
    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    if args.num_common and args.num_ttables and args.topn_file:
        # Load dictionary (source word index : list of target word indices)
        with open(args.topn_file, 'rb') as f:
            topn = cPickle.load(f)
        for elt in topn:
            # Take the first args.num_ttables only
            topn[elt] = topn[elt][:args.num_ttables]
    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    alignment_fns = []
    if args.num_common and args.num_ttables and args.topn_file:
        original_W_0_dec_approx_embdr = []
        original_W2_dec_deep_softmax = []
        original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True,
                                          compute_alignment=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])
        alignment_fns.append(theano.function(inputs=enc_decs[i].inputs,
                                             outputs=[enc_decs[i].alignment],
                                             name="alignment_fn"))
        if args.num_common and args.num_ttables and args.topn_file:
            original_W_0_dec_approx_embdr.append(lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
            original_W2_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
            original_b_dec_deep_softmax.append(lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
            lm_models[i].params[lm_models[i].name2pos['W_0_dec_approx_embdr']].set_value(numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['W2_dec_deep_softmax']].set_value(numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos['b_dec_deep_softmax']].set_value(numpy.zeros((1,), dtype=numpy.float32))
    if args.mapping:
        with open(args.mapping, 'rb') as f:
            mapping = cPickle.load(f)
        heuristic = args.heuristic
    else:
        heuristic = 0
        mapping = None
    word2idx_src = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))
    word2idx_trg['<eos>'] = state['null_sym_target']
    word2idx_trg[state['oov']] = state['unk_sym_target']  # 'UNK' may be in the vocabulary. Now points to the right index.
    idict_trg[state['null_sym_target']] = '<eos>'
    idict_trg[state['unk_sym_target']] = state['oov']
    if args.num_common and args.num_ttables and args.topn_file:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Up to now
        D = OrderedDict()  # Full
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False
        for i in xrange(args.num_common):
            D[i] = 0
            C[i] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                # seq is the ndarray of indices
                seq, _ = parse_input(state, word2idx_src, seqin)
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1 and elt in topn:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations for each source word
                        indices.extend(topn[elt])
                update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    # Save dictionary for the lines preceding this one
                    D_dict[prev_line] = D.copy()
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for i in xrange(args.num_common):
                            D[i] = 0
                            C[i] = 0
                    null_unk_indices = [state['null_sym_target'],
                                        state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    # Assumes you cannot fill d with only 1 line
                    update_dicts(indices, d, D, C, args.num_common)
        D_dict[prev_line] = D.copy()
    start_time = time.time()
    if args.source and args.trans and args.new_trans:
        with open(args.source, 'r') as src_file, \
             open(args.trans, 'r') as trans_file, \
             open(args.new_trans, 'w') as new_trans_file:
            if not (args.num_common and args.num_ttables and args.topn_file):
                eos_id = state['null_sym_target']
                unk_id = state['unk_sym_target']
                new_word2idx_trg = word2idx_trg
            prev_i = -1
            if args.n_best:
                full_trans_line = trans_file.readline()
                if full_trans_line == '':
                    raise IOError("File is empty")
                full_trans_line = full_trans_line.split('|||')
                n_best_start = int(full_trans_line[0].strip())
                trans_file.seek(0)
            while True:
                if args.n_best:
                    full_trans_line = trans_file.readline()
                    if full_trans_line == '':
                        break
                    full_trans_line = full_trans_line.split('|||')
                    i = int(full_trans_line[0].strip()) - n_best_start
                    trans_line = full_trans_line[1].strip()
                else:
                    trans_line = trans_file.readline()
                    if trans_line == '':
                        break
                    i = prev_i + 1
                if i == (prev_i + 1):
                    prev_i = i
                    if (i % args.change_every) == 0 and i > 0:
                        hard_alignments = compute_alignment(
                            src_seqs, trg_seqs, alignment_fns, args.batchsize)
                        replace_unknown_words(
                            src_word_seqs, trg_seqs, trg_word_seqs,
                            hard_alignments, heuristic, mapping, unk_id,
                            new_trans_file, args.n_best, full_trans_lines)
                    if (i % 100 == 0) and i > 0:
                        new_trans_file.flush()
                        logger.debug("Current speed is {} per sentence".format(
                            (time.time() - start_time) / i))
                    src_line = src_file.readline()
                    src_seq, src_words = parse_input(state, word2idx_src,
                                                     src_line.strip())
                    src_words.append('<eos>')
                    if (i % args.change_every) == 0:
                        src_seqs = []
                        src_word_seqs = []
                        trg_seqs = []
                        trg_word_seqs = []
                        full_trans_lines = []  # Only used with n-best lists
                        if args.num_common and args.num_ttables and args.topn_file:
                            indices = D_dict[i].keys()
                            # Find the new eos and unk positions
                            eos_id = indices.index(state['null_sym_target'])
                            unk_id = indices.index(state['unk_sym_target'])
                            for j in xrange(num_models):
                                lm_models[j].params[lm_models[j].name2pos['W_0_dec_approx_embdr']].set_value(original_W_0_dec_approx_embdr[j][indices])
                                lm_models[j].params[lm_models[j].name2pos['W2_dec_deep_softmax']].set_value(original_W2_dec_deep_softmax[j][:, indices])
                                lm_models[j].params[lm_models[j].name2pos['b_dec_deep_softmax']].set_value(original_b_dec_deep_softmax[j][indices])
                            new_word2idx_trg = dict([(idict_trg[index], k)
                                                     for k, index in enumerate(indices)])
                elif i != prev_i:
                    raise ValueError("prev_i: %d, i: %d" % (prev_i, i))
                trans_seq, trans_words = parse_output(new_word2idx_trg,
                                                      trans_line.strip(),
                                                      eos_id=eos_id,
                                                      unk_id=unk_id)
                trans_words.append('<eos>')
                src_seqs.append(src_seq)
                src_word_seqs.append(src_words)
                trg_seqs.append(trans_seq)
                trg_word_seqs.append(trans_words)
                if args.n_best:
                    full_trans_lines.append(full_trans_line)
            # Out of the loop: flush the final accumulated chunk
            hard_alignments = compute_alignment(src_seqs, trg_seqs,
                                                alignment_fns, args.batchsize)
            replace_unknown_words(src_word_seqs, trg_seqs, trg_word_seqs,
                                  hard_alignments, heuristic, mapping, unk_id,
                                  new_trans_file, args.n_best, full_trans_lines)
    else:
        raise NotImplementedError
def main():
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if lm_model.maintain_coverage:
                trans, costs, coverages, _ = sample(lm_model, seq, n_samples,
                                                    sampler=sampler,
                                                    beam_search=beam_search,
                                                    ignore_unk=args.ignore_unk,
                                                    normalize=args.normalize)
            else:
                trans, costs, _ = sample(lm_model, seq, n_samples,
                                         sampler=sampler,
                                         beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]
            best = numpy.argmin(costs)
            print >> ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
            if lm_model.maintain_coverage:
                # Report per-word coverage, as in the coverage-model variant
                coverage = coverages[best]
                print "Coverage:",
                words = parsed_in.split()
                for k in xrange(len(words)):
                    print '%s/%.2f' % (words[k], coverage[k]),
                print ''
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
def main():
    args = parse_args()
    state = getattr(experiments.nmt, args.state_fn)()
    if hasattr(args, 'state') and args.state:
        with open(args.state) as src:
            state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    assert state['enc_rec_layer'] == "RecursiveConvolutionalLayer", \
        "Only works with gated recursive convolutional encoder"
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    x = TT.lvector()
    h = TT.tensor3()
    proj_x = theano.function([x], enc_dec.encoder.input_embedders[0](
        enc_dec.encoder.approx_embedder(x)).out, name='proj_x')
    new_h, gater = enc_dec.encoder.transitions[0].step_fprop(
        None, h, return_gates=True)
    step_up = theano.function([h], [new_h, gater], name='gater_step')
    while True:
        try:
            seqin = raw_input('Input Sequence: ')
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            print "Parsed Input:", parsed_in
        except Exception:
            print "Exception while parsing your input:"
            traceback.print_exc()
            continue
        # Get the initial embedding
        new_h = proj_x(seq)
        new_h = new_h.reshape(new_h.shape[0], 1, new_h.shape[1])
        nodes = numpy.arange(len(seq)).tolist()
        node_idx = len(seq) - 1
        rules = []
        nodes_level = copy.deepcopy(nodes)
        G = nx.DiGraph()
        input_nodes = []
        merge_nodes = []
        aggregate_nodes = []
        nidx = 0
        vpos = 0
        nodes_pos = {}
        nodes_labels = {}
        # Input nodes
        for nn in nodes[:-1]:
            nidx += 1
            G.add_node(nn, pos=(nidx, 0), ndcolor="blue", label="%d" % nn)
            nodes_pos[nn] = (nidx, vpos)
            nodes_labels[nn] = idict_src[seq[nidx - 1]]
            input_nodes.append(nn)
        node_idx = len(seq) - 1
        vpos += 6
        for dd in xrange(len(seq) - 1):
            new_h, gater = step_up(new_h)
            decisions = numpy.argmax(gater, -1)
            new_nodes_level = numpy.zeros(len(seq) - (dd + 1))
            hpos = float(len(seq) + 1) - 0.5 * (dd + 1)
            last_node = True
            for nn in xrange(len(seq) - (dd + 1)):
                hpos -= 1
                if not last_node:
                    # Merge nodes
                    node_idx += 1
                    G.add_node(node_idx, ndcolor="red", label="m")
                    nodes_labels[node_idx] = ""
                    nodes_pos[node_idx] = (hpos, vpos)
                    G.add_edge(nodes_level[-(nn + 1)], node_idx,
                               weight=gater[-(nn + 1), 0, 0])
                    G.add_edge(nodes_level[-(nn + 2)], node_idx,
                               weight=gater[-(nn + 1), 0, 0])
                    merge_nodes.append(node_idx)
                    merge_node = node_idx
                    # Linear aggregation nodes
                    node_idx += 1
                    G.add_node(node_idx, ndcolor="red", label="")
                    nodes_labels[node_idx] = "$+$"
                    nodes_pos[node_idx] = (hpos, vpos + 6)
                    G.add_edge(merge_node, node_idx, weight=gater[-(nn + 1), 0, 0])
                    G.add_edge(nodes_level[-(nn + 2)], node_idx,
                               weight=gater[-(nn + 1), 0, 1])
                    G.add_edge(nodes_level[-(nn + 1)], node_idx,
                               weight=gater[-(nn + 1), 0, 2])
                    aggregate_nodes.append(node_idx)
                    new_nodes_level[-(nn + 1)] = node_idx
                last_node = False
            nodes_level = copy.deepcopy(new_nodes_level)
            vpos += 12
        # TODO: Show only strong edges.
        threshold = float(raw_input('Threshold: '))
        edges = [(u, v, d) for (u, v, d) in G.edges(data=True)
                 if d['weight'] > threshold]
        #edges = G.edges(data=True)
        use_weighting = raw_input('Color according to weight [Y/N]: ')
        if use_weighting == 'Y':
            cm = plt.get_cmap('binary')
            cNorm = colors.Normalize(vmin=0., vmax=1.)
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
            colorList = [scalarMap.to_rgba(d['weight']) for (u, v, d) in edges]
        else:
            colorList = 'k'
        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=input_nodes,
                               node_color='white', alpha=1., edge_color='white')
        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=merge_nodes,
                               node_color='blue', alpha=0.8, node_size=20)
        nx.draw_networkx_nodes(G, pos=nodes_pos, nodelist=aggregate_nodes,
                               node_color='red', alpha=0.8, node_size=80)
        nx.draw_networkx_edges(G, pos=nodes_pos, edge_color=colorList,
                               edgelist=edges)
        nx.draw_networkx_labels(G, pos=nodes_pos, labels=nodes_labels,
                                font_family='sans-serif')
        plt.axis('off')
        figname = raw_input('Save to: ')
        if figname[-3:] == "pdf":
            # savefig takes the output type via the 'format' keyword
            plt.savefig(figname, format='pdf')
        else:
            plt.savefig(figname)
        plt.close()
        G.clear()
def main():
    args = parse_args()
    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))
    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    rng = numpy.random.RandomState(state['seed'])
    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    ###########################################################
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))
    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size
        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')
        start_time = time.time()
        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model, seq, args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                trans, costs, _, aligns = sample(lm_model, seq, 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []
            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                    #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)
            if args.beam_search and args.nbest:
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join("%s | %f" % (t, c)
                                         for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str
            print >> ftrans, out_str
            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)
        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples, sampler=sampler,
                   beam_search=beam_search, ignore_unk=args.ignore_unk,
                   normalize=args.normalize, alpha=alpha, verbose=True)
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    state['sort_k_batches'] = 1  # which means don't sort
    state['shuffle'] = False
    state['use_infinite_loop'] = False
    state['force_enc_repr_cpu'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)

    indx_word_src = cPickle.load(open(state['word_indx'], 'rb'))
    indx_word_trgt = cPickle.load(open(state['word_indx_trgt'], 'rb'))

    if args.mode == "batch":
        data_given = args.src or args.trg
        txt = data_given and not (args.src.endswith(".h5")
                                  and args.trg.endswith(".h5"))
        if data_given and not txt:
            state['source'] = [args.src]
            state['target'] = [args.trg]
        if not data_given and not txt:
            logger.info("Using the training data")
        if txt:
            data_iter = BatchBiTxtIterator(
                state, args.src, indx_word_src, args.trg, indx_word_trgt,
                state['bs'], raise_unk=not args.allow_unk)
            data_iter.start()
        else:
            data_iter = get_batch_iterator(state)
            data_iter.start(0)

        score_file = open(args.scores, "w") if args.scores else sys.stdout
        scorer = enc_dec.create_scorer(batch=True)

        count = 0
        n_samples = 0
        logger.info('Scoring phrases')
        for i, batch in enumerate(data_iter):
            if batch is None:
                continue
            if args.n_batches >= 0 and i == args.n_batches:
                break

            if args.y_noise:
                # Replace a random fraction of target words with random ids
                y = batch['y']
                random_words = numpy.random.randint(
                    0, 100, y.shape).astype("int64")
                change_mask = numpy.random.binomial(
                    1, args.y_noise, y.shape).astype("int64")
                batch['y'] = change_mask * random_words + (1 - change_mask) * y

            st = time.time()
            [scores] = scorer(batch['x'], batch['y'],
                              batch['x_mask'], batch['y_mask'])
            if args.print_probs:
                scores = numpy.exp(scores)
            up_time = time.time() - st
            for s in scores:
                print >> score_file, "{:.5e}".format(float(s))

            n_samples += batch['x'].shape[1]
            count += 1

            if count % 100 == 0:
                score_file.flush()
                logger.debug("Scores flushed")
            logger.debug(
                "{} batches, {} samples, {} per sample; example scores: {}".format(
                    count, n_samples, up_time / scores.shape[0], scores[:5]))

        logger.info("Done")
        score_file.flush()
    elif args.mode == "interact":
        scorer = enc_dec.create_scorer()
        compute_probs = enc_dec.create_probs_computer()
        while True:
            try:
                src_line = raw_input('Source sequence: ')
                trgt_line = raw_input('Target sequence: ')
                src_seq = parse_input(state, indx_word_src, src_line,
                                      raise_unk=not args.allow_unk,
                                      unk_sym=state['unk_sym_source'],
                                      null_sym=state['null_sym_source'])
                trgt_seq = parse_input(state, indx_word_trgt, trgt_line,
                                       raise_unk=not args.allow_unk,
                                       unk_sym=state['unk_sym_target'],
                                       null_sym=state['null_sym_target'])
                print "Binarized source: ", src_seq
                print "Binarized target: ", trgt_seq
                probs = compute_probs(src_seq, trgt_seq)
                print "Probs: {}, cost: {}".format(
                    probs, -numpy.sum(numpy.log(probs)))
            except Exception:
                traceback.print_exc()
    elif args.mode == "txt":
        assert args.src and args.trg
        scorer = enc_dec.create_scorer()
        src_file = open(args.src, "r")
        trg_file = open(args.trg, "r")
        compute_probs = enc_dec.create_probs_computer(return_alignment=True)
        try:
            numpy.set_printoptions(precision=3, linewidth=150, suppress=True)
            n_done = 0
            while True:
                src_line = next(src_file).strip()
                trgt_line = next(trg_file).strip()
                src_seq, src_words = parse_input(
                    state, indx_word_src, src_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_source'],
                    null_sym=state['null_sym_source'])
                trgt_seq, trgt_words = parse_input(
                    state, indx_word_trgt, trgt_line,
                    raise_unk=not args.allow_unk,
                    unk_sym=state['unk_sym_target'],
                    null_sym=state['null_sym_target'])
                probs, alignment = compute_probs(src_seq, trgt_seq)
                if args.verbose:
                    print "Probs: ", probs.flatten()
                    if alignment.ndim == 3:
                        print "Alignment:".ljust(20), src_line, "<eos>"
                        for k, word in enumerate(trgt_words):
                            print "{}{}".format(word.ljust(20),
                                                alignment[k, :, 0])
                        print "Generated by:"
                        for k, word in enumerate(trgt_words):
                            j = numpy.argmax(alignment[k, :, 0])
                            print "{} <--- {}".format(
                                word,
                                src_words[j] if j < len(src_words) else "<eos>")
                n_done += 1
                if n_done % 100 == 0:
                    sys.stdout.flush()
                    logger.debug(n_done)
                print -numpy.sum(numpy.log(probs))
        except StopIteration:
            pass
    else:
        raise Exception("Unknown mode {}".format(args.mode))
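# The cost printed above is the sentence negative log-likelihood: the probs
# computer returns one probability per target word, and the cost is
# -sum(log p). A minimal standalone illustration with made-up numbers:
import numpy
word_probs = numpy.array([0.41, 0.09, 0.73, 0.98])  # p(y_t | y_<t, x)
cost = -numpy.sum(numpy.log(word_probs))
per_word = cost / len(word_probs)  # length-normalized, comparable across sentences
print "cost: {:.3f}, per-word: {:.3f}".format(cost, per_word)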
def __call__(self):
    """
    Opens the file for the validation set and creates a subprocess
    for the multi-bleu script.

    Returns a boolean indicating whether the current model should be saved.
    """
    print "Started Validation: "
    val_start_time = time.time()
    fsrc = open(self.state['validation_set'], 'r')
    mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
    total_cost = 0.0
    if self.verbose:
        ftrans = open(self.state['validation_set_out'], 'w')
    for i, line in enumerate(fsrc):
        # Load the sentence, retrieve the sample, write to file
        if self.state['source_encoding'] == 'utf8':
            seqin = line.strip().decode('utf-8')
        else:
            seqin = line.strip()
        seq, parsed_in = parse_input(self.state, self.indx_word, seqin,
                                     idx2word=self.idict_src)

        # Draw the sample, checking that we do not get an empty string back
        trans, costs, _ = sample(self.lm_model, seq, self.n_samples,
                                 beam_search=self.beam_search,
                                 ignore_unk=self.ignore_unk,
                                 normalize=self.normalize)
        try:
            best = numpy.argmin(costs)
            total_cost += costs[best]
            trans_out = trans[best]
        except ValueError:
            print "Could not find a translation for line: {}".format(i + 1)
            trans_out = u'UNK' if self.state['target_encoding'] == 'utf8' else 'UNK'

        # Write to the subprocess, and to the output file if it exists
        if self.state['target_encoding'] == 'utf8':
            print >> mb_subprocess.stdin, trans_out.encode('utf8').replace(" ", "")
            if self.verbose:
                print >> ftrans, trans_out.encode('utf8').replace(" ", "")
        else:
            print >> mb_subprocess.stdin, trans_out
            if self.verbose:
                print >> ftrans, trans_out

        if i != 0 and i % 50 == 0:
            print "Translated {} lines of validation set...".format(i)
        mb_subprocess.stdin.flush()

    print "Total cost of the validation: {}".format(total_cost)
    fsrc.close()
    if self.verbose:
        ftrans.close()

    # Send end of file, read the multi-bleu output
    mb_subprocess.stdin.close()
    out_parse = re.match(r'BLEU = [-.0-9]+', mb_subprocess.stdout.readline())
    print "Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.)
    assert out_parse is not None

    # Extract the score and save the model if it is the best so far
    bleu_score = float(out_parse.group()[6:])
    self.val_bleu_curve.append(bleu_score)
    print bleu_score
    mb_subprocess.terminate()

    if self.best_bleu < bleu_score:
        self.best_bleu = bleu_score
        return True
    return False
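# For reference, multi-bleu.perl writes a summary line such as
# "BLEU = 27.43, 60.1/33.2/20.4/13.1 (BP=0.998, ...)" on stdout, which is why
# the regex above only needs the leading "BLEU = <number>" part. A tiny
# self-contained check of that parsing logic (the line is made up):
import re
line = "BLEU = 27.43, 60.1/33.2/20.4/13.1 (BP=0.998, ratio=0.998)"
m = re.match(r'BLEU = [-.0-9]+', line)
assert m is not None
print float(m.group()[6:])  # -> 27.43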
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # Defaults for options that older pickled states may not contain
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    if args.num_common and args.num_ttables and args.topn_file:
        # topn maps a source word index to a list of target word indices
        with open(args.topn_file, 'rb') as f:
            topn = cPickle.load(f)
        for elt in topn:
            # Keep only the first args.num_ttables candidates
            topn[elt] = topn[elt][:args.num_ttables]

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    alignment_fns = []
    if args.num_common and args.num_ttables and args.topn_file:
        original_W_0_dec_approx_embdr = []
        original_W2_dec_deep_softmax = []
        original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True,
                                          compute_alignment=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])
        alignment_fns.append(theano.function(
            inputs=enc_decs[i].inputs,
            outputs=[enc_decs[i].alignment],
            name="alignment_fn"))
        if args.num_common and args.num_ttables and args.topn_file:
            # Save the target embedding/softmax parameters, then zero them
            # out on the GPU to free memory for the next model
            original_W_0_dec_approx_embdr.append(lm_models[i].params[
                lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
            original_W2_dec_deep_softmax.append(lm_models[i].params[
                lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
            original_b_dec_deep_softmax.append(lm_models[i].params[
                lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
            lm_models[i].params[lm_models[i].name2pos[
                'W_0_dec_approx_embdr']].set_value(
                    numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos[
                'W2_dec_deep_softmax']].set_value(
                    numpy.zeros((1, 1), dtype=numpy.float32))
            lm_models[i].params[lm_models[i].name2pos[
                'b_dec_deep_softmax']].set_value(
                    numpy.zeros((1,), dtype=numpy.float32))

    if args.mapping:
        with open(args.mapping, 'rb') as f:
            mapping = cPickle.load(f)
        heuristic = args.heuristic
    else:
        heuristic = 0
        mapping = None

    word2idx_src = cPickle.load(open(state['word_indx'], 'rb'))
    idict_src = cPickle.load(open(state['indx_word'], 'r'))
    word2idx_trg = cPickle.load(open(state['word_indx_trgt'], 'rb'))
    idict_trg = cPickle.load(open(state['indx_word_target'], 'r'))

    word2idx_trg['<eos>'] = state['null_sym_target']
    # 'UNK' may already be in the vocabulary; make it point to the right index
    word2idx_trg[state['oov']] = state['unk_sym_target']
    idict_trg[state['null_sym_target']] = '<eos>'
    idict_trg[state['unk_sym_target']] = state['oov']

    if args.num_common and args.num_ttables and args.topn_file:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Words seen up to now
        D = OrderedDict()  # Full dictionary
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        for k in xrange(args.num_common):
            D[k] = 0
            C[k] = 0
        null_unk_indices = [state['null_sym_target'],
                            state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)

        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                # seq is the ndarray of indices
                seq, _ = parse_input(state, word2idx_src, seqin)
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations per source word
                        indices.extend(topn[elt])
                update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    # Save the dictionary for the lines preceding this one
                    D_dict[prev_line] = D.copy()
                    prev_line = i
                    logger.info("%d" % i)
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for k in xrange(args.num_common):
                            D[k] = 0
                            C[k] = 0
                    null_unk_indices = [state['null_sym_target'],
                                        state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    # Assumes you cannot fill d with only 1 line
                    update_dicts(indices, d, D, C, args.num_common)
            D_dict[prev_line] = D.copy()

    start_time = time.time()

    if args.source and args.trans and args.new_trans:
        with open(args.source, 'r') as src_file, \
             open(args.trans, 'r') as trans_file, \
             open(args.new_trans, 'w') as new_trans_file:
            if not (args.num_common and args.num_ttables and args.topn_file):
                eos_id = state['null_sym_target']
                unk_id = state['unk_sym_target']
                new_word2idx_trg = word2idx_trg
            prev_i = -1
            if args.n_best:
                # Peek at the first n-best line to find the starting index
                full_trans_line = trans_file.readline()
                if full_trans_line == '':
                    raise IOError("File is empty")
                full_trans_line = full_trans_line.split('|||')
                n_best_start = int(full_trans_line[0].strip())
                trans_file.seek(0)
            while True:
                if args.n_best:
                    full_trans_line = trans_file.readline()
                    if full_trans_line == '':
                        break
                    full_trans_line = full_trans_line.split('|||')
                    i = int(full_trans_line[0].strip()) - n_best_start
                    trans_line = full_trans_line[1].strip()
                else:
                    trans_line = trans_file.readline()
                    if trans_line == '':
                        break
                    i = prev_i + 1
                if i == (prev_i + 1):
                    prev_i = i
                    if (i % args.change_every) == 0 and i > 0:
                        # Flush the current block: compute hard alignments
                        # and replace the UNK words
                        hard_alignments = compute_alignment(
                            src_seqs, trg_seqs, alignment_fns, args.batchsize)
                        replace_unknown_words(
                            src_word_seqs, trg_seqs, trg_word_seqs,
                            hard_alignments, heuristic, mapping, unk_id,
                            new_trans_file, args.n_best, full_trans_lines)
                    if (i % 100 == 0) and i > 0:
                        new_trans_file.flush()
                        logger.debug("Current speed is {} per sentence".format(
                            (time.time() - start_time) / i))
                    src_line = src_file.readline()
                    src_seq, src_words = parse_input(
                        state, word2idx_src, src_line.strip())
                    src_words.append('<eos>')
                    if (i % args.change_every) == 0:
                        src_seqs = []
                        src_word_seqs = []
                        trg_seqs = []
                        trg_word_seqs = []
                        full_trans_lines = []  # Only used with n-best lists
                        if args.num_common and args.num_ttables and args.topn_file:
                            indices = D_dict[i].keys()
                            # Find the new eos and unk positions
                            eos_id = indices.index(state['null_sym_target'])
                            unk_id = indices.index(state['unk_sym_target'])
                            for j in xrange(num_models):
                                lm_models[j].params[lm_models[j].name2pos[
                                    'W_0_dec_approx_embdr']].set_value(
                                        original_W_0_dec_approx_embdr[j][indices])
                                lm_models[j].params[lm_models[j].name2pos[
                                    'W2_dec_deep_softmax']].set_value(
                                        original_W2_dec_deep_softmax[j][:, indices])
                                lm_models[j].params[lm_models[j].name2pos[
                                    'b_dec_deep_softmax']].set_value(
                                        original_b_dec_deep_softmax[j][indices])
                            new_word2idx_trg = dict(
                                (idict_trg[index], k)
                                for k, index in enumerate(indices))
                elif i != prev_i:
                    raise ValueError("prev_i: %d, i: %d" % (prev_i, i))
                trans_seq, trans_words = parse_output(
                    new_word2idx_trg, trans_line.strip(),
                    eos_id=eos_id, unk_id=unk_id)
                trans_words.append('<eos>')
                src_seqs.append(src_seq)
                src_word_seqs.append(src_words)
                trg_seqs.append(trans_seq)
                trg_word_seqs.append(trans_words)
                if args.n_best:
                    full_trans_lines.append(full_trans_line)
            # Out of the loop: align and replace UNKs in the final block
            hard_alignments = compute_alignment(
                src_seqs, trg_seqs, alignment_fns, args.batchsize)
            replace_unknown_words(
                src_word_seqs, trg_seqs, trg_word_seqs, hard_alignments,
                heuristic, mapping, unk_id, new_trans_file,
                args.n_best, full_trans_lines)
    else:
        raise NotImplementedError
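# replace_unknown_words itself is defined elsewhere in the codebase. Below is
# a minimal sketch of the usual alignment-based heuristics (numbering as in
# args.heuristic above; the original helper may differ in detail). It assumes
# alignment[t] is the argmax source position for target word t:
#   0 - always copy the aligned source word,
#   1 - look the source word up in a bilingual mapping, copy if absent,
#   2 - use the mapping only for lower-cased source words, copy otherwise
#       (names and numbers usually survive copying unchanged).
def replace_unk_sketch(src_words, trg_words, alignment, heuristic, mapping):
    out = []
    for t, word in enumerate(trg_words):
        if word != 'UNK':
            out.append(word)
            continue
        src_word = src_words[alignment[t]]
        if heuristic == 0:
            out.append(src_word)
        elif heuristic == 1:
            out.append(mapping.get(src_word, src_word))
        else:  # heuristic == 2
            if src_word.islower():
                out.append(mapping.get(src_word, src_word))
            else:
                out.append(src_word)
    return " ".join(out)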
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # Defaults for options that older pickled states may not contain
    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    # topn maps a source word index to a list of target word indices
    with open(args.topn_file, 'rb') as f:
        topn = cPickle.load(f)
    if args.less_transfer:
        for elt in topn:
            # Keep only the first args.num_ttables candidates
            topn[elt] = topn[elt][:args.num_ttables]
    else:
        for elt in topn:
            # Same truncation, but as a set for fast union/membership tests
            topn[elt] = set(topn[elt][:args.num_ttables])

    num_models = len(args.models)
    rng = numpy.random.RandomState(state['seed'])
    enc_decs = []
    lm_models = []
    original_W_0_dec_approx_embdr = []
    original_W2_dec_deep_softmax = []
    original_b_dec_deep_softmax = []
    for i in xrange(num_models):
        enc_decs.append(RNNEncoderDecoder(state, rng, skip_init=True))
        enc_decs[i].build()
        lm_models.append(enc_decs[i].create_lm_model())
        lm_models[i].load(args.models[i])

        original_W_0_dec_approx_embdr.append(lm_models[i].params[
            lm_models[i].name2pos['W_0_dec_approx_embdr']].get_value())
        original_W2_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['W2_dec_deep_softmax']].get_value())
        original_b_dec_deep_softmax.append(lm_models[i].params[
            lm_models[i].name2pos['b_dec_deep_softmax']].get_value())
        # On GPU, zeroing these frees memory for the next models.
        # Additional gains could be made by rolling the source vocab.
        lm_models[i].params[lm_models[i].name2pos[
            'W_0_dec_approx_embdr']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos[
            'W2_dec_deep_softmax']].set_value(
                numpy.zeros((1, 1), dtype=numpy.float32))
        lm_models[i].params[lm_models[i].name2pos[
            'b_dec_deep_softmax']].set_value(
                numpy.zeros((1,), dtype=numpy.float32))

    indx_word = cPickle.load(open(state['word_indx'], 'rb'))  # Source w2i

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_decs)
        beam_search.compile()
    else:
        raise NotImplementedError

    idict_src = cPickle.load(open(state['indx_word'], 'r'))  # Source i2w
    original_target_i2w = lm_models[0].word_indxs.copy()
    # We do not need a target word2index here
    max_words = len(original_b_dec_deep_softmax[0])

    if args.less_transfer:
        # Use OrderedDict instead of set for reproducibility
        d = OrderedDict()  # Words seen up to now
        D = OrderedDict()  # Full dictionary
        C = OrderedDict()  # Allowed to reject
        prev_line = 0
        logger.info("%d" % prev_line)
        D_dict = OrderedDict()
        output = False
        for k in xrange(args.num_common):
            D[k] = 0
            C[k] = 0
        null_unk_indices = [state['null_sym_target'], state['unk_sym_target']]
        update_dicts(null_unk_indices, d, D, C, args.num_common)
        with open(args.source, 'r') as f:
            for i, line in enumerate(f):
                seqin = line.strip()
                # seq is the ndarray of indices
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                indices = []
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations per source word
                        indices.extend(topn[elt])
                output = update_dicts(indices, d, D, C, args.num_common)
                if (i % args.change_every) == 0 and args.change_every > 0 and i > 0:
                    output = True
                if output:
                    # Save the dictionary for the lines preceding this one
                    D_dict[prev_line] = D.copy()
                    prev_line = i
                    logger.info("%d" % i)
                    output = False
                    d = OrderedDict()
                    if args.no_reset:
                        C = D.copy()
                    else:
                        D = OrderedDict()  # Full
                        C = OrderedDict()  # Allowed to reject
                        for k in xrange(args.num_common):
                            D[k] = 0
                            C[k] = 0
                    null_unk_indices = [state['null_sym_target'],
                                        state['unk_sym_target']]
                    update_dicts(null_unk_indices, d, D, C, args.num_common)
                    # Assumes you cannot fill d with only 1 line
                    update_dicts(indices, d, D, C, args.num_common)
            D_dict[prev_line] = D.copy()

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            # seq is the ndarray of indices. For now, keep all input words in
            # the model; filtering them would save memory, but it is not
            # really much of an issue now.
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            if args.less_transfer:
                if i in D_dict:
                    indices = D_dict[i].keys()
                    # Find the new eos and unk positions
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    # Target index2word for the reduced vocabulary
                    lm_models[0].word_indxs = dict(
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices))
                trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                         sampler=sampler,
                                         beam_search=beam_search,
                                         ignore_unk=args.ignore_unk,
                                         normalize=args.normalize,
                                         normalize_p=args.normalize_p,
                                         eos_id=eos_id, unk_id=unk_id,
                                         final=True, wp=args.wp)
            else:
                # Extract the indices you need
                indices = set()
                for elt in seq[:-1]:  # Exclude the EOL token
                    if elt != 1:  # Exclude OOV (1 will not be a key of topn)
                        # Add topn best unigram translations per source word
                        indices = indices.union(topn[elt])
                num_common_words = args.num_common
                while True:
                    if num_common_words >= max_words:
                        final = True
                        num_common_words = max_words
                    else:
                        final = False
                    if args.final:  # No matter the number of words
                        final = True
                    # Add the most common words, then convert back to a list
                    # for advanced indexing
                    indices = indices.union(set(xrange(num_common_words)))
                    indices = list(indices)
                    # Find the new eos and unk positions
                    eos_id = indices.index(state['null_sym_target'])
                    unk_id = indices.index(state['unk_sym_target'])
                    # Set the target word matrices and biases
                    for j in xrange(num_models):
                        lm_models[j].params[lm_models[j].name2pos[
                            'W_0_dec_approx_embdr']].set_value(
                                original_W_0_dec_approx_embdr[j][indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'W2_dec_deep_softmax']].set_value(
                                original_W2_dec_deep_softmax[j][:, indices])
                        lm_models[j].params[lm_models[j].name2pos[
                            'b_dec_deep_softmax']].set_value(
                                original_b_dec_deep_softmax[j][indices])
                    # Target index2word for the reduced vocabulary
                    lm_models[0].word_indxs = dict(
                        (k, original_target_i2w[index])
                        for k, index in enumerate(indices))
                    try:
                        trans, costs, _ = sample(lm_models[0], seq, n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize,
                                                 normalize_p=args.normalize_p,
                                                 eos_id=eos_id, unk_id=unk_id,
                                                 final=final)
                        # Breaks only on success (always succeeds if final=True)
                        break
                    except RuntimeError:
                        indices = set(indices)
                        num_common_words *= 2
            if not args.n_best:
                best = numpy.argmin(costs)
                print >> ftrans, trans[best]
            else:
                order = numpy.argsort(costs)
                best = order[0]
                for elt in order:
                    print >> ftrans, str(i + args.start) + ' ||| ' + \
                        trans[elt] + ' ||| ' + str(costs[elt])
            if args.verbose:
                print "Translation:", trans[best]
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        raise NotImplementedError
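# update_dicts is defined elsewhere in the codebase. A plausible minimal
# reconstruction, for reading purposes only (the real helper may well differ,
# in particular in how it uses C, the "allowed to reject" dictionary): it
# adds word indices to the running dictionaries and reports when the
# candidate dictionary d has grown past num_common, which is the signal
# above to freeze D for the current block of lines and start a new block.
def update_dicts_sketch(indices, d, D, C, num_common):
    for index in indices:
        if index not in d:
            d[index] = 0
            D[index] = 0
            C[index] = 0
    return len(d) > num_common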
# assert beam_search
# assert args.beam_size

fsrc = open(args.source, 'r')
ftrg = open(args.target, 'r')

start_time = time.time()
n_samples = args.beam_size
total_cost = 0.0
logging.debug("Beam size: {}".format(n_samples))
for srcline, trgline in zip(fsrc, ftrg):
    src_seqin = srcline.strip()
    trg_seqin = trgline.strip()
    src_seq, src_parsed_in = parse_input(state, indx_word_src, src_seqin,
                                         idx2word=idict_src)
    trg_seq, trg_parsed_in = parse_input(state, indx_word_trg, trg_seqin,
                                         idx2word=idict_trg)
    if args.verbose:
        print "Parsed Input:", src_parsed_in
    trans, costs, _ = sample(lm_model, src_seq, trg_seq, n_samples,
                             sampler=sampler, beam_search=beam_search,
                             ignore_unk=args.ignore_unk,
                             normalize=args.normalize)
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()
        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            trans, costs, _ = sample(lm_model, seq, n_samples,
                                     sampler=sampler,
                                     beam_search=beam_search,
                                     ignore_unk=args.ignore_unk,
                                     normalize=args.normalize)
            try:
                best = numpy.argmin(costs)
                print >> ftrans, trans[best]
                total_cost += costs[best]
                if args.verbose:
                    print "Translation:", trans[best]
            except Exception:
                # No finished hypothesis was found for this sentence
                print >> ftrans, "FAIL"
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state, indx_word, seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue
            sample(lm_model, seq, n_samples,
                   sampler=sampler, beam_search=beam_search,
                   ignore_unk=args.ignore_unk, normalize=args.normalize,
                   alpha=alpha, verbose=True)
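# A typical invocation of this script (file names and exact flag spellings
# are illustrative; the real argument names come from parse_args, which is
# not shown here):
#
#   python sample.py --state search_state.pkl --beam-search --beam-size 12 \
#       --normalize --source test.src --trans test.out search_model.npz
#
# Without --source/--trans it drops into the interactive loop instead,
# prompting on stdin for a sentence and a sample count.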
if __name__ == "__main__":
    main()