def main(source_file, nbest_file, output_file, rescorer_settings):
    """Load the config of every model and hand everything to ``rescore_model``.

    Args:
        source_file: forwarded unchanged to ``rescore_model``.
        nbest_file: forwarded unchanged to ``rescore_model``.
        output_file: forwarded unchanged to ``rescore_model``.
        rescorer_settings: object whose ``models`` attribute lists the models
            to load; also forwarded to ``rescore_model``.
    """

    def _read_options(model_path):
        # fill_options works in place on the freshly loaded config.
        config = load_config(model_path)
        fill_options(config)
        return config

    options = [_read_options(path) for path in rescorer_settings.models]
    rescore_model(source_file, nbest_file, output_file, rescorer_settings,
                  options)
def main(source_file, nbest_file, output_file, rescorer_settings):
    """Build one options namespace per model and call ``rescore``.

    Args:
        source_file: forwarded unchanged to ``rescore``.
        nbest_file: forwarded unchanged to ``rescore``.
        output_file: forwarded unchanged to ``rescore``.
        rescorer_settings: object whose ``models`` attribute lists the models
            to load; also forwarded to ``rescore``.
    """
    options = []
    for model_path in rescorer_settings.models:
        config = load_config(model_path)
        fill_options(config)
        # Record which model this config belongs to under the 'reload' key.
        config['reload'] = model_path
        options.append(argparse.Namespace(**config))

    rescore(source_file, nbest_file, output_file, rescorer_settings, options)
def main(models, source_file, nbest_file, saveto, b=80, normalize=False,
         verbose=False, alignweights=False):
    """Load the options of every model and delegate to ``rescore_model``.

    Args:
        models: model paths; each is passed to ``load_config``.
        source_file, nbest_file, saveto: forwarded to ``rescore_model``.
        b, normalize, verbose, alignweights: forwarded unchanged to
            ``rescore_model``.
    """
    options = []
    for model_path in models:
        config = load_config(model_path)
        fill_options(config)  # in-place update of the loaded config
        options.append(config)

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
def _load_model_options(self):
    """
    Reads the config of every entry in ``self._models`` and stores the
    resulting list in ``self._options``.
    """

    def _read_options(model):
        config = load_config(model)
        # backward compatibility
        fill_options(config)
        return config

    # Assigned in one step, so ``self._options`` is only set once every
    # config has been loaded.
    self._options = [_read_options(model) for model in self._models]
def main(source_file, target_file, output_file, scorer_settings):
    """Score ``target_file`` against ``source_file`` and write the scores.

    Args:
        source_file: forwarded to ``score_model`` and ``write_scores``.
        target_file: forwarded to ``score_model`` and ``write_scores``.
        output_file: forwarded to ``write_scores``.
        scorer_settings: object whose ``models`` attribute lists the models
            to load; also forwarded to both helpers.
    """
    options = []
    for model_path in scorer_settings.models:
        config = load_config(model_path)
        fill_options(config)
        # Record which model this config belongs to under the 'reload' key.
        config['reload'] = model_path
        options.append(argparse.Namespace(**config))

    scores = score_model(source_file, target_file, scorer_settings, options)
    write_scores(source_file, target_file, scores, output_file,
                 scorer_settings)
def _load_model_options(self):
    """
    Reads the config of every entry in ``self._models`` and stores the
    resulting list in ``self._options``.
    """

    def _read_options(model):
        config = load_config(model)
        # backward compatibility
        fill_options(config)
        # dummy features for single source using multi-source code
        dummy_options(config)
        return config

    # Assigned in one step, so ``self._options`` is only set once every
    # config has been loaded.
    self._options = [_read_options(model) for model in self._models]
def _load_model_options(self):
    """
    Builds one ``argparse.Namespace`` of options per model in
    ``self._models`` and keeps the fourth value returned by
    ``load_dictionaries`` for the first model in ``self._num_to_target``.
    """
    self._options = []
    for model_path in self._models:
        settings = load_config(model_path)
        # backward compatibility
        fill_options(settings)
        settings['reload'] = model_path
        namespace = argparse.Namespace(**settings)
        self._options.append(namespace)

    first_model_options = self._options[0]
    # Only the last of the four returned values is kept.
    _, _, _, self._num_to_target = load_dictionaries(first_model_options)
def _load_model_options(self):
    """
    Loads config options for each model.

    Every config read from ``self._models`` gets a
    ``'concatenate_lm_decoder'`` entry (``False`` when the loaded config has
    none) before ``fill_options`` is applied. The resulting list is stored
    in ``self._options``.
    """
    options = []
    for model in self._models:
        config = load_config(model)
        # Configs that lack the key default to no LM/decoder concatenation.
        config.setdefault('concatenate_lm_decoder', False)
        options.append(config)
        # backward compatibility
        fill_options(config)
    self._options = options
def main(settings): """ Translates a source language file (or STDIN) into a target language file (or STDOUT). """ # Start logging. level = logging.DEBUG if settings.verbose else logging.INFO logging.basicConfig(level=level, format='%(levelname)s: %(message)s') # Create the TensorFlow session. if settings.cpu: logging.info("using cpu now...") os.environ["CUDA_VISIBLE_DEVICES"] = "" tf_config = tf.ConfigProto(device_count={'GPU': 0}) else: os.environ["CUDA_VISIBLE_DEVICES"] = "2" tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True session = tf.Session(config=tf_config) # Load config file for each model. configs = [] for model in settings.models: config = util.load_config(model) compat.fill_options(config) config['reload'] = model configs.append(argparse.Namespace(**config)) # Create the model graphs and restore their variables. logging.debug("Loading models") models = [] for i, config in enumerate(configs): with tf.variable_scope("model%d" % i) as scope: model = rnn_model.RNNModel(config) saver = model_loader.init_or_restore_variables( config, session, ensemble_scope=scope) models.append(model) logging.debug("Models load done.") # Translate the source file. inference.translate_file(input_file=settings.input, output_file=settings.output, session=session, models=models, configs=configs, beam_size=settings.beam_size, nbest=settings.n_best, minibatch_size=settings.minibatch_size, maxibatch_size=settings.maxibatch_size, normalization_alpha=settings.normalization_alpha)
def _load_model_options(self, options_load):
    """
    Loads config options for each model.

    Args:
        options_load: when ``None``, one config is loaded per entry of
            ``self._models``; otherwise a single config is loaded from
            ``options_load`` itself.

    The resulting list of configs is stored in ``self._options``.
    """
    # Identity test: ``None`` is a singleton, and ``==`` could be overridden
    # by the argument's type.
    if options_load is None:
        config_sources = self._models
    else:
        config_sources = [options_load]

    options = []
    for source in config_sources:
        options.append(load_config(source))
        # backward compatibility
        fill_options(options[-1])
    self._options = options
def main(models, source_file, target_file, saveto, b=80,
         normalization_alpha=0.0, verbose=False, alignweights=False,
         extra_sources=None, per_word=False):
    """Load the options of every model and call ``multi_rescore_model``.

    Args:
        models: model paths; each is passed to ``load_config``.
        source_file, target_file: forwarded to ``multi_rescore_model``.
        saveto: open output file; always the first entry of the list of
            output files handed to ``multi_rescore_model``.
        b, normalization_alpha, verbose, alignweights, per_word: forwarded
            unchanged to ``multi_rescore_model``.
        extra_sources: optional additional sources. ``None`` (the default)
            is treated like an empty list.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if extra_sources is None:
        extra_sources = []

    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # multi-source or single source functions
    if not extra_sources:
        # No extra sources: the only output file is `saveto` itself.
        multi_rescore_model(source_file, target_file, [saveto], models,
                            options, b, normalization_alpha, verbose,
                            alignweights, per_word=per_word)
    else:
        # NOTE(review): every extra handle re-opens `saveto.name` for
        # writing, i.e. the same path as `saveto` -- confirm this is
        # intended. The handles are not closed here.
        savetos = [saveto] + [file(saveto.name, 'w') for _ in extra_sources]
        multi_rescore_model(source_file, target_file, savetos, models,
                            options, b, normalization_alpha, verbose,
                            alignweights, per_word=per_word,
                            extra_sources=extra_sources)
def main(models, saveto, bpe_file, save_alignment=None, k=5, normalize=False,
         n_process=5, chr_level=False, verbose=False, nbest=False,
         suppress_unk=False, a_json=False, print_word_probabilities=False,
         return_hyp_graph=False):
    """Run a socket server that translates text received from clients.

    The function loads the model options and dictionaries, builds one
    sampler per model, binds a socket on port 12345 of the local host name
    and then accepts connections until interrupted. Each connection is
    served by `_listen` in its own thread.

    Args:
        models: model paths; each is passed to `load_config`, `numpy.load`
            and `load_params`, and forwarded to the `translate_model`
            workers.
        saveto: writable object; receives n-best lists and word
            probabilities.
        bpe_file: sequence whose first element is opened as the BPE codes
            file.
        save_alignment: optional writable object; when not None alignments
            are requested and written to it.
        k, normalize, suppress_unk: forwarded to `translate_model`.
        n_process: number of worker processes started per request.
        chr_level: split input into characters instead of whitespace tokens.
        verbose: report progress on stderr (also forwarded to the workers).
        nbest: write all hypotheses to `saveto` instead of replying with the
            single translation.
        a_json: write alignments with `print_matrix_json`.
        print_word_probabilities: also write per-word probabilities.
        return_hyp_graph: when truthy, passed to `renderer.save_png` as its
            first argument and forwarded to the workers.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # The last dictionary is the target one, all others are source factors.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Drop entries whose id is outside the configured vocabulary size.
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO SERVER
    ###### The following functions should be already a part of serverisation

    # utility function
    def _seqs2words(cc):
        # Map target ids to words, stopping at the first id 0 ('<eos>').
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        # Convert every line of `f` to factor ids and queue it for the
        # workers. Returns (number of lines, list of tokenised lines).
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # Factors are separated by '|'; unknown factors map to id 1.
                w = [
                    word_dicts[i][f] if f in word_dicts[i] else 1
                    for (i, f) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # Terminate the sentence with an all-zero factor vector.
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        # NOTE(review): `idx` is unbound when `f` is empty.
        return idx + 1, source_sentences

    def _finish_processes(queue):
        # One None per worker is put on the job queue.
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        # Collect results (which may arrive out of order) and yield them in
        # input order.
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            # NOTE(review): this loop rebinds the outer `idx`;
                            # harmless only because the process exits below.
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                  n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        # Serve one client: read text from socket `c`, translate it and send
        # the detokenised result back, until a message whose first sentence
        # is "EOT" arrives.
        # NOTE(review): `source_file_t[0]` raises IndexError if
        # `sent_tokenize` returns an empty list -- confirm how an empty
        # message should be handled.
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # Tokenise, then apply BPE segmentation, sentence by sentence.
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i],
                                       return_str=True)).strip()
            print source_file_t
            detokenized = ''
            # Fresh queues and worker processes are created for every request.
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes,
                                                     queue)
            _finish_processes(queue)
            #### The model loading takes place in the head of for loop, prolly in _retrieve_jobs
            for i, trans in enumerate(
                    _retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    # Hypotheses are written in ascending score order.
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos
                        # translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(
                                    alignment[j], source_sentences[i],
                                    _seqs2words(samples[j]).split(), i, i + j,
                                    save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'
                                    .format(i, _seqs2words(samples[j]),
                                            scores[j],
                                            ' '.join(source_sentences[i]),
                                            len(source_sentences[i]) + 1,
                                            len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    ## TODO: Handle the output here
                    x = _seqs2words(samples)
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    # Upper-case the first character of the accumulated reply.
                    # NOTE(review): raises IndexError if `detokenized` is
                    # still empty at this point.
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(), i,
                                              i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.
                                format(i, _seqs2words(trans[0]), 0,
                                       ' '.join(source_sentences[i]),
                                       len(source_sentences[i]) + 1,
                                       len(trans[0])))
                            print_matrix(alignment, save_alignment)
            # Strip the BPE continuation markers before replying.
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        # Per-connection loop: read one message, acknowledge it with "okay",
        # then run `_parallelized_main` in a thread and wait for it.
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Crtrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                # NOTE(review): bare except silently swallows every error
                # raised while shutting the socket down.
                except:
                    pass
                break

    s = socket.socket()  # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345  # Reserve a port for your service.
    s.bind((host, port))  # Bind to the port
    # Now wait for client connection.

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))
    fs_init = []
    fs_next = []
    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        # Parameters whose name starts with 'adam_' are not loaded.
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams,
                                       option,
                                       use_noise,
                                       trng,
                                       return_alignment=save_alignment
                                       is not None)
        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()
    # start listening to connections once models are loaded
    # NOTE(review): `args` is not defined inside this function; presumably a
    # module-level object -- confirm. The opened codes file is never closed
    # here.
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            # Every accepted connection is handled in its own thread.
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer,
                                       detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False):
    """Translate ``source_file`` with a pool of worker processes.

    Each input line is converted to factor ids and queued for the
    ``translate_model`` workers. Results are written to ``saveto`` in input
    order.

    Args:
        models: model paths; each is passed to ``load_config`` and forwarded
            to the workers.
        source_file: iterable of input lines with a ``name`` attribute.
        saveto: writable object receiving the translations.
        save_alignment: optional writable object; when not None alignments
            are requested from the workers and written to it.
        k, normalize, suppress_unk: forwarded unchanged to
            ``translate_model``.
        n_process: number of worker processes to start.
        chr_level: split lines into characters instead of whitespace tokens.
        verbose: report progress on stderr every 10 samples (also forwarded
            to the workers).
        nbest: write every hypothesis as ``id ||| text ||| score``.
        a_json: write alignments with ``print_matrix_json`` instead of
            ``print_matrix``.
        print_word_probabilities: also write per-word probabilities.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # The last dictionary is the target one, all others are source factors.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Python 2 `items()` returns a list copy, so deleting while
            # looping is safe here.
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize,
                  verbose, nbest, save_alignment is not None, suppress_unk))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        # Map target ids to words, stopping at the first id 0 ('<eos>').
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        # Queue every line of `f` as a list of factor-id vectors and return
        # (number of queued sentences, tokenised source sentences).
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # Factors are separated by '|'; unknown factors map to id 1.
                # The loop names differ from `f` because Python 2 list
                # comprehensions leak their variables into this scope.
                w = [word_dicts[fidx][factor]
                     if factor in word_dicts[fidx] else 1
                     for (fidx, factor) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # Terminate the sentence with an all-zero factor vector.
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        # One sentence is appended per input line, so the length equals the
        # former `idx + 1` and is also defined for empty input.
        return len(source_sentences), source_sentences

    def _finish_processes():
        # One None per worker is put on the job queue.
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        # Results may arrive out of order; buffer them and yield in input
        # order.
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format(
                    (idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment = trans
            # Hypotheses are written in ascending score order.
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join(
                        "{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                    i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i],
                                          _seqs2words(samples[j]).split(),
                                          i, i + j, save_alignment)
                    else:
                        save_alignment.write(
                            '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                i, _seqs2words(samples[j]), scores[j],
                                ' '.join(source_sentences[i]),
                                len(source_sentences[i]) + 1,
                                len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment = trans

            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    # Pass the alignment (fourth element of `trans`), as the
                    # plain-text branch below does; index 1 holds the scores.
                    print_matrix_json(alignment, source_sentences[i],
                                      _seqs2words(samples).split(), i, i,
                                      save_alignment)
                else:
                    save_alignment.write(
                        '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, _seqs2words(samples), 0,
                            ' '.join(source_sentences[i]),
                            len(source_sentences[i]) + 1, len(samples)))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
def main(models, source_file, saveto, save_alignment=None, k=5, normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, print_word_probabilities=False, return_hyp_graph=False): options = [] for model in models: # actually, there is only one model options.append(load_config(model)) fill_options(options[-1]) dictionaries = options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] # 0 - n-1 are source dictionaries dictionary_target = dictionaries[-1] # load source dictionaries and invert word_dicts = [] # list of word-id mapping word_idicts = [] # list of id-word mapping for dictionary in dictionaries_source: word_dict = load_dict(dictionary) if options[0]['n_words_src']: for kk, vv in word_dict.items(): if vv >= options[0]['n_words_src']: del word_dict[kk] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) # load target dictionaries and invert word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # create input and output queues for process; note that Queue is used to communicateion between Processes queue = Queue() rqueue = Queue() processes = [None] * n_process for pidx in xrange(n_process): processes[pidx] = Process(target=translate_model, args=(queue, rqueue, pidx, models, options, k, normalize, verbose, nbest, save_alignment is not None, suppress_unk, return_hyp_graph)) processes[pidx].start() # put data into queue def _send_jobs(f): source_sentences = [] for idx, line in enumerate(f): if chr_level: #into single characters words = list(line.decode('utf-8').strip()) else: # into words (separated by spaces) words = line.strip().split() x = [] for w in words: word = w w = [ word_dicts[i][f] if f in word_dicts[i] else 1 for (i, f) in enumerate(w.split('|')) 
] x.append(w) x += [[0] * options[0]['factors']] # end with "EOS" queue.put((idx, x)) source_sentences.append(words) return idx + 1, source_sentences def _finish_processes(): for midx in xrange(n_process): queue.put(None) # this inner function is used to get translation results def _retrieve_jobs(n_samples): trans = [None] * n_samples out_idx = 0 for idx in xrange(n_samples): resp = None while resp is None: try: resp = rqueue.get(True, 5) except Empty: for midx in xrange(n_process): if not processes[midx].is_alive(): # kill all other processes and raise exception if one dies queue.cancel_join_thread() rqueue.cancel_join_thread() for idx in xrange(n_process): processes[idx].terminate() sys.exit(1) trans[resp[0]] = resp[1] while out_idx < n_samples and trans[out_idx] != None: yield trans[out_idx] out_idx += 1 sys.stderr.write('Translating...{0}\n'.format(source_file.name)) n_samples, source_sentences = _send_jobs(source_file) _finish_processes() for i, trans in enumerate(_retrieve_jobs(n_samples)): print trans
def theano_to_tensorflow_config(model_path):
    """Load the config stored for ``model_path`` as an ``argparse.Namespace``.

    The config is read with ``util.load_config`` and passed through
    ``compat.fill_options``. Its ``'reload'`` and ``'prior_model'`` entries
    are then set to ``None``.
    """
    settings = util.load_config(model_path)
    compat.fill_options(settings)
    for cleared_key in ('reload', 'prior_model'):
        settings[cleared_key] = None
    namespace = argparse.Namespace(**settings)
    return namespace
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    """Translate ``source_file`` with a pool of worker processes.

    Each input line is converted to factor ids and queued for the
    ``translate_model`` workers. Results are written to ``saveto`` in input
    order.

    Args:
        models: model paths; each is passed to ``load_config`` and forwarded
            to the workers.
        source_file: iterable of input lines with a ``name`` attribute.
        saveto: writable object receiving the translations.
        save_alignment: optional writable object; when not None alignments
            are requested from the workers and written to it.
        k, normalize, suppress_unk: forwarded unchanged to
            ``translate_model``.
        n_process: number of worker processes to start.
        chr_level: split lines into characters instead of whitespace tokens.
        verbose: report progress on stderr every 10 samples (also forwarded
            to the workers).
        nbest: write every hypothesis as ``id ||| text ||| score``.
        a_json: write alignments with ``print_matrix_json`` instead of
            ``print_matrix``.
        print_word_probabilities: also write per-word probabilities.
        return_hyp_graph: when truthy, the hypothesis graph of each sentence
            is rendered and the value is passed to ``renderer.save_png`` as
            its first argument; also forwarded to the workers.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # The last dictionary is the target one, all others are source factors.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Python 2 `items()` returns a list copy, so deleting while
            # looping is safe here.
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize,
                  verbose, nbest, save_alignment is not None, suppress_unk,
                  return_hyp_graph))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        # Map target ids to words, stopping at the first id 0 ('<eos>').
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _render_hyp_graph(hyp_graph):
        # Save the hypothesis graph as a PNG using the target vocabulary.
        renderer = HypGraphRenderer(hyp_graph)
        renderer.wordify(word_idict_trg)
        renderer.save_png(return_hyp_graph, detailed=True,
                          highlight_best=True)

    def _send_jobs(f):
        # Queue every line of `f` as a list of factor-id vectors and return
        # (number of queued sentences, tokenised source sentences).
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # Factors are separated by '|'; unknown factors map to id 1.
                # The loop names differ from `f` because Python 2 list
                # comprehensions leak their variables into this scope.
                w = [word_dicts[fidx][factor]
                     if factor in word_dicts[fidx] else 1
                     for (fidx, factor) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # Terminate the sentence with an all-zero factor vector.
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        # One sentence is appended per input line, so the length equals the
        # former `idx+1` and is also defined for empty input.
        return len(source_sentences), source_sentences

    def _finish_processes():
        # One None per worker is put on the job queue.
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        # Results may arrive out of order; buffer them and yield in input
        # order.
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format(
                    (idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                _render_hyp_graph(hyp_graph)
            # Hypotheses are written in ascending score order.
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join(
                        "{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                    i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i],
                                          _seqs2words(samples[j]).split(),
                                          i, i + j, save_alignment)
                    else:
                        save_alignment.write(
                            '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                i, _seqs2words(samples[j]), scores[j],
                                ' '.join(source_sentences[i]),
                                len(source_sentences[i]) + 1,
                                len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                _render_hyp_graph(hyp_graph)
            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    print_matrix_json(alignment, source_sentences[i],
                                      _seqs2words(trans[0]).split(), i, i,
                                      save_alignment)
                else:
                    save_alignment.write(
                        '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, _seqs2words(trans[0]), 0,
                            ' '.join(source_sentences[i]),
                            len(source_sentences[i]) + 1, len(trans[0])))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
def main(models, source_file, saveto, save_alignment=None, k=5, normalization_alpha=0.0, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False, return_hyp_graph=False, device_list=[]): # load model model_options options = [] for model in models: options.append(load_config(model)) fill_options(options[-1]) dictionaries = options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] dictionary_target = dictionaries[-1] # load source dictionary and invert word_dicts = [] word_idicts = [] for dictionary in dictionaries_source: word_dict = load_dict(dictionary) if options[0]['n_words_src']: for key, idx in word_dict.items(): if idx >= options[0]['n_words_src']: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) # load target dictionary and invert word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' print 'input dict - 100 most common' for i in xrange(100): print i, " ", word_idict[i] print 'output dict - 100 most common' for i in xrange(100): print i, " ", word_idict_trg[i] # create input and output queues for processes queue = Queue() rqueue = Queue() processes = [None] * n_process for midx in xrange(n_process): deviceid = '' if device_list is not None and len(device_list) != 0: deviceid = device_list[midx % len(device_list)].strip() processes[midx] = Process( target=translate_model, args=(queue, rqueue, midx, models, options, k, normalization_alpha, verbose, nbest, save_alignment is not None, suppress_unk, return_hyp_graph, deviceid)) processes[midx].start() # utility function def _seqs2words(cc): ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) return ' '.join(ww) def _send_jobs(f): 
source_sentences = [] for idx, line in enumerate(f): if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = [] for w in words: w = [word_dicts[i][f] if f in word_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))] if len(w) != options[0]['factors']: sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'], len(w))) for midx in xrange(n_process): processes[midx].terminate() sys.exit(1) x.append(w) x += [[0]*options[0]['factors']] queue.put((idx, x)) source_sentences.append(words) return idx+1, source_sentences def _finish_processes(): for midx in xrange(n_process): queue.put(None) def _retrieve_jobs(n_samples): trans = [None] * n_samples out_idx = 0 for idx in xrange(n_samples): resp = None while resp is None: try: resp = rqueue.get(True, 5) # if queue is empty after 5s, check if processes are still alive except Empty: for midx in xrange(n_process): if not processes[midx].is_alive() and processes[midx].exitcode != 0: # kill all other processes and raise exception if one dies queue.cancel_join_thread() rqueue.cancel_join_thread() for idx in xrange(n_process): processes[idx].terminate() sys.stderr.write("Error: translate worker process {0} crashed with exitcode {1}".format(processes[midx].pid, processes[midx].exitcode)) sys.exit(1) trans[resp[0]] = resp[1] if verbose and numpy.mod(idx, 10) == 0: sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples)) while out_idx < n_samples and trans[out_idx] != None: yield trans[out_idx] out_idx += 1 sys.stderr.write('Translating {0} ...\n'.format(source_file.name)) n_samples, source_sentences = _send_jobs(source_file) _finish_processes() for i, trans in enumerate(_retrieve_jobs(n_samples)): if nbest: samples, scores, word_probs, alignment, hyp_graph = trans if return_hyp_graph: renderer = HypGraphRenderer(hyp_graph) renderer.wordify(word_idict_trg) renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True) order = 
numpy.argsort(scores) for j in order: if print_word_probabilities: probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j]) else: probs = "" saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(i, _seqs2words(samples[j]), scores[j], probs)) # print alignment matrix for each hypothesis # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos if save_alignment is not None: if a_json: print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j]))) print_matrix(alignment[j], save_alignment) else: samples, scores, word_probs, alignment, hyp_graph = trans if return_hyp_graph: renderer = HypGraphRenderer(hyp_graph) renderer.wordify(word_idict_trg) renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True) saveto.write(_seqs2words(samples) + "\n") if i%1==0: print 'input:' print ' '.join(source_sentences[i]) print 'output:' print _seqs2words(samples) + "\n" if print_word_probabilities: for prob in word_probs: saveto.write("{} ".format(prob)) saveto.write('\n') if save_alignment is not None: if a_json: print_matrix_json(alignment, source_sentences[i], _seqs2words(trans[0]).split(), i, i,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0]))) print_matrix(alignment, save_alignment) sys.stderr.write('Done\n')