def main(truecase, sock):
    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = sock                  # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port

    # Now wait for client connection.
    # Initialise truecaser
    with codecs.open(truecase, 'r', encoding='utf-8') as f:
        tc_init = f.read().split('\n')
    truecaser = defaultdict(str)
    for line in tc_init:
        truecaser[line.split(' ')[0].lower()] = line.split(' ')[0]

    # Initialise nltk.moses tokenizer and detokenizer
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # Start listening for connections
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=listen,
                                 args=(c, addr, tokenizer, detokenizer, truecaser))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
def main(truecase, sock, fasttext, bpe):
    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = sock                  # Reserve a port for your service.
    s.bind(('', port))           # Bind to the port

    # Now wait for client connection.
    # Initialise truecaser
    with codecs.open(truecase, 'r', encoding='utf-8') as f:
        tc_init = f.read().split('\n')
    truecaser = defaultdict(str)
    for line in tc_init:
        truecaser[line.split(' ')[0].lower()] = line.split(' ')[0]

    # Load the fastText model
    ft_mdl = fastText.load_model(fasttext)

    # Initialise nltk.moses tokenizer and detokenizer
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # Start listening for connections
    while True:
        try:
            s.listen(5)
            LOG.info("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=listen,
                                 args=(c, addr, tokenizer, detokenizer, truecaser, ft_mdl, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
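# `listen`, the per-connection worker handed to threading.Thread in both servers
# above, is not defined in these snippets. The stub below is only an illustrative
# sketch of such a handler for the simpler (truecaser-only) variant: it assumes the
# client sends raw UTF-8 text and expects the tokenized, truecased, detokenized
# text back. The fastText/BPE variant would receive the extra arguments as well.
def listen(c, addr, tokenizer, detokenizer, truecaser):
    try:
        text = c.recv(4096).decode('utf-8')
        tokens = tokenizer.tokenize(text, return_str=True).split()
        # defaultdict(str) returns '' for unseen tokens, so fall back to the original token
        cased = [truecaser[tok.lower()] or tok for tok in tokens]
        c.send(detokenizer.detokenize(cased, return_str=True).encode('utf-8'))
    finally:
        c.close()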
def __init__(self, args, word2i, i2word, glove_embeddings, chars, use_cuda):
    super().__init__()
    self.tags = ('SOP', 'EOP', 'CAP')
    self.detokenizer = moses.MosesDetokenizer()

    assert type(word2i) == dict and type(i2word) == dict and len(word2i) == len(i2word), \
        'Malformed word lookup tables.'

    if use_cuda:
        self.float_tensor = torch.cuda.FloatTensor
        self.long_tensor = torch.cuda.LongTensor
    else:
        self.float_tensor = torch.FloatTensor
        self.long_tensor = torch.LongTensor

    self.word2i = word2i
    self.i2word = i2word
    self.chars = chars

    self.word_hidden_size = args.word_hidden_size
    self.char_hidden_size = args.char_hidden_size
    self.word_num_layers = args.word_num_layers
    self.char_num_layers = args.char_num_layers
    self.word_vocab_size = len(word2i)
    self.char_vocab_size = len(chars) + len(self.tags)

    # Word-level encoder: either a trainable embedding table or a fixed GloVe lookup.
    if glove_embeddings is None:
        self.word_encoder = nn.Embedding(self.word_vocab_size, self.word_hidden_size)
    else:
        glove_embeddings = glove_embeddings.type(self.float_tensor)
        self.word_encoder = lambda x_word: glove_embeddings[x_word, :]

    self.word_lstm = nn.LSTM(self.word_hidden_size, self.word_hidden_size,
                             self.word_num_layers, dropout=args.dropout)
    self.word_decoder = nn.Linear(self.word_hidden_size, self.word_vocab_size)

    self.char_encoder = nn.Embedding(self.char_vocab_size, self.char_hidden_size)
    self.char_lstm = nn.LSTM(self.char_hidden_size, self.char_hidden_size,
                             self.char_num_layers, dropout=args.dropout)
    # self.char_decoder = nn.Linear(self.char_hidden_size, self.char_vocab_size)

    # Projection parameters are built from tensors already cast to the target type
    # so they stay registered as nn.Parameters of the module.
    self.char_to_embedding = nn.Parameter(
        torch.randn(self.word_hidden_size, self.char_hidden_size).type(self.float_tensor))
    self.x_word_to_g_weight = nn.Parameter(
        torch.randn(self.word_hidden_size).type(self.float_tensor))
    self.x_word_to_g_bias = nn.Parameter(torch.randn(1).type(self.float_tensor))
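# Illustrative construction of the module above. The enclosing class is not shown
# in this snippet, so `WordCharLM` is only a placeholder name, and the toy
# vocabulary and hyperparameters below are made up for the example.
from argparse import Namespace

toy_args = Namespace(word_hidden_size=64, char_hidden_size=32,
                     word_num_layers=1, char_num_layers=1, dropout=0.0)
word2i = {'<unk>': 0, 'the': 1, 'cat': 2}
i2word = {i: w for w, i in word2i.items()}
chars = sorted(set('the cat'))
model = WordCharLM(toy_args, word2i, i2word, glove_embeddings=None,
                   chars=chars, use_cuda=False)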
def main(models, saveto, bpe_file, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO SERVER

    ###### The following functions should be already a part of serverisation

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1
                     for (i, f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'.format(
                            options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}".format(
                                    processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        #print(source_file_t[i])
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # print source_file_t[i].decode('utf-8')
                #pipe = subprocess.Popen("echo " + source_file_t[i] + "| perl truecase.perl --model en-truecase.mdl", shell=True)
                #pipe = subprocess.Popen(["echo", '"' + source_file_t[i] + '"', "|", "perl", "truecase.perl", "--model",
                #                         "en-truecase.mdl"], stdout=subprocess.PIPE)
                #result = pipe.stdout.read()
                #print pipe.communicate()
                #print pipe
                #print pipe.stdout
                #print pipe.stdout.read()
                #print pipe.
                #print "Here"
                #print result
                #source_file_t[i] = subprocess.check_output()
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i], return_str=True)).strip()
                #print "Passed"
            print source_file_t

            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes, queue)
            _finish_processes(queue)

            #### The model loading takes place in the head of for loop, prolly in _retrieve_jobs
            for i, trans in enumerate(_retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))

                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos
                        # translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(alignment[j], source_sentences[i],
                                                  _seqs2words(samples[j]).split(),
                                                  i, i + j, save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                        i, _seqs2words(samples[j]), scores[j],
                                        ' '.join(source_sentences[i]),
                                        len(source_sentences[i]) + 1, len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    ## TODO: Handle the output here
                    #print((_seqs2words(samples) + "\n").encode('utf-8'))
                    #text.append(_seqs2words(samples) + "\n")
                    x = _seqs2words(samples)
                    #print x[0].upper() + x[1:]
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    #print "ref this"
                    #print detokenized
                    #detokenized[0] = detokenized[0].upper()
                    #c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(),
                                              i, i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                    i, _seqs2words(trans[0]), 0,
                                    ' '.join(source_sentences[i]),
                                    len(source_sentences[i]) + 1, len(trans[0])))
                            print_matrix(alignment, save_alignment)

            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                #if fname == 'exit':
                #    print "Terminating connection with client."
                #    c.close()
                #    break
                #else:
                #t = threading.Thread(target=_parallelized_main, args=(fname, fs_init, fs_next, c))
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe, tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345                 # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port

    # Now wait for client connection.
    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=save_alignment is not None)

        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')

    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
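# A minimal client sketch for the translation server above, inferred from the
# recv/send calls in _listen and _parallelized_main: send an initial message
# (read by the server as `fname`), wait for the "okay" acknowledgement, exchange
# text, and finally send "EOT" so the worker loop terminates. The port and buffer
# size mirror the server code; everything else is illustrative, not original code.
import socket

def request_translation(text, host=socket.gethostname(), port=12345):
    s = socket.socket()
    s.connect((host, port))
    s.send('client'.encode('utf-8'))            # received by _listen as `fname`
    s.recv(4096)                                # server answers "okay"
    s.send(text.encode('utf-8'))                # sentence-tokenized and translated server-side
    translation = s.recv(4096).decode('utf-8')  # detokenized translation comes back
    s.send('EOT'.encode('utf-8'))               # stops _parallelized_main's while loop
    s.close()
    return translation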
from collections import Counter
from nltk.tokenize import moses
import argparse
import json
import os
import sys

detokenizer = moses.MosesDetokenizer()  # must match what's used in preprocessing.py


def postprocess(infilename, outfilename, replacements_filename,
                replacements_map_filename):
    """De-anonymize and detokenize results (reverses what was done by preprocessing.py)

    :infilename: Model predictions (which are tokenized and anonymized)
    :outfilename: Location where detokenized, de-anonymized final text will be written
    :replacements_filename: *-anon.txt file created by preprocessing.py in which each
        line is a json-serialized dict mapping anonymization placeholders to the
        predicate values they replaced in the corresponding input line.
    :replacements_map_filename: file with a mapping from DMRS predicate values of
        named nodes (e.g., named0, card0) to the surface form they should be
        replaced with
    """
    # Load mapping from predicate values to surface form most often seen in training data
    rmap = json.load(open(replacements_map_filename))

    # Generate list of replacements
    replacements = []
    with open(replacements_filename) as infile:
        for line in infile:
            replacement_dicts = json.loads(line.strip())
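# The postprocess function above is cut off inside the loop that reads the
# per-line replacement dicts. The helper below is only a hedged sketch of how the
# remaining de-anonymization step could look, based solely on the docstring:
# placeholders map to predicate values (replacement dict), which map to surface
# forms (rmap). `apply_replacements` and the commented-out driver loop are
# illustrative, not the original code.
def apply_replacements(tokens, replacement_dict, rmap):
    restored = []
    for tok in tokens:
        if tok in replacement_dict:
            pred_value = replacement_dict[tok]                  # e.g. placeholder -> 'named0'
            restored.append(rmap.get(pred_value, pred_value))   # 'named0' -> surface form
        else:
            restored.append(tok)
    return restored

# with open(infilename) as predictions, open(outfilename, 'w') as outfile:
#     for pred_line, replacement_dict in zip(predictions, replacements):
#         tokens = apply_replacements(pred_line.split(), replacement_dict, rmap)
#         outfile.write(detokenizer.detokenize(tokens, return_str=True) + '\n')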
def main(document):
    tokens = get_tokens()
    output = moses.MosesDetokenizer().detokenize(tokens, return_str=True)
    with codecs.open('output.txt', 'w', encoding='utf-8') as fout:
        fout.write(output)
    return document