class MosesTranslator(Translator):
    """Translator for MTMonkeyWorkers backed by Moses XML-RPC servers.

    Serves the 'translate' task using the built-in sentence splitter,
    tokenizer and detokenizer, with optional recasing through a second
    XML-RPC server.
    """

    def __init__(self, translate_port, recase_port, source_lang, target_lang, threads):
        """Store the server addresses and build the shared text tools.

        @param translate_port: the port at which the Moses translator operates
            (concatenated into the URL, so it must be a string)
        @param recase_port: the port at which the recaser operates;
            None disables recasing
        @param source_lang: source language (ISO-639-1 ID)
        @param target_lang: target language (ISO-639-1 ID)
        @param threads: kept as self.threads; not used elsewhere in this class
        """
        # Only addresses are kept here; the proxies themselves are created
        # per call in _translate.
        host_prefix = "http://localhost:"
        rpc_path = "/RPC2"
        self.translate_proxy_addr = host_prefix + translate_port + rpc_path
        if recase_port is None:
            self.recase_proxy_addr = None
        else:
            self.recase_proxy_addr = host_prefix + recase_port + rpc_path

        # text processing tools (can be shared among threads)
        self.splitter = SentenceSplitter({'language': source_lang})
        self.tokenizer = Tokenizer({'lowercase': True, 'moses_escape': True})
        detok_options = {
            'moses_deescape': True,
            'capitalize_sents': True,
            'language': target_lang,
        }
        self.detokenizer = Detokenizer(detok_options)
        self.threads = threads

    def process_task(self, task):
        """Process a translation task.

        Reads the option flags from the task, optionally splits the text
        into sentences, and translates every sentence via parallel_map.

        @param task: dict with the mandatory key 'text' and the optional
            keys 'alignmentInfo', 'detokenize', 'tokenize', 'segment'
            and 'nBestSize'
        @return: dict with 'translationId' (random hex ID) and
            'translation' (list of per-sentence results from _translate)
        """
        # be lenient: anything that can map to a boolean is accepted
        want_align = _convert_boolean(task.get('alignmentInfo', ''), False)
        want_detok = _convert_boolean(task.get('detokenize', ''), True)
        want_tok = _convert_boolean(task.get('tokenize', ''), True)
        want_segment = _convert_boolean(task.get('segment', ''), True)
        # n-best lists are capped at 10 entries
        nbest_limit = min(task.get('nBestSize', 1), 10)

        if want_segment:
            sentences = self.splitter.split_sentences(task['text'])
        else:
            sentences = [task['text']]
        # tokenized source is returned for alignment or multi-sentence input
        include_src_tok = want_align or len(sentences) > 1

        def _translate_sentence(sentence):
            return self._translate(sentence, want_align, want_detok,
                                   nbest_limit, include_src_tok,
                                   want_tok, want_segment)

        translations = parallel_map(_translate_sentence, sentences)
        task_id = uuid.uuid4().hex
        return {
            'translationId': task_id,
            'translation': translations,
        }

    def _translate(self, src, doalign, dodetok, nbestsize, ret_src_tok, dotok, dosegment):
        """Translate and (optionally) recase one sentence.

        @param src: source text (one sentence)
        @param doalign: request word alignment and include it in the output?
        @param dodetok: detokenize output?
        @param nbestsize: size of n-best lists on the output
        @param ret_src_tok: return tokenized source sentences?
        @param dotok: tokenize the source before translation?
        @param dosegment: accepted but not used by this method
        @return: dict with 'src', 'translated' (list of hypotheses) and,
            if ret_src_tok is set, 'src-tokenized'
        """
        # server proxies are created on every call (needed for each thread)
        translator = xmlrpclib.ServerProxy(self.translate_proxy_addr)
        recaser_addr = self.recase_proxy_addr
        # recasing only if there is a recaser set up
        recaser = None if recaser_addr is None else xmlrpclib.ServerProxy(recaser_addr)

        if dotok:
            src_tokenized = self.tokenizer.tokenize(src)
        else:
            src_tokenized = src

        request = {
            "text": src_tokenized,
            "align": doalign,
            "nbest": nbestsize,
            "nbest-distinct": True,
        }
        response = translator.translate(request)

        # build the n-best list; rank is the 0-based position in the server output
        hypotheses = []
        for position, candidate in enumerate(response['nbest']):
            cased = candidate['hyp']
            if recaser is not None:
                cased = recaser.translate({"text": cased})['text'].strip()

            entry = {
                'text': cased,
                'score': candidate['totalScore'],
                'rank': position,
            }
            if dodetok:
                entry['text'] = self.detokenizer.detokenize(cased)
            if doalign:
                # alignment refers to the tokenized (recased) target
                entry['tokenized'] = cased
                entry['alignment-raw'] = _add_tgt_end(candidate['align'], cased)
            hypotheses.append(entry)

        outcome = {
            'src': src,
            'translated': hypotheses,
        }
        if ret_src_tok:
            outcome['src-tokenized'] = src_tokenized
        return outcome
def __init__(self, translate_port, recase_port, source_lang, target_lang, threads):
    """Store the Moses server addresses and build the shared text tools.

    @param translate_port: the port at which the Moses translator operates
        (concatenated into the URL, so it must be a string)
    @param recase_port: the port at which the recaser operates;
        None leaves the recaser address unset
    @param source_lang: source language (ISO-639-1 ID)
    @param target_lang: target language (ISO-639-1 ID)
    @param threads: kept as self.threads
    """
    # XML-RPC Moses server addresses are assembled once, up front
    url_head = "http://localhost:"
    url_tail = "/RPC2"
    self.translate_proxy_addr = url_head + translate_port + url_tail
    if recase_port is None:
        self.recase_proxy_addr = None
    else:
        self.recase_proxy_addr = url_head + recase_port + url_tail

    # text processing tools (can be shared among threads)
    splitter_options = {'language': source_lang}
    self.splitter = SentenceSplitter(splitter_options)
    tokenizer_options = {'lowercase': True, 'moses_escape': True}
    self.tokenizer = Tokenizer(tokenizer_options)
    detokenizer_options = {
        'moses_deescape': True,
        'capitalize_sents': True,
        'language': target_lang,
    }
    self.detokenizer = Detokenizer(detokenizer_options)
    self.threads = threads
def __init__(self, translate_port, source_lang, target_lang):
    """Initialize a Translator object from the given configuration settings.

    @param translate_port: port of the translation server on localhost
        (concatenated into the URL, so it must be a string)
    @param source_lang: source language; put into the 'src' part of the
        server address and passed to the sentence splitter
    @param target_lang: target language; put into the 'tgt' part of the
        server address and passed to the detokenizer
    """
    # the XML-RPC server address carries the language pair in its query part
    endpoint = "http://localhost:" + translate_port
    self.translate_proxy_addr = endpoint + "/RPC2?src=" + source_lang + ";tgt=" + target_lang

    # text processing tools (can be shared among threads)
    self.splitter = SentenceSplitter({'language': source_lang})
    self.tokenizer = Tokenizer({'lowercase': True, 'moses_escape': True})
    detokenizer_options = {
        'moses_deescape': True,
        'capitalize_sents': True,
        'language': target_lang,
    }
    self.detokenizer = Detokenizer(detokenizer_options)
def __init__(self, translate_port, recase_port, source_lang, target_lang, threads):
    """Record the XML-RPC addresses and create the text processing tools.

    @param translate_port: the port at which the Moses translator operates
        (concatenated into the URL, so it must be a string)
    @param recase_port: the port at which the recaser operates;
        None leaves the recaser address unset
    @param source_lang: source language (ISO-639-1 ID)
    @param target_lang: target language (ISO-639-1 ID)
    @param threads: kept as self.threads
    """
    # precomputed XML-RPC Moses server addresses
    self.translate_proxy_addr = "http://localhost:" + translate_port + "/RPC2"
    recase_addr = None
    if recase_port is not None:
        recase_addr = "http://localhost:" + recase_port + "/RPC2"
    self.recase_proxy_addr = recase_addr

    # text processing tools (can be shared among threads)
    self.splitter = SentenceSplitter({'language': source_lang})
    self.tokenizer = Tokenizer({'lowercase': True, 'moses_escape': True})
    self.detokenizer = Detokenizer({
        'moses_deescape': True,
        'capitalize_sents': True,
        'language': target_lang,
    })
    self.threads = threads
class Translator:
    """Handles the 'translate' task for KhresmoiWorker"""

    def __init__(self, translate_port, source_lang, target_lang):
        """Initialize a Translator object according to the given configuration settings."""
        # precompile XML-RPC Moses server addresses
        # NOTE(review): the text is damaged from here on. The address literal
        # below is masked and never closed, and whatever followed it
        # (presumably the rest of __init__ and the header and first statements
        # of the task-processing method) is missing. The statements up to the
        # `return _backward_transform(...)` are kept exactly as found; restore
        # this part from version control before relying on the class.
        self.translate_proxy_addr = "http://*****:*****@@')
        for x in range(0,len(src_translate)):
            # each entry is replaced by its stripped bpe.segment() output
            src_translate[x] = bpe.segment(src_translate[x]).strip()
        # debugging output: first three entries (Python 2 print statement)
        print src_translate[:3]
        #print src_translate
        # one _translate call per entry, executed sequentially
        translated = [self._translate(line, doalign, dodetok, nbestsize) for line in src_translate]
        return _backward_transform({ 'translationId': uuid.uuid4().hex, 'sentences': translated }, doalign, dodetok)

    def _translate(self, src, doalign, dodetok, nbestsize):
        """Translate one sentence (no recasing is performed in this version).

        Optionally, word alignment between source and target is included
        in the output.

        @param src: source text, sent to the server unchanged
        @param doalign: request word alignment and include it in the output?
        @param dodetok: detokenize the hypotheses? (also controls whether
            'src-tokenized' is added to the result)
        @param nbestsize: size of n-best lists on the output
        @return: dict with 'src', 'translated' (list of hypotheses) and,
            if dodetok is set, 'src-tokenized'
        """
        # create server proxies (needed for each thread)
        translate_proxy = xmlrpclib.ServerProxy(self.translate_proxy_addr)
        # recase_proxy = xmlrpclib.ServerProxy(self.recase_proxy_addr)
        # translate
        translation = translate_proxy.translate({ "text": src, "align": doalign, "nbest": nbestsize, "nbest-distinct": True, })
        # provide n-best lists; rank is the 0-based position in the server output
        rank = 0
        hypos = []
        for hypo in translation['nbest']:
            # the recaser is disabled above, so the raw hypothesis is used as is
            recased = hypo['hyp']
            parsed_hypo = { 'text': recased, 'score': hypo['totalScore'], 'rank': rank, }
            if dodetok:
                parsed_hypo['text'] = self.detokenizer.detokenize(recased)
            if doalign:
                parsed_hypo['tokenized'] = recased
                parsed_hypo['alignment-raw'] = _add_tgt_end(hypo['align'], recased)
            rank += 1
            hypos.append(parsed_hypo)
        result = { 'src': src, 'translated': hypos, }
        # NOTE(review): 'src-tokenized' is keyed on dodetok here and carries
        # src unchanged; confirm that this is intended.
        if dodetok:
            result['src-tokenized'] = src
        return result
class MosesTranslator(Translator):
    """Handles the 'translate' task for MTMonkeyWorkers via Moses XML-RPC.

    Sentence segmentation, tokenization and detokenization are done with
    the built-in tools; recasing is delegated to a second XML-RPC server
    when one is configured.
    """

    def __init__(self, translate_port, recase_port, source_lang, target_lang, threads):
        """Initialize a MosesTranslator from the given configuration settings.

        @param translate_port: the port at which the Moses translator operates
            (concatenated into the URL, so it must be a string)
        @param recase_port: the port at which the recaser operates;
            None disables recasing
        @param source_lang: source language (ISO-639-1 ID)
        @param target_lang: target language (ISO-639-1 ID)
        @param threads: kept as self.threads; not used elsewhere in this class
        """
        # Addresses only; the proxies are created per call in _translate.
        self.translate_proxy_addr = "http://localhost:" + translate_port + "/RPC2"
        recase_addr = None
        if recase_port is not None:
            recase_addr = "http://localhost:" + recase_port + "/RPC2"
        self.recase_proxy_addr = recase_addr

        # text processing tools (can be shared among threads)
        self.splitter = SentenceSplitter({'language': source_lang})
        self.tokenizer = Tokenizer({'lowercase': True, 'moses_escape': True})
        self.detokenizer = Detokenizer({
            'moses_deescape': True,
            'capitalize_sents': True,
            'language': target_lang,
        })
        self.threads = threads

    def process_task(self, task):
        """Process a translation task.

        Splits the request into sentences (unless segmentation is turned
        off), then translates and recases each sentence via parallel_map.

        @param task: dict with the mandatory key 'text' and the optional
            keys 'alignmentInfo', 'detokenize', 'tokenize', 'segment'
            and 'nBestSize'
        @return: dict with 'translationId' (random hex ID) and
            'translation' (list of per-sentence results from _translate)
        """
        # option flags: anything that can map to a boolean is accepted
        align_flag = _convert_boolean(task.get('alignmentInfo', ''), False)
        detok_flag = _convert_boolean(task.get('detokenize', ''), True)
        tok_flag = _convert_boolean(task.get('tokenize', ''), True)
        segment_flag = _convert_boolean(task.get('segment', ''), True)
        # n-best lists are capped at 10 entries
        nbest = min(task.get('nBestSize', 1), 10)

        if segment_flag:
            lines = self.splitter.split_sentences(task['text'])
        else:
            lines = [task['text']]
        # tokenized source is returned for alignment or multi-sentence input
        with_src_tok = align_flag or len(lines) > 1

        def _run_one(line):
            return self._translate(line, align_flag, detok_flag, nbest,
                                   with_src_tok, tok_flag, segment_flag)

        results = parallel_map(_run_one, lines)
        return {'translationId': uuid.uuid4().hex, 'translation': results}

    def _translate(self, src, doalign, dodetok, nbestsize, ret_src_tok, dotok, dosegment):
        """Translate and (optionally) recase one sentence.

        @param src: source text (one sentence)
        @param doalign: request word alignment and include it in the output?
        @param dodetok: detokenize output?
        @param nbestsize: size of n-best lists on the output
        @param ret_src_tok: return tokenized source sentences?
        @param dotok: tokenize the source before translation?
        @param dosegment: accepted but not used by this method
        @return: dict with 'src', 'translated' (list of hypotheses) and,
            if ret_src_tok is set, 'src-tokenized'
        """
        # server proxies are created on every call (needed for each thread)
        mt_server = xmlrpclib.ServerProxy(self.translate_proxy_addr)
        if self.recase_proxy_addr is None:
            recase_server = None
        else:
            # recasing only if there is a recaser set up
            recase_server = xmlrpclib.ServerProxy(self.recase_proxy_addr)

        src_tokenized = src
        if dotok:
            src_tokenized = self.tokenizer.tokenize(src)

        # boolean switches are sent to the server in their string form
        params = {
            "text": src_tokenized,
            "align": str(doalign),
            "nbest": nbestsize,
            "nbest-distinct": str(True),
            "no-ReportSegmentation": str(True),
        }
        reply = mt_server.translate(params)

        # build the n-best list; rank is the 0-based position in the server output
        parsed = []
        for index, item in enumerate(reply['nbest']):
            if recase_server is None:
                recased = item['hyp']
            else:
                recased = recase_server.translate({"text": item['hyp']})['text'].strip()

            record = {
                'text': recased,
                'score': item['totalScore'],
                'rank': index,
            }
            if dodetok:
                record['text'] = self.detokenizer.detokenize(recased)
            if doalign:
                # alignment refers to the tokenized (recased) target
                record['tokenized'] = recased
                record['alignment-raw'] = _add_tgt_end(item['align'], recased)
            parsed.append(record)

        output = {
            'src': src,
            'translated': parsed,
        }
        if ret_src_tok:
            output['src-tokenized'] = src_tokenized
        return output
def __init__(self, translate_port, recase_port):
    """Create the XML-RPC server proxies and the text processing tools.

    @param translate_port: port of the translation server on localhost
        (concatenated into the URL, so it must be a string)
    @param recase_port: port of the recasing server on localhost
        (concatenated into the URL, so it must be a string)
    """
    translate_url = "http://localhost:" + translate_port + "/RPC2"
    self.translate_proxy = xmlrpclib.ServerProxy(translate_url)
    recase_url = "http://localhost:" + recase_port + "/RPC2"
    self.recase_proxy = xmlrpclib.ServerProxy(recase_url)

    # tokenizer lowercases and Moses-escapes; the other tools use their defaults
    tokenizer_options = {'lowercase': True, 'moses_escape': True}
    self.tokenizer = Tokenizer(tokenizer_options)
    self.detokenizer = Detokenizer()
    self.splitter = SentenceSplitter()
class Translator:
    """Handles the 'translate' task for KhresmoiWorker"""

    # textual values recognized as "on" for alignmentInfo
    _TRUE_WORDS = ('true', 't', 'yes', 'y', '1')
    # textual values recognized as "off" for detokenize
    _FALSE_WORDS = ('false', 'f', 'no', 'n', '0')

    def __init__(self, translate_port, recase_port):
        """Create the XML-RPC server proxies and the text processing tools.

        @param translate_port: port of the translation server on localhost
            (concatenated into the URL, so it must be a string)
        @param recase_port: port of the recasing server on localhost
            (concatenated into the URL, so it must be a string)
        """
        self.translate_proxy = xmlrpclib.ServerProxy("http://localhost:" + translate_port + "/RPC2")
        self.recase_proxy = xmlrpclib.ServerProxy("http://localhost:" + recase_port + "/RPC2")
        self.tokenizer = Tokenizer({'lowercase': True, 'moses_escape': True})
        self.detokenizer = Detokenizer()
        self.splitter = SentenceSplitter()

    @staticmethod
    def _flag_text(value):
        """Return the lowercased textual form of an option value.

        Strings are lowercased directly; any other value (e.g. a boolean or
        a number from a decoded request) is converted with str() first, so
        that such values no longer fail on a missing .lower() method.
        """
        if hasattr(value, 'lower'):
            return value.lower()
        return str(value).lower()

    def process_task(self, task):
        """Process translation task.

        Splits request into sentences, then translates and recases each
        sentence.

        @param task: dict with the mandatory key 'text' and the optional
            keys 'alignmentInfo' (off unless it is a recognized "true"
            value) and 'detokenize' (on unless it is a recognized "false"
            value)
        @return: dict with the key 'translation' holding a one-element list;
            that element has 'translationId' (random hex ID) and
            'translated' (list of per-sentence results from _translate)
        """
        doalign = self._flag_text(task.get('alignmentInfo', '')) in self._TRUE_WORDS
        dodetok = self._flag_text(task.get('detokenize', '')) not in self._FALSE_WORDS
        src_lines = self.splitter.split_sentences(task['text'])
        translated = [self._translate(line, doalign, dodetok) for line in src_lines]
        return {
            'translation': [
                {
                    "translationId": uuid.uuid4().hex,
                    "translated": translated,
                }
            ]
        }

    def _translate(self, src, doalign, dodetok):
        """Translate and recase one sentence.

        Optionally, word alignment between source and target is included
        in output.

        @param src: source text (one sentence)
        @param doalign: request word alignment and include it in the output?
        @param dodetok: detokenize the output? If not, the recased tokenized
            text is returned as 'text'.
        @return: dict with 'text', 'score' and 'rank'; with doalign also
            'src-tokenized', 'tgt-tokenized' and 'alignment-raw'
        """
        # tokenize
        src_tokenized = self.tokenizer.tokenize(src)
        # translate
        translation = self.translate_proxy.translate({
            "text": src_tokenized,
            "align": doalign,
        })
        # recase
        tgt_tokenized = self.recase_proxy.translate({
            "text": translation['text'],
        })['text'].strip()
        # detokenize; without detokenization the tokenized text is the result
        # (previously 'tgt' was left unbound in that case)
        if dodetok:
            tgt = self.detokenizer.detokenize(tgt_tokenized)
        else:
            tgt = tgt_tokenized
        result = {
            'text': tgt,
            'score': 100,  # TODO actual score
            'rank': 0,  # TODO
        }
        # optionally add word-alignment information
        if doalign:
            result.update({
                'src-tokenized': src_tokenized,
                'tgt-tokenized': tgt_tokenized,
                'alignment-raw': _add_tgt_end(translation['align'], tgt_tokenized),
            })
        return result