def batch_translator(input_dict, max_batch_size, batch_type):
    """Translate a list of sentences with one model and time every stage.

    Args:
        input_dict: Mapping with the keys ``'id'`` (the model id) and
            ``'src_list'`` (the list of source sentences).
        max_batch_size: Passed through unchanged to ``encode_translate_decode``.
        batch_type: Passed through unchanged to ``encode_translate_decode``.

    Returns:
        dict: Elapsed seconds per stage, keyed by ``time_model_loading``,
        ``time_preprocessing``, ``time_tokenizing``, ``time_encoding``,
        ``time_translating``, ``time_decoding``, ``time_detokenizing`` and
        ``time_postprocessing``.  The translations themselves are built
        while timing the post-processing stage but are not returned.

    Raises:
        ServerModelError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised; this includes
            the explicit error raised for every model id other than 56.
    """
    model_id = input_dict['id']
    src_list = input_dict['src_list']
    num_sentence = len(src_list)

    # Per-sentence slots, filled by index because special-case sentences and
    # model-translated sentences are completed in different places.
    input_subwords_list = [None] * num_sentence
    output_subwords_list = [None] * num_sentence
    tagged_src_list = [None] * num_sentence
    tagged_tgt_list = [None] * num_sentence
    tgt_list = [None] * num_sentence
    score_list = [None] * num_sentence

    date_original_array = [None] * num_sentence
    url_original_array = [None] * num_sentence
    num_array_array = [None] * num_sentence
    num_map_array = [None] * num_sentence
    prefix_array = [None] * num_sentence

    start_loading = time.time()
    sp_encoder, translator, sp_decoder = get_models(model_id)
    time_model_loading = time.time() - start_loading

    input_sentence_array_prepd = [None] * num_sentence
    # A set keeps the per-sentence membership test in the post-processing
    # loop constant-time.
    special_case_sentence_indices = set()

    start_preprocessing = time.time()
    try:
        for i, sent in enumerate(src_list):
            input_sentence = sent.strip()
            if special_case_handler.special_case_fits(input_sentence):
                special_case_sentence_indices.add(i)
                log_info("sentence fits in special case, capturing index to process at last", MODULE_CONTEXT)
            else:
                prefix_array[i], input_sentence = special_case_handler.prefix_handler(input_sentence)
                (input_sentence,
                 date_original_array[i],
                 url_original_array[i],
                 num_array_array[i],
                 num_map_array[i]) = tagger_util.tag_number_date_url(input_sentence)
                tagged_src_list[i] = (prefix_array[i] + " " + input_sentence).lstrip()
            # Special-case sentences are still handed to the model so that the
            # batch stays index-aligned; their model output is ignored below.
            input_sentence_array_prepd[i] = input_sentence
        time_preprocessing = time.time() - start_preprocessing

        log_info("translating using NMT-model:{}".format(model_id), MODULE_CONTEXT)
        if model_id == 56:
            # 09/12/19-Exp-5.6:
            input_sentence_array_prepd = [
                sentence.title() if sentence.isupper() else sentence
                for sentence in input_sentence_array_prepd
            ]

            start_tokenizing = time.time()
            input_sentence_array_prepd = [
                sentence_processor.moses_tokenizer(sentence)
                for sentence in input_sentence_array_prepd
            ]
            time_tokenizing = time.time() - start_tokenizing

            (translation_array,
             input_subwords_list,
             output_subwords_list,
             score_list,
             time_encoding,
             time_translating,
             time_decoding) = encode_translate_decode(
                input_sentence_array_prepd, sp_encoder, translator, sp_decoder,
                max_batch_size, batch_type,
                input_subwords_list, output_subwords_list, score_list)

            start_detokenizing = time.time()
            translation_array = [
                sentence_processor.indic_detokenizer(translation)
                for translation in translation_array
            ]
            time_detokenizing = time.time() - start_detokenizing
        else:
            log_info("Unsupported model id: {} for given input".format(model_id), MODULE_CONTEXT)
            raise Exception("Unsupported Model ID - id: {} for given input".format(model_id))

        start_postprocessing = time.time()
        for i in range(num_sentence):
            if i in special_case_sentence_indices:
                log_info("sentence fits in special case, returning output accordingly and not from model", MODULE_CONTEXT)
                stripped_src = src_list[i].strip()
                tgt_list[i] = special_case_handler.handle_special_cases(stripped_src, model_id)
                score_list[i] = 1
                input_subwords_list[i] = ""
                output_subwords_list[i] = ""
                tagged_tgt_list[i] = tgt_list[i]
                tagged_src_list[i] = src_list[i].strip()
            else:
                translation = (prefix_array[i] + " " + translation_array[i]).lstrip()
                translation = translation.replace("▁", " ")
                translation = misc.regex_pass(
                    translation,
                    [patterns['p8'], patterns['p9'], patterns['p4'],
                     patterns['p5'], patterns['p6'], patterns['p7']])
                # The tagged target is the text before the original numbers,
                # dates and urls are substituted back in.
                tagged_tgt_list[i] = translation
                translation = tagger_util.replace_tags_with_original(
                    translation, date_original_array[i], url_original_array[i],
                    num_array_array[i], num_map_array[i])
                translation = oc.cleaner(tagged_src_list[i], translation, model_id)
                translation_array[i] = translation
                tgt_list[i] = translation
                # NOTE(review): the original indentation was lost; this log
                # call is assumed to belong to the model branch - confirm.
                log_info("translate_function-experiment-{} output: {}".format(model_id, translation_array[i]), MODULE_CONTEXT)
        time_postprocessing = time.time() - start_postprocessing

        out = {
            "time_model_loading": time_model_loading,
            "time_preprocessing": time_preprocessing,
            "time_tokenizing": time_tokenizing,
            "time_encoding": time_encoding,
            "time_translating": time_translating,
            "time_decoding": time_decoding,
            "time_detokenizing": time_detokenizing,
            "time_postprocessing": time_postprocessing,
        }
    except ServerModelError as e:
        log_exception("ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}".format(e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
    except Exception as e:
        log_exception("Exception caught in NMTTranslateService:batch_translator:%s and %s" % (e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise

    return out
# Model ids served by ``interactive_translation``, grouped by the processing
# pipeline they share.  The labels are the annotations each per-id branch
# carried before the branches were merged.
#
# No source tokenization, no target-prefix tokenizer, no detokenization:
#   7 english-tamil, 10 english-gujarati, 15 english-kannada,
#   16 english-telugu, 17 english-malayalam, 18 english-punjabi,
#   42 english-marathi
_INTERACTIVE_UNTOKENIZED_IDS = (7, 10, 15, 16, 17, 18, 42)
# Moses-tokenized source, indic target-prefix tokenizer, indic detokenizer:
#   56 english-hindi, 65 english-bengali 4th
_INTERACTIVE_MOSES_SRC_IDS = (56, 65)
# Indic-tokenized source, moses target-prefix tokenizer, moses detokenizer:
#   50 telugu-english, 6 hindi-english, 62 marathi-english, 8 tamil-english,
#   55 punjabi-english, 48 kannada-english, 60 malayalam-english,
#   52 gujarati-english, 66 bengali-english 3rd
_INTERACTIVE_INDIC_SRC_IDS = (50, 6, 62, 8, 55, 48, 60, 52, 66)
# As the moses-source group, but decoded with encode_itranslate_decode_v2:
#   68 en-ta 5th, 70 en-hi 15th, 72 en-te 3rd, 74 en-ml 3rd
_INTERACTIVE_MOSES_SRC_V2_IDS = (68, 70, 72, 74)
# As the indic-source group, but decoded with encode_itranslate_decode_v2:
#   67 ta-en 3rd, 69 hi-en 3rd, 71 te-en 2nd, 73 ml-en 2nd
_INTERACTIVE_INDIC_SRC_V2_IDS = (67, 69, 71, 73)


def _interactive_pipeline(model_id):
    """Return the processing steps ``interactive_translation`` uses for a model.

    The result is ``(src_tokenizer, tp_tokenizer, decode, detokenizer)``, or
    ``None`` when ``model_id`` is not supported.  ``src_tokenizer`` and
    ``detokenizer`` are ``None`` when that step is skipped; ``tp_tokenizer``
    is the value forwarded as the third argument of ``decode``.
    """
    if model_id in _INTERACTIVE_UNTOKENIZED_IDS:
        return None, None, encode_itranslate_decode, None
    if model_id in _INTERACTIVE_MOSES_SRC_IDS:
        return (sentence_processor.moses_tokenizer,
                sentence_processor.indic_tokenizer,
                encode_itranslate_decode,
                sentence_processor.indic_detokenizer)
    if model_id in _INTERACTIVE_INDIC_SRC_IDS:
        return (sentence_processor.indic_tokenizer,
                sentence_processor.moses_tokenizer,
                encode_itranslate_decode,
                sentence_processor.moses_detokenizer)
    if model_id in _INTERACTIVE_MOSES_SRC_V2_IDS:
        return (sentence_processor.moses_tokenizer,
                sentence_processor.indic_tokenizer,
                encode_itranslate_decode_v2,
                sentence_processor.indic_detokenizer)
    if model_id in _INTERACTIVE_INDIC_SRC_V2_IDS:
        return (sentence_processor.indic_tokenizer,
                sentence_processor.moses_tokenizer,
                encode_itranslate_decode_v2,
                sentence_processor.moses_detokenizer)
    return None


def interactive_translation(inputs):
    """Produce interactive translation candidates for each input sentence.

    Args:
        inputs: Iterable of dicts.  Every dict must contain ``'src'`` and
            ``'id'``; ``'s_id'`` is optional and reported as ``"NA"`` when
            missing or falsy.  The dicts are modified in place: ``'src'`` is
            rewritten by the preprocessing steps and ``'src_lang'`` /
            ``'tgt_lang'`` are added.

    Returns:
        A ``CustomResponse``.  On success its body is a list with one entry
        per input holding ``tgt`` (a list of translations), ``tagged_tgt``,
        ``tagged_src``, ``s_id`` and the original ``src``.  If any input
        lacks ``'src'`` or ``'id'`` the ``ID_OR_SRC_MISSING`` status is
        returned together with ``inputs``; any exception, including an
        unsupported model id, yields the ``SYSTEM_ERR`` status with the
        error text under ``'why'``.
    """
    import copy

    response_body = []
    try:
        for item in inputs:
            sentence_id = item.get("s_id") or "NA"
            if any(key not in item for key in ('src', 'id')):
                log_info("either id or src missing in some input", MODULE_CONTEXT)
                return CustomResponse(Status.ID_OR_SRC_MISSING.value, inputs)

            log_info("input sentence:{}".format(item['src']), MODULE_CONTEXT)
            original_src = item['src']
            item['src'] = item['src'].strip()

            item['src_lang'], item['tgt_lang'] = misc.get_src_tgt_langauge(item['id'])
            item['src'] = misc.convert_digits_preprocess(item['src_lang'], item['src'])

            if special_case_handler.special_case_fits(item['src']):
                log_info("sentence fits in special case, returning accordingly and not going to model", MODULE_CONTEXT)
                translation = [special_case_handler.handle_special_cases(item['src'], item['id'])]
                tag_tgt, tag_src = translation, item['src']
            else:
                log_info("Performing interactive translation on:{}".format(item['id']), MODULE_CONTEXT)
                item['src'], date_original, url_original, num_array, num_map = \
                    tagger_util.tag_number_date_url(item['src'])
                tag_src = item['src']

                pipeline = _interactive_pipeline(item['id'])
                if pipeline is None:
                    log_info("unsupported model id: {} for given input".format(item['id']), MODULE_CONTEXT)
                    raise Exception("Unsupported Model ID - id: {} for given input".format(item['id']))
                # The target-prefix tokenizer is chosen per item, so a model
                # that uses none is never handed the tokenizer of an earlier
                # item in the same request.
                src_tokenizer, tp_tokenizer, decode, detokenizer = pipeline

                if item['id'] == 56 and item['src'].isupper():
                    log_info("src all Upper case hence Tital casing it", MODULE_CONTEXT)
                    item['src'] = item['src'].title()
                if src_tokenizer is not None:
                    item['src'] = src_tokenizer(item['src'])
                translation = decode(item, num_map, tp_tokenizer)
                if detokenizer is not None:
                    translation = [detokenizer(candidate) for candidate in translation]

                translation = [candidate.replace("▁", " ") for candidate in translation]
                translation = [
                    misc.regex_pass(candidate, [
                        patterns['p8'], patterns['p9'], patterns['p4'],
                        patterns['p5'], patterns['p6'], patterns['p7']
                    ])
                    for candidate in translation
                ]
                # Tagged target: candidates before the original numbers,
                # dates and urls are substituted back in.
                tag_tgt = translation
                translation = [
                    tagger_util.replace_tags_with_original(
                        candidate, date_original, url_original, num_array, num_map)
                    for candidate in translation
                ]
                # NOTE(review): the original indentation was lost; digit
                # post-processing is assumed to apply to model output only,
                # not to special-case results - confirm.
                translation = [
                    misc.convert_digits_postprocess(item['tgt_lang'], candidate)
                    for candidate in translation
                ]

            log_info("interactive translation-experiment-{} output: {}".format(item['id'], translation), MODULE_CONTEXT)
            response_body.append({
                "tgt": translation,
                "tagged_tgt": tag_tgt,
                "tagged_src": tag_src,
                "s_id": sentence_id,
                "src": original_src
            })

        return CustomResponse(Status.SUCCESS.value, response_body)
    except Exception as e:
        # Work on a copy so the error text is not written into the status
        # object shared through the Status enum.
        status = copy.copy(Status.SYSTEM_ERR.value)
        status['why'] = str(e)
        log_exception("Unexpected error:%s and %s" % (e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        return CustomResponse(status, inputs)
# Model ids served by ``translate_func``, grouped by the processing pipeline
# they share.  The labels are the annotations each per-id branch carried
# before the branches were merged.
#
# No tokenization and no detokenization:
#   7 english-tamil, 10 english-gujrati, 15 english-kannada, 16 english-telgu,
#   17 english-malayalam, 18 english-punjabi, 42 english-marathi exp-2
_TRANSLATE_UNTOKENIZED_IDS = (7, 10, 15, 16, 17, 18, 42)
# Indic tokenizer on the source, moses detokenizer on the output:
#   6 hi-en_exp-2 05-05-20, 8 ta-en 1st, 48 kn-en 1st, 50 tel-en 1st,
#   52 guj-en 1st, 55 punjabi-en 1st, 58 bengali-en 2nd, 60 malay-en 1st,
#   62 mr-to-en 2nd, 66 bengali-en 3rd
_TRANSLATE_INDIC_SRC_IDS = (6, 8, 48, 50, 52, 55, 58, 60, 62, 66)
# Moses tokenizer on the source, indic detokenizer on the output:
#   56 09/12/19-Exp-5.6:, 44 eng-mr-3rd, 47 en-kn 2nd, 49 en-tel 2nd,
#   51 en-guj 2nd, 53 en-punjabi 2nd, 57 en-bengali 3rd, 59 en-malay 2nd,
#   65 en-bengali 4th
_TRANSLATE_MOSES_SRC_IDS = (56, 44, 47, 49, 51, 53, 57, 59, 65)
# As the indic-source group, but through encode_translate_decode_v2:
#   67 ta-en 3rd, 69 hi-en 3rd, 71 te-en 2nd, 73 ml-en 2nd
_TRANSLATE_INDIC_SRC_V2_IDS = (67, 69, 71, 73)
# As the moses-source group, but through encode_translate_decode_v2:
#   68 en-ta 5th, 70 en-hi 15th, 72 en-te 3rd, 74 en-ml 3rd
_TRANSLATE_MOSES_SRC_V2_IDS = (68, 70, 72, 74)


def _translate_pipeline(model_id):
    """Return the processing steps ``translate_func`` uses for a model.

    The result is ``(src_tokenizer, translate, detokenizer)``, or ``None``
    when ``model_id`` is not supported.  ``src_tokenizer`` and
    ``detokenizer`` are ``None`` when that step is skipped.
    """
    if model_id in _TRANSLATE_UNTOKENIZED_IDS:
        return None, encode_translate_decode, None
    if model_id in _TRANSLATE_INDIC_SRC_IDS:
        return (sentence_processor.indic_tokenizer,
                encode_translate_decode,
                sentence_processor.moses_detokenizer)
    if model_id in _TRANSLATE_MOSES_SRC_IDS:
        return (sentence_processor.moses_tokenizer,
                encode_translate_decode,
                sentence_processor.indic_detokenizer)
    if model_id in _TRANSLATE_INDIC_SRC_V2_IDS:
        return (sentence_processor.indic_tokenizer,
                encode_translate_decode_v2,
                sentence_processor.moses_detokenizer)
    if model_id in _TRANSLATE_MOSES_SRC_V2_IDS:
        return (sentence_processor.moses_tokenizer,
                encode_translate_decode_v2,
                sentence_processor.indic_detokenizer)
    return None


def translate_func(inputs):
    """Translate each input sentence with the model named by its ``'id'``.

    Args:
        inputs: Iterable of dicts.  Every dict must contain ``'src'`` and
            ``'id'``.  Optional keys read here are ``'s_id'`` and ``'n_id'``
            (used only when both are present, otherwise both are reported
            as ``0``), ``'s0_src'`` / ``'s0_tgt'`` / ``'save'`` (resolved
            through ``handle_custome_input``) and ``'tmx_phrases'``.  The
            ``'src'`` entry of each dict is rewritten in place by the
            preprocessing steps.

    Returns:
        A ``CustomResponse``.  On success its body is a list with one entry
        per input holding ``tgt``, ``pred_score``, ``s_id``,
        ``input_subwords``, ``output_subwords``, ``n_id``, the original
        ``src``, ``tagged_tgt``, ``tagged_src``, ``save``, ``s0_src``,
        ``s0_tgt`` and ``tmx_phrases``.  If any input lacks ``'src'`` or
        ``'id'`` the ``ID_OR_SRC_MISSING`` status is returned together with
        ``inputs``.  A ``ServerModelError`` yields the ``SEVER_MODEL_ERR``
        status and any other exception (including an unsupported model id)
        the ``SYSTEM_ERR`` status, each with the error text under ``'why'``.
    """
    import copy

    response_body = []
    try:
        for item in inputs:
            s0_src, s0_tgt, save = "NA", "NA", False
            # Ids are resolved per item so that an item without ids gets the
            # default instead of the ids of the item before it.
            s_id, n_id = 0, 0
            if all(key in item for key in ('s_id', 'n_id')):
                s_id, n_id = item['s_id'], item['n_id']

            if any(key not in item for key in ('src', 'id')):
                log_info("either id or src missing in some input", MODULE_CONTEXT)
                return CustomResponse(Status.ID_OR_SRC_MISSING.value, inputs)

            if any(key in item for key in ('s0_src', 's0_tgt', 'save')):
                s0_src, s0_tgt, save = handle_custome_input(item, s0_src, s0_tgt, save)

            log_info("input sentences:{}".format(item['src']), MODULE_CONTEXT)
            original_src = item['src']
            item['src'] = item['src'].strip()

            src_language, tgt_language = misc.get_src_tgt_langauge(item['id'])
            if src_language == 'English' and item['src'].isupper():
                item['src'] = item['src'].title()
            item['src'] = misc.convert_digits_preprocess(src_language, item['src'])

            if special_case_handler.special_case_fits(item['src']):
                log_info("sentence fits in special case, returning accordingly and not going to model", MODULE_CONTEXT)
                translation = special_case_handler.handle_special_cases(item['src'], item['id'])
                scores = [1]
                input_sw, output_sw = "", ""
                tag_tgt, tag_src = translation, item['src']
            else:
                log_info("translating using NMT-model:{}".format(item['id']), MODULE_CONTEXT)
                prefix, item['src'] = special_case_handler.prefix_handler(item['src'])
                item['src'], date_original, url_original, num_array, num_map = \
                    tagger_util.tag_number_date_url(item['src'])
                tag_src = (prefix + " " + item['src']).lstrip()
                item['src'], is_missing_stop_punc = \
                    special_case_handler.handle_a_sentence_wo_stop(src_language, item['src'])

                pipeline = _translate_pipeline(item['id'])
                if pipeline is None:
                    log_info("Unsupported model id: {} for given input".format(item['id']), MODULE_CONTEXT)
                    raise Exception("Unsupported Model ID - id: {} for given input".format(item['id']))
                src_tokenizer, translate, detokenizer = pipeline

                if src_tokenizer is not None:
                    item['src'] = src_tokenizer(item['src'])
                translation, scores, input_sw, output_sw = translate(item)
                if detokenizer is not None:
                    translation = detokenizer(translation)

                translation = oc.postprocess_a_sentence_wo_stop(
                    tgt_language, translation, is_missing_stop_punc)
                translation = (prefix + " " + translation).lstrip()
                translation = translation.replace("▁", " ")
                translation = misc.regex_pass(translation, [
                    patterns['p8'], patterns['p9'], patterns['p4'],
                    patterns['p5'], patterns['p6'], patterns['p7']
                ])
                # Tagged target: the text before the original numbers, dates
                # and urls are substituted back in.
                tag_tgt = translation
                translation = tagger_util.replace_tags_with_original(
                    translation, date_original, url_original, num_array, num_map)
                # NOTE(review): the original indentation was lost; the
                # cleaner and the digit post-processing are assumed to apply
                # to model output only, not to special-case results - confirm.
                translation = oc.cleaner(tag_src, translation, item['id'])
                translation = misc.convert_digits_postprocess(tgt_language, translation)

            log_info("translate_function-experiment-{} output: {}".format(item['id'], translation), MODULE_CONTEXT)
            response_body.append({
                "tgt": translation,
                "pred_score": scores,
                "s_id": s_id,
                "input_subwords": input_sw,
                "output_subwords": output_sw,
                "n_id": n_id,
                "src": original_src,
                "tagged_tgt": tag_tgt,
                "tagged_src": tag_src,
                "save": save,
                "s0_src": s0_src,
                "s0_tgt": s0_tgt,
                "tmx_phrases": item.get("tmx_phrases", [])
            })

        return CustomResponse(Status.SUCCESS.value, response_body)
    except ServerModelError as e:
        # Work on a copy so the error text is not written into the status
        # object shared through the Status enum.
        status = copy.copy(Status.SEVER_MODEL_ERR.value)
        status['why'] = str(e)
        log_exception(
            "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}"
            .format(e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        return CustomResponse(status, inputs)
    except Exception as e:
        status = copy.copy(Status.SYSTEM_ERR.value)
        status['why'] = str(e)
        log_exception("Unexpected error:%s and %s" % (e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        return CustomResponse(status, inputs)
def batch_translator(input_dict): model_id = input_dict['id'] src_list = input_dict['src_list'] num_sentence = len(src_list) input_subwords_list = [None] * num_sentence output_subwords_list = [None] * num_sentence tagged_src_list = [None] * num_sentence tagged_tgt_list = [None] * num_sentence tgt_list = [None] * num_sentence score_list = [None] * num_sentence out = {} date_original_array = [None] * num_sentence url_original_array = [None] * num_sentence num_array_array = [None] * num_sentence num_map_array = [None] * num_sentence prefix_array = [None] * num_sentence sp_encoder, translator, sp_decoder = get_models(model_id) input_sentence_array_prepd = [None] * num_sentence special_case_sentence_indices = [] src_language, tgt_language = misc.get_src_tgt_langauge(model_id) try: for i, sent in enumerate(src_list): input_sentence = sent.strip() input_sentence = misc.convert_digits_preprocess( src_language, input_sentence) if special_case_handler.special_case_fits(input_sentence): special_case_sentence_indices.append(i) log_info( "sentence fits in special case, capturing index to process at last", MODULE_CONTEXT) else: prefix_array[ i], input_sentence = special_case_handler.prefix_handler( input_sentence) input_sentence,date_original_array[i],url_original_array[i],num_array_array[i],num_map_array[i] = \ tagger_util.tag_number_date_url(input_sentence) tagged_src_list[i] = (prefix_array[i] + " " + input_sentence).lstrip() input_sentence_array_prepd[i] = input_sentence input_sentence_array_prepd, sent_indices_wo_stop = \ special_case_handler.handle_sentences_wo_stop(src_language,input_sentence_array_prepd) log_info("translating using NMT-model:{}".format(model_id), MODULE_CONTEXT) if model_id == 5: "hi-en exp-1" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 6: "hi-en_exp-2 05-05-20" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 7: "english-tamil" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 10: "english-gujrati" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) # translation = translation.replace("ન્યાય માટે Accessક્સેસને","ન્યાયની પહોંચને") elif model_id == 11: "english-bengali" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 15: "english-kannada" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) # translation = translation.replace("uc","") elif model_id == 16: "english-telgu" translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 17: "english-malayalam" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 18: "english-punjabi" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 32: "29/10/2019 Exp-12: old_data_original+lc_cleaned+ ik names translated from google(100k)+shabdkosh(appended 29k new),BPE-24K,50knmt,shuff,pretok" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 42: "english-marathi exp-2" translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) elif model_id == 56: "09/12/19-Exp-5.6:" input_sentence_array_prepd = [ sentence.title() if sentence.isupper() else sentence for sentence in input_sentence_array_prepd ] input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 8: "ta-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 44: "eng-mr-3rd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 45: "en-ta 4th" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 47: "en-kn 2nd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 48: "kn-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 49: "en-tel 2nd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 50: "tel-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 51: "en-guj 2nd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 52: "guj-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 53: "en-punjabi 2nd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 55: "punjabi-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 57: "en-bengali 3rd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 58: "bengali-en 2nd" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 59: "en-malay 2nd" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 60: "malay-en 1st" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 61: "ta-to-en 3rd" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ 
encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 62: "mr-to-en 2nd" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] elif model_id == 65: "en-bengali 4th" input_sentence_array_prepd = [ sentence_processor.moses_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.indic_detokenizer(translation) for translation in translation_array ] elif model_id == 66: "bengali-en 3rd" input_sentence_array_prepd = [ sentence_processor.indic_tokenizer(sentence) for sentence in input_sentence_array_prepd ] translation_array, input_subwords_list, output_subwords_list, score_list = \ encode_translate_decode(input_sentence_array_prepd,sp_encoder,translator,sp_decoder,input_subwords_list,output_subwords_list,score_list) translation_array = [ sentence_processor.moses_detokenizer(translation) for translation in translation_array ] else: log_info( "Unsupported model id: {} for given input".format( model_id), MODULE_CONTEXT) raise Exception( "Unsupported Model ID - id: {} for given input".format( model_id)) translation_array = oc.postprocess_sentences_wo_stop( tgt_language, translation_array, sent_indices_wo_stop) for i in 
range(num_sentence): if i in special_case_sentence_indices: log_info( "sentence fits in special case, returning output accordingly and not from model", MODULE_CONTEXT) tgt_list[i] = special_case_handler.handle_special_cases( src_list[i].strip(), model_id) score_list[i] = 1 input_subwords_list[i],output_subwords_list[i],tagged_tgt_list[i],tagged_src_list[i] = \ "","",tgt_list[i],src_list[i].strip() else: translation_array[i] = (prefix_array[i] + " " + translation_array[i]).lstrip() translation_array[i] = translation_array[i].replace( "▁", " ") translation_array[i] = misc.regex_pass( translation_array[i], [ patterns['p8'], patterns['p9'], patterns['p4'], patterns['p5'], patterns['p6'], patterns['p7'] ]) tagged_tgt_list[i] = translation_array[i] translation_array[i] = tagger_util.replace_tags_with_original(translation_array[i],\ date_original_array[i],url_original_array[i],num_array_array[i],num_map_array[i]) translation_array[i] = oc.cleaner(tagged_src_list[i], translation_array[i], model_id) tgt_list[i] = translation_array[i] log_info( "translate_function-experiment-{} output: {}".format( model_id, translation_array[i]), MODULE_CONTEXT) tgt_list[i] = misc.convert_digits_postprocess( tgt_language, tgt_list[i]) if (not tgt_list[i]) or (tgt_list[i].isspace()): tgt_list[i] = src_list[i] out = { "tagged_src_list": tagged_src_list, "tagged_tgt_list": tagged_tgt_list, "tgt_list": tgt_list } except ServerModelError as e: log_exception( "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}" .format(e, sys.exc_info()[0]), MODULE_CONTEXT, e) raise e except Exception as e: log_exception( "Exception caught in NMTTranslateService:batch_translator:%s and %s" % (e, sys.exc_info()[0]), MODULE_CONTEXT, e) raise e return out