def encode_translate_decode(i, translation_server, sp_encoder, sp_decoder):
    try:
        logger.info("Inside encode_translate_decode function")
        i['src'] = str(sp.encode_line(sp_encoder, i['src']))
        logger.info("SP encoded sent: %s" % i['src'])
        input_sw = i['src']
        translation, scores, n_best, times = translation_server.run([i])
        logger.info("output from model: %s" % translation[0])
        output_sw = translation[0]
        translation = sp.decode_line(sp_decoder, translation[0])
        logger.info("SP decoded sent: %s" % translation)
        return translation, scores, input_sw, output_sw
    except ServerModelError as e:
        logger.error(
            "ServerModelError error in encode_translate_decode: {} and {}".
            format(e, sys.exc_info()[0]))
        raise
    except Exception as e:
        logger.error(
            "Unexpected error in encode_translate_decode: {} and {}".format(
                e, sys.exc_info()[0]))
        raise
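# Illustrative call (a minimal sketch; the request dict and SentencePiece model
# paths below are hypothetical, and `translation_server` is the OpenNMT-py
# translation server instance used throughout this module):
#   translation, scores, input_sw, output_sw = encode_translate_decode(
#       {"src": "The appeal is allowed.", "id": 1}, translation_server,
#       "models/eng.model", "models/hin.model")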
def purnaviram_applier(src, tgt):
    '''
    For English to Hindi translation
    '''
    try:
        if tgt is None or len(tgt.split()) == 0:
            return tgt
        if len(src.split()) < 5:
            return tgt
        if src.endswith('.') and tgt.endswith('।'):
            return tgt
        elif src.endswith('.') and tgt[-1] != '।':
            if tgt.endswith('.'):
                logger.info("Replacing '.' with purnaviram")
                tgt = tgt[:-1] + str("।")
            else:
                logger.info("Adding the missing purnaviram")
                tgt = tgt + str("।")
            return tgt
        else:
            return tgt
    except Exception as e:
        logger.error(
            "Error in purnaviram applier, returning original tgt: {}".format(
                e))
        return tgt
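# Illustrative usage (hypothetical strings; assumes a configured `logger`):
#   purnaviram_applier("This is a simple test sentence.", "यह एक सरल परीक्षण वाक्य है.")
#   -> "यह एक सरल परीक्षण वाक्य है।"    # trailing '.' swapped for the purnaviram
#   purnaviram_applier("Too short.", "छोटा वाक्य")
#   -> "छोटा वाक्य"                      # source has fewer than 5 tokens, returned unchanged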
def update_num_arr(num_array, zero_prefix_num, i_zero, num_array_orignal):
    '''
    This function is meant to handle zero-prefixed numbers like 09 or 000,
    which get converted to 9 or 0 during processing. We want them back in
    their original form, i.e. 09.
    zero_prefix_num: the numbers that have to be transformed back to their
        zero-prefixed form (from 9 to 09, or from 0 to 000, as originally)
    i_zero: indices of numbers with zero prefix in num_array_orignal
    ind: indices of zero-prefixed numbers in num_array, descending
    Note: this function needs some fixing
    '''
    try:
        num_array_o = num_array[:]
        ind = list()
        zero_prefix_num = np.unique(np.array(zero_prefix_num))
        for i in zero_prefix_num:
            for j, m in enumerate(num_array):
                if m == i:
                    ind.append(j)
        for k, l in enumerate(ind):
            num_array[l] = num_array_orignal[i_zero[k]]
        return num_array
    except Exception as e:
        logger.error(
            "Error in handle_date_url:update_num_arr, returning incoming num_array: {}".format(e))
        return num_array_o
def regex_pass(text, regex_list):
    try:
        for pattern in regex_list:
            text = re.sub(pattern['regex'], pattern['replacement'], text)
        return text
    except Exception as e:
        logger.error("Error in regex_pass: handle_date_url function: {}".format(e))
        return text
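# Illustrative usage (hypothetical pattern list; each entry carries 'regex' and
# 'replacement' keys, mirroring the `patterns` dicts used elsewhere in this module):
#   regex_pass("price :  100", [{'regex': r'\s+:', 'replacement': ':'},
#                               {'regex': r'\s+', 'replacement': ' '}])
#   -> "price: 100"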
def get_producer():
    try:
        producer = KafkaProducer(
            bootstrap_servers=list(str(bootstrap_server).split(",")),
            value_serializer=lambda x: json.dumps(x).encode('utf-8'))
        logger.info('get_producer : producer returned successfully')
        return producer
    except Exception as e:
        logger.error(
            'get_producer : ERROR OCCURRED while creating producer, ERROR = ' +
            str(e))
        return None
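# Illustrative usage (assumes `bootstrap_server` is configured, e.g.
# "localhost:9092", and that the hypothetical target topic exists):
#   p = get_producer()
#   if p is not None:
#       p.send("some-topic", value={"out": {"status": "ok"}})
#       p.flush()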
def replace_tags_with_original_1(text, date_original, url_original, num_array):
    try:
        resultant_str = list()
        if len(text) == 0:
            return ""
        for word in text.split():
            if word[:-1] == 'DdAaTtEe' and len(date_original) > 0:
                word = date_original[int(word[-1])]
            elif word[:-1] == 'UuRrLl' and len(url_original) > 0:
                word = url_original[int(word[-1])]
            resultant_str.append(word)
        s = [str(i) for i in resultant_str]
        res = str(" ".join(s))
        logger.info("response after url and date replacement:{}".format(res))
        array = re.findall(r'NnUuMm..|NnUuMm.', res)
        logger.info("NnUuMm array after translation:{}".format(array))
        for j in array:
            try:
                if j[-2:] in hindi_numbers:
                    end_hin_number = j[-2:]
                    index = hindi_numbers.index(end_hin_number)
                    res = res.replace(j, str(num_array[index]), 1)
                elif j[:-1] == "NnUuMm":
                    end_hin_number = j[-1]
                    index = hindi_numbers.index(end_hin_number)
                    res = res.replace(j, str(num_array[index]), 1)
                else:
                    end_hin_number = j[-2]
                    j = j[:-1]
                    index = hindi_numbers.index(end_hin_number)
                    res = res.replace(j, str(num_array[index]), 1)
            except Exception as e:
                logger.info("inside str.replace error, but handling it:{}".format(e))
                res = res.replace(j, "", 1)
        logger.info("response after tags replacement:{}".format(res))
        return res
    except Exception as e:
        logger.error(
            "Error in parent except block of replace_tags_with_original_1 function, returning tagged output:{}".format(e))
        return text
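# Illustrative round trip for the date/URL tags (hypothetical values):
#   replace_tags_with_original_1("Order dated DdAaTtEe0 at UuRrLl0",
#                                ["12/03/2020"], ["https://example.com"], [])
#   -> "Order dated 12/03/2020 at https://example.com"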
def decode_line(load_model, line):
    # makes segmenter instance and loads the model file (m.model)
    try:
        sp = spm.SentencePieceProcessor()
        sp.load(load_model)
        if not line.startswith("["):
            line = "[" + line
        if not line.endswith("]"):
            line = line + "]"
        # strip any stray brackets inside the list literal before it is eval'd
        line = line[0] + line[1:-1].replace('[', "") + line[-1]
        line = line[0] + line[1:-1].replace(']', "") + line[-1]
        logger.info("decoding using sp model {}".format(load_model))
        if "<unk>" in line:
            line = line.replace("<unk>", "")
        return sp.DecodePieces(eval(line))
    except Exception as e:
        logger.error("something went wrong! {}".format(e))
        logger.error("Unexpected error: %s" % sys.exc_info()[0])
        return ""
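# Illustrative usage (hypothetical model path; `line` is the string form of a
# list of subword pieces, as produced by sp.encode_line above):
#   decode_line("models/hin.model", "['▁यह', '▁एक', '▁वाक्य', '▁है']")
#   -> "यह एक वाक्य है"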
def maybe_load_vocab(corpus_type, counters, opt):
    src_vocab = None
    tgt_vocab = None
    existing_fields = None
    if corpus_type == config.train:
        if opt.src_vocab != "":
            try:
                logger.info("Using existing vocabulary...")
                existing_fields = torch.load(opt.src_vocab)
            except torch.serialization.pickle.UnpicklingError:
                logger.info("Building vocab from text file...")
                # src_vocab, src_vocab_size = _load_vocab(
                #     opt.src_vocab, "src", counters,
                #     opt.src_words_min_frequency)
        if opt.tgt_vocab != "":
            logger.error("opt.tgt_vocab is not empty")
            # tgt_vocab, tgt_vocab_size = _load_vocab(
            #     opt.tgt_vocab, "tgt", counters,
            #     opt.tgt_words_min_frequency)
    return src_vocab, tgt_vocab, existing_fields
def fullstop_applier(src, tgt):
    '''
    For non-Hindi translation pairs
    '''
    try:
        if len(src.split()) < 5:
            return tgt
        if src.endswith('.') and tgt.endswith('.'):
            return tgt
        elif src.endswith('.') and tgt[-1] != '.':
            logger.info("Adding the missing fullstop")
            tgt = tgt + str(".")
            return tgt
        else:
            return tgt
    except Exception as e:
        logger.error(
            "Error in fullstop_applier, returning original tgt: {}".format(e))
        return tgt
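# Illustrative usage (hypothetical strings):
#   fullstop_applier("This is a simple test sentence.", "இது ஒரு சோதனை வாக்கியம்")
#   -> "இது ஒரு சோதனை வாக்கியம்."    # the missing full stop is appended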
def prefix_handler(text):
    '''
    Currently this function only handles different numeric prefixes in the
    first token of an input, e.g. 1., 12.1, (1.), (12.1), 1, (12) etc.
    '''
    try:
        prefix = ""
        tokens = text.split()
        token_p = tokens[0]
        regex_list = [patterns['p10'], patterns['p11']]
        matches = [
            re.match(pattern['regex'], token_p) for pattern in regex_list
        ]
        if not all(v is None for v in matches):
            prefix = token_p
            text = str(" ".join(tokens[1:]))
        logger.info("Returning from prefix_handler")
        return prefix, text
    except Exception as e:
        logger.error(
            "Error in prefix handler, returning original text, error: {}".format(
                e))
        return "", text
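# Illustrative usage (assumes patterns['p10'] / patterns['p11'] match numeric
# prefixes such as "1." or "(12)", per the docstring above):
#   prefix_handler("1. The appeal is allowed.")
#   -> ("1.", "The appeal is allowed.")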
def get_consumer(topics):
    try:
        # consumer = KafkaConsumer(
        #     topic,
        #     bootstrap_servers=[bootstrap_server],
        #     auto_offset_reset='earliest',
        #     enable_auto_commit=True,
        #     group_id=group_id,
        #     value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        consumer = KafkaConsumer(
            bootstrap_servers=list(str(bootstrap_server).split(",")),
            value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        consumer.subscribe(topics)
        logger.info(
            'get_consumer : consumer returned for topics:{}'.format(topics))
        return consumer
    except Exception as e:
        logger.error(
            'ERROR OCCURRED for getting consumer with topics:{}'.format(
                topics))
        logger.error('get_consumer : ERROR = ' + str(e))
        return None
def from_en(inputs, translation_server):
    out = {}
    tgt = list()
    pred_score = list()
    sentence_id = list()
    node_id = list()
    input_subwords = list()
    output_subwords = list()
    s_id = [0000]
    n_id = [0000]
    try:
        for i in inputs:
            if all(v in i for v in ['s_id', 'n_id']):
                s_id = [i['s_id']]
                n_id = [i['n_id']]
            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                logger.info("either id or src missing in some input")
                return out
            logger.info("input sentences:{}".format(i['src']))
            i['src'] = i['src'].strip()
            if ancillary_functions.special_case_fits(i['src']):
                logger.info(
                    "sentence fits in special case, returning accordingly and not going to model")
                translation = ancillary_functions.handle_special_cases(
                    i['src'], i['id'])
                scores = [1]
                input_sw, output_sw = "", ""
            else:
                logger.info("translating using NMT-model:{}".format(i['id']))
                logger.info("translating this sentence:{}".format(i['src']))
                # prefix, suffix, i['src'] = ancillary_functions.separate_alphanumeric_and_symbol(i['src'])
                if i['id'] == 1:
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_220519"],
                        sp_model.english_hindi["HIN_220519"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                    logger.info(
                        "final output from model-1: {}".format(translation))
                else:
                    logger.error(
                        "unsupported model id: {} for given english translation"
                        .format(i['id']))
                    translation, input_sw, output_sw, scores = "", "", "", [0]
            # translation = (prefix+" "+translation+" "+suffix).strip()
            translation = ancillary_functions.replace_hindi_numbers(translation)
            tgt.append(translation)
            pred_score.append(scores[0])
            sentence_id.append(s_id[0])
            node_id.append(n_id[0])
            input_subwords.append(input_sw)
            output_subwords.append(output_sw)
        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = [{
            "tgt": tgt[i],
            "s_id": sentence_id[i],
            "input_subwords": input_subwords[i],
            "output_subwords": output_subwords[i],
            "n_id": node_id[i],
            "pred_score": pred_score[i]
        } for i in range(len(tgt))]
    except ServerModelError as e:
        out['status'] = statusCode["SEVER_MODEL_ERR"]
        out['status']['why'] = str(e)
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-FROM_ENGLISH: {} and {}".
            format(e, sys.exc_info()[0]))
    except Exception as e:
        out['status'] = statusCode["SYSTEM_ERR"]
        logger.error(
            "Unexpected error in translate_util from_eng function: %s and %s"
            % (e, sys.exc_info()[0]))
    return out
def translate_func(inputs, translation_server):
    out = {}
    pred_score = list()
    sentence_id, node_id = list(), list()
    input_subwords, output_subwords = list(), list()
    i_src, tgt = list(), list()
    tagged_tgt, tagged_src = list(), list()
    s_id, n_id = [0000], [0000]
    i_s0_src, i_s0_tgt, i_save = list(), list(), list()
    try:
        for i in inputs:
            s0_src, s0_tgt, save = "NA", "NA", False
            logger.info(
                log_with_request_info(i.get("s_id"), LOG_TAGS["input"], i))
            if all(v in i for v in ['s_id', 'n_id']):
                s_id = [i['s_id']]
                n_id = [i['n_id']]
            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                out['response_body'] = []
                logger.info("either id or src missing in some input")
                return out
            if any(v in i for v in ['s0_src', 's0_tgt', 'save']):
                s0_src, s0_tgt, save = handle_custome_input(
                    i, s0_src, s0_tgt, save)
            i_s0_src.append(s0_src)
            i_s0_tgt.append(s0_tgt)
            i_save.append(save)
            logger.info("input sentences:{}".format(i['src']))
            i_src.append(i['src'])
            i['src'] = i['src'].strip()
            if ancillary_functions.special_case_fits(i['src']):
                logger.info(
                    "sentence fits in special case, returning accordingly and not going to model")
                translation = ancillary_functions.handle_special_cases(
                    i['src'], i['id'])
                scores = [1]
                input_sw, output_sw, tag_tgt, tag_src = "", "", translation, i['src']
            else:
                logger.info("translating using NMT-model:{}".format(i['id']))
                # prefix, suffix, i['src'] = ancillary_functions.separate_alphanumeric_and_symbol(i['src'])
                prefix, i['src'] = ancillary_functions.prefix_handler(i['src'])
                i['src'], date_original, url_original, num_array, num_map = \
                    date_url_util.tag_number_date_url_1(i['src'])
                tag_src = (prefix + " " + i['src']).lstrip()
                if i['id'] == 5:
                    # hi-en exp-1
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.hindi_english["HIN_EXP_1_291019"],
                        sp_model.hindi_english["ENG_EXP_1_291019"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 6:
                    # hi-en exp-2 05-05-20
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.hindi_english["HIN_EXP_2_050520"],
                        sp_model.hindi_english["ENG_EXP_2_050520"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 7:
                    # english-tamil
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["ENG_230919"],
                        sp_model.english_tamil["TAM_230919"])
                elif i['id'] == 10:
                    # english-gujarati
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["ENG_100919"],
                        sp_model.english_gujarati["GUJ_100919"])
                    translation = translation.replace(
                        "ન્યાય માટે Accessક્સેસને", "ન્યાયની પહોંચને")
                elif i['id'] == 11:
                    # english-bengali
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["ENG_120919"],
                        sp_model.english_bengali["BENG_120919"])
                elif i['id'] == 12:
                    # english-marathi
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_140919"],
                        sp_model.english_marathi["MARATHI_140919"])
                elif i['id'] == 15:
                    # english-kannada
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["ENG_200919"],
                        sp_model.english_kannada["KANNADA_200919"])
                    translation = translation.replace("uc", "")
                elif i['id'] == 16:
                    # english-telugu
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["ENG_200919"],
                        sp_model.english_telugu["TELGU_200919"])
                elif i['id'] == 17:
                    # english-malayalam
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["ENG_200919"],
                        sp_model.english_malayalam["MALAYALAM_200919"])
                elif i['id'] == 18:
                    # english-punjabi
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["ENG_200919"],
                        sp_model.english_punjabi["PUNJABI_200919"])
                elif i['id'] == 21:
                    # exp-1: BPE model with varying vocab size, 15k for both
                    # Hindi and English, plus tokenization
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_1"],
                        sp_model.english_hindi["HIN_EXP_1"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 30:
                    # 25/10/2019 experiment 10: old data + dictionary, BPE-24k,
                    # no lowercasing, pretok, shuffling, 50k nmt
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_10"],
                        sp_model.english_hindi["HIN_EXP_10"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 32:
                    # 29/10/2019 Exp-12: old_data_original + lc_cleaned + ik names
                    # translated from google(100k) + shabdkosh (appended 29k new),
                    # BPE-24K, 50k nmt, shuffling, pretok
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_12"],
                        sp_model.english_hindi["HIN_EXP_12"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 54:
                    # 29-30/10/19 Exp-5.4: data same as exp 5.1 (old data + india
                    # kanoon 830k, including 1.5 lakh names and "no learned counsel",
                    # + 72192k shabdkosh), BPE 24k, no lowercasing, pretok, shuffling
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_5.4"],
                        sp_model.english_hindi["HIN_EXP_5.4"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 42:
                    # english-marathi exp-2
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_071119"],
                        sp_model.english_marathi["MARATHI_071119"])
                elif i['id'] == 56:
                    # 09/12/19 Exp-5.6
                    if i['src'].isupper():
                        i['src'] = i['src'].title()
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_5.6"],
                        sp_model.english_hindi["HIN_EXP_5.6"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 8:
                    # ta-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_090120"],
                        sp_model.english_tamil["ENG_090120"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 43:
                    # mr-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["MARATHI_270120"],
                        sp_model.english_marathi["ENG_270120"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 44:
                    # en-mr 3rd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_060220"],
                        sp_model.english_marathi["MARATHI_060220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 45:
                    # en-ta 4th
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["ENG_080220"],
                        sp_model.english_tamil["TAM_080220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 46:
                    # ta-en 2nd
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_100220"],
                        sp_model.english_tamil["ENG_100220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 47:
                    # en-kn 2nd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["ENG_100220"],
                        sp_model.english_kannada["KANNADA_100220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 48:
                    # kn-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["KANNADA_100220"],
                        sp_model.english_kannada["ENG_100220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 49:
                    # en-te 2nd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["ENG_120220"],
                        sp_model.english_telugu["TELUGU_120220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 50:
                    # te-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["TELUGU_120220"],
                        sp_model.english_telugu["ENG_120220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 51:
                    # en-gu 2nd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["ENG_140220"],
                        sp_model.english_gujarati["GUJ_140220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 52:
                    # gu-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["GUJ_140220"],
                        sp_model.english_gujarati["ENG_140220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 53:
                    # en-pa 2nd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["ENG_160220"],
                        sp_model.english_punjabi["PUNJABI_160220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 55:
                    # pa-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["PUNJABI_160220"],
                        sp_model.english_punjabi["ENG_160220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] in [57, 65]:
                    # en-bn 4th
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["ENG_EN_to_BN_4"],
                        sp_model.english_bengali["BENG_EN_to_BN_4"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] in [58, 66]:
                    # bn-en 3rd
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["BENG_BN_to_EN_3"],
                        sp_model.english_bengali["ENG_BN_to_EN_3"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 59:
                    # en-ml 2nd
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["ENG_210220"],
                        sp_model.english_malayalam["MALAYALAM_210220"])
                    translation = sentence_processor.indic_detokenizer(translation)
                elif i['id'] == 60:
                    # ml-en 1st
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["MALAYALAM_210220"],
                        sp_model.english_malayalam["ENG_210220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 61:
                    # ta-en 3rd
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_280220"],
                        sp_model.english_tamil["ENG_280220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 62:
                    # mr-en 2nd
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["MARATHI_280220"],
                        sp_model.english_marathi["ENG_280220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 63:
                    # en-hi exp-13 09-03-20
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_13"],
                        sp_model.english_hindi["HIN_EXP_13"])
                    translation = sentence_processor.indic_detokenizer(translation)
                else:
                    logger.info(
                        "Unsupported model id: {} for given input".format(i['id']))
                    raise Exception(
                        "Unsupported Model ID - id: {} for given input".format(i['id']))
                # translation = (prefix+" "+translation+" "+suffix).strip()
                translation = (prefix + " " + translation).lstrip()
                translation = translation.replace("▁", " ")
                translation = date_url_util.regex_pass(translation, [
                    patterns['p8'], patterns['p9'], patterns['p4'],
                    patterns['p5'], patterns['p6'], patterns['p7']
                ])
                tag_tgt = translation
                translation = date_url_util.replace_tags_with_original_1(
                    translation, date_original, url_original, num_array)
                translation = oc.cleaner(tag_src, translation, i['id'])
            logger.info("trans_function-experiment-{} output: {}".format(
                i['id'], translation))
            logger.info(
                log_with_request_info(i.get("s_id"), LOG_TAGS["output"], translation))
            tgt.append(translation)
            pred_score.append(scores[0])
            sentence_id.append(s_id[0])
            node_id.append(n_id[0])
            input_subwords.append(input_sw)
            output_subwords.append(output_sw)
            tagged_tgt.append(tag_tgt)
            tagged_src.append(tag_src)
        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = [{
            "tgt": tgt[i],
            "pred_score": pred_score[i],
            "s_id": sentence_id[i],
            "input_subwords": input_subwords[i],
            "output_subwords": output_subwords[i],
            "n_id": node_id[i],
            "src": i_src[i],
            "tagged_tgt": tagged_tgt[i],
            "tagged_src": tagged_src[i],
            "save": i_save[i],
            "s0_src": i_s0_src[i],
            "s0_tgt": i_s0_tgt[i]
        } for i in range(len(tgt))]
    except ServerModelError as e:
        out['status'] = statusCode["SEVER_MODEL_ERR"]
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}"
            .format(e, sys.exc_info()[0]))
    except Exception as e:
        out['status'] = statusCode["SYSTEM_ERR"]
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error("Unexpected error:%s and %s" % (e, sys.exc_info()[0]))
    return out
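# Illustrative request/response shape (a sketch with hypothetical values; the
# statusCode entries and model ids come from this module's configuration):
#   translate_func([{"src": "The appeal is allowed.", "id": 56,
#                    "s_id": 1, "n_id": 1}], translation_server)
#   -> {"status": {...}, "response_body": [{"tgt": "...", "pred_score": ...,
#       "tagged_src": "...", "tagged_tgt": "...", ...}]}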
def from_hindi(inputs, translation_server):
    out = {}
    tgt = list()
    pred_score = list()
    sentence_id = list()
    node_id = list()
    input_subwords = list()
    output_subwords = list()
    s_id = [0000]
    n_id = [0000]
    try:
        for i in inputs:
            if all(v in i for v in ['s_id', 'n_id']):
                s_id = [i['s_id']]
                n_id = [i['n_id']]
            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                logger.info("either id or src missing in some input")
                return out
            logger.info("input sentences:{}".format(i['src']))
            i['src'] = i['src'].strip()
            if i['id'] == 3:
                logger.info("translating using the first model")
                translation, scores, n_best, times = translation_server.run([i])
                translation = translation[0]
                input_sw, output_sw = "", ""
            elif i['id'] == 2:
                i['src'] = sentence_processor.indic_tokenizer(i['src'])
                translation, scores, input_sw, output_sw = encode_translate_decode(
                    i, translation_server,
                    sp_model.english_hindi["HIN_220519"],
                    sp_model.english_hindi["ENG_220519"])
                translation = sentence_processor.moses_detokenizer(translation)
                translation = sentence_processor.detruecaser(translation)
            else:
                logger.info(
                    "unsupported model id: {} for given hindi input for translation"
                    .format(i['id']))
                translation = ""
                input_sw, output_sw = "", ""
                scores = [0]
            tgt.append(translation)
            pred_score.append(scores[0])
            sentence_id.append(s_id[0])
            node_id.append(n_id[0])
            input_subwords.append(input_sw)
            output_subwords.append(output_sw)
        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = [{
            "tgt": tgt[i],
            "pred_score": pred_score[i],
            "s_id": sentence_id[i],
            "input_subwords": input_subwords[i],
            "output_subwords": output_subwords[i],
            "n_id": node_id[i]
        } for i in range(len(tgt))]
    except ServerModelError as e:
        out['status'] = statusCode["SEVER_MODEL_ERR"]
        out['status']['why'] = str(e)
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-FROM_HINDI: {} and {}".
            format(e, sys.exc_info()[0]))
    except Exception as e:
        out['status'] = statusCode["SYSTEM_ERR"]
        out['status']['why'] = str(e)
        logger.error("Unexpected error:%s and %s" % (e, sys.exc_info()[0]))
    return out
def doc_translator(translation_server, c_topic):
    logger.info('Kafka utils: document_translator')
    iq = 0
    out = {}
    msg_count = 0
    msg_sent = 0
    c = get_consumer(c_topic)
    p = get_producer()
    try:
        for msg in c:
            producer_topic = [
                topic["producer"] for topic in kafka_topic
                if topic["consumer"] == msg.topic
            ][0]
            logger.info("Producer for current consumer:{} is-{}".format(
                msg.topic, producer_topic))
            msg_count += 1
            logger.info(
                "*******************msg receive count*********:{}".format(msg_count))
            iq = iq + 1
            inputs = msg.value
            if inputs is not None and all(
                    v in inputs for v in ['url_end_point', 'message']) and len(inputs) != 0:
                record_id = inputs.get("record_id")
                logger.info(log_with_record_id(record_id, LOG_TAGS["input"], inputs))
                if inputs['url_end_point'] == 'translation_en':
                    logger.info("Running kafka on {}".format(inputs['url_end_point']))
                    logger.info("Running kafka-translation on {}".format(inputs['message']))
                    out = translate_util.from_en(inputs['message'], translation_server)
                elif inputs['url_end_point'] == 'translation_hi':
                    logger.info("Running kafka on {}".format(inputs['url_end_point']))
                    logger.info("Running kafka-translation on {}".format(inputs['message']))
                    out = translate_util.from_hindi(inputs['message'], translation_server)
                    logger.info("final output kafka-translation_hi:{}".format(out))
                elif inputs['url_end_point'] == "translate-anuvaad":
                    logger.info("Running kafka on {}".format(inputs['url_end_point']))
                    logger.info("Running kafka-translation on {}".format(inputs['message']))
                    out = translate_util.translate_func(inputs['message'], translation_server)
                    logger.info("final output kafka-translate-anuvaad:{}".format(out))
                    logger.info(log_with_record_id(record_id, LOG_TAGS["output"], out))
                else:
                    logger.info("Incorrect url_end_point for KAFKA")
                    out['status'] = statusCode["KAFKA_INVALID_REQUEST"]
                    out['response_body'] = []
                if record_id:
                    out['record_id'] = record_id
            else:
                out = {}
                logger.info(
                    "Null input request or key parameter missing in KAFKA request: document_translator")
            p.send(producer_topic, value={'out': out})
            p.flush()
            msg_sent += 1
            logger.info("*******************msg sent count*********:{}".format(msg_sent))
    except ValueError:
        # includes simplejson.decoder.JSONDecodeError
        logger.error("Decoding JSON has failed in document_translator: %s"
                     % sys.exc_info()[0])
        doc_translator(translation_server, c_topic)
    except Exception as e:
        logger.error("Unexpected error in kafka doc_translator: %s" % sys.exc_info()[0])
        logger.error("error in doc_translator: {}".format(e))
        doc_translator(translation_server, c_topic)
def tag_number_date_url_1(text):
    try:
        if len(text) == 0:
            return "", "", "", "", ""
        resultant_str = list()
        count_date = 0
        date_original = list()
        count_url = 0
        url_original = list()
        count_number = 0
        num_map = list()
        num_array = re.findall(patterns['p3']['regex'], text)
        num_array_orignal = num_array
        i_zero = get_indices_of_num_with_zero_prefix(num_array)
        num_array = list(map(int, num_array))
        zero_prefix_num = [num_array[i] for i in i_zero]
        num_array.sort(reverse=True)
        # num_array = update_num_arr(num_array, zero_prefix_num, i_zero, num_array_orignal)
        for j in num_array:
            text = text.replace(str(j), 'NnUuMm' + str(hindi_numbers[count_number]), 1)
            num_map.append({"no.": j, "tag": 'NnUuMm' + str(hindi_numbers[count_number])})
            count_number += 1
            if count_number > 30:
                print("count exceeding 30")
                count_number = 30
        logger.info("number-tag mappings-{}".format(num_map))
        logger.info("Number tagging done")
        for word in text.split():
            # if len(word) > 4 and len(word) < 12 and token_is_date(word):
            try:
                ext = [".", ",", "?", "!"]
                if word.isalpha() == False and word[:-1].isalpha() == False \
                        and len(word) > 4 and common_utils.token_is_date(word):
                    if word.endswith(tuple(ext)):
                        end_token = word[-1]
                        word = word[:-1]
                        if len(word) < 7 and int(word):
                            word = word + end_token
                        else:
                            date_original.append(word)
                            word = 'DdAaTtEe' + str(count_date) + end_token
                            count_date += 1
                    else:
                        date_original.append(word)
                        word = 'DdAaTtEe' + str(count_date)
                        count_date += 1
                elif common_utils.token_is_url(word):
                    url_original.append(word)
                    word = 'UuRrLl' + str(count_url)
                    count_url += 1
            except Exception as e:
                print(e)
                logger.error("In handle_date_url:tag_num function:{}".format(e))
            resultant_str.append(word)
        s = [str(i) for i in resultant_str]
        res = str(" ".join(s))
        logger.info("tagged response:{} and date:{} and url:{}".format(
            res, date_original, url_original))
        return res, date_original, url_original, num_array, num_map
    except Exception as e:
        logger.error("In handle_date_url:tag_num function parent except block:{}".format(e))
        # return five values to match the success path; num_array may be
        # unbound if the exception occurred before it was assigned
        return text, [], [], [], []
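# Illustrative usage (hypothetical input; assumes common_utils.token_is_url
# recognises the URL and patterns['p3'] finds no number tokens here):
#   tag_number_date_url_1("The order is available at https://example.com")
#   -> ("The order is available at UuRrLl0", [], ["https://example.com"], [], [])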