예제 #1
0
def encode_translate_decode(i, translation_server, sp_encoder, sp_decoder):
    """Encode one input, translate it, and decode the model output.

    ``i['src']`` is overwritten in place with the stringified result of
    ``sp.encode_line`` before ``i`` is passed (as a one-element list) to
    ``translation_server.run``.

    Returns:
        ``(translation, scores, input_sw, output_sw)`` where ``input_sw`` is
        the encoded source, ``output_sw`` is the first raw model output and
        ``translation`` is that output after ``sp.decode_line``.

    Raises:
        Whatever the encoder, server or decoder raised; the error is logged
        first and then re-raised unchanged.
    """
    try:
        logger.info("Inside encode_translate_decode function")

        encoded_src = str(sp.encode_line(sp_encoder, i['src']))
        i['src'] = encoded_src
        logger.info("SP encoded sent: %s" % encoded_src)

        # run() yields four values; only the translations and scores are used.
        model_outputs, scores, _n_best, _times = translation_server.run([i])
        raw_output = model_outputs[0]
        logger.info("output from model: %s" % raw_output)

        decoded = sp.decode_line(sp_decoder, raw_output)
        logger.info("SP decoded sent: %s" % decoded)
        return decoded, scores, encoded_src, raw_output
    except ServerModelError as e:
        message = "ServerModelError error in encode_translate_decode: {} and {}"
        logger.error(message.format(e, sys.exc_info()[0]))
        raise
    except Exception as e:
        message = "Unexpexcted error in encode_translate_decode: {} and {}"
        logger.error(message.format(e, sys.exc_info()[0]))
        raise
예제 #2
0
def purnaviram_applier(src, tgt):
    '''
    For english to hindi translation.

    Makes the translated sentence end with a purnaviram (Devanagari danda,
    U+0964) when the source sentence ends with a full stop.

    src: source sentence.
    tgt: translated sentence; returned unchanged when it is None or blank,
         when src has fewer than 5 whitespace-separated tokens, when src does
         not end with '.', or when tgt already ends with a purnaviram.

    Returns tgt with a trailing '.' replaced by, or a missing terminator
    completed with, the purnaviram. On any error the incoming tgt is returned.
    '''
    # Written as an escape so the character cannot be corrupted by a bad
    # re-encoding of this file (it previously appeared as a 3-character
    # mojibake sequence, which made the "already terminated" checks fail).
    purnaviram = "\u0964"
    try:
        if tgt is None or len(tgt.split()) == 0:
            return tgt
        if len(src.split()) < 5:
            return tgt
        if not src.endswith('.') or tgt.endswith(purnaviram):
            return tgt

        if tgt.endswith('.'):
            logger.info("Replacing '.' with purnaviram")
            return tgt[:-1] + purnaviram

        logger.info("Adding the missing purnaviram")
        return tgt + purnaviram

    except Exception as e:
        logger.error(
            "Error in purnaviram applier, returning original tgt: {}".format(
                e))
        return tgt
예제 #3
0
def update_num_arr(num_array,zero_prefix_num,i_zero,num_array_orignal):
  '''
  Restore zero-prefixed numbers (e.g. 09 or 000) that were reduced to 9 or 0 during processing.

  num_array: list to patch; it is modified in place and returned.
  zero_prefix_num: the reduced values to look for in num_array (duplicates are collapsed with np.unique).
  i_zero: indices into num_array_orignal of the zero-prefixed originals.
  num_array_orignal: list holding the original, zero-prefixed forms.

  Every position of num_array equal to one of the unique zero_prefix_num values is collected
  (grouped by value in np.unique order); the k-th collected position then receives
  num_array_orignal[i_zero[k]].

  On any error a copy of num_array taken on entry is returned (None if even that copy failed).

  Note: this function needs some fixing
  '''
  snapshot = None
  try:
    snapshot = num_array[:]

    targets = np.unique(np.array(zero_prefix_num))
    positions = [
      pos
      for wanted in targets
      for pos, value in enumerate(num_array)
      if value == wanted
    ]

    for rank, pos in enumerate(positions):
      num_array[pos] = num_array_orignal[i_zero[rank]]
    return num_array
  except Exception as e:
    logger.error("Error in handle_date_url:update_num_arr,returning incoming num_array:{}".format(e))
    return snapshot
예제 #4
0
def regex_pass(text,regex_list):
  '''
  Apply a sequence of regex substitutions to text, in order.

  regex_list: iterable of mappings, each with a 'regex' and a 'replacement' entry
  that are handed to re.sub.

  Returns the rewritten text. If a substitution raises, the error is logged and
  the text as it stood after the last successful substitution is returned.
  '''
  try:
    for rule in regex_list:
      regex, replacement = rule['regex'], rule['replacement']
      text = re.sub(regex, replacement, text)
  except Exception as e:
    logger.error("Error in regex_pass: handle_date_url function:{}".format(e))
  return text
예제 #5
0
def get_producer():
    """Create a KafkaProducer for the configured bootstrap servers.

    The module-level ``bootstrap_server`` value is stringified and split on
    commas to obtain the server list; outgoing values are serialized as
    UTF-8 encoded JSON.

    Returns:
        The producer, or None if creating it raised (the error is logged).
    """

    def _to_json_bytes(value):
        return json.dumps(value).encode('utf-8')

    try:
        servers = str(bootstrap_server).split(",")
        kafka_producer = KafkaProducer(bootstrap_servers=servers,
                                       value_serializer=_to_json_bytes)
        logger.info('get_producer : producer returned successfully')
        return kafka_producer
    except Exception as e:
        failure = 'get_producer : ERROR OCCURRED while creating producer, ERROR =  '
        logger.error(failure + str(e))
        return None
예제 #6
0
def replace_tags_with_original_1(text,date_original,url_original,num_array):
  '''
  Put the original dates, urls and numbers back in place of their placeholder tags.

  text: tagged text. Words of the form 'DdAaTtEe<d>' / 'UuRrLl<d>' are replaced by
        date_original[<d>] / url_original[<d>], where <d> is the word's last character
        parsed as an int (so only a single trailing digit is read).
  num_array: values for 'NnUuMm' tags. The one or two characters following 'NnUuMm'
        are looked up in the module-level hindi_numbers sequence, and the position
        found there indexes num_array.

  Returns the text with tags replaced. A number tag that cannot be resolved is removed.
  Empty text gives ""; whitespace-only text is returned unchanged. On any other error
  the incoming tagged text is returned.
  '''
  try:
    if len(text) == 0:
      return ""

    words = text.split()
    if not words:
      # Whitespace-only input has no tags. Returning it explicitly avoids the
      # unbound-result error this case used to trigger.
      return text

    restored = list()
    for word in words:
      if word[:-1] == 'DdAaTtEe' and len(date_original) > 0:
        word = date_original[int(word[-1])]
      elif word[:-1] == 'UuRrLl' and len(url_original)> 0 :
        word = url_original[int(word[-1])]
      restored.append(str(word))
    # Join once, after the loop, instead of rebuilding the string for every word.
    res = " ".join(restored)

    logger.info("response after url and date replacemnt:{}".format(res))
    array = re.findall(r'NnUuMm..|NnUuMm.', res)
    logger.info("NnUuMm array after translation:{}".format(array))
    for j in array:
      # Text actually replaced in res; shortened below when the match's last
      # character is not part of the tag.
      tag = j
      try:
        if j[-2:] in hindi_numbers:
          end_hin_number = j[-2:]
        elif j[:-1]== "NnUuMm":
          end_hin_number = j[-1]
        else:
          # Two characters were matched but only the first belongs to the tag.
          end_hin_number = j[-2]
          tag = j[:-1]
        index = hindi_numbers.index(end_hin_number)
        res = res.replace(tag,str(num_array[index]),1)

      except Exception as e:
        logger.info("inside str.replace error,but handling it:{}".format(e))
        res = res.replace(tag,"",1)

    logger.info("response after tags replacement:{}".format(res))
    return res
  except Exception as e:
    logger.error("Error in parent except block of replace_tags_with_original_1 function, returning tagged output:{}".format(e))
    return text
예제 #7
0
def decode_line(load_model,line):
    """Decode a stringified list of SentencePiece pieces back into text.

    A fresh SentencePieceProcessor is created and ``load_model`` is loaded
    into it on every call. ``line`` is normalised so that it has exactly one
    opening and one closing square bracket, "<unk>" markers are removed, and
    the result is evaluated into a Python object that is passed to
    ``DecodePieces``.

    Returns the decoded string, or "" if anything raised (the error is logged).
    """
    try:
        processor = spm.SentencePieceProcessor()
        processor.load(load_model)

        # Guarantee the outer brackets, then drop any brackets found inside.
        if not line.startswith("["):
            line = "[" + line
        if not line.endswith("]"):
            line = line + "]"
        inner = line[1:-1].replace('[', "").replace(']', "")
        line = line[0] + inner + line[-1]

        logger.info("decoding using sp model {}".format(load_model))
        line = line.replace("<unk>", "")

        # NOTE(review): eval() executes arbitrary expressions, and `line` is
        # presumably model output. ast.literal_eval would be a safer parser
        # for a list literal; confirm the expected format and replace.
        pieces = eval(line)
        return processor.DecodePieces(pieces)
    except Exception as e:
        logger.error("something went wrong! {}".format(e))
        logger.error("Unexpected error: %s"% sys.exc_info()[0])
        return ""
예제 #8
0
def maybe_load_vocab(corpus_type, counters, opt):
    """Load pre-built vocabulary fields for the training corpus, if configured.

    Only acts when ``corpus_type`` equals ``config.train``. A non-empty
    ``opt.src_vocab`` is loaded with ``torch.load``; if that raises an
    UnpicklingError the fields stay None (the text-file fallback via
    ``_load_vocab`` is currently disabled). A non-empty ``opt.tgt_vocab`` is
    only reported as an error. ``counters`` is not used.

    Returns:
        ``(src_vocab, tgt_vocab, existing_fields)``; the first two are
        always None.
    """
    loaded_fields = None

    if corpus_type == config.train:
        wants_src = opt.src_vocab != ""
        if wants_src:
            try:
                logger.info("Using existing vocabulary...")
                loaded_fields = torch.load(opt.src_vocab)
            except torch.serialization.pickle.UnpicklingError:
                # Not a pickled fields file; building from text is disabled.
                logger.info("Building vocab from text file...")

        wants_tgt = opt.tgt_vocab != ""
        if wants_tgt:
            logger.error("opt.tgt_vocab 不为空")

    # Neither text vocabulary is ever built here, so both slots are None.
    return None, None, loaded_fields
예제 #9
0
def fullstop_applier(src, tgt):
    '''
    For non-hindi translation pair.

    Appends a full stop to the translation when the source sentence ends with
    one and the translation does not.

    src: source sentence.
    tgt: translated sentence; returned unchanged when it is None or empty,
         when src has fewer than 5 whitespace-separated tokens, when src does
         not end with '.', or when tgt already ends with '.'.

    Returns tgt, possibly with '.' appended. On any error the incoming tgt is
    returned.
    '''
    try:
        # Same up-front guard as purnaviram_applier: a missing or empty
        # translation has nothing to fix and should not hit the error path.
        if not tgt:
            return tgt
        if len(src.split()) < 5:
            return tgt
        if src.endswith('.') and not tgt.endswith('.'):
            logger.info("Adding the missing fullstop")
            return tgt + "."
        return tgt

    except Exception as e:
        logger.error(
            "Error in fullstop_applier, returning original tgt: {}".format(e))
        return tgt
def prefix_handler(text):
    '''
    Split a leading numeric prefix off an input, e.g. 1., 12.1, (1.), (12.1), 1, (12).

    Only the first whitespace-separated token is inspected. It counts as a
    prefix when it matches the 'regex' of patterns['p10'] or patterns['p11'].

    Returns (prefix, text): the prefix token and the remaining tokens joined
    by single spaces, or ("", text) with text untouched when the first token
    is not a prefix. On any error (including empty input) ("", text) is
    returned.
    '''
    try:
        words = text.split()
        first_word = words[0]
        candidates = [patterns['p10'], patterns['p11']]
        hits = [re.match(candidate['regex'], first_word)
                for candidate in candidates]

        prefix = ""
        if any(hit is not None for hit in hits):
            prefix = first_word
            text = str(" ".join(words[1:]))

        logger.info("Returning from prefix_handler")
        return prefix, text
    except Exception as e:
        message = "Error in prefix handler, returning original text,error:{}"
        logger.error(message.format(e))
        return "", text
예제 #11
0
def get_consumer(topics):
    """Create a KafkaConsumer subscribed to ``topics``.

    The module-level ``bootstrap_server`` value is stringified and split on
    commas to obtain the server list; incoming values are decoded as UTF-8
    and parsed as JSON. Only the bootstrap servers and the deserializer are
    configured on the consumer.

    Returns:
        The subscribed consumer, or None if creating or subscribing raised
        (the error is logged).
    """

    def _from_json_bytes(raw):
        return json.loads(raw.decode('utf-8'))

    try:
        servers = str(bootstrap_server).split(",")
        kafka_consumer = KafkaConsumer(bootstrap_servers=servers,
                                       value_deserializer=_from_json_bytes)
        kafka_consumer.subscribe(topics)

        success = 'get_consumer : consumer returned for topics:{}'
        logger.info(success.format(topics))
        return kafka_consumer
    except Exception as e:
        failure = 'ERROR OCCURRED for getting consumer with topics:{}'
        logger.error(failure.format(topics))
        logger.error('get_consumer : ERROR = ' + str(e))
        return None
예제 #12
0
def from_en(inputs, translation_server):
    """Translate a batch of English inputs and build the response dict.

    Args:
        inputs: iterable of dicts. Each must contain 'src' and 'id'; 's_id'
            and 'n_id' are optional and, when both are present, replace the
            ids carried over from the previous input (initially 0).
            ``i['src']`` is modified in place while processing.
        translation_server: object passed through to
            ``encode_translate_decode``.

    Returns:
        dict with a 'status' entry taken from ``statusCode`` and, on success,
        a 'response_body' list with one dict per input holding 'tgt',
        's_id', 'input_subwords', 'output_subwords', 'n_id' and 'pred_score'.
        If any input lacks 'src' or 'id' only the ID_OR_SRC_MISSING status is
        returned. Exceptions are logged and reported through the status
        instead of being raised.
    """
    out = {}
    response_body = []
    s_id, n_id = 0, 0
    try:
        for i in inputs:
            if all(v in i for v in ['s_id', 'n_id']):
                s_id = i['s_id']
                n_id = i['n_id']

            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                logger.info("either id or src missing in some input")
                return out

            logger.info("input sentences:{}".format(i['src']))
            i['src'] = i['src'].strip()
            if ancillary_functions.special_case_fits(i['src']):
                logger.info(
                    "sentence fits in special case, returning accordingly and not going to model"
                )
                translation = ancillary_functions.handle_special_cases(
                    i['src'], i['id'])
                scores = [1]
                input_sw, output_sw = "", ""
            else:
                logger.info("translating using NMT-model:{}".format(i['id']))
                logger.info("translating this sentences:{}".format(i['src']))
                if i['id'] == 1:
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_220519"],
                        sp_model.english_hindi["HIN_220519"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                    logger.info(
                        "final output from model-1: {}".format(translation))
                else:
                    # Unknown model id: report it at both levels and emit an
                    # empty translation with a zero score for this input.
                    logger.info(
                        "unsupported model id: {} for given english translation"
                        .format(i['id']))
                    logger.error(
                        "unsupported model id: {} for given english translation"
                        .format(i['id']))
                    translation, input_sw, output_sw, scores = "", "", "", [0]

            translation = ancillary_functions.replace_hindi_numbers(
                translation)
            response_body.append({
                "tgt": translation,
                "s_id": s_id,
                "input_subwords": input_sw,
                "output_subwords": output_sw,
                "n_id": n_id,
                "pred_score": scores[0]
            })

        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = response_body
    except ServerModelError as e:
        # Copy the status entry before adding 'why'; writing into the shared
        # statusCode table would leak this error text into later responses.
        # NOTE(review): assumes statusCode entries are plain dicts - confirm.
        status = dict(statusCode["SEVER_MODEL_ERR"])
        status['why'] = str(e)
        out['status'] = status
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-FROM_ENGLISH: {} and {}".
            format(e,
                   sys.exc_info()[0]))
    except Exception as e:
        out['status'] = statusCode["SYSTEM_ERR"]
        logger.error(
            "Unexpected error in translate_util from_eng function: %s and %s" %
            (e, sys.exc_info()[0]))

    return out
예제 #13
0
def translate_func(inputs, translation_server):

    inputs = inputs
    out = {}
    pred_score = list()
    sentence_id, node_id = list(), list()
    input_subwords, output_subwords = list(), list()
    i_src, tgt = list(), list()
    tagged_tgt, tagged_src = list(), list()
    s_id, n_id = [0000], [0000]
    i_s0_src, i_s0_tgt, i_save = list(), list(), list()

    try:
        for i in inputs:
            s0_src, s0_tgt, save = "NA", "NA", False
            logger.info(
                log_with_request_info(i.get("s_id"), LOG_TAGS["input"], i))
            if all(v in i for v in ['s_id', 'n_id']):
                s_id = [i['s_id']]
                n_id = [i['n_id']]

            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                out['response_body'] = []
                logger.info("either id or src missing in some input")
                return (out)

            if any(v in i for v in ['s0_src', 's0_tgt', 'save']):
                s0_src, s0_tgt, save = handle_custome_input(
                    i, s0_src, s0_tgt, save)

            i_s0_src.append(s0_src), i_s0_tgt.append(s0_tgt), i_save.append(
                save)

            logger.info("input sentences:{}".format(i['src']))
            i_src.append(i['src'])
            i['src'] = i['src'].strip()
            if ancillary_functions.special_case_fits(i['src']):
                logger.info(
                    "sentence fits in special case, returning accordingly and not going to model"
                )
                translation = ancillary_functions.handle_special_cases(
                    i['src'], i['id'])
                scores = [1]
                input_sw, output_sw, tag_tgt, tag_src = "", "", translation, i[
                    'src']

            else:
                logger.info("translating using NMT-model:{}".format(i['id']))
                # prefix,suffix, i['src'] = ancillary_functions.separate_alphanumeric_and_symbol(i['src'])
                prefix, i['src'] = ancillary_functions.prefix_handler(i['src'])
                i['src'], date_original, url_original, num_array, num_map = date_url_util.tag_number_date_url_1(
                    i['src'])
                tag_src = (prefix + " " + i['src']).lstrip()
                if i['id'] == 5:
                    "hi-en exp-1"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.hindi_english["HIN_EXP_1_291019"],
                        sp_model.hindi_english["ENG_EXP_1_291019"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 6:
                    "hi-en_exp-2 05-05-20"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.hindi_english["HIN_EXP_2_050520"],
                        sp_model.hindi_english["ENG_EXP_2_050520"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)

                elif i['id'] == 7:
                    "english-tamil"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["ENG_230919"],
                        sp_model.english_tamil["TAM_230919"])
                elif i['id'] == 10:
                    "english-gujrati"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["ENG_100919"],
                        sp_model.english_gujarati["GUJ_100919"])
                    translation = translation.replace(
                        "ન્યાય માટે Accessક્સેસને", "ન્યાયની પહોંચને")
                elif i['id'] == 11:
                    "english-bengali"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["ENG_120919"],
                        sp_model.english_bengali["BENG_120919"])
                elif i['id'] == 12:
                    "english-marathi"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_140919"],
                        sp_model.english_marathi["MARATHI_140919"])

                elif i['id'] == 15:
                    "english-kannada"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["ENG_200919"],
                        sp_model.english_kannada["KANNADA_200919"])
                    translation = translation.replace("uc", "")
                elif i['id'] == 16:
                    "english-telgu"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["ENG_200919"],
                        sp_model.english_telugu["TELGU_200919"])
                elif i['id'] == 17:
                    "english-malayalam"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["ENG_200919"],
                        sp_model.english_malayalam["MALAYALAM_200919"])
                elif i['id'] == 18:
                    "english-punjabi"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["ENG_200919"],
                        sp_model.english_punjabi["PUNJABI_200919"])
                elif i['id'] == 21:
                    "exp-1 BPE model with varying vocab size 15k for both hindi and english +tokenization"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_1"],
                        sp_model.english_hindi["HIN_EXP_1"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 30:
                    "25/10/2019 experiment 10, Old data + dictionary,BPE-24k, nolowercasing,pretok,shuffling,50k nmt"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_10"],
                        sp_model.english_hindi["HIN_EXP_10"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 32:
                    "29/10/2019 Exp-12: old_data_original+lc_cleaned+ ik names translated from google(100k)+shabdkosh(appended 29k new),BPE-24K,50knmt,shuff,pretok"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_12"],
                        sp_model.english_hindi["HIN_EXP_12"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 54:
                    "29-30/10/19Exp-5.4: -data same as 5.1 exp...old data+ india kanoon 830k(including 1.5 lakhs names n no learned counsel)+72192k shabkosh, BPE 24k, nolowercasing,pretok,shuffling"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_5.4"],
                        sp_model.english_hindi["HIN_EXP_5.4"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 42:
                    "english-marathi exp-2"
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_071119"],
                        sp_model.english_marathi["MARATHI_071119"])
                elif i['id'] == 56:
                    "09/12/19-Exp-5.6:"
                    if i['src'].isupper():
                        i['src'] = i['src'].title()
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_5.6"],
                        sp_model.english_hindi["HIN_EXP_5.6"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 8:
                    "ta-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_090120"],
                        sp_model.english_tamil["ENG_090120"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 43:
                    "mr-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["MARATHI_270120"],
                        sp_model.english_marathi["ENG_270120"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 44:
                    "eng-mr-3rd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["ENG_060220"],
                        sp_model.english_marathi["MARATHI_060220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 45:
                    "en-ta 4th"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["ENG_080220"],
                        sp_model.english_tamil["TAM_080220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 46:
                    "ta-en 2nd"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_100220"],
                        sp_model.english_tamil["ENG_100220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 47:
                    "en-kn 2nd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["ENG_100220"],
                        sp_model.english_kannada["KANNADA_100220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 48:
                    "kn-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_kannada["KANNADA_100220"],
                        sp_model.english_kannada["ENG_100220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 49:
                    "en-tel 2nd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["ENG_120220"],
                        sp_model.english_telugu["TELUGU_120220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 50:
                    "tel-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_telugu["TELUGU_120220"],
                        sp_model.english_telugu["ENG_120220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 51:
                    "en-guj 2nd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["ENG_140220"],
                        sp_model.english_gujarati["GUJ_140220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 52:
                    "guj-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_gujarati["GUJ_140220"],
                        sp_model.english_gujarati["ENG_140220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 53:
                    "en-punjabi 2nd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["ENG_160220"],
                        sp_model.english_punjabi["PUNJABI_160220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 55:
                    "punjabi-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_punjabi["PUNJABI_160220"],
                        sp_model.english_punjabi["ENG_160220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] in [57, 65]:
                    "en-bengali 4th"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["ENG_EN_to_BN_4"],
                        sp_model.english_bengali["BENG_EN_to_BN_4"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] in [58, 66]:
                    "bengali-en 3rd"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_bengali["BENG_BN_to_EN_3"],
                        sp_model.english_bengali["ENG_BN_to_EN_3"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 59:
                    "en-malay 2nd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["ENG_210220"],
                        sp_model.english_malayalam["MALAYALAM_210220"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                elif i['id'] == 60:
                    "malay-en 1st"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_malayalam["MALAYALAM_210220"],
                        sp_model.english_malayalam["ENG_210220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 61:
                    "ta-to-en 3rd"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_tamil["TAM_280220"],
                        sp_model.english_tamil["ENG_280220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 62:
                    "mr-to-en 2nd"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_marathi["MARATHI_280220"],
                        sp_model.english_marathi["ENG_280220"])
                    translation = sentence_processor.moses_detokenizer(
                        translation)
                elif i['id'] == 63:
                    "en-hi exp-13 09-03-20"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
                    translation, scores, input_sw, output_sw = encode_translate_decode(
                        i, translation_server,
                        sp_model.english_hindi["ENG_EXP_13"],
                        sp_model.english_hindi["HIN_EXP_13"])
                    translation = sentence_processor.indic_detokenizer(
                        translation)
                else:
                    logger.info(
                        "Unsupported model id: {} for given input".format(
                            i['id']))
                    raise Exception(
                        "Unsupported Model ID - id: {} for given input".format(
                            i['id']))

                # translation = (prefix+" "+translation+" "+suffix).strip()
                translation = (prefix + " " + translation).lstrip()
                translation = translation.replace("▁", " ")
                translation = date_url_util.regex_pass(translation, [
                    patterns['p8'], patterns['p9'], patterns['p4'],
                    patterns['p5'], patterns['p6'], patterns['p7']
                ])
                tag_tgt = translation
                translation = date_url_util.replace_tags_with_original_1(
                    translation, date_original, url_original, num_array)
                translation = oc.cleaner(tag_src, translation, i['id'])
            logger.info("trans_function-experiment-{} output: {}".format(
                i['id'], translation))
            logger.info(
                log_with_request_info(i.get("s_id"), LOG_TAGS["output"],
                                      translation))
            tgt.append(translation)
            pred_score.append(scores[0])
            sentence_id.append(s_id[0]), node_id.append(n_id[0])
            input_subwords.append(input_sw), output_subwords.append(output_sw)
            tagged_tgt.append(tag_tgt), tagged_src.append(tag_src)

        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = [{
            "tgt": tgt[i],
            "pred_score": pred_score[i],
            "s_id": sentence_id[i],
            "input_subwords": input_subwords[i],
            "output_subwords": output_subwords[i],
            "n_id": node_id[i],
            "src": i_src[i],
            "tagged_tgt": tagged_tgt[i],
            "tagged_src": tagged_src[i],
            "save": i_save[i],
            "s0_src": i_s0_src[i],
            "s0_tgt": i_s0_tgt[i]
        } for i in range(len(tgt))]
    except ServerModelError as e:
        out['status'] = statusCode["SEVER_MODEL_ERR"]
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-translate_func: {} and {}"
            .format(e,
                    sys.exc_info()[0]))
    except Exception as e:
        out['status'] = statusCode["SYSTEM_ERR"]
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error("Unexpected error:%s and %s" % (e, sys.exc_info()[0]))

    return (out)
예제 #14
0
def from_hindi(inputs, translation_server):
    """Translate a batch of Hindi inputs and package the results.

    Each element of ``inputs`` is a dict that must contain ``src`` (text to
    translate) and ``id`` (model id). ``s_id`` and ``n_id`` are optional and
    are echoed back only when both are present; otherwise ``0`` is reported
    for both.

    Model id 3 sends the stripped text straight to
    ``translation_server.run``. Model id 2 passes the text through
    ``sentence_processor.indic_tokenizer``, ``encode_translate_decode``,
    ``moses_detokenizer`` and ``detruecaser``. Any other id produces an empty
    translation with a score of 0 (the call still reports SUCCESS).

    Args:
        inputs: iterable of request dicts as described above. ``src`` is
            overwritten in place with the stripped/tokenized text.
        translation_server: object exposing ``run(list_of_inputs)``.

    Returns:
        dict: always carries ``status``. ``response_body`` holds one entry
        per input on success and is an empty list when an exception was
        caught. When an input lacks ``src`` or ``id`` only ``status`` is set.
    """
    out = {}
    results = []

    try:
        for i in inputs:
            # Resolve the ids per input so a sentence without ids is never
            # labelled with the ids of an earlier sentence in the batch.
            if all(v in i for v in ['s_id', 'n_id']):
                s_id, n_id = i['s_id'], i['n_id']
            else:
                s_id, n_id = 0, 0

            if any(v not in i for v in ['src', 'id']):
                out['status'] = statusCode["ID_OR_SRC_MISSING"]
                logger.info("either id or src missing in some input")
                return out

            logger.info("input sentences:{}".format(i['src']))
            i['src'] = i['src'].strip()
            if i['id'] == 3:
                logger.info("translating using the first model")
                translation, scores, _, _ = translation_server.run([i])
                translation = translation[0]
                input_sw, output_sw = "", ""
            elif i['id'] == 2:
                i['src'] = sentence_processor.indic_tokenizer(i['src'])
                translation, scores, input_sw, output_sw = encode_translate_decode(
                    i, translation_server,
                    sp_model.english_hindi["HIN_220519"],
                    sp_model.english_hindi["ENG_220519"])
                translation = sentence_processor.moses_detokenizer(
                    translation)
                translation = sentence_processor.detruecaser(translation)
            else:
                logger.info(
                    "unsupported model id: {} for given hindi input for translation"
                    .format(i['id']))
                translation = ""
                input_sw, output_sw = "", ""
                scores = [0]

            results.append({
                "tgt": translation,
                "pred_score": scores[0],
                "s_id": s_id,
                "input_subwords": input_sw,
                "output_subwords": output_sw,
                "n_id": n_id
            })

        out['status'] = statusCode["SUCCESS"]
        out['response_body'] = results
    except ServerModelError as e:
        # Copy before annotating: writing 'why' into the shared statusCode
        # entry would alter the module-level constant and every response
        # that already references it.
        out['status'] = dict(statusCode["SEVER_MODEL_ERR"])
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error(
            "ServerModelError error in TRANSLATE_UTIL-FROM_HINDI: {} and {}".
            format(e,
                   sys.exc_info()[0]))
    except Exception as e:
        out['status'] = dict(statusCode["SYSTEM_ERR"])
        out['status']['why'] = str(e)
        out['response_body'] = []
        logger.error("Unexpected error:%s and %s" % (e, sys.exc_info()[0]))

    return out
예제 #15
0
def doc_translator(translation_server, c_topic):
    """Consume translation requests from Kafka and publish the results.

    For every message read from the consumer created for ``c_topic`` the
    matching producer topic is looked up in ``kafka_topic``, the request is
    dispatched on its ``url_end_point`` (``translation_en``,
    ``translation_hi`` or ``translate-anuvaad``) to the corresponding
    ``translate_util`` function, and ``{'out': result}`` is sent to the
    producer topic and flushed. Requests that are null or lack
    ``url_end_point``/``message`` produce an empty result dict.

    If handling raises, the error is logged and consumption restarts with a
    fresh consumer and producer. The restart is a loop rather than a
    recursive call so repeated failures cannot exhaust the call stack.

    Args:
        translation_server: passed through to the ``translate_util`` calls.
        c_topic: passed to ``get_consumer`` to build the consumer.

    Returns:
        None, once the consumer iterator is exhausted without an error.
    """
    while True:
        logger.info('Kafka utils: document_translator')
        msg_count = 0
        msg_sent = 0
        c = get_consumer(c_topic)
        p = get_producer()
        try:
            for msg in c:
                producer_topic = [
                    topic["producer"] for topic in kafka_topic
                    if topic["consumer"] == msg.topic
                ][0]
                logger.info("Producer for current consumer:{} is-{}".format(
                    msg.topic, producer_topic))
                msg_count += 1
                logger.info(
                    "*******************msg receive count*********:{}".format(
                        msg_count))
                inputs = msg.value
                # Fresh dict per message so a reply can never inherit keys
                # (e.g. record_id) from the previous message's result.
                out = {}

                if inputs is not None and all(
                        v in inputs
                        for v in ['url_end_point', 'message']) and len(
                            inputs) != 0:
                    record_id = inputs.get("record_id")
                    logger.info(
                        log_with_record_id(record_id, LOG_TAGS["input"],
                                           inputs))
                    if inputs['url_end_point'] == 'translation_en':
                        logger.info("Running kafka on  {}".format(
                            inputs['url_end_point']))
                        logger.info("Running kafka-translation on  {}".format(
                            inputs['message']))
                        out = translate_util.from_en(inputs['message'],
                                                     translation_server)
                    elif inputs['url_end_point'] == 'translation_hi':
                        logger.info("Running kafka on  {}".format(
                            inputs['url_end_point']))
                        logger.info("Running kafka-translation on  {}".format(
                            inputs['message']))
                        out = translate_util.from_hindi(
                            inputs['message'], translation_server)
                        logger.info(
                            "final output kafka-translation_hi:{}".format(out))
                    elif inputs['url_end_point'] == "translate-anuvaad":
                        logger.info("Running kafka on  {}".format(
                            inputs['url_end_point']))
                        logger.info("Running kafka-translation on  {}".format(
                            inputs['message']))
                        out = translate_util.translate_func(
                            inputs['message'], translation_server)
                        logger.info(
                            "final output kafka-translate-anuvaad:{}".format(
                                out))
                        logger.info(
                            log_with_record_id(record_id, LOG_TAGS["output"],
                                               out))
                    else:
                        logger.info("Incorrect url_end_point for KAFKA")
                        out['status'] = statusCode["KAFKA_INVALID_REQUEST"]
                        out['response_body'] = []

                    if record_id:
                        out['record_id'] = record_id

                else:
                    logger.info(
                        "Null input request or key parameter missing in KAFKA request: document_translator"
                    )

                p.send(producer_topic, value={'out': out})
                p.flush()
                msg_sent += 1
                logger.info(
                    "*******************msg sent count*********:{}".format(
                        msg_sent))
            # Consumer iterator finished cleanly: nothing left to do.
            return
        except ValueError:
            # Includes simplejson.decoder.JSONDecodeError.
            logger.error(
                "Decoding JSON has failed in document_translator: %s" %
                sys.exc_info()[0])
        except Exception as e:
            logger.error("Unexpected error in kafak doc_translator: %s" %
                         sys.exc_info()[0])
            logger.error("error in doc_translator: {}".format(e))
예제 #16
0
def tag_number_date_url_1(text):
    """Replace numbers, dates and URLs in ``text`` with placeholder tags.

    Numbers matched by ``patterns['p3']['regex']`` are converted to ``int``
    and replaced, largest value first, by ``'NnUuMm' + hindi_numbers[k]``
    (first occurrence only per number). The text is then split on
    whitespace; tokens accepted by ``common_utils.token_is_date`` become
    ``'DdAaTtEe<k>'`` and tokens accepted by ``common_utils.token_is_url``
    become ``'UuRrLl<k>'``. A token that raises while being inspected is
    kept exactly as it appeared in the text.

    Args:
        text: sentence to tag.

    Returns:
        tuple: ``(tagged_text, date_original, url_original, num_array,
        num_map)``. Every path returns five values:
          * empty ``text`` yields five empty strings;
          * on an unexpected failure the text as it stood at that point is
            returned with empty date/URL lists and whatever number data had
            been collected.
    """
    # Bound before the try block so the fallback return below can always
    # reference them, even when the failure happens on the very first line.
    num_array = []
    num_map = []
    try:
        if len(text) == 0:
            return "", "", "", "", ""

        tagged_words = []
        date_original = []
        url_original = []
        count_date = 0
        count_url = 0
        count_number = 0
        end_marks = (".", ",", "?", "!")

        # NOTE: int() drops leading zeros, so a number such as "09" is
        # tagged through its "9" and the "0" stays in the text.
        num_array = sorted(
            map(int, re.findall(patterns['p3']['regex'], text)),
            reverse=True)

        for number in num_array:
            tag = 'NnUuMm' + str(hindi_numbers[count_number])
            text = text.replace(str(number), tag, 1)
            num_map.append({"no.": number, "tag": tag})
            count_number += 1
            if count_number > 30:
                # Tags beyond index 30 are not available; reuse the last one.
                print("count exceeding 30")
                count_number = 30

        logger.info("number-tag mappings-{}".format(num_map))
        logger.info("Number tagging done")
        for token in text.split():
            word = token
            try:
                if (not word.isalpha() and not word[:-1].isalpha()
                        and len(word) > 4
                        and common_utils.token_is_date(word)):
                    if word.endswith(end_marks):
                        end_token = word[-1]
                        word = word[:-1]
                        if len(word) < 7 and int(word):
                            # Short plain integer followed by punctuation:
                            # leave it untouched rather than tag it as a date.
                            word = word + end_token
                        else:
                            date_original.append(word)
                            word = 'DdAaTtEe' + str(count_date) + end_token
                            count_date += 1
                    else:
                        date_original.append(word)
                        word = 'DdAaTtEe' + str(count_date)
                        count_date += 1
                elif common_utils.token_is_url(word):
                    url_original.append(word)
                    word = 'UuRrLl' + str(count_url)
                    count_url += 1
            except Exception as e:
                print(e)
                logger.error("In handle_date_url:tag_num function:{}".format(e))
                # Fall back to the untouched token so trailing punctuation
                # stripped above is not silently lost.
                word = token

            tagged_words.append(word)

        # Joined once after the loop; also defined when text has no tokens.
        res = " ".join(tagged_words)
        logger.info("tagged response:{} and date:{} and url:{}".format(res,date_original,url_original))
        return res, date_original, url_original, num_array, num_map
    except Exception as e:
        logger.error("In handle_date_url:tag_num function parent except block:{}".format(e))
        return text, [], [], num_array, num_map