def handle_single_token(token):
    """Decide whether a single token can bypass the model.

    Args:
        token: The token text to inspect.

    Returns:
        ``token`` unchanged when ``isfloat`` accepts it, when
        ``util.token_is_date`` accepts it, when it is a three-character token
        whose first character is not alphanumeric and whose second one is, or
        when it is a single character. Returns ``""`` in every other case,
        including when one of the checks raises, so that the caller lets the
        token go to the model.
    """
    try:
        if isfloat(token):
            return token
        if util.token_is_date(token):
            print("returning date")
            return token
        if len(token) > 1 and token_is_alphanumeric_char(token):
            if (len(token) == 3 and not token[0].isalnum()
                    and token[1].isalnum()):
                return token
            print("token is alphanumericchar: ", token)
            # Transliterating the alphanumeric part is currently disabled, so
            # the split pieces are computed but not used and the token is
            # still handed to the model.
            _prefix, _suffix, _translation_text = (
                separate_alphanumeric_and_symbol(token))
            return ""
        if len(token) == 1:
            print("handling single token and returning character as it is")
            return token
        logger.info("returning null to allow token to go to model")
        return ""
    except Exception:
        # Only ordinary errors are treated as "send it to the model";
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        logger.info("returning null to allow token to go to model")
        return ""
def special_case_fits(text):
    """Report whether ``text`` matches one of the special cases.

    The conditions are the same ones ``handle_special_cases`` branches on:
    empty text, text accepted by ``util.token_is_date``, a single token
    accepted by ``util.token_is_url``, or a single token for which
    ``handle_single_token`` returns a non-empty string.

    Args:
        text: The source text to inspect.

    Returns:
        ``True`` when one of the conditions holds, ``False`` otherwise.
    """
    if len(text) == 0:
        return True
    if util.token_is_date(text):
        return True
    if len(text.split()) == 1:
        # A lone URL is returned as-is; any other lone token qualifies only
        # when handle_single_token produces a non-empty result for it.
        if util.token_is_url(text) or len(handle_single_token(text)) > 0:
            return True
    return False
def handle_special_cases(text, model_id):
    """Resolve inputs that are handled before the model is called.

    Args:
        text: The source text of the request.
        model_id: Identifier of the model. Values 1 and 13 select Hindi month
            names and 7 selects Tamil month names when a date is rewritten.

    Returns:
        * ``""`` for empty text.
        * For text accepted by ``util.token_is_date``: the text, case-folded
          and with English month names replaced when ``model_id`` is 1, 13
          or 7, otherwise unchanged.
        * The text itself for a single token accepted by
          ``util.token_is_url``.
        * The result of ``handle_single_token`` for any other single token,
          when that result is non-empty.
        * ``None`` when none of the above applies.

        If any step raises, the error is logged and the current value of
        ``text`` is returned.
    """
    hindi_months = (
        'जनवरी', 'फ़रवरी', 'मार्च', 'अप्रैल', 'मई', 'जून', 'जुलाई',
        'अगस्त', 'सितंबर', 'अक्टूबर', 'नवंबर', 'दिसंबर',
    )
    tamil_months = (
        'ஜனவரி', 'பிப்ரவரி', 'மார்ச்', 'ஏப்ரல்', 'மே', 'ஜூன்', 'ஜூலை',
        'ஆகஸ்ட்', 'செப்டம்பர்', 'அக்டோபர்', 'நவம்பர்', 'டிசம்பர்',
    )
    eng_months = (
        'january', 'february', 'march', 'april', 'may', 'june', 'july',
        'august', 'september', 'october', 'november', 'december',
    )
    try:
        if len(text) == 0:
            logger.info("Null src for this request")
            return ""

        if util.token_is_date(text):
            if model_id in (1, 13):
                # english to hindi
                target_months = hindi_months
            elif model_id == 7:
                # english to tamil
                target_months = tamil_months
            else:
                target_months = None

            if target_months is not None:
                # Case-fold once so month names match regardless of casing,
                # then swap each English month for its counterpart.
                text = text.casefold()
                for english, translated in zip(eng_months, target_months):
                    text = text.replace(english, translated)

            logger.info(
                'handling dates before model in long alpha-numeric format')
            return text

        if len(text.split()) == 1:
            if util.token_is_url(text):
                logger.info(
                    'handling single token-url before model and returning as it is'
                )
                return text
            # Evaluate once: handle_single_token prints and logs as it runs.
            handled = handle_single_token(text)
            if len(handled) > 0:
                return handled

        return None
    except Exception as e:
        logger.info("error when handling special cases :{}".format(e))
        return text
Exemplo n.º 4
0
def tag_number_date_url_1(text):
    """Replace numbers, dates and URLs in ``text`` with placeholder tags.

    Numbers found by ``patterns['p3']['regex']`` are replaced with
    ``'NnUuMm' + str(hindi_numbers[k])``, dates with ``'DdAaTtEe<k>'`` and
    URLs with ``'UuRrLl<k>'``. A date followed by one of ``. , ? !`` keeps
    that character after its tag.

    Args:
        text: The text to tag.

    Returns:
        A 5-tuple ``(tagged_text, date_original, url_original, num_array,
        num_map)``:

        * ``tagged_text``: the words of the tagged text joined by single
          spaces.
        * ``date_original`` / ``url_original``: the replaced dates and URLs,
          in tag order.
        * ``num_array``: the matched numbers as ints, sorted descending.
        * ``num_map``: one ``{"no.": number, "tag": tag}`` dict per number.

        For empty ``text`` the tuple is five empty strings. If tagging fails,
        the error is logged and ``(text, [], [], num_array, num_map)`` is
        returned with whatever had been computed up to that point, so every
        path yields five values.
    """
    # Bound before the try block so the error path can always return them.
    num_array = []
    num_map = []
    try:
        if len(text) == 0:
            return "", "", "", "", ""

        resultant_str = []
        date_original = []
        url_original = []
        count_date = 0
        count_url = 0
        count_number = 0

        num_array = re.findall(patterns['p3']['regex'], text)
        # NOTE(review): i_zero / zero_prefix_num are not used below; they
        # look like inputs for an update_num_arr step that is not called
        # here. Kept so the helper is still invoked exactly as before.
        i_zero = get_indices_of_num_with_zero_prefix(num_array)
        num_array = list(map(int, num_array))
        zero_prefix_num = [num_array[i] for i in i_zero]
        # Largest first; presumably so that a short number is not replaced
        # inside a longer one -- confirm against the regex.
        num_array.sort(reverse=True)

        for number in num_array:
            tag = 'NnUuMm' + str(hindi_numbers[count_number])
            text = text.replace(str(number), tag, 1)
            num_map.append({"no.": number, "tag": tag})
            count_number += 1
            if count_number > 30:
                # Index 30 is reused for every number beyond this point.
                print("count exceeding 30")
                count_number = 30

        logger.info("number-tag mappings-{}".format(num_map))
        logger.info("Number tagging done")

        trailing_punctuation = (".", ",", "?", "!")
        for word in text.split():
            try:
                looks_like_date = (
                    not word.isalpha()
                    and not word[:-1].isalpha()
                    and len(word) > 4
                    and common_utils.token_is_date(word)
                )
                if looks_like_date:
                    if word.endswith(trailing_punctuation):
                        end_token = word[-1]
                        stripped = word[:-1]
                        # A short non-zero integer followed by punctuation
                        # (e.g. "2020.") is left untouched. Anything that is
                        # not an integer is a date, not an error.
                        try:
                            is_short_number = (
                                len(stripped) < 7 and bool(int(stripped)))
                        except ValueError:
                            is_short_number = False
                        if not is_short_number:
                            date_original.append(stripped)
                            word = 'DdAaTtEe' + str(count_date) + end_token
                            count_date += 1
                    else:
                        date_original.append(word)
                        word = 'DdAaTtEe' + str(count_date)
                        count_date += 1
                elif common_utils.token_is_url(word):
                    url_original.append(word)
                    word = 'UuRrLl' + str(count_url)
                    count_url += 1
            except Exception as e:
                # Best effort: a word that cannot be classified is kept as-is.
                print(e)
                logger.error("In handle_date_url:tag_num function:{}".format(e))

            resultant_str.append(word)

        # Joined once after the loop, so it is defined even when the text
        # contains no words.
        res = " ".join(resultant_str)
        logger.info("tagged response:{} and date:{} and url:{}".format(res,date_original,url_original))
        return res, date_original, url_original, num_array, num_map
    except Exception as e:
        logger.error("In handle_date_url:tag_num function parent except block:{}".format(e))
        return text, [], [], num_array, num_map
Exemplo n.º 5
0
def tag_number_date_url(text):
  """Replace dates and URLs in ``text`` with placeholder tags.

  Each whitespace-separated word accepted as a date becomes
  ``'DdAaTtEe<k>'`` and each word accepted by ``common_utils.token_is_url``
  becomes ``'UuRrLl<k>'``, where ``<k>`` counts from 0 separately for dates
  and URLs. A date followed by one of ``. , ? !`` keeps that character after
  its tag.

  Args:
    text: The text to tag.

  Returns:
    A 3-tuple ``(tagged_text, date_original, url_original)``: the words
    joined by single spaces, then the replaced dates and URLs in tag order.
    Text without any words yields ``("", [], [])``. If an exception occurs
    it is printed and ``None`` is returned.
  """
  try:
    resultant_str = []
    date_original = []
    url_original = []
    count_date = 0
    count_url = 0
    trailing_punctuation = (".", ",", "?", "!")

    for word in text.split():
      print("word", word)
      looks_like_date = (
        not word.isalpha()
        and not word[:-1].isalpha()
        and len(word) > 4
        and common_utils.token_is_date(word)
      )
      if looks_like_date:
        if word.endswith(trailing_punctuation):
          end_token = word[-1]
          stripped = word[:-1]
          # A short non-zero integer followed by punctuation (e.g. "2020.")
          # is left untouched. Anything that is not an integer is a date and
          # must not abort the whole call.
          try:
            is_short_number = len(stripped) < 7 and bool(int(stripped))
          except ValueError:
            is_short_number = False
          if not is_short_number:
            date_original.append(stripped)
            word = 'DdAaTtEe' + str(count_date) + end_token
            count_date += 1
        else:
          date_original.append(word)
          word = 'DdAaTtEe' + str(count_date)
          count_date += 1
      elif common_utils.token_is_url(word):
        url_original.append(word)
        word = 'UuRrLl' + str(count_url)
        count_url += 1

      resultant_str.append(word)

    # Joined once after the loop, so it is defined even when the text
    # contains no words.
    res = " ".join(resultant_str)
    print("res", res, date_original, url_original)

    return res, date_original, url_original
  except Exception as e:
    print(e)