def normalize(word):
    '''
    (String) -> String
    Takes a noisy word and returns its normalized form, or '' when the
    word has no known normalization (callers use the empty result as a
    "no match" signal).
    INPUT: nah, yeah
    OUTPUT: no, yes
    '''
    logger.info("ENTERING SPELL NOISY NORMALIZER MODULE")
    try:
        replacement = findWordByAbbreviation(word)
        if replacement != '':
            logger.debug("'%s' --> '%s' ", word, replacement)
        logger.info("SPELL NORMALIZATION DONE\n")
        return replacement
    except:
        # On any failure, fall back to the untouched input word.
        logger.error(sys.exc_info()[1])
        logger.info("SPELL NORMALIZATION DONE\n")
        return word
def correctAbbreviation(query):
    '''
    (String) -> String
    Expand every abbreviated token in the query to its full counterpart.
    INPUT: I want to travel from hyd to blr on 2 oct.
    OUTPUT: I want to travel from Hyderabad to Bangalore on 2 October.
    '''
    logger.info("ENTERING ABBREVIATION CORRECTION MODULE")
    try:
        logger.debug("Query = " + query)
        expanded_tokens = [expandWord(token) for token in query.split()]
        result = " ".join(expanded_tokens).strip()
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return result
    except:
        # Best-effort: return the query unchanged on any failure.
        logger.error(sys.exc_info()[1])
        logger.info("ABBREVIATION CORRECTION DONE\n")
        return query
def normalize(query):
    '''
    (String) -> String
    Normalize every noisy token of the query; tokens without a known
    normalization pass through unchanged.
    INPUT: I wanna go to hyderabad 2morrow
    OUTPUT: I want to go to hyderabad tomorrow
    '''
    logger.info("ENTERING NOISY NORMALIZER MODULE")
    try:
        logger.debug("Query = " + query)
        cleaned_tokens = [get_normalized_word(tok) for tok in query.split()]
        result = " ".join(cleaned_tokens).strip()
        logger.info("NORMALIZATION DONE\n")
        return result
    except:
        # Best-effort: return the query unchanged on any failure.
        logger.error(sys.exc_info()[1])
        logger.info("NORMALIZATION DONE\n")
        return query
def split_dot(word):
    """Split one token on '.'/letter/digit boundaries.

    Six patterns are tried in priority order; only the FIRST that matches
    is applied, so the ordering below is significant.
    NOTE(review): relies on module-level `date_extensions` (presumably
    suffixes like 'am'/'pm' that must stay glued to the number — confirm)
    and `logger`. Returns the word unchanged when nothing matches or on
    any error.
    """
    try:
        # going.I -> going . I  (letters . letters: keep the dot as a token)
        p1 = re.compile("([A-Za-z]+)(\.)([A-Za-z]+)")
        # 50Kg. -> 50 Kg
        # 4:00am. -> 4 am      (number+unit with trailing dot: dot dropped)
        p2 = re.compile("([0-9\:]+)([A-Za-z]+)(\.)")
        # Rs.50 -> Rs 50       (word . number: dot dropped)
        p3 = re.compile("([A-Za-z]+)(\.)([0-9]+)")
        # Rs50 -> Rs 50
        p4 = re.compile("([A-Za-z]+)([0-9]+)")
        # 50Kg -> 50 Kg
        # 4:00am -> 4 am
        p5 = re.compile("([0-9\:]+)([A-Za-z]+)")
        # END. -> END .        (trailing full stop kept as its own token)
        p6 = re.compile("([A-Za-z]+)(\.)")
        if p1.match(word):
            # print "Match 1"
            return re.sub(p1, r"\1 \2 \3", word)
        elif p2.match(word):
            # print "Match 2"
            return re.sub(p2, r"\1 \2", word)
        elif p3.match(word):
            # print "Match 3"
            return re.sub(p3, r"\1 \3", word)
        elif p4.match(word):
            # print "Match 4"
            return re.sub(p4, r"\1 \2", word)
        elif p5.match(word):
            # print "Match 5"
            new_word = re.sub(p5, r"\1 \2", word)
            tokens = new_word.split(" ")
            # Re-glue number+suffix when the suffix is a date/time
            # extension (e.g. '4:00am' must stay a single token).
            if tokens[1] in date_extensions:
                return tokens[0] + tokens[1]
            return new_word
        elif p6.match(word):
            # print "Match 6"
            return re.sub(p6, r"\1 \2", word)
        return word
    except:
        logger.error(sys.exc_info()[1])
        return word
def completeDate(query):
    '''
    (String) -> String
    Takes a query with incomplete dates (missing month), and fills it
    using Server Time. Also, it replaces 'th' to make the term number
    for gap filling module.
    INPUT: I will be travelling on 25th.
    OUTPUT: I will be travelling on 25 October.
    '''
    logger.info("ENTERING GAP FILLER MODULE")
    try:
        logger.debug("Query = " + query)
        tokens = query.split()
        rebuilt = []
        for position, token in enumerate(tokens):
            day = isDateToken(token)
            if day:
                # Bare day number: keep it, then add the upcoming month
                # unless one is already mentioned nearby.
                rebuilt.append(str(day))
                if not isMonthSpecified(position, tokens):
                    rebuilt.append(getComingMonth(day))
            else:
                rebuilt.append(token)
        filled_query = " ".join(rebuilt)
        logger.info("GAP FILLING DONE\n")
        return filled_query
    except:
        # Best-effort: return the query unchanged on any failure.
        logger.error(sys.exc_info()[1])
        logger.info("GAP FILLING DONE\n")
        return query
def splitNumberString(query):
    '''
    (String) -> String
    Separate tokens that club several words/numbers together into
    multiple words.
    INPUT: My budget is Rs.50 and extra luggage 10-15kgs.
    OUTPUT: My budget is Rs 50 and extra luggage 10 - 15 kgs.
    Cases handled (via split_word):
        Rs.50 -> Rs 50            Rs50 -> Rs 50
        10-15kgs -> 10 - 15 Kgs   10Kgs-15kgs -> 10 Kgs - 15 Kgs
        10Kg. -> 10 Kg            10.1 -> 10.1
        10-12-2015 -> 10-12-2015  10.12.2015 -> 10.12.2015
        END. -> END .             one-way -> one way
        1-tier -> 1 tier          4:00am -> 4:00 am
        going.I -> going . I
    // Handle ticket/pnr no. and don't split them
    Rules (in order):
    1. Split '-' ---> 10-15 -> 10 - 15, if tier, way -> remove '-',
       handle date case
    2. Case '.', (i) two numbers: do nothing, (ii) two words: split,
       (iii) one word-one num: split and remove '.'
    3. Split NUM and String. If last char == '.', if word in dict ->
       remove '.', else full stop. If split == 'nd' (for date), delete token
    '''
    logger.info("ENTERING SPLITTER MODULE")
    try:
        logger.debug("Query = " + query)
        pieces = [split_word(tok) for tok in query.split()]
        result = " ".join(pieces).strip()
        logger.info("SPLITTING DONE\n")
        return result
    except:
        # Best-effort: return the query unchanged on any failure.
        logger.error(sys.exc_info()[1])
        logger.info("SPLITTING DONE\n")
        return query
def getComingMonth(given_date):
    '''
    (int) -> String
    Return the name of the month in which day-of-month `given_date` next
    occurs: the current month when that day is still ahead, otherwise the
    following month.

    Returns '' on any error.
    '''
    try:
        today = date.today()
        month = today.month
        # The requested day has already passed (or is today), so the user
        # must mean next month.
        if given_date <= today.day:
            month += 1
        # Wrap December -> January. BUGFIX: the previous unconditional
        # `month %= 12` turned a plain December (no increment) into
        # month 0, which made datetime.date() raise and the function
        # silently return ''.
        if month > 12:
            month = 1
        return datetime.date(1900, month, 1).strftime('%B')
    except:
        logger.error(sys.exc_info()[1])
        return ''
def split_word(word):
    '''
    (String) -> String
    Fully split one raw token: first on hyphens, then each fragment on
    dot/letter/digit boundaries. Returns the (possibly multi-word)
    replacement text for the token.
    '''
    try:
        fragments = []
        for piece in split_hypen(word):
            fragments.append(split_dot(piece))
        rebuilt = " ".join(fragments).strip()
        if rebuilt != word:
            logger.debug("%s --> %s", word, rebuilt)
        return rebuilt
    except:
        # Best-effort: return the token unchanged on any failure.
        logger.error(sys.exc_info()[1])
        return word
def isMonthSpecified(index, tokens):
    '''
    (int, list of String) -> bool
    Return True when a month name appears within two tokens before or
    after position `index` (a five-token window, clamped to the list
    bounds). Returns False on any error.
    '''
    try:
        lo = max(index - 2, 0)
        hi = min(index + 3, len(tokens))
        return any(tok in month_list for tok in tokens[lo:hi])
    except:
        logger.error(sys.exc_info()[1])
        return False
def expandWord(word):
    '''
    (String) -> String
    Expand a known abbreviation to its full form; words without a known
    expansion pass through unchanged.
    INPUT: hyd, kg, rs
    OUTPUT: hyderabad, kilogram, rupees
    '''
    try:
        expansion = findWordByAbbreviation(word)
        if expansion == '':
            return word
        logger.debug("'%s' --> '%s' ", word, expansion)
        return expansion
    except:
        # Best-effort: return the word unchanged on any failure.
        logger.error(sys.exc_info()[1])
        return word
def get_normalized_word(word):
    '''
    (String) -> String
    Map a noisy word to its normalized form; words without a known
    normalization pass through unchanged.
    INPUT: b4, 2morrow, uttar, going
    OUTPUT: before, tomorrow, uttar, ''
    '''
    try:
        mapped = findWordByAbbreviation(word)
        if mapped == '':
            return word
        logger.debug("'%s' --> '%s' ", word, mapped)
        return mapped
    except:
        # Best-effort: return the word unchanged on any failure.
        logger.error(sys.exc_info()[1])
        return word
def split_hypen(word):
    '''
    (String) -> list of String
    Split `word` on a single internal hyphen. The hyphen is kept as a
    separate token when it joins two numbers (a range such as "10-15");
    otherwise it is dropped ("one-way" -> ["one", "way"]). Tokens without
    exactly one internal hyphen (including dates like "10-12-2015" and
    words with a leading/trailing hyphen) come back as a one-element list.
    '''
    try:
        words = word.split('-')
        # Only the simple "a-b" shape is handled; anything else passes
        # through untouched.
        if len(words) == 2 and words[0] and words[1]:
            left_last = words[0][-1]
            left_first = words[0][0]
            right_first = words[1][0]
            # Numeric range: keep the '-' as its own token.
            if (isNumber(left_last) or isNumber(left_first)) and isNumber(right_first):
                return [words[0], "-", words[1]]
            return words
        return [word]
    except:
        logger.error(sys.exc_info()[1])
        # BUGFIX: previously returned the bare string here, so callers
        # (split_word) iterated it character-by-character; always return
        # a list, as on every success path.
        return [word]
def unigramSpellCheck(query, PWLdict):
    '''
    (String) -> String
    Takes a noisy query with ungrammatical/Out of Vocab words as input and
    returns the spell corrected query.
    INPUT: I want to buk a flight from hydrabad to banglore.
    OUTPUT: I want to book a flight from Hyderabad to Bangalore.

    PWLdict: personal word list passed to SpellingReplacer (presumably a
    path or word list of domain terms such as city names — confirm with
    callers). Falls back to returning `query` unchanged on any error.
    '''
    logger.info("ENTERING SPELL CHECKER MODULE")
    try:
        logger.debug("Query = " + query)
        word_list = nltk.word_tokenize(query)
        pos_list = nltk.pos_tag(word_list)
        # Two checkers: the general dictionary and the personal word list.
        replacerDict = SpellingReplacer()
        # print replacerDict.check("mumbai")
        replacerPWL = SpellingReplacer(PWLdict)
        # print replacerPWL.check('mumbai')
        checked_list = []
        for item in pos_list:
            word = item[0]
            pos = item[1]
            # Collapse runs of a repeated character ("sooo" -> "so")
            # before lookup.
            truncate_word = re.sub(r'(.)\1+', r'\1', word)
            normalized_word = normalize(truncate_word)
            # If word is a special char, don't spell check it
            if re.match("([^\w@#])", word):
                checked_list.append(word)
            elif normalized_word:
                # Known noisy word ("nah" -> "no"): use the normalization.
                checked_list.append(normalized_word)
            elif replacerPWL.check(truncate_word):
                # Exact personal-word-list hit: title-case it (city names).
                correctedWord = truncate_word.title()
                checked_list.append(correctedWord)
            elif not replacerDict.check(word):
                # Unknown word: pick the closer of the PWL and dictionary
                # suggestions by edit distance.
                correctedWord = ""
                dist = 100
                # Do not replace words from PWL if len(word) <= 3
                if len(truncate_word) > 3:
                    correctedWordPWL = replacerPWL.replace(truncate_word)
                    distPWL = edit_distance(truncate_word, correctedWordPWL)
                else:
                    distPWL = dist
                    correctedWordPWL = truncate_word
                correctedWordDict = replacerDict.replace(word)
                distDict = edit_distance(word, correctedWordDict)
                # Prefer the dictionary suggestion on ties or when the PWL
                # could not improve the word.
                if distPWL > distDict or correctedWordPWL == truncate_word:
                    correctedWord = correctedWordDict
                else:
                    correctedWord = correctedWordPWL.title()
                if correctedWord == "":
                    correctedWord = word
                else:
                    logger.debug("'%s' --> '%s' ", word, correctedWord)
                checked_list.append(correctedWord)
            else:
                # Dictionary word: keep as-is.
                checked_list.append(word)
        spell_checked_query = " ".join(checked_list)
        logger.info("SPELL CORRECTION DONE\n")
        return spell_checked_query
    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL CORRECTION DONE\n")
        return query
def travelNLP(query, category, last_requested_DF):
    '''
    (String,String,String) -> Object
    Takes the input query, category and last requested DF and annotates
    the NERs in the query.
    INPUT: ('hyd to blr', 'travel', 'source')
    OUTPUT: {source: HYD, destination: BLR}

    Pipeline: noise normalization -> number/string splitting ->
    abbreviation expansion -> spell check -> month gap filling ->
    argument filling -> CoreNLP NER + chunking -> extended NER +
    category disambiguation, merged per parse.
    Returns {} when any stage raises.
    '''
    # logger = logging.getLogger(__name__)
    allExtendedNerDF = {}
    logger.info("ENTERING TRAVEL MODULE")
    try:
        logger.debug(query + " " + last_requested_DF + "\n")
        query = query.lower()
        # Stage 1: noisy-word normalization ("2morrow" -> "tomorrow").
        noiseRemovedQuery = preprocessing_tools.noisy_word_normalizer.normalizer.normalize(
            query)
        print "Normalize = ", noiseRemovedQuery
        logger.debug("Normalize = " + noiseRemovedQuery)
        # Stage 2: split clubbed tokens ("Rs.50" -> "Rs 50").
        splittedQuery = preprocessing_tools.number_string_splitter.number_string_splitter.splitNumberString(
            noiseRemovedQuery)
        print "Splitted = ", splittedQuery
        logger.debug("Splitted = " + splittedQuery)
        # Stage 3: expand abbreviations ("hyd" -> "Hyderabad").
        abbreviatedQuery = preprocessing_tools.abbreviation_checker.abbreviation_corrector.correctAbbreviation(
            splittedQuery)
        print "Abbreviated = ", abbreviatedQuery
        logger.debug("Abbreviated = " + abbreviatedQuery)
        # Stage 4: spell correction against dictionary + PWL_FILE.
        spellCheckedQuery = preprocessing_tools.spell_checker.spell_checker.spellCheck(
            abbreviatedQuery, PWL_FILE)
        print "Spellchecked = ", spellCheckedQuery
        logger.debug("Spellchecked = " + spellCheckedQuery)
        # Stage 5: add a month to bare day numbers ("25th" -> "25 October").
        monthFilledQuery = preprocessing_tools.month_filler.month_filler.completeDate(
            spellCheckedQuery)
        print "MonthFilledQuery = ", monthFilledQuery
        logger.debug("MonthFilledQuery = " + monthFilledQuery)
        # Stage 6: fill remaining argument gaps.
        gapFilledQuery = preprocessing_tools.argument_filler.argument_filler.fillArguments(
            monthFilledQuery)
        print "GapFilledQuery = ", gapFilledQuery
        logger.debug("GapFilledQuery = " + gapFilledQuery)
        normalizedQuery = gapFilledQuery
        print "Final Normalized Query = ", gapFilledQuery
        print
        logger.debug("Final Normalized Query = " + gapFilledQuery)
        # Stage 7: CoreNLP NER + chunk parses (one entry per sentence/parse).
        NERAnalyzedParse, chunkedParse = preprocessing_tools.corenlp.corenlp.identifyNER(
            normalizedQuery)
        print "NER Parse = ", NERAnalyzedParse
        print "Chunking = ", chunkedParse
        for index in range(0, len(chunkedParse)):
            # print NERAnalyzedParse[index], chunkedParse[index]
            # Stage 8: extended NER + category disambiguation, merged into
            # a single DF dict per parse, then accumulated across parses.
            extendedNerDF = preprocessing_tools.extended_ner.travel.travel_extended_ner.identifyExtendedNER(
                normalizedQuery, category, NERAnalyzedParse[index], last_requested_DF)
            disambiguatedDF = preprocessing_tools.category_disambiguator.category_disambiguator.disambiguateCategories(
                normalizedQuery, category, NERAnalyzedParse[index], chunkedParse[index], last_requested_DF)
            # print "Disambiguated = ",
            # print disambiguatedDF
            singleExtendedNerDF = preprocessing_tools.category_disambiguator.category_disambiguator.mergeDictionaries(
                extendedNerDF, disambiguatedDF)
            allExtendedNerDF = mergeDictionaries(allExtendedNerDF, singleExtendedNerDF)
        # Drop the sentinel "0" bucket if the mergers produced one.
        if "0" in allExtendedNerDF.keys():
            del allExtendedNerDF["0"]
        print "Final Analyzed NERs = ", allExtendedNerDF
    except:
        # print "Unexpected error:", sys.exc_info()
        logger.error(sys.exc_info()[1])
    finally:
        logger.info("LEAVING TRAVEL MODULE")
        return allExtendedNerDF