Example #1
def normalize(word):
    '''
    (String) -> String

    Takes a noisy word as input and returns the normalized word, or an
    empty string if no normalized form is known.

    INPUT: nah, yeah
    OUTPUT: no, yes
    '''

    logger.info("ENTERING SPELL NOISY NORMALIZER MODULE")

    try:
        normalized_word = findWordByAbbreviation(word)
        return_word = ""
        if normalized_word != '':
            logger.debug("'%s' --> '%s' ", word, normalized_word)
            return_word = normalized_word

        logger.info("SPELL NORMALIZATION DONE\n")
        return return_word

    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL NORMALIZATION DONE\n")
        return word
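
The function above delegates the actual lookup to findWordByAbbreviation, which is not shown in this listing. A minimal, self-contained sketch of that contract, using a hypothetical in-memory table (the names and entries below are illustrative only):

# Hedged stand-in for findWordByAbbreviation; the real lookup table lives elsewhere.
NOISY_LOOKUP = {"nah": "no", "yeah": "yes", "b4": "before", "2morrow": "tomorrow"}

def find_word_by_abbreviation_stub(word):
    # Return the normalized form if known, otherwise '' (the contract normalize() relies on).
    return NOISY_LOOKUP.get(word.lower(), "")

print(find_word_by_abbreviation_stub("nah"))    # no
print(find_word_by_abbreviation_stub("going"))  # '' -> normalize() returns '' for unknown words
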
def correctAbbreviation(query):
	'''
	(String) -> String

	Takes a query with abbreviations and expands them into their full forms.

	INPUT: I want to travel from hyd to blr on 2 oct.
	OUTPUT: I want to travel from Hyderabad to Bangalore on 2 October.

	'''

	abbreviated_query = ""
	logger.info("ENTERING ABBREVIATION CORRECTION MODULE")
	
	try:
		logger.debug("Query = " + query)

		tokens = query.split()
		for token in tokens:
			expanded_word = expandWord(token)
			abbreviated_query += expanded_word + " "
		
		abbreviated_query = abbreviated_query.strip()
		
		logger.info("ABBREVIATION CORRECTION DONE\n")
		return abbreviated_query
	
	except:
		logger.error(sys.exc_info()[1])
		logger.info("ABBREVIATION CORRECTION DONE\n")
		return query
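
A self-contained sketch of the same token-by-token expansion, assuming a small stand-in abbreviation table (the real expandWord/findWordByAbbreviation are defined elsewhere in the project):

# Hedged sketch: expand each whitespace token independently; unknown tokens pass through.
ABBREVIATIONS = {"hyd": "Hyderabad", "blr": "Bangalore", "oct": "October"}

def expand_query_stub(query):
    return " ".join(ABBREVIATIONS.get(token.lower(), token) for token in query.split())

print(expand_query_stub("I want to travel from hyd to blr on 2 oct"))
# I want to travel from Hyderabad to Bangalore on 2 October
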
Example #3
def normalize(query):

	'''
	(String) -> String

	Takes a noisy query as input and returns the normalized query.

	INPUT: I wanna go to hyderabad 2morrow
	OUTPUT: I want to go to hyderabad tomorrow 

	'''
	
	normalized_query = ""
	logger.info("ENTERING NOISY NORMALIZER MODULE")

	try:
		logger.debug("Query = " + query)

		tokens = query.split()
		for token in tokens:
			normalized_word = get_normalized_word(token)
			normalized_query += normalized_word + " "
		
		normalized_query = normalized_query.strip()
	
		logger.info("NORMALIZATION DONE\n")
		return normalized_query
	
	except:
		logger.error(sys.exc_info()[1])
		logger.info("NORMALIZATION DONE\n")
		return query
def split_dot(word):
    try:

        # going.I 	-> going . I
        p1 = re.compile("([A-Za-z]+)(\.)([A-Za-z]+)")

        # 50Kg.	->	50 Kg
        # 4:00am. -> 4:00 am
        p2 = re.compile("([0-9\:]+)([A-Za-z]+)(\.)")

        # Rs.50	->	Rs 50
        p3 = re.compile("([A-Za-z]+)(\.)([0-9]+)")

        # Rs50	->	Rs 50
        p4 = re.compile("([A-Za-z]+)([0-9]+)")

        # 50Kg	->	50 Kg
        # 4:00am -> 4:00 am
        p5 = re.compile("([0-9\:]+)([A-Za-z]+)")

        # END.	->	END .
        p6 = re.compile("([A-Za-z]+)(\.)")

        if p1.match(word):
            # print "Match 1"
            return re.sub(p1, r"\1 \2 \3", word)

        elif p2.match(word):
            # print "Match 2"
            return re.sub(p2, r"\1 \2", word)

        elif p3.match(word):
            # print "Match 3"
            return re.sub(p3, r"\1 \3", word)

        elif p4.match(word):
            # print "Match 4"
            return re.sub(p4, r"\1 \2", word)

        elif p5.match(word):
            # print "Match 5"
            new_word = re.sub(p5, r"\1 \2", word)
            tokens = new_word.split(" ")
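            # Keep date ordinals (e.g. "25th") as a single token so the
            # month filler can process them later.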
            if tokens[1] in date_extensions:
                return tokens[0] + tokens[1]
            return new_word

        elif p6.match(word):
            # print "Match 6"
            return re.sub(p6, r"\1 \2", word)

        return word

    except:
        logger.error(sys.exc_info()[1])
        return word
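
A standalone check of the six patterns above on the commented sample tokens; no project helpers are required, so this can be run on its own:

import re

# Hedged demo: the same patterns and substitutions as split_dot, applied in order.
samples = ["going.I", "50Kg.", "Rs.50", "Rs50", "4:00am", "END."]
patterns = [
    (re.compile(r"([A-Za-z]+)(\.)([A-Za-z]+)"), r"\1 \2 \3"),  # going.I -> going . I
    (re.compile(r"([0-9:]+)([A-Za-z]+)(\.)"),   r"\1 \2"),     # 50Kg.   -> 50 Kg
    (re.compile(r"([A-Za-z]+)(\.)([0-9]+)"),    r"\1 \3"),     # Rs.50   -> Rs 50
    (re.compile(r"([A-Za-z]+)([0-9]+)"),        r"\1 \2"),     # Rs50    -> Rs 50
    (re.compile(r"([0-9:]+)([A-Za-z]+)"),       r"\1 \2"),     # 4:00am  -> 4:00 am
    (re.compile(r"([A-Za-z]+)(\.)"),            r"\1 \2"),     # END.    -> END .
]

for word in samples:
    for pattern, repl in patterns:
        if pattern.match(word):
            print(word + " -> " + pattern.sub(repl, word))
            break
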
Example #5
def completeDate(query):
    '''
    (String) -> String

    Takes a query with incomplete dates (missing month) and fills in the
    month using the server time. It also strips ordinal suffixes such as
    'th' so the term becomes a plain number for the gap-filling module.

    INPUT: I will be travelling on 25th.
    OUTPUT: I will be travelling on 25 October.
    '''

    logger.info("ENTERING GAP FILLER MODULE")

    try:
        logger.debug("Query = " + query)

        gap_filled_query = ""

        tokens = query.split()
        new_tokens = []

        skip_flag = 0
        skip_tokens = []

        for index in range(0, len(tokens)):
            token = tokens[index]
            date = isDateToken(token)
            if date:
                new_tokens.append(str(date))
                if not isMonthSpecified(index, tokens):
                    month = getComingMonth(date)
                    new_tokens.append(month)
                # else:
                # 	skip_tokens, month = isRelativeMonthSpecified(index,tokens)
                # 	if skip_tokens:
                # 		new_tokens.append(month)
                # 		skip_flag = 1
            else:
                # if skip_flag and index not in skip_tokens:
                # 	skip_flag = 0
                # 	skip_tokens = []
                # if not skip_flag:
                new_tokens.append(token)

        gap_filled_query = " ".join(new_tokens)

        logger.info("GAP FILLING DONE\n")
        return gap_filled_query

    except:
        logger.error(sys.exc_info()[1])
        logger.info("GAP FILLING DONE\n")
        return query
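
A minimal sketch of the bare-day detection that completeDate depends on; the real isDateToken is defined elsewhere, and this regex-based stand-in ignores the surrounding isMonthSpecified window check:

import re

# Hedged stand-in for isDateToken: accept "25", "25th", "25th." and return the day number.
def bare_day_stub(token):
    match = re.match(r"^(\d{1,2})(st|nd|rd|th)?\.?$", token)
    if match and 1 <= int(match.group(1)) <= 31:
        return int(match.group(1))
    return None

print(bare_day_stub("25th."))  # 25
print(bare_day_stub("25"))     # 25
print(bare_day_stub("2015"))   # None (not a bare day)
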
def splitNumberString(query):
    '''
	(String) -> String

	Takes a query where multiple words are clubbed together in a single
	token and separates such tokens into multiple words.

	INPUT: My budget is Rs.50 and extra luggage 10-15kgs.
	OUTPUT: My budget is Rs 50 and extra luggage 10 - 15 kgs.

	Cases to handle:
	Rs.50	->	Rs 50
	Rs50	->	Rs 50
	10-15kgs	-> 10 - 15 kgs
	10Kgs-15kgs	-> 10 Kgs - 15 kgs
	10Kg. 	-> 10 Kg
	10.1	-> 10.1
	10-12-2015	-> 10-12-2015
	10.12.2015	-> 10.12.2015
	END.	-> END .
	one-way -> one way
	1-tier	-> 1 tier
	4:00am	-> 4:00 am
	going.I 	-> going . I
	TODO: handle ticket/PNR numbers and do not split them.

	Rules (applied in order):
	1. Split on '-': 10-15 -> 10 - 15; for 'tier'/'way' drop the '-'; keep dates intact.
	2. Handle '.': (i) between two numbers, keep as-is; (ii) between two words, split;
	   (iii) between a word and a number, split and drop the '.'.
	3. Split NUM and String. If the last char is '.', drop it when the word is in the
	   dictionary, otherwise treat it as a full stop. If the split yields 'nd' (a date
	   ordinal), delete that token.
	(A minimal sketch of this ordering follows the function.)
	'''

    splitted_query = ""
    logger.info("ENTERING SPLITTER MODULE")

    try:
        logger.debug("Query = " + query)

        tokens = query.split()
        for token in tokens:
            splitted_word = split_word(token)
            splitted_query += splitted_word + " "

        splitted_query = splitted_query.strip()

        logger.info("SPLITTING DONE\n")
        return splitted_query

    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPLITTING DONE\n")
        return query
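
A minimal sketch of the rule ordering described in the docstring: hyphens are split first, then number/letter boundaries. The full behavior comes from split_word, split_hypen and split_dot later in this listing; this stand-in only illustrates why the order matters:

import re

# Hedged sketch: split on '-' first, then split a trailing number/letter boundary.
NUM_STR = re.compile(r"([0-9:]+)([A-Za-z]+)$")

def split_token_stub(token):
    parts = []
    for piece in token.split("-"):
        match = NUM_STR.match(piece)
        parts.append(match.group(1) + " " + match.group(2) if match else piece)
    return " - ".join(parts) if len(parts) > 1 else parts[0]

print(split_token_stub("10-15kgs"))  # 10 - 15 kgs
print(split_token_stub("4:00am"))    # 4:00 am
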
Example #7
def getComingMonth(given_date):
    try:
        today = date.today()
        today_date = today.day
        month = today.month

        if given_date <= today_date:
            # That day has already passed this month, so use the next month,
            # wrapping December back to January (never to an invalid month 0).
            month = month % 12 + 1

        monthstr = datetime.date(1900, month, 1).strftime('%B')
        return monthstr
    except:
        logger.error(sys.exc_info()[1])
        return ''
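
The same "coming month" rollover can be written without constructing a dummy 1900 date; a self-contained sketch using calendar.month_name:

import calendar
from datetime import date

# Hedged sketch of the rollover: if the day has already passed this month,
# use the next month, wrapping December to January.
def coming_month_name(day, today=None):
    today = today or date.today()
    month = today.month if day > today.day else today.month % 12 + 1
    return calendar.month_name[month]

print(coming_month_name(25, date(2015, 10, 2)))   # October
print(coming_month_name(1, date(2015, 12, 20)))   # January
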
def split_word(word):
    try:
        new_word = ""
        splitted_hypen_terms = split_hypen(word)
        # print "Hypen = ",
        # print splitted_hypen_terms
        for term in splitted_hypen_terms:
            splitted_dot_terms = split_dot(term)
            new_word += splitted_dot_terms + " "

        new_word = new_word.strip()
        if new_word != word:
            logger.debug("%s --> %s", word, new_word)
        return new_word

    except:
        logger.error(sys.exc_info()[1])
        return word
Example #9
def isMonthSpecified(index, tokens):
    try:
        start_index = index - 2
        if start_index < 0:
            start_index = 0

        end_index = index + 3
        if end_index > len(tokens):
            end_index = len(tokens)

        for i in range(start_index, end_index):
            token = tokens[i]
            if token in month_list:
                return True
        return False
    except:
        logger.error(sys.exc_info()[1])
        return False
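
The +/-2-token window check above can also be written with a slice, which clamps the bounds automatically; a small equivalent sketch (month_list is assumed to be a list of lowercase month names):

# Hedged sketch of the same window check using a slice.
def month_in_window(index, tokens, month_list):
    return any(token in month_list for token in tokens[max(0, index - 2): index + 3])

print(month_in_window(5, "i will travel on the 25 of october".split(), ["october"]))  # True
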
def expandWord(word):
	'''
	(String) -> String

	Takes an abbreviated word as input and returns the expanded word.

	INPUT: hyd, kg, rs
	OUTPUT: hyderabad, kilogram, rupees

	'''

	try:
		expanded_word = findWordByAbbreviation(word)
		if expanded_word != '':
			logger.debug("'%s' --> '%s' ", word, expanded_word)
			return expanded_word
		return word
	except:
		logger.error(sys.exc_info()[1])
		return word
Example #11
def get_normalized_word(word):
	'''
	(String) -> String

	Takes a noisy word as input and returns the normalized word; unknown
	words are returned unchanged.

	INPUT: b4, 2morrow, uttar, going
	OUTPUT: before, tomorrow, uttar, going

	'''

	try:
		normalized_word = findWordByAbbreviation(word)
		if normalized_word != '':
			logger.debug("'%s' --> '%s' ", word, normalized_word)
			return normalized_word
		return word
	except:
		logger.error(sys.exc_info()[1])
		return word
def split_hypen(word):
    try:
        words = word.split('-')

        # Only handle simple two-part splits (skip a bare '-' and dates like 10-12-2015)
        if len(words) == 2 and words[0] and words[1]:
            # If the hyphen sits between two numbers, keep it as a separate token
            w11 = words[0][len(words[0]) - 1]
            w12 = words[0][0]
            w2 = words[1][0]

            if (isNumber(w11) or isNumber(w12)) and isNumber(w2):
                new_words = [words[0], "-", words[1]]
                return new_words

            return words
        return [word]
    except:
        logger.error(sys.exc_info()[1])
        return [word]
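
A standalone check of the hyphen rules: keep '-' between numbers, drop it between plain words, and leave multi-hyphen tokens such as dates untouched. The project's isNumber helper is replaced with str.isdigit here:

# Hedged sketch mirroring split_hypen without the project's isNumber helper.
def split_hyphen_stub(word):
    parts = word.split("-")
    if len(parts) != 2 or not (parts[0] and parts[1]):
        return [word]                       # bare '-' or a date like 10-12-2015
    if (parts[0][-1].isdigit() or parts[0][0].isdigit()) and parts[1][0].isdigit():
        return [parts[0], "-", parts[1]]    # 10-15 -> ['10', '-', '15']
    return parts                            # one-way -> ['one', 'way']

print(split_hyphen_stub("10-15"))
print(split_hyphen_stub("one-way"))
print(split_hyphen_stub("10-12-2015"))
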
Example #13
def unigramSpellCheck(query, PWLdict):
    '''
	(String) -> String

	Takes a noisy query with ungrammatical/out-of-vocabulary words as
	input and returns the spell-corrected query.

	INPUT: I want to buk a flight from hydrabad to banglore.
	OUTPUT: I want to book a flight from Hyderabad to Bangalore.

	'''

    logger.info("ENTERING SPELL CHECKER MODULE")

    try:
        logger.debug("Query = " + query)

        word_list = nltk.word_tokenize(query)
        pos_list = nltk.pos_tag(word_list)

        replacerDict = SpellingReplacer()
        # print replacerDict.check("mumbai")

        replacerPWL = SpellingReplacer(PWLdict)
        # print replacerPWL.check('mumbai')

        checked_list = []
        for item in pos_list:
            word = item[0]
            pos = item[1]

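            # Collapse runs of the same character ("heyyy" -> "hey", "book" -> "bok")
            # before the normalization and spelling lookups.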
            truncate_word = re.sub(r'(.)\1+', r'\1', word)
            normalized_word = normalize(truncate_word)

            # If word is a special char, don't spell check it
            if re.match("([^\w@#])", word):
                checked_list.append(word)

            elif normalized_word:
                checked_list.append(normalized_word)

            elif replacerPWL.check(truncate_word):
                correctedWord = truncate_word.title()
                checked_list.append(correctedWord)

            elif not replacerDict.check(word):
                correctedWord = ""
                dist = 100

                # Do not replace words from PWL if len(word) <= 3
                if len(truncate_word) > 3:
                    correctedWordPWL = replacerPWL.replace(truncate_word)
                    distPWL = edit_distance(truncate_word, correctedWordPWL)
                else:
                    distPWL = dist
                    correctedWordPWL = truncate_word

                correctedWordDict = replacerDict.replace(word)
                distDict = edit_distance(word, correctedWordDict)

                if distPWL > distDict or correctedWordPWL == truncate_word:
                    correctedWord = correctedWordDict
                else:
                    correctedWord = correctedWordPWL.title()

                if correctedWord == "":
                    correctedWord = word
                else:
                    logger.debug("'%s' --> '%s' ", word, correctedWord)

                checked_list.append(correctedWord)
            else:
                checked_list.append(word)

        spell_checked_query = " ".join(checked_list)

        logger.info("SPELL CORRECTION DONE\n")
        return spell_checked_query

    except:
        logger.error(sys.exc_info()[1])
        logger.info("SPELL CORRECTION DONE\n")
        return query
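
SpellingReplacer is not shown in this listing; it follows the common pyenchant-based replacer pattern. A hedged sketch of such a replacer, assuming pyenchant with an installed en_US dictionary and an optional personal word list (PWL) file; the project's real class may differ:

import enchant
from nltk.metrics import edit_distance

# Hedged sketch of a SpellingReplacer-style helper; the project's real class is defined elsewhere.
class SimpleSpellingReplacer(object):
    def __init__(self, pwl_path=None, max_dist=2):
        # DictWithPWL merges the base dictionary with a personal word list file.
        self.spell_dict = (enchant.DictWithPWL("en_US", pwl_path)
                           if pwl_path else enchant.Dict("en_US"))
        self.max_dist = max_dist

    def check(self, word):
        return self.spell_dict.check(word)

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word

replacer = SimpleSpellingReplacer()
print(replacer.replace("buk"))  # a nearby dictionary word, depending on the backend's suggestions
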
Example #14
def travelNLP(query, category, last_requested_DF):
    '''
	(String,String,String) -> Object

	Takes the input query, category and last requested DF
	and annotates the NERs in the query.

	INPUT: ('hyd to blr', 'travel', 'source')
	OUTPUT: {source: HYD, destination: BLR}

	'''
    # logger = logging.getLogger(__name__)
    allExtendedNerDF = {}
    logger.info("ENTERING TRAVEL MODULE")

    try:
        logger.debug(query + " " + last_requested_DF + "\n")

        query = query.lower()

        noiseRemovedQuery = preprocessing_tools.noisy_word_normalizer.normalizer.normalize(
            query)
        print "Normalize = ", noiseRemovedQuery
        logger.debug("Normalize = " + noiseRemovedQuery)

        splittedQuery = preprocessing_tools.number_string_splitter.number_string_splitter.splitNumberString(
            noiseRemovedQuery)
        print "Splitted = ", splittedQuery
        logger.debug("Splitted = " + splittedQuery)

        abbreviatedQuery = preprocessing_tools.abbreviation_checker.abbreviation_corrector.correctAbbreviation(
            splittedQuery)
        print "Abbreviated = ", abbreviatedQuery
        logger.debug("Abbreviated = " + abbreviatedQuery)

        spellCheckedQuery = preprocessing_tools.spell_checker.spell_checker.spellCheck(
            abbreviatedQuery, PWL_FILE)
        print "Spellchecked = ", spellCheckedQuery
        logger.debug("Spellchecked = " + spellCheckedQuery)

        monthFilledQuery = preprocessing_tools.month_filler.month_filler.completeDate(
            spellCheckedQuery)
        print "MonthFilledQuery = ", monthFilledQuery
        logger.debug("MonthFilledQuery = " + monthFilledQuery)

        gapFilledQuery = preprocessing_tools.argument_filler.argument_filler.fillArguments(
            monthFilledQuery)
        print "GapFilledQuery = ", gapFilledQuery
        logger.debug("GapFilledQuery = " + gapFilledQuery)

        normalizedQuery = gapFilledQuery
        print "Final Normalized Query = ", gapFilledQuery
        print
        logger.debug("Final Normalized Query = " + gapFilledQuery)

        NERAnalyzedParse, chunkedParse = preprocessing_tools.corenlp.corenlp.identifyNER(
            normalizedQuery)
        print "NER Parse = ", NERAnalyzedParse
        print "Chunking = ", chunkedParse

        for index in range(0, len(chunkedParse)):
            # print NERAnalyzedParse[index], chunkedParse[index]
            extendedNerDF = preprocessing_tools.extended_ner.travel.travel_extended_ner.identifyExtendedNER(
                normalizedQuery, category, NERAnalyzedParse[index],
                last_requested_DF)

            disambiguatedDF = preprocessing_tools.category_disambiguator.category_disambiguator.disambiguateCategories(
                normalizedQuery, category, NERAnalyzedParse[index],
                chunkedParse[index], last_requested_DF)
            # print "Disambiguated = ",
            # print disambiguatedDF

            singleExtendedNerDF = preprocessing_tools.category_disambiguator.category_disambiguator.mergeDictionaries(
                extendedNerDF, disambiguatedDF)
            allExtendedNerDF = mergeDictionaries(allExtendedNerDF,
                                                 singleExtendedNerDF)

        if "0" in allExtendedNerDF:
            del allExtendedNerDF["0"]

        print "Final Analyzed NERs = ", allExtendedNerDF

    except:
        # print "Unexpected error:", sys.exc_info()
        logger.error(sys.exc_info()[1])

    finally:
        logger.info("LEAVING TRAVEL MODULE")
        return allExtendedNerDF
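
The try block above fixes the preprocessing order: normalize -> split -> expand abbreviations -> spell check -> fill months -> fill arguments -> NER. A small sketch of that ordering as a fold over named steps; the commented wiring reuses functions from this listing and is illustrative only:

# Hedged sketch: run the preprocessing steps in order, logging each intermediate query.
def run_pipeline(query, steps):
    for name, step in steps:
        query = step(query)
        print(name + " = " + query)
    return query

# Illustrative wiring with functions shown earlier in this listing:
# steps = [("Normalize",    normalize),
#          ("Splitted",     splitNumberString),
#          ("Abbreviated",  correctAbbreviation),
#          ("Spellchecked", lambda q: unigramSpellCheck(q, PWL_FILE)),
#          ("MonthFilled",  completeDate)]
# run_pipeline("i wanna go frm hyd to blr on 25th", steps)
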