예제 #1
0
    def __init__(self):
        """Pre-compile every regex declared in the shared grammar table.

        Each grammar rule gains two parallel dicts, ``matches_compiled``
        and ``semantics_compiled``, holding compiled pattern objects so
        later matching never recompiles. Also sets up the spellchecker
        when it is enabled in the constants module.
        """
        for rule in grammar:
            compiled_matches = {}
            for key, expr in rule['matches'].items():
                # match patterns are case-insensitive by design
                compiled_matches[key] = re.compile(expr, re.IGNORECASE)
            rule['matches_compiled'] = compiled_matches

            compiled_semantics = {}
            for key, expr in rule['semantics'].items():
                compiled_semantics[key] = re.compile(expr)
            rule['semantics_compiled'] = compiled_semantics

        if constants.SPELLCHECK:
            self.didyoumean = DidYouMean('en-us', constants.DICT_DIR)
예제 #2
0
def test_split_english_sentence():
    """A plain English sentence splits on whitespace."""
    checker = DidYouMean("en-US")
    expected = ["this", "is", "a", "test"]
    eq_(expected, checker._split_string("this is a test"))
예제 #3
0
def test_split_hindi_sentence():
    """A Devanagari (Hindi) sentence splits into its words."""
    checker = DidYouMean("hi-IN")
    expected = [u"यह", u"एक", u"नमून", u"है"]
    eq_(expected, checker._split_string(u"यह एक नमून है"))
예제 #4
0
def test_split_farsi_sentence():
    """Splitting also works for Farsi, a right-to-left script."""
    checker = DidYouMean("fa-IR")
    expected = [u"این", u"یک", u"جملهٔ", u"آزمایشی", u"است"]
    eq_(expected, checker._split_string(u"این یک جملهٔ آزمایشی است"))
예제 #5
0
def test_split_french_sentence():
    """Apostrophes (French elisions) do not break word splitting."""
    checker = DidYouMean("fr-FR")
    expected = ["c'est", "une", "d'example"]
    eq_(expected, checker._split_string("c'est une d'example"))
예제 #6
0
class InputParser:
    """Turn a free-form user sentence into a structured response dict.

    Matching is attempted in order: the hand-written regex grammar
    (optionally retried on a spell-corrected sentence), then an NLTK
    POS-tag/chunk fallback (``nlp_parse``).
    """

    def __init__(self):
        """Compile the regexes in grammar and set up the spellchecker."""
        for item in grammar:
            # match patterns are case-insensitive; semantics keep case
            item['matches_compiled'] = {
                name: re.compile(pattern, re.IGNORECASE)
                for name, pattern in item['matches'].items()}
            item['semantics_compiled'] = {
                name: re.compile(pattern)
                for name, pattern in item['semantics'].items()}

        if constants.SPELLCHECK:
            self.didyoumean = DidYouMean('en-us', constants.DICT_DIR)

    def parse(self, input):
        """
        Parse user input.

        @type input: C{string}
        @param input: string to be parsed
        @rtype: C{list}
        """

        # removes punctuation from the ends of the sentence.
        # BUG FIX: clean_input was previously computed but never used --
        # all parsing ran on the raw input.
        clean_input = input.strip(" !@#$%^&*()_+-=[]\\`~{}|;':\",./<>?")

        # spell-corrected variant (a dict when SPELLCHECK is on,
        # otherwise the string is passed back unchanged)
        input_s = self.spellcheck(clean_input)

        # check grammar, retrying with the corrected sentence if needed
        resp = self.check_grammar(clean_input)
        if constants.SPELLCHECK and not resp and input_s['full'] != clean_input:
            resp = self.check_grammar(input_s['full'])

        # use NLP parser as fallback
        if not resp:
            resp = self.nlp_parse(clean_input)

        if constants.SPELLCHECK and resp['type'] == 'nomatch' \
                and input_s['full'] != clean_input:
            resp = self.nlp_parse(input_s['full'])

        return resp

    def _pick_name(self, w):
        """Best guess at a place name from the keyword dict: the first
        proper noun if any, else the last common noun that is not itself
        one of the trigger keywords. Returns None when nothing fits."""
        r_name = w.get('NNP', [None])[0]
        if not r_name:
            for kw in reversed(w['NN']):
                if kw not in constants.NAME_KEYWORDS:
                    return kw
        return r_name

    def nlp_parse(self, input):
        """Parse input using NLTK parser+chunker.

        Returns a response dict whose 'type' key selects the matched
        intent; 'nomatch' means no rule fired.
        """
        resp = {'type': 'nomatch'}
        tagset = self.build_tagset(input)
        resp['words'] = self.build_keywords(tagset)
        w = resp['words']

        if not w:
            if constants.DEBUG:
                log.debug("No words: " + str(resp))
            return resp

        # store nouns
        NN_set = set(w.get('NN', []))

        # matches a request for a list
        if 'list' in NN_set \
                or 'List' in w.get('NNP', []):
            resp['count'] = w.get('CD', [constants.LIST_COUNT])[0]
            resp['type'] = 'show-list'
            if set(['serving', 'serve']) & set(w.get('VBG', [])):
                resp['meal'] = (NN_set & constants.MEALS_SET).pop()
            if 'in' in w.get('IN', []):
                resp['zone'] = w.get('NNP', [None])[0]
            if 'close' in w.get('VBD', []) \
                    or 'close' in w.get('JJ', []) \
                    or 'close' in NN_set:
                resp['distance'] = True
            return resp

        # finds neighborhood: remember the last past-tense verb and
        # wh-pronoun tokens ("what neighborhood is X in")
        last_vbd = ''
        last_wp = ''
        for token, tag in tagset:
            if tag == 'VBD':
                last_vbd = token
            if tag == 'WP':
                last_wp = token
        if 'neighborhood' in last_vbd and 'what' in last_wp:
            r_name = w.get('NNP', [None])[0]
            if not r_name:
                return resp
            resp['restaurant'] = r_name
            resp['type'] = 'name-zone'
            return resp

        # matches "how expensive it is" and "is it expensive"
        if 'expensive' in w.get('JJ', ()):
            r_name = w.get('NNP', [None])[0]
            if not r_name:
                return resp
            resp['restaurant'] = r_name
            resp['type'] = 'name-price'
            return resp

        if 'between' in w.get('IN', ()) \
                or 'price' in NN_set:
            price_range = w.get('CD', ())

            # price between a and b
            # require at least 2 numerals
            if len(price_range) >= 2:
                resp['min'] = min(map(int, price_range))
                resp['max'] = max(map(int, price_range))
                resp['type'] = 'list-price-range'
                return resp

            # price of exactly a (only one numeral present here)
            if len(price_range) > 0:
                resp['price'] = price_range[0]
                resp['type'] = 'list-price-single'
                return resp

        # need to merge NN and JJ for this step
        w['NNJJ'] = NN_set | set(w.get('JJ', []))
        meal = constants.MEALS_SET & w['NNJJ']
        if meal:
            resp['type'] = 'list-meal-single'
            resp['meal'] = meal.copy().pop()
            return resp

        # matches a quality list
        if 'quality' in NN_set and \
                (constants.QUALITIES & w['NNJJ']) and \
                (set(['food', 'service']) & w['NNJJ']):
            resp['degree'] = (constants.QUALITIES & w['NNJJ']).pop()
            resp['type'] = 'list-quality-' + \
                (set(['food', 'service']) & w['NNJJ']).pop()
            return resp

        # matches a phone number request
        if NN_set & constants.PHONE_KEYWORDS:
            resp['restaurant'] = w.get('NNP', [None])[0] or w['NN'][-1]
            resp['type'] = 'name-phone'
            return resp

        # matches a single meal request.
        # BUG FIX: this branch previously did `resp['meal'] = word.lower()`
        # where `word` was a leftover (token, tag) tuple from the loop
        # above, raising AttributeError; use the matched meal noun instead.
        if NN_set & constants.MEALS_SET:
            resp['restaurant'] = w.get('NNP', [None])[0] or w['NN'][-1]
            resp['type'] = 'name-meal'
            resp['meal'] = (NN_set & constants.MEALS_SET).pop().lower()
            return resp

        # matches a request for an address
        if 'address' in NN_set:
            resp['restaurant'] = w.get('NNP', [None])[0] or w['NN'][-1]
            resp['type'] = 'name-location'
            return resp

        # matches a restaurant in neighborhood
        if 'in' in w.get('IN', []) and \
                NN_set & constants.NAME_KEYWORDS:
            r_name = self._pick_name(w)
            if r_name:
                resp['type'] = 'random-city'
                resp['city'] = r_name.capitalize()
                return resp

        # matches a request for a cuisine type
        if NN_set & constants.NAME_KEYWORDS:
            r_name = self._pick_name(w)
            if r_name:
                resp['type'] = 'random-cuisine'
                resp['cuisine'] = r_name.capitalize()
                return resp

        # merge all numerals together for list-mode; first value that
        # parses as an integer selects a list item
        w['CDLS'] = set(w.get('CD', []) + w.get('LS', []))
        for numeral in w['CDLS']:
            try:
                resp['listitem'] = int(numeral)
            except ValueError:
                # not a plain integer numeral (e.g. "first") -- keep trying
                continue
            resp['type'] = 'single-listitem'
            return resp

        # distance / how far
        if ('far' in w.get('RB', [])
                and 'how' in w.get('WRB', [])) or ('distance' in NN_set):
            r = w.get('NNP', [None])[0]
            if r:
                resp['type'] = 'name-distance'
                resp['restaurant'] = r.capitalize()
                return resp

        if constants.DEBUG:
            log.debug(resp)
        return resp

    def format_keywords(self, keyword):
        """Drop a leading determiner/possessive chunk tag and return a
        shallow copy of the remaining (token, tag) pairs."""
        # check if first word is a DT or PP type and remove it if so
        if keyword[0][1][0] == 'D' or keyword[0][1][0] == 'P':
            keyword.remove(keyword[0])

        # return a copy so callers can't mutate the chunk leaves
        return list(keyword)

    def spellcheck(self, input):
        """Spell-correct *input* word by word.

        Returns a dict with 'full' (the corrected sentence), 'new'
        (replaced words) and 'old' (kept words) -- or the input string
        unchanged when spellchecking is disabled.
        """
        # check if enabled
        if not constants.SPELLCHECK:
            return input

        words = [(w.new, w.corrected, w.old)
                 for w in self.didyoumean.suggest(input)]

        full = []
        old = []
        new = []
        for suggestion, corrected, original in words:
            if constants.CAPITAL_REGEX.match(original):
                # capitalized words are assumed to be proper nouns: keep
                old.append(original)
                full.append(original)
            elif corrected or suggestion != original:
                # replaced, or re-capitalized by the checker
                new.append(suggestion)
                full.append(suggestion)
            else:
                old.append(original)
                full.append(original)
        return {
            'full': u' '.join(full),
            'new': new,
            'old': old,
        }

    def build_tagset(self, input):
        """Tokenize *input* and return its NLTK POS tagging."""
        tokens = nltk.word_tokenize(input)
        return nltk.pos_tag(tokens)

    def build_keywords(self, tagset):
        """Chunk *tagset* and bucket the leaves by POS tag.

        Returns a dict mapping tag -> list of tokens, e.g.
        {'NN': ['list'], 'CD': ['5']}.
        """
        keywords = {}

        # regexp grammar to catch our keywords
        #  <DT|PP\$>?: an optional determiner (a,the) or posesive (his, hers).
        #  <JJ.*>* zero or more adjectives of any type
        #  <NN.*>+ one or more nouns of any type
        grammar = r"""
        # chunk determiner/possessive, adjectives and nouns
            NP: {<DT|PP\$>?<JJ>*<NN.*>+}
        # numerals
            CD: {<CD|LS>}
        # chunk adjectives
            JJ: {<JJ.*|VBD>}
        # preposition or conjunction
            IN: {<IN>}
        # used for 'how far'
            RB: {<.*RB>}
        # used for 'serving'
            VBG: {<VBG>}
        """
        # parse for keywords
        regexp_parser = nltk.RegexpParser(grammar)
        tree = regexp_parser.parse(tagset)
        if constants.DEBUG:
            log.debug('Tree: ' + str(tree))

        # walk through the grammar tree and pick out keywords
        # (t.node is the pre-NLTK-3 spelling of t.label())
        wanted = ('NP', 'CD', 'JJ', 'IN', 'RB', 'VBG')
        for subtree in tree.subtrees(filter=lambda t: t.node in wanted):
            keyword = self.format_keywords(list(subtree.leaves()))

            # append keywords to the per-tag lists
            for token, tag in keyword:
                keywords.setdefault(tag, []).append(token)

        return keywords

    def check_grammar(self, input):
        """Match *input* against every grammar rule in order.

        Returns the filled response dict for the first rule whose match
        and semantic regexes all succeed, or False when none does.
        """
        resp = constants.DEFAULT_RESPONSE.copy()

        # start with matches = False
        matches = False

        for item in grammar:
            # every match pattern must hit for the rule to apply
            matches = all(pattern.match(input)
                          for pattern in item['matches_compiled'].values())

            if matches:
                # build the semantics: each named pattern must capture a
                # 'term' group, stored under its own name
                for name, semantic in item['semantics_compiled'].items():
                    match = semantic.match(input)
                    if not match:
                        matches = False
                    else:
                        resp[name] = match.group('term')

            if matches:
                resp['type'] = item['type']
                break

        if matches:
            if constants.DEBUG:
                log.debug(input + ' -- ' + str(resp))
            return resp
        else:
            if constants.DEBUG:
                log.debug(input + ' -- ' + str(False))
            return False
예제 #7
0
def test_simple_locale():
    """A bare language code (no region) is accepted; misspelling detected."""
    checker = DidYouMean('en')
    eq_(True, checker.check('fial'))
예제 #8
0
def test_wrong_capitalization():
    """Make sure that passing incorrectly capitalized locales still works"""
    checker = DidYouMean('en-us')
    eq_(False, checker.check('worng'))