Example #1
    def add_phrase(self, input_phrase='', phrase='',
                   p_phrase='', pp_phrase='',
                   user_freq=0, commit=True):
        '''
        Add phrase to database
        '''
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'input_phrase=%s phrase=%s user_freq=%s ',
                input_phrase.encode('UTF-8'),
                phrase.encode('UTF-8'),
                user_freq)
        if not input_phrase or not phrase:
            return
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        select_sqlstr = '''
        SELECT * FROM user_db.phrases
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        ;'''
        select_sqlargs = {
            'input_phrase': input_phrase,
            'phrase': phrase,
            'p_phrase': p_phrase,
            'pp_phrase': pp_phrase}
        if self.database.execute(select_sqlstr, select_sqlargs).fetchall():
            # there is already such a phrase, i.e. add_phrase was called
            # in error, do nothing to avoid duplicate entries.
            return

        insert_sqlstr = '''
        INSERT INTO user_db.phrases
        (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
        VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
        ;'''
        insert_sqlargs = {'input_phrase': input_phrase,
                          'phrase': phrase,
                          'p_phrase': p_phrase,
                          'pp_phrase': pp_phrase,
                          'user_freq': user_freq,
                          'timestamp': time.time()}
        if DEBUG_LEVEL > 1:
            LOGGER.debug('insert_sqlstr=%s', insert_sqlstr)
            LOGGER.debug('insert_sqlargs=%s', insert_sqlargs)
        try:
            self.database.execute(insert_sqlstr, insert_sqlargs)
            if commit:
                self.database.commit()
        except Exception:
            LOGGER.exception('Unexpected error adding phrase to database.')
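The commit parameter above exists so that callers can batch many inserts into a single transaction. A minimal usage sketch (the `db` object and the import_phrases() helper are hypothetical; they only assume the add_phrase() method and the `database` connection attribute shown above):

def import_phrases(db, pairs):
    '''Add many (input_phrase, phrase) pairs, committing only once.

    commit=False skips the per-row commit; a single commit() at the
    end keeps a large import fast.
    '''
    for input_phrase, phrase in pairs:
        db.add_phrase(input_phrase=input_phrase,
                      phrase=phrase,
                      user_freq=1,
                      commit=False)
    db.database.commit()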
Example #2
 def test_remove_accents(self) -> None:
     self.assertEqual(itb_util.remove_accents('abcÅøßẞüxyz'),
                      'abcAossSSuxyz')
     self.assertEqual(
         itb_util.remove_accents(unicodedata.normalize(
             'NFD', 'abcÅøßẞüxyz')), 'abcAossSSuxyz')
     self.assertEqual(
         unicodedata.normalize(
             'NFC', itb_util.remove_accents('abcÅøßẞüxyz', keep='åÅØø')),
         'abcÅøssSSuxyz')
     self.assertEqual(
         unicodedata.normalize(
             'NFC',
             itb_util.remove_accents(
                 unicodedata.normalize('NFD', 'abcÅøßẞüxyz'),
                 keep=unicodedata.normalize('NFD', 'åÅØø'))),
         'abcÅøssSSuxyz')
     self.assertEqual(
         unicodedata.normalize(
             'NFC',
             itb_util.remove_accents('alkoholförgiftning', keep='åÅÖö')),
         'alkoholförgiftning')
     self.assertEqual(
         unicodedata.normalize(
             'NFC',
             itb_util.remove_accents(
                 unicodedata.normalize('NFD', 'alkoholförgiftning'),
                 keep=unicodedata.normalize('NFD', 'åÅÖö'))),
         'alkoholförgiftning')
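For reference, a minimal sketch that reproduces the behaviour these tests exercise; the real itb_util.remove_accents() may be implemented differently. It decomposes to NFD and drops combining marks, transliterates the few letters such as 'ø' and 'ß' that have no canonical decomposition, and leaves any characters listed in keep untouched:

import unicodedata

# Assumed transliteration table for letters that the tests expect to
# change even though they carry no combining mark:
TRANSLIT = {ord('ø'): 'o', ord('Ø'): 'O', ord('ß'): 'ss', ord('ẞ'): 'SS'}

def remove_accents_sketch(text, keep=''):
    keep_chars = set(unicodedata.normalize('NFC', keep))
    result = []
    for char in unicodedata.normalize('NFC', text):
        if char in keep_chars:
            result.append(char)
            continue
        char = char.translate(TRANSLIT)
        result.append(''.join(
            c for c in unicodedata.normalize('NFD', char)
            if unicodedata.category(c) != 'Mn'))
    return ''.join(result)

assert remove_accents_sketch('abcÅøßẞüxyz') == 'abcAossSSuxyz'
assert remove_accents_sketch('abcÅøßẞüxyz', keep='åÅØø') == 'abcÅøssSSuxyz'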
Example #3
 def update_phrase(self,
                   input_phrase='',
                   phrase='',
                   p_phrase='',
                   pp_phrase='',
                   user_freq=0,
                   commit=True):
     '''
     update the user frequency of a phrase
     '''
     if not input_phrase or not phrase:
         return
     input_phrase = itb_util.remove_accents(input_phrase)
     input_phrase = unicodedata.normalize(self._normalization_form_internal,
                                          input_phrase)
     phrase = unicodedata.normalize(self._normalization_form_internal,
                                    phrase)
     p_phrase = unicodedata.normalize(self._normalization_form_internal,
                                      p_phrase)
     pp_phrase = unicodedata.normalize(self._normalization_form_internal,
                                       pp_phrase)
     sqlstr = '''
     UPDATE user_db.phrases
     SET user_freq = :user_freq, timestamp = :timestamp
     WHERE input_phrase = :input_phrase
      AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
     ;'''
     sqlargs = {
         'user_freq': user_freq,
         'input_phrase': input_phrase,
         'phrase': phrase,
         'p_phrase': p_phrase,
         'pp_phrase': pp_phrase,
         'timestamp': time.time()
     }
     if DEBUG_LEVEL > 1:
         sys.stderr.write("tabsqlitedb.update_phrase() sqlstr=%s\n" %
                          sqlstr)
         sys.stderr.write("tabsqlitedb.update_phrase() sqlargs=%s\n" %
                          sqlargs)
     try:
         self.db.execute(sqlstr, sqlargs)
         if commit:
             self.db.commit()
      except Exception:
         traceback.print_exc()
Example #4
 def update_phrase(self,
                   input_phrase='',
                   phrase='',
                   p_phrase='',
                   pp_phrase='',
                   user_freq=0,
                   commit=True):
     '''
     update the user frequency of a phrase
     '''
     if not input_phrase or not phrase:
         return
     input_phrase = itb_util.remove_accents(input_phrase)
     input_phrase = unicodedata.normalize(
         itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
     phrase = unicodedata.normalize(itb_util.NORMALIZATION_FORM_INTERNAL,
                                    phrase)
     p_phrase = unicodedata.normalize(itb_util.NORMALIZATION_FORM_INTERNAL,
                                      p_phrase)
     pp_phrase = unicodedata.normalize(itb_util.NORMALIZATION_FORM_INTERNAL,
                                       pp_phrase)
     sqlstr = '''
     UPDATE user_db.phrases
     SET user_freq = :user_freq, timestamp = :timestamp
     WHERE input_phrase = :input_phrase
      AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
     ;'''
     sqlargs = {
         'user_freq': user_freq,
         'input_phrase': input_phrase,
         'phrase': phrase,
         'p_phrase': p_phrase,
         'pp_phrase': pp_phrase,
         'timestamp': time.time()
     }
     if DEBUG_LEVEL > 1:
         LOGGER.debug('sqlstr=%s', sqlstr)
         LOGGER.debug('sqlargs=%s', sqlargs)
     try:
         self.database.execute(sqlstr, sqlargs)
         if commit:
             self.database.commit()
     except Exception:
         LOGGER.exception('Unexpected error updating phrase in user_db.')
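The :name placeholders in the statements above use the named-parameter binding of the sqlite3 module, which escapes values safely instead of building SQL by string formatting. A self-contained illustration (table layout and values are made up):

import sqlite3
import time

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE phrases (input_phrase TEXT, phrase TEXT, '
             'user_freq INTEGER, timestamp REAL)')
conn.execute(
    'INSERT INTO phrases VALUES '
    '(:input_phrase, :phrase, :user_freq, :timestamp)',
    {'input_phrase': 'co', 'phrase': 'colour',
     'user_freq': 1, 'timestamp': time.time()})
conn.execute(
    'UPDATE phrases SET user_freq = :user_freq WHERE phrase = :phrase',
    {'user_freq': 2, 'phrase': 'colour'})
conn.commit()
print(conn.execute('SELECT phrase, user_freq FROM phrases').fetchall())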
Example #5
    def load_dictionary(self):
        '''Load a hunspell dictionary and instantiate an
        enchant.Dict() or a hunspell.Hunspell() object.

        '''
        if DEBUG_LEVEL > 0:
            sys.stderr.write("load_dictionary() ...\n")
        (self.dic_path,
         self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if self.words:
            # List of languages where accent insensitive matching makes sense:
            accent_languages = (
                'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
                'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
                'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
                'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
                'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
                'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
                'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
                'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
                've', 'vi', 'wa', 'xh',
            )
            if self.name.split('_')[0] in accent_languages:
                self.word_pairs = [
                    (x, itb_util.remove_accents(x))
                    for x in self.words
                ]
            for word in self.words:
                if len(word) > self.max_word_len:
                    self.max_word_len = len(word)
            if DEBUG_LEVEL > 1:
                sys.stderr.write(
                    'load_dictionary() max_word_len = %s\n'
                    % self.max_word_len)
            if IMPORT_ENCHANT_SUCCESSFUL:
                self.enchant_dict = enchant.Dict(self.name)
            elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
                aff_path = self.dic_path.replace('.dic', '.aff')
                self.pyhunspell_object = hunspell.HunSpell(
                    self.dic_path, aff_path)
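The word_pairs list built above pairs every word with its accent-stripped form so that later lookups can match prefixes accent-insensitively (compare the suggest() method in Example #13 below). A sketch of such a lookup, assuming itb_util is importable:

def complete_accent_insensitive(word_pairs, typed):
    # Strip accents from the user input and compare it against the
    # pre-computed accent-stripped form of every dictionary word:
    typed_no_accents = itb_util.remove_accents(typed)
    return [word for (word, stripped) in word_pairs
            if stripped.startswith(typed_no_accents)]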
Example #6
 def update_phrase(self, input_phrase='', phrase='',
                   p_phrase='', pp_phrase='',
                   user_freq=0, commit=True):
     '''
     update the user frequency of a phrase
     '''
     if not input_phrase or not phrase:
         return
     input_phrase = itb_util.remove_accents(input_phrase)
     input_phrase = unicodedata.normalize(
         itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
     phrase = unicodedata.normalize(
         itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
     p_phrase = unicodedata.normalize(
         itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
     pp_phrase = unicodedata.normalize(
         itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
     sqlstr = '''
     UPDATE user_db.phrases
     SET user_freq = :user_freq, timestamp = :timestamp
     WHERE input_phrase = :input_phrase
      AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
     ;'''
     sqlargs = {'user_freq': user_freq,
                'input_phrase': input_phrase,
                'phrase': phrase,
                'p_phrase': p_phrase,
                'pp_phrase': pp_phrase,
                'timestamp': time.time()}
     if DEBUG_LEVEL > 1:
         sys.stderr.write(
             "TabSqliteDb.update_phrase() sqlstr=%s\n" %sqlstr)
         sys.stderr.write(
             "TabSqliteDb.update_phrase() sqlargs=%s\n" %sqlargs)
     try:
         self.db.execute(sqlstr, sqlargs)
         if commit:
             self.db.commit()
      except Exception:
         traceback.print_exc()
Example #7
    def load_dictionary(self):
        '''Load a hunspell dictionary and instantiate an
        enchant.Dict() or a hunspell.Hunspell() object.

        '''
        if DEBUG_LEVEL > 0:
            sys.stderr.write("load_dictionary() ...\n")
        (self.dic_path,
         self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if self.words:
            # List of languages where accent insensitive matching makes sense:
            accent_languages = (
                'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
                'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
                'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
                'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
                'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
                'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
                'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
                'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
                've', 'vi', 'wa', 'xh',
            )
            if self.name.split('_')[0] in accent_languages:
                self.word_pairs = [
                    (x, itb_util.remove_accents(x))
                    for x in self.words
                ]
            for x in self.words:
                if len(x) > self.max_word_len:
                    self.max_word_len = len(x)
            if DEBUG_LEVEL > 1:
                sys.stderr.write(
                    'load_dictionary() max_word_len = %s\n'
                    % self.max_word_len)
            if IMPORT_ENCHANT_SUCCESSFUL:
                self.enchant_dict = enchant.Dict(self.name)
            elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
                aff_path = self.dic_path.replace('.dic', '.aff')
                self.pyhunspell_object = hunspell.HunSpell(self.dic_path, aff_path)
Example #8
    def check_phrase_and_update_frequency(
            self, input_phrase='', phrase='', p_phrase='',
            pp_phrase='', user_freq_increment=1, commit=True):
        '''
        Check whether input_phrase and phrase are already in the database.
        If they are, increase the user frequency by user_freq_increment
        (1 by default); if not, add them.
        '''
        if not input_phrase:
            input_phrase = phrase
        if not phrase:
            return
        phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)

        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'phrase=%(p)s, input_phrase=%(t)s',
                {'p': phrase.encode('UTF-8'),
                 't': input_phrase.encode('UTF-8')})

        # There should never be more than 1 database row for the same
        # input_phrase *and* phrase. So the following query on
        # the database should match at most one database
        # row and the length of the result array should be 0 or
        # 1. So the “GROUP BY phrase” is actually redundant. It is
        # only a safeguard for the case when duplicate rows have been
        # added to the database accidentally (But in that case there
        # is a bug somewhere else which should be fixed).
        sqlstr = '''
        SELECT max(user_freq) FROM user_db.phrases
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        GROUP BY phrase
        ;'''
        sqlargs = {'input_phrase': input_phrase,
                   'phrase': phrase,
                   'p_phrase': p_phrase,
                   'pp_phrase': pp_phrase}
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s',
                sqlstr)
            LOGGER.debug(
                'TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s',
                sqlargs)
        result = self.database.execute(sqlstr, sqlargs).fetchall()
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'check_phrase_and_update_frequency() result=%s', result)
        if result:
            # A match was found in user_db, increase user frequency by
            # user_freq_increment (1 by default)
            self.update_phrase(input_phrase=input_phrase,
                               phrase=phrase,
                               p_phrase=p_phrase,
                               pp_phrase=pp_phrase,
                               user_freq=result[0][0]+user_freq_increment,
                               commit=commit)
            return
        # The phrase was not found in user_db.
        # Add it as a new phrase, i.e. with user_freq = user_freq_increment
        # (1 by default):
        self.add_phrase(input_phrase=input_phrase,
                        phrase=phrase,
                        p_phrase=p_phrase,
                        pp_phrase=pp_phrase,
                        user_freq=user_freq_increment,
                        commit=commit)
        return
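On SQLite 3.24 or newer the select-then-update-or-insert sequence above could also be written as one atomic UPSERT. A sketch only: it assumes a UNIQUE index over the four key columns, which the schema shown in these examples is not guaranteed to have:

UPSERT_SQLSTR = '''
INSERT INTO user_db.phrases
(input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
ON CONFLICT (input_phrase, phrase, p_phrase, pp_phrase)
DO UPDATE SET user_freq = user_freq + excluded.user_freq,
              timestamp = excluded.timestamp
;'''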
Example #9
    def select_words(self, input_phrase, p_phrase='', pp_phrase=''):
        '''
        Get phrases from database completing input_phrase.

        Returns a list of matches where each match is a tuple in the
        form of (phrase, user_freq), i.e. returns something like
        [(phrase, user_freq), ...]
        '''
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'input_phrase=%s p_phrase=%s pp_phrase=%s',
                input_phrase.encode('UTF-8'),
                p_phrase.encode('UTF-8'),
                pp_phrase.encode('UTF-8'))
        phrase_frequencies = {}
        if ' ' not in input_phrase:
            # Get suggestions from hunspell dictionaries. But only
            # if input_phrase does not contain spaces. The hunspell
            # dictionaries contain only single words, not sentences.
            # Trying to complete an input_phrase which contains spaces
            # will never work and spell checking suggestions by hunspell
            # for input which contains spaces is almost always nonsense.
            phrase_frequencies.update(
                self.hunspell_obj.suggest(input_phrase))
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'hunspell: best_candidates=%s',
                self.best_candidates(phrase_frequencies))
        # Remove the accents *after* getting the hunspell candidates.
        # If the accents were removed before getting the hunspell candidates
        # an input phrase like “Glühwürmchen” would not be added as a
        # candidate because hunspell would get “Gluhwurmchen” then and would
        # not validate that as a correct word. And, because “Glühwürmchen”
        # is not in the German hunspell dictionary as a single word but
        # created by suffix and prefix rules, the accent insensitive match
        # in the German hunspell dictionary would not find it either.
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        # Now phrase_frequencies might contain something like this:
        #
        # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0}

        # To quote a string to be used as a parameter when assembling
        # an sqlite statement with Python string operations, remove
        # all NUL characters, replace " with "" and wrap the whole
        # string in double quotes. Assembling sqlite statements using
        # parameters containing user input with python string operations
        # is not recommended because of the risk of SQL injection attacks
        # if the quoting is not done the right way. So it is better to use
        # the parameter substitution of the sqlite3 python interface.
        # But unfortunately that does not work when creating views,
        # (“OperationalError: parameters are not allowed in views”).
        quoted_input_phrase = input_phrase.replace(
            '\x00', '').replace('"', '""')
        self.database.execute('DROP VIEW IF EXISTS like_input_phrase_view;')
        sqlstr = '''
        CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS
        SELECT * FROM user_db.phrases
        WHERE input_phrase LIKE "%(quoted_input_phrase)s%%"
        ;''' % {'quoted_input_phrase': quoted_input_phrase}
        self.database.execute(sqlstr)
        sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase}
        sqlstr = (
            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
            + 'GROUP BY phrase;')
        results_uni = []
        try:
            # Get “unigram” data from user_db.
            #
            # Example: Let’s assume the user typed “co” and user_db contains
            #
            #     1|colou|colour|green|nice|1
            #     2|col|colour|yellow|ugly|2
            #     3|co|colour|green|awesome|1
            #     4|co|cold|||1
            #     5|conspirac|conspiracy|||5
            #     6|conspi|conspiracy|||1
            #     7|c|conspiracy|||1
            results_uni = self.database.execute(sqlstr, sqlargs).fetchall()
            # Then the result returned by .fetchall() is:
            #
            # [('colour', 4), ('cold', 1), ('conspiracy', 6)]
            #
            # (“c|conspiracy|1” is not selected because it doesn’t
            # match the user input “LIKE co%”! I.e. this is filtered
            # out by the VIEW created above already)
        except Exception:
            LOGGER.exception(
                'Unexpected error getting “unigram” data from user_db.')
        if not results_uni:
            # If no unigrams matched, bigrams and trigrams cannot
            # match either. We can stop here and return what we got
            # from hunspell.
            return self.best_candidates(phrase_frequencies)
        # Now normalize the unigram frequencies with the total count
        # (which is 11 in the above example), which gives us the
        # normalized result:
        # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)]
        sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;'
        try:
            count = self.database.execute(sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            LOGGER.exception(
                'Unexpected error getting total unigram count from user_db')
        # Updating the phrase_frequency dictionary with the normalized
        # results gives: {'conspiracy': 6/11, 'code': 0,
        # 'communicability': 0, 'cold': 1/11, 'colour': 4/11}
        for result_uni in results_uni:
            phrase_frequencies.update(
                [(result_uni[0], result_uni[1]/float(count))])
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'Unigram best_candidates=%s',
                self.best_candidates(phrase_frequencies))
        if not p_phrase:
            # If no context for bigram matching is available, return
            # what we have so far:
            return self.best_candidates(phrase_frequencies)
        sqlstr = (
            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase GROUP BY phrase;')
        results_bi = []
        try:
            results_bi = self.database.execute(sqlstr, sqlargs).fetchall()
        except Exception:
            LOGGER.exception(
                'Unexpected error getting “bigram” data from user_db')
        if not results_bi:
            # If no bigram could be matched, return what we have so far:
            return self.best_candidates(phrase_frequencies)
        # get the total count of p_phrase to normalize the bigram frequencies:
        sqlstr = (
            'SELECT sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase;')
        try:
            count_p_phrase = self.database.execute(
                sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            LOGGER.exception(
                'Unexpected error getting total bigram count from user_db')
        # Update the phrase frequency dictionary by using a linear
        # combination of the unigram and the bigram results, giving
        # both the weight of 0.5:
        for result_bi in results_bi:
            phrase_frequencies.update(
                [(result_bi[0],
                  0.5*result_bi[1]/float(count_p_phrase)
                  +0.5*phrase_frequencies[result_bi[0]])])
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'Bigram best_candidates=%s',
                self.best_candidates(phrase_frequencies))
        if not pp_phrase:
            # If no context for trigram matching is available, return
            # what we have so far:
            return self.best_candidates(phrase_frequencies)
        sqlstr = ('SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
                  + 'WHERE p_phrase = :p_phrase '
                  + 'AND pp_phrase = :pp_phrase GROUP BY phrase;')
        results_tri = []
        try:
            results_tri = self.database.execute(sqlstr, sqlargs).fetchall()
        except Exception:
            LOGGER.exception(
                'Unexpected error getting “trigram” data from user_db')
        if not results_tri:
            # if no trigram could be matched, return what we have so far:
            return self.best_candidates(phrase_frequencies)
        # get the total count of (p_phrase, pp_phrase) pairs to
        # normalize the bigram frequencies:
        sqlstr = (
            'SELECT sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;')
        try:
            count_pp_phrase_p_phrase = self.database.execute(
                sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            LOGGER.exception(
                'Unexpected error getting total trigram count from user_db')
        # Update the phrase frequency dictionary by using a linear
        # combination of the bigram and the trigram results, giving
        # both the weight of 0.5 (that makes the total weights: 0.25 *
        # unigram + 0.25 * bigram + 0.5 * trigram, i.e. the trigrams
        # get higher weight):
        for result_tri in results_tri:
            phrase_frequencies.update(
                [(result_tri[0],
                  0.5*result_tri[1]/float(count_pp_phrase_p_phrase)
                  +0.5*phrase_frequencies[result_tri[0]])])
        if DEBUG_LEVEL > 1:
            LOGGER.debug(
                'Trigram best_candidates=%s',
                self.best_candidates(phrase_frequencies))
        return self.best_candidates(phrase_frequencies)
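To make the weighting concrete, here is a small worked example of the two update steps above, reusing the 4/11 unigram value from the comments and made-up bigram and trigram values:

uni, bi, tri = 4/11, 2/3, 1.0
score = 0.5*bi + 0.5*uni     # after the bigram step: 0.5151...
score = 0.5*tri + 0.5*score  # after the trigram step: 0.7575...
assert abs(score - (0.5*tri + 0.25*bi + 0.25*uni)) < 1e-12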
Example #10
    def load_dictionary(self):
        '''Load a hunspell dictionary and instantiate an
        enchant.Dict() or a hunspell.Hunspell() object.

        '''
        if DEBUG_LEVEL > 0:
            LOGGER.debug('load_dictionary() ...')
        (self.dic_path, self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if self.words:
            # List of languages where accent insensitive matching makes sense:
            accent_languages = (
                'af',
                'ast',
                'az',
                'be',
                'bg',
                'br',
                'bs',
                'ca',
                'cs',
                'csb',
                'cv',
                'cy',
                'da',
                'de',
                'dsb',
                'el',
                'en',
                'es',
                'eu',
                'fi',
                'fo',
                'fr',
                'fur',
                'fy',
                'ga',
                'gd',
                'gl',
                'grc',
                'gv',
                'haw',
                'hr',
                'hsb',
                'ht',
                'hu',
                'ia',
                'is',
                'it',
                'kk',
                'ku',
                'ky',
                'lb',
                'ln',
                'lv',
                'mg',
                'mi',
                'mk',
                'mn',
                'mos',
                'mt',
                'nb',
                'nds',
                'nl',
                'nn',
                'nr',
                'nso',
                'ny',
                'oc',
                'pl',
                'plt',
                'pt',
                'qu',
                'quh',
                'ru',
                'sc',
                'se',
                'sh',
                'shs',
                'sk',
                'sl',
                'smj',
                'sq',
                'sr',
                'ss',
                'st',
                'sv',
                'tet',
                'tk',
                'tn',
                'ts',
                'uk',
                'uz',
                've',
                'vi',
                'wa',
                'xh',
            )
            if self.name.split('_')[0] in accent_languages:
                self.word_pairs = [(x, itb_util.remove_accents(x))
                                   for x in self.words]
            for word in self.words:
                if len(word) > self.max_word_len:
                    self.max_word_len = len(word)
            if DEBUG_LEVEL > 1:
                LOGGER.debug('max_word_len = %s', self.max_word_len)
            if self.name.split('_')[0] == 'fi':
                self.enchant_dict = None
                self.pyhunspell_object = None
                if IMPORT_LIBVOIKKO_SUCCESSFUL:
                    self.voikko = libvoikko.Voikko('fi')
                return
            if IMPORT_ENCHANT_SUCCESSFUL:
                try:
                    self.enchant_dict = enchant.Dict(self.name)
                except enchant.errors.DictNotFoundError:
                    LOGGER.exception('Error initializing enchant for %s',
                                     self.name)
                    self.enchant_dict = None
                except Exception:
                    LOGGER.exception(
                        'Unknown error initializing enchant for %s', self.name)
                    self.enchant_dict = None
            elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
                aff_path = self.dic_path.replace('.dic', '.aff')
                try:
                    self.pyhunspell_object = hunspell.HunSpell(
                        self.dic_path, aff_path)
                except hunspell.HunSpellError:
                    LOGGER.debug('Error initializing hunspell for %s',
                                 self.name)
                    self.pyhunspell_object = None
                except Exception:
                    LOGGER.debug('Unknown error initializing hunspell for %s',
                                 self.name)
                    self.pyhunspell_object = None
Example #11
    def load_dictionary(self):
        '''Load a hunspell dictionary and instantiate an
        enchant.Dict() or a hunspell.Hunspell() object.

        '''
        if DEBUG_LEVEL > 0:
            sys.stderr.write("load_dictionary() ...\n")
        dic_path = os.path.join(self.loc, self.name + '.dic')
        aff_path = os.path.join(self.loc, self.name + '.aff')
        if not os.path.isfile(dic_path) or not os.path.isfile(aff_path):
            sys.stderr.write(
                "load_dictionary %(n)s: %(d)s %(a)s file missing.\n" % {
                    'n': self.name,
                    'd': dic_path,
                    'a': aff_path
                })
            return
        aff_buffer = None
        dic_buffer = None
        try:
            with open(aff_path,
                      mode='r',
                      encoding='ISO-8859-1',
                      errors='ignore') as aff_file:
                aff_buffer = aff_file.read().replace('\r\n', '\n')
        except (FileNotFoundError, PermissionError):
            traceback.print_exc()
        except Exception:
            sys.stderr.write('Unexpected error loading .aff File: %s\n' %
                             aff_path)
            traceback.print_exc()
        if aff_buffer:
            encoding_pattern = re.compile(
                r'^[\s]*SET[\s]+(?P<encoding>[-a-zA-Z0-9_]+)[\s]*$',
                re.MULTILINE)
            match = encoding_pattern.search(aff_buffer)
            if match:
                self.encoding = match.group('encoding')
                if DEBUG_LEVEL > 0:
                    sys.stderr.write(
                        "load_dictionary(): encoding=%(enc)s found in %(aff)s"
                        % {
                            'enc': self.encoding,
                            'aff': aff_path
                        })
        try:
            with open(dic_path, encoding=self.encoding) as dic_file:
                dic_buffer = dic_file.readlines()
        except (UnicodeDecodeError, FileNotFoundError, PermissionError):
            if DEBUG_LEVEL > 0:
                sys.stderr.write(
                    "load_dictionary(): " +
                    "loading %(dic)s as %(enc)s encoding failed, " % {
                        'dic': dic_path,
                        'enc': self.encoding
                    } + "fall back to ISO-8859-1.\n")
            self.encoding = 'ISO-8859-1'
            try:
                with open(dic_path, encoding=self.encoding) as dic_file:
                    dic_buffer = dic_file.readlines()
            except (UnicodeDecodeError, FileNotFoundError, PermissionError):
                sys.stderr.write(
                    "load_dictionary(): " +
                    "loading %(dic)s as %(enc)s encoding failed, " % {
                        'dic': dic_path,
                        'enc': self.encoding
                    } + "giving up.\n")
                dic_buffer = None
                traceback.print_exc()
                return
            except Exception:
                sys.stderr.write('Unexpected error loading .dic File: %s\n' %
                                 dic_path)
                traceback.print_exc()
                return
        except Exception:
            sys.stderr.write('Unexpected error loading .dic File: %s\n' %
                             dic_path)
            traceback.print_exc()
            return
        if dic_buffer:
            if DEBUG_LEVEL > 0:
                sys.stderr.write(
                    "load_dictionary(): " +
                    "Successfully loaded %(dic)s using %(enc)s encoding.\n" % {
                        'dic': dic_path,
                        'enc': self.encoding
                    })
            # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
            #
            # > A dictionary file (*.dic) contains a list of words, one per
            # > line. The first line of the dictionaries (except personal
            # > dictionaries) contains the word count. Each word may
            # > optionally be followed by a slash ("/") and one or more
            # > flags, which represents affixes or special attributes.
            #
            # Therefore, remove '/' and the following flags from each
            # line to make the buffer a bit smaller and the regular
            # expressions we use later to match words in the
            # dictionary slightly simpler and maybe a tiny bit faster:
            self.words = [
                unicodedata.normalize(NORMALIZATION_FORM_INTERNAL,
                                      re.sub(r'/.*', '', x.replace('\n', '')))
                for x in dic_buffer
            ]
            # List of languages where accent insensitive matching makes sense:
            accent_languages = (
                'af',
                'ast',
                'az',
                'be',
                'bg',
                'br',
                'bs',
                'ca',
                'cs',
                'csb',
                'cv',
                'cy',
                'da',
                'de',
                'dsb',
                'el',
                'en',
                'es',
                'eu',
                'fo',
                'fr',
                'fur',
                'fy',
                'ga',
                'gd',
                'gl',
                'grc',
                'gv',
                'haw',
                'hr',
                'hsb',
                'ht',
                'hu',
                'ia',
                'is',
                'it',
                'kk',
                'ku',
                'ky',
                'lb',
                'ln',
                'lv',
                'mg',
                'mi',
                'mk',
                'mn',
                'mos',
                'mt',
                'nb',
                'nds',
                'nl',
                'nn',
                'nr',
                'nso',
                'ny',
                'oc',
                'pl',
                'plt',
                'pt',
                'qu',
                'quh',
                'ru',
                'sc',
                'se',
                'sh',
                'shs',
                'sk',
                'sl',
                'smj',
                'sq',
                'sr',
                'ss',
                'st',
                'sv',
                'tet',
                'tk',
                'tn',
                'ts',
                'uk',
                'uz',
                've',
                'vi',
                'wa',
                'xh',
            )
            if self.name.split('_')[0] in accent_languages:
                self.word_pairs = [(x, itb_util.remove_accents(x))
                                   for x in self.words]
            if IMPORT_ENCHANT_SUCCESSFUL:
                self.enchant_dict = enchant.Dict(self.name)
            elif IMPORT_HUNSPELL_SUCCESSFUL:
                self.pyhunspell_object = hunspell.HunSpell(dic_path, aff_path)
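The "SET <encoding>" detection used in this example, reduced to a stand-alone helper (the sample buffer is made up):

import re

def detect_aff_encoding(aff_buffer, default='ISO-8859-1'):
    # A hunspell .aff file declares its encoding on a line like
    # "SET UTF-8"; fall back to the given default if no such line exists.
    match = re.search(
        r'^[\s]*SET[\s]+(?P<encoding>[-a-zA-Z0-9_]+)[\s]*$',
        aff_buffer, re.MULTILINE)
    return match.group('encoding') if match else default

assert detect_aff_encoding('# comment\nSET UTF-8\nTRY esian\n') == 'UTF-8'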
Example #12
    def add_phrase(self,
                   input_phrase='',
                   phrase='',
                   p_phrase='',
                   pp_phrase='',
                   user_freq=0,
                   commit=True):
        '''
        Add phrase to database
        '''
        if DEBUG_LEVEL > 1:
            sys.stderr.write("tabsqlitedb.add_phrase() " + "input_phrase=%s " %
                             input_phrase.encode('UTF-8') +
                             "phrase=%s " % phrase.encode('UTF-8') +
                             "user_freq=%s " % user_freq)
        if not input_phrase or not phrase:
            return
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(self._normalization_form_internal,
                                             input_phrase)
        phrase = unicodedata.normalize(self._normalization_form_internal,
                                       phrase)
        p_phrase = unicodedata.normalize(self._normalization_form_internal,
                                         p_phrase)
        pp_phrase = unicodedata.normalize(self._normalization_form_internal,
                                          pp_phrase)
        select_sqlstr = '''
        SELECT * FROM user_db.phrases
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        ;'''
        select_sqlargs = {
            'input_phrase': input_phrase,
            'phrase': phrase,
            'p_phrase': p_phrase,
            'pp_phrase': pp_phrase
        }
        if self.db.execute(select_sqlstr, select_sqlargs).fetchall():
            # there is already such a phrase, i.e. add_phrase was called
            # in error, do nothing to avoid duplicate entries.
            return

        insert_sqlstr = '''
        INSERT INTO user_db.phrases
        (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
        VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
        ;'''
        insert_sqlargs = {
            'input_phrase': input_phrase,
            'phrase': phrase,
            'p_phrase': p_phrase,
            'pp_phrase': pp_phrase,
            'user_freq': user_freq,
            'timestamp': time.time()
        }
        if DEBUG_LEVEL > 1:
            sys.stderr.write("tabsqlitedb.add_phrase() insert_sqlstr=%s\n" %
                             insert_sqlstr)
            sys.stderr.write("tabsqlitedb.add_phrase() insert_sqlargs=%s\n" %
                             insert_sqlargs)
        try:
            self.db.execute(insert_sqlstr, insert_sqlargs)
            if commit:
                self.db.commit()
        except Exception:
            traceback.print_exc()
Example #13
    def suggest(self, input_phrase):
        # pylint: disable=line-too-long
        '''Return completions or corrections for the input phrase

        :param input_phrase: A string to find completions or corrections for
        :type input_phrase: String
        :rtype: A list of tuples of the form (<word>, <score>)
                <score> can have these values:
                    0: This is a completion, i.e. input_phrase matches
                       the beginning of <word> (accent insensitive match)
                   -1: This is a spell checking correction from hunspell
                       (i.e. either from enchant or pyhunspell)

        Examples:

        (Attention, the return values are in internal
        normalization form ('NFD'))

        >>> h = Hunspell(['de_DE', 'cs_CZ'])
        >>> h.suggest('Geschwindigkeitsubertre')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Glühwürmchen')[0]
        ('Glühwürmchen', 0)

        >>> h.suggest('Alpengluhen')[0]
        ('Alpenglühen', 0)

        >>> h.suggest('filosofictejs')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtější')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtějš')[0]
        ('filosofičtější', 0)

        >>> h = Hunspell(['it_IT'])
        >>> h.suggest('principianti')
        [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]

        >>> h = Hunspell(['es_ES'])
        >>> h.suggest('teneis')
        [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]

        >>> h.suggest('tenéis')[0]
        ('tenéis', 0)

        >>> h = Hunspell(['en_US'])
        >>> h.suggest('camel')
        [('camel', 0), ('camellia', 0), ('camelhair', 0), ('came', -1), ('Camel', -1), ('cameo', -1), ('came l', -1), ('camels', -1)]

        >>> h = Hunspell(['fr_FR'])
        >>> h.suggest('differemmen')
        [('différemment', 0)]

        >>> h = Hunspell(['None'])
        >>> h.suggest('camel')
        []

        >>> h = Hunspell(['None', 'en_US'])
        >>> h.suggest('camel')
        [('camel', 0), ('camellia', 0), ('camelhair', 0), ('came', -1), ('Camel', -1), ('cameo', -1), ('came l', -1), ('camels', -1)]

        '''
        # pylint: enable=line-too-long
        if input_phrase in self._suggest_cache:
            return self._suggest_cache[input_phrase]
        if DEBUG_LEVEL > 1:
            LOGGER.debug("Hunspell.suggest() input_phrase=%(ip)s",
                         {'ip': input_phrase.encode('UTF-8')})
        # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
        #
        # > A dictionary file (*.dic) contains a list of words, one per
        # > line. The first line of the dictionaries (except personal
        # > dictionaries) contains the word count. Each word may
        # > optionally be followed by a slash ("/") and one or more
        # > flags, which represents affixes or special attributes.
        #
        # I.e. if '/' is already contained in the input, it cannot
        # match a word in the dictionary and we return an empty list
        # immediately:
        if '/' in input_phrase:
            self._suggest_cache[input_phrase] = []
            return []
        # make sure input_phrase is in the internal normalization form (NFD):
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        input_phrase_no_accents = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL,
            itb_util.remove_accents(input_phrase))
        # But enchant and pyhunspell want NFC as input, make a copy in NFC:
        input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)

        suggested_words = {}
        for dictionary in self._dictionaries:
            if dictionary.words:
                # If the input phrase is longer than the maximum
                # word length in a dictionary, don’t try to
                # complete it; that would just waste time.
                if len(input_phrase) <= dictionary.max_word_len:
                    if dictionary.word_pairs:
                        suggested_words.update([
                            (x[0], 0) for x in dictionary.word_pairs
                            if x[1].startswith(input_phrase_no_accents)
                        ])
                    else:
                        suggested_words.update([(x, 0)
                                                for x in dictionary.words
                                                if x.startswith(input_phrase)])
                if len(input_phrase) >= 4:
                    if dictionary.spellcheck(input_phrase):
                        # This is a valid word in this dictionary.
                        # It might have been missed by the
                        # matching above because the dictionary
                        # might not contain all possible word
                        # forms (The prefix and suffix information
                        # has been ignored). But the spell checker
                        # knows about this, if the spell checker
                        # thinks it is a correct word, it must be
                        # counted as a match of course:
                        suggested_words[input_phrase] = 0
                    extra_suggestions = [
                        unicodedata.normalize(
                            itb_util.NORMALIZATION_FORM_INTERNAL, x)
                        for x in dictionary.spellcheck_suggest(input_phrase)
                    ]
                    suggested_words.update([
                        (suggestion, -1) for suggestion in extra_suggestions
                        if suggestion not in suggested_words
                    ])
        for word in suggested_words:
            if (suggested_words[word] == -1 and itb_util.remove_accents(word)
                    == itb_util.remove_accents(input_phrase)):
                # This spell checking correction is actually even
                # an accent insensitive match, adjust accordingly:
                suggested_words[word] = 0
        sorted_suggestions = sorted(
            suggested_words.items(),
            key=lambda x: (
                -x[1],  # 0: in dictionary, -1: hunspell
                len(x[0]),  # length of word ascending
                x[0],  # alphabetical
            ))[0:MAX_WORDS]
        self._suggest_cache[input_phrase] = sorted_suggestions
        return sorted_suggestions
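The sort key above, shown in isolation with made-up scores: completions (score 0) sort before corrections (score -1), shorter words come first, and ties break alphabetically:

suggested_words = {'came': -1, 'camel': 0, 'camellia': 0, 'camelhair': 0}
sorted_suggestions = sorted(
    suggested_words.items(),
    key=lambda x: (-x[1], len(x[0]), x[0]))
assert sorted_suggestions == [
    ('camel', 0), ('camellia', 0), ('camelhair', 0), ('came', -1)]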
Example #14
    def check_phrase_and_update_frequency(
            self, input_phrase='', phrase='', p_phrase='',
            pp_phrase='', user_freq_increment=1, commit=True):
        '''
        Check whether input_phrase and phrase are already in the database.
        If they are, increase the user frequency by user_freq_increment
        (1 by default); if not, add them.
        '''
        if not input_phrase:
            input_phrase = phrase
        if not phrase:
            return
        phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)

        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.check_phrase_and_update_frequency() "
                + "phrase=%(p)s, input_phrase=%(t)s\n"
                %{'p': phrase.encode('UTF-8'),
                  't': input_phrase.encode('UTF-8')})

        # There should never be more than 1 database row for the same
        # input_phrase *and* phrase. So the following query on
        # the database should match at most one database
        # row and the length of the result array should be 0 or
        # 1. So the “GROUP BY phrase” is actually redundant. It is
        # only a safeguard for the case when duplicate rows have been
        # added to the database accidentally (But in that case there
        # is a bug somewhere else which should be fixed).
        sqlstr = '''
        SELECT max(user_freq) FROM user_db.phrases
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        GROUP BY phrase
        ;'''
        sqlargs = {'input_phrase': input_phrase,
                   'phrase': phrase,
                   'p_phrase': p_phrase,
                   'pp_phrase': pp_phrase}
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s\n"
                %sqlstr)
            sys.stderr.write(
                "TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s\n"
                %sqlargs)
        result = self.db.execute(sqlstr, sqlargs).fetchall()
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "check_phrase_and_update_frequency() result=%s\n" %result)
        if result:
            # A match was found in user_db, increase user frequency by
            # user_freq_increment (1 by default)
            self.update_phrase(input_phrase=input_phrase,
                               phrase=phrase,
                               p_phrase=p_phrase,
                               pp_phrase=pp_phrase,
                               user_freq=result[0][0]+user_freq_increment,
                               commit=commit)
            return
        # The phrase was not found in user_db.
        # Add it as a new phrase, i.e. with user_freq = user_freq_increment
        # (1 by default):
        self.add_phrase(input_phrase=input_phrase,
                        phrase=phrase,
                        p_phrase=p_phrase,
                        pp_phrase=pp_phrase,
                        user_freq=user_freq_increment,
                        commit=commit)
        return
Example #15
    def select_words(self, input_phrase, p_phrase='', pp_phrase=''):
        '''
        Get phrases from database completing input_phrase.

        Returns a list of matches where each match is a tuple in the
        form of (phrase, user_freq), i.e. returns something like
        [(phrase, user_freq), ...]
        '''
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.select_words() "
                + "input_phrase=%s " % input_phrase.encode('UTF-8')
                + "p_phrase=%s " % p_phrase.encode('UTF-8')
                + "pp_phrase=%s\n" % pp_phrase.encode('UTF-8'))
        phrase_frequencies = {}
        if ' ' not in input_phrase:
            # Get suggestions from hunspell dictionaries. But only
            # if input_phrase does not contain spaces. The hunspell
            # dictionaries contain only single words, not sentences.
            # Trying to complete an input_phrase which contains spaces
            # will never work and spell checking suggestions by hunspell
            # for input which contains spaces is almost always nonsense.
            phrase_frequencies.update(
                self.hunspell_obj.suggest(input_phrase))
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.select_words() hunspell: best_candidates=%s\n"
                %self.best_candidates(phrase_frequencies))
        # Remove the accents *after* getting the hunspell candidates.
        # If the accents were removed before getting the hunspell candidates
        # an input phrase like “Glühwürmchen” would not be added as a
        # candidate because hunspell would get “Gluhwurmchen” then and would
        # not validate that as a correct word. And, because “Glühwürmchen”
        # is not in the German hunspell dictionary as a single word but
        # created by suffix and prefix rules, the accent insensitive match
        # in the German hunspell dictionary would not find it either.
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        # Now phrase_frequencies might contain something like this:
        #
        # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0}

        # To quote a string to be used as a parameter when assembling
        # an sqlite statement with Python string operations, remove
        # all NUL characters, replace " with "" and wrap the whole
        # string in double quotes. Assembling sqlite statements using
        # parameters containing user input with python string operations
        # is not recommended because of the risk of SQL injection attacks
        # if the quoting is not done the right way. So it is better to use
        # the parameter substitution of the sqlite3 python interface.
        # But unfortunately that does not work when creating views,
        # (“OperationalError: parameters are not allowed in views”).
        quoted_input_phrase = input_phrase.replace(
            '\x00', '').replace('"', '""')
        self.db.execute('DROP VIEW IF EXISTS like_input_phrase_view;')
        sqlstr = '''
        CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS
        SELECT * FROM user_db.phrases
        WHERE input_phrase LIKE "%(quoted_input_phrase)s%%"
        ;''' % {'quoted_input_phrase': quoted_input_phrase}
        self.db.execute(sqlstr)
        sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase}
        sqlstr = (
            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
            + 'GROUP BY phrase;')
        results_uni = []
        try:
            # Get “unigram” data from user_db.
            #
            # Example: Let’s assume the user typed “co” and user_db contains
            #
            #     1|colou|colour|green|nice|1
            #     2|col|colour|yellow|ugly|2
            #     3|co|colour|green|awesome|1
            #     4|co|cold|||1
            #     5|conspirac|conspiracy|||5
            #     6|conspi|conspiracy|||1
            #     7|c|conspiracy|||1
            results_uni = self.db.execute(sqlstr, sqlargs).fetchall()
            # Then the result returned by .fetchall() is:
            #
            # [('colour', 4), ('cold', 1), ('conspiracy', 6)]
            #
            # (“c|conspiracy|1” is not selected because it doesn’t
            # match the user input “LIKE co%”! I.e. this is filtered
            # out by the VIEW created above already)
        except Exception:
            traceback.print_exc()
        if not results_uni:
            # If no unigrams matched, bigrams and trigrams cannot
            # match either. We can stop here and return what we got
            # from hunspell.
            return self.best_candidates(phrase_frequencies)
        # Now normalize the unigram frequencies with the total count
        # (which is 11 in the above example), which gives us the
        # normalized result:
        # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)]
        sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;'
        try:
            count = self.db.execute(sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            traceback.print_exc()
        # Updating the phrase_frequency dictionary with the normalized
        # results gives: {'conspiracy': 6/11, 'code': 0,
        # 'communicability': 0, 'cold': 1/11, 'colour': 4/11}
        for x in results_uni:
            phrase_frequencies.update([(x[0], x[1]/float(count))])
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.select_words() Unigram best_candidates=%s\n"
                %self.best_candidates(phrase_frequencies))
        if not p_phrase:
            # If no context for bigram matching is available, return
            # what we have so far:
            return self.best_candidates(phrase_frequencies)
        sqlstr = (
            'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase GROUP BY phrase;')
        results_bi = []
        try:
            results_bi = self.db.execute(sqlstr, sqlargs).fetchall()
        except Exception:
            traceback.print_exc()
        if not results_bi:
            # If no bigram could be matched, return what we have so far:
            return self.best_candidates(phrase_frequencies)
        # get the total count of p_phrase to normalize the bigram frequencies:
        sqlstr = (
            'SELECT sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase;')
        try:
            count_p_phrase = self.db.execute(sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            traceback.print_exc()
            return self.best_candidates(phrase_frequencies)
        # Update the phrase frequency dictionary by using a linear
        # combination of the unigram and the bigram results, giving
        # both the weight of 0.5:
        for x in results_bi:
            phrase_frequencies.update(
                [(x[0],
                  0.5*x[1]/float(count_p_phrase)
                  +0.5*phrase_frequencies[x[0]])])
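        # Worked example (hypothetical numbers): if 'colour' has the
        # unigram weight 4/11 ≈ 0.364 and a bigram count of 3 out of
        # count_p_phrase == 5, its updated weight becomes
        #     0.5 * 3/5 + 0.5 * 4/11 ≈ 0.3 + 0.182 = 0.482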
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.select_words() Bigram best_candidates=%s\n"
                %self.best_candidates(phrase_frequencies))
        if not pp_phrase:
            # If no context for trigram matching is available, return
            # what we have so far:
            return self.best_candidates(phrase_frequencies)
        sqlstr = ('SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
                  + 'WHERE p_phrase = :p_phrase '
                  + 'AND pp_phrase = :pp_phrase GROUP BY phrase;')
        results_tri = []
        try:
            results_tri = self.db.execute(sqlstr, sqlargs).fetchall()
        except Exception:
            traceback.print_exc()
        if not results_tri:
            # If no trigram could be matched, return what we have so far:
            return self.best_candidates(phrase_frequencies)
        # get the total count of (p_phrase, pp_phrase) pairs to
        # normalize the trigram frequencies:
        sqlstr = (
            'SELECT sum(user_freq) FROM like_input_phrase_view '
            + 'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;')
        try:
            count_pp_phrase_p_phrase = self.db.execute(
                sqlstr, sqlargs).fetchall()[0][0]
        except Exception:
            traceback.print_exc()
            return self.best_candidates(phrase_frequencies)
        # Update the phrase frequency dictionary by using a linear
        # combination of the bigram and the trigram results, giving
        # both the weight of 0.5 (that makes the total weights: 0.25 *
        # unigram + 0.25 * bigram + 0.5 * trigram, i.e. the trigrams
        # get higher weight):
        for x in results_tri:
            phrase_frequencies.update(
                [(x[0],
                  0.5*x[1]/float(count_pp_phrase_p_phrase)
                  +0.5*phrase_frequencies[x[0]])])
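        # Worked example (hypothetical numbers, continuing the bigram
        # example above): if 'colour' left the bigram step with weight
        # ≈ 0.482 and has a trigram count of 2 out of
        # count_pp_phrase_p_phrase == 4, its final weight is
        #     0.5 * 2/4 + 0.5 * 0.482 ≈ 0.25 + 0.241 = 0.491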
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.select_words() Trigram best_candidates=%s\n"
                %self.best_candidates(phrase_frequencies))
        return self.best_candidates(phrase_frequencies)
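# A minimal standalone sketch of the back-off interpolation used in the
# method above, without the SQL layer. The function interpolate() and
# its arguments are hypothetical names, not part of the original code;
# each argument is a dict mapping phrase -> raw count for the matching
# unigrams, bigrams and trigrams.
def interpolate(unigrams, bigrams, trigrams):
    total = float(sum(unigrams.values()))
    weights = {phrase: count / total for phrase, count in unigrams.items()}
    for ngrams in (bigrams, trigrams):
        if not ngrams:
            # No context matched: stop backing off, keep current weights.
            return weights
        total = float(sum(ngrams.values()))
        for phrase, count in ngrams.items():
            # The same linear blend as above: 0.5 * new + 0.5 * old.
            weights[phrase] = 0.5 * count / total + 0.5 * weights.get(phrase, 0)
    return weights

# interpolate({'colour': 4, 'cold': 1, 'conspiracy': 6},
#             {'colour': 3, 'cold': 2}, {})
# gives 'colour' the weight 0.5 * 3/5 + 0.5 * 4/11 ≈ 0.482.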
Example No. 16
    def add_phrase(self, input_phrase='', phrase='',
                   p_phrase='', pp_phrase='',
                   user_freq=0, commit=True):
        '''
        Add phrase to database
        '''
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.add_phrase() "
                + "input_phrase=%s " % input_phrase.encode('UTF-8')
                + "phrase=%s " % phrase.encode('UTF-8')
                + "user_freq=%s " % user_freq
            )
        if not input_phrase or not phrase:
            return
        input_phrase = itb_util.remove_accents(input_phrase)
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
        p_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
        pp_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
        select_sqlstr = '''
        SELECT * FROM user_db.phrases
        WHERE input_phrase = :input_phrase
        AND phrase = :phrase AND p_phrase = :p_phrase AND pp_phrase = :pp_phrase
        ;'''
        select_sqlargs = {
            'input_phrase': input_phrase,
            'phrase': phrase,
            'p_phrase': p_phrase,
            'pp_phrase': pp_phrase}
        if self.db.execute(select_sqlstr, select_sqlargs).fetchall():
            # there is already such a phrase, i.e. add_phrase was called
            # in error, do nothing to avoid duplicate entries.
            return

        insert_sqlstr = '''
        INSERT INTO user_db.phrases
        (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
        VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase, :user_freq, :timestamp)
        ;'''
        insert_sqlargs = {'input_phrase': input_phrase,
                          'phrase': phrase,
                          'p_phrase': p_phrase,
                          'pp_phrase': pp_phrase,
                          'user_freq': user_freq,
                          'timestamp': time.time()}
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                "TabSqliteDb.add_phrase() insert_sqlstr=%s\n" %insert_sqlstr)
            sys.stderr.write(
                "TabSqliteDb.add_phrase() insert_sqlargs=%s\n" %insert_sqlargs)
        try:
            self.db.execute(insert_sqlstr, insert_sqlargs)
            if commit:
                self.db.commit()
        except Exception:
            traceback.print_exc()
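# Hypothetical usage of add_phrase() above (the variable db and the
# constructor call are assumptions, not part of the original code):
# record that the user typed 'co' and committed 'colour' after the
# context words 'nice green':
#
#     db = TabSqliteDb()
#     db.add_phrase(input_phrase='co', phrase='colour',
#                   p_phrase='green', pp_phrase='nice',
#                   user_freq=1, commit=True)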
Example No. 17
    def suggest(self, input_phrase):
        '''Return completions or corrections for the input phrase

        :param input_phrase: A string to find completions or corrections for
        :type input_phrase: String
        :rtype: A list of tuples of the form (<word>, <score>)
                <score> can have these values:
                    0: This is a completion, i.e. input_phrase matches
                       the beginning of <word> (accent insensitive match)
                   -1: This is a spell checking correction from hunspell
                       (i.e. either from enchant or pyhunspell)

        Examples:

        (Attention, the return values are in internal normalization form ('NFD'))

        >>> h = Hunspell(['de_DE', 'cs_CZ'])
        >>> h.suggest('Geschwindigkeitsubertre')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
        ('Geschwindigkeitsübertretungsverfahren', 0)

        >>> h.suggest('Glühwürmchen')[0]
        ('Glühwürmchen', 0)

        >>> h.suggest('Alpengluhen')[0]
        ('Alpenglühen', 0)

        >>> h.suggest('filosofictejsi')
        [('filosofičtější', 0), ('filosofičtěji', -1)]

        >>> h.suggest('filosofictejs')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtější')[0]
        ('filosofičtější', 0)

        >>> h.suggest('filosofičtějš')[0]
        ('filosofičtější', 0)

        >>> h = Hunspell(['it_IT'])
        >>> h.suggest('principianti')
        [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]

        >>> h = Hunspell(['es_ES'])
        >>> h.suggest('teneis')
        [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]

        >>> h.suggest('tenéis')[0]
        ('tenéis', 0)
        '''
        if input_phrase in self._suggest_cache:
            return self._suggest_cache[input_phrase]
        if DEBUG_LEVEL > 1:
            sys.stderr.write("Hunspell.suggest() input_phrase=%(ip)s\n" %
                             {'ip': input_phrase.encode('UTF-8')})
        # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
        #
        # > A dictionary file (*.dic) contains a list of words, one per
        # > line. The first line of the dictionaries (except personal
        # > dictionaries) contains the word count. Each word may
        # > optionally be followed by a slash ("/") and one or more
        # > flags, which represents affixes or special attributes.
        #
        # I.e. if '/' is already contained in the input, it cannot
        # match a word in the dictionary and we return an empty list
        # immediately:
        if '/' in input_phrase:
            self._suggest_cache[input_phrase] = []
            return []
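        # For example, suggest('foo/bar') returns [] here and caches
        # the empty result for this input.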
        # make sure input_phrase is in the internal normalization form (NFD):
        input_phrase = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
        input_phrase_no_accents = unicodedata.normalize(
            itb_util.NORMALIZATION_FORM_INTERNAL,
            itb_util.remove_accents(input_phrase))
        # But enchant and pyhunspell want NFC as input, make a copy in NFC:
        input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)
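        # For example, for the input 'ü': NFC is the single code point
        # '\u00fc', while the internal NFD form is 'u' followed by the
        # combining diaeresis '\u0308'.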

        suggested_words = {}
        for dictionary in self._dictionaries:
            if dictionary.words:
                # If the input phrase is longer than the maximum
                # word length in a dictionary, don’t try to complete
                # it; that would just waste time.
                if len(input_phrase) <= dictionary.max_word_len:
                    if dictionary.word_pairs:
                        suggested_words.update([
                            (x[0], 0) for x in dictionary.word_pairs
                            if x[1].startswith(input_phrase_no_accents)
                        ])
                    else:
                        suggested_words.update([(x, 0)
                                                for x in dictionary.words
                                                if x.startswith(input_phrase)])
                if dictionary.enchant_dict:
                    if len(input_phrase) >= 4:
                        # Always pass NFC to enchant and convert the
                        # result back to the internal normalization
                        # form (NFD) (enchant does the right thing for
                        # Korean if the input is NFC).  enchant takes
                        # unicode strings and returns unicode strings,
                        # no encoding and decoding to and from the
                        # hunspell dictionary encoding is necessary
                        # (neither for Python2 nor Python3).
                        # (pyhunspell needs to get its input passed
                        # in dictionary encoding and also returns it
                        # in dictionary encoding).
                        if dictionary.enchant_dict.check(input_phrase_nfc):
                            # This is a valid word in this dictionary.
                            # It might have been missed by the matching
                            # above because the dictionary might not
                            # contain all possible word forms (The
                            # prefix and suffix information has been
                            # ignored). But hunspell knows about this,
                            # if hunspell thinks it is a correct word,
                            # it must be counted as a match of course:
                            suggested_words[input_phrase] = 0
                        extra_suggestions = [
                            unicodedata.normalize(
                                itb_util.NORMALIZATION_FORM_INTERNAL, x)
                            for x in dictionary.enchant_dict.suggest(
                                input_phrase_nfc)
                        ]
                        suggested_words.update([
                            (suggestion, -1)
                            for suggestion in extra_suggestions
                            if suggestion not in suggested_words
                        ])
                elif dictionary.pyhunspell_object:
                    if len(input_phrase) >= 4:
                        # Always pass NFC to pyhunspell and convert
                        # the result back to the internal
                        # normalization form (NFD) (hunspell does the
                        # right thing for Korean if the input is NFC).
                        if dictionary.pyhunspell_object.spell(
                                input_phrase_nfc.encode(
                                    dictionary.encoding, 'replace')):
                            # This is a valid word in this dictionary.
                            # It might have been missed by the matching
                            # above because the dictionary might not
                            # contain all possible word forms (The
                            # prefix and suffix information has been
                            # ignored). But hunspell knows about this,
                            # if hunspell thinks it is a correct word,
                            # it must be counted as a match of course:
                            suggested_words[input_phrase] = 0
                        extra_suggestions = [
                            unicodedata.normalize(
                                itb_util.NORMALIZATION_FORM_INTERNAL, x)
                            for x in dictionary.pyhunspell_object.suggest(
                                input_phrase_nfc.encode(
                                    dictionary.encoding, 'replace'))
                        ]
                        suggested_words.update([
                            (suggestion, -1)
                            for suggestion in extra_suggestions
                            if suggestion not in suggested_words
                        ])
            else:
                if (dictionary.name[:2] not in ('ja', 'ja_JP', 'zh', 'zh_CN',
                                                'zh_TW', 'zh_MO', 'zh_SG')):
                    # For some languages, hunspell dictionaries don’t
                    # exist because hunspell makes no sense for these
                    # languages.  In these cases, just ignore that the
                    # hunspell dictionary is missing.  With the
                    # appropriate input method added, emoji can be
                    # matched nevertheless.
                    suggested_words.update([
                        ('☹ %(name)s dictionary not found. ' % {
                            'name': dictionary.name
                        } + 'Please install hunspell dictionary!', 0)
                    ])
        for word in suggested_words:
            if (suggested_words[word] == -1 and itb_util.remove_accents(word)
                    == itb_util.remove_accents(input_phrase)):
                # This spell checking correction is actually even
                # an accent insensitive match, adjust accordingly:
                suggested_words[word] = 0
        sorted_suggestions = sorted(
            suggested_words.items(),
            key=lambda x: (
                -x[1],  # 0: in dictionary, -1: hunspell
                len(x[0]),  # length of word ascending
                x[0],  # alphabetical
            ))[0:MAX_WORDS]
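        # For example (hypothetical entries): {'cold': 0, 'colour': 0,
        # 'corn': -1} sorts to [('cold', 0), ('colour', 0), ('corn', -1)]:
        # completions (score 0) come first, then shorter words, then
        # alphabetical order breaks ties.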
        self._suggest_cache[input_phrase] = sorted_suggestions
        return sorted_suggestions