Exemplo n.º 1
0
    def __get_suggest(self, word, rating_limit, count):
        """Return up to ``count`` suggestion candidates for ``word``.

        Queries the Sphinx suggestion index with the trigrammed word, then
        filters the matches by keyword-rank delta and Jaro similarity.

        :param word: the (possibly misspelled) word to get suggestions for
        :param rating_limit: minimal Jaro similarity for a candidate to be kept
        :param count: maximum number of suggestions to return
        :return: list of ``[suggested_word, jaro_rating]`` pairs sorted by
            rating, highest first; ``[]`` when the index yields no matches
        """
        # Integer half-length: `len(word) / 2` produced a float string on
        # Python 3 (e.g. "3.5"), which is not a valid length parameter.
        word_len = str(len(word) // 2)
        trigrammed_word = '"{}"/1'.format(trigram(word))

        self.__configure(SphinxConfig.index_sugg, word_len)
        result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg)

        # No suggestions found for this word — return an empty list.
        if not result['matches']:
            return []

        # Matches appear rank-ordered; the first one carries the best rank.
        maxrank = result['matches'][0]['attrs']['krank']
        maxleven = None

        outlist = []
        for match in result['matches']:
            if len(outlist) >= count:
                break

            # Consider only matches whose rank is close enough to the best.
            if maxrank - match['attrs']['krank'] < self.default_rating_delta:
                jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
                # `is None` instead of truthiness: a legitimate 0.0 threshold
                # must not be silently recomputed on later iterations.
                if maxleven is None:
                    maxleven = jaro_rating - jaro_rating * self.regression_coef
                if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                    outlist.append([match['attrs']['word'], jaro_rating])

        outlist.sort(key=lambda x: x[1], reverse=True)

        return outlist
Exemplo n.º 2
0
    def __dbexport_sugg_dict(self):
        """Convert the suggestion dictionary to CSV and bulk-load it into the DB.

        Reads ``self.files['dict.txt']`` (one ``<keyword> <frequency>`` pair
        per line), writes ``<keyword>\\t<trigram>\\t<frequency>`` rows to a
        temporary CSV file and hands that file to ``self.aodp.bulk_csv`` for
        the AOTRIG table update.
        """
        logging.info("Place suggestion dict to DB %s...", self.files['dict.txt'])
        dict_dat_fname = os.path.abspath(Folders.temp + "/suggdict.csv")

        csv_counter = 0
        with open(self.files['dict.txt'], "r") as dict_file, open(dict_dat_fname, "w") as exit_file:
            # Iterate the file directly instead of the manual readline()
            # sentinel loop; iteration stops at EOF on its own.
            for line in dict_file:
                csv_counter += 1
                splitting_seq = line.split(' ')
                keyword = splitting_seq[0]
                freq = splitting_seq[1].rstrip('\n')
                assert keyword and freq, "Cannot process {}".format(self.files['dict.txt'])

                exit_file.write("\t".join([keyword, trigram(keyword), freq]) + "\n")
        # Both files are closed by the `with` block above — the former
        # redundant close() calls wrapped in a bare `except: pass` (which
        # silently swallowed any error) have been removed.

        self.aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname)
        logging.info("Done.")