def __get_suggest(self, word, rating_limit, count):
    """Return up to *count* spelling suggestions for *word*.

    Queries the Sphinx suggestion index with the trigram form of *word*,
    then filters candidates by keyword rank and Jaro similarity.

    :param word: the misspelled word to find suggestions for
    :param rating_limit: minimum Jaro similarity a candidate must reach
    :param count: maximum number of suggestions to return
    :return: list of ``[suggested_word, jaro_rating]`` pairs, sorted by
        rating in descending order; ``[]`` when the index has no matches
    """
    # Integer division: the configured length must be a whole number.
    # (Plain ``/`` would yield a float string like "2.5" on Python 3.)
    word_len = str(len(word) // 2)
    trigrammed_word = '"{}"/1'.format(trigram(word))

    self.__configure(SphinxConfig.index_sugg, word_len)
    result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg)

    # No suggestions found for this word (can that even happen?) — return [].
    if not result['matches']:
        return []

    maxrank = result['matches'][0]['attrs']['krank']
    maxleven = None  # Jaro threshold, derived lazily from the first candidate
    outlist = list()

    for match in result['matches']:
        if len(outlist) >= count:
            break
        attrs = match['attrs']
        # Only consider candidates whose rank is close enough to the best one.
        if maxrank - attrs['krank'] < self.default_rating_delta:
            jaro_rating = Levenshtein.jaro(word, attrs['word'])
            # ``is None`` (not truthiness): a computed threshold of 0.0 is valid.
            if maxleven is None:
                maxleven = jaro_rating - jaro_rating * self.regression_coef
            if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                outlist.append([attrs['word'], jaro_rating])

    outlist.sort(key=lambda x: x[1], reverse=True)
    return outlist
def __dbexport_sugg_dict(self):
    """Export the suggestion dictionary into the AOTRIG DB table.

    Reads ``self.files['dict.txt']`` (lines of ``"<keyword> <frequency>"``),
    writes a tab-separated temp file of ``keyword, trigram(keyword), freq``
    rows, then bulk-loads it via ``self.aodp.bulk_csv``.

    :raises AssertionError: when a dictionary line has an empty keyword
        or frequency field
    """
    logging.info("Place suggestion dict to DB %s...", self.files['dict.txt'])
    dict_dat_fname = os.path.abspath(Folders.temp + "/suggdict.csv")
    csv_counter = 0

    # ``with`` closes both files even on error; the explicit close() calls
    # guarded by a bare ``except: pass`` in the old version were redundant.
    with open(self.files['dict.txt'], "r") as dict_file, \
            open(dict_dat_fname, "w") as exit_file:
        # Iterating the file object reads line-by-line until EOF,
        # replacing the manual readline()/sentinel loop.
        for line in dict_file:
            csv_counter += 1
            splitting_seq = line.split(' ')
            keyword = splitting_seq[0]
            freq = splitting_seq[1].rstrip('\n')
            assert keyword and freq, "Cannot process {}".format(self.files['dict.txt'])
            exit_file.write("\t".join((keyword, trigram(keyword), freq)) + "\n")

    self.aodp.bulk_csv(AoXmlTableEntry.OperationType.update, "AOTRIG", csv_counter, dict_dat_fname)
    logging.info("Done.")