import logging
import os
import random

import tokenizer


def process_text(text, models):
    # Score `text` against every (model, label) pair and return the
    # labels ordered by descending score.
    results = [0] * len(models)
    lastwords = []
    lastnormalized = []
    for kind, data in tokenizer.tokenize(text):
        if kind != tokenizer.TOKEN_WORD:
            continue
        lastwords.append(data)
        lastnormalized.append(tokenizer.normalize(data))
        # Keep only as much context as the longest n-gram needs.
        lastwords = lastwords[-WORDGRAMS_SIZE:]
        lastnormalized = lastnormalized[-NORMALIZEDGRAMS_SIZE:]
        for i, (model, _label) in enumerate(models):
            word_prob, normalized_prob = count_probabilities(
                lastwords, lastnormalized, model)
            results[i] += (WORD_SHARE * word_prob
                           + NORMALIZED_SHARE * normalized_prob)
    return tuple(sorted(
        ((score, models[i][1]) for i, score in enumerate(results)),
        reverse=True))
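
# Usage sketch for process_text. The file names, labels, and the
# load_model helper below are illustrative assumptions; models is
# expected to be a sequence of (model, label) pairs.
#
#     models = [(load_model('english.dat'), 'english'),
#               (load_model('polish.dat'), 'polish')]
#     for score, label in process_text('To be or not to be.', models):
#         print('{}: {:.6f}'.format(label, score))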

def _phrase_search(user, query):
    # Return ids of documents containing the query words as a
    # consecutive phrase.
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('phrase_search: query: %s', query)
    logging.info('n: %s', n)
    logging.info('keywords: %s', keywords)
    if not keywords:
        return []
    logging.info('%d - %s', 0, keywords[0])
    results = _lookup(user, keywords[0])
    if not results:
        return []
    logging.info('%s', results)
    for i in range(1, len(keywords)):
        logging.info('%d - %s', i, keywords[i])
        id_pos_dict = _lookup(user, keywords[i])
        logging.info('%s', id_pos_dict)
        if not id_pos_dict:
            return []
        # Iterate over a copy: entries may be deleted along the way.
        for doc_id in list(results.keys()):
            if doc_id not in id_pos_dict:
                del results[doc_id]
                continue
            # Keep only positions that directly follow a match of the
            # previous keyword.
            poses = [pos for pos in id_pos_dict[doc_id]
                     if pos - 1 in results[doc_id]]
            if not poses:
                del results[doc_id]
            else:
                results[doc_id] = poses
    return list(results.keys())
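
# Self-contained illustration of the positional intersection that
# _phrase_search performs, on a hand-made postings map (doc id ->
# positions). The data here is made up; _lookup would normally supply
# the per-keyword postings from the user's index.
def _demo_phrase_intersection():
    quick = {1: [0, 7], 2: [3]}        # postings for 'quick'
    fox = {1: [1, 8], 2: [9]}          # postings for 'fox'
    results = dict(quick)
    for doc_id in list(results.keys()):
        poses = [pos for pos in fox.get(doc_id, [])
                 if pos - 1 in results[doc_id]]
        if poses:
            results[doc_id] = poses    # 'quick fox' continues here
        else:
            del results[doc_id]
    return list(results.keys())        # -> [1]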

def calculate_unknown(tokens, model):
    # Estimate the share of unknown words: hold out roughly one
    # sentence in BETA as a test sample and check how many of its
    # words never occur in the remaining (training) sentences.
    word2id = model['word2id']
    known_words = set()
    unknown_words = []
    unknown_count = 0
    lastwords = []
    for kind, word in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            lastwords.append(word2id[normalized])
        elif kind == tokenizer.TOKEN_END_OF_SENTENCE:
            if random.randint(1, BETA) == 1:
                unknown_words.extend(lastwords)
            else:
                known_words.update(lastwords)
            lastwords = []
    for word in unknown_words:
        if word not in known_words:
            unknown_count += 1
    # Guard against an empty sample (possible for very short inputs).
    if unknown_words:
        model['unknown'] = unknown_count / len(unknown_words) / DELTA
    else:
        model['unknown'] = 0.0
    print('# Unknown: {:.9f}'.format(model['unknown']))

def possible_replacements(word, dictionary):
    # Collect dictionary words whose normalized form lies within a
    # small edit distance of `word`; longer words may differ more.
    replacements = {word}
    normalized = tokenizer.normalize(word)
    distance = 0
    if len(word) >= 5:
        distance = 3
    elif len(word) >= 4:
        distance = 2
    elif len(word) >= 2:
        distance = 1
    for change, dist in generate_typos(normalized, distance):
        # Cap the accepted distance by candidate length, so short
        # candidates are not reached through aggressive edits.
        if len(change) <= 2 and dist > 0:
            continue
        elif 3 <= len(change) <= 5 and dist > 1:
            continue
        elif 6 <= len(change) <= 8 and dist > 2:
            continue
        elif len(change) > 8 and dist > 3:
            continue
        if change in dictionary:
            replacements.update(dictionary[change])
    return list(replacements)

def possible_replacements(word, dictionary):
    # Simpler variant: consider only candidates within edit distance 1.
    replacements = {word}
    normalized = tokenizer.normalize(word)
    for change, dist in generate_typos(normalized, 1):
        if len(change) <= 2 and dist > 0:
            continue
        if change in dictionary:
            replacements.update(dictionary[change])
    return list(replacements)
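
# generate_typos is not shown in this file. From its use above it is
# assumed to yield (candidate, distance) pairs for every string within
# the requested edit distance of the input; a minimal sketch under
# that assumption (the function name and alphabet are illustrative):
def generate_typos_sketch(word, max_distance,
                          alphabet='abcdefghijklmnopqrstuvwxyz'):
    seen = {word}
    frontier = [word]
    yield word, 0
    for dist in range(1, max_distance + 1):
        next_frontier = []
        for w in frontier:
            edits = (
                # deletions
                [w[:i] + w[i + 1:] for i in range(len(w))] +
                # insertions
                [w[:i] + c + w[i:]
                 for i in range(len(w) + 1) for c in alphabet] +
                # substitutions
                [w[:i] + c + w[i + 1:]
                 for i in range(len(w)) for c in alphabet])
            for e in edits:
                if e not in seen:
                    seen.add(e)
                    next_frontier.append(e)
                    yield e, dist
        frontier = next_frontier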

def gather_normalizedgrams(tokens, model):
    # Count n-grams of normalized word ids, one counter dict per
    # n-gram length, up to NORMALIZEDGRAMS_SIZE.
    word2id = model['word2id']
    normalizedgrams = model['normalizedgrams']
    lastnormalized = []
    for kind, word in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            lastnormalized.append(word2id[normalized])
            # Slide the context window over the last few words.
            lastnormalized = lastnormalized[-NORMALIZEDGRAMS_SIZE:]
            gather_grams(lastnormalized, normalizedgrams)
    model['normalized_count'] = len(normalizedgrams[0])
    print('# Normalizedgrams: {}'.format(
        tuple(len(grams) for grams in normalizedgrams)))
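
# gather_grams is defined elsewhere. Judging from its use above, it is
# assumed to bump a counter for every suffix of the context window in
# the per-length dicts; a minimal sketch under that assumption:
def gather_grams_sketch(context, grams):
    # grams[n - 1] maps an n-tuple of word ids to its occurrence count.
    for n in range(1, len(context) + 1):
        gram = tuple(context[-n:])
        grams[n - 1][gram] = grams[n - 1].get(gram, 0) + 1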

def generate_dictionary(input_filename, output_filename=None):
    # Build a normalized-form -> [original words] dictionary from a
    # plain word list (one word per line) and save it next to the
    # input as a .dat file unless another name is given.
    if output_filename is None:
        input_file, input_ext = os.path.splitext(input_filename)
        output_filename = input_file + '.dat'
    dictionary = {}
    with open(input_filename, 'r') as _file:
        for word in _file:
            word = word.strip()
            if not word:
                continue  # skip blank lines
            normalized = tokenizer.normalize(word)
            dictionary.setdefault(normalized, []).append(word)
    save_dictionary(dictionary, output_filename)
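
# save_dictionary is defined elsewhere; one plausible implementation,
# assuming a pickle-based on-disk format (an assumption, not the
# project's actual format):
import pickle

def save_dictionary_sketch(dictionary, filename):
    with open(filename, 'wb') as _file:
        pickle.dump(dictionary, _file)

# Example: generate_dictionary('words.txt') would then write 'words.dat'.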

def make_dictionary(tokens, model):
    word2id = model['word2id']
    id2word = model['id2word']
    for kind, word in tokens:
        if kind == tokenizer.TOKEN_WORD:
            normalized = tokenizer.normalize(word)
            if word not in word2id:
                word2id[word] = len(id2word)
                id2word.append(word)
            if normalized not in word2id:
                word2id[normalized] = len(id2word)
                id2word.append(normalized)
            model['words_sum'] += 1
    print('# Words: {:d}'.format(len(id2word)))

def _and_search(user, query):
    # Return ids of documents containing every keyword, regardless of
    # position.
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('and_search: query: %s', query)
    logging.info('n: %s', n)
    logging.info('keywords: %s', keywords)
    if not keywords:
        return []
    results = _lookup(user, keywords[0])
    if not results:
        return []
    for i in range(1, len(keywords)):
        id_pos_dict = _lookup(user, keywords[i])
        if not id_pos_dict:
            return []
        # Iterate over a copy: entries may be deleted along the way.
        for doc_id in list(results.keys()):
            if doc_id not in id_pos_dict:
                del results[doc_id]
    return list(results.keys())

def add_page_to_index(index, url, content):
    keywords = tokenize(normalize(content))
    for pos, keyword in enumerate(keywords):
        add_to_index(index, keyword, url, pos)
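
# add_to_index is not part of this snippet; a minimal sketch of the
# positional index it is assumed to maintain (keyword -> url ->
# positions), matching how the lookup results are consumed above:
def add_to_index_sketch(index, keyword, url, pos):
    index.setdefault(keyword, {}).setdefault(url, []).append(pos)

# Example:
#   index = {}
#   add_page_to_index(index, 'http://example.com', 'the quick fox')
#   index -> {'the': {'http://example.com': [0]}, ...}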