def gen_candidate(word):
    """Return the top-ranked correction candidates for *word*.

    Candidates come either from automata.find_all_matches (when
    use_automata is set) or from the character-bigram (k-gram) index
    `bindex`; in both cases they are limited to edit distance 2, then
    ranked by language score plus edit cost and truncated to
    cand_topk_cur entries.
    """
    global matcher, use_automata, bindex, word_dict, bindex_thresh, lang, cand_topk_cur

    if use_automata:
        cands = list(automata.find_all_matches(word, 2, matcher))
    else:
        # Bigrams of the word padded with a '$' sentinel at both ends.
        padded = '$' + word + '$'
        grams = set(a + b for a, b in itertools.izip(padded, padded[1:]))
        # Only bigrams that actually occur in the index matter.
        grams = [g for g in grams if g in bindex]

        # Count, per indexed word, how many query bigrams it shares with us;
        # words whose length differs by more than 2 can never be within
        # edit distance 2, so they are skipped up front.
        shared = {}
        for g in grams:
            for wid in bindex[g]:
                cand = word_dict[wid]
                if abs(len(cand) - len(word)) <= 2:
                    shared[cand] = shared.get(cand, 0) + 1

        cands = []
        for cand, cnt in shared.iteritems():
            # Jaccard-style overlap between the two bigram multisets.
            overlap = float(cnt) / float(len(cand) + 1 + len(grams) - cnt)
            if overlap >= bindex_thresh and edit_distance(cand, word) <= 2:
                cands.append(cand)

    # With no candidates, keep the word itself when it is a known word.
    if not cands and word in lang:
        cands.append(word)

    # Rank by language score plus edit cost, keep the best cand_topk_cur.
    scored = [(c, edit_distance_plus(c, word)[1]) for c in cands]
    scored.sort(key=lambda pair: lang[pair[0]] + pair[1])
    return [c for c, _ in scored[:cand_topk_cur]]
def warui(lang, term, k=1):
    """Find bad words that match the input term.

    Args:
        lang: Language code such as en, es, fr, jp
        term: Input word
        k: Maximum edit distance

    Returns:
        Tuple of (number of matches, matched words, dictionary probes).
    """
    if not isinstance(term, unicode):
        term = term.decode('utf-8')
    probe = Matcher(get_words(lang))
    found = list(automata.find_all_matches(term, k, probe))
    return (len(found), found, probe.probes)
def candidates_for(self, obs, N = 10):
    """Return up to *N* correction candidates for the observed token *obs*.

    Order of checks: known proper nouns (when the dictionary is enabled)
    and words already in self.d are returned as-is; previously computed
    results come from the memo cache; otherwise candidates are generated
    with the automaton, ranked by emission probability, and memoized.

    Args:
        obs: observed (possibly misspelled) token.
        N: maximum number of candidates to return (default 10).

    Returns:
        List of candidate words, best first.
    """
    if self.enable_dict and obs in self.dm.proper_noun_list:
        return [obs]
    if obs in self.d:
        # Already a valid word — no correction needed.
        return [obs]
    elif obs in self.memoize_can:
        # Fix: was `obs in self.memoize_can.keys()`, which in Python 2
        # builds a list and scans it O(n); plain dict membership is O(1).
        return self.memoize_can[obs]
    else:
        candidates = list(automata.find_all_matches(obs, self.k, self.m))
        if len(candidates) == 0:
            # No candidate within distance self.k: keep the token itself.
            self.memoize_can[obs] = [obs]
        else:
            can_prob = [(c, self.emission_prob(obs, c)) for c in candidates]
            if self.enable_dict:
                # The observation itself also competes as a candidate.
                can_prob.append((obs, self.emission_prob(obs, obs)))
            # Rank candidates by higher emission probability.
            can_prob.sort(key = lambda x: x[1], reverse = True)
            self.memoize_can[obs] = [can_prob[i][0] for i in range(min(N, len(can_prob)))]
        return self.memoize_can[obs]
def test_slice(self):
    """'slice' yields 15 matches at edit distance 1 and 213 at distance 2."""
    probe = Matcher(words)
    within_one = list(automata.find_all_matches('slice', 1, probe))
    self.assertEqual(len(within_one), 15)
    within_two = list(automata.find_all_matches('slice', 2, probe))
    self.assertEqual(len(within_two), 213)
def test_hello(self):
    """'hello' yields 12 matches at edit distance 1 and 128 at distance 2."""
    probe = Matcher(words)
    within_one = list(automata.find_all_matches('hello', 1, probe))
    self.assertEqual(len(within_one), 12)
    within_two = list(automata.find_all_matches('hello', 2, probe))
    self.assertEqual(len(within_two), 128)
def test_food(self):
    """'food' yields 18 matches at edit distance 1 and 318 at distance 2."""
    probe = Matcher(words)
    within_one = list(automata.find_all_matches('food', 1, probe))
    self.assertEqual(len(within_one), 18)
    within_two = list(automata.find_all_matches('food', 2, probe))
    self.assertEqual(len(within_two), 318)
import time start_time = time.time() words = [x.strip().lower().decode('utf-8').split() for x in open('../text/3.txt')] words = [item for sublist in words for item in sublist] words.sort() # words10 = [x for x in words if random.random() <= 0.1] # words100 = [x for x in words if random.random() <= 0.01] # m = Matcher(words) # print m.probes m = Matcher(words) print list(automata.find_all_matches('test', 2, m)) print m.probes print("--- %s seconds ---" % (time.time() - start_time)) # def levenshtein(s1, s2): # if len(s1) < len(s2): # return levenshtein(s2, s1) # if not s1: # return len(s2) # previous_row = xrange(len(s2) + 1) # for i, c1 in enumerate(s1): # current_row = [i + 1] # for j, c2 in enumerate(s2):
# NOTE(review): fragment — the enclosing class header (presumably a
# bisect-based dictionary Matcher) starts before this chunk;
# `self.probes = 0` looks like the tail of its __init__. TODO confirm.
        self.probes = 0

    def __call__(self, w):
        # Count every dictionary probe, then return the first stored word
        # that sorts >= w (bisect over the sorted list self.l), or None
        # when w sorts past the end of the list.
        self.probes += 1
        pos = bisect.bisect_left(self.l, w)
        if pos < len(self.l):
            return self.l[pos]
        else:
            return None

# Script: intersect the match sets of all known (word, max-distance) pairs.
f = open('known.json')
known = json.loads(f.read())
words = [x.strip().lower() for x in open('wordsEn.txt')]
m = Matcher(words)
results = []
# known maps each query word to its own maximum edit distance.
for k, v in known.iteritems():
    results.append(list(automata.find_all_matches(k, v, m)))
numresults = len(results)
common_set = set(results[0])
for i in range(numresults):
    common_set.intersection_update(set(results[i]))
print sorted(common_set)
print len(common_set)
# NOTE(review): `__call__` is a method of a Matcher-style class whose header
# lies outside this chunk; shown un-indented here.
def __call__(self, w):
    # Count the probe, then return the first stored word that sorts >= w
    # (bisect over the sorted list self.l), or None past the end.
    self.probes += 1
    pos = bisect.bisect_left(self.l, w)
    if pos < len(self.l):
        return self.l[pos]
    else:
        return None

# Script: sanity-check match counts against the web2 system dictionary.
words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/web2')]
words.sort()
# words10 = [x for x in words if random.random() <= 0.1]
# words100 = [x for x in words if random.random() <= 0.01]
m = Matcher(words)
matches = list(automata.find_all_matches('food', 1, m))
assert len(matches) == 18
print m.probes
m = Matcher(words)
matches = list(automata.find_all_matches('food', 2, m))
assert len(matches) == 288
print m.probes

def levenshtein(s1, s2):
    # NOTE(review): truncated at the chunk boundary — the DP body of this
    # reference edit-distance implementation continues past this chunk.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1:
        return len(s2)
# NOTE(review): fragment — these first lines are the tail of a Matcher-style
# __call__ whose def line lies outside this chunk: probe the sorted list
# self.l and return the first word >= w, or None past the end.
        pos = bisect.bisect_left(self.l, w)
        if pos < len(self.l):
            return self.l[pos]
        else:
            return None

# Script: query 'startinq' at edit distance 1 against the system dictionary.
words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/words')]
words.sort()
# Random 10% / 1% dictionary samples (built but unused below).
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]
m = Matcher(words)
res = list(automata.find_all_matches('startinq', 1, m))
print res
print len(res)
print m.probes
# Disabled second run, kept as a bare string literal in the original.
"""
m = Matcher(words)
res = list(automata.find_all_matches('food', 2, m))
print res
print len(res)
print m.probes
"""

def levenshtein(s1, s2):
    # NOTE(review): truncated at the chunk boundary — body continues past
    # this chunk.
    if len(s1) < len(s2):
# NOTE(review): fragment — this dangling else belongs to a Matcher-style
# __call__ whose earlier lines lie outside this chunk (returns None when
# the probed word sorts past the end of the list).
        else:
            return None

# Script: timing run ('mosco' at distance 2) plus match-count sanity checks.
words = [
    x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/words')
]
print 'l', len(words)
#words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/web2')]
words.sort()
# Random 10% / 1% dictionary samples (built but unused below).
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]
m = Matcher(words)
from time import time
print len(list(automata.find_all_matches('food', 1, m)))
st = time()
print(list(automata.find_all_matches('mosco', 2, m)))
print 'st', time() - st
print m.probes
assert len(list(automata.find_all_matches('food', 1, m))) == 18
print m.probes
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 2, m))) == 283
print m.probes

def levenshtein(s1, s2):
    # NOTE(review): truncated at the chunk boundary — body continues past
    # this chunk.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
start_time = time.time() words = [ x.strip().lower().decode('utf-8').split() for x in open('../text/3.txt') ] words = [item for sublist in words for item in sublist] words.sort() # words10 = [x for x in words if random.random() <= 0.1] # words100 = [x for x in words if random.random() <= 0.01] # m = Matcher(words) # print m.probes m = Matcher(words) print list(automata.find_all_matches('test', 2, m)) print m.probes print("--- %s seconds ---" % (time.time() - start_time)) # def levenshtein(s1, s2): # if len(s1) < len(s2): # return levenshtein(s2, s1) # if not s1: # return len(s2) # previous_row = xrange(len(s2) + 1) # for i, c1 in enumerate(s1): # current_row = [i + 1] # for j, c2 in enumerate(s2): # insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
# NOTE(review): `__call__` is a method of a Matcher-style class whose header
# lies outside this chunk; shown un-indented here.
def __call__(self, w):
    # Count the probe, then return the first stored word that sorts >= w
    # (bisect over the sorted list self.l), or None past the end.
    self.probes += 1
    pos = bisect.bisect_left(self.l, w)
    if pos < len(self.l):
        return self.l[pos]
    else:
        return None

# Script: match-count sanity checks against a small local test dictionary.
words = [x.strip().lower() for x in open('test.txt')]
words.sort()
# Random 10% / 1% dictionary samples (built but unused below).
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 1, m))) == 2
print(m.probes)
# m.probes
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 2, m))) == 3
# print
# m.probes
print(list(automata.find_all_matches('food', 4, m)))

def levenshtein(s1, s2):
    # NOTE(review): truncated at the chunk boundary — body continues past
    # this chunk.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1: