Example #1
0
def gen_candidate(word):
  """
  Generate spelling-correction candidates for *word*, either via a
  Levenshtein automaton or via a bigram (2-gram) inverted index,
  ranked by language-model score plus weighted edit distance.
  """
  global matcher, use_automata, bindex, word_dict, bindex_thresh, lang, cand_topk_cur
  if use_automata:
    cands = list(automata.find_all_matches(word, 2, matcher))
  else:
    # Distinct bigrams of the '$'-padded word, kept only if indexed.
    padded = '$' + word + '$'
    grams = set(padded[i:i + 2] for i in range(len(padded) - 1))
    grams = [g for g in grams if g in bindex]
    # Count, per dictionary word of similar length, how many query bigrams hit it.
    counts = {}
    for gram in grams:
      for u in bindex[gram]:
        w = word_dict[u]
        if abs(len(w) - len(word)) <= 2:
          counts[w] = counts.get(w, 0) + 1
    # Jaccard-style overlap score; keep well-overlapping, close-enough words.
    cands = []
    for w, cnt in counts.iteritems():
      overlap = float(cnt) / float(len(w) + 1 + len(grams) - cnt)
      if overlap >= bindex_thresh and edit_distance(w, word) <= 2:
        cands.append(w)
    # Fall back to the word itself when nothing matched but it is known.
    if not cands and word in lang:
      cands.append(word)

  # Rank by language-model score plus edit-distance penalty, keep the top k.
  scored = [(c, edit_distance_plus(c, word)[1]) for c in cands]
  scored.sort(key=lambda p: lang[p[0]] + p[1])
  return [c for c, _ in scored[:cand_topk_cur]]
Example #2
0
File: warui.py  Project: Rayraegah/warui
def warui(lang, term, k=1):
    """Find bad words matching the input term.

    Args:
      lang: Language code such as en, es, fr, jp
      term: Input word
      k: Maximum edit distance
    Returns:
      Tuple of (number of matches, matched words, dictionary probes).
    """
    # Normalize to unicode (Python 2 source: str needs decoding).
    if not isinstance(term, unicode):
        term = term.decode('utf-8')

    matcher = Matcher(get_words(lang))
    hits = list(automata.find_all_matches(term, k, matcher))
    return (len(hits), hits, matcher.probes)
	def candidates_for(self, obs, N = 10):
		"""Return up to N correction candidates for the observed token *obs*.

		Proper nouns (when the dictionary is enabled) and already-valid words
		are returned as-is; otherwise candidates come from the Levenshtein
		automaton, ranked by emission probability, and the result is memoized
		per observation.
		"""
		if self.enable_dict and obs in self.dm.proper_noun_list:
			return [obs]

		if obs in self.d: # don't need to find candidates if valid word already
			return [obs]
		# BUG FIX: 'obs in self.memoize_can.keys()' materialized the key list
		# and scanned it (O(n) per call in Python 2); membership on the dict
		# itself is O(1) and behaves identically.
		if obs in self.memoize_can:
			return self.memoize_can[obs]

		candidates = list(automata.find_all_matches(obs, self.k, self.m))
		if len(candidates) == 0:
			# No candidate found: memoize the observation itself.
			self.memoize_can[obs] = [obs]
		else:
			can_prob = [(c, self.emission_prob(obs, c)) for c in candidates]
			if self.enable_dict: can_prob.append((obs, self.emission_prob(obs, obs)))
			can_prob.sort(key = lambda x: x[1], reverse = True) # rank candidates by higher probability
			self.memoize_can[obs] = [can_prob[i][0] for i in range(min(N, len(can_prob)))]

		return self.memoize_can[obs]
 def test_slice(self):
     """Expected match counts for 'slice' at edit distances 1 and 2."""
     matcher = Matcher(words)
     matches_d1 = list(automata.find_all_matches('slice', 1, matcher))
     self.assertEqual(len(matches_d1), 15)
     matches_d2 = list(automata.find_all_matches('slice', 2, matcher))
     self.assertEqual(len(matches_d2), 213)
 def test_hello(self):
     """Expected match counts for 'hello' at edit distances 1 and 2."""
     matcher = Matcher(words)
     matches_d1 = list(automata.find_all_matches('hello', 1, matcher))
     self.assertEqual(len(matches_d1), 12)
     matches_d2 = list(automata.find_all_matches('hello', 2, matcher))
     self.assertEqual(len(matches_d2), 128)
 def test_food(self):
     """Expected match counts for 'food' at edit distances 1 and 2."""
     matcher = Matcher(words)
     matches_d1 = list(automata.find_all_matches('food', 1, matcher))
     self.assertEqual(len(matches_d1), 18)
     matches_d2 = list(automata.find_all_matches('food', 2, matcher))
     self.assertEqual(len(matches_d2), 318)
Example #7
0
import time
start_time = time.time()

# Tokenize '../text/3.txt' into one flat, sorted list of lowercase
# UTF-8-decoded tokens (str.decode marks this as Python 2 source).
words = [x.strip().lower().decode('utf-8').split() for x in open('../text/3.txt')]
words = [item for sublist in words for item in sublist]
words.sort()

# words10 = [x for x in words if random.random() <= 0.1]
# words100 = [x for x in words if random.random() <= 0.01]

# m = Matcher(words)
# print m.probes

# Report every word within edit distance 2 of 'test', the number of
# dictionary probes used, and the wall-clock time for the whole run.
m = Matcher(words)
print list(automata.find_all_matches('test', 2, m))
print m.probes

print("--- %s seconds ---" % (time.time() - start_time))


# def levenshtein(s1, s2):
#   if len(s1) < len(s2):
#     return levenshtein(s2, s1)
#   if not s1:
#     return len(s2)

#   previous_row = xrange(len(s2) + 1)
#   for i, c1 in enumerate(s1):
#     current_row = [i + 1]
#     for j, c2 in enumerate(s2):
        self.probes = 0

    def __call__(self, w):
        """Probe the sorted word list: the first word >= w, or None past the end."""
        self.probes += 1
        idx = bisect.bisect_left(self.l, w)
        return self.l[idx] if idx < len(self.l) else None


f = open('known.json')
known = json.loads(f.read())

words = [x.strip().lower() for x in open('wordsEn.txt')]
m = Matcher(words)

results = []
for k, v in known.iteritems():
    results.append(list(automata.find_all_matches(k, v, m)))

numresults = len(results)

common_set = set(results[0])

for i in range(numresults):
    common_set.intersection_update(set(results[i]))

print sorted(common_set)
print len(common_set)
Example #9
0
    def __call__(self, w):
        # Count every dictionary probe so callers can report lookup cost.
        self.probes += 1
        # Binary search: index of the first word >= w in the sorted list self.l.
        pos = bisect.bisect_left(self.l, w)
        if pos < len(self.l):
            return self.l[pos]
        else:
            # w sorts after every dictionary word - no successor to return.
            return None


# Lowercase, UTF-8-decoded, sorted copy of the BSD 'web2' word list
# (str.decode marks this as Python 2 source).
words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/web2')]
words.sort()
# words10 = [x for x in words if random.random() <= 0.1]
# words100 = [x for x in words if random.random() <= 0.01]

# Distance-1 matches for 'food'; the expected count (18) pins the
# automaton's behavior against this fixed dictionary.
m = Matcher(words)
matches = list(automata.find_all_matches('food', 1, m))
assert len(matches) == 18
print m.probes

# Fresh matcher so the probe count covers only the distance-2 query.
m = Matcher(words)
matches = list(automata.find_all_matches('food', 2, m))
assert len(matches) == 288
print m.probes


def levenshtein(s1, s2):
    """Recursive Levenshtein edit distance (body truncated in this excerpt)."""
    # Swap so s1 is the longer string; the base case then only checks s1.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1:
        return len(s2)
Example #10
0
        pos = bisect.bisect_left(self.l, w)
        if pos < len(self.l):
            return self.l[pos]
        else:
            return None


# Lowercase, UTF-8-decoded, sorted system dictionary, plus 10% / 1%
# random subsamples (Python 2 source: str.decode, print statements).
words = [x.strip().lower().decode('utf-8')
         for x in open('/usr/share/dict/words')]
words.sort()
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]


# Distance-1 matches for the misspelling 'startinq', plus the probe count.
m = Matcher(words)
res = list(automata.find_all_matches('startinq', 1, m))
print res
print len(res)
print m.probes

# Disabled variant of the same experiment at edit distance 2.
"""
m = Matcher(words)
res = list(automata.find_all_matches('food', 2, m))
print res
print len(res)
print m.probes
"""


def levenshtein(s1, s2):
    if len(s1) < len(s2):
Example #11
0
        else:
            return None


# Lowercase, UTF-8-decoded, sorted system dictionary, plus 10% / 1%
# random subsamples (Python 2 source: str.decode, print statements).
words = [
    x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/words')
]
print 'l', len(words)
#words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/web2')]
words.sort()
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]

m = Matcher(words)
from time import time
print len(list(automata.find_all_matches('food', 1, m)))
st = time()
print(list(automata.find_all_matches('mosco', 2, m)))
print 'st', time() - st
print m.probes
# NOTE(review): the same matcher is reused above, so this probe count
# accumulates probes from all of the earlier queries as well.
assert len(list(automata.find_all_matches('food', 1, m))) == 18
print m.probes

# Fresh matcher: probes below cover only the distance-2 query.
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 2, m))) == 283
print m.probes


def levenshtein(s1, s2):
    """Recursive Levenshtein edit distance (definition truncated in this excerpt)."""
    # Swap so s1 is the longer string.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
Example #12
0
# NOTE(review): relies on 'time' being imported earlier in the full file.
start_time = time.time()

# Tokenize '../text/3.txt' into one flat, sorted list of lowercase
# UTF-8-decoded tokens (str.decode marks this as Python 2 source).
words = [
    x.strip().lower().decode('utf-8').split() for x in open('../text/3.txt')
]
words = [item for sublist in words for item in sublist]
words.sort()

# words10 = [x for x in words if random.random() <= 0.1]
# words100 = [x for x in words if random.random() <= 0.01]

# m = Matcher(words)
# print m.probes

# Report every word within edit distance 2 of 'test', the number of
# dictionary probes used, and the wall-clock time for the whole run.
m = Matcher(words)
print list(automata.find_all_matches('test', 2, m))
print m.probes

print("--- %s seconds ---" % (time.time() - start_time))

# def levenshtein(s1, s2):
#   if len(s1) < len(s2):
#     return levenshtein(s2, s1)
#   if not s1:
#     return len(s2)

#   previous_row = xrange(len(s2) + 1)
#   for i, c1 in enumerate(s1):
#     current_row = [i + 1]
#     for j, c2 in enumerate(s2):
#       insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
Example #13
0
    def __call__(self, w):
        """Probe the sorted word list: return the first word >= w, else None."""
        self.probes += 1
        pos = bisect.bisect_left(self.l, w)
        if pos < len(self.l):
            return self.l[pos]
        else:
            return None


# Smoke test against a small local word list; the expected match counts
# (2 and 3) are fixed for the contents of 'test.txt'.
words = [x.strip().lower() for x in open('test.txt')]
words.sort()
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]

m = Matcher(words)
assert len(list(automata.find_all_matches('food', 1, m))) == 2
print(m.probes)
# m.probes

# Fresh matcher for the distance-2 query.
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 2, m))) == 3
# print
# m.probes

print(list(automata.find_all_matches('food', 4, m)))


def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if not s1: