Exemplo n.º 1
0
 def test_similarity(self):
     # similarity() is expected to return the same value as a direct call to
     # the metric function selected by its third argument.
     first, second = "night", "nacht"

     # Levenshtein-based similarity.
     direct = metrics.levenshtein_similarity(first, second)
     dispatched = metrics.similarity(first, second, metrics.LEVENSHTEIN)
     self.assertEqual(direct, dispatched)

     # Dice coefficient.
     direct = metrics.dice_coefficient(first, second)
     dispatched = metrics.similarity(first, second, metrics.DICE)
     self.assertEqual(direct, dispatched)
Exemplo n.º 2
0
 def test_similarity(self):
     # Check that similarity() agrees with the dedicated function of each
     # metric it can be asked to use, then report which API was exercised.
     word_a = "night"
     word_b = "nacht"

     expected_levenshtein = metrics.levenshtein_similarity(word_a, word_b)
     actual_levenshtein = metrics.similarity(word_a, word_b, metrics.LEVENSHTEIN)
     self.assertEqual(expected_levenshtein, actual_levenshtein)

     expected_dice = metrics.dice_coefficient(word_a, word_b)
     actual_dice = metrics.similarity(word_a, word_b, metrics.DICE)
     self.assertEqual(expected_dice, actual_dice)

     print("pattern.metrics.similarity()")
Exemplo n.º 3
0
 def fuzzySearch(self, result, query):
     """Return the part of ``result`` that follows the best fuzzy match of ``query``.

     A window of ``len(query)`` items is slid over ``result`` and each window
     is scored against ``query`` with ``similarity()`` (assumed to return a
     number where higher means more alike -- confirm against its definition).
     The first window with the highest score above 0 wins.

     Returns the slice of ``result`` after the winning window, or ``''`` when
     no window scores above 0, which includes the case where ``query`` is
     longer than ``result`` and no window fits at all.
     """
     width = len(query)  # loop-invariant window size
     best, best_i = 0, None
     for i in range(len(result) - width + 1):
         score = similarity(result[i:i + width], query)
         # Strict comparison keeps the earliest of equally good windows.
         if best < score:
             best = score
             best_i = i
     # Identity test against None: index 0 is a legitimate match position.
     return result[best_i + width:] if best_i is not None else ''
Exemplo n.º 4
0
 def fuzzySearch(self, result, query):
     """Locate ``query`` approximately in ``result`` and return what comes after it.

     Every ``len(query)``-sized slice of ``result`` is compared with ``query``
     through ``similarity()`` (presumably a numeric score that grows with
     likeness -- verify against its definition). The earliest slice with the
     top score greater than 0 is taken as the match.

     Returns ``result`` from the end of the matched slice onwards. Returns
     ``''`` if nothing scored above 0, or if ``query`` is longer than
     ``result`` so that no slice could be compared.
     """
     span = len(query)  # computed once instead of on every iteration
     best, best_i = 0, None
     for i in range(len(result) - span + 1):
         score = similarity(result[i:i + span], query)
         if best < score:  # strict: ties keep the earlier position
             best, best_i = score, i
     # ``is None`` rather than ``!= None``: position 0 is a valid match.
     if best_i is None:
         return ''
     return result[best_i + span:]
Exemplo n.º 5
0
 def joinSimilar(self, t1, t2):
     """Merge the weight of two near-duplicate entries of ``self._grams``.

     ``t1`` and ``t2`` are each unpacked as a ``(sequence, weight)`` pair; the
     sequence is joined with spaces for comparison and used as a key into
     ``self._grams`` (presumably a tuple of words -- confirm with the caller).

     When the joined strings score above .75 with ``similarity()`` without
     being a perfect 1, the entry with the smaller (or equal) weight is set
     to 0 and that weight is added to the other entry. Returns ``None``.
     """
     gram_a, weight_a = t1
     gram_b, weight_b = t2

     # A zero weight marks an entry that was already merged earlier.
     if weight_a == 0 or weight_b == 0:
         return

     score = similarity(' '.join(gram_a), ' '.join(gram_b))
     if not (score > .75 and score != 1):
         return

     # The heavier entry survives; on a tie the second one does.
     if weight_a > weight_b:
         absorbed, survivor, moved = gram_b, gram_a, weight_b
     else:
         absorbed, survivor, moved = gram_a, gram_b, weight_a
     self._grams[absorbed] = 0
     self._grams[survivor] += moved
Exemplo n.º 6
0
# 1) Suffix "-bie" and prefix "brie-" are almost identical and go together nicely.
# 2) Semantically, "briefing" refers to a one-sided kind of meeting,
#    where a meeting is a form of communication that many people find mindless;
#    whereas "zombies" are mindless and can't communicate.
#    There is a vaguely humorous connection between the two concepts.
#    How about: "the drill sergeant zombriefed the men" ?

# To simulate (2) algorithmically, we'd need lots of learning material.
# Let's see if we can simulate (1) with a few tricks.

from pattern.metrics import similarity

# The similarity() function computes (1 - the Levenshtein distance),
# presumably with the distance normalized by the length of the longer string:
# one edit separates "bie" from "brie" (4 characters), and 1 - 1/4 = 0.75.
# http://www.clips.ua.ac.be/pages/pattern-metrics#similarity
# The higher the number (0.0-1.0), the more similar two strings are.
print similarity("bie", "brie")  # 0.75

# So, given a word (e.g., "briefing"), we could look for a second word
# that we can glue to the left of it - if the prefix of the given word
# and the suffix of the second word are sufficiently similar
# (e.g., similarity >= 0.75) ...

from pattern.en import lexicon  # English {word: word type}-dictionary

w1 = "briefing"  # The given word; candidates are to be glued to its left.

# Walk over every word in the lexicon and discard unsuitable candidates.
for w2 in lexicon.keys():
    if w2[0].isupper():  # Exclude proper names like "Herbie".
        continue
    if len(w2) <= 3:  # Skip words of 3 letters or fewer ("zombie" is longer, OK).
        continue
Exemplo n.º 7
0
# 1) Suffix "-bie" and prefix "brie-" are almost identical and go together nicely.
# 2) Semantically, "briefing" refers to a one-sided kind of meeting,
#    where a meeting is a form of communication that many people find mindless;
#    whereas "zombies" are mindless and can't communicate.
#    There is a vaguely humorous connection between the two concepts.
#    How about: "the drill sergeant zombriefed the men" ?

# To simulate (2) algorithmically, we'd need lots of learning material.
# Let's see if we can simulate (1) with a few tricks.

from pattern.metrics import similarity

# The similarity() function computes (1 - the Levenshtein distance),
# presumably with the distance normalized by the length of the longer string:
# one edit separates "bie" from "brie" (4 characters), and 1 - 1/4 = 0.75.
# http://www.clips.ua.ac.be/pages/pattern-metrics#similarity
# The higher the number (0.0-1.0), the more similar two strings are.
print similarity("bie", "brie") # 0.75

# So, given a word (e.g., "briefing"), we could look for a second word
# that we can glue to the left of it - if the prefix of the given word
# and the suffix of the second word are sufficiently similar
# (e.g., similarity >= 0.75) ...

from pattern.en import lexicon # English {word: word type}-dictionary

w1 = "briefing" # The given word; candidates are to be glued to its left.

for w2 in lexicon.keys():
    if w2[0].isupper(): # Exclude proper names like "Herbie".
        continue
    if len(w2) <= 3:    # Length of "zombie" > 3, OK.
        continue