示例#1
0
    def test_key_distance(self):
        """Check _key_distance for several key pairs and for an unknown key."""
        wed = WeightedEditDistance()
        known_pairs = [
            ('q', 'q', 0),
            ('q', 'w', 1),
            ('u', 'n', 2),
            ('q', 'p', 9),
        ]
        for first, second, expected in known_pairs:
            self.assertEqual(wed._key_distance(first, second), expected)

        # '~' is absent from the key graph, so the lookup is expected to
        # fall back to the default maximum used for unknown characters.
        self.assertEqual(wed._key_distance('h', '~'), wed._MAX_DISTANCE)
	def test_key_distance(self):
		"""Verify key-to-key distances, including the unknown-key fallback."""
		distance_of = WeightedEditDistance()
		expectations = (
			(('q', 'q'), 0),
			(('q', 'w'), 1),
			(('u', 'n'), 2),
			(('q', 'p'), 9),
		)
		for keys, wanted in expectations:
			self.assertEqual(distance_of._key_distance(*keys), wanted)

		# '~' is not part of the key graph; unknown characters should be
		# reported at the default maximum distance.
		fallback = distance_of._MAX_DISTANCE
		self.assertEqual(distance_of._key_distance('h', '~'), fallback)
示例#3
0
    def test_edit_distance(self):
        """Check edit_distance on word pairs, asserting both directions."""
        wed = WeightedEditDistance()
        cases = [
            ('led', 'bed', 4),
            ('a', 'ab', 10),
            # The two 'p's in 'apple' count as insertions (20 in total),
            # 'l' -> 'd' costs 6 and 'e' -> 'd' costs 1: 20 + 6 + 1 = 27.
            ('apple', 'add', 27),
        ]
        for left, right, expected in cases:
            # Every expectation must hold regardless of argument order.
            self.assertEqual(wed.edit_distance(left, right), expected)
            self.assertEqual(wed.edit_distance(right, left), expected)
	def test_edit_distance(self):
		"""Verify word-to-word edit distances in both argument orders."""
		measure = WeightedEditDistance().edit_distance
		expectations = (
			(('led', 'bed'), 4),
			(('a', 'ab'), 10),
			# 'apple' vs 'add': the two 'p's are treated as insertions
			# (20), 'l' vs 'd' contributes 6 and 'e' vs 'd' contributes 1,
			# so the total is 20 + 6 + 1 = 27.
			(('apple', 'add'), 27),
		)
		for (source, target), wanted in expectations:
			self.assertEqual(measure(source, target), wanted)
			self.assertEqual(measure(target, source), wanted)
示例#5
0
 def test_neighbors_consistant(self):
     """Every neighbor relationship must also be recorded in reverse."""
     # The neighbor map stands in for a bidirectional graph, so each key
     # has to appear among the neighbors of every key it lists.
     graph = WeightedEditDistance()._neighbors
     for key in graph:
         for adjacent in graph[key]:
             self.assertTrue(key in graph[adjacent])
class ReverseAutocorrect:
    """Replace one word of a sentence with a nearby dictionary word.

    Word counts and the dictionary are read from the corpus file named by
    ``_corpus``. Candidate replacements are dictionary words reachable from
    the original word by a small number of single-letter edits; they are
    ranked by ``self._wed.edit_distance`` multiplied by the corpus count.
    """

    _corpus = "big.txt"

    def __init__(self):
        """Read the corpus and create the edit-distance helper."""
        model, dictionary = self._read_corpus(self._corpus)
        self._model = model
        self._dict = dictionary
        self._wed = WeightedEditDistance()

    def _read_corpus(self, f):
        """Build the word-count model and dictionary from the file ``f``.

        The text is lower-cased and split into words made of word
        characters, optionally with apostrophes inside.

        Returns:
            A ``(model, dictionary)`` pair. ``model`` maps each word to its
            number of occurrences plus one (unseen words report 1);
            ``dictionary`` is the set of words found in the file.
        """
        # The context manager closes the file even if reading fails.
        with open(f) as corpus:
            text = corpus.read()
        # Raw string so the regex escapes are not parsed as string escapes.
        word_re = re.compile(r"\w[\w']*\w|\w")
        model = collections.defaultdict(lambda: 1)
        for match in word_re.finditer(text.lower()):
            model[match.group(0)] += 1
        # Every word that was counted is, by construction, a key of model.
        return model, set(model)

    def generate_sentence(self, sentence):
        """Return ``sentence`` with one randomly chosen word replaced.

        The sentence is split on whitespace and re-joined with single
        spaces. The chosen word is lower-cased before a replacement is
        looked up. A sentence without any words yields an empty string.
        """
        words = sentence.split()
        if not words:
            # randrange(0, 0) would raise ValueError; nothing to replace.
            return ""
        i = randrange(0, len(words))
        words[i] = self._calculate_replacement_word(words[i].lower())
        return " ".join(words)

    def _calculate_replacement_word(self, word):
        """Return the best-scoring dictionary word near ``word``.

        Each candidate is scored by its edit distance from ``word``
        multiplied by its model count; the lowest score wins, with ties
        broken by the word itself. If no candidate exists, ``word`` is
        returned unchanged.
        """
        # Candidate generation grows quickly with word length, so longer
        # words only get a single round of edits.
        degree = 1 if len(word) >= 6 else 2
        scored = [
            (self._wed.edit_distance(word, w) * self._model[w], w)
            for w in self._generate_words(word, degree)
        ]
        if not scored:
            return word
        return min(scored)[1]

    def _generate_words(self, word, degree):
        """Return the dictionary words within ``degree`` edits of ``word``.

        All strings reachable in ``degree`` rounds of single-letter edits
        are generated and filtered against the dictionary. ``word`` itself
        is never part of the result.
        """
        candidates = self._generate_possible_words({word}, degree)
        return {w for w in candidates if w in self._dict and w != word}

    def _generate_possible_words(self, possible, degree):
        """Expand the set ``possible`` by ``degree`` rounds of edits.

        In each round every word gains its single-letter substitutions,
        insertions and deletions at positions holding an ASCII lowercase
        letter, plus every single letter appended at the end. The input
        words are kept in the result. A ``degree`` of zero or less returns
        ``possible`` unchanged.
        """
        if degree <= 0:
            return possible

        # ascii_lowercase exists on both Python 2 and 3 and, unlike
        # string.lowercase, does not depend on the locale.
        alphabet = string.ascii_lowercase
        current = possible
        for _ in range(degree):
            expanded = set()
            for word in current:
                for i, char in enumerate(word):
                    if char not in alphabet:
                        continue
                    head, tail = word[:i], word[i + 1:]
                    for letter in alphabet:
                        # Substituting one letter
                        expanded.add(head + letter + tail)
                        # Adding an additional letter before position i
                        expanded.add(head + letter + char + tail)
                    # Removing a letter
                    expanded.add(head + tail)
                for letter in alphabet:
                    expanded.add(word + letter)
            current = current.union(expanded)
        return current
 def __init__(self):
     """Load the corpus model and dictionary, then build the distance helper."""
     # _read_corpus returns the word-count model and the dictionary together.
     self._model, self._dict = self._read_corpus(self._corpus)
     self._wed = WeightedEditDistance()
class ReverseAutocorrect():
    """Swap a word in a sentence for a similar word from a corpus.

    The corpus file named by ``_corpus`` supplies both the word counts and
    the dictionary. Replacement candidates are dictionary words produced by
    a few single-letter edits of the original word, and the one with the
    lowest ``edit distance * corpus count`` score is chosen.
    """

    _corpus = 'big.txt'

    def __init__(self):
        """Read the corpus and create the edit-distance helper."""
        model, dictionary = self._read_corpus(self._corpus)
        self._model = model
        self._dict = dictionary
        self._wed = WeightedEditDistance()

    def _read_corpus(self, f):
        """Read the file ``f`` and return ``(model, dictionary)``.

        ``model`` maps each lower-cased word to one more than the number
        of times it occurs (so unseen words report 1). ``dictionary`` is
        the set of words that occur in the file.
        """
        # Use a context manager so the handle is released on any error.
        with open(f) as handle:
            text = handle.read()
        # Raw string keeps the backslash escapes intact for the regex engine.
        r = re.compile(r"\w[\w']*\w|\w")
        model = collections.defaultdict(lambda: 1)
        dictionary = set()
        for match in r.finditer(text.lower()):
            word = match.group(0)
            model[word] += 1
            dictionary.add(word)
        return model, dictionary

    def generate_sentence(self, sentence):
        """Replace one randomly picked word of ``sentence``.

        The sentence is split on whitespace, the picked word is lower-cased
        and replaced according to ``_calculate_replacement_word``, and the
        words are joined back with single spaces. A sentence that contains
        no words is returned as an empty string.
        """
        words = sentence.split()
        if not words:
            # Avoid randrange(0, 0), which raises ValueError.
            return ''
        i = randrange(0, len(words))
        words[i] = self._calculate_replacement_word(words[i].lower())
        return " ".join(words)

    def _calculate_replacement_word(self, word):
        """Pick the replacement for ``word``.

        Candidates are scored by edit distance from ``word`` times their
        model count and the smallest ``(score, word)`` pair wins. When no
        candidate is available the original ``word`` is returned.
        """
        # Our algorithm for calculating replacement words is expensive
        # based on the length of the word, so we limit possible expensive
        # calculations.
        if len(word) >= 6:
            degree = 1
        else:
            degree = 2
        best = None
        for w in self._generate_words(word, degree):
            scored = (self._wed.edit_distance(word, w) * self._model[w], w)
            if best is None or scored < best:
                best = scored
        if best is None:
            # No dictionary word is close enough; leave the word alone.
            return word
        return best[1]

    def _generate_words(self, word, degree):
        """Return the set of dictionary words within ``degree`` edits.

        Every string within ``degree`` rounds of edits of ``word`` is
        generated and only those present in the dictionary are kept.
        ``word`` itself is excluded from the result.
        """
        reachable = self._generate_possible_words(set([word]), degree)
        possible = set(w for w in reachable if w in self._dict)
        possible.discard(word)
        return possible

    def _generate_possible_words(self, possible, degree):
        """Recursively expand ``possible`` by ``degree`` rounds of edits.

        One round adds, for every word, each single-letter substitution,
        insertion and deletion at positions that hold an ASCII lowercase
        letter, and each single letter appended at the end. Words from
        earlier rounds stay in the result. A ``degree`` of zero or less
        ends the recursion and returns ``possible`` as given.
        """
        if degree <= 0:
            return possible

        # string.lowercase is missing on Python 3 and locale-dependent on
        # Python 2; ascii_lowercase is available and fixed on both.
        letters = string.ascii_lowercase
        new_possibilities = set()
        for word in possible:
            for i in range(0, len(word)):
                if word[i] in letters:
                    for letter in letters:
                        # Substituting one letter
                        new_possibilities.add(word[:i] + letter + word[i + 1:])

                        # Adding an additional letter
                        new_possibilities.add(word[:i] + letter + word[i:])

                    # Removing a letter
                    new_possibilities.add(word[:i] + word[i + 1:])
            for letter in letters:
                new_possibilities.add(word + letter)

        return self._generate_possible_words(possible.union(new_possibilities),
                                             degree - 1)
 def __init__(self):
     """Set up corpus data and the edit-distance helper for this instance."""
     corpus_data = self._read_corpus(self._corpus)
     # The corpus reader yields the model first and the dictionary second.
     self._model, self._dict = corpus_data
     self._wed = WeightedEditDistance()