Пример #1
0
 def test_edit_distance(self):
     # Each row is (first, second, expected distance). Every pair is checked
     # in both argument orders.
     expectations = (
         # Against the empty string.
         ('', 'aa', 2),
         ('aa', '', 2),
         # One trailing character added or removed.
         ('a', 'ab', 1),
         ('ab', 'a', 1),
         # One character replaced.
         ('ab', 'aa', 1),
         ('aa', 'ab', 1),
         # Several characters added or removed.
         ('abd', 'abcdef', 3),
         ('abcdef', 'abd', 3),
     )
     for first, second, expected in expectations:
         self.assertEqual(edit_distance(first, second), expected)
Пример #2
0
    def contributors_by_fuzzy_match(self, string):
        """Find the contributors that best match a free-form query string.

        Progressively looser strategies are tried, stopping at the first one
        that yields an answer:

        1. the lowercased query is looked up with contributor_by_name,
           account_by_email and contributor_by_irc_nickname;
        2. the lowercased query is looked up with
           contributors_by_email_username and accepted if exactly one
           contributor comes back;
        3. the query exactly as typed (not lowercased) is searched for in
           each contributor's name shorthands and accepted if exactly one
           contributor matches;
        4. the lowercased query is compared by edit distance against each
           contributor's name tokens and the closest contributors win.

        Args:
            string: The text to look up.

        Returns:
            A ``(contributors, distance)`` tuple. ``distance`` is 0 for
            steps 1-3 and the smallest edit distance found for step 4. When
            nothing is close enough the result is an empty list paired with
            the length of the lowercased query.
        """
        lowered = string.lower()

        # 1. Exact match for fullname, email and irc_nicknames
        account = (
            self.contributor_by_name(lowered)
            or self.account_by_email(lowered)
            or self.contributor_by_irc_nickname(lowered)
        )
        if account:
            return [account], 0

        # 2. Exact match for email username (before @)
        accounts = self.contributors_by_email_username(lowered)
        if accounts and len(accounts) == 1:
            return accounts, 0

        # 3. Exact match for first name, last name, and first name + initial combinations such as "Dan B" and "Tim H"
        accounts = [
            contributor
            for contributor in self.contributors()
            if string in self._contributor_name_shorthands(contributor)
        ]
        if len(accounts) == 1:
            return accounts, 0

        # 4. Finally, fuzzy-match using edit-distance
        query_length = len(lowered)
        # Floor division keeps the threshold, and therefore the returned
        # distance, an integer; true division would leak a float such as 1.0
        # into the result on Python 3.
        min_distance = query_length // 2 - 1
        closest = []
        for contributor in self.contributors():
            tokens = self._tokenize_contributor_name(contributor)
            # Tokens whose length differs from the query by more than the
            # current best distance are not compared at all.
            distances = [
                edit_distance(token, lowered)
                for token in tokens
                if abs(len(token) - query_length) <= min_distance
            ]
            if not distances:
                continue
            distance = min(distances)
            if distance == min_distance:
                # Ties with the current best are all reported.
                closest.append(contributor)
            elif distance < min_distance:
                # A strictly better match discards everything found so far.
                closest = [contributor]
                min_distance = distance
        if not closest:
            return [], query_length
        return closest, min_distance
Пример #3
0
    def contributors_by_fuzzy_match(self, string):
        """Return the contributors that most closely match a query string.

        The search stops at the first of these steps that produces a result:

        1. contributor_by_name, account_by_email and
           contributor_by_irc_nickname are tried with the lowercased query;
        2. contributors_by_email_username is tried with the lowercased query
           and used only if it returns exactly one contributor;
        3. the query as typed (case preserved) is looked for among each
           contributor's name shorthands and used only if exactly one
           contributor matches;
        4. the lowercased query is compared by edit distance with every
           contributor's name tokens, keeping those with the smallest
           distance.

        Args:
            string: The text to look up.

        Returns:
            A two-item tuple ``(contributors, distance)``. The distance is 0
            for steps 1-3 and the minimum edit distance for step 4. If no
            contributor is close enough, the tuple holds an empty list and
            the length of the lowercased query.
        """
        query = string.lower()

        # 1. Exact match for fullname, email and irc_nicknames
        account = (
            self.contributor_by_name(query)
            or self.account_by_email(query)
            or self.contributor_by_irc_nickname(query)
        )
        if account:
            return [account], 0

        # 2. Exact match for email username (before @)
        accounts = self.contributors_by_email_username(query)
        if accounts and len(accounts) == 1:
            return accounts, 0

        # 3. Exact match for first name, last name, and first name + initial combinations such as "Dan B" and "Tim H"
        accounts = [
            contributor for contributor in self.contributors()
            if string in self._contributor_name_shorthands(contributor)
        ]
        if len(accounts) == 1:
            return accounts, 0

        # 4. Finally, fuzzy-match using edit-distance
        query_length = len(query)
        # Use floor division so the threshold stays an integer: with true
        # division the value handed back to callers could be a float (for
        # example 1.0) on Python 3.
        best_distance = query_length // 2 - 1
        best_matches = []
        for contributor in self.contributors():
            tokens = self._tokenize_contributor_name(contributor)
            # Only tokens whose length is within the current best distance of
            # the query's length are compared.
            distances = [
                edit_distance(token, query) for token in tokens
                if abs(len(token) - query_length) <= best_distance
            ]
            if not distances:
                continue
            distance = min(distances)
            if distance == best_distance:
                # Equal to the best so far: report alongside earlier matches.
                best_matches.append(contributor)
            elif distance < best_distance:
                # Strictly closer: earlier matches are dropped.
                best_matches = [contributor]
                best_distance = distance
        if not best_matches:
            return [], query_length
        return best_matches, best_distance