Пример #1
0
def train(instances, subreddits, algorithm, cluster_lambda,
          clustering_training_iterations):
    """Build and train the model selected by ``algorithm``.

    * ``"lev"`` constructs ``Levenshtein(instances, cluster_lambda)`` and
      calls its ``train()`` without arguments.
    * ``"lambda_means"`` constructs ``LambdaMeans`` from ``instances``,
      ``subreddits``, ``cluster_lambda`` and
      ``clustering_training_iterations`` and calls ``train(instances)``.

    Returns the model object after its ``train`` call, or ``None`` when
    ``algorithm`` is neither of the two names above.
    """
    if algorithm == "lev":
        model = Levenshtein(instances, cluster_lambda)
        model.train()
        return model

    if algorithm == "lambda_means":
        model = LambdaMeans(
            instances,
            subreddits,
            cluster_lambda,
            clustering_training_iterations,
        )
        model.train(instances)
        return model

    # Any other algorithm name is not handled; callers receive None.
    return None
Пример #2
0
    def test_wer(self):
        """Check the "WER: ..." line written for several input pairs.

        Every case runs ``Levenshtein`` on two strings with a single-space
        delimiter and only the last of the three boolean flags set
        (presumably the "print WER" switch -- confirm against the
        ``Levenshtein`` signature). The text captured in the ``StringIO``
        passed as ``out`` must be ``"WER: "`` followed by the expected value
        and a newline.
        """
        cases = [
            ("foo", "bar", 1.0),
            ("foo bar", "foo baz", 1 / 2),
            ("foo foo", "bar baz", 1.0),
            ("", "", 0.0),
        ]

        for first, second, expected in cases:
            captured = StringIO()
            Levenshtein(first, second, " ", False, False, True, out=captured)
            expected_line = "WER: " + str(expected) + '\n'
            self.assertEqual(expected_line, captured.getvalue())
Пример #3
0
    def test_example(self):
        """Check the "Minimum edit distance: ..." line for several inputs.

        Every case runs ``Levenshtein`` on two strings with a single-space
        delimiter and only the first of the three boolean flags set
        (presumably the "print distance" switch -- confirm against the
        ``Levenshtein`` signature). The text captured in the ``StringIO``
        passed as ``out`` must be ``"Minimum edit distance: "`` followed by
        the expected value and a newline.
        """
        cases = [
            ("", "", 0),
            ("a a a", "a a a", 0),
            ("a b", "a a a", 1),
            ("a b c a", "a a a", 1),
            ("foo", "bar", 6),
            ("foo", "fooo", 1),
        ]

        for first, second, expected in cases:
            captured = StringIO()
            Levenshtein(first, second, " ", True, False, False, out=captured)
            expected_line = "Minimum edit distance: " + str(expected) + '\n'
            self.assertEqual(expected_line, captured.getvalue())
Пример #4
0
 def align_hyp(self, ref, hyp):
     """Walk the edit operations between ``ref`` and ``hyp`` and build a match list.

     The operations come from ``Levenshtein(ref, hyp).editops()``. One
     entry is appended for every operation that advances the ``ref``
     cursor:

     * ``KEEP`` -- the ``hyp`` item (asserted equal to the ``ref`` item);
     * ``SUB``  -- ``None``;
     * ``INS``  -- ``None``.

     ``DEL`` only advances the ``hyp`` cursor and appends nothing.

     Returns the list of appended entries.
     """
     aligned = []
     h = 0  # cursor into hyp
     r = 0  # cursor into ref
     for op in Levenshtein(ref, hyp).editops():
         # Only INS may occur once hyp is exhausted, and only DEL once
         # ref is exhausted, because those are the operations that do not
         # read from the respective sequence.
         assert h < len(hyp) or op == Levenshtein.INS
         assert r < len(ref) or op == Levenshtein.DEL
         if op == Levenshtein.KEEP:
             token = hyp[h]
             assert token == ref[r]
             aligned.append(token)
             h, r = h + 1, r + 1
         elif op == Levenshtein.SUB:
             aligned.append(None)
             h, r = h + 1, r + 1
         elif op == Levenshtein.DEL:
             h += 1
         else:
             assert op == Levenshtein.INS
             aligned.append(None)
             r += 1
     return aligned
Пример #5
0
#!/usr/bin/python3

import argparse
from levenshtein import Levenshtein

# Command-line front end: parse two strings plus output switches and hand
# them to Levenshtein.
parser = argparse.ArgumentParser(
    description="Find the Levenshtein distance between two strings."
)

# Positional inputs.
parser.add_argument("string1", help="First string.")
parser.add_argument("string2", help="Second string.")

# NOTE(review): with nargs='?' and const=' ', a bare "-d" yields a space, but
# omitting "-d" entirely yields None (no default= is given) even though the
# help text advertises a space as the default -- confirm how Levenshtein
# treats a None delimiter.
parser.add_argument(
    "-d", "--delimiter",
    nargs='?',
    const=' ',
    help="Word delimiter. Default value: space",
)

# Output switches; each is False unless its flag is present.
parser.add_argument(
    "-D", "--distance",
    action="store_true",
    help="Print edit distance.",
)
parser.add_argument(
    "-A", "--alignment",
    action="store_true",
    help="Print alignment.",
)
parser.add_argument(
    "-E", "--error",
    action="store_true",
    help="Print WER.",
)

args = parser.parse_args()

# The instance is not kept: the constructor is called for its effects only.
Levenshtein(
    args.string1,
    args.string2,
    args.delimiter,
    args.distance,
    args.alignment,
    args.error,
)

 
Пример #6
0
                continue
            if len(word) < 5:
                continue
            yield word


def anonymize(words, token='<NAME>'):
    """Return a new list in which listed names are replaced by ``token``.

    A word is replaced when it is contained in the module-level ``wilhelm``
    or ``jakob`` collections; every other word is copied unchanged. The
    input iterable itself is not modified.
    """
    result = []
    for word in words:
        is_name = word in wilhelm or word in jakob
        result.append(token if is_name else word)
    return result


def anonymize_letter(letter, token='<NAME>'):
    """Replace ``letter.words`` in place with its anonymized version.

    The current ``letter.words`` is passed through ``anonymize`` with the
    given ``token`` and the result is assigned back to the attribute.
    Nothing is returned.
    """
    anonymized = anonymize(letter.words, token=token)
    letter.words = anonymized


if __name__ == '__main__':
    # Script mode: load the letters found under the given path, collect the
    # distinct heading words and list them ordered by their score relative
    # to each of the two reference names.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    args = parser.parse_args()

    letters = load_letters(bpath=args.path)
    words = set(headings(letters))
    # Every distinct heading word is passed as its own positional argument.
    dists = Levenshtein(*words)

    sections = (("Wilhelm:\n", 'Wilhelm'), ("Jakob:\n", 'Jakob'))
    for position, (heading, target) in enumerate(sections):
        if position > 0:
            # Blank line between consecutive sections.
            print()
        print(heading)
        # dists_to() yields pairs; order by the second element (presumably
        # the distance -- confirm) and print only the first.
        ranked = sorted(dists.dists_to(target), key=lambda pair: pair[1])
        for candidate, _ in ranked:
            print("\t%s" % candidate)