Example #1
def _main():
    print(sum_multiples(10))
    print(sum_multiples(1000))
    print(fib_recur(8))
    print(fib_iter(50))
    print(deriv(lambda x: 2 * (x ** 3), 5))
    print(matrix_mult([[1,2,3],[4,5,6]], [[7,8],[9,10],[11,12]]))
    print(matrix_mult([[2,4,1],[8,9,10],[1,3,2]], [[7,9,1],[3,3,2],[10,11,12]]))

    # 5.
    from levenshtein import Levenshtein

    lev1 = Levenshtein("Jack is a very nice boy, isn't he?", "jack is a very nice boy is he")
    print(lev1.distance())
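The helper functions called above are not part of this snippet. A minimal sketch of plausible implementations, inferred only from the call sites (every name and behavior here is an assumption, including the Project Euler-style reading of sum_multiples):

def sum_multiples(n):
    # Assumed meaning: sum of all multiples of 3 or 5 below n.
    return sum(x for x in range(n) if x % 3 == 0 or x % 5 == 0)

def fib_recur(n):
    # Naive recursive Fibonacci; fine for the small n=8 call.
    return n if n < 2 else fib_recur(n - 1) + fib_recur(n - 2)

def fib_iter(n):
    # Iterative Fibonacci; handles n=50 without exponential blowup.
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a

def deriv(f, x, h=1e-6):
    # Central-difference numerical derivative of f at x.
    return (f(x + h) - f(x - h)) / (2 * h)

def matrix_mult(a, b):
    # Row-by-column matrix product; len(b) is the shared inner dimension.
    return [[sum(a[i][k] * b[k][j] for k in range(len(b)))
             for j in range(len(b[0]))] for i in range(len(a))]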
Example #2
    def charAlign(self):
        ref_chars = [x for x in self.ref] + [' ']
        hyp_chars = [x for x in self.hyp] + [' ']

        lev = Levenshtein.align(ref_chars,
                                hyp_chars,
                                lowercase=self.lowercase,
                                reserve_list=set([' ']))
        lev.editops()
        self.char_align = lev.expandAlign()
        return self.char_align
Example #3
File: tests.py Project: srdecny/diasys
    def test_wer(self):
        examples = {
            ("foo", "bar"): 1.0,
            ("foo bar", "foo baz"): 1 / 2,
            ("foo foo", "bar baz"): 1.0,
            ("", ""): 0.0
        }

        for words, wer in examples.items():
            out = StringIO()
            Levenshtein(words[0], words[1], " ", False, False, True, out=out)
            self.assertEqual("WER: " + str(wer) + '\n', out.getvalue())
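The expected values encode the usual definition of WER: word-level edit distance divided by the number of reference words, so "foo bar" vs "foo baz" is one substitution over two words, i.e. 0.5. A minimal sketch of that computation (not this project's implementation, just the formula the test exercises):

def wer(ref, hyp):
    # Word error rate: edit distance over word lists / number of reference words.
    r, h = ref.split(), hyp.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            d[i][j] = min(d[i - 1][j] + 1,                           # deletion
                          d[i][j - 1] + 1,                           # insertion
                          d[i - 1][j - 1] + (r[i - 1] != h[j - 1]))  # substitution
    return d[len(r)][len(h)] / len(r) if r else 0.0

assert wer("foo bar", "foo baz") == 0.5
assert wer("", "") == 0.0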
Example #4
def train(instances, subreddits, algorithm, cluster_lambda,
          clustering_training_iterations):
    if algorithm == "lev":
        alg = Levenshtein(instances, cluster_lambda)
        alg.train()
        return alg
    elif algorithm == "lambda_means":
        alg = LambdaMeans(instances, subreddits, cluster_lambda,
                          clustering_training_iterations)
        alg.train(instances)
        return alg
Example #5
    def align_hyp(self, ref, hyp):
        match = []
        hyp_idx = 0
        ref_idx = 0
        lev = Levenshtein(ref, hyp)
        for op in lev.editops():
            assert hyp_idx < len(hyp) or op == Levenshtein.INS
            assert ref_idx < len(ref) or op == Levenshtein.DEL
            if op == Levenshtein.KEEP:
                assert hyp[hyp_idx] == ref[ref_idx]
                match.append(hyp[hyp_idx])
                hyp_idx += 1
                ref_idx += 1
            elif op == Levenshtein.SUB:
                match.append(None)
                hyp_idx += 1
                ref_idx += 1
            elif op == Levenshtein.DEL:
                hyp_idx += 1
            else:
                assert op == Levenshtein.INS
                match.append(None)
                ref_idx += 1
        return match
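As written, align_hyp produces exactly one entry per reference token: the aligned hyp token wherever the edit op is KEEP, and None for SUB and INS (DEL advances through hyp without emitting anything), so the result lines up index-for-index with ref.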
Example #6
File: tests.py Project: srdecny/diasys
    def test_example(self):
        examples = {
            ("", ""): 0,
            ("a a a", "a a a"): 0,
            ("a b", "a a a"): 1,
            ("a b c a", "a a a"): 1,
            ("foo", "bar"): 6,
            ("foo", "fooo"): 1
        }

        for words, distance in examples.items():
            out = StringIO()
            Levenshtein(words[0], words[1], " ", True, False, False, out=out)
            self.assertEqual("Minimum edit distance: " + str(distance) + '\n',
                             out.getvalue())
Example #7
    def accepts_sentence(self, words_general):
        # words_general: string
        # Returns True if words_general is close enough to one of the stored
        # sentences by any of three measures: tf-idf cosine similarity above
        # threshold_tfidf, ngram perplexity below threshold_perplexity_ngram,
        # or normalized Levenshtein distance below threshold_edit_distance.
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(
                    words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(
                    words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        if accept_tfidf or accept_ngp or accept_edit_distance:
            return True

        return False
Example #8
    def __init__(self,
                 ref,
                 hyp,
                 lowercase=False,
                 verbose=False,
                 pronounce_type=PronouncerType.Lexicon,
                 lexicon=None,
                 word_align_weights=Levenshtein.wordAlignWeights):
        if not ref:
            raise Exception("No reference file.\nref: {0}\nhyp: {1}".format(
                ref, hyp))

        if pronounce_type == PronouncerType.Lexicon:
            self.pronouncer = PronouncerLex(lexicon)
        else:
            self.pronouncer = PronouncerBase()

        self.ref = [x for x in ref.strip().split() if x]
        self.hyp = [x for x in hyp.strip().split() if x]
        self.refwords = ' '.join(self.ref)
        self.hypwords = ' '.join(self.hyp)

        self.lowercase = lowercase
        self.verbose = verbose

        # Perform word alignment
        lev = Levenshtein.align(self.ref,
                                self.hyp,
                                lowercase=self.lowercase,
                                weights=word_align_weights)
        lev.editops()
        self.wer_alignment = lev.expandAlignCompact()
        self.wer, self.wer_components = self.wer_alignment.error_rate()

        # Used for POWER alignment
        self.power_alignment = None
        self.power = None
        self.power_components = None

        # Used to find potential error regions
        self.split_regions = None
        self.error_indexes = None
        self.phonetic_alignments = None
        self.phonetic_lev = None
Example #10
    def phoneAlignToWordAlign(cls,
                              ref_words,
                              hyp_words,
                              ref_phones,
                              hyp_phones,
                              break_on_syllables=True):
        ref_word_span = (0, len(ref_words))
        hyp_word_span = (0, len(hyp_words))

        # Perform Levenshtein Alignment
        lev = Levenshtein.align(ref=ref_phones,
                                hyp=hyp_phones,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
        # dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights
        phone_align = lev.expandAlignCompact()

        worklist = list()
        worklist.append((ref_word_span, hyp_word_span, phone_align))

        full_reference = list()
        full_hypothesis = list()
        full_alignment = list()
        full_phone_align = list()

        while worklist:
            # Take the next set of sequence boundaries off the worklist
            ref_word_span, hyp_word_span, phone_align = worklist.pop()
            ref_word_index, ref_word_limit = ref_word_span
            hyp_word_index, hyp_word_limit = hyp_word_span

            # TODO: Currently only checking in the forward direction
            ref_word_builder = []  # Temp storage of words in alignment span
            hyp_word_builder = []

            ref_word_iter = enumerate(
                ref_words[ref_word_span[0]:ref_word_span[1]]
            )  # Iterates through the surface words
            hyp_word_iter = enumerate(
                hyp_words[hyp_word_span[0]:hyp_word_span[1]])

            ref_aligned = []  # Finalized alignments
            hyp_aligned = []
            alignment = []  # Finalized alignment labels

            ref_extra_syllable_word_index = None  # Used for marking words mapping to extra syllables in alignment.
            hyp_extra_syllable_word_index = None
            ref_syllable_count = 0
            hyp_syllable_count = 0

            ref_word_started = False  # Indicates whether a word is already accounted for in the alignment when a phoneme is reached.
            hyp_word_started = False

            advance_worklist = False
            commit_alignment = False

            for i in range(len(phone_align.align)):
                ref_type = TokType.checkAnnotation(phone_align.s1[i])
                hyp_type = TokType.checkAnnotation(phone_align.s2[i])

                # Check if word boundaries are reached, both on ref and hyp -- or the case where no more symbols can be read.
                if (i == len(phone_align.align) - 1) or (
                        ref_type == TokType.WordBoundary and ref_type == hyp_type):
                    align_tok = None
                    # Only write outputs if either the ref or the hyp has scanned some words.
                    if ref_word_builder:
                        if hyp_word_builder:
                            align_tok = AlignLabels.substitution if ref_word_builder != hyp_word_builder else AlignLabels.correct
                        else:
                            align_tok = AlignLabels.deletion
                    elif hyp_word_builder:
                        align_tok = AlignLabels.insertion

                    if align_tok:
                        # Add the remainder to the worklist
                        ref_word_span_next = (ref_word_index +
                                              len(ref_word_builder),
                                              ref_word_limit)
                        hyp_word_span_next = (hyp_word_index +
                                              len(hyp_word_builder),
                                              hyp_word_limit)
                        phone_align_next = phone_align.subsequence(
                            i, phone_align.length(), preserve_index=False)
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))

                        # "Commit" the current alignment
                        if align_tok in (AlignLabels.correct,
                                         AlignLabels.substitution):
                            alignment.append(align_tok)

                            # Check for syllable conflicts
                            if not break_on_syllables or not ref_extra_syllable_word_index:
                                ref_aligned.append(' '.join(ref_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                ref_aligned.append(' '.join(ref_word_builder[
                                    0:ref_extra_syllable_word_index]))
                                # The remaining words are deletions
                                for word in ref_word_builder[
                                        ref_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.deletion)
                                    ref_aligned.append(word)
                                    hyp_aligned.append('')
                                ref_syllable_count = 0

                            if not break_on_syllables or not hyp_extra_syllable_word_index:
                                hyp_aligned.append(' '.join(hyp_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                hyp_aligned.append(' '.join(hyp_word_builder[
                                    0:hyp_extra_syllable_word_index]))
                                # The remaining words are insertions
                                for word in hyp_word_builder[
                                        hyp_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.insertion)
                                    ref_aligned.append('')
                                    hyp_aligned.append(word)
                                    hyp_syllable_count = 0

                            if align_tok == AlignLabels.substitution:
                                # Check if you need to rework this alignment.
                                if len(ref_word_builder) != len(
                                        hyp_word_builder):
                                    # Word count mismatch in the alignment span. Is there a possibility that we need to re-align this segment?
                                    ref_word_span_curr = (
                                        ref_word_index,
                                        ref_word_index + len(ref_word_builder))
                                    hyp_word_span_curr = (
                                        hyp_word_index,
                                        hyp_word_index + len(hyp_word_builder))
                                    phone_align_curr = phone_align.subsequence(
                                        0, i + 1, preserve_index=False)

                                    lev = Levenshtein.align(
                                        ref=phone_align_curr.s1_tokens(),
                                        hyp=phone_align_curr.s2_tokens(),
                                        reserve_list=PowerAligner.reserve_list,
                                        exclusive_sets=PowerAligner.exclusive_sets,
                                        weights=Levenshtein.wordAlignWeights)
                                    # dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights

                                    phone_align_adjusted = lev.expandAlignCompact()

                                    if phone_align_curr.align != phone_align_adjusted.align:
                                        # Looks like we need to redo the phone-to-word alignment.
                                        worklist.append((ref_word_span_curr,
                                                         hyp_word_span_curr,
                                                         phone_align_adjusted))
                                    else:
                                        commit_alignment = True
                                else:
                                    commit_alignment = True

                        elif align_tok == AlignLabels.deletion:
                            for word in ref_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append(word)
                                hyp_aligned.append('')

                            commit_alignment = True
                            ref_syllable_count = 0

                        elif align_tok == AlignLabels.insertion:
                            for word in hyp_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append('')
                                hyp_aligned.append(word)

                            commit_alignment = True
                            hyp_syllable_count = 0

                        if commit_alignment:
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))
                            ref_aligned = []
                            hyp_aligned = []
                            alignment = []
                        break

                # Add words if word boundaries are reached.
                else:
                    if ref_type == TokType.WordBoundary:
                        ref_word_started = False
                        if hyp_type != TokType.WordBoundary and ref_word_builder and not hyp_word_builder:
                            # DELETION
                            # Ref word ended, but no hyp words have been added. Mark the current ref word(s) in the span as deletion errors.
                            # TODO: Dedupe this logic
                            for word in ref_word_builder:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1[i:] if x],
                                hyp=[x for x in phone_align.s2 if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            # dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif ref_type == TokType.Phoneme and not ref_word_started:
                        ref_word_started = True
                        try:
                            ref_word_item = next(ref_word_iter)
                            ref_word_builder.append(ref_word_item[1])
                        except StopIteration:
                            pass

                    if hyp_type == TokType.WordBoundary:
                        hyp_word_started = False
                        if ref_type != TokType.WordBoundary and hyp_word_builder and not ref_word_builder:
                            # INSERTION
                            # Hyp word ended, but no ref words have been added. Mark the current hyp word(s) in the span as insertion errors.
                            # TODO: Dedupe this logic
                            for word in hyp_word_builder:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1 if x],
                                hyp=[x for x in phone_align.s2[i:] if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            # dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif hyp_type == TokType.Phoneme and not hyp_word_started:
                        hyp_word_started = True
                        try:
                            hyp_word_item = next(hyp_word_iter)
                            hyp_word_builder.append(hyp_word_item[1])
                        except StopIteration:
                            pass

                # Check for syllable mismatches
                if ref_type == TokType.SyllableBoundary:
                    ref_syllable_count += 1
                if hyp_type == TokType.SyllableBoundary:
                    hyp_syllable_count += 1

                if (ref_type == TokType.SyllableBoundary == hyp_type
                        or ref_syllable_count == hyp_syllable_count):
                    # No syllable conflicts here!
                    ref_extra_syllable_word_index = None
                    hyp_extra_syllable_word_index = None
                elif (ref_type == TokType.SyllableBoundary
                      and not ref_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # Extra syllable in hypothesis. We only care if the syllable immediately follows a word boundary.
                    # This is because this indicates that a new word is being formed, which may likely be an insertion in hyp.
                    ref_extra_syllable_word_index = len(ref_word_builder) - 1
                    # print ref_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra hyp word:', ref_word_builder[ref_extra_syllable_word_index]
                elif (hyp_type == TokType.SyllableBoundary
                      and not hyp_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # This time there's an extra syllable in the ref, corresponding to a new ref word.
                    hyp_extra_syllable_word_index = len(hyp_word_builder) - 1
                    # print hyp_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra ref word:', hyp_word_builder[hyp_extra_syllable_word_index]
        # Concatenate all phoneme alignments
        fp_align = full_phone_align[0]
        for expand_align in full_phone_align[1:]:
            fp_align.append_alignment(expand_align)

        return ExpandedAlignment(full_reference, full_hypothesis,
                                 full_alignment), fp_align
Example #11
from levenshtein import Levenshtein

assert(Levenshtein.distance("", "abc") == 3)
assert(Levenshtein.distance("abc", "") == 3)
assert(Levenshtein.distance("", "") == 0)
assert(Levenshtein.distance("abc", "abc") == 0)
assert(Levenshtein.distance("abcdef", "xxxxxx") == 6)
assert(Levenshtein.distance("xxxxxx", "abcdef") == 6)
assert(Levenshtein.distance("abcdef", "abefcd") == 4)
assert(Levenshtein.distance("abefcd", "abcdef") == 4)
assert(Levenshtein.distance("acdefg", "abcdef") == 2)
assert(Levenshtein.distance("abcdef", "acdefg") == 2)

assert(Levenshtein.normalized_distance("abcdef", "abc") == 0.5)
assert(Levenshtein.normalized_distance("abcdef", "") == 1)
assert(Levenshtein.normalized_distance("a", "b") == 1)
assert(Levenshtein.normalized_distance("a", "") == 1)
assert(Levenshtein.normalized_distance("a", "a") == 0)
assert(Levenshtein.normalized_distance("abcd", "c") == 0.75)
assert(Levenshtein.normalized_distance("abcd", "bd") == 0.5)
assert(Levenshtein.normalized_distance("abcd", "db") == 0.75)

print("Success")
Example #13
from levenshtein import Levenshtein
# A more thorough general test is found in levenshteintests.py
# This test only shows that word-based distance works the same way as character-based distance

assert (Levenshtein.distance(["hi", "there"], ["hi"]) == 1)
assert (Levenshtein.distance(["hi", "there"], ["there", "hi"]) == 2)
assert (Levenshtein.distance(["hi", "there"], []) == 2)
assert (Levenshtein.distance(["aaa", "bbbb", "cccc"],
                             ["aaa", "fff", "cccc"]) == 1)
assert (Levenshtein.distance([], []) == 0)

print("Success")
Example #14
#!/usr/bin/python3

import argparse
from levenshtein import Levenshtein

parser = argparse.ArgumentParser(description="Find the Levenshtein distance between two strings.")
parser.add_argument("string1", help="First string.")
parser.add_argument("string2", help="Second string.")
parser.add_argument("-d", "--delimiter", help="Word delimiter. Default value: space", nargs='?', const=' ')
parser.add_argument("-D", "--distance", help="Print edit distance.", action="store_true")
parser.add_argument("-A", "--alignment", help="Print alignment.", action="store_true")
parser.add_argument("-E", "--error", help="Print WER.", action="store_true")

args = parser.parse_args()

Levenshtein(args.string1, args.string2, args.delimiter, args.distance, args.alignment, args.error)
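Given the tests above, an invocation such as python3 main.py "a b" "a a a" -d -D would print "Minimum edit distance: 1" (the filename main.py is hypothetical; -d without a value falls back to the space delimiter via const=' ').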

 
Example #16
                continue
            if len(word) < 5:
                continue
            yield word


def anonymize(words, token='<NAME>'):
    return [token if w in wilhelm or w in jakob else w for w in words]


def anonymize_letter(letter, token='<NAME>'):
    letter.words = anonymize(letter.words, token=token)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    args = parser.parse_args()

    letters = load_letters(bpath=args.path)
    words = set(headings(letters))
    dists = Levenshtein(*words)
    print("Wilhelm:\n")
    for w, _ in sorted(dists.dists_to('Wilhelm'), key=lambda x: x[1]):
        print("\t%s" % w)
    print()
    print("Jakob:\n")
    for w, _ in sorted(dists.dists_to('Jakob'), key=lambda x: x[1]):
        print("\t%s" % w)