def _main():
    print(sum_multiples(10))
    print(sum_multiples(1000))
    print(fib_recur(8))
    print(fib_iter(50))
    print(deriv(lambda x: 2 * (x ** 3), 5))
    print(matrix_mult([[1, 2, 3], [4, 5, 6]],
                      [[7, 8], [9, 10], [11, 12]]))
    print(matrix_mult([[2, 4, 1], [8, 9, 10], [1, 3, 2]],
                      [[7, 9, 1], [3, 3, 2], [10, 11, 12]]))

    # 5. Levenshtein distance demo
    from levenshtein import Levenshtein
    lev1 = Levenshtein("Jack is a very nice boy, isn't he?",
                       "jack is a very nice boy is he")
    print(lev1.distance())
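# The helpers called in _main (sum_multiples, fib_recur, fib_iter, deriv,
# matrix_mult) are defined elsewhere. As a minimal sketch of what two of
# them might look like -- signatures inferred from the calls above, not
# confirmed by the source:

def deriv(f, x, h=1e-6):
    """Approximate f'(x) with the central difference (f(x+h) - f(x-h)) / 2h."""
    return (f(x + h) - f(x - h)) / (2 * h)


def matrix_mult(a, b):
    """Multiply an m x n matrix by an n x p matrix (lists of rows)."""
    assert len(a[0]) == len(b), "inner dimensions must match"
    return [[sum(a[i][k] * b[k][j] for k in range(len(b)))
             for j in range(len(b[0]))]
            for i in range(len(a))]


# Sanity check against the demo: deriv(lambda x: 2 * x ** 3, 5) should be
# close to the analytic value 6 * 5 ** 2 == 150, and the first matrix_mult
# call should produce [[58, 64], [139, 154]].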
    def charAlign(self):
        """Align self.ref and self.hyp at the character level, treating the
        space separator as a reserved token."""
        ref_chars = [x for x in self.ref] + [' ']
        hyp_chars = [x for x in self.hyp] + [' ']
        lev = Levenshtein.align(ref_chars, hyp_chars,
                                lowercase=self.lowercase,
                                reserve_list=set([' ']))
        lev.editops()
        self.char_align = lev.expandAlign()
        return self.char_align
    def test_wer(self):
        examples = {
            ("foo", "bar"): 1.0,
            ("foo bar", "foo baz"): 1 / 2,
            ("foo foo", "bar baz"): 1.0,
            ("", ""): 0.0,
        }
        for words, wer in examples.items():
            out = StringIO()
            Levenshtein(words[0], words[1], " ", False, False, True, out=out)
            self.assertEqual("WER: " + str(wer) + '\n', out.getvalue())
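# For reference: WER here is the standard word error rate, the word-level
# edit distance (substitutions + deletions + insertions) divided by the
# number of reference words. Worked against the cases above, assuming that
# convention:
#
#   ref "foo bar" vs hyp "foo baz" -> 1 substitution / 2 ref words = 0.5
#   ref "foo foo" vs hyp "bar baz" -> 2 substitutions / 2 ref words = 1.0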
def train(instances, subreddits, algorithm, cluster_lambda,
          clustering_training_iterations):
    if algorithm == "lev":
        alg = Levenshtein(instances, cluster_lambda)
        alg.train()
        return alg
    elif algorithm == "lambda_means":
        alg = LambdaMeans(instances, subreddits, cluster_lambda,
                          clustering_training_iterations)
        alg.train(instances)
        return alg
    def align_hyp(self, ref, hyp):
        """Return one entry per reference token: the matching hypothesis
        token where the alignment keeps it, or None on a mismatch. Extra
        hypothesis tokens (DEL ops) are skipped."""
        match = []
        hyp_idx = 0
        ref_idx = 0
        lev = Levenshtein(ref, hyp)
        for op in lev.editops():
            assert hyp_idx < len(hyp) or op == Levenshtein.INS
            assert ref_idx < len(ref) or op == Levenshtein.DEL
            if op == Levenshtein.KEEP:
                assert hyp[hyp_idx] == ref[ref_idx]
                match.append(hyp[hyp_idx])
                hyp_idx += 1
                ref_idx += 1
            elif op == Levenshtein.SUB:
                match.append(None)
                hyp_idx += 1
                ref_idx += 1
            elif op == Levenshtein.DEL:
                # Consume the extra hypothesis token without emitting.
                hyp_idx += 1
            else:
                assert op == Levenshtein.INS
                # Reference token with no hypothesis counterpart.
                match.append(None)
                ref_idx += 1
        return match
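# A usage sketch for align_hyp, assuming editops() yields one op per
# alignment column using the KEEP/SUB/DEL/INS constants referenced above
# (the exact op sequence depends on this Levenshtein implementation, so the
# output is illustrative only):
#
#   self.align_hyp(["a", "b", "c"], ["a", "x", "c"])
#   # -> ["a", None, "c"]   (the "b"/"x" column is a substitution)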
    def test_example(self):
        examples = {
            ("", ""): 0,
            ("a a a", "a a a"): 0,
            ("a b", "a a a"): 1,
            ("a b c a", "a a a"): 1,
            ("foo", "bar"): 6,
            ("foo", "fooo"): 1,
        }
        for words, distance in examples.items():
            out = StringIO()
            Levenshtein(words[0], words[1], " ", True, False, False, out=out)
            self.assertEqual("Minimum edit distance: " + str(distance) + '\n',
                             out.getvalue())
    def accepts_sentence(self, words_general):
        """Return True if words_general is close enough to the stored
        sentences by any one of three measures:

        - n-gram perplexity <= threshold_perplexity_ngram,
        - tf-idf cosine similarity >= threshold_tfidf for some stored
          sentence, or
        - normalized Levenshtein distance <= threshold_edit_distance for
          some stored sentence.

        In TURBO mode, the first passing check short-circuits to True.
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general,
                                                        words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(
                    words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        if accept_tfidf or accept_ngp or accept_edit_distance:
            return True
        return False
    def __init__(self, ref, hyp, lowercase=False, verbose=False,
                 pronounce_type=PronouncerType.Lexicon, lexicon=None,
                 word_align_weights=Levenshtein.wordAlignWeights):
        if not ref:
            raise Exception("No reference file.\nref: {0}\nhyp: {1}".format(
                ref, hyp))
        if pronounce_type == PronouncerType.Lexicon:
            self.pronouncer = PronouncerLex(lexicon)
        else:
            self.pronouncer = PronouncerBase()

        self.ref = [x for x in ref.strip().split() if x]
        self.hyp = [x for x in hyp.strip().split() if x]
        self.refwords = ' '.join(self.ref)
        self.hypwords = ' '.join(self.hyp)
        self.lowercase = lowercase
        self.verbose = verbose

        # Perform word alignment.
        lev = Levenshtein.align(self.ref, self.hyp,
                                lowercase=self.lowercase,
                                weights=word_align_weights)
        lev.editops()
        self.wer_alignment = lev.expandAlignCompact()
        self.wer, self.wer_components = self.wer_alignment.error_rate()

        # Used for POWER alignment.
        self.power_alignment = None
        self.power = None
        self.power_components = None

        # Used to find potential error regions.
        self.split_regions = None
        self.error_indexes = None
        self.phonetic_alignments = None
        self.phonetic_lev = None
    def phoneAlignToWordAlign(cls, ref_words, hyp_words, ref_phones,
                              hyp_phones, break_on_syllables=True):
        """Convert a phoneme-level alignment into a word-level alignment.

        Returns a tuple (word_alignment, phone_alignment): an
        ExpandedAlignment over words plus the concatenated phoneme
        alignment."""
        ref_word_span = (0, len(ref_words))
        hyp_word_span = (0, len(hyp_words))

        # Perform Levenshtein alignment.
        lev = Levenshtein.align(ref=ref_phones, hyp=hyp_phones,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
        # Disabled options: dist_penalty=PowerAligner.phoneDistPenalty,
        # dist_penalty_set=Levenshtein.wordAlignWeights
        phone_align = lev.expandAlignCompact()

        worklist = list()
        worklist.append((ref_word_span, hyp_word_span, phone_align))

        full_reference = list()
        full_hypothesis = list()
        full_alignment = list()
        full_phone_align = list()

        while worklist:
            # Take the next set of sequence boundaries off the worklist.
            ref_word_span, hyp_word_span, phone_align = worklist.pop()
            ref_word_index, ref_word_limit = ref_word_span
            hyp_word_index, hyp_word_limit = hyp_word_span

            # TODO: Currently only checking in the forward direction
            ref_word_builder = []  # Temp storage of words in alignment span
            hyp_word_builder = []
            # Iterate through the surface words.
            ref_word_iter = enumerate(
                ref_words[ref_word_span[0]:ref_word_span[1]])
            hyp_word_iter = enumerate(
                hyp_words[hyp_word_span[0]:hyp_word_span[1]])
            ref_aligned = []  # Finalized alignments
            hyp_aligned = []
            alignment = []  # Finalized alignment labels

            # Used for marking words mapping to extra syllables in alignment.
            ref_extra_syllable_word_index = None
            hyp_extra_syllable_word_index = None
            ref_syllable_count = 0
            hyp_syllable_count = 0

            # Indicates whether a word is already accounted for in the
            # alignment when a phoneme is reached.
            ref_word_started = False
            hyp_word_started = False
            advance_worklist = False
            commit_alignment = False

            for i in range(len(phone_align.align)):
                ref_type = TokType.checkAnnotation(phone_align.s1[i])
                hyp_type = TokType.checkAnnotation(phone_align.s2[i])

                # Check if word boundaries are reached, both on ref and hyp
                # -- or the case where no more symbols can be read.
                if (i == len(phone_align.align) - 1) or (
                        ref_type == TokType.WordBoundary
                        and ref_type == hyp_type):
                    align_tok = None
                    # Only write outputs if either the ref or the hyp has
                    # scanned some words.
                    if ref_word_builder:
                        if hyp_word_builder:
                            align_tok = (
                                AlignLabels.substitution
                                if ref_word_builder != hyp_word_builder
                                else AlignLabels.correct)
                        else:
                            align_tok = AlignLabels.deletion
                    elif hyp_word_builder:
                        align_tok = AlignLabels.insertion

                    if align_tok:
                        # Add the remainder to the worklist.
                        ref_word_span_next = (
                            ref_word_index + len(ref_word_builder),
                            ref_word_limit)
                        hyp_word_span_next = (
                            hyp_word_index + len(hyp_word_builder),
                            hyp_word_limit)
                        phone_align_next = phone_align.subsequence(
                            i, phone_align.length(), preserve_index=False)
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next,
                                         phone_align_next))

                        # "Commit" the current alignment.
                        if align_tok in (AlignLabels.correct,
                                         AlignLabels.substitution):
                            alignment.append(align_tok)

                            # Check for syllable conflicts.
                            if (not break_on_syllables
                                    or not ref_extra_syllable_word_index):
                                ref_aligned.append(' '.join(ref_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                ref_aligned.append(' '.join(
                                    ref_word_builder[
                                        0:ref_extra_syllable_word_index]))
                                # The remaining words are deletions.
                                for word in ref_word_builder[
                                        ref_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.deletion)
                                    ref_aligned.append(word)
                                    hyp_aligned.append('')
                                ref_syllable_count = 0

                            if (not break_on_syllables
                                    or not hyp_extra_syllable_word_index):
                                hyp_aligned.append(' '.join(hyp_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                hyp_aligned.append(' '.join(
                                    hyp_word_builder[
                                        0:hyp_extra_syllable_word_index]))
                                # The remaining words are insertions.
                                for word in hyp_word_builder[
                                        hyp_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.insertion)
                                    ref_aligned.append('')
                                    hyp_aligned.append(word)
                                hyp_syllable_count = 0

                            if align_tok == AlignLabels.substitution:
                                # Check if you need to rework this alignment.
                                if len(ref_word_builder) != len(hyp_word_builder):
                                    # Word count mismatch in the alignment
                                    # span. Is there a possibility that we
                                    # need to re-align this segment?
                                    ref_word_span_curr = (
                                        ref_word_index,
                                        ref_word_index + len(ref_word_builder))
                                    hyp_word_span_curr = (
                                        hyp_word_index,
                                        hyp_word_index + len(hyp_word_builder))
                                    phone_align_curr = phone_align.subsequence(
                                        0, i + 1, preserve_index=False)
                                    lev = Levenshtein.align(
                                        ref=phone_align_curr.s1_tokens(),
                                        hyp=phone_align_curr.s2_tokens(),
                                        reserve_list=PowerAligner.reserve_list,
                                        exclusive_sets=PowerAligner.exclusive_sets,
                                        weights=Levenshtein.wordAlignWeights)
                                    # Disabled options: dist_penalty=...,
                                    # dist_penalty_set=... (as above)
                                    phone_align_adjusted = lev.expandAlignCompact()
                                    if phone_align_curr.align != phone_align_adjusted.align:
                                        # Looks like we need to redo the
                                        # phone-to-word alignment.
                                        worklist.append((ref_word_span_curr,
                                                         hyp_word_span_curr,
                                                         phone_align_adjusted))
                                    else:
                                        commit_alignment = True
                                else:
                                    commit_alignment = True
                        elif align_tok == AlignLabels.deletion:
                            for word in ref_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            commit_alignment = True
                            ref_syllable_count = 0
                        elif align_tok == AlignLabels.insertion:
                            for word in hyp_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            commit_alignment = True
                            hyp_syllable_count = 0

                    if commit_alignment:
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(phone_align.subsequence(
                            0, i, preserve_index=False))
                        ref_aligned = []
                        hyp_aligned = []
                        alignment = []
                    break

                # Add words if word boundaries are reached.
                else:
                    if ref_type == TokType.WordBoundary:
                        ref_word_started = False
                        if (hyp_type != TokType.WordBoundary
                                and ref_word_builder
                                and not hyp_word_builder):
                            # DELETION: ref word ended, but no hyp words have
                            # been added. Mark the current ref word(s) in the
                            # span as deletion errors.
                            # TODO: Dedupe this logic
                            for word in ref_word_builder:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(phone_align.subsequence(
                                0, i, preserve_index=False))

                            # Add the remainder to the worklist.
                            ref_word_span_next = (
                                ref_word_index + len(ref_word_builder),
                                ref_word_limit)
                            hyp_word_span_next = (
                                hyp_word_index + len(hyp_word_builder),
                                hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1[i:] if x],
                                hyp=[x for x in phone_align.s2 if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            # Disabled options: dist_penalty=...,
                            # dist_penalty_set=... (as above)
                            phone_align_next = lev.expandAlignCompact()
                            worklist.append((ref_word_span_next,
                                             hyp_word_span_next,
                                             phone_align_next))
                            break
                    elif ref_type == TokType.Phoneme and not ref_word_started:
                        ref_word_started = True
                        try:
                            # Python 3: use next() on the iterator.
                            ref_word_item = next(ref_word_iter)
                            ref_word_builder.append(ref_word_item[1])
                        except StopIteration:
                            pass

                    if hyp_type == TokType.WordBoundary:
                        hyp_word_started = False
                        if (ref_type != TokType.WordBoundary
                                and hyp_word_builder
                                and not ref_word_builder):
                            # INSERTION: hyp word ended, but no ref words have
                            # been added. Mark the current hyp word(s) in the
                            # span as insertion errors.
                            # TODO: Dedupe this logic
                            for word in hyp_word_builder:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(phone_align.subsequence(
                                0, i, preserve_index=False))

                            # Add the remainder to the worklist.
                            ref_word_span_next = (
                                ref_word_index + len(ref_word_builder),
                                ref_word_limit)
                            hyp_word_span_next = (
                                hyp_word_index + len(hyp_word_builder),
                                hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1 if x],
                                hyp=[x for x in phone_align.s2[i:] if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            # Disabled options: dist_penalty=...,
                            # dist_penalty_set=... (as above)
                            phone_align_next = lev.expandAlignCompact()
                            worklist.append((ref_word_span_next,
                                             hyp_word_span_next,
                                             phone_align_next))
                            break
                    elif hyp_type == TokType.Phoneme and not hyp_word_started:
                        hyp_word_started = True
                        try:
                            hyp_word_item = next(hyp_word_iter)
                            hyp_word_builder.append(hyp_word_item[1])
                        except StopIteration:
                            pass

                    # Check for syllable mismatches.
                    if ref_type == TokType.SyllableBoundary:
                        ref_syllable_count += 1
                    if hyp_type == TokType.SyllableBoundary:
                        hyp_syllable_count += 1

                    if (ref_type == TokType.SyllableBoundary == hyp_type
                            or ref_syllable_count == hyp_syllable_count):
                        # No syllable conflicts here!
                        ref_extra_syllable_word_index = None
                        hyp_extra_syllable_word_index = None
                    elif (ref_type == TokType.SyllableBoundary
                          and not ref_extra_syllable_word_index
                          and TokType.checkAnnotation(phone_align.s2[i - 1])
                          == TokType.WordBoundary):
                        # Extra syllable in hypothesis. We only care if the
                        # syllable immediately follows a word boundary,
                        # because that indicates a new word is being formed,
                        # which may likely be an insertion in hyp.
                        ref_extra_syllable_word_index = len(ref_word_builder) - 1
                        # print(ref_word_builder)
                        # print('Syllable/word mismatch at', i)
                        # print('Extra hyp word:',
                        #       ref_word_builder[ref_extra_syllable_word_index])
                    elif (hyp_type == TokType.SyllableBoundary
                          and not hyp_extra_syllable_word_index
                          and TokType.checkAnnotation(phone_align.s2[i - 1])
                          == TokType.WordBoundary):
                        # This time there's an extra syllable in the ref,
                        # corresponding to a new ref word.
                        hyp_extra_syllable_word_index = len(hyp_word_builder) - 1
                        # print(hyp_word_builder)
                        # print('Syllable/word mismatch at', i)
                        # print('Extra ref word:',
                        #       hyp_word_builder[hyp_extra_syllable_word_index])

        # Concatenate all phoneme alignments.
        fp_align = full_phone_align[0]
        for expand_align in full_phone_align[1:]:
            fp_align.append_alignment(expand_align)

        return (ExpandedAlignment(full_reference, full_hypothesis,
                                  full_alignment), fp_align)
from levenshtein import Levenshtein

assert Levenshtein.distance("", "abc") == 3
assert Levenshtein.distance("abc", "") == 3
assert Levenshtein.distance("", "") == 0
assert Levenshtein.distance("abc", "abc") == 0
assert Levenshtein.distance("abcdef", "xxxxxx") == 6
assert Levenshtein.distance("xxxxxx", "abcdef") == 6
assert Levenshtein.distance("abcdef", "abefcd") == 4
assert Levenshtein.distance("abefcd", "abcdef") == 4
assert Levenshtein.distance("acdefg", "abcdef") == 2
assert Levenshtein.distance("abcdef", "acdefg") == 2
assert Levenshtein.normalized_distance("abcdef", "abc") == 0.5
assert Levenshtein.normalized_distance("abcdef", "") == 1
assert Levenshtein.normalized_distance("a", "b") == 1
assert Levenshtein.normalized_distance("a", "") == 1
assert Levenshtein.normalized_distance("a", "a") == 0
assert Levenshtein.normalized_distance("abcd", "c") == 0.75
assert Levenshtein.normalized_distance("abcd", "bd") == 0.5
assert Levenshtein.normalized_distance("abcd", "db") == 0.75
print("Success")
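# For context, a minimal reference implementation consistent with the
# asserts above: the classic Wagner-Fischer dynamic program with unit costs,
# normalized by the longer input's length. This is a sketch, not necessarily
# how the levenshtein module itself is implemented.

def _distance(a, b):
    """Unit-cost edit distance between two sequences (two-row DP)."""
    prev = list(range(len(b) + 1))  # distances from a[:0] to each prefix of b
    for i, x in enumerate(a, start=1):
        curr = [i]  # distance from a[:i] to b[:0]
        for j, y in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,              # delete x
                            curr[j - 1] + 1,          # insert y
                            prev[j - 1] + (x != y)))  # substitute or match
        prev = curr
    return prev[-1]


def _normalized_distance(a, b):
    """Edit distance scaled into [0, 1] by the longer sequence length."""
    longest = max(len(a), len(b))
    return _distance(a, b) / longest if longest else 0.0


assert _distance("abcdef", "abefcd") == 4
assert _normalized_distance("abcd", "bd") == 0.5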
from levenshtein import Levenshtein

# A more thorough general test is found in levenshteintests.py.
# This test only shows that word-based distance works the same way as
# character-based distance.
assert Levenshtein.distance(["hi", "there"], ["hi"]) == 1
assert Levenshtein.distance(["hi", "there"], ["there", "hi"]) == 2
assert Levenshtein.distance(["hi", "there"], []) == 2
assert Levenshtein.distance(["aaa", "bbbb", "cccc"],
                            ["aaa", "fff", "cccc"]) == 1
assert Levenshtein.distance([], []) == 0
print("Success")
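# This works because the dynamic program only compares elements for
# equality, so the same recurrence handles strings (sequences of characters)
# and lists of words. The _distance sketch above behaves the same way:
#
#   _distance(["hi", "there"], ["there", "hi"])  # -> 2, matching the assert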
#!/usr/bin/python3
import argparse

from levenshtein import Levenshtein

parser = argparse.ArgumentParser(
    description="Find the Levenshtein distance between two strings.")
parser.add_argument("string1", help="First string.")
parser.add_argument("string2", help="Second string.")
# default=' ' makes the documented default apply when -d is omitted.
parser.add_argument("-d", "--delimiter",
                    help="Word delimiter. Default value: space",
                    nargs='?', const=' ', default=' ')
parser.add_argument("-D", "--distance", help="Print edit distance.",
                    action="store_true")
parser.add_argument("-A", "--alignment", help="Print alignment.",
                    action="store_true")
parser.add_argument("-E", "--error", help="Print WER.", action="store_true")
args = parser.parse_args()

Levenshtein(args.string1, args.string2, args.delimiter, args.distance,
            args.alignment, args.error)
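# Hypothetical invocations, assuming this script is saved as lev_cli.py
# (the real file name is not shown in the source):
#
#   python3 lev_cli.py "foo bar" "foo baz" -D   # minimum edit distance
#   python3 lev_cli.py "foo bar" "foo baz" -E   # word error rate
#   python3 lev_cli.py "foo bar" "foo baz" -A   # print the alignment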
            continue
        if len(word) < 5:
            continue
        yield word


def anonymize(words, token='<NAME>'):
    return [token if w in wilhelm or w in jakob else w for w in words]


def anonymize_letter(letter, token='<NAME>'):
    letter.words = anonymize(letter.words, token=token)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    args = parser.parse_args()

    letters = load_letters(bpath=args.path)
    words = set(headings(letters))
    dists = Levenshtein(*words)

    print("Wilhelm:\n")
    for w, _ in sorted(dists.dists_to('Wilhelm'), key=lambda x: x[1]):
        print("\t%s" % w)
    print()
    print("Jakob:\n")
    for w, _ in sorted(dists.dists_to('Jakob'), key=lambda x: x[1]):
        print("\t%s" % w)