def test_charToWordAlign_extra_hyp_syl_no_overlap(self):
    """Extra hypothesis syllables with no phone overlap must roll up to
    word-level insertions plus a substitution (I I S)."""
    ref = """_ _ asked""".split()
    hyp = """gave we ask""".split()
    align = """I I S""".split()
    refwords = ' '.join([r for r in ref if r != '_'])
    hypwords = ' '.join([h for h in hyp if h != "_"])
    ref_phones = """| # _ _ _ _ _ _ _ _ _ ae s k t |""".split()
    hyp_phones = """| # g ey v | # w iy | # ae s k _ |""".split()
    align_phones = """C C I I I I I I I I I C C C D C""".split()
    ref_phones = [r.replace('_', '') for r in ref_phones]
    hyp_phones = [r.replace('_', '') for r in hyp_phones]
    # BUG FIX: reserve_list/exclusive_sets were passed positionally, where
    # they could bind to Levenshtein.align's lowercase/weights parameters
    # (every other call site in this file passes them by keyword).
    lev = Levenshtein.align(ref_phones, hyp_phones,
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets)
    lev.editops()
    phone_align = lev.expandAlign()
    # self.assertEqual(phone_align.align, align_phones)
    word_align, phone_align = PowerAligner.phoneAlignToWordAlign(
        refwords.split(), hypwords.split(), ref_phones, hyp_phones)
    self.assertEqual(word_align.align, align)
    self.assertEqual(word_align.s1, [x if x != "_" else "" for x in ref])
    self.assertEqual(word_align.s2, [x if x != "_" else "" for x in hyp])
def test_charToWordAlign_extra_hyp_syl_overlap(self):
    """Extra hypothesis syllable overlapping the ref word must roll up to
    one insertion plus one substitution (I S)."""
    ref = """_ butchering""".split()
    hyp = """the maturing""".split()
    align = """I S""".split()
    refwords = ' '.join([r for r in ref if r != '_'])
    hypwords = ' '.join([h for h in hyp if h != "_"])
    ref_phones = """ | # b uh ch # er # ih ng |""".split()
    hyp_phones = """| # dh ax | # m ax ch # uh r # ih ng |""".split()
    align_phones = """I I I I C C S S C C I S C C C C""".split()
    print(ref_phones)
    print(hyp_phones)
    # BUG FIX: reserve_list/exclusive_sets were passed positionally, where
    # they could bind to Levenshtein.align's lowercase/weights parameters
    # (every other call site in this file passes them by keyword).
    lev = Levenshtein.align(ref_phones, hyp_phones,
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets)
    lev.editops()
    phone_align = lev.expandAlign()
    word_align, phone_align = PowerAligner.phoneAlignToWordAlign(
        refwords.split(), hypwords.split(), ref_phones, hyp_phones)
    print("POWER")
    print(word_align)
    self.assertEqual(word_align.align, align)
    self.assertEqual(word_align.s1, [x if x != "_" else "" for x in ref])
    self.assertEqual(word_align.s2, [x if x != "_" else "" for x in hyp])
def charAlign(self):
    """Align ref and hyp at the character level.

    A trailing space is appended to both sequences and space is treated as
    a reserved token, so word boundaries survive the alignment.  The result
    is cached on self.char_align and returned.
    """
    ref_chars = list(self.ref) + [' ']
    hyp_chars = list(self.hyp) + [' ']
    aligner = Levenshtein.align(ref_chars, hyp_chars,
                                lowercase=self.lowercase,
                                reserve_list={' '})
    aligner.editops()
    self.char_align = aligner.expandAlign()
    return self.char_align
def test_fbk_u58(self):
    """Compact alignment must emit the trailing C before the insertion run
    (`align` documents the historical wrong output, `correct` the expected one)."""
    ref = """| # p iy |""".split()
    hyp = """| # hh iy | # s eh d |""".split()
    align = """C C S C I I I I I C""".split()
    correct = """C C S C C I I I I I""".split()
    aligner = Levenshtein.align(ref, hyp,
                                lowercase=True,
                                weights=Levenshtein.wordAlignWeights,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets)
    result = aligner.expandAlignCompact()
    print(result)
    self.assertEqual(result.align, correct)
def test_fbk_u19(self):
    """A long reference against a tiny hypothesis: expect a long deletion run."""
    ref = """| # ow k # ey | # ay m | # g ow # ih ng | # t ax | # d uw | # ax n # ah dh # er | # k ah t |""".split()
    hyp = """| # k ae n | """.split()
    align = """C C D C D D D D D D D D D D D D D D D D D D D D D D D S C D D D D D C D D D D D""".split()
    aligner = Levenshtein.align(ref, hyp,
                                lowercase=True,
                                weights=Levenshtein.wordAlignWeights,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets)
    result = aligner.expandAlignCompact()
    print(result)
    print(' '.join(result.align))
    print(' '.join(align))
    self.assertEqual(result.align, align)
def test_lev_phones1(self):
    """Phone alignment where the hypothesis starts with four insertions."""
    ref = ["", "", "", "", "|", "#", "b", "uh", "ch", "#", "", "er", "#", "ih", "ng", "|"]
    hyp = ["|", "#", "dh", "ax", "|", "#", "m", "ax", "ch", "#", "uh", "r", "#", "ih", "ng", "|"]
    align = ['I', 'I', 'I', 'I', 'C', 'C', 'S', 'S', 'C', 'C', 'I', 'S', 'C', 'C', 'C', 'C']
    # Drop the empty padding tokens before aligning.
    refwords = [tok for tok in ref if tok]
    hypwords = [tok for tok in hyp if tok]
    aligner = Levenshtein.align(refwords, hypwords,
                                lowercase=True,
                                weights=Levenshtein.wordAlignWeights,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets)
    aligner.editops()
    result = aligner.expandAlign()
    print(result)
    self.assertEqual(result.align, align)
def test_fbk_u264_b_1(self):
    """Compact alignment regression check (here `align` and `correct` agree)."""
    # What the wrong alignment used to be.
    ref = """| # k l ih r | # t ax |""".split()
    hyp = """| # f ih t |""".split()
    align = """C C D S C D D D C D C""".split()
    correct = """C C D S C D D D C D C""".split()
    # Drop any empty tokens before aligning.
    refwords = [tok for tok in ref if tok]
    hypwords = [tok for tok in hyp if tok]
    aligner = Levenshtein.align(refwords, hypwords,
                                lowercase=True,
                                weights=Levenshtein.wordAlignWeights,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets)
    result = aligner.expandAlignCompact()
    print(result)
    self.assertEqual(result.align, correct)
def __init__(self, ref, hyp, lowercase=False, verbose=False,
             pronounce_type=PronouncerType.Lexicon, lexicon=None,
             word_align_weights=Levenshtein.wordAlignWeights):
    """Tokenize both transcripts and compute the initial word-level (WER)
    alignment.  POWER-related fields are initialized to None and filled in
    by later processing steps."""
    if not ref:
        raise Exception("No reference file.\nref: {0}\nhyp: {1}".format(
            ref, hyp))

    # Select the pronouncer backend.
    if pronounce_type == PronouncerType.Lexicon:
        self.pronouncer = PronouncerLex(lexicon)
    else:
        self.pronouncer = PronouncerBase()

    # Whitespace-tokenize, discarding empty tokens.
    self.ref = [token for token in ref.strip().split() if token]
    self.hyp = [token for token in hyp.strip().split() if token]
    self.refwords = ' '.join(self.ref)
    self.hypwords = ' '.join(self.hyp)
    self.lowercase = lowercase
    self.verbose = verbose

    # Perform word alignment and derive the WER components from it.
    word_lev = Levenshtein.align(self.ref, self.hyp,
                                 lowercase=self.lowercase,
                                 weights=word_align_weights)
    word_lev.editops()
    self.wer_alignment = word_lev.expandAlign()
    self.wer, self.wer_components = self.wer_alignment.error_rate()

    # Used for POWER alignment (computed later).
    self.power_alignment = None
    self.power = None
    self.power_components = None

    # Used to find potential error regions (computed later).
    self.split_regions = None
    self.error_indexes = None
    self.phonetic_alignments = None
    self.phonetic_lev = None
def write(self, segid, score_components, expanded_alignment, phonetic_alignments=None):
    """Serialize one segment's error counts and word alignment as a JSON line."""
    record = dict()
    record['id'] = segid
    # Copy the counts and rename 'L' to the more descriptive 'refLength'.
    record['errorTypes'] = score_components.copy()
    record['errorTypes']['refLength'] = record['errorTypes'].pop('L')
    record['errRate'] = Levenshtein.errorRate(score_components['S'],
                                              score_components['D'],
                                              score_components['I'],
                                              score_components['L'])
    # One entry per aligned token triple (label, ref token, hyp token).
    record['alignments'] = [
        {'align': label, 'ref': ref_tok, 'hyp': hyp_tok}
        for label, ref_tok, hyp_tok in zip(expanded_alignment.align,
                                           expanded_alignment.s1,
                                           expanded_alignment.s2)
    ]
    self.out_file.write("%s\n" % json.dumps(record))
def test_fbk_u264_a(self):
    """Phone-level alignment for a substitution-heavy utterance; the expanded
    alignment must match `correct_phones`."""
    ref = """So the other elephant""".split()
    hyp = """set in _ an""".split()
    align = """S S D S""".split()
    ref_phones = """| # s ow | # dh ax | # ah dh # er | # eh l # ax f # ax n t |""".split()
    hyp_phones = """| # s eh t | # ih n | # ae n d |""".split()
    align_phones = """C C C S D D S D C C S S D D C D D D D D D C S C S C""".split()
    correct_phones = """C C C S D D S D C C S S D D C D D D D D D C S C S C""".split()
    refwords = ' '.join([r for r in ref if r != '_'])
    hypwords = ' '.join([h for h in hyp if h != "_"])
    # BUG FIX: reserve_list/exclusive_sets were passed positionally, where
    # they could bind to Levenshtein.align's lowercase/weights parameters
    # (every other call site in this file passes them by keyword).
    lev = Levenshtein.align(ref_phones, hyp_phones,
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets,
                            weights=Levenshtein.wordAlignWeights)
    #expand_align = lev.expandAlignCompact()
    lev.editops()
    expand_align = lev.expandAlign()
    print(expand_align)
    self.assertEqual(expand_align.align, correct_phones)
def phoneAlignToWordAlign(cls, ref_words, hyp_words, ref_phones, hyp_phones, break_on_syllables=True):
    """Lift a phoneme-level alignment up to a word-level alignment.

    Aligns ref_phones against hyp_phones with Levenshtein, then walks the
    phone alignment, grouping phonemes back into the surface words and
    emitting one C/S/D/I label per word pair.  Spans whose alignment looks
    suspect (word-count mismatch on a substitution) are pushed back onto a
    worklist and re-aligned.

    Args:
        ref_words: reference surface words (list of str).
        hyp_words: hypothesis surface words (list of str).
        ref_phones: reference phone tokens, annotated with word ('|') and
            syllable ('#') boundary markers (per TokType.checkAnnotation).
        hyp_phones: hypothesis phone tokens, same annotation scheme.
        break_on_syllables: when True, words attributed to an extra
            syllable are split off as separate deletions/insertions.

    Returns:
        A pair (word_alignment, phone_alignment): an ExpandedAlignment over
        words, and the concatenation of the per-span phone alignments.
    """
    ref_word_span = (0, len(ref_words))
    hyp_word_span = (0, len(hyp_words))
    # Perform Levenshtein Alignment on the phone sequences.
    lev = Levenshtein.align(ref=ref_phones, hyp=hyp_phones,
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets,
                            weights=Levenshtein.wordAlignWeights)
    phone_align = lev.expandAlignCompact()

    # Worklist of (ref word span, hyp word span, phone alignment) still to process.
    worklist = list()
    worklist.append((ref_word_span, hyp_word_span, phone_align))

    # Accumulators for the final word- and phone-level alignments.
    full_reference = list()
    full_hypothesis = list()
    full_alignment = list()
    full_phone_align = list()

    while worklist:
        # Take the next set of sequence boundaries off the worklist
        ref_word_span, hyp_word_span, phone_align = worklist.pop()
        ref_word_index, ref_word_limit = ref_word_span
        hyp_word_index, hyp_word_limit = hyp_word_span

        # TODO: Currently only checking in the forward direction
        ref_word_builder = []  # Temp storage of words in alignment span
        hyp_word_builder = []
        ref_word_iter = enumerate(
            ref_words[ref_word_span[0]:ref_word_span[1]])  # Iterates through the surface words
        hyp_word_iter = enumerate(
            hyp_words[hyp_word_span[0]:hyp_word_span[1]])
        ref_aligned = []  # Finalized aligned words
        hyp_aligned = []
        alignment = []  # Finalized alignment labels
        ref_extra_syllable_word_index = None  # Marks words mapping to extra syllables in the alignment.
        hyp_extra_syllable_word_index = None
        ref_syllable_count = 0
        hyp_syllable_count = 0
        ref_word_started = False  # Whether a word is already accounted for when a phoneme is reached.
        hyp_word_started = False
        advance_worklist = False  # NOTE(review): never read below; appears vestigial.
        commit_alignment = False

        for i in range(len(phone_align.align)):
            ref_type = TokType.checkAnnotation(phone_align.s1[i])
            hyp_type = TokType.checkAnnotation(phone_align.s2[i])

            # Check if word boundaries are reached, both on ref and hyp --
            # or the case where no more symbols can be read.
            if (i == len(phone_align.align) - 1) or (ref_type == TokType.WordBoundary and ref_type == hyp_type):
                align_tok = None
                # Only write outputs if either the ref or the hyp has scanned some words.
                if ref_word_builder:
                    if hyp_word_builder:
                        align_tok = AlignLabels.substitution if ref_word_builder != hyp_word_builder else AlignLabels.correct
                    else:
                        align_tok = AlignLabels.deletion
                elif hyp_word_builder:
                    align_tok = AlignLabels.insertion

                if align_tok:
                    # Add the remainder (phones from i onward) to the worklist.
                    ref_word_span_next = (ref_word_index + len(ref_word_builder), ref_word_limit)
                    hyp_word_span_next = (hyp_word_index + len(hyp_word_builder), hyp_word_limit)
                    phone_align_next = phone_align.subsequence(i, phone_align.length(), preserve_index=False)
                    worklist.append((ref_word_span_next, hyp_word_span_next, phone_align_next))

                    # "Commit" the current alignment
                    if align_tok in (AlignLabels.correct, AlignLabels.substitution):
                        alignment.append(align_tok)
                        # Check for syllable conflicts on the ref side.
                        if not break_on_syllables or not ref_extra_syllable_word_index:
                            ref_aligned.append(' '.join(ref_word_builder))
                            ref_syllable_count = 0
                            hyp_syllable_count = 0
                        else:
                            ref_aligned.append(' '.join(ref_word_builder[0:ref_extra_syllable_word_index]))
                            # The remaining words are deletions
                            for word in ref_word_builder[ref_extra_syllable_word_index:]:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0
                        # Check for syllable conflicts on the hyp side.
                        if not break_on_syllables or not hyp_extra_syllable_word_index:
                            hyp_aligned.append(' '.join(hyp_word_builder))
                            ref_syllable_count = 0
                            hyp_syllable_count = 0
                        else:
                            hyp_aligned.append(' '.join(hyp_word_builder[0:hyp_extra_syllable_word_index]))
                            # The remaining words are insertions
                            for word in hyp_word_builder[hyp_extra_syllable_word_index:]:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                        if align_tok == AlignLabels.substitution:
                            # Check if you need to rework this alignment.
                            if len(ref_word_builder) != len(hyp_word_builder):
                                # Word count mismatch in the alignment span.
                                # Possibly this segment needs re-alignment.
                                ref_word_span_curr = (ref_word_index, ref_word_index + len(ref_word_builder))
                                hyp_word_span_curr = (hyp_word_index, hyp_word_index + len(hyp_word_builder))
                                phone_align_curr = phone_align.subsequence(0, i + 1, preserve_index=False)
                                lev = Levenshtein.align(ref=phone_align_curr.s1_tokens(),
                                                        hyp=phone_align_curr.s2_tokens(),
                                                        reserve_list=PowerAligner.reserve_list,
                                                        exclusive_sets=PowerAligner.exclusive_sets,
                                                        weights=Levenshtein.wordAlignWeights)
                                phone_align_adjusted = lev.expandAlignCompact()
                                if phone_align_curr.align != phone_align_adjusted.align:
                                    # Looks like we need to redo the phone-to-word alignment.
                                    worklist.append((ref_word_span_curr, hyp_word_span_curr, phone_align_adjusted))
                                else:
                                    commit_alignment = True
                            else:
                                commit_alignment = True
                    elif align_tok == AlignLabels.deletion:
                        for word in ref_word_builder:
                            alignment.append(align_tok)
                            ref_aligned.append(word)
                            hyp_aligned.append('')
                        commit_alignment = True
                        ref_syllable_count = 0
                    elif align_tok == AlignLabels.insertion:
                        for word in hyp_word_builder:
                            alignment.append(align_tok)
                            ref_aligned.append('')
                            hyp_aligned.append(word)
                        commit_alignment = True
                        hyp_syllable_count = 0

                    if commit_alignment:
                        # Commit the alignment (phones strictly before position i).
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(phone_align.subsequence(0, i, preserve_index=False))
                        ref_aligned = []
                        hyp_aligned = []
                        alignment = []
                    break
            # Add words if word boundaries are reached.
            else:
                if ref_type == TokType.WordBoundary:
                    ref_word_started = False
                    if hyp_type != TokType.WordBoundary and ref_word_builder and not hyp_word_builder:
                        # DELETION
                        # Ref word ended, but no hyp words have been added.
                        # Mark the current ref word(s) in the span as deletion errors.
                        # TODO: Dedupe this logic
                        for word in ref_word_builder:
                            alignment.append(AlignLabels.deletion)
                            ref_aligned.append(word)
                            hyp_aligned.append('')
                        ref_syllable_count = 0
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(phone_align.subsequence(0, i, preserve_index=False))
                        # Add the remainder to the worklist, re-aligning the
                        # ref tail against the whole hyp phone sequence.
                        ref_word_span_next = (ref_word_index + len(ref_word_builder), ref_word_limit)
                        hyp_word_span_next = (hyp_word_index + len(hyp_word_builder), hyp_word_limit)
                        lev = Levenshtein.align(ref=[x for x in phone_align.s1[i:] if x],
                                                hyp=[x for x in phone_align.s2 if x],
                                                reserve_list=PowerAligner.reserve_list,
                                                exclusive_sets=PowerAligner.exclusive_sets,
                                                weights=Levenshtein.wordAlignWeights)
                        phone_align_next = lev.expandAlignCompact()
                        worklist.append((ref_word_span_next, hyp_word_span_next, phone_align_next))
                        break
                elif ref_type == TokType.Phoneme and not ref_word_started:
                    # First phoneme of a new ref word: consume the next surface word.
                    ref_word_started = True
                    try:
                        ref_word_item = ref_word_iter.__next__()
                        ref_word_builder.append(ref_word_item[1])
                    except StopIteration:
                        pass
                if hyp_type == TokType.WordBoundary:
                    hyp_word_started = False
                    if ref_type != TokType.WordBoundary and hyp_word_builder and not ref_word_builder:
                        # INSERTION
                        # Hyp word ended, but no ref words have been added.
                        # Mark the current hyp word(s) in the span as insertion errors.
                        # TODO: Dedupe this logic
                        for word in hyp_word_builder:
                            alignment.append(AlignLabels.insertion)
                            ref_aligned.append('')
                            hyp_aligned.append(word)
                        hyp_syllable_count = 0
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(phone_align.subsequence(0, i, preserve_index=False))
                        # Add the remainder to the worklist, re-aligning the
                        # whole ref phone sequence against the hyp tail.
                        ref_word_span_next = (ref_word_index + len(ref_word_builder), ref_word_limit)
                        hyp_word_span_next = (hyp_word_index + len(hyp_word_builder), hyp_word_limit)
                        lev = Levenshtein.align(ref=[x for x in phone_align.s1 if x],
                                                hyp=[x for x in phone_align.s2[i:] if x],
                                                reserve_list=PowerAligner.reserve_list,
                                                exclusive_sets=PowerAligner.exclusive_sets,
                                                weights=Levenshtein.wordAlignWeights)
                        phone_align_next = lev.expandAlignCompact()
                        worklist.append((ref_word_span_next, hyp_word_span_next, phone_align_next))
                        break
                elif hyp_type == TokType.Phoneme and not hyp_word_started:
                    # First phoneme of a new hyp word: consume the next surface word.
                    hyp_word_started = True
                    try:
                        hyp_word_item = hyp_word_iter.__next__()
                        hyp_word_builder.append(hyp_word_item[1])
                    except StopIteration:
                        pass

                # Check for syllable mismatches
                if ref_type == TokType.SyllableBoundary:
                    ref_syllable_count += 1
                if hyp_type == TokType.SyllableBoundary:
                    hyp_syllable_count += 1

                if (ref_type == TokType.SyllableBoundary == hyp_type or ref_syllable_count == hyp_syllable_count):
                    # No syllable conflicts here!
                    ref_extra_syllable_word_index = None
                    hyp_extra_syllable_word_index = None
                elif (ref_type == TokType.SyllableBoundary and not ref_extra_syllable_word_index and TokType.checkAnnotation(phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # Extra syllable in hypothesis.  We only care if the syllable
                    # immediately follows a word boundary, since that indicates a
                    # new word is being formed (likely an insertion in hyp).
                    ref_extra_syllable_word_index = len(ref_word_builder) - 1
                elif (hyp_type == TokType.SyllableBoundary and not hyp_extra_syllable_word_index and TokType.checkAnnotation(phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # This time there's an extra syllable in the ref, corresponding to a new ref word.
                    # NOTE(review): this branch also inspects s2[i - 1]; by symmetry
                    # with the branch above it may have been meant to check s1[i - 1]
                    # -- confirm before changing.
                    hyp_extra_syllable_word_index = len(hyp_word_builder) - 1

    # Concatenate all phoneme alignments
    fp_align = full_phone_align[0]
    for expand_align in full_phone_align[1:]:
        fp_align.append_alignment(expand_align)
    return ExpandedAlignment(full_reference, full_hypothesis, full_alignment), fp_align
def write(self, segid, score_components, expanded_alignment, phonetic_alignments=None):
    """Write one segment's alignment report in an sclite-style text format.

    Args:
        segid: segment identifier; formatted with '%d' below.
        score_components: dict with counts for 'C', 'S', 'D', 'I' and the
            reference length under 'L'.
        expanded_alignment: word-level alignment; printed via its __str__.
        phonetic_alignments: optional iterable of phone-level alignments;
            falsy entries are skipped.
    """
    # NOTE(review): '%d' requires an integer segid -- confirm callers
    # (the JSON writer in this file passes segid through untyped).
    self.out_file.write('id: (%d)\n' % segid)
    labels = ['C', 'S', 'D', 'I']
    counts = [score_components[label] for label in labels]
    # Print score components
    self.out_file.write('Scores (%s) %s\n' % (' '.join(
        ['#%s' % label for label in labels]), ' '.join(
            [str(x) for x in counts])))
    # Print word alignment
    self.out_file.write('%s\n' % expanded_alignment)
    self.out_file.write('\n')
    # Print phonetic alignments
    if phonetic_alignments:
        for palign in [p for p in phonetic_alignments if p]:
            self.out_file.write("%s\n" % palign)
        self.out_file.write('\n')
    # Print statistics
    # NOTE(review): the divisions below raise ZeroDivisionError when
    # score_components['L'] == 0 -- confirm empty references cannot reach here.
    self.out_file.write(
        'Correct = {0:4.1%} {1} ({2})\n'.format(
            score_components['C'] / score_components['L'],
            score_components['C'], score_components['L']))
    self.out_file.write(
        'Substitutions = {0:4.1%} {1} ({2})\n'.format(
            score_components['S'] / score_components['L'],
            score_components['S'], score_components['L']))
    self.out_file.write(
        'Deletions = {0:4.1%} {1} ({2})\n'.format(
            score_components['D'] / score_components['L'],
            score_components['D'], score_components['L']))
    self.out_file.write(
        'Insertions = {0:4.1%} {1} ({2})\n'.format(
            score_components['I'] / score_components['L'],
            score_components['I'], score_components['L']))
    self.out_file.write('\n')
    self.out_file.write(
        'Errors = {0:4.1%} {1} ({2})\n'.format(
            Levenshtein.errorRate(score_components['S'],
                                  score_components['D'],
                                  score_components['I'],
                                  score_components['L']),
            score_components['S'] + score_components['D'] +
            score_components['I'], score_components['L']))
    self.out_file.write('\n')
    self.out_file.write(
        'Ref. words = {0} ({1})\n'.format(
            score_components['L'], score_components['L']))
    self.out_file.write(
        'Hyp. words = {0} ({1})\n'.format(
            len(expanded_alignment.s2_string().split()),
            score_components['L']))
    self.out_file.write(
        'Aligned words = {0} ({1})\n'.format(
            score_components['C'] + score_components['S'],
            score_components['L']))
    self.out_file.write('\n')
    self.out_file.write(
        '-------------------------------------------------------------------------------\n'
    )
    self.out_file.write('\n')