def test_punct_as_separate_words_2deletes(self):
    """Punctuation belonging to two deleted ref words ("the lawsuit --")
    should attach to the surviving neighboring hyp tokens.

    Fix: removed a stray debug ``print(actual)`` that polluted test output.
    """
    sample_ref = [
        "it's", "cheaper", "to", "settle", "than", "to", "fight", "the",
        "lawsuit", "clearly"
    ]
    sample_ref_map = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    sample_align = ["C", "C", "C", "S", "C", "C", "C", "D", "D", "C"]
    sample_hyp = [
        "it's", "cheaper", "to", "settled", "than", "to", "fight", "", "",
        "clearly"
    ]
    # Two empty hyp slots (the deletions) have no entries in the hyp map.
    sample_hyp_map = [0, 1, 2, 3, 4, 5, 6, 9]
    sample_hyp_punct = [
        "it's", "cheaper", "to", "settled", "than", "to", "fight --", "", "",
        "clearly,"
    ]
    error_alignment = ExpandedAlignment(sample_ref, sample_hyp, sample_align,
                                        sample_ref_map, sample_hyp_map,
                                        lowercase=True)
    ref_punct = "it's cheaper to settle than to fight the lawsuit -- clearly,"
    expected = ExpandedAlignment(sample_ref, sample_hyp_punct, sample_align,
                                 sample_ref_map, sample_hyp_map,
                                 lowercase=True).s2
    actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
    self.maxDiff = None
    self.assertEqual(actual, expected)
def test_punct_period_at_end_wer_2(self):
    """Sentence-final period should land on the last hyp word even when the
    alignment ends with an insertion against an empty ref slot.

    Fix: removed a stray debug ``print(actual)`` that polluted test output.
    """
    ref_punct = "So we call this the slicer mode."
    sample_ref = ["So", "we", "call", "this", "the", "slicer", "mode", ""]
    sample_ref_map = [0, 1, 2, 3, 4, 5, 6]
    sample_align = ["C", "C", "S", "C", "S", "S", "S", "I"]
    sample_hyp = ["so", "we", "hold", "this", "new", "slice", "them", "on"]
    sample_hyp_map = [0, 1, 2, 3, 4, 5, 6, 7]
    sample_hyp_punct = [
        "so", "we", "hold", "this", "new", "slice", "them", "on."
    ]
    error_alignment = ExpandedAlignment(sample_ref, sample_hyp, sample_align,
                                        sample_ref_map, sample_hyp_map,
                                        lowercase=True)
    expected = ExpandedAlignment(sample_ref, sample_hyp_punct, sample_align,
                                 sample_ref_map, sample_hyp_map,
                                 lowercase=True).s2
    actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
    self.maxDiff = None
    self.assertEqual(actual, expected)
def test_punct_as_separate_words(self):
    """Free-standing reference punctuation ("--") plus ordinary trailing
    punctuation should attach to the corresponding hypothesis tokens.

    Fixes: removed a stray debug ``print(actual)``; replaced the hand-written
    identity maps with ``list(range(31))`` and the 31-item alignment list
    with an equivalent computed expression.
    """
    sample_ref = [
        "The", "reason", "they", "settled", "out", "is", "because", "it's",
        "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
        "clearly", "two", "million", "dollars", "cheaper", "in", "some",
        "cases", "and", "much", "worse", "if", "you", "actually", "lose"
    ]
    # Word-for-word alignment, so both maps are the identity.
    sample_ref_map = list(range(31))
    # S at index 3 (settled/settle) and indices 27-28 (if you / a few).
    sample_align = ["C"] * 3 + ["S"] + ["C"] * 23 + ["S", "S"] + ["C"] * 2
    sample_hyp = [
        "the", "reason", "they", "settle", "out", "is", "because", "it's",
        "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
        "clearly", "two", "million", "dollars", "cheaper", "in", "some",
        "cases", "and", "much", "worse", "a", "few", "actually", "lose"
    ]
    sample_hyp_map = list(range(31))
    sample_hyp_punct = [
        "the", "reason", "they", "settle", "out", "is", "because", "it's",
        "cheaper", "to", "settle", "than", "to", "fight", "the",
        "lawsuit --", "clearly,", "two", "million", "dollars", "cheaper",
        "in", "some", "cases,", "and", "much", "worse", "a", "few",
        "actually", "lose."
    ]
    error_alignment = ExpandedAlignment(sample_ref, sample_hyp, sample_align,
                                        sample_ref_map, sample_hyp_map,
                                        lowercase=True)
    ref_punct = ("The reason they settled out is because it's cheaper to "
                 "settle than to fight the lawsuit -- clearly, two million "
                 "dollars cheaper in some cases, and much worse if you "
                 "actually lose.")
    expected = ExpandedAlignment(sample_ref, sample_hyp_punct, sample_align,
                                 sample_ref_map, sample_hyp_map,
                                 lowercase=True).s2
    actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
    self.maxDiff = None
    self.assertEqual(actual, expected)
def test_normalizeAlignment_textnum_hyphen(self):
    """A spelled-out hyphenated number ("fifty year old") normalizes back to
    the reference form ("50-year-old"); other hyp words are kept as-is."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        (u'A', u'', u'D', u''),
        (u'50-year-old', u'fifty year old', u'S', u'50-year-old'),
        (u'business', u'business', u'C', u'business'),
        (u'man', u'man', u'C', u'man'),
        (u'lamented', u'laments', u'S', u'laments'),
        (u'to', u'to', u'C', u'to'),
        (u'me', u'me', u'C', u'me'),
        (u'that', u'that', u'C', u'that'),
        (u'he', u'he', u'C', u'he'),
        (u'feels', u'feels', u'C', u'feels'),
        (u'he', u'he', u'C', u'he'),
        (u"doesn't", u"doesn't", u'C', u"doesn't"),
        (u'have', u'have', u'C', u'have'),
        (u'colleagues', u'colleagues', u'C', u'colleagues'),
        (u'anymore', u'anymore', u'C', u'anymore'),
        (u'at', u'it', u'S', u'it'),
        (u'work', u'work', u'C', u'work'),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    # Note: the deleted first slot is empty, so the join keeps a leading space.
    expected = ' '.join(row[3] for row in rows)
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(actual, expected)
def test_punct_period_at_end_power(self):
    """Sentence-final period attaches to the last hyp word of a POWER-style
    alignment containing split ("fifty year old") and merged words.

    Fix: removed a stray debug ``print(actual)`` that polluted test output;
    replaced the hand-written identity ref map with ``list(range(17))``.
    """
    ref_aligned = [
        u'A', u'50-year-old', u'business', u'man', u'lamented', u'to',
        u'me', u'that', u'he', u'feels', u'he', u"doesn't", u'have',
        u'colleagues', u'anymore', u'at', u'work'
    ]
    ref_map = list(range(17))
    align = [
        u'D', u'S', u'C', u'C', u'S', u'C', u'C', u'C', u'C', u'C', u'C',
        u'S', u'C', u'C', u'S', u'S', u'S'
    ]
    hyp_aligned = [
        u'', u'fifty year old', u'business', u'man', u'laments', u'to',
        u'me', u'that', u'he', u'feels', u'he', u'does not', u'have',
        u'colleagues', u'any more', u'it', u'work'
    ]
    # Multi-word hyp entries repeat their alignment slot index.
    hyp_map = [
        1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16
    ]
    expand_align = ExpandedAlignment(ref_aligned, hyp_aligned, align,
                                     ref_map, hyp_map, lowercase=True)
    expected = [
        u'', u'fifty year old', u'business', u'man', u'laments', u'to',
        u'me', u'that', u'he', u'feels', u'he', u'does not', u'have',
        u'colleagues', u'any more', u'it', u'work.'
    ]
    ref_punct = ("A 50-year-old business man lamented to me that he feels "
                 "he doesn't have colleagues anymore at work.")
    actual = PunctInsertOracle.insertPunct(expand_align, ref_punct).s2
    self.maxDiff = None
    self.assertEqual(actual, expected)
def test_normalizeAlignment_number(self):
    """A digit-string hypothesis ("2000000") normalizes back to the spelled
    reference form ("two million"); the other substitution ("cost") stays."""
    ref_aligned = ("You need to know that the average patent troll defense "
                   "costs").split() + ["two million"] + \
        "dollars and takes 18 months when you win".split()
    hyp_aligned = ("you need to know that the average patent troll defense "
                   "cost 2000000 dollars and takes 18 months when you "
                   "win").split()
    alignment = ["C"] * 10 + ["S", "S"] + ["C"] * 8
    # Normalization keeps the hyp word order but restores "two million".
    expected_tokens = hyp_aligned[:11] + ["two million"] + hyp_aligned[12:]
    expand_align = ExpandedAlignment(ref_aligned, hyp_aligned, alignment,
                                     lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(expand_align)
    self.assertEqual(actual, ' '.join(expected_tokens))
def test_normalize_alignment_year(self):
    """A spoken year ("twenty twelve") normalizes to its numeric ref form."""
    refs = ["Now", "fast-forward", "to", "2012"]
    hyps = ["now", "fast-forward", "to", "twenty twelve"]
    labels = ["C", "C", "C", "S"]
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    normalized = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(normalized, "now fast-forward to 2012")
def test_read_alignments_911(self):
    """Round-trips a full WER-alignment JSON record (the "9/11" utterance)
    through AlignmentReaderJson.read_json and verifies every field of the
    resulting ExpandedAlignment: tokens, labels, and both word maps.

    The utterance exercises insertions ("eleven", "the", "from"),
    deletions ("dead", "had", "a", "it") and multi-way substitutions.
    """
    # Raw aligner output; errRate/errorTypes fields are carried but the
    # reader is only expected to reconstruct the alignment itself.
    jstr = """{"errRate": 0.32653061224489793, "alignments": [{"hyp": "Everyone", "align": "C", "ref": "Everyone"}, {"hyp": "who", "align": "C", "ref": "who"}, {"hyp": "knew", "align": "C", "ref": "knew"}, {"hyp": "me", "align": "C", "ref": "me"}, {"hyp": "before", "align": "C", "ref": "before"}, {"hyp": "nine", "align": "S", "ref": "9/11"}, {"hyp": "eleven", "align": "I", "ref": ""}, {"hyp": "the", "align": "I", "ref": ""}, {"hyp": "believes", "align": "S", "ref": "believes"}, {"hyp": "line", "align": "S", "ref": "I'm"}, {"hyp": "", "align": "D", "ref": "dead"}, {"hyp": "i", "align": "C", "ref": "I"}, {"hyp": "used", "align": "C", "ref": "used"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "work", "align": "C", "ref": "work"}, {"hyp": "with", "align": "C", "ref": "with"}, {"hyp": "a", "align": "C", "ref": "a"}, {"hyp": "bunch", "align": "C", "ref": "bunch"}, {"hyp": "of", "align": "C", "ref": "of"}, {"hyp": "uptight", "align": "C", "ref": "uptight"}, {"hyp": "religious", "align": "C", "ref": "religious"}, {"hyp": "people", "align": "C", "ref": "people"}, {"hyp": "so", "align": "C", "ref": "so"}, {"hyp": "sometimes", "align": "C", "ref": "sometimes"}, {"hyp": "i", "align": "C", "ref": "I"}, {"hyp": "didn't", "align": "C", "ref": "didn't"}, {"hyp": "wear", "align": "C", "ref": "wear"}, {"hyp": "panties", "align": "C", "ref": "panties"}, {"hyp": "is", "align": "S", "ref": "and"}, {"hyp": "that", "align": "S", "ref": "just"}, {"hyp": "", "align": "D", "ref": "had"}, {"hyp": "a", "align": "C", "ref": "a"}, {"hyp": "big", "align": "C", "ref": "big"}, {"hyp": "smile", "align": "C", "ref": "smile"}, {"hyp": "and", "align": "C", "ref": "and"}, {"hyp": "chuckle", "align": "S", "ref": "chuckled"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "myself", "align": "C", "ref": "myself"}, {"hyp": "from", "align": "I", "ref": ""}, {"hyp": "this", "align": "C", "ref": "This"}, {"hyp": "next", "align": "C", "ref": "next"}, {"hyp": "one", "align": "C", "ref": "one"}, {"hyp": "takes", "align": "C", "ref": "takes"}, {"hyp": "", "align": "D", "ref": "a"}, {"hyp": "little", "align": "C", "ref": "little"}, {"hyp": "explanation", "align": "C", "ref": "explanation"}, {"hyp": "of", "align": "S", "ref": "before"}, {"hyp": "right", "align": "S", "ref": "I"}, {"hyp": "here", "align": "S", "ref": "share"}, {"hyp": "", "align": "D", "ref": "it"}, {"hyp": "with", "align": "C", "ref": "with"}, {"hyp": "you", "align": "C", "ref": "you"}], "id": 6, "errorTypes": {"C": 36, "D": 4, "I": 3, "S": 9, "refLength": 49}}"""
    # Expected reference tokens; empty strings mark insertion slots.
    exp_ref = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
        u'believes', u"I'm", u'dead', u'I', u'used', u'to', u'work',
        u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
        u'people', u'so', u'sometimes', u'I', u"didn't", u'wear',
        u'panties', u'and', u'just', u'had', u'a', u'big', u'smile',
        u'and', u'chuckled', u'to', u'myself', u'', u'This', u'next',
        u'one', u'takes', u'a', u'little', u'explanation', u'before',
        u'I', u'share', u'it', u'with', u'you'
    ]
    # Maps each non-empty ref word to its alignment slot; insertion slots
    # (6, 7, 38) are skipped.
    exp_ref_map = [
        0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
    ]
    exp_align = [
        u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D',
        u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C',
        u'C', u'C', u'C', u'C', u'C', u'C', u'S', u'S', u'D', u'C', u'C',
        u'C', u'C', u'S', u'C', u'C', u'I', u'C', u'C', u'C', u'C', u'D',
        u'C', u'C', u'S', u'S', u'S', u'D', u'C', u'C'
    ]
    # Expected hypothesis tokens; empty strings mark deletion slots.
    exp_hyp = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'nine', u'eleven',
        u'the', u'believes', u'line', u'', u'i', u'used', u'to', u'work',
        u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
        u'people', u'so', u'sometimes', u'i', u"didn't", u'wear',
        u'panties', u'is', u'that', u'', u'a', u'big', u'smile', u'and',
        u'chuckle', u'to', u'myself', u'from', u'this', u'next', u'one',
        u'takes', u'', u'little', u'explanation', u'of', u'right',
        u'here', u'', u'with', u'you'
    ]
    # Hyp map skips the deletion slots (10, 30, 43, 49).
    exp_hyp_map = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51
    ]
    expected = ExpandedAlignment(exp_ref, exp_hyp, exp_align, exp_ref_map,
                                 exp_hyp_map, lowercase=True)
    actual = AlignmentReaderJson.read_json(jstr)
    self.maxDiff = None
    # Lengths must agree before field-by-field comparison.
    self.assertEqual(len(actual.s1), len(actual.s2))
    self.assertEqual(actual.s1, expected.s1)
    self.assertEqual(actual.s2, expected.s2)
    self.assertEqual(actual.align, expected.align)
    self.assertEqual(actual.s1_map, expected.s1_map)
    self.assertEqual(actual.s2_map, expected.s2_map)
def test_normalize_alignment_any_more(self):
    """A split compound ("any more") collapses to the single ref word."""
    aligned = ExpandedAlignment(["anymore"], ["any more"], ["S"],
                                lowercase=True)
    normalized = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(normalized, "anymore")
def test_normalize_middle_age(self):
    """The contraction "they are" is restored to the ref form "They're"
    (keeping the reference casing) while other substitutions are kept
    exactly as the hypothesis produced them."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        ("They're", "they are", "S", "They're"),
        ("happier", "happier", "C", "happier"),
        ("than", "the", "S", "the"),
        ("middle-aged", "middle age", "S", "middle age"),
        ("people", "people", "C", "people"),
        ("and", "and", "C", "and"),
        ("younger", "younger", "C", "younger"),
        ("people", "people", "C", "people"),
        ("certainly", "certainly", "C", "certainly"),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    expected = ' '.join(row[3] for row in rows if row[3])
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(actual, expected)
def align(self):
    """Refine the word-level WER alignment with phoneme-level re-alignment.

    Splits ``self.wer_alignment`` into segments, re-aligns every segment
    that contains errors via ``PowerAligner.phoneAlignToWordAlign``, then
    merges all segments back into ``self.power_alignment`` and recomputes
    the error rate into ``self.power`` / ``self.power_components``.

    Side effects: sets ``self.split_regions``, ``self.error_indexes``,
    ``self.phonetic_alignments``, ``self.power_alignment``, ``self.power``,
    and ``self.power_components``.

    Raises:
        AssertionError: if the merged alignment no longer reproduces the
            original reference or hypothesis word sequence.
    """
    # Find the error regions that may need to be realigned
    self.split_regions, self.error_indexes = \
        self.wer_alignment.split_error_regions()
    # One phonetic-alignment slot per segment; only error segments get filled.
    self.phonetic_alignments = [None] * len(self.split_regions)
    for error_index in self.error_indexes:
        seg = self.split_regions[error_index]
        ref_words = seg.s1_tokens()
        hyp_words = seg.s2_tokens()
        ref_phones = self.pronouncer.pronounce(ref_words)
        hyp_phones = self.pronouncer.pronounce(hyp_words)
        power_seg_alignment, self.phonetic_alignments[error_index] = \
            PowerAligner.phoneAlignToWordAlign(ref_words, hyp_words,
                                               ref_phones, hyp_phones)
        # Replace the error region at the current index.
        self.split_regions[error_index] = power_seg_alignment
    # Merge the alignment segments back together.
    self.power_alignment = ExpandedAlignment(self.split_regions[0].s1,
                                             self.split_regions[0].s2,
                                             self.split_regions[0].align,
                                             self.split_regions[0].s1_map,
                                             self.split_regions[0].s2_map,
                                             lowercase=self.lowercase)
    for i in range(1, len(self.split_regions)):
        self.power_alignment.append_alignment(self.split_regions[i])
    # Get the alignment score
    self.power, self.power_components = self.power_alignment.error_rate()
    # Sanity checks: the merge must not alter the underlying word sequences.
    assert self.hypwords == self.power_alignment.s2_string(), \
        "hyp mismatch:\n{0}\n{1}".format(self.hypwords,
                                         self.power_alignment.s2_string())
    assert self.refwords == self.power_alignment.s1_string(), \
        "ref mismatch:\n{0}\n{1}".format(self.refwords,
                                         self.power_alignment.s1_string())
def test_normalize_94(self):
    """Spelled-out numbers ("eighteen", "ninety four") normalize back to
    the digit forms found in the reference."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        ("Originally", "originally", "C", "originally"),
        ("the", "the", "C", "the"),
        ("sample", "sample", "C", "sample"),
        ("was", "was", "C", "was"),
        ("aged", "aged", "C", "aged"),
        ("18", "eighteen", "S", "18"),
        ("to", "to", "C", "to"),
        ("94", "ninety four", "S", "94"),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    expected = ' '.join(row[3] for row in rows if row[3])
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(actual, expected)
def test_read_alignments_hyphen(self):
    """Round-trips a substitution-only WER-alignment JSON record through
    AlignmentReaderJson.read_json and verifies every field of the parsed
    ExpandedAlignment (tokens, labels, and both identity word maps)."""
    # Raw aligner output; only substitutions occur, so maps are identities.
    jstr = """{"errRate": 0.0967741935483871, "alignments": [{"hyp": "the", "align": "C", "ref": "The"}, {"hyp": "reason", "align": "C", "ref": "reason"}, {"hyp": "they", "align": "C", "ref": "they"}, {"hyp": "settle", "align": "S", "ref": "settled"}, {"hyp": "out", "align": "C", "ref": "out"}, {"hyp": "is", "align": "C", "ref": "is"}, {"hyp": "because", "align": "C", "ref": "because"}, {"hyp": "it's", "align": "C", "ref": "it's"}, {"hyp": "cheaper", "align": "C", "ref": "cheaper"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "settle", "align": "C", "ref": "settle"}, {"hyp": "than", "align": "C", "ref": "than"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "fight", "align": "C", "ref": "fight"}, {"hyp": "the", "align": "C", "ref": "the"}, {"hyp": "lawsuit", "align": "C", "ref": "lawsuit"}, {"hyp": "clearly", "align": "C", "ref": "clearly"}, {"hyp": "two", "align": "C", "ref": "two"}, {"hyp": "million", "align": "C", "ref": "million"}, {"hyp": "dollars", "align": "C", "ref": "dollars"}, {"hyp": "cheaper", "align": "C", "ref": "cheaper"}, {"hyp": "in", "align": "C", "ref": "in"}, {"hyp": "some", "align": "C", "ref": "some"}, {"hyp": "cases", "align": "C", "ref": "cases"}, {"hyp": "and", "align": "C", "ref": "and"}, {"hyp": "much", "align": "C", "ref": "much"}, {"hyp": "worse", "align": "C", "ref": "worse"}, {"hyp": "a", "align": "S", "ref": "if"}, {"hyp": "few", "align": "S", "ref": "you"}, {"hyp": "actually", "align": "C", "ref": "actually"}, {"hyp": "lose", "align": "C", "ref": "lose"}], "id": 9, "errorTypes": {"C": 28, "D": 0, "I": 0, "S": 3, "refLength": 31}}"""
    exp_ref = [
        "The", "reason", "they", "settled", "out", "is", "because", "it's",
        "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
        "clearly", "two", "million", "dollars", "cheaper", "in", "some",
        "cases", "and", "much", "worse", "if", "you", "actually", "lose"
    ]
    exp_ref_map = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    ]
    exp_align = [
        "C", "C", "C", "S", "C", "C", "C", "C", "C", "C", "C", "C", "C",
        "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
        "C", "S", "S", "C", "C"
    ]
    exp_hyp = [
        "the", "reason", "they", "settle", "out", "is", "because", "it's",
        "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
        "clearly", "two", "million", "dollars", "cheaper", "in", "some",
        "cases", "and", "much", "worse", "a", "few", "actually", "lose"
    ]
    exp_hyp_map = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    ]
    expected = ExpandedAlignment(exp_ref, exp_hyp, exp_align, exp_ref_map,
                                 exp_hyp_map, lowercase=True)
    actual = AlignmentReaderJson.read_json(jstr)
    self.maxDiff = None
    # Lengths must agree before field-by-field comparison.
    self.assertEqual(len(actual.s1), len(actual.s2))
    self.assertEqual(actual.s1, expected.s1)
    self.assertEqual(actual.s2, expected.s2)
    self.assertEqual(actual.align, expected.align)
    self.assertEqual(actual.s1_map, expected.s1_map)
    self.assertEqual(actual.s2_map, expected.s2_map)
def read_json(jstr):
    """Parse a JSON alignment record into an ExpandedAlignment.

    Each entry of ``alignments`` carries ``ref``, ``hyp`` and ``align``
    fields; multi-word ref/hyp entries contribute one map entry per
    surface word, all pointing back at the same alignment slot.

    Args:
        jstr: JSON string with an ``alignments`` list.

    Returns:
        ExpandedAlignment built from the record, or None when the JSON
        decodes to an empty/falsy object.

    Fixes: replaced ``for i in range(len(...))`` indexing with
    ``enumerate``, and the comprehensions with unused loop variables
    (``[i for r in ...]``) with the clearer ``[i] * len(...)``.
    """
    in_dict = json.loads(jstr)
    if not in_dict:
        return None
    ref, hyp, align = [], [], []
    ref_map, hyp_map = [], []
    for i, alignment in enumerate(in_dict['alignments']):
        ref.append(alignment['ref'])
        hyp.append(alignment['hyp'])
        align.append(alignment['align'])
        # Empty strings (insertion/deletion placeholders) get no map entry.
        if alignment['ref']:
            ref_map.extend([i] * len(alignment['ref'].split()))
        if alignment['hyp']:
            hyp_map.extend([i] * len(alignment['hyp'].split()))
    return ExpandedAlignment(ref, hyp, align, ref_map, hyp_map)
def test_normalizeAlignment_text_hyphen(self):
    """A hyphenated ref word spoken as separate words ("one to one")
    normalizes back to its hyphenated reference spelling."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        (u'Our', u'are', u'S', u'are'),
        (u'digital', u'what', u'S', u'what'),
        (u'body', u'it', u'S', u'it'),
        (u'is', u'is', u'C', u'is'),
        (u'', u'all', u'I', u'all'),
        (u'', u'about', u'I', u'about'),
        (u'', u'the', u'I', u'the'),
        (u'', u'these', u'I', u'these'),
        (u'one-to-one', u'one to one', u'S', u'one-to-one'),
        (u'life', u'life', u'C', u'life'),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    expected = ' '.join(row[3] for row in rows)
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(actual, expected)
def test_normalizeAlignment_911(self):
    """A split number ("nine 11") is merged back to the ref form "911";
    the trailing deletion slot is stripped from the joined output."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        (u'Everyone', u'everyone', u'C', u'everyone'),
        (u'who', u'who', u'C', u'who'),
        (u'knew', u'knew', u'C', u'knew'),
        (u'me', u'me', u'C', u'me'),
        (u'before', u'before', u'C', u'before'),
        (u'911', u'nine 11', u'S', u'911'),
        (u'', u'the', u'I', u'the'),
        (u'believes', u'believes', u'S', u'believes'),
        (u"I'm", u'line', u'S', u'line'),
        (u'dead', u'', u'D', u''),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    # The final empty slot would leave a trailing space, hence the strip().
    expected = ' '.join(row[3] for row in rows).strip()
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned)
    self.assertEqual(actual, expected)
def test_normalize_alignment_theyre2(self):
    """Normalization merges an S+I pair ("they" + "are") back into the ref
    contraction "they're" and drops the resulting empty slot, while the
    earlier literal "they are" (two C slots) is left untouched."""
    # Slot 8 is "they're" vs "they" (S); slot 9 is an insertion ("are").
    ref_aligned = [
        "In", "our", "study", "they", "are", "more", "positive", "but",
        "they're", "", "also", "more", "likely", "than", "younger",
        "people", "to", "experience", "mixed", "emotions", "sadness",
        "at", "the", "same", "time", "you", "experience", "happiness",
        "you", "know", "that", "tear", "in", "the", "eye", "when",
        "you're", "smiling", "at", "a", "friend"
    ]
    hyp_aligned = [
        "in", "our", "study", "they", "are", "more", "positive", "but",
        "they", "are", "also", "more", "likely", "than", "younger",
        "people", "to", "experience", "mixed", "emotions", "sadness",
        "at", "the", "same", "time", "you", "experience", "happiness",
        "you", "know", "that", "tear", "in", "the", "eye", "when",
        "you're", "smiling", "at", "a", "friend"
    ]
    alignment = [
        "C", "C", "C", "C", "C", "C", "C", "C", "S", "I", "C", "C", "C",
        "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
        "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
        "C", "C"
    ]
    expected = ' '.join([
        "in", "our", "study", "they", "are", "more", "positive", "but",
        "they're", "also", "more", "likely", "than", "younger", "people",
        "to", "experience", "mixed", "emotions", "sadness", "at", "the",
        "same", "time", "you", "experience", "happiness", "you", "know",
        "that", "tear", "in", "the", "eye", "when", "you're", "smiling",
        "at", "a", "friend"
    ])
    expand_align = ExpandedAlignment(ref_aligned, hyp_aligned, alignment,
                                     lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(expand_align)
    self.assertEqual(actual, expected)
def test_normalize_alignment_ex1(self):
    """When the split hyp words cannot be merged back to the ref form
    ("one two" + "one" vs "one-to-one"), normalization returns the
    hypothesis unchanged."""
    # One row per alignment slot: (ref, hyp, label).
    rows = [
        ("Our", "our", "C"),
        ("digital", "peaceful", "S"),
        ("body", "body", "C"),
        ("is", "is", "C"),
        ("one-to-one", "one two", "S"),
        ("", "one", "I"),
        ("", "life", "I"),
        ("life", "life", "C"),
        ("size", "size", "C"),
        ("so", "so", "C"),
        ("this", "this", "C"),
        ("is", "is", "C"),
        ("exactly", "exactly", "C"),
        ("the", "the", "C"),
        ("way", "way", "C"),
        ("students", "students", "C"),
        ("will", "would", "S"),
        ("see", "see", "C"),
        ("the", "the", "C"),
        ("real", "real", "C"),
        ("anatomy", "anatomy", "C"),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    # Expected output is simply the hypothesis tokens joined.
    self.assertEqual(HypothesisNormalizer.normalizeAligned(aligned),
                     ' '.join(hyps))
def test_normalize_abbrev_wrong(self):
    """With fix_casing=True, correct words regain the reference casing
    ("So") while wrongly recognized abbreviations and substitutions stay
    exactly as the hypothesis produced them."""
    # One row per alignment slot: (ref, hyp, label, expected normalized hyp).
    rows = [
        ("So", "so", "C", "So"),
        ("we", "we", "C", "we"),
        ("learned", "learned", "C", "learned"),
        ("the", "the", "C", "the"),
        ("majority", "most jury p.", "S", "most jury p."),
        ("of", "o.", "S", "o."),
        ("anatomic", "anatomy", "S", "anatomy"),
        ("classes", "class", "S", "class"),
        ("taught", "called", "S", "called"),
        ("they", "they", "C", "they"),
        ("do", "do", "C", "do"),
        ("not", "not", "C", "not"),
        ("have", "have", "C", "have"),
        ("a", "", "D", ""),
        ("cadaver", "had ever", "S", "had ever"),
        ("dissection", "dissection", "C", "dissection"),
        ("lab", "lead", "S", "lead"),
    ]
    refs = [row[0] for row in rows]
    hyps = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    # Empty deletion slots are dropped from the joined expectation.
    expected = ' '.join(row[3] for row in rows if row[3])
    aligned = ExpandedAlignment(refs, hyps, labels, lowercase=True)
    actual = HypothesisNormalizer.normalizeAligned(aligned, fix_casing=True)
    self.assertEqual(actual, expected)
def test_punct_tokens_at_front_and_end_short(self):
    """Leading and trailing quote characters from the reference attach to
    the first and last non-empty hypothesis tokens."""
    ref_tokens = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
        u'believes', u"I'm", u'dead'
    ]
    ref_map = [0, 1, 2, 3, 4, 5, 8, 9, 10]
    labels = [
        u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D'
    ]
    hyp_tokens = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'nine', u'eleven',
        u'the', u'believes', u'line', u''
    ]
    hyp_map = list(range(10))
    punctuated = [
        u'"Everyone', u'who', u'knew', u'me', u'before', u'nine',
        u'eleven', u'the', u'believes', u'line."', u''
    ]
    alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels, ref_map,
                                  hyp_map, lowercase=True)
    reference = u'''"Everyone who knew me before 9/11 believes I'm dead."'''
    expected = ExpandedAlignment(ref_tokens, punctuated, labels, ref_map,
                                 hyp_map, lowercase=True).s2
    self.maxDiff = None
    self.assertEqual(
        PunctInsertOracle.insertPunct(alignment, reference).s2, expected)
def test_punct_tokens_at_front_and_end(self):
    """Multi-sentence oracle punctuation: opening/closing quotes, commas,
    and periods from a three-sentence reference must land on the correct
    hypothesis tokens across insertions and deletions."""
    # Reference tokens; empty strings mark insertion slots.
    sample_ref = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
        u'believes', u"I'm", u'dead', u'I', u'used', u'to', u'work',
        u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
        u'people', u'so', u'sometimes', u'I', u"didn't", u'wear',
        u'panties', u'and', u'just', u'had', u'a', u'big', u'smile',
        u'and', u'chuckled', u'to', u'myself', u'', u'This', u'next',
        u'one', u'takes', u'a', u'little', u'explanation', u'before',
        u'I', u'share', u'it', u'with', u'you'
    ]
    # Ref map skips the insertion slots (6, 7, 38).
    sample_ref_map = [
        0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
    ]
    sample_align = [
        u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D',
        u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C',
        u'C', u'C', u'C', u'C', u'C', u'C', u'S', u'S', u'D', u'C', u'C',
        u'C', u'C', u'S', u'C', u'C', u'I', u'C', u'C', u'C', u'C', u'D',
        u'C', u'C', u'S', u'S', u'S', u'D', u'C', u'C'
    ]
    # Hypothesis tokens; empty strings mark deletion slots.
    sample_hyp = [
        u'Everyone', u'who', u'knew', u'me', u'before', u'nine',
        u'eleven', u'the', u'believes', u'line', u'', u'i', u'used',
        u'to', u'work', u'with', u'a', u'bunch', u'of', u'uptight',
        u'religious', u'people', u'so', u'sometimes', u'i', u"didn't",
        u'wear', u'panties', u'is', u'that', u'', u'a', u'big', u'smile',
        u'and', u'chuckle', u'to', u'myself', u'from', u'this', u'next',
        u'one', u'takes', u'', u'little', u'explanation', u'of',
        u'right', u'here', u'', u'with', u'you'
    ]
    # Hyp map skips the deletion slots (10, 30, 43, 49).
    sample_hyp_map = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51
    ]
    # Expected: same hyp tokens with the oracle punctuation attached.
    sample_hyp_punct = [
        u'"Everyone', u'who', u'knew', u'me', u'before', u'nine',
        u'eleven', u'the', u'believes', u'line."', u'', u'"i', u'used',
        u'to', u'work', u'with', u'a', u'bunch', u'of', u'uptight',
        u'religious', u'people,', u'so', u'sometimes', u'i', u"didn't",
        u'wear', u'panties,', u'is', u'that', u'', u'a', u'big',
        u'smile', u'and', u'chuckle', u'to', u'myself."', u'from',
        u'this', u'next', u'one', u'takes', u'', u'little',
        u'explanation', u'of', u'right', u'here', u'', u'with', u'you.'
    ]
    error_alignment = ExpandedAlignment(sample_ref, sample_hyp,
                                        sample_align, sample_ref_map,
                                        sample_hyp_map, lowercase=True)
    ref_punct = u'''"Everyone who knew me before 9/11 believes I'm dead." "I used to work with a bunch of uptight religious people, so sometimes I didn't wear panties, and just had a big smile and chuckled to myself." This next one takes a little explanation before I share it with you.'''
    expected = ExpandedAlignment(sample_ref, sample_hyp_punct,
                                 sample_align, sample_ref_map,
                                 sample_hyp_map, lowercase=True).s2
    actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
    self.maxDiff = None
    self.assertEqual(actual, expected)
def charAlignToWordAlign(self):
    """Convert the character-level alignment into a word-level alignment.

    Scans ``self.char_align`` left to right, accumulating non-space
    characters into temporary ref/hyp word buffers.  When a space is
    aligned on both sides (C/D/I on a boundary), both buffered words are
    flushed as one aligned word pair and labeled C/S/D/I by comparing the
    surface strings; a space on only one side flushes just that side's
    buffer as a lone deletion/insertion.

    Returns:
        ExpandedAlignment: the word-level alignment (also stored on
        ``self.word_align``).

    Raises:
        Exception: if ``self.char_align`` has not been computed yet.
    """
    if not self.char_align:
        raise Exception("char_align is None")
    ref_word_align = []   # finalized ref words
    hyp_word_align = []   # finalized hyp words
    align_word = []       # finalized alignment labels
    tmp_ref_word = []     # chars of the ref word currently being scanned
    tmp_hyp_word = []     # chars of the hyp word currently being scanned
    for i in range(len(self.char_align.align)):
        ref_char = self.char_align.s1[i]
        hyp_char = self.char_align.s2[i]
        align_char = self.char_align.align[i]
        # check if both words are completed
        # There are a few of ways this could happen:
        if ((align_char == AlignLabels.correct and ref_char == ' ')
                or (align_char == AlignLabels.deletion and ref_char == ' ')
                or (align_char == AlignLabels.insertion
                    and hyp_char == ' ')):
            ref_word = ''.join(tmp_ref_word)
            hyp_word = ''.join(tmp_hyp_word)
            if ref_word or hyp_word:
                ref_word_align.append(ref_word)
                hyp_word_align.append(hyp_word)
                tmp_ref_word = []
                tmp_hyp_word = []
                # Check align type
                if ref_word and hyp_word:
                    if ref_word == hyp_word:
                        align_word.append(AlignLabels.correct)
                    else:
                        align_word.append(AlignLabels.substitution)
                elif ref_word:
                    align_word.append(AlignLabels.deletion)
                else:
                    align_word.append(AlignLabels.insertion)
            continue
        # Read current chars and check if one of the words is complete
        if ref_char == ' ':
            if len(tmp_ref_word) > 1:
                # Probably a D
                ref_word = ''.join(tmp_ref_word)
                ref_word_align.append(ref_word)
                hyp_word_align.append('')
                tmp_ref_word = []
                align_word.append(AlignLabels.deletion)
        else:
            tmp_ref_word.append(ref_char)
        if hyp_char == ' ':
            if len(tmp_hyp_word) > 1:
                # Probably an I
                hyp_word = ''.join(tmp_hyp_word)
                ref_word_align.append('')
                hyp_word_align.append(hyp_word)
                tmp_hyp_word = []
                align_word.append(AlignLabels.insertion)
        else:
            tmp_hyp_word.append(hyp_char)
    self.word_align = ExpandedAlignment(ref_word_align, hyp_word_align,
                                        align_word,
                                        lowercase=self.lowercase)
    return self.word_align
def phoneAlignToWordAlign(cls, ref_words, hyp_words, ref_phones, hyp_phones,
                          break_on_syllables=True):
    """Turn a phoneme-level alignment of an error region into a word-level one.

    The ref/hyp phoneme sequences are Levenshtein-aligned, then scanned in
    parallel with the surface word lists.  Spans between shared word
    boundaries are emitted as C/S/D/I word alignments; spans that look
    mis-segmented (word-count mismatch on a substitution, or an extra
    syllable starting right after the other side's word boundary) are
    re-aligned and pushed back on a worklist.

    Args:
        ref_words / hyp_words: surface word lists for the error region.
        ref_phones / hyp_phones: phoneme token sequences with the
            word/syllable boundary markers understood by TokType.
        break_on_syllables: when True, words mapped to an extra syllable are
            split off as separate D/I errors instead of merged into the S span.

    Returns:
        tuple: (word-level ExpandedAlignment, concatenated phoneme-level
        ExpandedAlignment).
    """
    ref_word_span = (0, len(ref_words))
    hyp_word_span = (0, len(hyp_words))
    # Phoneme-level Levenshtein alignment of the whole region.
    lev = Levenshtein.align(ref=ref_phones,
                            hyp=hyp_phones,
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets,
                            weights=Levenshtein.wordAlignWeights)
    phone_align = lev.expandAlignCompact()

    worklist = [(ref_word_span, hyp_word_span, phone_align)]
    full_reference = []
    full_hypothesis = []
    full_alignment = []
    full_phone_align = []

    while worklist:
        # Take the next set of sequence boundaries off the worklist.
        ref_word_span, hyp_word_span, phone_align = worklist.pop()
        ref_word_index, ref_word_limit = ref_word_span
        hyp_word_index, hyp_word_limit = hyp_word_span

        # TODO: Currently only checking in the forward direction
        ref_word_builder = []  # Temp storage of words in alignment span
        hyp_word_builder = []
        ref_word_iter = enumerate(
            ref_words[ref_word_span[0]:ref_word_span[1]])  # surface words
        hyp_word_iter = enumerate(
            hyp_words[hyp_word_span[0]:hyp_word_span[1]])
        ref_aligned = []  # Finalized alignments
        hyp_aligned = []
        alignment = []  # Finalized alignment labels
        # Mark words mapping to extra syllables in the alignment.
        ref_extra_syllable_word_index = None
        hyp_extra_syllable_word_index = None
        ref_syllable_count = 0
        hyp_syllable_count = 0
        # Whether the current word is already accounted for by the time a
        # phoneme is reached.
        ref_word_started = False
        hyp_word_started = False
        commit_alignment = False

        for i in range(len(phone_align.align)):
            ref_type = TokType.checkAnnotation(phone_align.s1[i])
            hyp_type = TokType.checkAnnotation(phone_align.s2[i])

            # Word boundary reached on both ref and hyp -- or no more
            # symbols can be read.
            if (i == len(phone_align.align) - 1) or (
                    ref_type == TokType.WordBoundary and ref_type == hyp_type):
                align_tok = None
                # Only write outputs if either side has scanned some words.
                if ref_word_builder:
                    if hyp_word_builder:
                        align_tok = (AlignLabels.substitution
                                     if ref_word_builder != hyp_word_builder
                                     else AlignLabels.correct)
                    else:
                        align_tok = AlignLabels.deletion
                elif hyp_word_builder:
                    align_tok = AlignLabels.insertion

                if align_tok:
                    # Add the remainder to the worklist.
                    ref_word_span_next = (ref_word_index + len(ref_word_builder),
                                          ref_word_limit)
                    hyp_word_span_next = (hyp_word_index + len(hyp_word_builder),
                                          hyp_word_limit)
                    phone_align_next = phone_align.subsequence(
                        i, phone_align.length(), preserve_index=False)
                    worklist.append((ref_word_span_next, hyp_word_span_next,
                                     phone_align_next))

                    # "Commit" the current alignment.
                    if align_tok in (AlignLabels.correct,
                                     AlignLabels.substitution):
                        alignment.append(align_tok)
                        # Check for syllable conflicts.
                        if not break_on_syllables or not ref_extra_syllable_word_index:
                            ref_aligned.append(' '.join(ref_word_builder))
                            ref_syllable_count = 0
                            hyp_syllable_count = 0
                        else:
                            ref_aligned.append(' '.join(
                                ref_word_builder[0:ref_extra_syllable_word_index]))
                            # The remaining words are deletions.
                            for word in ref_word_builder[
                                    ref_extra_syllable_word_index:]:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0
                        if not break_on_syllables or not hyp_extra_syllable_word_index:
                            hyp_aligned.append(' '.join(hyp_word_builder))
                            ref_syllable_count = 0
                            hyp_syllable_count = 0
                        else:
                            hyp_aligned.append(' '.join(
                                hyp_word_builder[0:hyp_extra_syllable_word_index]))
                            # The remaining words are insertions.
                            for word in hyp_word_builder[
                                    hyp_extra_syllable_word_index:]:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                        if align_tok == AlignLabels.substitution:
                            # Word count mismatch in the alignment span: this
                            # segment may need to be re-aligned.
                            if len(ref_word_builder) != len(hyp_word_builder):
                                ref_word_span_curr = (
                                    ref_word_index,
                                    ref_word_index + len(ref_word_builder))
                                hyp_word_span_curr = (
                                    hyp_word_index,
                                    hyp_word_index + len(hyp_word_builder))
                                phone_align_curr = phone_align.subsequence(
                                    0, i + 1, preserve_index=False)
                                lev = Levenshtein.align(
                                    ref=phone_align_curr.s1_tokens(),
                                    hyp=phone_align_curr.s2_tokens(),
                                    reserve_list=PowerAligner.reserve_list,
                                    exclusive_sets=PowerAligner.exclusive_sets,
                                    weights=Levenshtein.wordAlignWeights)
                                phone_align_adjusted = lev.expandAlignCompact()
                                if phone_align_curr.align != phone_align_adjusted.align:
                                    # We need to redo the phone-to-word
                                    # alignment of this span.
                                    worklist.append((ref_word_span_curr,
                                                     hyp_word_span_curr,
                                                     phone_align_adjusted))
                                else:
                                    commit_alignment = True
                            else:
                                commit_alignment = True
                    elif align_tok == AlignLabels.deletion:
                        for word in ref_word_builder:
                            alignment.append(align_tok)
                            ref_aligned.append(word)
                            hyp_aligned.append('')
                        commit_alignment = True
                        ref_syllable_count = 0
                    elif align_tok == AlignLabels.insertion:
                        for word in hyp_word_builder:
                            alignment.append(align_tok)
                            ref_aligned.append('')
                            hyp_aligned.append(word)
                        commit_alignment = True
                        hyp_syllable_count = 0

                    if commit_alignment:
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(
                            phone_align.subsequence(0, i, preserve_index=False))
                        ref_aligned = []
                        hyp_aligned = []
                        alignment = []
                    break
            else:
                # Add words as word boundaries are reached.
                if ref_type == TokType.WordBoundary:
                    ref_word_started = False
                    if (hyp_type != TokType.WordBoundary and ref_word_builder
                            and not hyp_word_builder):
                        # DELETION: a ref word ended but no hyp words have been
                        # added; mark the ref word(s) in the span as deletions.
                        # TODO: Dedupe this logic
                        for word in ref_word_builder:
                            alignment.append(AlignLabels.deletion)
                            ref_aligned.append(word)
                            hyp_aligned.append('')
                        ref_syllable_count = 0
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(
                            phone_align.subsequence(0, i, preserve_index=False))
                        # Add the remainder to the worklist.
                        ref_word_span_next = (
                            ref_word_index + len(ref_word_builder),
                            ref_word_limit)
                        hyp_word_span_next = (
                            hyp_word_index + len(hyp_word_builder),
                            hyp_word_limit)
                        lev = Levenshtein.align(
                            ref=[x for x in phone_align.s1[i:] if x],
                            hyp=[x for x in phone_align.s2 if x],
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets,
                            weights=Levenshtein.wordAlignWeights)
                        phone_align_next = lev.expandAlignCompact()
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))
                        break
                elif ref_type == TokType.Phoneme and not ref_word_started:
                    ref_word_started = True
                    try:
                        ref_word_item = ref_word_iter.__next__()
                        ref_word_builder.append(ref_word_item[1])
                    except StopIteration:
                        pass
                if hyp_type == TokType.WordBoundary:
                    hyp_word_started = False
                    if (ref_type != TokType.WordBoundary and hyp_word_builder
                            and not ref_word_builder):
                        # INSERTION: a hyp word ended but no ref words have been
                        # added; mark the hyp word(s) in the span as insertions.
                        # TODO: Dedupe this logic
                        for word in hyp_word_builder:
                            alignment.append(AlignLabels.insertion)
                            ref_aligned.append('')
                            hyp_aligned.append(word)
                        hyp_syllable_count = 0
                        # Commit the alignment.
                        full_reference.extend(ref_aligned)
                        full_hypothesis.extend(hyp_aligned)
                        full_alignment.extend(alignment)
                        full_phone_align.append(
                            phone_align.subsequence(0, i, preserve_index=False))
                        # Add the remainder to the worklist.
                        ref_word_span_next = (
                            ref_word_index + len(ref_word_builder),
                            ref_word_limit)
                        hyp_word_span_next = (
                            hyp_word_index + len(hyp_word_builder),
                            hyp_word_limit)
                        lev = Levenshtein.align(
                            ref=[x for x in phone_align.s1 if x],
                            hyp=[x for x in phone_align.s2[i:] if x],
                            reserve_list=PowerAligner.reserve_list,
                            exclusive_sets=PowerAligner.exclusive_sets,
                            weights=Levenshtein.wordAlignWeights)
                        phone_align_next = lev.expandAlignCompact()
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))
                        break
                elif hyp_type == TokType.Phoneme and not hyp_word_started:
                    hyp_word_started = True
                    try:
                        hyp_word_item = hyp_word_iter.__next__()
                        hyp_word_builder.append(hyp_word_item[1])
                    except StopIteration:
                        pass

                # Check for syllable mismatches.
                if ref_type == TokType.SyllableBoundary:
                    ref_syllable_count += 1
                if hyp_type == TokType.SyllableBoundary:
                    hyp_syllable_count += 1
                if (ref_type == TokType.SyllableBoundary == hyp_type
                        or ref_syllable_count == hyp_syllable_count):
                    # No syllable conflicts here!
                    ref_extra_syllable_word_index = None
                    hyp_extra_syllable_word_index = None
                elif (ref_type == TokType.SyllableBoundary
                      and not ref_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # Extra syllable starting on the ref side immediately after
                    # a hyp word boundary: a new hyp word is being formed,
                    # likely an insertion in hyp.
                    ref_extra_syllable_word_index = len(ref_word_builder) - 1
                elif (hyp_type == TokType.SyllableBoundary
                      and not hyp_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s1[i - 1]) == TokType.WordBoundary):
                    # BUGFIX: mirror of the branch above. The extra syllable is
                    # on the hyp side, so the preceding word boundary must be
                    # checked on the *ref* stream (s1); the original tested
                    # s2[i-1] -- a copy-paste of the first branch.
                    hyp_extra_syllable_word_index = len(hyp_word_builder) - 1

    # Concatenate all phoneme alignments.
    fp_align = full_phone_align[0]
    for expand_align in full_phone_align[1:]:
        fp_align.append_alignment(expand_align)
    return ExpandedAlignment(full_reference, full_hypothesis,
                             full_alignment), fp_align
class PowerAligner:
    """Word/phoneme two-pass aligner (POWER): computes a standard WER word
    alignment, then re-aligns the detected error regions at the phoneme level
    to produce a more phonetically plausible word alignment."""

    # Exclusive tokens that can only align to themselves; not other members
    # in this set.
    reserve_list = set(['|', '#'])
    # R-sounds
    r_set = set.union(set('r'), Phonemes.r_vowels)
    exclusive_sets = [Phonemes.vowels, Phonemes.consonants, r_set]
    phoneDistPenalty = 0.25
    # NOTE(review): the name below looks garbled (probably meant
    # "phoneDistPenaltySet"); kept unchanged for interface compatibility.
    phoneDistPenalt16ySet = set(['|'])

    def __init__(self, ref, hyp, lowercase=False, verbose=False,
                 pronounce_type=PronouncerType.Lexicon, lexicon=None,
                 word_align_weights=Levenshtein.wordAlignWeights):
        """Tokenize ref/hyp, build the initial word-level (WER) alignment and
        prepare state for the phonetic (POWER) pass.

        Args:
            ref: reference transcript (whitespace-tokenized string).
            hyp: hypothesis transcript (whitespace-tokenized string).
            lowercase: compare words case-insensitively.
            verbose: verbosity flag (stored; not used in this module's code).
            pronounce_type: which Pronouncer implementation to use.
            lexicon: lexicon passed to PronouncerLex when applicable.
            word_align_weights: edit-op weights for the word alignment.

        Raises:
            Exception: if ref is empty.
        """
        if not ref:
            raise Exception("No reference file.\nref: {0}\nhyp: {1}".format(
                ref, hyp))
        if pronounce_type == PronouncerType.Lexicon:
            self.pronouncer = PronouncerLex(lexicon)
        else:
            self.pronouncer = PronouncerBase()

        self.ref = [x for x in ref.strip().split() if x]
        self.hyp = [x for x in hyp.strip().split() if x]
        self.refwords = ' '.join(self.ref)
        self.hypwords = ' '.join(self.hyp)
        self.lowercase = lowercase
        self.verbose = verbose

        # Perform word alignment (first pass: plain WER).
        lev = Levenshtein.align(self.ref, self.hyp,
                                lowercase=self.lowercase,
                                weights=word_align_weights)
        lev.editops()
        self.wer_alignment = lev.expandAlign()
        self.wer, self.wer_components = self.wer_alignment.error_rate()

        # Used for POWER alignment (filled in by align()).
        self.power_alignment = None
        self.power = None
        self.power_components = None

        # Used to find potential error regions.
        self.split_regions = None
        self.error_indexes = None
        self.phonetic_alignments = None
        self.phonetic_lev = None

    def align(self):
        """Run the phonetic second pass: re-align each error region at the
        phoneme level, splice the refined segments back together, and compute
        the POWER error rate (self.power / self.power_components)."""
        # Find the error regions that may need to be realigned.
        self.split_regions, self.error_indexes = \
            self.wer_alignment.split_error_regions()
        self.phonetic_alignments = [None] * len(self.split_regions)
        for error_index in self.error_indexes:
            seg = self.split_regions[error_index]
            ref_words = seg.s1_tokens()
            hyp_words = seg.s2_tokens()
            ref_phones = self.pronouncer.pronounce(ref_words)
            hyp_phones = self.pronouncer.pronounce(hyp_words)
            power_seg_alignment, self.phonetic_alignments[error_index] = \
                PowerAligner.phoneAlignToWordAlign(ref_words, hyp_words,
                                                   ref_phones, hyp_phones)
            # Replace the error region at the current index.
            self.split_regions[error_index] = power_seg_alignment

        # Merge the alignment segments back together.
        self.power_alignment = ExpandedAlignment(self.split_regions[0].s1,
                                                 self.split_regions[0].s2,
                                                 self.split_regions[0].align,
                                                 self.split_regions[0].s1_map,
                                                 self.split_regions[0].s2_map,
                                                 lowercase=self.lowercase)
        for i in range(1, len(self.split_regions)):
            self.power_alignment.append_alignment(self.split_regions[i])

        # Get the alignment score.
        self.power, self.power_components = self.power_alignment.error_rate()

        # Sanity checks: splicing must reproduce the original token streams.
        assert self.hypwords == self.power_alignment.s2_string(), \
            "hyp mismatch:\n{0}\n{1}".format(self.hypwords,
                                             self.power_alignment.s2_string())
        assert self.refwords == self.power_alignment.s1_string(), \
            "ref mismatch:\n{0}\n{1}".format(self.refwords,
                                             self.power_alignment.s1_string())

    # TODO: Make this simpler (and maybe recursive)
    @classmethod
    def phoneAlignToWordAlign(cls, ref_words, hyp_words, ref_phones,
                              hyp_phones, break_on_syllables=True):
        """Turn a phoneme-level alignment of an error region into a word-level
        one.

        The ref/hyp phoneme sequences are Levenshtein-aligned, then scanned in
        parallel with the surface word lists.  Spans between shared word
        boundaries are emitted as C/S/D/I word alignments; spans that look
        mis-segmented (word-count mismatch on a substitution, or an extra
        syllable starting right after the other side's word boundary) are
        re-aligned and pushed back on a worklist.

        Args:
            ref_words / hyp_words: surface word lists for the error region.
            ref_phones / hyp_phones: phoneme token sequences with the
                word/syllable boundary markers understood by TokType.
            break_on_syllables: when True, words mapped to an extra syllable
                are split off as separate D/I errors instead of merged into
                the S span.

        Returns:
            tuple: (word-level ExpandedAlignment, concatenated phoneme-level
            ExpandedAlignment).
        """
        ref_word_span = (0, len(ref_words))
        hyp_word_span = (0, len(hyp_words))
        # Phoneme-level Levenshtein alignment of the whole region.
        lev = Levenshtein.align(ref=ref_phones,
                                hyp=hyp_phones,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
        phone_align = lev.expandAlignCompact()

        worklist = [(ref_word_span, hyp_word_span, phone_align)]
        full_reference = []
        full_hypothesis = []
        full_alignment = []
        full_phone_align = []

        while worklist:
            # Take the next set of sequence boundaries off the worklist.
            ref_word_span, hyp_word_span, phone_align = worklist.pop()
            ref_word_index, ref_word_limit = ref_word_span
            hyp_word_index, hyp_word_limit = hyp_word_span

            # TODO: Currently only checking in the forward direction
            ref_word_builder = []  # Temp storage of words in alignment span
            hyp_word_builder = []
            ref_word_iter = enumerate(
                ref_words[ref_word_span[0]:ref_word_span[1]])  # surface words
            hyp_word_iter = enumerate(
                hyp_words[hyp_word_span[0]:hyp_word_span[1]])
            ref_aligned = []  # Finalized alignments
            hyp_aligned = []
            alignment = []  # Finalized alignment labels
            # Mark words mapping to extra syllables in the alignment.
            ref_extra_syllable_word_index = None
            hyp_extra_syllable_word_index = None
            ref_syllable_count = 0
            hyp_syllable_count = 0
            # Whether the current word is already accounted for by the time a
            # phoneme is reached.
            ref_word_started = False
            hyp_word_started = False
            commit_alignment = False

            for i in range(len(phone_align.align)):
                ref_type = TokType.checkAnnotation(phone_align.s1[i])
                hyp_type = TokType.checkAnnotation(phone_align.s2[i])

                # Word boundary reached on both ref and hyp -- or no more
                # symbols can be read.
                if (i == len(phone_align.align) - 1) or (
                        ref_type == TokType.WordBoundary
                        and ref_type == hyp_type):
                    align_tok = None
                    # Only write outputs if either side has scanned words.
                    if ref_word_builder:
                        if hyp_word_builder:
                            align_tok = (AlignLabels.substitution
                                         if ref_word_builder != hyp_word_builder
                                         else AlignLabels.correct)
                        else:
                            align_tok = AlignLabels.deletion
                    elif hyp_word_builder:
                        align_tok = AlignLabels.insertion

                    if align_tok:
                        # Add the remainder to the worklist.
                        ref_word_span_next = (
                            ref_word_index + len(ref_word_builder),
                            ref_word_limit)
                        hyp_word_span_next = (
                            hyp_word_index + len(hyp_word_builder),
                            hyp_word_limit)
                        phone_align_next = phone_align.subsequence(
                            i, phone_align.length(), preserve_index=False)
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))

                        # "Commit" the current alignment.
                        if align_tok in (AlignLabels.correct,
                                         AlignLabels.substitution):
                            alignment.append(align_tok)
                            # Check for syllable conflicts.
                            if not break_on_syllables or not ref_extra_syllable_word_index:
                                ref_aligned.append(' '.join(ref_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                ref_aligned.append(' '.join(
                                    ref_word_builder[0:ref_extra_syllable_word_index]))
                                # The remaining words are deletions.
                                for word in ref_word_builder[
                                        ref_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.deletion)
                                    ref_aligned.append(word)
                                    hyp_aligned.append('')
                                ref_syllable_count = 0
                            if not break_on_syllables or not hyp_extra_syllable_word_index:
                                hyp_aligned.append(' '.join(hyp_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                hyp_aligned.append(' '.join(
                                    hyp_word_builder[0:hyp_extra_syllable_word_index]))
                                # The remaining words are insertions.
                                for word in hyp_word_builder[
                                        hyp_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.insertion)
                                    ref_aligned.append('')
                                    hyp_aligned.append(word)
                                hyp_syllable_count = 0

                            if align_tok == AlignLabels.substitution:
                                # Word count mismatch in the alignment span:
                                # this segment may need to be re-aligned.
                                if len(ref_word_builder) != len(hyp_word_builder):
                                    ref_word_span_curr = (
                                        ref_word_index,
                                        ref_word_index + len(ref_word_builder))
                                    hyp_word_span_curr = (
                                        hyp_word_index,
                                        hyp_word_index + len(hyp_word_builder))
                                    phone_align_curr = phone_align.subsequence(
                                        0, i + 1, preserve_index=False)
                                    lev = Levenshtein.align(
                                        ref=phone_align_curr.s1_tokens(),
                                        hyp=phone_align_curr.s2_tokens(),
                                        reserve_list=PowerAligner.reserve_list,
                                        exclusive_sets=PowerAligner.exclusive_sets,
                                        weights=Levenshtein.wordAlignWeights)
                                    phone_align_adjusted = lev.expandAlignCompact()
                                    if phone_align_curr.align != phone_align_adjusted.align:
                                        # We need to redo the phone-to-word
                                        # alignment of this span.
                                        worklist.append((ref_word_span_curr,
                                                         hyp_word_span_curr,
                                                         phone_align_adjusted))
                                    else:
                                        commit_alignment = True
                                else:
                                    commit_alignment = True
                        elif align_tok == AlignLabels.deletion:
                            for word in ref_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            commit_alignment = True
                            ref_syllable_count = 0
                        elif align_tok == AlignLabels.insertion:
                            for word in hyp_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            commit_alignment = True
                            hyp_syllable_count = 0

                        if commit_alignment:
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0, i,
                                                        preserve_index=False))
                            ref_aligned = []
                            hyp_aligned = []
                            alignment = []
                        break
                else:
                    # Add words as word boundaries are reached.
                    if ref_type == TokType.WordBoundary:
                        ref_word_started = False
                        if (hyp_type != TokType.WordBoundary
                                and ref_word_builder
                                and not hyp_word_builder):
                            # DELETION: a ref word ended but no hyp words have
                            # been added; mark the ref word(s) in the span as
                            # deletion errors.
                            # TODO: Dedupe this logic
                            for word in ref_word_builder:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0, i,
                                                        preserve_index=False))
                            # Add the remainder to the worklist.
                            ref_word_span_next = (
                                ref_word_index + len(ref_word_builder),
                                ref_word_limit)
                            hyp_word_span_next = (
                                hyp_word_index + len(hyp_word_builder),
                                hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1[i:] if x],
                                hyp=[x for x in phone_align.s2 if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()
                            worklist.append((ref_word_span_next,
                                             hyp_word_span_next,
                                             phone_align_next))
                            break
                    elif ref_type == TokType.Phoneme and not ref_word_started:
                        ref_word_started = True
                        try:
                            ref_word_item = ref_word_iter.__next__()
                            ref_word_builder.append(ref_word_item[1])
                        except StopIteration:
                            pass
                    if hyp_type == TokType.WordBoundary:
                        hyp_word_started = False
                        if (ref_type != TokType.WordBoundary
                                and hyp_word_builder
                                and not ref_word_builder):
                            # INSERTION: a hyp word ended but no ref words have
                            # been added; mark the hyp word(s) in the span as
                            # insertion errors.
                            # TODO: Dedupe this logic
                            for word in hyp_word_builder:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0, i,
                                                        preserve_index=False))
                            # Add the remainder to the worklist.
                            ref_word_span_next = (
                                ref_word_index + len(ref_word_builder),
                                ref_word_limit)
                            hyp_word_span_next = (
                                hyp_word_index + len(hyp_word_builder),
                                hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1 if x],
                                hyp=[x for x in phone_align.s2[i:] if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()
                            worklist.append((ref_word_span_next,
                                             hyp_word_span_next,
                                             phone_align_next))
                            break
                    elif hyp_type == TokType.Phoneme and not hyp_word_started:
                        hyp_word_started = True
                        try:
                            hyp_word_item = hyp_word_iter.__next__()
                            hyp_word_builder.append(hyp_word_item[1])
                        except StopIteration:
                            pass

                    # Check for syllable mismatches.
                    if ref_type == TokType.SyllableBoundary:
                        ref_syllable_count += 1
                    if hyp_type == TokType.SyllableBoundary:
                        hyp_syllable_count += 1
                    if (ref_type == TokType.SyllableBoundary == hyp_type
                            or ref_syllable_count == hyp_syllable_count):
                        # No syllable conflicts here!
                        ref_extra_syllable_word_index = None
                        hyp_extra_syllable_word_index = None
                    elif (ref_type == TokType.SyllableBoundary
                          and not ref_extra_syllable_word_index
                          and TokType.checkAnnotation(
                              phone_align.s2[i - 1]) == TokType.WordBoundary):
                        # Extra syllable starting on the ref side immediately
                        # after a hyp word boundary: a new hyp word is being
                        # formed, likely an insertion in hyp.
                        ref_extra_syllable_word_index = len(ref_word_builder) - 1
                    elif (hyp_type == TokType.SyllableBoundary
                          and not hyp_extra_syllable_word_index
                          and TokType.checkAnnotation(
                              phone_align.s1[i - 1]) == TokType.WordBoundary):
                        # BUGFIX: mirror of the branch above. The extra
                        # syllable is on the hyp side, so the preceding word
                        # boundary must be checked on the *ref* stream (s1);
                        # the original tested s2[i-1] -- a copy-paste of the
                        # first branch.
                        hyp_extra_syllable_word_index = len(hyp_word_builder) - 1

        # Concatenate all phoneme alignments.
        fp_align = full_phone_align[0]
        for expand_align in full_phone_align[1:]:
            fp_align.append_alignment(expand_align)
        return ExpandedAlignment(full_reference, full_hypothesis,
                                 full_alignment), fp_align