def test_tokenize_full_syll():
    """Full tokenization of a text containing a syllable boundary."""
    text = 'j_uː_ n_oʊ_ dʒ_ʌ_s_;t_ '

    # without a syllable separator the ';' is kept as part of the phone
    sep = Separator(phone='_', syllable=None, word=' ')
    expected = [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', ';t']]
    assert list(sep.tokenize(text)) == expected

    # with a syllable separator the last word is split in two syllables
    sep = Separator(phone='_', syllable=';', word=' ')
    expected = [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's'], ['t']]]
    assert list(sep.tokenize(text)) == expected
def test_tokenize_noboundaries():
    """Tokenization with ``keep_boundaries=False`` for various separators."""
    # expected tokens are the same across the different separator setups
    phones = ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
    words = ['juː', 'noʊ', 'dʒʌst']

    # no phone separator: syllable level falls back to individual phones
    sep = Separator(phone=None, syllable=' ', word=';eword')
    text = 'j uː ;eword n oʊ ;eword dʒ ʌ s t ;eword'
    assert list(sep.tokenize(text, 'word', keep_boundaries=False)) == words
    assert list(sep.tokenize(text, 'syllable', keep_boundaries=False)) \
        == phones

    sep = Separator(phone=' ', word='_')
    text = 'j uː _ n oʊ _ dʒ ʌ s t _'
    assert list(sep.tokenize(text, 'word', keep_boundaries=False)) == words
    assert list(sep.tokenize(text, 'phone', keep_boundaries=False)) == phones

    sep = Separator(phone='_', word=' ')
    text = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '
    assert list(sep.tokenize(text, 'word', keep_boundaries=False)) == words
    assert list(sep.tokenize(text, 'phone', keep_boundaries=False)) == phones

    # with a syllable boundary in the last word
    sep = Separator(phone='_', syllable=';', word=' ')
    text = 'j_uː_ n_oʊ_ dʒ_ʌ_s_;t_ '
    assert list(sep.tokenize(text, 'word', keep_boundaries=False)) == words
    assert list(sep.tokenize(text, 'syllable', keep_boundaries=False)) \
        == ['juː', 'noʊ', 'dʒʌs', 't']
    assert list(sep.tokenize(text, 'phone', keep_boundaries=False)) == phones
def test_tokenize_full_nosyll():
    """Full tokenization of a text without any syllable boundary."""
    text = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '

    # no syllable separator: one nesting level per word
    sep = Separator(phone='_', syllable=None, word=' ')
    assert list(sep.tokenize(text)) \
        == [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

    # syllable separator defined: each word holds a single syllable
    sep = Separator(phone='_', syllable=';', word=' ')
    assert list(sep.tokenize(text)) \
        == [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's', 't']]]

    # tokenize phones only
    text = text.replace(' ', '')
    sep = Separator(phone='_', syllable=None, word=None)
    assert list(sep.tokenize(text)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
def test_tokenize_none():
    """Tokenization when only the word separator is defined."""
    sep = Separator(phone=None, syllable=None, word=' ')

    utterance = 'te9abesitosgr uNone'
    assert list(sep.tokenize(utterance, level='word')) \
        == ['te9abesitosgr', 'uNone']

    # the substring 'None' must never be stripped away from tokens
    for token in ('uNone', 'None', 'Noneu'):
        assert sep.strip(token) == token
def test_split_vs_tokenize(text, expected, keep_boundaries):
    """Checks that split() and tokenize() agree at the word level.

    ``tokenize`` is expected to behave as ``split`` with the empty
    chunks filtered out.

    NOTE(review): this test takes arguments, so it presumably relies on
    a ``pytest.mark.parametrize`` decorator or fixtures defined outside
    this view -- confirm in the original file.

    """
    sep = Separator(phone='p', syllable='s', word='w')

    split_result = list(
        sep.split(text, 'word', keep_boundaries=keep_boundaries))
    assert split_result == expected

    tokenized = list(
        sep.tokenize(text, 'word', keep_boundaries=keep_boundaries))
    assert tokenized == [chunk for chunk in expected if len(chunk)]
def test_summary_perfect(gold):
    """A summary of gold against itself yields only correct segmentations."""
    stats = summary(gold, gold)

    # count the words in the gold text
    sep = Separator(phone=None, syllable=None, word=' ')
    nwords = sum(len(sep.tokenize(utt, level='word')) for utt in gold)

    # no error of any kind is reported
    for category in ('under', 'over', 'mis'):
        assert not stats[category]

    # every word is counted as correctly segmented
    assert sum(stats['correct'].values()) == nwords
class SegmentationSummary(object):
    """Computes a summary of the segmentation errors

    The errors can be oversegmentations, undersegmentations or
    missegmentations. Correct segmentations are also reported.

    """
    def __init__(self):
        # token separation on words only
        self.separator = Separator(phone=None, syllable=None, word=' ')

        # count over/under/mis/good segmentation for each word type
        self.over_segmentation = collections.defaultdict(int)
        self.under_segmentation = collections.defaultdict(int)
        self.mis_segmentation = collections.defaultdict(int)
        self.correct_segmentation = collections.defaultdict(int)

    def to_dict(self):
        """Exports the summary as a dictionary

        Returns
        -------
        summary : dict
            A dictionary with the complete summary in the following
            entries: 'over', 'under', 'mis', 'correct'. In each entry,
            the words are sorted by decreasing frequency, and
            alphabetically (for equivalent frequency).

        """
        # collapse all the dicts in a single one
        summary = {
            'over': self.over_segmentation,
            'under': self.under_segmentation,
            'mis': self.mis_segmentation,
            'correct': self.correct_segmentation}

        # sort by most frequent word decreasing order (and then
        # alphabetically increasing order); dict preserves insertion order
        return {
            category: dict(
                sorted(counts.items(), key=lambda item: (-item[1], item[0])))
            for category, counts in summary.items()}

    def summarize(self, text, gold):
        """Computes segmentation errors on a whole text

        Call :meth:`summarize_utterance` on each utterance of gold and
        text.

        Parameters
        ----------
        text : list of str
            The list of utterances for the segmented text (to be evaluated)
        gold : list of str
            The list of utterances for the gold text

        Raises
        ------
        ValueError
            If `text` and `gold` do not have the same number of
            utterances. If :meth:`summarize_utterance` raise a ValueError.

        """
        if len(gold) != len(text):
            raise ValueError(
                'text and gold do not have the same number of utterances')

        for t, g in zip(text, gold):
            self.summarize_utterance(t, g)

    def summarize_utterance(self, text, gold):
        """Computes segmentation errors on a single utterance

        This method returns no result but update the intern summary,
        accessible using :meth:`to_dict`.

        Parameters
        ----------
        text : str
            A segmented utterance
        gold : str
            A gold utterance

        Raises
        ------
        ValueError
            If `text` and `gold` are mismatched, i.e. they do not contain
            the same suite of letters (once all the spaces removed).

        """
        # check gold and text match (with all spaces removed)
        if self.separator.remove(gold) != self.separator.remove(text):
            raise ValueError('mismatch in gold and text: {} != {}'.format(
                gold, text))

        # get text and gold as lists of words
        gold_words = self.separator.tokenize(gold, level='word')
        text_words = self.separator.tokenize(text, level='word')

        # silly case where gold and text are identical
        if gold_words == text_words:
            for word in gold_words:
                self.correct_segmentation[word] += 1
            return

        # divide gold and text in chunks, packing chunks where gold
        # and text share a common boundary
        chunks = self._boundary_chunks(text_words, gold_words)

        # map each category to the counter it updates
        counters = {
            'correct': self.correct_segmentation,
            'under': self.under_segmentation,
            'over': self.over_segmentation,
            'mis': self.mis_segmentation}

        # classify each chunk as under/over/mis/good segmentation and
        # register the chunk's words into the relevant category
        for text_chunk, gold_chunk in chunks:
            counter = counters[self._classify_chunk(text_chunk, gold_chunk)]
            for word in gold_chunk:
                counter[word] += 1

    @classmethod
    def _boundary_chunks(cls, text, gold):
        """Returns the list of chunks in a pair of text/gold utterance

        Implemented iteratively: the previous recursive implementation
        could exhaust the interpreter's recursion limit on very long
        utterances.

        """
        chunks = []
        while text or gold:
            # impossible to have one empty but not the other (text and
            # gold contain the same letters by construction); this
            # assert is a safety net only
            assert text and gold

            chunk = cls._compute_chunk(text, gold)
            chunks.append(chunk)

            # consume the words packed in the chunk
            text = text[len(chunk[0]):]
            gold = gold[len(chunk[1]):]

        return chunks

    @classmethod
    def _boundary_chunks_aux(cls, text, gold, chunks):
        """Deprecated recursion helper kept for backward compatibility

        Prepends `chunks` to the chunks computed on `text` and `gold` by
        :meth:`_boundary_chunks`.

        """
        return chunks + cls._boundary_chunks(text, gold)

    @staticmethod
    def _compute_chunk(text, gold):
        """Find the first chunk in a pair of text/gold utterances

        A chunk is a pair of lists of words sharing a common boundary
        (begin and end of a sequence of words).

        Example
        -------
        >>> gold = 'baby going home'.split()
        >>> text = 'ba by going home'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['ba', 'by'], ['baby'])
        >>> text = 'babygoinghome'.split()
        >>> SegmentationSummary._compute_chunk(text, gold)
        (['babygoinghome'], ['baby', 'going', 'home'])

        """
        # non empty texts and same letters. This should be the case by
        # construction, those asserts are not required.
        assert len(gold) and len(text)
        assert ''.join(gold) == ''.join(text)

        # easy case, first word is the same
        if gold[0] == text[0]:
            return ([text[0]], [gold[0]])

        # grow the concatenation on whichever side is shorter until both
        # reach the same length: that length is the first shared boundary
        text_concat, text_index = text[0], 0
        gold_concat, gold_index = gold[0], 0
        while len(gold_concat) != len(text_concat):
            if len(gold_concat) < len(text_concat):
                gold_index += 1
                gold_concat = gold_concat + gold[gold_index]
            else:
                text_index += 1
                text_concat = text_concat + text[text_index]

        return (text[:text_index + 1], gold[:gold_index + 1])

    def _classify_chunk(self, text, gold):
        """A chunk is either over/under/mis/correct

        A chunk is 'correct' when it is a single identical word on both
        sides, 'over' when a single gold word is split into several text
        words, 'under' when several gold words are merged into a single
        text word, and 'mis' in every other case.

        """
        if len(gold) == len(text):
            return 'correct' if len(gold) == 1 else 'mis'

        if len(gold) < len(text):
            return 'over' if len(gold) == 1 else 'mis'

        # len(gold) > len(text)
        return 'under' if len(text) == 1 else 'mis'