def apply(self, example, is_train=False, stats=None, **kwargs):
    """Expand the features of ``example`` to one value per source subword.

    Every subword in ``example["src"]`` is paired with a word index taken
    from the subword map and receives, for each feature in
    ``example["src_feats"]``, the value stored at that index. Unless
    ``self.prior_tokenization`` is set, a subword that has nothing left
    once non-word characters are removed receives ``"<null>"`` instead.

    Args:
        example: dict holding ``"src"`` (subword sequence), optionally
            ``"src_feats"`` (feature name -> sequence of values), and
            ``"src_original"`` (read only in joiner mode when
            ``self.prior_tokenization`` is set).
        is_train: not used by this method.
        stats: not used by this method.
        **kwargs: not used by this method.

    Returns:
        The same ``example`` object. When ``"src_feats"`` is present its
        entries are replaced in place by the per-subword lists; when it
        is absent the example is returned untouched.
    """
    if "src_feats" not in example:
        # Nothing to infer for examples without features.
        return example

    src_subwords = example["src"]
    if self.reversible_tokenization == "joiner":
        original_src = (
            example["src_original"] if self.prior_tokenization else None
        )
        word_to_subword_mapping = subword_map_by_joiner(
            src_subwords, original_subwords=original_src)
    else:  # Spacer
        word_to_subword_mapping = subword_map_by_spacer(src_subwords)

    src_feats = example["src_feats"]
    # Punctuation is only nulled out when no prior tokenization was applied.
    null_punctuation = not self.prior_tokenization

    inferred_feats = defaultdict(list)
    for subword, word_id in zip(src_subwords, word_to_subword_mapping):
        # The punctuation test depends on the subword alone, so it is
        # evaluated once per subword rather than once per feature, and the
        # regex is skipped entirely when punctuation is not being nulled.
        is_null = (
            null_punctuation
            and not re.sub(r'(\W)+', '', subword).strip()
        )
        for feat_name, feat_values in src_feats.items():
            inferred_feats[feat_name].append(
                "<null>" if is_null else feat_values[word_id])

    # Overwrite only the features that were actually inferred; if no
    # subword was processed the original feature entries are left as-is.
    src_feats.update(inferred_feats)

    return example
def test_subword_group_joiner(self):
    """Check the subword map for joiner-annotated tokens (default marker).

    Each pair below is (subword, expected index). Tokens carrying the
    joiner symbol are expected to share the index of a neighbouring token.
    """
    token_to_index = [
        ('however', 0),
        ('■,', 0),
        ('according', 1),
        ('to', 2),
        ('the', 3),
        ('logs', 4),
        ('■,', 4),
        ('she', 5),
        ('is', 6),
        ('hard', 7),
        ('■-■', 7),
        ('working', 7),
        ('■.', 7),
    ]
    tokens = [token for token, _ in token_to_index]
    expected = [index for _, index in token_to_index]

    self.assertEqual(subword_map_by_joiner(tokens), expected)
def test_subword_group_naive(self):
    """Check that tokens without any joiner symbol each get their own index.

    The expected map is simply the position of every token: 0, 1, ..., 12.
    """
    tokens = [
        'however',
        ',',
        'according',
        'to',
        'the',
        'logs',
        ',',
        'she',
        'is',
        'hard',
        '-',
        'working',
        '.',
    ]
    expected = list(range(len(tokens)))

    result = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
    self.assertEqual(result, expected)
def test_subword_group_joiner_with_case_markup_advanced(self):
    """Check the subword map when case-markup tokens sit inside a word.

    Each pair below is (subword, expected index). The case-markup
    placeholders are expected to share an index with adjacent subwords
    rather than receive one of their own.
    """
    token_to_index = [
        ('⦅mrk_case_modifier_C⦆', 0),
        ('dummy', 0),
        ('text', 1),
        ('⦅mrk_case_modifier_C⦆', 2),
        ('1■', 2),
        ('h■', 2),
        ('k', 2),
        ('⦅mrk_begin_case_region_U⦆', 3),
        ('th■', 3),
        ('⦅mrk_end_case_region_U⦆', 3),
        ('n', 3),
        ('more', 4),
        ('dummy', 5),
        ('text', 6),
    ]
    tokens = [token for token, _ in token_to_index]
    expected = [index for _, index in token_to_index]

    result = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
    self.assertEqual(result, expected)
def test_subword_group_joiner_with_case_markup(self):
    """Check the subword map for joiner tokens mixed with case markup.

    Each pair below is (subword, expected index). Both the joiner-marked
    punctuation and the case-markup placeholders are expected to share an
    index with a neighbouring token.
    """
    token_to_index = [
        ('⦅mrk_case_modifier_C⦆', 0),
        ('however', 0),
        ('■,', 0),
        ('according', 1),
        ('to', 2),
        ('the', 3),
        ('logs', 4),
        ('■,', 4),
        ('⦅mrk_begin_case_region_U⦆', 5),
        ('she', 5),
        ('is', 6),
        ('hard', 7),
        ('■-■', 7),
        ('working', 7),
        ('⦅mrk_end_case_region_U⦆', 7),
        ('■.', 7),
    ]
    tokens = [token for token, _ in token_to_index]
    expected = [index for _, index in token_to_index]

    result = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
    self.assertEqual(result, expected)
def test_subword_group_joiner_prior_tokenization(self):
    """Check the subword map when the original token list is also supplied.

    Each pair below is (subword, expected index); the expected indices run
    from 0 to 10, matching the eleven entries of ``original_tokens`` that
    are passed as ``original_subwords``.
    """
    token_to_index = [
        ('⦅mrk_case_modifier_C⦆', 0),
        ('how■', 0),
        ('ever', 0),
        ('■,', 1),
        ('according', 2),
        ('to', 3),
        ('the', 4),
        ('logs', 5),
        ('■,', 6),
        ('⦅mrk_begin_case_region_U⦆', 7),
        ('she', 7),
        ('is', 8),
        ('hard', 9),
        ('■-■', 9),
        ('working', 9),
        ('⦅mrk_end_case_region_U⦆', 9),
        ('■.', 10),
    ]
    original_tokens = [
        'However',
        '■,',
        'according',
        'to',
        'the',
        'logs',
        '■,',
        'SHE',
        'IS',
        'HARD-WORKING',
        '■.',
    ]
    tokens = [token for token, _ in token_to_index]
    expected = [index for _, index in token_to_index]

    result = subword_map_by_joiner(
        tokens,
        marker=SubwordMarker.JOINER,
        original_subwords=original_tokens,
    )
    self.assertEqual(result, expected)