def test_one_string(self):
    """Maps per-character offsets of one NFKC-normalized string to its source.

    Start and end offsets into the normalized text are each translated with
    `find_source_offsets`; the substrings they delimit in the original input
    must equal the expected characters, UTF-8 encoded.
    """
    txt = [
        u"株式会社KADOKAWA",
    ]
    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
        txt, u"NFKC")

    # post_norm_txt = "株式会社KADOKAWA"
    # One (start, end) pair per character; the CJK characters span 3 units
    # each and the Latin letters 1, which looks like UTF-8 byte positions.
    starts_after_norm = [[0, 3, 6, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20]]
    ends_after_norm = [[3, 6, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]

    starts_in_source = normalize_ops.find_source_offsets(
        offsets_map, starts_after_norm)
    ends_in_source = normalize_ops.find_source_offsets(
        offsets_map, ends_after_norm)

    # The trailing (20, 21) pair is expected to yield an empty substring.
    expected_pre_norm_characters = [
        u"株", u"式", u"会", u"社",
        u"K", u"A", u"D", u"O", u"K", u"A", u"W", u"A",
        u"",
    ]

    actual_substrs = self._extract_substrs(
        txt[0], starts_in_source, ends_in_source)
    expected_substrs = [
        char.encode("utf-8") for char in expected_pre_norm_characters
    ]
    self.assertAllEqual(actual_substrs, expected_substrs)
def test_tensor_input(self, txt_input, normalization_form, post_norm_offsets,
                      expected):
    """Checks source offsets for inputs passed to the ops unchanged.

    Args:
      txt_input: Text handed to `normalize_utf8_with_offsets_map`.
      normalization_form: Normalization form passed alongside `txt_input`.
      post_norm_offsets: Offsets into the normalized text to map back.
      expected: Value the mapped source offsets must equal.

    NOTE(review): the arguments are presumably supplied by a parameterized
    decorator that is not visible in this chunk.
    """
    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
        txt_input, normalization_form)
    source_offsets = normalize_ops.find_source_offsets(
        offsets_map, post_norm_offsets)
    self.assertAllEqual(expected, source_offsets)
def test_ragged_tensor_input(self, txt_input, normalization_form,
                             post_norm_offsets, expected):
    """Checks source offsets when text and offsets are ragged constants.

    Args:
      txt_input: Nested text values, wrapped in a ragged constant before
        being normalized.
      normalization_form: Normalization form passed to the normalizer.
      post_norm_offsets: Nested offsets into the normalized text, wrapped in
        an int64 ragged constant.
      expected: Value the mapped source offsets must equal.

    NOTE(review): the arguments are presumably supplied by a parameterized
    decorator that is not visible in this chunk.
    """
    ragged_txt = ragged_factory_ops.constant(txt_input)
    ragged_offsets = ragged_factory_ops.constant(
        post_norm_offsets, dtype="int64")

    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
        ragged_txt, normalization_form)
    source_offsets = normalize_ops.find_source_offsets(
        offsets_map, ragged_offsets)

    self.assertAllEqual(expected, source_offsets)
def test_string_ragged_dimension_higher_than_offsets_input(self):
    """Expects InvalidArgumentError for this ragged text/offsets pairing.

    The offsets map comes from a ragged batch of three single-string rows,
    and the offsets carry one extra level of nesting.
    NOTE(review): the exact rank rule being violated is not visible from this
    test alone; confirm against `find_source_offsets`.
    """
    ragged_txt = ragged_factory_ops.constant([
        ["株式会社"],
        [""],
        ["KADOKAWA"],
    ])
    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
        ragged_txt, u"NFKC")
    nested_offsets = ragged_factory_ops.constant(
        [[[0, 1, 2]], [[]], [[0, 1, 2]]], dtype="int64")

    # Building the op and evaluating it both stay inside the context manager,
    # so the error is accepted from either step.
    with self.assertRaises(errors.InvalidArgumentError):
        failing_result = normalize_ops.find_source_offsets(
            offsets_map, nested_offsets)
        self.evaluate(failing_result)
def test_sliced_offsets_map_and_input_offset(self):
    """Checks `find_source_offsets` on one row sliced out of ragged inputs.

    A ragged batch of three single-string rows is normalized with NFKC; row 2
    of both the offsets map and the post-normalization offsets is then sliced
    out and mapped back to source offsets.
    """
    # The third row must be the *fullwidth* spelling of "KADOKAWA". NFKC folds
    # each fullwidth letter (3 bytes in UTF-8) to its 1-byte ASCII form, so
    # normalized offsets 0, 1, 2 correspond to source offsets 0, 3, 6. With a
    # plain ASCII row, normalization is the identity and the expectation below
    # could not hold. Escapes are used so tooling cannot silently fold the
    # literal back to ASCII.
    fullwidth_kadokawa = u"\uff2b\uff21\uff24\uff2f\uff2b\uff21\uff37\uff21"
    txt = ragged_factory_ops.constant([
        ["株式会社"],
        [""],
        [fullwidth_kadokawa],
    ])
    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
        txt, u"NFKC")
    post_norm_offsets = ragged_factory_ops.constant(
        [[[0, 1, 2]], [[]], [[0, 1, 2]]], dtype="int64")

    # Slice the same row from both operands so they describe the same string.
    sliced_offsets_map = offsets_map[2]
    sliced_post_norm_offsets = post_norm_offsets[2]
    sliced_pre_norm_offsets = normalize_ops.find_source_offsets(
        sliced_offsets_map, sliced_post_norm_offsets)

    expected = [[0, 3, 6]]
    self.assertAllEqual(expected, sliced_pre_norm_offsets)