示例#1
0
    def test_one_string(self):
        """Recovers source characters from post-NFKC offsets for one string.

        The input mixes CJK ideographs, which NFKC leaves alone, with fullwidth
        Latin capitals (3 UTF-8 bytes each), which NFKC folds to 1-byte ASCII.
        Byte offsets before and after normalization therefore diverge after
        the fourth character, so the offsets map has real work to do.
        """
        # Fullwidth "KADOKAWA" (U+FF2B, U+FF21, U+FF24, U+FF2F, ...), spelled
        # as escapes so text tooling cannot silently fold the letters to ASCII
        # and turn the normalization into a no-op.
        txt = [
            u"株式会社\uff2b\uff21\uff24\uff2f\uff2b\uff21\uff37\uff21",
        ]
        _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(
            txt, u"NFKC")

        # post_norm_txt = "株式会社KADOKAWA" (ASCII letters): 4 * 3 + 8 = 20
        # bytes. One (start, end) byte pair per normalized character, plus a
        # final (20, 21) pair that lies past the end of the normalized text.
        post_norm_offsets_starts = [[
            0, 3, 6, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20
        ]]
        post_norm_offsets_ends = [[
            3, 6, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
        ]]

        pre_norm_offsets_starts = normalize_ops.find_source_offsets(
            offsets_map, post_norm_offsets_starts)
        pre_norm_offsets_ends = normalize_ops.find_source_offsets(
            offsets_map, post_norm_offsets_ends)

        # Each mapped (start, end) pair must select the original, still
        # fullwidth, character. The trailing empty string is the expected
        # extraction for the past-the-end (20, 21) pair.
        expected_pre_norm_characters = [
            u"株", u"式", u"会", u"社", u"\uff2b", u"\uff21", u"\uff24",
            u"\uff2f", u"\uff2b", u"\uff21", u"\uff37", u"\uff21", u""
        ]
        self.assertAllEqual(
            self._extract_substrs(txt[0], pre_norm_offsets_starts,
                                  pre_norm_offsets_ends),
            [x.encode("utf-8") for x in expected_pre_norm_characters])
示例#2
0
 def test_tensor_input(self, txt_input, normalization_form,
                       post_norm_offsets, expected):
     """Checks the source offsets recovered for one input case.

     Normalizes `txt_input` with `normalization_form`, maps
     `post_norm_offsets` back through the resulting offsets map, and
     asserts that the mapped offsets equal `expected`.

     NOTE(review): the arguments are presumably supplied by a
     parameterized-test decorator outside this view -- confirm.
     """
     normalize_result = normalize_ops.normalize_utf8_with_offsets_map(
         txt_input, normalization_form)
     # Only the offsets map is needed; the normalized text is discarded.
     _, source_map = normalize_result
     actual = normalize_ops.find_source_offsets(source_map, post_norm_offsets)
     self.assertAllEqual(expected, actual)
示例#3
0
 def test_ragged_tensor_input(self, txt_input, normalization_form,
                              post_norm_offsets, expected):
     """Checks source-offset lookup when both inputs are ragged constants.

     `txt_input` and `post_norm_offsets` are first converted with
     `ragged_factory_ops.constant` (the offsets as int64); the offsets
     are then mapped back through the offsets map produced by
     normalization and compared against `expected`.

     NOTE(review): the arguments are presumably supplied by a
     parameterized-test decorator outside this view -- confirm.
     """
     ragged_txt = ragged_factory_ops.constant(txt_input)
     ragged_offsets = ragged_factory_ops.constant(
         post_norm_offsets, dtype="int64")
     # Only the offsets map is needed; the normalized text is discarded.
     _, source_map = normalize_ops.normalize_utf8_with_offsets_map(
         ragged_txt, normalization_form)
     actual = normalize_ops.find_source_offsets(source_map, ragged_offsets)
     self.assertAllEqual(expected, actual)
示例#4
0
 def test_string_ragged_dimension_higher_than_offsets_input(self):
   """Expects InvalidArgumentError for mismatched text/offset nesting.

   The text is built from a two-level nested list while the offsets are
   built from a three-level nested list; looking the offsets up and
   evaluating the result must raise `errors.InvalidArgumentError`.
   """
   rows = [
       ["株式会社"],
       [""],
       ["KADOKAWA"],
   ]
   txt = ragged_factory_ops.constant(rows)
   normalize_result = normalize_ops.normalize_utf8_with_offsets_map(
       txt, u"NFKC")
   _, source_map = normalize_result
   nested_offsets = [[[0, 1, 2]], [[]], [[0, 1, 2]]]
   post_norm_offsets = ragged_factory_ops.constant(
       nested_offsets, dtype="int64")
   # Both building the op and evaluating it stay inside the context
   # manager, so the error is accepted from either step.
   with self.assertRaises(errors.InvalidArgumentError):
     self.evaluate(
         normalize_ops.find_source_offsets(source_map, post_norm_offsets))
示例#5
0
  def test_sliced_offsets_map_and_input_offset(self):
    """Checks source-offset lookup on a single row sliced from a batch.

    Row 2 of the batch holds fullwidth Latin capitals. Each one occupies
    3 UTF-8 bytes in the source and 1 byte after NFKC folds it to ASCII,
    so post-normalization byte offsets 0, 1, 2 must map back to source
    byte offsets 0, 3, 6.
    """
    txt = ragged_factory_ops.constant([
        [u"株式会社"],
        [u""],
        # Fullwidth "KADOKAWA" (U+FF2B, U+FF21, U+FF24, U+FF2F, ...). With
        # plain ASCII letters NFKC is the identity and the expected
        # [[0, 3, 6]] below could never be produced. Escapes keep tooling
        # from silently folding the letters back to ASCII.
        [u"\uff2b\uff21\uff24\uff2f\uff2b\uff21\uff37\uff21"],
    ])
    _, offsets_map = normalize_ops.normalize_utf8_with_offsets_map(txt, u"NFKC")
    post_norm_offsets = ragged_factory_ops.constant(
        [[[0, 1, 2]], [[]], [[0, 1, 2]]], dtype="int64")

    # Slice the same row out of both the map and the query offsets.
    sliced_offsets_map = offsets_map[2]
    sliced_post_norm_offsets = post_norm_offsets[2]
    sliced_pre_norm_offsets = normalize_ops.find_source_offsets(
        sliced_offsets_map, sliced_post_norm_offsets)
    expected = [[0, 3, 6]]
    self.assertAllEqual(expected, sliced_pre_norm_offsets)