def test_keep_dims(self):
    indices = np.array([
      [[[18, 17, 36, 18, 17, -1, -1, -1, -1, -1],
        [15, 3, 28, 31, 7, 15, 3, 28, 27, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]],
       [[33, 32, 17, 23, 37, 4, 18, 2, 10, -1],
        [0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [14, 4, 0, 1, -1, -1, -1, -1, -1, -1]]],
      [[[13, 35, 29, 16, 6, 12, -1, -1, -1, -1],
        [34, 6, 8, -1, -1, -1, -1, -1, -1, -1],
        [21, 26, 9, -1, -1, -1, -1, -1, -1, -1]],
       [[22, 25, 20, -1, -1, -1, -1, -1, -1, -1],
        [19, 30, 0, 11, -1, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]]
    ])
    lower_case_diff = [
      [
        [['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]'],
         ['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]']],
        [['[["i", 0, 1, "T"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 2, "OK"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 1, "H"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]']]
      ],
      [
        [['[["i", 0, 1, "E"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 1, "U"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 1, "L"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]']],
        [['[["i", 0, 1, "L"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'],
         ['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]']]
      ]
    ]
    tokenize_diff = [
      [['[["d", 16, 21, ""]]',
        '[["i", 0, 0, " "], ["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]',
        '[["d", 0, 9, ""]]'],
       ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""]]',
        '[["i", 0, 0, " "], ["d", 2, 11, ""]]',
        '[["i", 0, 0, " "], ["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]']],
      [['[["d", 30, 34, ""]]',
        '[["i", 0, 0, " "], ["d", 14, 21, ""]]',
        '[["i", 0, 0, " "], ["d", 20, 27, ""]]'],
       ['[["d", 12, 19, ""]]',
        '[["i", 0, 0, " "], ["d", 21, 27, ""]]',
        '[["d", 0, 9, ""]]']]
    ]
    missing_vals = np.array([
      [[[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']],
       [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'ok', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'you', u'', u'', u'', u'', u'', u'', u'']]],
      [[[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']],
       [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'you', u'', u'', u'', u'', u'', u'', u''],
        [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']]]
    ], dtype='|S40')
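    # These expected tap values appear to carry everything needed to invert the
    # transform: `indices` holds vocabulary ids padded with -1 up to max_sent_len,
    # the *_diff entries look like JSON edit scripts ("i" = insert, "d" = delete)
    # recording what lower-casing and tokenization changed, and `missing_vals`
    # keeps tokens that fell outside the vocabulary (e.g. 'ok', 'you').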
    strings = np.array([
      [
        "It is what it is. I've seen summer and I've seen rain",
        "The sun is not yellow, it's chicken. OK. Hey, you!"
      ],
      [
        'Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
        'Look out kid. Its something you did.'
      ]
    ], dtype=np.str_)  # np.unicode was removed from modern NumPy; np.str_ is the equivalent dtype
    index_to_word = self._get_index_to_word(np.char.lower(strings), en_tokenizer)

    index_to_word = ['__UNK__'] + index_to_word[:-1]

    string_trans = st.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      lower_case=True,
      unk_index=0,
      max_sent_len=10,
      name='ST'
    )
    trans = dct.DocumentTransform(
      sent_tokenizer=lambda s: s.split('.'),
      string_transform=string_trans,
    )
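    # DocumentTransform wraps the StringTransform and first splits each document into
    # sentences with the supplied sent_tokenizer (a naive split on '.'), which is why
    # the expected tap values above carry an extra sentence dimension.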
    # for i in xrange(2):
    self.pour_pump(
      trans,
      strings,
      {
        'ST/indices': indices,
        'ST/missing_vals': missing_vals,
        'ST/tokenize_diff': tokenize_diff,
        'ST/lower_case_diff': lower_case_diff

      },
      test_type=False
    )

    self.write_read_example(trans, strings, self.temp_dir)
    trans = self.write_read(trans, self.temp_dir)
Example #2
    def test_ja_normalize(self):

        indices = np.array(
            [[[12, 13, 16, 17, 22, 7, 21, 19, 8, 14, 15, 2, 20, -1, -1]],
             [[5, 11, 3, 0, 10, 14, 18, 6, 4, 9, 1, -1, -1, -1, -1]]])
        strings = np.array([[u'チラシ・勧誘印刷物の無断投函は一切お断り'],
                            [u'すみませんが、もう一度どお願いします。']])
        tokenize_diff = [['[]'], ['[]']]
        missing_vals = np.array([[[
            u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'',
            u'', u''
        ]],
                                 [[
                                     u'', u'', u'', u'', u'', u'', u'', u'',
                                     u'', u'', u'', u'', u'', u'', u''
                                 ]]],
                                dtype='|U20')
        index_to_word = self._get_index_to_word(strings,
                                                ja_tokenizer) + ['__UNK__']
        trans = n.StringTransform(word_tokenizer=ja_tokenizer,
                                  index_to_word=index_to_word,
                                  word_detokenizer=lambda a: ''.join(a),
                                  max_sent_len=15,
                                  unk_index=len(index_to_word) - 1)
        trans.calc_global_values(strings)
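        # Two passes: the second iteration repeats the pour/pump checks on a
        # transform that has been written to disk and read back (write_read below).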
        for i in range(2):
            self.pour_pump(trans,
                           strings, {
                               'indices': indices,
                               'missing_vals': missing_vals,
                               'tokenize_diff': tokenize_diff,
                           },
                           test_type=False)
            trans = self.write_read(trans, self.temp_dir)
Example #3
    def _from_save_dict(self, save_dict):
        """Reconstruct the transform object from the dictionary of attributes."""
        for key in self.attribute_dict:
            if key == 'string_transform':
                continue
            setattr(self, key, save_dict[key])

        trans_save_dict = save_dict['string_transform']
        trans = st.StringTransform(save_dict=trans_save_dict)
        setattr(self, 'string_transform', trans)
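        # The nested StringTransform is rebuilt from its own saved attribute dict via
        # its constructor, rather than copied over like the plain attributes above.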
Example #4
    def test_ja_normalize_2(self):

        indices = np.array(
            [[[2, 1, 15, 16, 19, 20, 25, 10, 24, 22, 11, 17, 18, 5, 23]],
             [[8, 14, 6, 3, 13, 17, 21, 9, 7, 12, 4, -1, -1, -1, -1]]])
        strings = np.array([[u'20チラシ・勧誘印刷物の無断投函は一切お断り'],
                            [u'すみませんが、もう一度どお願いします。']])
        tokenize_diff = [['[]'], ['[]']]
        missing_vals = np.array([[[
            u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'',
            u'', u''
        ]],
                                 [[
                                     u'', u'', u'', u'', u'', u'', u'', u'',
                                     u'', u'', u'', u'', u'', u'', u''
                                 ]]],
                                dtype='|U20')
        index_to_word = ['[UNK]'] + self._get_index_to_word(
            strings, ja_tokenizer, half_width=True)
        half_width_diff = [[[
            '[["i", 0, 1, "\\uff12"]]', '[["i", 0, 1, "\\uff10"]]', '[]', '[]',
            '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]'
        ]],
                           [[
                               '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]', '[]', '[]'
                           ]]]
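        # half_width_diff records the insertions needed to restore full-width
        # characters: "\uff12" and "\uff10" are the full-width digits '２' and '０',
        # suggesting the leading "20" of the first string was originally full-width.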
        trans = n.StringTransform(
            name='',
            word_tokenizer=ja_tokenizer,
            index_to_word=index_to_word,
            word_detokenizer=lambda a: ''.join(a),
            half_width=True,
            max_sent_len=15,
        )
        trans.calc_global_values(strings)
        for i in range(2):
            self.pour_pump(trans,
                           strings, {
                               'indices': indices,
                               'missing_vals': missing_vals,
                               'tokenize_diff': tokenize_diff,
                               'half_width_diff': half_width_diff
                           },
                           test_type=False)
            trans = self.write_read(trans, self.temp_dir)
Example #5
    def test_array_iter(self):
        indices = np.array([[[0, 10, 17, 11, 10, 3, -1, -1, -1, -1]],
                            [[7, 2, 4, 1, 12, 9, 14, 16, 18, 3]],
                            [[6, 15, 10, 13, 19, 2, 11, 1, 8, 3]]])

        strings = np.array([["It is what it is."],
                            ["Whatever, Bob's mother has seen the world."],
                            ["The sun is not yellow, it's chicken. OK."]])
        tokenize_diff = [
            ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
            ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
            [
                '[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]'
            ]
        ]
        missing_vals = np.array([[['It', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']]],
                                dtype='|S42')
        # index_to_word = ['[UNK]'] + self._get_index_to_word(strings, en_tokenizer)
        trans = n.StringTransform(
            # index_to_word=index_to_word,
            word_tokenizer=en_tokenizer,
            name='string_transform',
            max_sent_len=10,
            max_vocab_size=20)

        array_iter = [strings[0:1], strings[1:2], strings[2:3]]

        trans.calc_global_values(data_iter=array_iter)
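        # Global values are computed from an iterator of batches rather than a single
        # array; with no index_to_word supplied, the vocabulary is presumably built
        # here, capped at max_vocab_size=20.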
        for i in range(2):
            self.pour_pump(trans,
                           strings, {
                               'string_transform/indices': indices,
                               'string_transform/missing_vals': missing_vals,
                               'string_transform/tokenize_diff': tokenize_diff,
                           },
                           test_type=False)
            trans = self.write_read(trans, self.temp_dir)
Example #6
    def test_en_normalize(self):
        indices = np.array([[[4, 10, 17, 11, 10, 2, -1, -1, -1, -1]],
                            [[7, 1, 3, 0, 12, 9, 14, 16, 18, 2]],
                            [[6, 15, 10, 13, 19, 1, 11, 0, 8, 2]]])

        strings = np.array([["It is what it is."],
                            ["Whatever, Bob's mother has seen the world."],
                            ["The sun is not yellow, it's chicken. OK."]])
        tokenize_diff = [
            ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
            ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
            [
                '[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]'
            ]
        ]
        missing_vals = np.array([[['', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']]],
                                dtype='|S42')
        index_to_word = self._get_index_to_word(strings,
                                                en_tokenizer) + ['__UNK__']
        trans = n.StringTransform(index_to_word=index_to_word,
                                  word_tokenizer=en_tokenizer,
                                  name='string_transform',
                                  max_sent_len=10,
                                  unk_index=len(index_to_word) - 1)
        trans.calc_global_values(strings)
        for i in range(2):
            self.pour_pump(trans,
                           strings, {
                               'string_transform/indices': indices,
                               'string_transform/missing_vals': missing_vals,
                               'string_transform/tokenize_diff': tokenize_diff,
                           },
                           test_type=False)
            trans = self.write_read(trans, self.temp_dir)
Example #7
  def test_array(self):
    array = self._get_array()

    dataset_transform = tr.DatasetTransform(name='DT')
    dataset_transform.add_transform(
      cols=[0],
      transform=ct.CatTransform(
        name='CAT',
        norm_mode='mean_std',
        index_to_cat_val=sorted(np.unique(array[:, 0: 1])),
        input_dtype=np.dtype('U')
      )
    )
    dataset_transform.add_transform(
      cols=[1, 2, 3],
      transform=dt.DateTimeTransform(
        name='DATE',
        norm_mode='min_max',
        fill_nat_func=lambda a: np.array(datetime.datetime(1950, 1, 1)),
        input_dtype=np.datetime64,
      )
    )
    dataset_transform.add_transform(
      cols=[4, 5, 6],
      transform=nt.NumTransform(
        name='NUM',
        norm_mode='min_max',
        input_dtype=np.float64,
        fill_nan_func=lambda a: np.array(0),
      )
    )
    dataset_transform.add_transform(
      cols=[7, 8],
      transform=st.StringTransform(
        name='STRING',
        word_tokenizer=en_tokenizer,
        index_to_word=['__UNK__'] + self._get_index_to_word(array[:, 7:9], en_tokenizer),
        input_dtype=np.dtype('U'),
        max_sent_len=20
      )
    )
    dataset_transform.calc_global_values(data=array)
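    # calc_global_values presumably computes the per-transform statistics needed for
    # normalization (mean/std for CAT, min/max for DATE and NUM) and any vocabulary
    # bookkeeping before the data is poured below.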

    for i in range(2):
      self.pour_pump(
        dataset_transform,
        array,
        {
          'DT/CAT/indices': [[0], [1], [2], [0]],
          'DT/CAT/missing_vals': np.array([[u''], [u''], [u''], [u'']]),
          'DT/CAT/one_hots':  [[[1.0, -0.5773502691896258, -0.5773502691896258]], [[-1.0, 1.7320508075688774, -0.5773502691896258]], [[-1.0, -0.5773502691896258, 1.7320508075688774]], [[1.0, -0.5773502691896258, -0.5773502691896258]]],
          'DT/DATE/nats':  [[False, False, True], [False, True, True], [False, False, True], [False, False, True]],
          'DT/DATE/diff': np.array([[datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)], [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)], [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)], [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)]], dtype='timedelta64[us]'),
          'DT/DATE/nums': np.array([[0.9976643838327857, 0.9976643838327857, 0.0], [0.9977039705474843, 0.0, 0.0], [0.9977435572621828, 0.9988915719884407, 0.0], [0.9976643838327857, 1.0, 0.0]]),
          'DT/NUM/nans': [[False, False, True], [False, True, True], [False, False, True], [False, False, True]],
          'DT/NUM/nums': [[0.09090909090909091, 0.18181818181818182, 0.0], [0.36363636363636365, 0.0, 0.0], [0.6363636363636364, 0.7272727272727273, 0.0], [0.9090909090909091, 1.0, 0.0]],
          'DT/STRING/indices': [[[9, 29, 50, 30, 29, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [6, 38, 2, 23, 49, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]], [[7, 16, 43, 28, 49, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [5, 26, 53, 31, 22, 50, 8, 46, 42, 15, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1]], [[12, 41, 29, 34, 54, 2, 30, 1, 18, 3, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1], [13, 21, 45, 39, 27, 14, 20, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]], [[4, 32, 33, 14, 48, 44, 31, 51, 47, 43, 52, 17, 3, -1, -1, -1, -1, -1, -1, -1], [11, 1, 24, 19, 36, 43, 40, 35, 25, 37, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1]]],
          'DT/STRING/missing_vals': np.array([[[u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''], [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']], [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''], [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']], [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''], [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']], [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''], [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']]], dtype='|U10'),
          'DT/STRING/tokenize_diff': np.array([['[["d", 16, 17, ""], ["d", 18, 32, ""]]', '[["d", 8, 9, ""], ["d", 19, 20, ""], ["d", 21, 35, ""]]'], ['[["d", 24, 25, ""], ["d", 26, 40, ""]]', '[["d", 58, 59, ""], ["d", 60, 69, ""]]'], ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""], ["d", 42, 43, ""], ["d", 44, 52, ""]]', '[["d", 35, 36, ""], ["d", 37, 49, ""]]'], ['[["d", 2, 3, ""], ["d", 57, 58, ""], ["d", 59, 66, ""]]', '[["d", 3, 4, ""], ["d", 45, 46, ""], ["d", 47, 56, ""]]']], dtype='|U95'),
          # 'DT/Partition_0/tubes/missing_cols': np.array([1, 2]),
          # 'DT/Partition_0/tubes/missing_array': np.array([['b', 'None', 'b', 'c'], [1.0, 2.0, np.nan, 1.0]], dtype=np.object),
        }
      )
      # self.write_read_example(dataset_transform, array, self.temp_dir)
      dataset_transform = self.write_read(dataset_transform, self.temp_dir)
Example #8
    def test_read_write(self):
        indices = np.array([[[16, 15, 27, 16, 15, 5, -1, -1, -1, -1],
                             [13, 3, 20, 22, 7, 13, 3, 20, 19, -1]],
                            [[24, 23, 15, 17, 28, 4, 16, 2, 9, 5],
                             [12, 4, 0, 1, -1, -1, -1, -1, -1, -1]],
                            [[11, 26, 21, 14, 6, 10, 5, -1, -1, -1],
                             [25, 6, 8, -1, -1, -1, -1, -1, -1, -1]]])
        lower_case_diff = [[[
            '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]',
            '[]', '[]'
        ],
                            [
                                '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]',
                                '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]'
                            ]],
                           [[
                               '[["i", 0, 1, "T"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                            [
                                '[["i", 0, 1, "H"]]', '[]', '[]', '[]', '[]',
                                '[]', '[]', '[]', '[]', '[]'
                            ]],
                           [[
                               '[["i", 0, 1, "E"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                            [
                                '[["i", 0, 1, "U"]]', '[]', '[]', '[]', '[]',
                                '[]', '[]', '[]', '[]', '[]'
                            ]]]
        tokenize_diff = [
            [
                '[["d", 16, 17, ""], ["d", 18, 22, ""]]',
                '[["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]'
            ],
            [
                '[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]',
                '[["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]'
            ],
            ['[["d", 30, 31, ""], ["d", 32, 35, ""]]', '[["d", 14, 21, ""]]']
        ]
        missing_vals = np.array(
            [[[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
              [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']],
             [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
              [u'', u'', u'you', u'', u'', u'', u'', u'', u'', u'']],
             [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
              [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']]],
            dtype='|S40')
        strings = np.array(
            [["It is what it is.", "I've seen summer and I've seen rain"],
             ["The sun is not yellow, it's chicken. OK.", "Hey, you!"],
             ['Ended up sleeping in a doorway.', 'Under a bodega']],
            dtype=np.str_)
        index_to_word = self._get_index_to_word(np.char.lower(strings),
                                                en_tokenizer)

        index_to_word = ['__UNK__'] + index_to_word[:-1]

        trans = n.StringTransform(
            index_to_word=index_to_word,
            word_tokenizer=en_tokenizer,
            lower_case=True,
            unk_index=0,
            max_sent_len=10,
        )
        trans.calc_global_values(strings)
        # for i in xrange(2):
        self.pour_pump(trans,
                       strings, {
                           'indices': indices,
                           'missing_vals': missing_vals,
                           'tokenize_diff': tokenize_diff,
                           'lower_case_diff': lower_case_diff
                       },
                       test_type=False)

        self.write_read_example(trans, strings, self.temp_dir)
        trans = self.write_read(trans, self.temp_dir)
Example #9
    def test_en_normalize_2(self):

        indices = np.array([[[8, 4, 15, 8, 4, 3, -1, -1, -1, -1]],
                            [[16, 2, 5, 1, 9, 7, 12, 14, 17, 3]],
                            [[14, 13, 4, 10, 18, 2, 8, 1, 6, 3]]])
        lemmatize_diff = [[[
            '[]', '[["i", 0, 2, "is"]]', '[]', '[]', '[["i", 0, 2, "is"]]',
            '[]', '[]', '[]', '[]', '[]'
        ]],
                          [[
                              '[]', '[]', '[]', '[]', '[]',
                              '[["i", 2, 4, "s"]]', '[["i", 3, 3, "n"]]', '[]',
                              '[]', '[]'
                          ]],
                          [[
                              '[]', '[]', '[["i", 0, 2, "is"]]', '[]', '[]',
                              '[]', '[]', '[]', '[]', '[]'
                          ]]]
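        # lemmatize_diff appears to store the character edits that turn each lemma
        # back into its surface form, e.g. "be" -> "is", "have" -> "has",
        # "see" -> "seen", so lemmatization stays reversible.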
        lower_case_diff = [[[
            '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]',
            '[]', '[]'
        ]],
                           [[
                               '[["i", 0, 1, "W"]]', '[]',
                               '[["i", 0, 1, "B"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]'
                           ]],
                           [[
                               '[["i", 0, 1, "T"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ]]]
        tokenize_diff = [
            ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
            ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
            [
                '[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]'
            ]
        ]
        missing_vals = np.array([[['', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']],
                                 [['', '', '', '', '', '', '', '', '', '']]],
                                dtype='|S8')
        strings = np.array([["It is what it is."],
                            ["Whatever, Bob's mother has seen the world."],
                            ["The sun is not yellow, it's chicken. OK."]])
        index_to_word = self._get_index_to_word(np.char.lower(strings),
                                                en_tokenizer, basic_lemmatizer)
        index_to_word = ['__UNK__'] + index_to_word
        trans = n.StringTransform(
            index_to_word=index_to_word,
            word_tokenizer=en_tokenizer,
            lemmatize=True,
            lemmatizer=basic_lemmatizer,
            lower_case=True,
            unk_index=0,
            max_sent_len=10,
        )
        trans.calc_global_values(strings)
        for i in range(2):
            self.pour_pump(trans,
                           strings, {
                               'indices': indices,
                               'lower_case_diff': lower_case_diff,
                               'lemmatize_diff': lemmatize_diff,
                               'missing_vals': missing_vals,
                               'tokenize_diff': tokenize_diff,
                           },
                           test_type=False)
            trans = self.write_read(trans, self.temp_dir)
Example #10
    def test_remove_dims(self):
        indices = np.array([[18, 17, 36, 18, 17, -1, -1, -1, -1, -1],
                            [15, 3, 28, 31, 7, 15, 3, 28, 27, -1],
                            [33, 32, 17, 23, 37, 4, 18, 2, 10, -1],
                            [0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                            [14, 4, 0, 1, -1, -1, -1, -1, -1, -1],
                            [13, 35, 29, 16, 6, 12, -1, -1, -1, -1],
                            [34, 6, 8, -1, -1, -1, -1, -1, -1, -1],
                            [21, 26, 9, -1, -1, -1, -1, -1, -1, -1],
                            [22, 25, 20, -1, -1, -1, -1, -1, -1, -1],
                            [19, 30, 0, 11, -1, -1, -1, -1, -1, -1],
                            [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])
        lower_case_diff = [[
            '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]',
            '[]', '[]'
        ],
                           [
                               '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]',
                               '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "T"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 2, "OK"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "H"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "E"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "U"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "L"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "L"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]',
                               '[]', '[]', '[]', '[]', '[]'
                           ],
                           [
                               '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]',
                               '[]', '[]'
                           ]]
        tokenize_diff = [
            '[["d", 16, 21, ""]]',
            '[["i", 0, 0, " "], ["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]',
            '[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""]]',
            '[["i", 0, 0, " "], ["d", 2, 11, ""]]',
            '[["i", 0, 0, " "], ["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]',
            '[["d", 30, 34, ""]]', '[["i", 0, 0, " "], ["d", 14, 21, ""]]',
            '[["i", 0, 0, " "], ["d", 20, 27, ""]]', '[["d", 12, 19, ""]]',
            '[["i", 0, 0, " "], ["d", 21, 27, ""]]', '[["d", 0, 9, ""]]'
        ]
        missing_vals = np.array(
            [[u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'ok', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'you', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'you', u'', u'', u'', u'', u'', u'', u''],
             [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'']],
            dtype='|S40')
        strings = np.array([
            [
                "It is what it is. I've seen summer and I've seen rain",
                "The sun is not yellow, it's chicken. OK. Hey, you!"
            ],
            [
                'Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
                'Look out kid. Its something you did.'
            ]
        ],
                           dtype=np.str_)
        index_to_word = self._get_index_to_word(np.char.lower(strings),
                                                en_tokenizer)
        ids = np.array([
            'f46a7d9b0971a571b019aef7397ba54de6acf73518609b67d2ac1dee',
            'f46a7d9b0971a571b019aef7397ba54de6acf73518609b67d2ac1dee',
            'dd1ff31c7034df9b181dec11a0ae633bf9bee80662d76e1b5f655c2e',
            'dd1ff31c7034df9b181dec11a0ae633bf9bee80662d76e1b5f655c2e',
            'dd1ff31c7034df9b181dec11a0ae633bf9bee80662d76e1b5f655c2e',
            'af78931ab7820443f0986de9ef1f276363014d89b9dd587f16b5f3e5',
            'af78931ab7820443f0986de9ef1f276363014d89b9dd587f16b5f3e5',
            'af78931ab7820443f0986de9ef1f276363014d89b9dd587f16b5f3e5',
            '52c66ddc2885dd3f2d30d8fbca09abab17e5d512dfd28c13f9a4bf1d',
            '52c66ddc2885dd3f2d30d8fbca09abab17e5d512dfd28c13f9a4bf1d',
            '52c66ddc2885dd3f2d30d8fbca09abab17e5d512dfd28c13f9a4bf1d'
        ])
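        # `ids` repeats one hash per source string: with keep_dims=False the
        # sentences are flattened into a single axis, and these ids presumably
        # let the pump step regroup them into their original documents.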
        index_to_word = ['__UNK__'] + index_to_word[:-1]

        string_trans = st.StringTransform(index_to_word=index_to_word,
                                          word_tokenizer=en_tokenizer,
                                          lower_case=True,
                                          unk_index=0,
                                          max_sent_len=10,
                                          name='ST')
        dts_trans = dct.DocumentToSentenceTransform(
            name='DTS',
            sent_tokenizer=lambda s: s.split('.'),
            sent_detokenizer=lambda s: '.'.join(s),
            keep_dims=False)
        trans = cht.ChainTransform(name='CT')
        trans.add_transform(dts_trans)
        trans.add_transform(string_trans, 'DTS/sentences')
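        # The ChainTransform pipes the 'DTS/sentences' tap into the StringTransform,
        # so the expected taps below are namespaced CT/DTS/... and CT/ST/....  With
        # keep_dims=False the sentence split is flattened, which is why `indices`
        # above is 2-D rather than 4-D as in test_keep_dims.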
        # for i in xrange(2):
        self.pour_pump(trans,
                       strings, {
                           'CT/ST/indices': indices,
                           'CT/ST/missing_vals': missing_vals,
                           'CT/ST/tokenize_diff': tokenize_diff,
                           'CT/ST/lower_case_diff': lower_case_diff,
                           'CT/DTS/ids': ids
                       },
                       test_type=False)

        self.write_read_example(trans, strings, self.temp_dir, num_cols=1)