def test_keep_dims(self):
  """DocumentTransform with keep_dims: outputs keep the (row, col, sentence) structure of the input documents."""
  indices = np.array([
      [[[18, 17, 36, 18, 17, -1, -1, -1, -1, -1],
        [15, 3, 28, 31, 7, 15, 3, 28, 27, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]],
       [[33, 32, 17, 23, 37, 4, 18, 2, 10, -1],
        [0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [14, 4, 0, 1, -1, -1, -1, -1, -1, -1]]],
      [[[13, 35, 29, 16, 6, 12, -1, -1, -1, -1],
        [34, 6, 8, -1, -1, -1, -1, -1, -1, -1],
        [21, 26, 9, -1, -1, -1, -1, -1, -1, -1]],
       [[22, 25, 20, -1, -1, -1, -1, -1, -1, -1],
        [19, 30, 0, 11, -1, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]]
  ])
  lower_case_diff = [
      [[['[["i", 0, 1, "I"]]'] + ['[]'] * 9,
        ['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]'],
        ['[]'] * 10],
       [['[["i", 0, 1, "T"]]'] + ['[]'] * 9,
        ['[["i", 0, 2, "OK"]]'] + ['[]'] * 9,
        ['[["i", 0, 1, "H"]]'] + ['[]'] * 9]],
      [[['[["i", 0, 1, "E"]]'] + ['[]'] * 9,
        ['[["i", 0, 1, "U"]]'] + ['[]'] * 9,
        ['[["i", 0, 1, "L"]]'] + ['[]'] * 9],
       [['[["i", 0, 1, "L"]]'] + ['[]'] * 9,
        ['[["i", 0, 1, "I"]]'] + ['[]'] * 9,
        ['[]'] * 10]]
  ]
  tokenize_diff = [
      [['[["d", 16, 21, ""]]',
        '[["i", 0, 0, " "], ["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]',
        '[["d", 0, 9, ""]]'],
       ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""]]',
        '[["i", 0, 0, " "], ["d", 2, 11, ""]]',
        '[["i", 0, 0, " "], ["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]']],
      [['[["d", 30, 34, ""]]',
        '[["i", 0, 0, " "], ["d", 14, 21, ""]]',
        '[["i", 0, 0, " "], ["d", 20, 27, ""]]'],
       ['[["d", 12, 19, ""]]',
        '[["i", 0, 0, " "], ["d", 21, 27, ""]]',
        '[["d", 0, 9, ""]]']]
  ]
  # All missing values are empty except the unknown tokens 'ok' and 'you'.
  missing_vals = np.array(
      [[[[u''] * 10, [u''] * 10, [u''] * 10],
        [[u''] * 10, [u'ok'] + [u''] * 9, [u'', u'', u'you'] + [u''] * 7]],
       [[[u''] * 10, [u''] * 10, [u''] * 10],
        [[u''] * 10, [u'', u'', u'you'] + [u''] * 7, [u''] * 10]]],
      dtype='|S40')
  strings = np.array([
      [
          "It is what it is. I've seen summer and I've seen rain",
          "The sun is not yellow, it's chicken. OK. Hey, you!"
      ],
      [
          'Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
          'Look out kid. Its something you did.'
      ]
  ], dtype=np.unicode)
  index_to_word = self._get_index_to_word(np.char.lower(strings), en_tokenizer)
  index_to_word = ['__UNK__'] + index_to_word[:-1]
  string_trans = st.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      lower_case=True,
      unk_index=0,
      max_sent_len=10,
      name='ST'
  )
  trans = dct.DocumentTransform(
      sent_tokenizer=lambda s: s.split('.'),
      string_transform=string_trans,
  )
  # for i in xrange(2):
  self.pour_pump(
      trans,
      strings,
      {
          'ST/indices': indices,
          'ST/missing_vals': missing_vals,
          'ST/tokenize_diff': tokenize_diff,
          'ST/lower_case_diff': lower_case_diff
      },
      test_type=False
  )
  self.write_read_example(trans, strings, self.temp_dir)
  trans = self.write_read(trans, self.temp_dir)
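# Illustrative sketch (not library code): how the (2, 2, 3, 10) shape of the
# keep_dims outputs above follows from the sentence tokenizer. Only the
# `strings` layout from test_keep_dims and plain Python are assumed here.
def _sketch_keep_dims_shape(self):
  strings = [
      ["It is what it is. I've seen summer and I've seen rain",
       "The sun is not yellow, it's chicken. OK. Hey, you!"],
      ['Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
       'Look out kid. Its something you did.']
  ]
  # Split every document on '.', exactly as sent_tokenizer does above.
  sent_counts = [[len(doc.split('.')) for doc in row] for row in strings]
  # sent_counts == [[2, 3], [3, 3]]; keep_dims pads every document to the
  # maximum sentence count (3) and every sentence to max_sent_len=10 tokens,
  # which is why indices, missing_vals and the diffs are all (2, 2, 3, 10).
  max_sents = max(max(row) for row in sent_counts)
  assert sent_counts == [[2, 3], [3, 3]]
  assert max_sents == 3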
def test_ja_normalize(self):
  """StringTransform over Japanese text using ja_tokenizer."""
  indices = np.array(
      [[[12, 13, 16, 17, 22, 7, 21, 19, 8, 14, 15, 2, 20, -1, -1]],
       [[5, 11, 3, 0, 10, 14, 18, 6, 4, 9, 1, -1, -1, -1, -1]]])
  strings = np.array([[u'チラシ・勧誘印刷物の無断投函は一切お断り'],
                      [u'すみませんが、もう一度どお願いします。']])
  tokenize_diff = [['[]'], ['[]']]
  missing_vals = np.array([[[u''] * 15], [[u''] * 15]], dtype='|U20')
  index_to_word = self._get_index_to_word(strings, ja_tokenizer) + ['__UNK__']
  trans = n.StringTransform(
      word_tokenizer=ja_tokenizer,
      index_to_word=index_to_word,
      word_detokenizer=lambda a: ''.join(a),
      max_sent_len=15,
      unk_index=len(index_to_word) - 1)
  trans.calc_global_values(strings)
  for i in xrange(2):
    self.pour_pump(
        trans,
        strings,
        {
            'indices': indices,
            'missing_vals': missing_vals,
            'tokenize_diff': tokenize_diff,
        },
        test_type=False)
    trans = self.write_read(trans, self.temp_dir)
def _from_save_dict(self, save_dict):
  """Reconstruct the transform object from the dictionary of attributes."""
  for key in self.attribute_dict:
    if key == 'string_transform':
      continue
    setattr(self, key, save_dict[key])
  # The nested StringTransform is rebuilt from its own save dictionary.
  trans_save_dict = save_dict['string_transform']
  trans = st.StringTransform(save_dict=trans_save_dict)
  setattr(self, 'string_transform', trans)
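# Illustrative sketch (assumption, not library code): the rough layout of the
# save_dict that _from_save_dict above consumes. The keys shown here are
# placeholders; the real set of keys comes from attribute_dict.
def _sketch_save_dict_layout(self):
  example_save_dict = {
      'name': 'document_transform',        # ordinary attribute, restored with setattr
      'sent_tokenizer': None,              # ordinary attribute, restored with setattr
      'string_transform': {'name': 'ST'},  # consumed by st.StringTransform(save_dict=...)
  }
  return example_save_dict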
def test_ja_normalize_2(self):
  """Japanese StringTransform with half_width normalization of full-width characters."""
  indices = np.array(
      [[[2, 1, 15, 16, 19, 20, 25, 10, 24, 22, 11, 17, 18, 5, 23]],
       [[8, 14, 6, 3, 13, 17, 21, 9, 7, 12, 4, -1, -1, -1, -1]]])
  # The leading '２０' is full-width; half_width=True normalizes it to '20'.
  strings = np.array([[u'２０チラシ・勧誘印刷物の無断投函は一切お断り'],
                      [u'すみませんが、もう一度どお願いします。']])
  tokenize_diff = [['[]'], ['[]']]
  missing_vals = np.array([[[u''] * 15], [[u''] * 15]], dtype='|U20')
  index_to_word = ['[UNK]'] + self._get_index_to_word(
      strings, ja_tokenizer, half_width=True)
  half_width_diff = [
      [['[["i", 0, 1, "\\uff12"]]', '[["i", 0, 1, "\\uff10"]]'] + ['[]'] * 13],
      [['[]'] * 15]
  ]
  trans = n.StringTransform(
      name='',
      word_tokenizer=ja_tokenizer,
      index_to_word=index_to_word,
      word_detokenizer=lambda a: ''.join(a),
      half_width=True,
      max_sent_len=15,
  )
  trans.calc_global_values(strings)
  for i in xrange(2):
    self.pour_pump(
        trans,
        strings,
        {
            'indices': indices,
            'missing_vals': missing_vals,
            'tokenize_diff': tokenize_diff,
            'half_width_diff': half_width_diff
        },
        test_type=False)
    trans = self.write_read(trans, self.temp_dir)
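# Small sketch of the full-width -> half-width normalization that
# half_width=True is checked against above. NFKC via unicodedata is one
# standard way to do it; whether the library uses exactly this call is an
# assumption made only for illustration.
def _sketch_half_width(self):
  import unicodedata
  full_width = u'\uff12\uff10'  # '２０', as in the first test string
  half_width = unicodedata.normalize('NFKC', full_width)
  assert half_width == u'20'
  # half_width_diff stores the per-token edits needed to undo this, e.g.
  # [["i", 0, 1, "\uff12"]] restores the full-width '２' in token 0.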
def test_array_iter(self):
  """Build the vocabulary from an iterator of array chunks, capped by max_vocab_size."""
  indices = np.array([[[0, 10, 17, 11, 10, 3, -1, -1, -1, -1]],
                      [[7, 2, 4, 1, 12, 9, 14, 16, 18, 3]],
                      [[6, 15, 10, 13, 19, 2, 11, 1, 8, 3]]])
  strings = np.array([["It is what it is."],
                      ["Whatever, Bob's mother has seen the world."],
                      ["The sun is not yellow, it's chicken. OK."]])
  tokenize_diff = [
      ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
      ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
      ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]']
  ]
  # 'It' falls outside the capped vocabulary, so it shows up as a missing value.
  missing_vals = np.array([[['It'] + [''] * 9],
                           [[''] * 10],
                           [[''] * 10]], dtype='|S42')
  # index_to_word = ['[UNK]'] + self._get_index_to_word(strings, en_tokenizer)
  trans = n.StringTransform(
      # index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      name='string_transform',
      max_sent_len=10,
      max_vocab_size=20)
  array_iter = [strings[0:1], strings[1:2], strings[2:3]]
  trans.calc_global_values(data_iter=array_iter)
  for i in xrange(2):
    self.pour_pump(
        trans,
        strings,
        {
            'string_transform/indices': indices,
            'string_transform/missing_vals': missing_vals,
            'string_transform/tokenize_diff': tokenize_diff,
        },
        test_type=False)
    trans = self.write_read(trans, self.temp_dir)
def test_en_normalize(self):
  """Basic English StringTransform: tokenize, index, and record the reconstruction diffs."""
  indices = np.array([[[4, 10, 17, 11, 10, 2, -1, -1, -1, -1]],
                      [[7, 1, 3, 0, 12, 9, 14, 16, 18, 2]],
                      [[6, 15, 10, 13, 19, 1, 11, 0, 8, 2]]])
  strings = np.array([["It is what it is."],
                      ["Whatever, Bob's mother has seen the world."],
                      ["The sun is not yellow, it's chicken. OK."]])
  tokenize_diff = [
      ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
      ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
      ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]']
  ]
  missing_vals = np.array([[[''] * 10], [[''] * 10], [[''] * 10]], dtype='|S42')
  index_to_word = self._get_index_to_word(strings, en_tokenizer) + ['__UNK__']
  trans = n.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      name='string_transform',
      max_sent_len=10,
      unk_index=len(index_to_word) - 1)
  trans.calc_global_values(strings)
  for i in xrange(2):
    self.pour_pump(
        trans,
        strings,
        {
            'string_transform/indices': indices,
            'string_transform/missing_vals': missing_vals,
            'string_transform/tokenize_diff': tokenize_diff,
        },
        test_type=False)
    trans = self.write_read(trans, self.temp_dir)
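# Minimal stand-in (not the library's implementation) for the indices /
# missing_vals bookkeeping exercised above: unknown tokens map to unk_index
# and their surface form is kept so the mapping stays reversible. All names
# in this sketch are local illustrations.
def _sketch_unk_bookkeeping(self):
  index_to_word = ['the', 'sun', '__UNK__']
  unk_index = len(index_to_word) - 1
  word_to_index = {w: i for i, w in enumerate(index_to_word)}
  tokens = ['the', 'moon']
  indices = [word_to_index.get(t, unk_index) for t in tokens]
  missing_vals = [t if i == unk_index else '' for t, i in zip(tokens, indices)]
  assert indices == [0, 2]
  assert missing_vals == ['', 'moon']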
def test_array(self):
  """DatasetTransform routing columns to categorical, datetime, numerical and string sub-transforms."""
  array = self._get_array()
  dataset_transform = tr.DatasetTransform(name='DT')
  dataset_transform.add_transform(
      cols=[0],
      transform=ct.CatTransform(
          name='CAT',
          norm_mode='mean_std',
          index_to_cat_val=sorted(np.unique(array[:, 0:1])),
          input_dtype=np.dtype('U')
      )
  )
  dataset_transform.add_transform(
      cols=[1, 2, 3],
      transform=dt.DateTimeTransform(
          name='DATE',
          norm_mode='min_max',
          fill_nat_func=lambda a: np.array(datetime.datetime(1950, 1, 1)),
          input_dtype=np.datetime64,
      )
  )
  dataset_transform.add_transform(
      cols=[4, 5, 6],
      transform=nt.NumTransform(
          name='NUM',
          norm_mode='min_max',
          input_dtype=np.float64,
          fill_nan_func=lambda a: np.array(0),
      )
  )
  dataset_transform.add_transform(
      cols=[7, 8],
      transform=st.StringTransform(
          name='STRING',
          word_tokenizer=en_tokenizer,
          index_to_word=['__UNK__'] + self._get_index_to_word(array[:, 7:9], en_tokenizer),
          input_dtype=np.dtype('U'),
          max_sent_len=20
      )
  )
  dataset_transform.calc_global_values(data=array)
  for i in xrange(2):
    self.pour_pump(
        dataset_transform,
        array,
        {
            'DT/CAT/indices': [[0], [1], [2], [0]],
            'DT/CAT/missing_vals': np.array([[u''], [u''], [u''], [u'']]),
            'DT/CAT/one_hots': [
                [[1.0, -0.5773502691896258, -0.5773502691896258]],
                [[-1.0, 1.7320508075688774, -0.5773502691896258]],
                [[-1.0, -0.5773502691896258, 1.7320508075688774]],
                [[1.0, -0.5773502691896258, -0.5773502691896258]]],
            'DT/DATE/nats': [
                [False, False, True],
                [False, True, True],
                [False, False, True],
                [False, False, True]],
            'DT/DATE/diff': np.array(
                [[datetime.timedelta(0)] * 3] * 4, dtype='timedelta64[us]'),
            'DT/DATE/nums': np.array([
                [0.9976643838327857, 0.9976643838327857, 0.0],
                [0.9977039705474843, 0.0, 0.0],
                [0.9977435572621828, 0.9988915719884407, 0.0],
                [0.9976643838327857, 1.0, 0.0]]),
            'DT/NUM/nans': [
                [False, False, True],
                [False, True, True],
                [False, False, True],
                [False, False, True]],
            'DT/NUM/nums': [
                [0.09090909090909091, 0.18181818181818182, 0.0],
                [0.36363636363636365, 0.0, 0.0],
                [0.6363636363636364, 0.7272727272727273, 0.0],
                [0.9090909090909091, 1.0, 0.0]],
            'DT/STRING/indices': [
                [[9, 29, 50, 30, 29, 3] + [-1] * 14,
                 [6, 38, 2, 23, 49, 3] + [-1] * 14],
                [[7, 16, 43, 28, 49, 3] + [-1] * 14,
                 [5, 26, 53, 31, 22, 50, 8, 46, 42, 15, 3] + [-1] * 9],
                [[12, 41, 29, 34, 54, 2, 30, 1, 18, 3, 10, 3] + [-1] * 8,
                 [13, 21, 45, 39, 27, 14, 20, 3] + [-1] * 12],
                [[4, 32, 33, 14, 48, 44, 31, 51, 47, 43, 52, 17, 3] + [-1] * 7,
                 [11, 1, 24, 19, 36, 43, 40, 35, 25, 37, 3] + [-1] * 9]],
            'DT/STRING/missing_vals': np.array(
                [[[u''] * 20] * 2] * 4, dtype='|U10'),
            'DT/STRING/tokenize_diff': np.array([
                ['[["d", 16, 17, ""], ["d", 18, 32, ""]]',
                 '[["d", 8, 9, ""], ["d", 19, 20, ""], ["d", 21, 35, ""]]'],
                ['[["d", 24, 25, ""], ["d", 26, 40, ""]]',
                 '[["d", 58, 59, ""], ["d", 60, 69, ""]]'],
                ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""], ["d", 42, 43, ""], ["d", 44, 52, ""]]',
                 '[["d", 35, 36, ""], ["d", 37, 49, ""]]'],
                ['[["d", 2, 3, ""], ["d", 57, 58, ""], ["d", 59, 66, ""]]',
                 '[["d", 3, 4, ""], ["d", 45, 46, ""], ["d", 47, 56, ""]]']], dtype='|U95'),
            # 'DT/Partition_0/tubes/missing_cols': np.array([1, 2]),
            # 'DT/Partition_0/tubes/missing_array': np.array(
            #     [['b', 'None', 'b', 'c'], [1.0, 2.0, np.nan, 1.0]], dtype=np.object),
        }
    )
    # self.write_read_example(dataset_transform, array, self.temp_dir)
    dataset_transform = self.write_read(dataset_transform, self.temp_dir)
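# Sketch of the column routing that DatasetTransform.add_transform(cols=...)
# is being tested with above: each sub-transform sees only its own columns,
# and its outputs come back namespaced as 'DT/<sub-name>/<tap>'. The slicing
# shown here only illustrates that contract; it is not the library's internals.
def _sketch_column_routing(self):
  array = np.arange(36).reshape(4, 9)
  cols = {'CAT': [0], 'DATE': [1, 2, 3], 'NUM': [4, 5, 6], 'STRING': [7, 8]}
  sub_inputs = {name: array[:, idx] for name, idx in cols.items()}
  assert sub_inputs['DATE'].shape == (4, 3)
  assert sub_inputs['STRING'].shape == (4, 2)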
def test_read_write(self):
  """Pour/pump a lower-cased English StringTransform and round-trip it through write/read."""
  indices = np.array([[[16, 15, 27, 16, 15, 5, -1, -1, -1, -1],
                       [13, 3, 20, 22, 7, 13, 3, 20, 19, -1]],
                      [[24, 23, 15, 17, 28, 4, 16, 2, 9, 5],
                       [12, 4, 0, 1, -1, -1, -1, -1, -1, -1]],
                      [[11, 26, 21, 14, 6, 10, 5, -1, -1, -1],
                       [25, 6, 8, -1, -1, -1, -1, -1, -1, -1]]])
  lower_case_diff = [
      [['[["i", 0, 1, "I"]]'] + ['[]'] * 9,
       ['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]']],
      [['[["i", 0, 1, "T"]]'] + ['[]'] * 9,
       ['[["i", 0, 1, "H"]]'] + ['[]'] * 9],
      [['[["i", 0, 1, "E"]]'] + ['[]'] * 9,
       ['[["i", 0, 1, "U"]]'] + ['[]'] * 9]
  ]
  tokenize_diff = [
      ['[["d", 16, 17, ""], ["d", 18, 22, ""]]',
       '[["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]'],
      ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]',
       '[["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]'],
      ['[["d", 30, 31, ""], ["d", 32, 35, ""]]',
       '[["d", 14, 21, ""]]']
  ]
  missing_vals = np.array(
      [[[u''] * 10, [u''] * 10],
       [[u''] * 10, [u'', u'', u'you'] + [u''] * 7],
       [[u''] * 10, [u''] * 10]], dtype='|S40')
  strings = np.array(
      [["It is what it is.", "I've seen summer and I've seen rain"],
       ["The sun is not yellow, it's chicken. OK.", "Hey, you!"],
       ['Ended up sleeping in a doorway.', 'Under a bodega']],
      dtype=np.unicode)
  index_to_word = self._get_index_to_word(np.char.lower(strings), en_tokenizer)
  index_to_word = ['__UNK__'] + index_to_word[:-1]
  trans = n.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      lower_case=True,
      unk_index=0,
      max_sent_len=10,
  )
  trans.calc_global_values(strings)
  # for i in xrange(2):
  self.pour_pump(
      trans,
      strings,
      {
          'indices': indices,
          'missing_vals': missing_vals,
          'tokenize_diff': tokenize_diff,
          'lower_case_diff': lower_case_diff
      },
      test_type=False)
  self.write_read_example(trans, strings, self.temp_dir)
  trans = self.write_read(trans, self.temp_dir)
def test_en_normalize_2(self):
  """English StringTransform with lower casing and lemmatization."""
  indices = np.array([[[8, 4, 15, 8, 4, 3, -1, -1, -1, -1]],
                      [[16, 2, 5, 1, 9, 7, 12, 14, 17, 3]],
                      [[14, 13, 4, 10, 18, 2, 8, 1, 6, 3]]])
  lemmatize_diff = [
      [['[]', '[["i", 0, 2, "is"]]', '[]', '[]', '[["i", 0, 2, "is"]]', '[]', '[]', '[]', '[]', '[]']],
      [['[]', '[]', '[]', '[]', '[]', '[["i", 2, 4, "s"]]', '[["i", 3, 3, "n"]]', '[]', '[]', '[]']],
      [['[]', '[]', '[["i", 0, 2, "is"]]', '[]', '[]', '[]', '[]', '[]', '[]', '[]']]
  ]
  lower_case_diff = [
      [['[["i", 0, 1, "I"]]'] + ['[]'] * 9],
      [['[["i", 0, 1, "W"]]', '[]', '[["i", 0, 1, "B"]]'] + ['[]'] * 7],
      [['[["i", 0, 1, "T"]]'] + ['[]'] * 9]
  ]
  tokenize_diff = [
      ['[["d", 16, 17, ""], ["d", 18, 22, ""]]'],
      ['[["d", 8, 9, ""], ["d", 14, 15, ""], ["d", 43, 44, ""]]'],
      ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["i", 37, 37, "."], ["i", 38, 38, "OK"]]']
  ]
  missing_vals = np.array([[[''] * 10], [[''] * 10], [[''] * 10]], dtype='|S8')
  strings = np.array([["It is what it is."],
                      ["Whatever, Bob's mother has seen the world."],
                      ["The sun is not yellow, it's chicken. OK."]])
  index_to_word = self._get_index_to_word(
      np.char.lower(strings), en_tokenizer, basic_lemmatizer)
  index_to_word = ['__UNK__'] + index_to_word
  trans = n.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      lemmatize=True,
      lemmatizer=basic_lemmatizer,
      lower_case=True,
      unk_index=0,
      max_sent_len=10,
  )
  trans.calc_global_values(strings)
  for i in xrange(2):
    self.pour_pump(
        trans,
        strings,
        {
            'indices': indices,
            'lower_case_diff': lower_case_diff,
            'lemmatize_diff': lemmatize_diff,
            'missing_vals': missing_vals,
            'tokenize_diff': tokenize_diff,
        },
        test_type=False)
    trans = self.write_read(trans, self.temp_dir)
def test_remove_dims(self):
  """ChainTransform of DocumentToSentenceTransform (keep_dims=False) into a StringTransform; sentences are flattened and tracked by ids."""
  indices = np.array([[18, 17, 36, 18, 17, -1, -1, -1, -1, -1],
                      [15, 3, 28, 31, 7, 15, 3, 28, 27, -1],
                      [33, 32, 17, 23, 37, 4, 18, 2, 10, -1],
                      [0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                      [14, 4, 0, 1, -1, -1, -1, -1, -1, -1],
                      [13, 35, 29, 16, 6, 12, -1, -1, -1, -1],
                      [34, 6, 8, -1, -1, -1, -1, -1, -1, -1],
                      [21, 26, 9, -1, -1, -1, -1, -1, -1, -1],
                      [22, 25, 20, -1, -1, -1, -1, -1, -1, -1],
                      [19, 30, 0, 11, -1, -1, -1, -1, -1, -1],
                      [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])
  lower_case_diff = [
      ['[["i", 0, 1, "I"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]', '[["i", 0, 1, "I"]]', '[]', '[]', '[]', '[]'],
      ['[["i", 0, 1, "T"]]'] + ['[]'] * 9,
      ['[["i", 0, 2, "OK"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "H"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "E"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "U"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "L"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "L"]]'] + ['[]'] * 9,
      ['[["i", 0, 1, "I"]]'] + ['[]'] * 9,
      ['[]'] * 10
  ]
  tokenize_diff = [
      '[["d", 16, 21, ""]]',
      '[["i", 0, 0, " "], ["d", 1, 2, ""], ["d", 23, 24, ""], ["d", 37, 38, ""]]',
      '[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""]]',
      '[["i", 0, 0, " "], ["d", 2, 11, ""]]',
      '[["i", 0, 0, " "], ["d", 3, 4, ""], ["d", 9, 10, ""], ["d", 11, 17, ""]]',
      '[["d", 30, 34, ""]]',
      '[["i", 0, 0, " "], ["d", 14, 21, ""]]',
      '[["i", 0, 0, " "], ["d", 20, 27, ""]]',
      '[["d", 12, 19, ""]]',
      '[["i", 0, 0, " "], ["d", 21, 27, ""]]',
      '[["d", 0, 9, ""]]'
  ]
  missing_vals = np.array(
      [[u''] * 10,
       [u''] * 10,
       [u''] * 10,
       [u'ok'] + [u''] * 9,
       [u'', u'', u'you'] + [u''] * 7,
       [u''] * 10,
       [u''] * 10,
       [u''] * 10,
       [u''] * 10,
       [u'', u'', u'you'] + [u''] * 7,
       [u''] * 10], dtype='|S40')
  strings = np.array([
      [
          "It is what it is. I've seen summer and I've seen rain",
          "The sun is not yellow, it's chicken. OK. Hey, you!"
      ],
      [
          'Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
          'Look out kid. Its something you did.'
      ]
  ], dtype=np.unicode)
  index_to_word = self._get_index_to_word(np.char.lower(strings), en_tokenizer)
  # One id per flattened sentence, repeated for every sentence of its source document.
  ids = np.array(
      ['f46a7d9b0971a571b019aef7397ba54de6acf73518609b67d2ac1dee'] * 2 +
      ['dd1ff31c7034df9b181dec11a0ae633bf9bee80662d76e1b5f655c2e'] * 3 +
      ['af78931ab7820443f0986de9ef1f276363014d89b9dd587f16b5f3e5'] * 3 +
      ['52c66ddc2885dd3f2d30d8fbca09abab17e5d512dfd28c13f9a4bf1d'] * 3)
  index_to_word = ['__UNK__'] + index_to_word[:-1]
  string_trans = st.StringTransform(
      index_to_word=index_to_word,
      word_tokenizer=en_tokenizer,
      lower_case=True,
      unk_index=0,
      max_sent_len=10,
      name='ST')
  dts_trans = dct.DocumentToSentenceTransform(
      name='DTS',
      sent_tokenizer=lambda s: s.split('.'),
      sent_detokenizer=lambda s: '.'.join(s),
      keep_dims=False)
  trans = cht.ChainTransform(name='CT')
  trans.add_transform(dts_trans)
  trans.add_transform(string_trans, 'DTS/sentences')
  # for i in xrange(2):
  self.pour_pump(
      trans,
      strings,
      {
          'CT/ST/indices': indices,
          'CT/ST/missing_vals': missing_vals,
          'CT/ST/tokenize_diff': tokenize_diff,
          'CT/ST/lower_case_diff': lower_case_diff,
          'CT/DTS/ids': ids
      },
      test_type=False)
  self.write_read_example(trans, strings, self.temp_dir, num_cols=1)
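# Sketch of the flattening that keep_dims=False is checked against above:
# every document is split into sentences, the document axes are dropped, and
# DTS/ids repeats one id per sentence so the sentences can be regrouped into
# their source documents on the pump side. The id values themselves are
# opaque here; only the grouping matters, so plain integers stand in for them.
def _sketch_remove_dims_grouping(self):
  docs = [
      "It is what it is. I've seen summer and I've seen rain",
      "The sun is not yellow, it's chicken. OK. Hey, you!",
      'Ended up sleeping in a doorway. Under a bodega. Lights over broadway',
      'Look out kid. Its something you did.'
  ]
  sentences, ids = [], []
  for doc_num, doc in enumerate(docs):
    for sent in doc.split('.'):
      sentences.append(sent)
      ids.append(doc_num)
  # 2 + 3 + 3 + 3 sentences gives the 11 flat rows seen in `indices` above.
  assert len(sentences) == 11
  assert ids == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]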