def test_c_py_compose_transforms_module():
    """
    Test combining Python and C++ transforms
    """
    ds.config.set_seed(0)

    def test_config(arr, input_columns, output_cols, op_list):
        data = ds.NumpySlicesDataset(arr, column_names=input_columns, shuffle=False)
        data = data.map(operations=op_list, input_columns=input_columns, output_columns=output_cols,
                        column_order=output_cols)
        res = []
        for i in data.create_dict_iterator(output_numpy=True):
            for col_name in output_cols:
                res.append(i[col_name].tolist())
        return res

    arr = [1, 0]
    assert test_config(arr, ["cols"], ["cols"],
                       [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]) == \
        [[[False, True]], [[True, False]]]
    assert test_config(arr, ["cols"], ["cols"],
                       [py_transforms.OneHotOp(2), (lambda x: x + x), c_transforms.Fill(1)]) \
        == [[[1, 1]], [[1, 1]]]
    assert test_config(arr, ["cols"], ["cols"],
                       [py_transforms.OneHotOp(2), (lambda x: x + x), c_transforms.Fill(1), (lambda x: x + x)]) \
        == [[[2, 2]], [[2, 2]]]
    assert test_config([[1, 3]], ["cols"], ["cols"],
                       [c_transforms.PadEnd([3], -1), (lambda x: x + x)]) \
        == [[2, 6, -2]]

    arr = ([[1]], [[3]])
    assert test_config(arr, ["col0", "col1"], ["a"],
                       [(lambda x, y: x + y), c_transforms.PadEnd([2], -1)]) == [[4, -1]]
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
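# Hedged usage sketch (not part of the original source): one way to call
# process_tnews_clue_dataset for evaluation data. The directory, vocab path and label
# list below are placeholders, and the module-level imports above are assumed to be in
# scope. Note the function returns per-batch column lists rather than a dataset object.
def example_load_tnews_eval(data_dir="/path/to/tnews", vocab_path="/path/to/vocab.txt"):
    labels = ["news_story", "news_culture", "news_sports"]  # hypothetical label subset
    label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
        data_dir, labels, vocab_path, data_usage='eval', max_seq_len=128, batch_size=64)
    print("loaded %d batches" % len(label))
    return label, text_ids, mask_ids, segment_ids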
def test_eager_fill():
    """
    Test Fill op is callable
    """
    fill_op = data_trans.Fill(3)
    expected = np.array([3, 3, 3, 3])
    assert np.array_equal(fill_op([4, 5, 6, 7]), expected)
def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
                             split_begin=None, split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA dataset via GeneratorDataset
    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
                                  column_names=['text', 'label'])
    ### Processing label
    label_vocab = text.Vocab.from_list(label_list)
    label_lookup = text.Lookup(label_vocab)
    dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_ids")
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array([0], dtype='i')),
                          input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len)), input_columns=["label_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["label_ids"])
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    unicode_char_tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=unicode_char_tokenizer, input_columns=["text"], output_columns=["sentence"])
    dataset = dataset.map(operations=ops.Slice(slice(0, max_seq_len - 2)), input_columns=["sentence"])
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence"])
    dataset = dataset.map(operations=lookup, input_columns=["sentence"], output_columns=["input_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["input_ids"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "input_mask"],
                          column_order=["input_ids", "input_mask", "label_ids"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["input_mask"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                          output_columns=["input_ids", "segment_ids"],
                          column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    dataset = dataset.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    return dataset
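# Hedged usage sketch (not part of the original source): batches the NER pipeline built by
# process_ner_msra_dataset and inspects one row. The data directory, vocab path and tag
# list are placeholders; the column names match the column_order set in the function above.
def example_iterate_ner_msra(data_dir="/path/to/msra", vocab_path="/path/to/vocab.txt"):
    labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]  # hypothetical tag set
    dataset = process_ner_msra_dataset(data_dir, labels, vocab_path, max_seq_len=128)
    dataset = dataset.batch(32, drop_remainder=True)
    for row in dataset.create_dict_iterator(output_numpy=True):
        print(row["input_ids"].shape, row["input_mask"].shape,
              row["segment_ids"].shape, row["label_ids"].shape)
        break
    return dataset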
def test_compose():
    """
    Test C++ and Python Compose Op
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(input_columns=["col"], operations=op_list)
            res = []
            for i in data.create_dict_iterator(output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test simple compose with only 1 op; this would generate a warning
    assert test_config([[1, 0], [3, 4]], ops.Compose([ops.Fill(2)])) == [[2, 2], [2, 2]]

    # Test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]],
                       ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()])) \
        == [[1, 0] * 4]

    # Test one Python transform followed by a C transform. Type after OneHot is a float (mixed use-case)
    assert test_config([1, 0],
                       ops.Compose([py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)])) == [[[0, 1]], [[1, 0]]]

    # Test exceptions
    with pytest.raises(TypeError) as error_info:
        ops.Compose([1, ops.TypeCast(mstype.int32)])
    assert "op_list[0] is not a c_transform op (TensorOp) nor a callable pyfunc." in str(error_info.value)

    # Test empty op list
    with pytest.raises(ValueError) as error_info:
        test_config([1, 0], ops.Compose([]))
    assert "op_list can not be empty." in str(error_info.value)

    # Test Python compose op
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2)])) == [[[0, 1]], [[1, 0]]]
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2), (lambda x: x + x)])) == [[[0, 2]], [[2, 0]]]

    # Test nested Python compose op
    assert test_config([1, 0],
                       py_ops.Compose([py_ops.Compose([py_ops.OneHotOp(2)]), (lambda x: x + x)])) \
        == [[[0, 2]], [[2, 0]]]

    with pytest.raises(TypeError) as error_info:
        py_ops.Compose([(lambda x: x + x)])()
    assert "Compose was called without an image. Fix invocation (avoid it being invoked as Compose([...])())." \
        in str(error_info.value)
def test_py_vision_with_c_transforms():
    """
    Test combining Python vision operations with C++ transforms operations
    """
    ds.config.set_seed(0)

    def test_config(op_list):
        data_dir = "../data/dataset/testImageNetData/train/"
        data1 = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
        data1 = data1.map(operations=op_list, input_columns=["image"])
        transformed_images = []
        for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
            transformed_images.append(item["image"])
        return transformed_images

    # Test with Mask Op
    output_arr = test_config([py_vision.Decode(), py_vision.CenterCrop((2)), np.array,
                              c_transforms.Mask(c_transforms.Relational.GE, 100)])
    exp_arr = [np.array([[[True, False, False],
                          [True, False, False]],
                         [[True, False, False],
                          [True, False, False]]]),
               np.array([[[True, False, False],
                          [True, False, False]],
                         [[True, False, False],
                          [True, False, False]]])]
    for exp_a, output in zip(exp_arr, output_arr):
        np.testing.assert_array_equal(exp_a, output)

    # Test with Fill Op
    output_arr = test_config([py_vision.Decode(), py_vision.CenterCrop((4)), np.array,
                              c_transforms.Fill(10)])
    exp_arr = [np.ones((4, 4, 3)) * 10] * 2
    for exp_a, output in zip(exp_arr, output_arr):
        np.testing.assert_array_equal(exp_a, output)

    # Test with Concatenate Op, which will raise an error since ConcatenateOp only supports rank 1 tensors
    with pytest.raises(RuntimeError) as error_info:
        test_config([py_vision.Decode(), py_vision.CenterCrop((2)), np.array,
                     c_transforms.Concatenate(0)])
    assert "Only 1D tensors supported" in str(error_info.value)
def test_fillop_bytes():
    """
    Test Fill op with a bytes fill value on a bytes input column
    """
    def gen():
        yield (np.array(["A", "B", "C"], dtype='S'),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill(b'abc')
    data = data.map(operations=fill_op, input_columns=["col"])
    expected = np.array([b'abc', b'abc', b'abc'], dtype='S')
    for data_row in data.create_tuple_iterator(output_numpy=True):
        np.testing.assert_array_equal(data_row[0], expected)
def test_fillop_string():
    """
    Test Fill op with a string fill value on a string input column
    """
    def gen():
        yield (np.array(["45555", "45555"], dtype='S'),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill("error")
    data = data.map(operations=fill_op, input_columns=["col"])
    expected = np.array(['error', 'error'], dtype='S')
    for data_row in data.create_tuple_iterator(output_numpy=True):
        np.testing.assert_array_equal(data_row[0], expected)
def test_fillop_up_type_cast():
    """
    Test Fill op where the integer fill value is cast up to the float input type
    """
    def gen():
        # np.float64 used here; the deprecated np.float alias is removed in recent NumPy
        yield (np.array([4, 5, 6, 7], dtype=np.float64),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill(3)
    data = data.map(operations=fill_op, input_columns=["col"])
    expected = np.array([3., 3., 3., 3.], dtype=np.float64)
    for data_row in data:
        np.testing.assert_array_equal(data_row[0].asnumpy(), expected)
def test_fillop_down_type_cast():
    """
    Test Fill op where the negative fill value is cast down to the uint8 input type
    """
    def gen():
        yield (np.array([4, 5, 6, 7], dtype=np.uint8),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill(-3)
    data = data.map(input_columns=["col"], operations=fill_op)
    expected = np.array([253, 253, 253, 253], dtype=np.uint8)
    for data_row in data:
        np.testing.assert_array_equal(data_row[0], expected)
def test_fillop_error_handling():
    """
    Test Fill op error handling when the fill value type does not match the input type
    """
    def gen():
        yield (np.array([4, 4, 4, 4]),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill("words")
    data = data.map(input_columns=["col"], operations=fill_op)

    with pytest.raises(RuntimeError) as error_info:
        for data_row in data:
            print(data_row)
    assert "Types do not match" in repr(error_info.value)
def test_fillop_error_handling():
    """
    Test Fill op error handling when the fill value type does not match the input type
    """
    def gen():
        yield (np.array([4, 4, 4, 4]),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill("words")
    data = data.map(operations=fill_op, input_columns=["col"])

    with pytest.raises(RuntimeError) as error_info:
        for _ in data:
            pass
    assert "fill datatype does not match the input datatype" in str(error_info.value)
def skip_test_serdes_fill(remove_json_files=True):
    """
    Test serdes on Fill data transform.
    """
    def gen():
        yield (np.array([4, 5, 6, 7], dtype=np.int32),)

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = c.Fill(3)

    data = data.map(operations=fill_op, input_columns=["col"])
    expected = np.array([3, 3, 3, 3], dtype=np.int32)
    for data_row in data:
        np.testing.assert_array_equal(data_row[0].asnumpy(), expected)

    # FIXME - need proper serdes support for Fill's fill_value parameter
    util_check_serialize_deserialize_file(data, "fill_pipeline", remove_json_files)
def test_compose():
    """
    Test Compose op in a pipeline with C++ and Python transforms
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(operations=ops.Compose(op_list), input_columns=["col"])
            res = []
            for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # test simple compose with only 1 op, this would generate a warning
    assert test_config([[1, 0], [3, 4]], [ops.Fill(2)]) == [[2, 2], [2, 2]]

    # test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]],
                       [ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()]) \
        == [[1, 0] * 4]

    # test one Python transform followed by a C transform. type after OneHot is float (mixed use-case)
    assert test_config([1, 0],
                       [py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)]) == [[[0, 1]], [[1, 0]]]

    # test exceptions. Compose, RandomApply and RandomChoice use the same validator
    assert "op_list[0] is not a c_transform op" in test_config([1, 0], [1, ops.TypeCast(mstype.int32)])

    # test empty op list
    assert "op_list can not be empty." in test_config([1, 0], [])
def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64, drop_remainder=True):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"],
                              output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncating the sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence2"])
    ### Generating segment_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence1"],
                          output_columns=["sentence1", "type_sentence1"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "type_sentence2", "label_id"])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)], input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)], input_columns=["type_sentence2"])
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["type_sentence1", "type_sentence2"],
                          output_columns=["segment_ids"],
                          column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["text_ids"])
    ### Generating mask_ids
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset