def test_c_py_compose_transforms_module():
    """ Test combining Python and C++ transforms """
    ds.config.set_seed(0)

    def run_pipeline(arr, input_columns, output_cols, op_list):
        # Build a one-shot dataset from the raw array, apply the mixed
        # Python/C++ op list, and collect every output column as a list.
        pipeline = ds.NumpySlicesDataset(arr, column_names=input_columns, shuffle=False)
        pipeline = pipeline.map(operations=op_list, input_columns=input_columns,
                                output_columns=output_cols, column_order=output_cols)
        return [row[col].tolist()
                for row in pipeline.create_dict_iterator(output_numpy=True)
                for col in output_cols]

    arr = [1, 0]
    # Python one-hot followed by a C++ Mask
    assert run_pipeline(arr, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2),
                         c_transforms.Mask(c_transforms.Relational.EQ, 1)]) == \
        [[[False, True]], [[True, False]]]
    # Python one-hot, a Python lambda, then a C++ Fill
    assert run_pipeline(arr, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2), (lambda x: x + x),
                         c_transforms.Fill(1)]) == \
        [[[1, 1]], [[1, 1]]]
    # Alternating Python and C++ ops
    assert run_pipeline(arr, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2), (lambda x: x + x),
                         c_transforms.Fill(1), (lambda x: x + x)]) == \
        [[[2, 2]], [[2, 2]]]
    # C++ PadEnd followed by a Python lambda
    assert run_pipeline([[1, 3]], ["cols"], ["cols"],
                        [c_transforms.PadEnd([3], -1), (lambda x: x + x)]) == \
        [[2, 6, -2]]
    # Multi-input lambda merging two columns, then a C++ PadEnd
    arr = ([[1]], [[3]])
    assert run_pipeline(arr, ["col0", "col1"], ["a"],
                        [(lambda x, y: x + y), c_transforms.PadEnd([2], -1)]) == \
        [[4, -1]]
def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
                             split_begin=None, split_end=None):
    """Build the BERT-style NER input pipeline over the MSRA dataset.

    Args:
        data_dir: directory holding the raw MSRA data (forwarded to process_msra).
        label_list: ordered label names used to build the label vocabulary.
        bert_vocab_path: path to the BERT vocabulary file.
        max_seq_len: fixed output length; longer sequences are sliced, shorter
            ones padded with 0.
        class_filter, split_begin, split_end: forwarded to process_msra to
            select a subset of the data.

    Returns:
        A dataset with columns input_ids, input_mask, segment_ids, label_ids,
        each of length max_seq_len.
    """
    # Source: generator over ('text', 'label') pairs produced by process_msra.
    msra = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
                               column_names=['text', 'label'])

    # --- label pipeline: lookup ids, prepend 0 (slot for [CLS]), clip, pad ---
    label_lookup = text.Lookup(text.Vocab.from_list(label_list))
    msra = msra.map(operations=label_lookup, input_columns="label", output_columns="label_ids")
    msra = msra.map(operations=ops.Concatenate(prepend=np.array([0], dtype='i')), input_columns=["label_ids"])
    msra = msra.map(operations=ops.Slice(slice(0, max_seq_len)), input_columns=["label_ids"])
    msra = msra.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["label_ids"])

    # --- sentence pipeline: char tokenize, clip to leave room for the two
    # special tokens, wrap with [CLS]/[SEP], lookup ids, pad ---
    token_lookup = text.Lookup(text.Vocab.from_file(bert_vocab_path), unknown_token='[UNK]')
    msra = msra.map(operations=text.UnicodeCharTokenizer(), input_columns=["text"], output_columns=["sentence"])
    msra = msra.map(operations=ops.Slice(slice(0, max_seq_len - 2)), input_columns=["sentence"])
    msra = msra.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                               append=np.array(["[SEP]"], dtype='S')),
                    input_columns=["sentence"])
    msra = msra.map(operations=token_lookup, input_columns=["sentence"], output_columns=["input_ids"])
    msra = msra.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["input_ids"])

    # input_mask: duplicate input_ids, then 1 where the id is non-zero.
    msra = msra.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                    output_columns=["input_ids", "input_mask"],
                    column_order=["input_ids", "input_mask", "label_ids"])
    msra = msra.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["input_mask"])

    # segment_ids: duplicate input_ids then fill with 0 (single-sentence task).
    msra = msra.map(operations=ops.Duplicate(), input_columns=["input_ids"],
                    output_columns=["input_ids", "segment_ids"],
                    column_order=["input_ids", "input_mask", "segment_ids", "label_ids"])
    msra = msra.map(operations=ops.Fill(0), input_columns=["segment_ids"])
    return msra
def test_eager_mask():
    """ Test Mask op is callable """
    # Calling the op directly (eager mode, no pipeline) should mark the
    # positions equal to 3 as True and everything else as False.
    equals_three = data_trans.Mask(data_trans.Relational.EQ, 3, mstype.bool_)
    actual = equals_three([1, 2, 3, 4, 5])
    assert np.array_equal(actual, np.array([False, False, True, False, False]))
def test_py_vision_with_c_transforms():
    """ Test combining Python vision operations with C++ transforms operations """
    ds.config.set_seed(0)

    def apply_ops(op_list):
        # Run the op list over the small ImageNet test set and gather the
        # transformed "image" column for every record.
        data_dir = "../data/dataset/testImageNetData/train/"
        source = ds.ImageFolderDataset(dataset_dir=data_dir, shuffle=False)
        source = source.map(operations=op_list, input_columns=["image"])
        return [record["image"]
                for record in source.create_dict_iterator(num_epochs=1, output_numpy=True)]

    # Test with Mask Op
    masked = apply_ops([py_vision.Decode(),
                        py_vision.CenterCrop((2)),
                        np.array,
                        c_transforms.Mask(c_transforms.Relational.GE, 100)])
    expected_mask = np.array([[[True, False, False], [True, False, False]],
                              [[True, False, False], [True, False, False]]])
    for expected, actual in zip([expected_mask, expected_mask], masked):
        np.testing.assert_array_equal(expected, actual)

    # Test with Fill Op
    filled = apply_ops([py_vision.Decode(),
                        py_vision.CenterCrop((4)),
                        np.array,
                        c_transforms.Fill(10)])
    for expected, actual in zip([np.ones((4, 4, 3)) * 10] * 2, filled):
        np.testing.assert_array_equal(expected, actual)

    # Test with Concatenate Op, which will raise an error since ConcatenateOp only supports rank 1 tensors.
    with pytest.raises(RuntimeError) as error_info:
        apply_ops([py_vision.Decode(),
                   py_vision.CenterCrop((2)),
                   np.array,
                   c_transforms.Concatenate(0)])
    assert "Only 1D tensors supported" in str(error_info.value)
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process TNEWS dataset.

    Builds a BERT-style classification pipeline over the TNEWS CLUE data and
    materializes it into Python lists of batched columns.

    Args:
        data_dir: directory containing train.json / dev.json / test.json.
        label_list: ordered label names used to build the label vocabulary.
        bert_vocab_path: path to the BERT vocabulary file.
        data_usage: one of 'train', 'eval', 'test' (asserted below).
        shuffle_dataset: whether to shuffle the source dataset.
        max_seq_len: fixed output length; longer inputs are sliced, shorter padded.
        batch_size: batch size applied at the end of the pipeline.

    Returns:
        Tuple (label, text_ids, mask_ids, segment_ids), each a list of batches.
    """
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        # 'eval' usage reads dev.json.
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        # Test split has no labels: duplicate the id column and zero it out so
        # downstream code still sees a label_id column.
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    # Tokenize, clip to max_seq_len, wrap with [CLS]/[SEP], convert to ids, pad.
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    # mask_ids: duplicate text_ids, then 1 where non-zero (real token), else 0.
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    # segment_ids: duplicate text_ids then fill with 0 (single-sentence task).
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    # NOTE(review): iterating the dataset object directly and indexing rows
    # positionally assumes tuple-style rows ordered per the columns_order
    # above ([label_id, text_ids, mask_ids, segment_ids]) — confirm against
    # the MindSpore version in use.
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
def mask_compare(array, op, constant, dtype=mstype.bool_):
    """Check ops.Mask against the equivalent NumPy comparison.

    Runs `array` through a one-element dataset pipeline with ops.Mask applied,
    then asserts each produced row equals `comparison(array, constant)` cast
    to the NumPy counterpart of `dtype`.

    Args:
        array: input sequence to mask.
        op: an ops.Relational member selecting the comparison.
        constant: value compared against every element (cast to array's dtype).
        dtype: MindSpore dtype of the produced mask (default mstype.bool_).

    Raises:
        KeyError: if `op` is not one of the six supported relational ops
            (the original silently compared against the raw input instead).
    """
    data = ds.NumpySlicesDataset([array])
    data = data.map(operations=ops.Mask(op, constant, dtype))

    # Dispatch table instead of an elif chain; computed ONCE, outside the
    # iteration loop. The original recomputed -- and overwrote -- `array`
    # with its own masked result on every row, which would corrupt the
    # expected value if the pipeline ever yielded more than one row.
    comparators = {
        ops.Relational.EQ: np.equal,
        ops.Relational.NE: np.not_equal,
        ops.Relational.GT: np.greater,
        ops.Relational.GE: np.greater_equal,
        ops.Relational.LT: np.less,
        ops.Relational.LE: np.less_equal,
    }
    np_array = np.array(array)
    expected = comparators[op](np_array, np.array(constant, dtype=np_array.dtype))
    expected = expected.astype(dtype=mstype_to_np_type[dtype])

    for d in data:
        np.testing.assert_array_equal(expected, d[0].asnumpy())
def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64, drop_remainder=True):
    """Process CMNLI dataset.

    Builds a BERT-style sentence-pair pipeline over the CMNLI CLUE data:
    tokenizes both sentences, truncates the pair, adds [CLS]/[SEP], and
    derives text_ids, segment_ids and mask_ids, each padded to max_seq_len.

    Args:
        data_dir: directory containing train.json / dev.json / test.json.
        label_list: ordered label names used to build the label vocabulary.
        bert_vocab_path: path to the BERT vocabulary file.
        data_usage: one of 'train', 'eval', 'test' (asserted below).
        shuffle_dataset: whether to shuffle the source dataset.
        max_seq_len: fixed output length for all id columns.
        batch_size: batch size applied at the end of the pipeline.
        drop_remainder: whether the final partial batch is dropped.

    Returns:
        The batched dataset (columns text_ids, mask_ids, segment_ids, label_id).
    """
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        # 'eval' usage reads dev.json.
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        # Test split has no labels: duplicate the id column and zero it out so
        # downstream code still sees a label_id column.
        dataset = dataset.map(operations=ops.Duplicate(), input_columns=["id"], output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0), input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup, input_columns="label", output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    # Truncate to max_seq_len - 3 to leave room for one [CLS] and two [SEP].
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len - 3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence2"])
    ### Generating segment_ids
    # Duplicate each sentence, map the copy through lookup then Fill so the
    # type columns become all-0 (sentence1) and all-1 (sentence2) with the
    # same per-sentence lengths; their concatenation is the segment vector.
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence1"],
                          output_columns=["sentence1", "type_sentence1"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=["sentence1", "type_sentence1", "sentence2", "type_sentence2", "label_id"])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)], input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)], input_columns=["type_sentence2"])
    dataset = dataset.map(operations=ops.Concatenate(), input_columns=["type_sentence1", "type_sentence2"],
                          output_columns=["segment_ids"],
                          column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["segment_ids"])
    ### Generating text_ids
    # Join the two token sequences, convert to vocabulary ids, and pad.
    dataset = dataset.map(operations=ops.Concatenate(), input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0), input_columns=["text_ids"])
    ### Generating mask_ids
    # mask_ids: duplicate text_ids, then 1 where non-zero (real token), else 0.
    dataset = dataset.map(operations=ops.Duplicate(), input_columns=["text_ids"],
                          output_columns=["text_ids", "mask_ids"],
                          column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0, mstype.int32), input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset