def get_dataset(batch_size=1, repeat_count=1, distribute_file=''):
    """get dataset"""
    _ = distribute_file

    ds = de.TFRecordDataset([cfg.data_file], cfg.schema_file,
                            columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    if cfg.task == "Regression":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(input_columns="label_ids", operations=type_cast_op_float)
    else:
        ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    ds = ds.repeat(repeat_count)

    # apply shuffle operation
    buffer_size = 960
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
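# Usage sketch (not from the original source): pull one batch to check shapes,
# assuming cfg.data_file / cfg.schema_file point at a valid TFRecord/schema pair.
def _demo_get_dataset():
    demo_ds = get_dataset(batch_size=32, repeat_count=1)
    for batch in demo_ds.create_dict_iterator(output_numpy=True):
        print(batch["input_ids"].shape)  # e.g. (32, seq_length)
        break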
def convert_dtype(self, ms_dataset):
    """Convert the dataset dtype if the dtype is invalid.

    :param ms_dataset: a dataset object of mindspore
    :return: a dataset object of mindspore after dtype convert
    """
    item = self.dataset[0]
    image, label = item[0], item[1]
    # Fall back to defaults when a sample does not expose a numpy dtype;
    # the original bare `except: pass` left image_dtype unbound on failure.
    image_dtype, label_dtype = None, "int64"
    try:
        image_dtype = str(image.dtype)
    except AttributeError:
        pass
    try:
        label_dtype = str(label.dtype)
    except AttributeError:
        pass
    if image_dtype in self.invalid_dtype:
        type_cast_op = C2.TypeCast(self.dtype_map[image_dtype])
        ms_dataset = ms_dataset.map(input_columns="image", operations=type_cast_op)
    if label_dtype in self.invalid_dtype:
        type_cast_op = C2.TypeCast(self.dtype_map[label_dtype])
        ms_dataset = ms_dataset.map(input_columns="label", operations=type_cast_op)
    return ms_dataset
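# Host-class sketch (attribute values assumed, not from the original source):
# convert_dtype expects the owning class to map backend-invalid dtypes to safe casts.
class _DtypeAdapterSketch:
    invalid_dtype = ("int64", "float64")             # dtypes the backend rejects (assumed)
    dtype_map = {"int64": mstype.int32,              # invalid -> valid cast target
                 "float64": mstype.float32}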
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
                       data_file_path=None, schema_file_path=None, do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    ds = de.TFRecordDataset([data_file_path],
                            schema_file_path if schema_file_path != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
                            shuffle=do_shuffle)
    if assessment_method == "Spearman_correlation":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
    else:
        ds = ds.map(operations=type_cast_op, input_columns="label_ids")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_gru_dataset(epoch_count=1, batch_size=1, rank_size=1, rank_id=0,
                       do_shuffle=True, dataset_path=None, is_training=True):
    """create dataset"""
    ds = de.MindDataset(dataset_path,
                        columns_list=["source_ids", "target_ids", "target_mask"],
                        shuffle=do_shuffle, num_parallel_workers=10,
                        num_shards=rank_size, shard_id=rank_id)
    operations = random_teacher_force
    ds = ds.map(operations=operations,
                input_columns=["source_ids", "target_ids", "target_mask"],
                output_columns=["source_ids", "target_ids", "teacher_force"],
                column_order=["source_ids", "target_ids", "teacher_force"])
    type_cast_op = deC.TypeCast(mstype.int32)
    type_cast_op_bool = deC.TypeCast(mstype.bool_)
    ds = ds.map(operations=type_cast_op, input_columns="source_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_ids")
    ds = ds.map(operations=type_cast_op_bool, input_columns="teacher_force")
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(1)
    return ds
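# Sketch of the random_teacher_force pyfunc mapped above; its body is not in
# this snippet, so the signature and the 0.5 forcing ratio are assumptions.
import numpy as np

def random_teacher_force(source_ids, target_ids, target_mask):
    """Consume the three input columns, emit a per-sample teacher_force flag."""
    teacher_force = np.random.random() < 0.5  # forcing ratio assumed
    return source_ids, target_ids, teacher_force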
def test_random_apply():
    ds.config.set_seed(0)

    def test_config(arr, op_list, prob=0.5):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(input_columns=["col"], operations=ops.RandomApply(op_list, prob))
            res = []
            # output_numpy=True so .tolist() works, matching test_compose below
            for i in data.create_dict_iterator(output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    res1 = test_config([[0, 1]], [ops.Duplicate(), ops.Concatenate()])
    assert res1 in [[[0, 1]], [[0, 1, 0, 1]]]
    # test single nested compose
    assert test_config([[0, 1, 2]],
                       [ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Slice([0, 1, 2])])]) == [[0, 1, 2]]
    # test exception
    assert "is not of type (<class 'list'>" in test_config([1, 0], ops.TypeCast(mstype.int32))
    assert "Input prob is not within the required interval" in test_config([0, 1], [ops.Slice([0, 1])], 1.1)
    assert "is not of type (<class 'float'>" in test_config([1, 0], [ops.TypeCast(mstype.int32)], None)
    assert "op_list with value None is not of type (<class 'list'>" in test_config([1, 0], None)
def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_target='Ascend'):
    """
    create train or evaluation dataset for warpctc

    Args:
        dataset_path(str): dataset path
        batch_size(int): batch size of generated dataset, default is 1
        num_shards(int): number of devices
        shard_id(int): rank id
        device_target(str): platform of training, support Ascend and GPU
    """
    dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
    ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True,
                             num_shards=num_shards, shard_id=shard_id)
    image_trans = [
        vc.Rescale(1.0 / 255.0, 0.0),
        vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
        vc.Resize((m.ceil(cf.captcha_height / 16) * 16, cf.captcha_width)),
        c.TypeCast(mstype.float16)
    ]
    label_trans = [
        c.TypeCast(mstype.int32)
    ]
    ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
    if device_target == 'Ascend':
        ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
    else:
        ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
    ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
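# Sketch of the transpose helpers referenced above; their bodies are not in
# this snippet, so these implementations are assumptions inferred from the names.
def transpose_hwc2whc(image):
    """HWC -> WHC layout, used on Ascend for the warpctc flow (assumed)."""
    return image.transpose(1, 0, 2)

def transpose_hwc2chw(image):
    """HWC -> CHW layout (assumed)."""
    return image.transpose(2, 0, 1)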
def create_ctpn_dataset(mindrecord_file, batch_size=1, repeat_num=1, device_num=1, rank_id=0,
                        is_training=True, num_parallel_workers=4):
    """Create CTPN dataset with MindDataset."""
    ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"],
                        num_shards=device_num, shard_id=rank_id,
                        num_parallel_workers=8, shuffle=is_training)
    decode = C.Decode()
    ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=1)
    compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training))
    hwc_to_chw = C.HWC2CHW()
    normalize_op = C.Normalize((123.675, 116.28, 103.53), (58.395, 57.12, 57.375))
    type_cast0 = CC.TypeCast(mstype.float32)
    type_cast1 = CC.TypeCast(mstype.float16)
    type_cast2 = CC.TypeCast(mstype.int32)
    type_cast3 = CC.TypeCast(mstype.bool_)
    if is_training:
        ds = ds.map(operations=compose_map_func,
                    input_columns=["image", "annotation"],
                    output_columns=["image", "image_shape", "box", "label", "valid_num"],
                    column_order=["image", "image_shape", "box", "label", "valid_num"],
                    num_parallel_workers=num_parallel_workers)
        ds = ds.map(operations=[normalize_op, type_cast0], input_columns=["image"],
                    num_parallel_workers=12)
        ds = ds.map(operations=[hwc_to_chw, type_cast1], input_columns=["image"],
                    num_parallel_workers=12)
    else:
        ds = ds.map(operations=compose_map_func,
                    input_columns=["image", "annotation"],
                    output_columns=["image", "image_shape", "box", "label", "valid_num"],
                    column_order=["image", "image_shape", "box", "label", "valid_num"],
                    num_parallel_workers=num_parallel_workers)
        ds = ds.map(operations=[normalize_op, hwc_to_chw, type_cast1], input_columns=["image"],
                    num_parallel_workers=24)
    # cast the remaining columns produced by the python map func
    ds = ds.map(operations=[type_cast1], input_columns=["image_shape"])
    ds = ds.map(operations=[type_cast1], input_columns=["box"])
    ds = ds.map(operations=[type_cast2], input_columns=["label"])
    ds = ds.map(operations=[type_cast3], input_columns=["valid_num"])
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(repeat_num)
    return ds
def test_compose():
    """
    Test C++ and Python Compose Op
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
            data = data.map(input_columns=["col"], operations=op_list)
            res = []
            for i in data.create_dict_iterator(output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test simple compose with only 1 op, this would generate a warning
    assert test_config([[1, 0], [3, 4]], ops.Compose([ops.Fill(2)])) == [[2, 2], [2, 2]]
    # Test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]],
                       ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()])) \
        == [[1, 0] * 4]
    # Test one Python transform followed by a C transform. Type after OneHot is a float (mixed use-case)
    assert test_config([1, 0],
                       ops.Compose([py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)])) == [[[0, 1]], [[1, 0]]]
    # Test exceptions.
    with pytest.raises(TypeError) as error_info:
        ops.Compose([1, ops.TypeCast(mstype.int32)])
    assert "op_list[0] is not a c_transform op (TensorOp) nor a callable pyfunc." in str(error_info.value)
    # Test empty op list
    with pytest.raises(ValueError) as error_info:
        test_config([1, 0], ops.Compose([]))
    assert "op_list can not be empty." in str(error_info.value)
    # Test Python compose op
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2)])) == [[[0, 1]], [[1, 0]]]
    assert test_config([1, 0], py_ops.Compose([py_ops.OneHotOp(2), (lambda x: x + x)])) == [[[0, 2]], [[2, 0]]]
    # Test nested Python compose op
    assert test_config([1, 0],
                       py_ops.Compose([py_ops.Compose([py_ops.OneHotOp(2)]), (lambda x: x + x)])) \
        == [[[0, 2]], [[2, 0]]]
    with pytest.raises(TypeError) as error_info:
        py_ops.Compose([(lambda x: x + x)])()
    assert "Compose was called without an image. Fix invocation (avoid it being invoked as Compose([...])())." \
        in str(error_info.value)
def create_poetry_dataset(batch_size, poetry, tokenizer):
    """create poetry dataset method"""
    dt = PoetryDataGenerator(batch_size, poetry, tokenizer)
    ds = de.GeneratorDataset(dt, ["input_ids", "token_type_id", "pad_mask"])
    # ds.set_dataset_size(dt.__len__())
    int_type_cast_op = C.TypeCast(mstype.int32)
    float_type_cast_op = C.TypeCast(mstype.float32)
    ds = ds.map(input_columns="input_ids", operations=int_type_cast_op)
    ds = ds.map(input_columns="token_type_id", operations=int_type_cast_op)
    ds = ds.map(input_columns="pad_mask", operations=float_type_cast_op)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        batch_size(int): the batch size of dataset. Default: 16.
        device_num (int): Number of shards that the dataset should be divided into (default=1).
        rank (int): The shard ID within num_shards (default=0).

    Returns:
        dataset
    """
    if device_num == 1:
        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                                   num_shards=device_num, shard_id=rank)
    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(299),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(320),
            C.CenterCrop(299)
        ]
    trans += [
        C.Normalize(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32)
    ]
    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
    ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        rank (int): The shard ID within group_size.
        group_size (int): Number of shards that the dataset should be divided into.
        repeat_num(int): the repeat times of dataset. Default: 1.

    Returns:
        dataset
    """
    if group_size == 1:
        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
    else:
        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
                                   num_shards=group_size, shard_id=rank)
    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(224),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(256),
            C.CenterCrop(224)
        ]
    trans += [
        toBGR(),
        C.Rescale(1.0 / 255.0, 0.0),
        # C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32)
    ]
    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)
    return ds
def create_tinybert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true",
                            data_dir=None, data_type='tfrecord', seq_length=128,
                            task_type='classification', drop_remainder=True):
    """create tinybert dataset"""
    if isinstance(data_dir, list):
        data_files = data_dir
    else:
        data_files = [data_dir]
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    shuffle = (do_shuffle == "true")
    if data_type == 'mindrecord':
        ds = de.MindDataset(data_files, columns_list=columns_list, shuffle=shuffle,
                            num_shards=device_num, shard_id=rank)
    else:
        ds = de.TFRecordDataset(data_files, columns_list=columns_list, shuffle=shuffle,
                                num_shards=device_num, shard_id=rank,
                                shard_equal_rows=(device_num == 1))
    if device_num == 1 and shuffle is True:
        ds = ds.shuffle(10000)
    type_cast_op = C.TypeCast(mstype.int32)
    slice_op = C.Slice(slice(0, seq_length, 1))
    # task_type is a string flag (the original dtype default could never match
    # the 'classification' comparison); regression labels stay float32
    label_type = mstype.int32 if task_type == 'classification' else mstype.float32
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["segment_ids"])
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["input_mask"])
    ds = ds.map(operations=[type_cast_op, slice_op], input_columns=["input_ids"])
    ds = ds.map(operations=[C.TypeCast(label_type), slice_op], input_columns=["label_ids"])
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    return ds
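# Usage sketch (path and values are hypothetical): a single-device
# classification fine-tuning shard, labels cast to int32 and sliced to 128.
def _demo_tinybert_dataset():
    demo_ds = create_tinybert_dataset(batch_size=32, data_dir="./td/train.tfrecord",
                                      data_type='tfrecord', seq_length=128,
                                      task_type='classification')
    print(demo_ds.get_dataset_size())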
def generate_mnist_dataset(data_path, batch_size=32, repeat_size=1,
                           samples=None, num_parallel_workers=1, sparse=True):
    """
    create dataset for training or testing
    """
    # define dataset
    ds1 = ds.MnistDataset(data_path, num_samples=samples)

    # define operation parameters
    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    if not sparse:
        one_hot_enco = C.OneHot(10)
        ds1 = ds1.map(input_columns="label", operations=one_hot_enco,
                      num_parallel_workers=num_parallel_workers)
        type_cast_op = C.TypeCast(mstype.float32)
    ds1 = ds1.map(input_columns="label", operations=type_cast_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=resize_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=rescale_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=hwc2chw_op,
                  num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    ds1 = ds1.shuffle(buffer_size=buffer_size)
    ds1 = ds1.batch(batch_size, drop_remainder=True)
    ds1 = ds1.repeat(repeat_size)
    return ds1
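# Usage sketch (path is hypothetical): check the transformed shapes; with
# sparse=False the labels come back one-hot encoded as float32.
def _demo_mnist_dataset():
    demo_ds = generate_mnist_dataset("./MNIST/train", batch_size=32, sparse=False)
    for item in demo_ds.create_dict_iterator(output_numpy=True):
        print(item["image"].shape, item["label"].shape)  # (32, 1, 32, 32) (32, 10)
        break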
def create_dataset_dp(batch_size, data_path, device_num=1, rank=0, drop=True,
                      data_start_index=0, eod_id=9):
    """
    Create dataset using data parallel.

    Inputs:
        batch_size: batch size
        data_path: path of your MindRecord files
        device_num: total device number
        rank: current rank id
        drop: whether drop remainder
        eod_id: the id for <EOD>

    Returns:
        dataset: the dataset for training or evaluating
    """
    ds.config.set_seed(1)
    home_path = os.path.join(os.getcwd(), data_path)
    files = os.listdir(data_path)
    dis = int(batch_size / device_num)
    if dis < 1:
        raise ValueError("Batch size / device_num should be positive, but found {}".format(dis))
    data = [
        os.path.join(home_path, name) for name in files
        if not name.endswith(".db")
    ]
    data.sort(key=lambda x: int(x[x.find("mindrecord") + 10:]))
    print(data)
    if data_start_index >= len(data):
        raise ValueError(f"data start index {data_start_index} is larger than dataset length {len(data)}")
    dataset = ds.MindDataset(data[data_start_index:], columns_list=["input_ids"], shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    type_cast_op_float = C.TypeCast(mstype.float16)
    map_func = (lambda input_ids: get_input_data_from_batch(input_ids, eod_id, rank, dis))
    dataset = dataset.batch(batch_size, drop_remainder=drop)
    dataset = dataset.map(operations=map_func, input_columns=["input_ids"],
                          output_columns=["input_ids", "position_id", "attention_mask"],
                          column_order=["input_ids", "position_id", "attention_mask"])
    dataset = dataset.map(input_columns="position_id", operations=type_cast_op)
    dataset = dataset.map(input_columns="attention_mask", operations=type_cast_op_float)
    dataset = dataset.map(input_columns="input_ids", operations=type_cast_op)
    dataset = dataset.repeat(1)
    return dataset
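# File-naming assumption behind the sort key above: shards are named like
# "...mindrecord0", "...mindrecord1", ..., so the numeric suffix after the
# literal "mindrecord" orders them. A quick check (hypothetical names):
def _demo_mindrecord_sort():
    names = ["corpus.mindrecord10", "corpus.mindrecord2", "corpus.mindrecord0"]
    names.sort(key=lambda x: int(x[x.find("mindrecord") + 10:]))
    print(names)  # ['corpus.mindrecord0', 'corpus.mindrecord2', 'corpus.mindrecord10']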
def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None,
                         schema_file_path=None, is_training=True, do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    if is_training:
        ds = de.TFRecordDataset([data_file_path],
                                schema_file_path if schema_file_path != "" else None,
                                columns_list=["input_ids", "input_mask", "segment_ids",
                                              "start_positions", "end_positions",
                                              "unique_ids", "is_impossible"],
                                shuffle=do_shuffle)
        ds = ds.map(operations=type_cast_op, input_columns="start_positions")
        ds = ds.map(operations=type_cast_op, input_columns="end_positions")
    else:
        ds = de.TFRecordDataset([data_file_path],
                                schema_file_path if schema_file_path != "" else None,
                                columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="CPU"): data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) # define map operations trans = [] if do_train: trans += [ C.RandomCrop((32, 32), (4, 4, 4, 4)), C.RandomHorizontalFlip(prob=0.5) ] trans += [ C.Resize((48,48)), C.Rescale(1.0 / 255.0, 0.0), C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations data_set = data_set.shuffle(buffer_size=10) data_set = data_set.batch(batch_size, drop_remainder=False) # apply dataset repeat operation data_set = data_set.repeat(repeat_num) return data_set
def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None):
    """create train dataset"""
    # apply repeat operations
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "tf_record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    print(data_files)
    ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids",
                                          "next_sentence_labels", "masked_lm_positions",
                                          "masked_lm_ids", "masked_lm_weights"],
                            shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
                            num_shards=device_num, shard_id=rank, shard_equal_rows=True)
    ori_dataset_size = ds.get_dataset_size()
    print('origin dataset size: ', ori_dataset_size)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
    ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat count: {}".format(ds.get_repeat_count()))
    return ds
def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1):
    """
    create dataset for train or test
    """
    # define dataset
    mnist_ds = ds.MnistDataset(data_path)

    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0
    rescale_nml = 1 / 0.3081
    shift_nml = -1 * 0.1307 / 0.3081

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image",
                            num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)
    return mnist_ds
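# The chained Rescale ops above implement (x/255 - 0.1307) / 0.3081:
# Rescale(a, b) computes a * x + b, so rescale_nml/shift_nml fold the MNIST
# mean (0.1307) and std (0.3081) into one affine step. Quick numeric check:
def _demo_mnist_normalize():
    x = 128.0
    direct = (x / 255.0 - 0.1307) / 0.3081
    chained = (x * (1.0 / 255.0) + 0.0) * (1 / 0.3081) + (-1 * 0.1307 / 0.3081)
    print(abs(direct - chained) < 1e-9)  # True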
def create_dataset(data_path, batch_size=32, repeat_size=1, mode="train"): """ create dataset for train or test """ cifar_ds = ds.Cifar10Dataset(data_path) rescale = 1.0 / 255.0 shift = 0.0 resize_op = CV.Resize((cfg.image_height, cfg.image_width)) rescale_op = CV.Rescale(rescale, shift) normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) if mode == "train": random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4]) random_horizontal_op = CV.RandomHorizontalFlip() channel_swap_op = CV.HWC2CHW() typecast_op = C.TypeCast(mstype.int32) cifar_ds = cifar_ds.map(input_columns="label", operations=typecast_op) if mode == "train": cifar_ds = cifar_ds.map(input_columns="image", operations=random_crop_op) cifar_ds = cifar_ds.map(input_columns="image", operations=random_horizontal_op) cifar_ds = cifar_ds.map(input_columns="image", operations=resize_op) cifar_ds = cifar_ds.map(input_columns="image", operations=rescale_op) cifar_ds = cifar_ds.map(input_columns="image", operations=normalize_op) cifar_ds = cifar_ds.map(input_columns="image", operations=channel_swap_op) cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size) cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True) cifar_ds = cifar_ds.repeat(repeat_size) return cifar_ds
def me_de_train_dataset(sink_mode=False): """test me de train dataset""" # apply repeat operations repeat_count = 1 sink_size = -1 batch_size = 16 data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], shuffle=False) type_cast_op = C.TypeCast(mstype.int32) new_repeat_count = repeat_count if sink_mode: sink_size = 100 new_repeat_count = 3 data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations data_set = data_set.batch(batch_size, drop_remainder=True) logger.info("data size: {}".format(data_set.get_dataset_size())) logger.info("repeat_count: {}".format(data_set.get_repeat_count())) return data_set, new_repeat_count, sink_size
def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
                            do_shuffle="true", data_dir=None, schema_dir=None):
    """create tinybert dataset"""
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    if task == "td":
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    else:
        columns_list = ["input_ids", "input_mask", "segment_ids"]
    ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
                            shuffle=(do_shuffle == "true"), num_shards=device_num,
                            shard_id=rank, shard_equal_rows=True)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    if task == "td":
        ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def load_test_data(batch_size=1, data_file=None):
    """Load test dataset."""
    data_set = ds.MindDataset(data_file,
                              columns_list=["source_eos_ids", "source_eos_mask",
                                            "target_sos_ids", "target_sos_mask",
                                            "target_eos_ids", "target_eos_mask"],
                              shuffle=False)
    type_cast_op = deC.TypeCast(mstype.int32)
    data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
    data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask")
    data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask")
    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): """ create a train or eval imagenet2012 dataset for resnet50 Args: dataset_path(string): the path of dataset. do_train(bool): whether dataset is used for train or eval. repeat_num(int): the repeat times of dataset. Default: 1 batch_size(int): the batch size of dataset. Default: 32 target(str): the device target. Default: Ascend Returns: dataset """ if target == "Ascend": device_num, rank_id = _get_rank_info() else: init("nccl") rank_id = get_rank() device_num = get_group_size() if device_num == 1: ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) else: ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] # define map operations if do_train: trans = [ C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), C.RandomHorizontalFlip(prob=0.5), C.Normalize(mean=mean, std=std), C.HWC2CHW() ] else: trans = [ C.Decode(), C.Resize(256), C.CenterCrop(image_size), C.Normalize(mean=mean, std=std), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans) ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation ds = ds.repeat(repeat_num) return ds
def test_case_project_between_maps():
    columns = ["col_3d", "col_sint64", "col_2d"]
    parameters = {"params": {'columns': columns}}
    data1 = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False)
    type_cast_op = C.TypeCast(mstype.int64)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)

    data1 = data1.project(columns=columns)

    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)

    filename = "project_between_maps_result.npz"
    ordered_save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def transform(dataset: MnistDataset):
    """Transforming the MNIST dataset."""
    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0
    rescale_nml = 1 / 0.3081
    shift_nml = -1 * 0.1307 / 0.3081

    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)
    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    dataset = dataset.map(operations=type_cast_op, input_columns="label")
    dataset = dataset.map(operations=resize_op, input_columns="image")
    dataset = dataset.map(operations=rescale_op, input_columns="image")
    dataset = dataset.map(operations=rescale_nml_op, input_columns="image")
    dataset = dataset.map(operations=hwc2chw_op, input_columns="image")
    dataset = dataset.batch(Config().trainer.batch_size, drop_remainder=True)
    return dataset
def create_dataset_cifar10(data_path, batch_size=32, repeat_size=1, status="train", target="Ascend"):
    """
    create dataset for train or test
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()

    if target != "Ascend" or device_num == 1:
        cifar_ds = ds.Cifar10Dataset(data_path)
    else:
        cifar_ds = ds.Cifar10Dataset(data_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)
    rescale = 1.0 / 255.0
    shift = 0.0
    cfg = alexnet_cifar10_cfg

    resize_op = CV.Resize((cfg.image_height, cfg.image_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    if status == "train":
        random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4])
        random_horizontal_op = CV.RandomHorizontalFlip()
    channel_swap_op = CV.HWC2CHW()
    typecast_op = C.TypeCast(mstype.int32)
    cifar_ds = cifar_ds.map(input_columns="label", operations=typecast_op, num_parallel_workers=8)
    if status == "train":
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_crop_op, num_parallel_workers=8)
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_horizontal_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=resize_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=rescale_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=normalize_op, num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image", operations=channel_swap_op, num_parallel_workers=8)

    cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size)
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_size)
    return cifar_ds
def me_de_train_dataset(sink_mode=False): """test me de train dataset""" # apply repeat operations repeat_count = 1 batch_size = 16 ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=[ "input_ids", "input_mask", "segment_ids", "next_sentence_labels", "masked_lm_positions", "masked_lm_ids", "masked_lm_weights" ], shuffle=False) type_cast_op = C.TypeCast(mstype.int32) new_repeat_count = repeat_count if sink_mode: repeat_count = 30 sink_steps = 100 ori_dataaet_size = ds.get_dataset_size() new_size = sink_steps * batch_size ds.set_dataset_size(new_size) new_repeat_count = int(repeat_count * ori_dataaet_size // ds.get_dataset_size()) ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op) ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op) ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op) ds = ds.map(input_columns="segment_ids", operations=type_cast_op) ds = ds.map(input_columns="input_mask", operations=type_cast_op) ds = ds.map(input_columns="input_ids", operations=type_cast_op) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) ds = ds.repeat(repeat_count) logger.info("data size: {}".format(ds.get_dataset_size())) logger.info("repeat_count: {}".format(ds.get_repeat_count())) return ds, new_repeat_count
def create_dataset(data_path, batch_size):
    ds = de.Cifar10Dataset(
        data_path,
        num_parallel_workers=8,
        shuffle=False,
    )

    # define map operations
    trans = []
    # if do_train:
    #     trans += [
    #         # C.RandomCrop((32, 32), (4, 4, 4, 4)),
    #         # C.RandomHorizontalFlip(prob=0.5)
    #     ]
    trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]
    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
def create_dataset(data_dir, training=True, batch_size=32, resize=(32, 32),
                   rescale=1 / (255 * 0.3081), shift=-0.1307 / 0.3081, buffer_size=64):
    data_train = os.path.join(data_dir, 'train')  # training set path
    data_test = os.path.join(data_dir, 'test')  # test set path
    print(data_train)
    print(data_test)
    ds = ms.dataset.MnistDataset(data_train if training else data_test)
    ds = ds.map(input_columns=["image"], operations=[
        CV.Resize(resize),
        CV.Rescale(rescale, shift),
        CV.HWC2CHW()
    ])
    ds = ds.map(input_columns=["label"], operations=C.TypeCast(ms.int32))
    # When `dataset_sink_mode=True` on Ascend, append `ds = ds.repeat(num_epochs)` to the end
    ds = ds.shuffle(buffer_size=buffer_size).batch(batch_size, drop_remainder=True)
    return ds
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): """ Create a train or eval dataset. Args: dataset_path (str): The path of dataset. do_train (bool): Whether dataset is used for train or eval. repeat_num (int): The repeat times of dataset. Default: 1. batch_size (int): The batch size of dataset. Default: 32. Returns: Dataset. """ if do_train: dataset_path = os.path.join(dataset_path, 'train') do_shuffle = True else: dataset_path = os.path.join(dataset_path, 'eval') do_shuffle = False if device_num == 1 or not do_train: ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle) else: ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle, num_shards=device_num, shard_id=device_id) resize_height = 224 resize_width = 224 buffer_size = 100 rescale = 1.0 / 255.0 shift = 0.0 # define map operations random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4)) random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1)) resize_op = C.Resize((resize_height, resize_width)) rescale_op = C.Rescale(rescale, shift) normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) change_swap_op = C.HWC2CHW() trans = [] if do_train: trans += [random_crop_op, random_horizontal_flip_op] trans += [resize_op, rescale_op, normalize_op, change_swap_op] type_cast_op = C2.TypeCast(mstype.int32) ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op) ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation ds = ds.repeat(repeat_num) return ds