def val_dataloader(self):
    """Build and return the validation dataflow."""
    ds_valid = MultiLabelDataset(
        folder=self.hparams.data,
        is_train='valid',
        fname='valid_v7.1.csv',
        types=self.hparams.types,
        pathology=self.hparams.pathology,
        resize=int(self.hparams.shape),
    )
    ds_valid.reset_state()
    ag_valid = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ]
    ds_valid = AugmentImageComponent(ds_valid, ag_valid, 0)
    ds_valid = BatchData(ds_valid, self.hparams.batch, remainder=True)
    ds_valid = MultiProcessRunner(ds_valid, num_proc=4, num_prefetch=16)
    ds_valid = PrintData(ds_valid)
    ds_valid = MapData(
        ds_valid,
        lambda dp: [
            torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),  # NHWC -> NCHW
            torch.tensor(dp[1]).float()
        ])
    return ds_valid

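# Self-contained sanity check for the NHWC -> NCHW transpose used in the
# MapData lambda above; the shape values are illustrative only.
def _check_nhwc_to_nchw():
    batch = np.zeros((4, 256, 256, 3), dtype=np.float32)
    assert np.transpose(batch, (0, 3, 1, 2)).shape == (4, 3, 256, 256)
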
def get_train_aseval_dataflow():
    """Return a dataflow over the full PRW training set for evaluation-style
    use: no filtering, no shuffling."""
    prw = PRWDataset(cfg.DATA.BASEDIR)
    imgs = prw.load()
    # no filter for training
    # test if it can repeat keys
    ds = DataFromList(imgs, shuffle=False)

    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)])

    def preprocess(img):
        fname = img['file_name']
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        orig_shape = im.shape[:2]
        im = im.astype('float32')
        # augmentation:
        im, params = aug.augment_return_params(im)
        ret = [fname, im, orig_shape]
        return ret

    ds = MapData(ds, preprocess)
    return ds

def test_dataloader(self):
    ds_test = MultiLabelDataset(folder=self.hparams.data,
                                is_train='valid',
                                fname='test_v7.1.csv',
                                types=self.hparams.types,
                                pathology=self.hparams.pathology,
                                resize=int(self.hparams.shape),
                                fold_idx=None,
                                n_folds=1)
    ds_test.reset_state()
    ag_test = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ]
    ds_test = AugmentImageComponent(ds_test, ag_test, 0)
    ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
    ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
    ds_test = PrintData(ds_test)
    ds_test = MapData(
        ds_test,
        lambda dp: [
            torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),  # NHWC -> NCHW
            torch.tensor(dp[1]).float()
        ])
    return ds_test

def read_and_augment_images(ds):
    def mapf(dp):
        fname = dp[0]
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        dp[0] = im.astype('float32')
        # assume floatbox as input
        assert dp[1].dtype == np.float32
        dp[1] = box_to_point8(dp[1])
        dp.append(fname)
        return dp

    ds = MapData(ds, mapf)

    augs = [
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ]
    ds = AugmentImageComponents(ds, augs, index=(0, ), coords_index=(1, ))

    def unmapf(points):
        boxes = point8_to_box(points)
        return boxes

    ds = MapDataComponent(ds, unmapf, 1)
    return ds

def get_train_dataflow():
    imgs = COCODetection.load_many(config.BASEDIR, config.TRAIN_DATASET)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(filter(lambda img: len(img['boxes']) > 0, imgs))  # log invalid training

    ds = DataFromListOfDict(
        imgs,
        ['file_name', 'boxes', 'class', 'is_crowd'],  # we need these four keys only
        shuffle=True)
    ds = read_and_augment_images(ds)

    def add_anchor_to_dp(dp):
        im, boxes, klass, is_crowd, fname = dp
        try:
            fm_labels, fm_boxes = get_rpn_anchor_input(im, boxes, klass,
                                                       is_crowd)
            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once(
                "Input {} is invalid for training: {}".format(fname, str(e)),
                'warn')
            return None
        return [im, fm_labels, fm_boxes, boxes, klass]

    ds = MapData(ds, add_anchor_to_dp)
    return ds

def get_resnet_train_dataflow():
    imgs = ResnetDetection.load_many(config.BASEDIR, config.TRAIN_DATASET)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(imgs)
    ds = DataFromList(imgs, shuffle=True)
    augmentors = get_resnet_augmentor()

    def preprocess(img):
        fpath, _img_id, label = img['image_data'], img['id'], img['with_ship']
        im = cv2.imread(fpath)
        # ============ Aug ================
        im = cv2.resize(im, (config.RESNET_SIZE, config.RESNET_SIZE))
        augmented = strong_aug()(image=im)
        im = augmented['image']
        # im, multi_mask = do_flip_transpose2(im, multi_mask, type=random.randint(0,7))
        # =================================
        ret = [im, label]
        return ret

    ds = MapData(ds, preprocess)
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, config.RESNET_BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds

def get_debug_dataflow(add_mask=True, imageHW=768):
    """
    Return a debug dataflow. Each datapoint is the ground-truth boxes only,
    decoded from the ship-segmentation annotations.
    """
    imgs = Detection.load_many(
        config.BASEDIR, config.TRAIN_DATASET, add_gt=True, add_mask=add_mask)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(imgs)

    import os
    import pandas as pd
    csv_path = os.path.join(config.BASEDIR, 'train_ship_segmentations_v2.csv')
    df = pd.read_csv(csv_path, engine="python")
    df = df.dropna(axis=0)
    df = df.set_index('ImageId')

    ds = DataFromList(imgs, shuffle=True)

    def preprocess(img):
        fpath, img_id = img['image_data'], img['id']
        multi_mask = getAnnotation(df, img_id)
        im = cv2.imread(fpath)
        im, multi_mask = fix_resize_transform_range(im, multi_mask,
                                                    [imageHW, imageHW], 1.0)
        boxes, klass, masks, is_crowd = multi_mask_to_annotation(multi_mask)
        return boxes

    ds = MapData(ds, preprocess)
    ds = PrefetchDataZMQ(ds, 6)
    return ds

def test_dataloader(self):
    """Build and return the test dataflow."""
    ds_test = CustomDataSet(folder=self.hparams.data,
                            train_or_valid='test',
                            size=np.inf,
                            hparams=self.hparams)
    ds_test.reset_state()
    ag_test = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
        imgaug.ToFloat32(),
    ]
    ds_test = AugmentImageComponent(ds_test, [
        imgaug.Albumentations(AB.CLAHE(p=1)),
    ], 0)
    ds_test = AugmentImageComponent(ds_test, ag_test, 0)
    ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
    # ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
    ds_test = PrintData(ds_test)
    ds_test = MapData(
        ds_test,
        lambda dp: [
            torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
            torch.tensor(dp[1][:, np.newaxis, :, :]).float()
        ])
    return ds_test

def get_train_dataflow(add_mask=True):
    """Return the BRATS training dataflow, optionally with masks."""
    if config.CROSS_VALIDATION:
        imgs = BRATS_SEG.load_from_file(config.BASEDIR, config.TRAIN_DATASET)
    else:
        imgs = BRATS_SEG.load_many(
            config.BASEDIR, config.TRAIN_DATASET, add_gt=False,
            add_mask=add_mask)
    # no filter for training
    imgs = list(imgs)
    ds = DataFromList(imgs, shuffle=True)

    def preprocess(data):
        if config.NO_CACHE:
            fname, gt, im = data['file_name'], data['gt'], data['image_data']
            volume_list, label, weight, _, _ = crop_brain_region(im, gt)
        else:
            volume_list, label, weight, _, _ = data['preprocessed']
        batch = sampler3d(volume_list, label, weight)
        return [batch['images'], batch['weights'], batch['labels']]

    ds = BatchData(MapData(ds, preprocess), config.BATCH_SIZE)
    ds = PrefetchDataZMQ(ds, 6)
    return ds

def prepared(self, num_gpu, batch_size, eval=False):
    # use a single-process version to debug if needed
    if self.min_num_workers == 0:
        ds = MapData(self, self.ex_process.train_process)
    else:
        ds = MultiProcessMapData(self, max(num_gpu, self.min_num_workers),
                                 self.ex_process.train_process)
    return BatchData(ds, batch_size)

def train_dataloader(self):
    ds_train = MultiLabelDataset(folder=self.hparams.data,
                                 is_train='train',
                                 fname='covid_train_v5.csv',
                                 types=self.hparams.types,
                                 pathology=self.hparams.pathology,
                                 resize=int(self.hparams.shape),
                                 balancing=None)
    ds_train.reset_state()
    ag_train = [
        # imgaug.Albumentations(
        #     AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
        imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
        # imgaug.Affine(shear=10),
        imgaug.RandomChooseAug([
            imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
            imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
            imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
        ]),
        imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5)),
        imgaug.RandomOrderAug([
            imgaug.Affine(shear=10,
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
            imgaug.Affine(translate_frac=(0.01, 0.02),
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
            imgaug.Affine(scale=(0.5, 1.0),
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
        ]),
        imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_AREA),
        imgaug.GoogleNetRandomCropAndResize(
            crop_area_fraction=(0.8, 1.0),
            aspect_ratio_range=(0.8, 1.2),
            interp=cv2.INTER_AREA,
            target_shape=self.hparams.shape),
        imgaug.ColorSpace(mode=cv2.COLOR_RGB2GRAY),
        imgaug.ToFloat32(),
    ]
    ds_train = AugmentImageComponent(ds_train, ag_train, 0)
    # Label smoothing
    ag_label = [
        imgaug.BrightnessScale((0.8, 1.2), clip=False),
    ]
    # ds_train = AugmentImageComponent(ds_train, ag_label, 1)
    ds_train = BatchData(ds_train, self.hparams.batch, remainder=True)
    if self.hparams.debug:
        ds_train = FixedSizeData(ds_train, 2)
    ds_train = MultiProcessRunner(ds_train, num_proc=4, num_prefetch=16)
    ds_train = PrintData(ds_train)
    ds_train = MapData(
        ds_train,
        lambda dp: [
            torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),  # NHWC -> NCHW
            torch.tensor(dp[1]).float()
        ])
    return ds_train

def val_dataloader(self):
    """Build and return the validation dataflow."""
    ds_valid = CustomDataSet(folder=self.hparams.data,
                             train_or_valid='valid',
                             size=np.inf,
                             hparams=self.hparams)
    ds_valid.reset_state()
    ag_valid = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
        imgaug.ToFloat32(),
    ]
    ds_valid = AugmentImageComponent(ds_valid, [
        imgaug.Albumentations(AB.CLAHE(p=1)),
    ], 0)
    if self.hparams.types == 6:
        ds_valid = AugmentImageComponents(ds_valid, ag_valid,
                                          [0, 1, 2, 3, 4, 5, 6])
    elif self.hparams.types == 1:
        ds_valid = AugmentImageComponents(ds_valid, ag_valid, [0, 1])
    ds_valid = BatchData(ds_valid, self.hparams.batch, remainder=True)
    ds_valid = MultiProcessRunner(ds_valid, num_proc=4, num_prefetch=16)
    ds_valid = PrintData(ds_valid)
    if self.hparams.types == 6:
        # all 7 components: add a channel axis to each
        ds_valid = MapData(
            ds_valid,
            lambda dp: [
                torch.tensor(d[:, np.newaxis, :, :]).float() for d in dp[:7]
            ])
    elif self.hparams.types == 1:
        ds_valid = MapData(
            ds_valid,
            lambda dp: [
                torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
                torch.tensor(dp[1][:, np.newaxis, :, :]).float(),
            ])
    return ds_valid

def get_train_dataflow():
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),
    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)
    gt_boxes: (N, 4)
    gt_labels: (N,)
    If MODE_MASK, gt_masks: (N, h, w)
    """
    roidbs = list(
        itertools.chain.from_iterable(
            DatasetRegistry.get(x).training_roidbs() for x in cfg.DATA.TRAIN))
    print_class_histogram(roidbs)

    # Filter out images that have no gt boxes, but this filter shall not be applied for testing.
    # The model does support training with empty images, but it is not useful for COCO.
    num = len(roidbs)
    roidbs = list(
        filter(lambda img: len(img["boxes"][img["is_crowd"] == 0]) > 0,
               roidbs))
    logger.info(
        "Filtered {} images which contain no non-crowd groundtruth boxes. Total #images for training: {}"
        .format(num - len(roidbs), len(roidbs)))

    ds = DataFromList(roidbs, shuffle=True)
    preprocess = TrainingDataPreprocessor(cfg)

    if cfg.DATA.NUM_WORKERS > 0:
        if cfg.TRAINER == "horovod":
            # one dataflow for each process, therefore don't need large buffer
            buffer_size = cfg.DATA.NUM_WORKERS * 10
            ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                    buffer_size=buffer_size)
            # MPI does not like fork()
        else:
            buffer_size = cfg.DATA.NUM_WORKERS * 20
            ds = MultiProcessMapData(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                     buffer_size=buffer_size)
    else:
        ds = MapData(ds, preprocess)
    return ds

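# Hedged usage sketch (assumes cfg and the dataset registry are already
# configured; the dict keys come from TrainingDataPreprocessor and are an
# assumption here, following the docstring above):
# df = get_train_dataflow()
# df.reset_state()
# for dp in df:
#     print({k: getattr(v, 'shape', type(v)) for k, v in dp.items()})
#     break
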
def make_data():
    from COCOAllJoints import COCOJoints
    from dataset import Preprocessing
    d = COCOJoints()
    train_data, _ = d.load_data(1)
    from tensorpack.dataflow import DataFromList, MapData, BatchData
    dp = DataFromList(train_data)
    dp = MapData(dp, Preprocessing)
    dp = BatchData(dp, cfg.batch_size, use_list=True)
    dp.reset_state()
    dataiter = dp.get_data()
    return dataiter

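# Hedged usage sketch (assumes COCO annotations and cfg.batch_size are set up):
# dataiter = make_data()
# batch = next(dataiter)  # with use_list=True, each component is a plain list
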
def get_batched_eval_dataflow(name, shard=0, num_shards=1, batch_size=1):
    """
    Args:
        name (str): name of the dataset to evaluate
        shard, num_shards: to get subset of evaluation data
    """
    roidbs = DetectionDataset().load_inference_roidbs(name)

    num_imgs = len(roidbs)
    img_per_shard = num_imgs // num_shards
    img_range = (shard * img_per_shard, (shard + 1) * img_per_shard
                 if shard + 1 < num_shards else num_imgs)

    # no filter for evaluation
    ds = DataFromListOfDictBatched(roidbs[img_range[0]:img_range[1]],
                                   ['file_name', 'id'], batch_size)

    def decode_images(inputs):
        return [[cv2.imread(inp[0], cv2.IMREAD_COLOR), inp[1]]
                for inp in inputs]

    def resize_images(inputs):
        resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE,
                               cfg.PREPROC.MAX_SIZE)
        resized_imgs = [resizer.augment(inp[0]) for inp in inputs]
        org_shapes = [inp[0].shape for inp in inputs]
        scales = [
            np.sqrt(rimg.shape[0] * 1.0 / org_shape[0] * rimg.shape[1] /
                    org_shape[1])
            for rimg, org_shape in zip(resized_imgs, org_shapes)
        ]
        return [[resized_imgs[i], inp[1], scales[i], org_shapes[i][:2]]
                for i, inp in enumerate(inputs)]

    def pad_and_batch(inputs):
        heights, widths, _ = zip(*[inp[0].shape for inp in inputs])
        max_h, max_w = max(heights), max(widths)
        padded_images = np.stack([
            np.pad(inp[0],
                   [[0, max_h - inp[0].shape[0]],
                    [0, max_w - inp[0].shape[1]], [0, 0]], 'constant')
            for inp in inputs
        ])
        return [
            padded_images, [inp[1] for inp in inputs],
            list(zip(heights, widths)), [inp[2] for inp in inputs],
            [inp[3] for inp in inputs]
        ]

    ds = MapData(ds, decode_images)
    ds = MapData(ds, resize_images)
    ds = MapData(ds, pad_and_batch)
    return ds

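# Self-contained sanity check for the padding logic in pad_and_batch above:
# two images of different sizes are zero-padded to a common (max_h, max_w)
# and stacked into one batch. Pure numpy, independent of the dataflow itself.
def _check_pad_and_batch():
    imgs = [np.zeros((480, 640, 3)), np.zeros((512, 600, 3))]
    max_h = max(im.shape[0] for im in imgs)
    max_w = max(im.shape[1] for im in imgs)
    padded = np.stack([
        np.pad(im, [[0, max_h - im.shape[0]], [0, max_w - im.shape[1]],
                    [0, 0]], 'constant') for im in imgs
    ])
    assert padded.shape == (2, 512, 640, 3)
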
def dataflow_to_dataset(df, types, shapes):
    """
    Wrap a dataflow to tf.data.Dataset.
    This function will also reset the dataflow.

    If the dataflow itself is finite, the returned dataset is also finite.
    Therefore, if used for training, you'll need to add `.repeat()` on the
    returned dataset.

    Args:
        df (DataFlow): a dataflow which produces lists
        types ([tf.DType]): list of types, one per component
        shapes ([tf.TensorShape]): list of shapes, one per component

    Returns:
        (tf.data.Dataset)
    """
    # TODO theoretically it can support dict
    assert isinstance(df, DataFlow), df
    assert isinstance(types, (list, tuple)), types
    df = MapData(df, lambda dp: tuple(dp))
    df.reset_state()
    ds = tf.data.Dataset.from_generator(df.get_data, tuple(types),
                                        tuple(shapes))
    return ds

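# Minimal usage sketch for dataflow_to_dataset, assuming a trivial
# one-datapoint dataflow; the dtypes and shapes below are illustrative only.
def _demo_dataflow_to_dataset():
    from tensorpack.dataflow import DataFromList
    df = DataFromList([[np.float32(1.0), np.int32(0)]], shuffle=False)
    dataset = dataflow_to_dataset(df, [tf.float32, tf.int32],
                                  [tf.TensorShape([]), tf.TensorShape([])])
    # the source dataflow is finite; repeat for training as the docstring notes
    return dataset.repeat()
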
def get_resnet_val_dataflow():
    imgs = ResnetDetection.load_many(config.BASEDIR, config.VAL_DATASET)
    imgs = list(imgs)
    # ds = DataFromListOfDict(imgs, ['image_data', 'with_ship', 'id'])
    ds = DataFromList(imgs, shuffle=False)

    def f(img):
        fpath, label = img['image_data'], img['with_ship']
        im = cv2.imread(fpath)
        im = cv2.resize(im, (config.RESNET_SIZE, config.RESNET_SIZE))
        return [im, label]

    ds = MapData(ds, f)
    ds = BatchData(ds, config.RESNET_BATCH)
    ds = PrefetchDataZMQ(ds, 1)
    return ds

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mdb_file', required=True)
    parser.add_argument('--data', choices=['gdb9', 'zinc'], required=True,
                        help='Dataset to use.')
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    ds = LMDBData(args.mdb_file, shuffle=False)
    ds = MapData(ds, lambda dp: loads_msgpack(dp[1]))

    # used by PenalizedLogPScore
    ss = (CycleLengthScore(), LogPScore(),
          SAScore(GraphMolecularMetrics._SA_MODEL))
    conv = get_decoder(args.data, True)

    values = []
    for dp in ds.get_data():
        m = conv.to_mol(dp[1].squeeze(), dp[0])
        row = np.empty(len(ss), dtype=float)
        for i, s in enumerate(ss):
            row[i] = s.compute(m)
        values.append(row)
    values = np.row_stack(values)

    m = np.mean(values, axis=0)
    amin = np.min(values, axis=0)
    amax = np.max(values, axis=0)
    sd = np.std(values, axis=0, ddof=1)

    out = {}
    for s, mv, sdv, mi, mx in zip(ss, m, sd, amin, amax):
        out[s.name] = {'mean': mv, 'std': sdv, 'min': mi, 'max': mx}

    with open(args.output, 'wb') as fout:
        pickle.dump(out, fout)

def get_query_dataflow():
    """Return a dataflow over the PRW query images (no filtering, no
    shuffling)."""
    prw = PRWDataset(cfg.DATA.BASEDIR)
    imgs = prw.load_query()
    # no filter for training
    # test if it can repeat keys
    ds = DataFromList(imgs, shuffle=False)

    aug = imgaug.AugmentorList(
        [CustomResize(cfg.PREPROC.SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)])

    def preprocess(img):
        fname, boxes, re_id_class = (img['file_name'], img['boxes'],
                                     img['re_id_class'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = [im, boxes, re_id_class]
        return ret

    ds = MapData(ds, preprocess)
    return ds

def get_augmented_speech_commands_data(subset, options,
                                       do_multiprocess=True, shuffle=True):
    isTrain = subset == 'train' and do_multiprocess
    shuffle = shuffle if shuffle is not None else isTrain

    ds = SpeechCommandsDataFlow(
        os.path.join(options.data_dir, 'speech_commands_v0.02'), subset,
        shuffle, None)
    if isTrain:
        add_noise_func = functools.partial(_add_noise, noises=ds.noises)
    ds = MapDataComponent(ds, _pad_or_clip_to_desired_sample, index=0)
    ds = MapDataComponent(ds, _to_float, index=0)
    if isTrain:
        ds = MapDataComponent(ds, _time_shift, index=0)
        ds = MapData(ds, add_noise_func)
    ds = BatchData(ds, options.batch_size // options.nr_gpu,
                   remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 4, 4)
    return ds

def get_train_dataflow():
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),
    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)
    gt_boxes: (N, 4)
    gt_labels: (N,)
    If MODE_MASK, gt_masks: (N, h, w)
    """
    roidbs = DetectionDataset().load_training_roidbs(cfg.DATA.TRAIN)
    print_class_histogram(roidbs)

    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    num = len(roidbs)
    roidbs = list(
        filter(lambda img: len(img['boxes'][img['is_crowd'] == 0]) > 0,
               roidbs))
    logger.info(
        "Filtered {} images which contain no non-crowd groundtruth boxes. Total #images for training: {}"
        .format(num - len(roidbs), len(roidbs)))

    ds = DataFromList(roidbs, shuffle=True)

    aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(roidb):
        fname, boxes, klass, is_crowd = (roidb['file_name'], roidb['boxes'],
                                         roidb['class'], roidb['is_crowd'])
        boxes = np.copy(boxes)
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        height, width = im.shape[:2]
        # assume floatbox as input
        assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"

        if not cfg.DATA.ABSOLUTE_COORD:
            boxes[:, 0::2] *= width
            boxes[:, 1::2] *= height

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)
        assert np.min(np_area(boxes)) > 0, "Some boxes have zero area!"

        ret = {'image': im}
        # rpn anchor:
        try:
            if cfg.MODE_FPN:
                multilevel_anchor_inputs = get_multilevel_rpn_anchor_input(
                    im, boxes, is_crowd)
                for i, (anchor_labels,
                        anchor_boxes) in enumerate(multilevel_anchor_inputs):
                    ret['anchor_labels_lvl{}'.format(i + 2)] = anchor_labels
                    ret['anchor_boxes_lvl{}'.format(i + 2)] = anchor_boxes
            else:
                # anchor_labels, anchor_boxes
                ret['anchor_labels'], ret['anchor_boxes'] = \
                    get_rpn_anchor_input(im, boxes, is_crowd)

            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            ret['gt_boxes'] = boxes
            ret['gt_labels'] = klass
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once(
                "Input {} is filtered for training: {}".format(fname, str(e)),
                'warn')
            return None

        if cfg.MODE_MASK:
            # augmentation will modify the polys in-place
            segmentation = copy.deepcopy(roidb['segmentation'])
            segmentation = [
                segmentation[k] for k in range(len(segmentation))
                if not is_crowd[k]
            ]
            assert len(segmentation) == len(boxes)

            # Apply augmentation on polygon coordinates.
            # And produce one image-sized binary mask per box.
            masks = []
            width_height = np.asarray([width, height], dtype=np.float32)
            for polys in segmentation:
                if not cfg.DATA.ABSOLUTE_COORD:
                    polys = [p * width_height for p in polys]
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(
                    segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret['gt_masks'] = masks

            # from viz import draw_annotation, draw_mask
            # viz = draw_annotation(im, boxes, klass)
            # for mask in masks:
            #     viz = draw_mask(viz, mask)
            # tpviz.interactive_imshow(viz)
        return ret

    if cfg.DATA.NUM_WORKERS > 0:
        if cfg.TRAINER == 'horovod':
            # one dataflow for each process, therefore don't need large buffer
            buffer_size = cfg.DATA.NUM_WORKERS * 10
            ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                    buffer_size=buffer_size)
            # MPI does not like fork()
        else:
            buffer_size = cfg.DATA.NUM_WORKERS * 20
            ds = MultiProcessMapDataZMQ(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                        buffer_size=buffer_size)
    else:
        ds = MapData(ds, preprocess)
    return ds

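# Hedged benchmark sketch: tensorpack's TestDataSpeed can measure the raw
# throughput of the dataflow built above (assumes cfg and the dataset are
# already configured):
# from tensorpack.dataflow import TestDataSpeed
# TestDataSpeed(get_train_dataflow(), size=100).start()
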
def get_train_dataflow_davis(add_mask=False):
    # Previous data locations, kept for reference:
    # train_img_path = config.DAVIS_PATH + "train/"
    # train_label_path = config.DAVIS_PATH + "train-gt/"
    # imgs = glob.glob(train_img_path + "*/*.jpg")
    # train_img_path = "/home/luiten/vision/PReMVOS/data/first/bike-trial/lucid_data_dreaming/"
    # train_label_path = "/home/luiten/vision/PReMVOS/data/first/bike-trial/lucid_data_dreaming/"
    # train_img_path = "/home/luiten/vision/PReMVOS/data/"+config.DAVIS_NAME+"/lucid_data_dreaming/"
    # train_label_path = "/home/luiten/vision/PReMVOS/data/"+config.DAVIS_NAME+"/lucid_data_dreaming/"
    # train_img_path = "/home/luiten/vision/youtubevos/ytvos_data/together/generated/augment_images/"
    # train_label_path = "/home/luiten/vision/youtubevos/ytvos_data/together/generated/augment_gt/"
    train_img_path = "/home/luiten/vision/youtubevos/DAVIS/davis_together/augment_images/"
    train_label_path = "/home/luiten/vision/youtubevos/DAVIS/davis_together/augment_gt/"
    imgs = sorted(glob.glob(train_img_path + "*/*.jpg"))

    ds = DataFromList(imgs, shuffle=True)
    aug = imgaug.AugmentorList([
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(fname):
        start = time.time()
        label_fname = fname.replace(train_img_path,
                                    train_label_path).replace(".jpg", ".png")
        pil_label = Image.open(label_fname)
        label = np.array(pil_label)
        instances = np.unique(label)
        instance_classes = [x // 256 for x in instances]

        if len(instances) == 0:
            print("no instances")
            pil_label.close()
            return None

        masks = np.array([label == inst for inst in instances],
                         dtype=np.uint8)
        boxes = np.array(
            [get_bbox_from_segmentation_mask(mask) for mask in masks],
            dtype=np.float32)

        # second_klass = np.array(instance_classes, dtype=int)
        second_klass = np.zeros_like(instance_classes, dtype=int)
        klass = np.ones_like(second_klass)
        is_crowd = np.zeros_like(second_klass)

        res = preproc_img(fname, boxes, klass, second_klass, is_crowd, aug)
        if res is None:
            print("davis: preproc_img returned None on", fname)
            pil_label.close()
            return None
        ret, params = res

        if add_mask:
            do_flip, h, w = params[1]
            assert do_flip in (True, False), do_flip
            # augment the label map, then rebuild the masks from it
            label = np.array(pil_label.resize((w, h), Image.NEAREST))
            if do_flip:
                label = label[:, ::-1]
            masks = np.array([label == inst for inst in instances],
                             dtype=np.uint8)
            ret.append(masks)

        elapsed = time.time() - start
        # print("davis example done, elapsed:", elapsed)

        VISUALIZE = False
        if VISUALIZE:
            from viz import draw_annotation, draw_mask
            config.CLASS_NAMES = [str(idx) for idx in range(81)]
            im = ret[0]
            boxes = ret[3]
            draw_klass = ret[-2]
            viz = draw_annotation(im, boxes, draw_klass)
            for mask in masks:
                viz = draw_mask(viz, mask)
            tpviz.interactive_imshow(viz)

        pil_label.close()
        return ret

    ds = MapData(ds, preprocess)
    # ds = MultiProcessMapData(ds, nr_proc=8, map_func=preprocess, buffer_size=35)
    # ds = MultiProcessMapData(ds, nr_proc=8, map_func=preprocess)
    return ds

def get_train_dataflow_w_unlabeled(load_path):
    """
    Return a training dataflow. Each datapoint consists of the following:

    An image: (h, w, 3),
    1 or more pairs of (anchor_labels, anchor_boxes):
        anchor_labels: (h', w', NA)
        anchor_boxes: (h', w', NA, 4)
    gt_boxes: (N, 4)
    gt_labels: (N,)
    If MODE_MASK, gt_masks: (N, h, w)
    """
    assert os.path.isfile(load_path), "{} not found".format(load_path)
    roidbs = list(
        itertools.chain.from_iterable(
            DatasetRegistry.get(x).training_roidbs() for x in cfg.DATA.TRAIN))
    print_class_histogram(roidbs)

    if "VOC" in cfg.DATA.TRAIN[0]:
        roidbs_u = list(
            itertools.chain.from_iterable(
                DatasetRegistry.get(x).training_roidbs()
                for x in cfg.DATA.UNLABEL))
        unlabled2017_used = False
    else:
        unlabled2017_used = np.any(["@" not in x for x in cfg.DATA.TRAIN])

        def parse_name(x):
            if not unlabled2017_used:
                assert "@" in load_path, (
                    "{}: Did you use wrong pseudo_data.py for "
                    "this model?").format(load_path)
                return x + "-unlabeled"
            else:
                # return coco2017 unlabeled data
                return "coco_unlabeled2017"

        roidbs_u = list(
            itertools.chain.from_iterable(
                DatasetRegistry.get(parse_name(x)).training_roidbs()
                for x in cfg.DATA.TRAIN))
    print_class_histogram(roidbs_u)

    # Filter out images that have no gt boxes, but this filter shall not be applied for testing.
    # The model does support training with empty images, but it is not useful for COCO.
    def remove_no_box_data(_roidbs, filter_fn, dset):
        num = len(_roidbs)
        _roidbs = filter_fn(_roidbs)
        logger.info(
            "Filtered {} images which contain no non-crowd groundtruth boxes. Total {} #images for training: {}"
            .format(num - len(_roidbs), dset, len(_roidbs)))
        return _roidbs

    roidbs = remove_no_box_data(
        roidbs, lambda x: list(
            filter(lambda img: len(img["boxes"][img["is_crowd"] == 0]) > 0, x)
        ), "labeled")

    # load unlabeled
    if unlabled2017_used:
        assert "@" not in load_path, "Did you use the wrong pseudo path"
    pseudo_targets = dd.io.load(load_path)
    logger.info("Loaded {} pseudo targets from {}".format(
        len(pseudo_targets), load_path))
    roidbs_u = remove_no_box_data(
        roidbs_u, lambda x: list(
            filter(
                lambda img: len(pseudo_targets[img["image_id"]]["boxes"]) > 0,
                x)), "unlabeled")

    preprocess = TrainingDataPreprocessorSSlAug(
        cfg, confidence=cfg.TRAIN.CONFIDENCE, pseudo_targets=pseudo_targets)

    ds = DataFrom2List(roidbs, roidbs_u, shuffle=True)

    if cfg.DATA.NUM_WORKERS > 0:
        if cfg.TRAINER == "horovod":
            buffer_size = cfg.DATA.NUM_WORKERS * 10
            ds = MultiThreadMapData(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                    buffer_size=buffer_size)
        else:
            buffer_size = cfg.DATA.NUM_WORKERS * 20
            ds = MultiProcessMapData(ds, cfg.DATA.NUM_WORKERS, preprocess,
                                     buffer_size=buffer_size)
    else:
        ds = MapData(ds, preprocess)
    return ds

def get_eval_dataflow(name, shard=0, num_shards=1):
    seqs = []
    with open("davis2017_fast_val_ids.txt") as f:
        for l in f:
            seqs.append(l.strip())

    seqs_timesteps = []
    for seq in seqs:
        files = sorted(
            glob.glob(cfg.DATA.DAVIS2017_ROOT + "/JPEGImages/480p/" +
                      seq.split("__")[0] + "/*.jpg"))[1:-1]
        timesteps = [f.split('/')[-1].replace(".jpg", "") for f in files]
        for timestep in timesteps:
            ann_fn = (cfg.DATA.DAVIS2017_ROOT + "/Annotations/480p/" +
                      seq.split("__")[0] + '/' + timestep + ".png")
            ann = np.array(PIL.Image.open(ann_fn))
            ann_mask = ann == int(seq.split("__")[1])
            if ann_mask.any():
                seqs_timesteps.append(
                    (seq.split('__')[0], seq.split('__')[1], timestep))
        # seqs_timesteps += [(seq.split('__')[0], seq.split('__')[1], timestep)
        #                    for timestep in timesteps]

    num_seqs_timesteps = len(seqs_timesteps)
    seqs_timesteps_per_shard = num_seqs_timesteps // num_shards
    seqs_timesteps_range = (shard * seqs_timesteps_per_shard,
                            (shard + 1) * seqs_timesteps_per_shard
                            if shard + 1 < num_shards else num_seqs_timesteps)
    ds = DataFromList(
        seqs_timesteps[seqs_timesteps_range[0]:seqs_timesteps_range[1]])

    def preprocess(seq_timestep):
        seq, obj_id, timestep = seq_timestep
        ann_fn = (cfg.DATA.DAVIS2017_ROOT + "/Annotations/480p/" + seq + '/' +
                  timestep + ".png")
        ann = np.array(PIL.Image.open(ann_fn))
        ann_mask = ann == int(obj_id)
        if not ann_mask.any():
            return None, None, None, None, None
            # ann_box = np.array([-1000000, -1000000, 100000, 100000])
        else:
            ann_box = get_bbox_from_segmentation_mask_np(ann_mask)

        ff_fn = (cfg.DATA.DAVIS2017_ROOT + "/Annotations/480p/" + seq + '/' +
                 str(0).zfill(5) + ".png")
        ff = np.array(PIL.Image.open(ff_fn))
        ff_mask = ff == int(obj_id)
        ff_box = get_bbox_from_segmentation_mask_np(ff_mask)

        x1, y1, x2, y2 = [float(x) for x in ann_box]
        target_bbox = np.array([x1, y1, x2, y2], dtype=np.float32)
        x1, y1, x2, y2 = [float(x) for x in ff_box]
        ref_bbox = np.array([x1, y1, x2, y2], dtype=np.float32)

        target_img_fn = (cfg.DATA.DAVIS2017_ROOT + "/JPEGImages/480p/" + seq +
                         "/" + timestep + ".jpg")
        ref_img_fn = (cfg.DATA.DAVIS2017_ROOT + "/JPEGImages/480p/" + seq +
                      "/" + str(0).zfill(5) + ".jpg")
        target_img = cv2.imread(target_img_fn, cv2.IMREAD_COLOR)
        ref_img = cv2.imread(ref_img_fn, cv2.IMREAD_COLOR)
        return ref_img, ref_bbox, target_img, target_bbox, "__".join(
            seq_timestep)

    ds = MapData(ds, preprocess)
    return ds

def get_train_dataflow():
    roidbs = DetectionDataset().load_training_roidbs(cfg.DATA.TRAIN)
    ds = DataFromList(roidbs, shuffle=True)

    # for now let's not do flipping to keep things simple
    aug = imgaug.AugmentorList([
        CustomResize(cfg.PREPROC.TRAIN_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    ])  # , imgaug.Flip(horiz=True)])

    if cfg.MODE_HARD_MINING:
        from annoy import AnnoyIndex
        hard_mining_index = AnnoyIndex(128, 'euclidean')
        hard_mining_index.load(cfg.HARD_MINING_DATA_PATH +
                               "/index_all/index.ann")
        names_path = cfg.HARD_MINING_DATA_PATH + "index_all/names.txt"
        hard_mining_names_all = []
        with open(names_path) as f:
            for l in f:
                hard_mining_names_all.append(l.strip())
        hard_example_names_got = [
            x[7:] for x in hard_mining_names_all if x.startswith("GOT10k/")
        ]
        hard_example_names_vid = [
            x[12:] for x in hard_mining_names_all
            if x.startswith("ImageNetVID/")
        ]
        hard_example_names_ytbvos = [
            x[11:] for x in hard_mining_names_all
            if x.startswith("YouTubeVOS/")
        ]
        hard_example_names_lasot = [
            x[6:] for x in hard_mining_names_all if x.startswith("LaSOT/")
        ]
        assert len(hard_example_names_got) > 0
        assert len(hard_example_names_vid) > 0
        assert len(hard_example_names_ytbvos) > 0
        assert len(hard_example_names_lasot) > 0
        hard_example_names_got.sort()
        hard_example_names_vid.sort()
        hard_example_names_ytbvos.sort()
        hard_example_names_lasot.sort()
        hard_mining_names = {
            "all": hard_mining_names_all,
            "GOT10k": hard_example_names_got,
            "ImageNetVID": hard_example_names_vid,
            "YouTubeVOS": hard_example_names_ytbvos,
            "LaSOT": hard_example_names_lasot
        }
    else:
        hard_mining_index = None
        hard_mining_names = None

    def preprocess(roidb):
        if roidb.startswith("VID/"):
            return _preprocess_imagenet_vid(roidb[4:], aug, hard_mining_index,
                                            hard_mining_names)
        elif roidb.startswith("DAVIS/"):
            return _preprocess_davis_like(
                roidb[6:], aug,
                os.path.join(cfg.DATA.DAVIS2017_ROOT, "Annotations", "480p"))
        elif roidb.startswith("YouTubeVOS/"):
            return _preprocess_davis_like(
                roidb[11:], aug,
                os.path.join(cfg.DATA.YOUTUBE_VOS_ROOT, "train",
                             "Annotations"), "YouTubeVOS", hard_mining_index,
                hard_mining_names)
        elif roidb.startswith("GOT10K/"):
            return _preprocess_got10k(roidb[7:], aug, hard_mining_index,
                                      hard_mining_names)
        elif roidb.startswith("LaSOT/"):
            return _preprocess_lasot(roidb[6:], aug, hard_mining_index,
                                     hard_mining_names)
        elif roidb.startswith("YouTube-BB/"):
            return _preprocess_youtube_bb(roidb[11:], aug)
        elif roidb.startswith("TrackingNet/"):
            return _preprocess_trackingnet(roidb[12:], aug)
        else:
            assert False, "unknown dataset prefix: {}".format(roidb)

    # ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
    # ds = MapData(ds, preprocess)
    if cfg.DATA.DEBUG_VIS or not cfg.DATA.MULTITHREAD:
        ds = MapData(ds, preprocess)
    else:
        # ds = MultiThreadMapData(ds, 6, preprocess)
        ds = MultiThreadMapData(ds, 8, preprocess, buffer_size=80)
    return ds

def train_dataloader(self):
    """Build and return the training dataflow."""
    ds_train = CustomDataSet(folder=self.hparams.data,
                             train_or_valid='train',
                             size=np.inf,
                             hparams=self.hparams)
    ds_train.reset_state()
    ag_train = [
        imgaug.Affine(shear=10, interp=cv2.INTER_NEAREST),
        imgaug.Affine(translate_frac=(0.01, 0.02), interp=cv2.INTER_NEAREST),
        imgaug.Affine(scale=(0.25, 1.0), interp=cv2.INTER_NEAREST),
        imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_NEAREST),
        imgaug.GoogleNetRandomCropAndResize(
            crop_area_fraction=(0.8, 1.0),
            aspect_ratio_range=(0.8, 1.2),
            interp=cv2.INTER_NEAREST,
            target_shape=self.hparams.shape),
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
        imgaug.Flip(horiz=True, vert=False, prob=0.5),
        imgaug.Flip(horiz=False, vert=True, prob=0.5),
        imgaug.Transpose(prob=0.5),
        imgaug.Albumentations(AB.RandomRotate90(p=1)),
        imgaug.ToFloat32(),
    ]
    # photometric augmentations on the image component only
    ds_train = AugmentImageComponent(
        ds_train,
        [
            # imgaug.Float32(),
            # imgaug.RandomChooseAug([
            #     imgaug.Albumentations(AB.IAAAdditiveGaussianNoise(p=0.25)),
            #     imgaug.Albumentations(AB.GaussNoise(p=0.25)),
            # ]),
            # imgaug.ToUint8(),
            imgaug.RandomChooseAug([
                imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
            ]),
            imgaug.RandomChooseAug([
                # imgaug.Albumentations(AB.IAASharpen(p=0.5)),
                # imgaug.Albumentations(AB.IAAEmboss(p=0.5)),
                imgaug.Albumentations(AB.RandomBrightnessContrast(p=0.5)),
            ]),
            imgaug.ToUint8(),
            imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5)),
        ],
        0)
    # geometric augmentations applied jointly to image and mask
    ds_train = AugmentImageComponents(ds_train, ag_train, [0, 1])
    ds_train = BatchData(ds_train, self.hparams.batch, remainder=True)
    if self.hparams.debug:
        ds_train = FixedSizeData(ds_train, 2)
    ds_train = MultiProcessRunner(ds_train, num_proc=4, num_prefetch=16)
    ds_train = PrintData(ds_train)
    ds_train = MapData(
        ds_train,
        lambda dp: [
            torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
            torch.tensor(dp[1][:, np.newaxis, :, :]).float(),
        ])
    return ds_train

def get_dataflow(is_train=True):
    train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
    # train_df = oversample(train_df)
    labels = [[int(i) for i in s.split()] for s in train_df['Target']]
    fnames = train_df['Id'].tolist()
    fnames = [os.path.join(config.TRAIN_DATASET, f) for f in fnames]
    sparse_label = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in labels
    ]

    extra_df = pd.read_csv(
        os.path.join('/data/kaggle/HPA',
                     'HPAv18RGBY_WithoutUncertain_wodpl.csv'))
    # extra_df = oversample(extra_df)
    extra_labels = [[int(i) for i in s.split()] for s in extra_df['Target']]
    extra_labels = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in extra_labels
    ]
    extra_fnames = extra_df['Id'].tolist()
    extra_fnames = [
        os.path.join(config.EXTRA_DATASET, f) for f in extra_fnames
    ]
    fnames = fnames + extra_fnames
    sparse_label = sparse_label + extra_labels

    fnames = np.array(fnames)
    sparse_label = np.array(sparse_label)

    msss = MultilabelStratifiedShuffleSplit(n_splits=1,
                                            test_size=0.15,
                                            random_state=42)
    for train_index, test_index in msss.split(fnames, sparse_label):
        x_train, x_test = fnames[train_index], fnames[test_index]
        y_train, y_test = sparse_label[train_index], sparse_label[test_index]
    holdout_data = list(zip(x_test, y_test))

    # 5-fold the rest; random_state requires shuffle=True
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    for fold_num, (train_index,
                   test_index) in enumerate(mskf.split(x_train, y_train)):
        if fold_num == config.FOLD:
            foldx_train, foldx_test = x_train[train_index], x_train[test_index]
            foldy_train, foldy_test = y_train[train_index], y_train[test_index]
            break

    train_data = list(zip(foldx_train, foldy_train))
    val_data = list(zip(foldx_test, foldy_test))
    train_data = oversample_2(train_data)

    pseudo_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'LB623.csv'))
    pseudo_fnames = pseudo_df['Id'].tolist()
    pseudo_fnames = [
        os.path.join(config.TEST_DATASET, f) for f in pseudo_fnames
    ]
    # pseudo_labels = np.load("./SOTA.npy")
    # pseudo_labels = [np.array(_) for _ in pseudo_labels]
    pseudo_labels = [[int(i) for i in s.split()]
                     for s in pseudo_df['Predicted']]
    pseudo_labels = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in pseudo_labels
    ]
    pseudo_data = list(zip(pseudo_fnames, pseudo_labels))
    train_data = train_data + pseudo_data

    print("train: ", len(train_data), len(val_data))

    if not is_train:
        return val_data

    ds = DataFromList(train_data, shuffle=True)
    ds = BatchData(MapData(ds, preprocess), config.BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds

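# Self-contained sanity check for the multi-hot encoding used above: indexing
# np.eye by the class list and summing over axis 0 yields a multi-hot vector.
# The class count here is illustrative only.
def _check_multi_hot():
    num_class = 5
    la = [0, 3]
    vec = np.eye(num_class, dtype=float)[np.array(la)].sum(axis=0)
    assert vec.tolist() == [1.0, 0.0, 0.0, 1.0, 0.0]
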
                    default=32, type=int)
parser.add_argument('--benchmark', action='store_true')
parser.add_argument('--no-zmq-ops', action='store_true')
args = parser.parse_args()

os.environ['CUDA_VISIBLE_DEVICES'] = ''

if args.fake:
    ds = FakeData([[args.batch, 224, 224, 3], [args.batch]],
                  1000,
                  random=False,
                  dtype=['uint8', 'int32'])
else:
    augs = fbresnet_augmentor(True)
    ds = get_data(args.batch, augs)

logger.info("Serving data on {}".format(socket.gethostname()))

if args.benchmark:
    from zmq_ops import dump_arrays
    ds = MapData(ds, dump_arrays)
    TestDataSpeed(ds, warmup=300).start()
else:
    format = None if args.no_zmq_ops else 'zmq_ops'
    send_dataflow_zmq(ds,
                      'ipc://@imagenet-train-b{}'.format(args.batch),
                      hwm=150,
                      format=format,
                      bind=True)

def get_train_dataflow(add_mask=False):
    """
    Return a training dataflow. Each datapoint is:
        image, fm_labels, fm_boxes, gt_boxes, gt_class [, masks]
    """
    imgs = COCODetection.load_many(config.BASEDIR,
                                   config.TRAIN_DATASET,
                                   add_gt=True,
                                   add_mask=add_mask)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(filter(lambda img: len(img['boxes']) > 0, imgs))  # log invalid training

    ds = DataFromList(imgs, shuffle=True)
    aug = imgaug.AugmentorList([
        CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
        imgaug.Flip(horiz=True)
    ])

    def preprocess(img):
        fname, boxes, klass, is_crowd = (img['file_name'], img['boxes'],
                                         img['class'], img['is_crowd'])
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        assert im is not None, fname
        im = im.astype('float32')
        # assume floatbox as input
        assert boxes.dtype == np.float32

        # augmentation:
        im, params = aug.augment_return_params(im)
        points = box_to_point8(boxes)
        points = aug.augment_coords(points, params)
        boxes = point8_to_box(points)

        # rpn anchor:
        try:
            fm_labels, fm_boxes = get_rpn_anchor_input(im, boxes, klass,
                                                       is_crowd)
            boxes = boxes[is_crowd == 0]  # skip crowd boxes in training target
            klass = klass[is_crowd == 0]
            if not len(boxes):
                raise MalformedData("No valid gt_boxes!")
        except MalformedData as e:
            log_once(
                "Input {} is invalid for training: {}".format(fname, str(e)),
                'warn')
            return None

        ret = [im, fm_labels, fm_boxes, boxes, klass]

        # masks
        segmentation = img.get('segmentation', None)
        if segmentation is not None:
            segmentation = [
                segmentation[k] for k in range(len(segmentation))
                if not is_crowd[k]
            ]
            assert len(segmentation) == len(boxes)
            # one image-sized binary mask per box
            masks = []
            for polys in segmentation:
                polys = [aug.augment_coords(p, params) for p in polys]
                masks.append(
                    segmentation_to_mask(polys, im.shape[0], im.shape[1]))
            masks = np.asarray(masks, dtype='uint8')  # values in {0, 1}
            ret.append(masks)

        # from viz import draw_annotation, draw_mask
        # viz = draw_annotation(im, boxes, klass)
        # for mask in masks:
        #     viz = draw_mask(viz, mask)
        # tpviz.interactive_imshow(viz)
        return ret

    ds = MapData(ds, preprocess)
    ds = PrefetchDataZMQ(ds, 1)
    return ds

def get_dataflow(is_train=True):
    train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
    labels = [[int(i) for i in s.split()] for s in train_df['Target']]
    # reduce the multi-label targets to a binary label for MODEL_LABEL
    binary_label = []
    for la in labels:
        if MODEL_LABEL in la:
            binary_label.append([1])
        else:
            binary_label.append([0])
    fnames = train_df['Id'].tolist()
    fnames = [os.path.join(config.TRAIN_DATASET, f) for f in fnames]
    sparse_label = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in binary_label
    ]

    if config.EXTRA:
        extra_df = pd.read_csv(
            os.path.join('/data/kaggle/HPA', 'HPAv18RBGY_wodpl.csv'))
        extra_labels = [[int(i) for i in s.split()]
                        for s in extra_df['Target']]
        binary_label = []
        for la in extra_labels:
            if MODEL_LABEL in la:
                binary_label.append([1])
            else:
                binary_label.append([0])
        extra_labels = [
            np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
            for la in binary_label
        ]
        extra_fnames = extra_df['Id'].tolist()
        extra_fnames = [
            os.path.join(config.EXTRA_DATASET, f) for f in extra_fnames
        ]
        fnames = fnames + extra_fnames
        sparse_label = sparse_label + extra_labels
        # extra_data = list(zip(extra_fnames, extra_labels))

    fnames = np.array(fnames)
    sparse_label = np.array(sparse_label)
    print(fnames.shape[0])

    msss = MultilabelStratifiedShuffleSplit(n_splits=1,
                                            test_size=0.1,
                                            random_state=42)
    for train_index, test_index in msss.split(fnames, sparse_label):
        x_train, x_test = fnames[train_index], fnames[test_index]
        y_train, y_test = sparse_label[train_index], sparse_label[test_index]

    train_data = list(zip(x_train, y_train))
    val_data = list(zip(x_test, y_test))

    if not is_train:
        return val_data

    ds = DataFromList(train_data, shuffle=True)
    ds = BatchData(MapData(ds, preprocess), config.BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds