def test_dataset(dataset):
    """Iterate the training loader for several epochs and check that each
    item of ``dataset`` was sampled roughly ``num_passes`` times (±1)."""
    splitter = InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=ExactlyOneSampler(),
        past_length=10,
        future_length=5,
        dummy_value=1.0,
    )

    loader = TrainDataLoader(
        dataset=dataset,
        transform=splitter,
        batch_size=batch_size,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=num_workers,
    )

    # Count how often each item id shows up across all epochs.
    seen = defaultdict(int)
    for _ in range(num_epochs):
        for batch in islice(loader, num_batches_per_epoch):
            for item_id in batch["item_id"]:
                seen[item_id] += 1

    # Every series should have been visited close to ``num_passes`` times.
    for idx in range(len(dataset)):
        assert num_passes - 1 <= seen[idx] <= num_passes + 1
def test_training_loader_soft_constraint_03() -> None:
    """CASE 03: a single worker must traverse every time series in one sweep
    without depleting its iterator."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=1,  # This is the crucial difference
    )

    counts = get_transformation_counts(
        list(islice(loader, int(3 * expected_batches)))
    )
    assert all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
def test_training_loader_batch_size_hard_constraint() -> None:
    """Every batch emitted by the training loader must contain exactly
    BATCH_SIZE entries, both with and without shuffling."""
    (
        dataset,
        transform,
        _pred_length,
        _reference_batches,
    ) = get_dataset_and_transformation()

    plain_loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )
    shuffling_loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    plain_batches = list(islice(plain_loader, 30))
    shuffled_batches = list(islice(shuffling_loader, 30))

    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in plain_batches
    ), "Not every batch from training loader is right size."
    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in shuffled_batches
    ), "Not every batch from training loader is right size, with shuffling on."
def test_training_loader_soft_constraint_02() -> None:
    """CASE 02: drawing only half the batches of a full pass cannot visit
    every time series."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    counts = get_transformation_counts(
        list(islice(loader, int(0.5 * expected_batches)))
    )
    assert not all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "It should not have been possible to process every time series once. "
def test_inference_loader_equivalence() -> None:
    """The multiprocessing InferenceDataLoader must yield content equivalent
    to the single-process one on every pass, with correct batch size and
    context."""
    (
        dataset,
        transform,
        _pred_length,
        _reference_batches,
    ) = get_dataset_and_transformation()

    expected_ctx = current_context()

    # Single-process reference result.
    reference_result = list(
        InferenceDataLoader(
            dataset=dataset,
            transform=transform,
            batch_size=BATCH_SIZE,
            num_workers=0,  # This is the crucial difference
            ctx=current_context(),
        )
    )

    mp_loader = InferenceDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
    )

    # Two full passes: a second iteration must also work.
    first_pass = list(mp_loader)
    second_pass = list(mp_loader)

    reference_counts = get_transformation_counts(reference_result)

    assert (
        get_transformation_counts(first_pass) == reference_counts
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."
    assert (
        get_transformation_counts(second_pass) == reference_counts
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."
    assert (
        len(second_pass[1]["item_id"]) == BATCH_SIZE
    ), "Incorrect batch size from multiprocessing."
    assert (
        second_pass[0]["past_target"].context == expected_ctx
    ), "Batches in incorrect context"
def get_dataset_and_transformation():
    """Build the shared test fixtures: a constant dataset with randomized
    series lengths, its instance-splitting transformation, the prediction
    length, and a single-process reference pass of the data.

    The result is cached in the module-level ``_data_cache`` since it is
    expensive to recompute.
    """
    global _data_cache
    if _data_cache is not None:
        return _data_cache

    # Constant dataset: each time series holds a unique constant integer.
    dataset = ConstantDataset(
        num_steps=CD_NUM_STEPS, num_timeseries=CD_NUM_TIME_SERIES
    )
    entries = list(dataset.train)
    for entry in entries:
        entry["start"] = pd.Timestamp(
            ts_input=entry["start"], freq=dataset.freq
        )
        # Randomize the series lengths by repeating the constant target.
        entry["target"] = np.array(
            entry["target"]
            * random.randint(1, CD_MAX_LEN_MULTIPLICATION_FACTOR)
        )
    list_dataset = ListDataset(data_iter=entries, freq=dataset.freq)
    pred_length = dataset.prediction_length

    # Use every admissible time point to split the time series.
    transformation = InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=UniformSplitSampler(
            p=1.0,
            min_future=pred_length,
        ),
        past_length=CONTEXT_LEN,
        future_length=pred_length,
        dummy_value=1.0,
    )

    # Reference: single-process pass over the transformed dataset.
    reference_batches = list(
        ValidationDataLoader(
            dataset=list_dataset,
            transform=transformation,
            batch_size=BATCH_SIZE,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=None,  # This is the crucial difference
        )
    )

    _data_cache = (
        list_dataset,
        transformation,
        pred_length,
        reference_batches,
    )
    return _data_cache
def _prepare_image(img, nrow=8, padding=2, square_image=False):
    """Convert an image of format HW, CHW, or NCHW into an HWC image.

    A batch of images (NCHW) is stitched into a single grid. Float inputs
    must lie in [0, 1] and are rescaled to [0, 255]; ``uint8`` inputs are
    passed through unchanged.
    """
    if isinstance(img, np.ndarray):
        img = nd.array(img, dtype=img.dtype, ctx=current_context())
    if not isinstance(img, NDArray):
        raise TypeError('expected MXNet NDArray or numpy.ndarray, '
                        'while received type {}'.format(str(type(img))))
    assert img.ndim in (2, 3, 4)

    if img.dtype == np.uint8:
        grid = make_image_grid(
            img, nrow=nrow, padding=padding, square_image=square_image)
        return grid.transpose((1, 2, 0))

    if img.dtype == np.float32 or img.dtype == np.float64:
        min_val = img.min().asscalar()
        max_val = img.max().asscalar()
        if min_val < 0.0:
            raise ValueError('expected non-negative min value from img, '
                             'while received {}'.format(min_val))
        if max_val > 1.0:
            raise ValueError('expected max value from img not greater than 1, '
                             'while received {}'.format(max_val))
        # Rescale [0, 1] floats to the full uint8 range.
        grid = make_image_grid(
            img, nrow=nrow, padding=padding, square_image=square_image)
        img = grid * 255.0
        return img.astype(np.uint8).transpose((1, 2, 0))

    raise ValueError('expected input image dtype is one of uint8, float32, '
                     'and float64, received dtype {}'.format(str(img.dtype)))
def test_training_loader_soft_constraint_01() -> None:
    """CASE 01: drawing three full passes' worth of batches must visit every
    time series at least once."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    counts = get_transformation_counts(
        list(islice(loader, int(3 * expected_batches)))
    )
    assert all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "Not every time series processed at least once."
def test_dataset(dataset):
    """Iterate the training loader for several epochs and check that each
    item of ``dataset`` was sampled roughly ``num_passes`` times (±1)."""

    class ExactlyOneSampler(InstanceSampler):
        # Always picks the first admissible split point, so every series
        # contributes exactly one training instance per pass.
        def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
            window_size = b - a + 1
            assert window_size > 0
            return np.array([a])

    splitter = InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        train_sampler=ExactlyOneSampler(),
        past_length=10,
        future_length=5,
        dummy_value=1.0,
    )

    loader = TrainDataLoader(
        dataset=dataset,
        transform=splitter,
        batch_size=batch_size,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=num_workers,
    )

    # Count how often each item id shows up across all epochs.
    seen = defaultdict(int)
    for _ in range(num_epochs):
        for batch in islice(loader, num_batches_per_epoch):
            for item_id in batch["item_id"]:
                seen[item_id] += 1

    for idx in range(len(dataset)):
        assert num_passes - 1 <= seen[idx] <= num_passes + 1
def test_validation_data_loader(dataset_context):
    """Validation loader must mark batches as training data, visit every
    item exactly once per pass, and be deterministic across passes."""
    with dataset_context as dataset:
        dataset_length = len(list(dataset))

        dl = ValidationDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=4,
            stack_fn=partial(batchify, ctx=current_context()),
        )

        batches = list(dl)

        for batch in batches:
            assert all(x is True for x in batch["is_train"])

        counter = count_item_ids(batches)
        for entry in dataset:
            assert counter[entry[FieldName.ITEM_ID]] == 1

        # BUG FIX: the original `assert (b1 == b2 for b1, b2 in zip(...))`
        # asserted a generator object, which is always truthy, so nothing
        # was ever compared. Check determinism explicitly instead: a second
        # pass must yield the same number of batches with the same item
        # coverage.
        batches_again = list(dl)
        assert len(batches_again) == len(batches)
        assert count_item_ids(batches_again) == counter
def test_training_loader_soft_constraint_01() -> None:
    """CASE 01: three full passes' worth of batches must visit every time
    series at least once."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * expected_batches),
    )

    # give all the workers a little time to get ready, so they can start at the same time
    time.sleep(1.5)

    counts = get_transformation_counts(list(loader))

    # Every time series id should appear in the counts.
    assert all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "Not every time series processed at least once."
def test_training_loader_soft_constraint_03() -> None:
    """CASE 03: a single worker must traverse every time series in one sweep
    without depleting its iterator."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=1,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * expected_batches),
    )

    counts = get_transformation_counts(list(loader))

    # Every time series id should appear in the counts.
    assert all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
def test_training_loader_soft_constraint_02() -> None:
    """CASE 02: drawing only half the batches of a full pass cannot visit
    every time series."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    # Expected number of batches for one full pass over the dataset.
    expected_batches = len(reference_batches)

    loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(0.5 * expected_batches),
    )

    counts = get_transformation_counts(list(loader))

    # At least one time series id must be missing from the counts.
    assert not all(
        k in counts for k in range(CD_NUM_TIME_SERIES)
    ), "It should not have been possible to process every time series once. "
def test_training_loader_batch_size_hard_constraint() -> None:
    """Every batch emitted by the training loader must contain exactly
    BATCH_SIZE entries, both with and without shuffling."""
    (
        dataset,
        transform,
        _pred_length,
        _reference_batches,
    ) = get_dataset_and_transformation()

    plain_loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
    )
    shuffling_loader = TrainDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    plain_batches = list(plain_loader)
    shuffled_batches = list(shuffling_loader)

    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in plain_batches
    ), "Not every batch from training loader is right size."
    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in shuffled_batches
    ), "Not every batch from training loader is right size, with shuffling on."
def test_validation_loader_equivalence() -> None:
    """The multiprocessing ValidationDataLoader must cover the whole dataset
    and match the single-process reference on every pass."""
    (
        dataset,
        transform,
        _pred_length,
        reference_batches,
    ) = get_dataset_and_transformation()

    loader = ValidationDataLoader(
        dataset=dataset,
        transform=transform,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    # Two full passes: a second iteration must also work.
    first_pass = list(loader)
    second_pass = list(loader)

    reference_counts = get_transformation_counts(reference_batches)

    assert len(dataset.list_data) == len(
        get_transformation_counts(first_pass)
    ), "The dataloaders do not cover the whole dataset. Check that each time series was assigned at least one worker."
    assert (
        get_transformation_counts(first_pass) == reference_counts
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."
    assert (
        get_transformation_counts(second_pass) == reference_counts
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."
    assert (
        second_pass[0]["past_target"].context == current_context()
    ), "Batches in incorrect context"
def test_training_data_loader(dataset_context, num_workers):
    """Run many short epochs over the training loader; with at most one
    worker, every item must be seen at least once."""
    with dataset_context as dataset:
        dataset_length = len(list(dataset))
        batch_size = 4

        loader = TrainDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=num_workers,
        )

        num_epochs = 20
        epoch_length = 2
        passes_through_dataset = int(
            (num_epochs * epoch_length * batch_size) / dataset_length
        )

        # Sanity checks that the test setup is meaningful:
        # we want to go over the dataset multiple times
        assert passes_through_dataset >= 10
        # we want each epoch to be shorter than the dataset
        assert epoch_length * batch_size < dataset_length

        batches = []
        for _ in range(num_epochs):
            for batch in islice(loader, epoch_length):
                assert all(x is True for x in batch["is_train"])
                batches.append(batch)

        counter = count_item_ids(batches)
        # With multiple workers, coverage is not guaranteed; only check the
        # single-process / single-worker cases.
        if num_workers is None or num_workers == 1:
            for entry in dataset:
                assert counter[entry[FieldName.ITEM_ID]] >= 1
def _make_sprite_image(images, save_path):
    """Arrange a batch of images into a square sprite and save it as
    ``sprite.png`` under ``save_path``, following the rule defined in
    https://www.tensorflow.org/programmers_guide/embedding."""
    if isinstance(images, np.ndarray):
        images = nd.array(images, dtype=images.dtype, ctx=current_context())
    elif not isinstance(images, (NDArray, np.ndarray)):
        raise TypeError('images must be an MXNet NDArray or numpy.ndarray,'
                        ' while received type {}'.format(str(type(images))))
    assert isinstance(images, NDArray)

    # Square-ish grid: side length is ceil(sqrt(batch size)).
    grid_side = int(np.ceil(np.sqrt(images.shape[0])))
    _save_image(
        images,
        os.path.join(save_path, 'sprite.png'),
        nrow=grid_side,
        padding=0,
    )
def test_flatten_slice_after_conv():
    """Binding and running slice(flatten(conv(...))) should succeed."""
    data = mx.symbol.Variable('data')
    weight = mx.symbol.Variable('weight')
    bias = mx.symbol.Variable('bias')

    conv1 = mx.symbol.Convolution(
        data=data,
        weight=weight,
        bias=bias,
        name='conv1',
        num_filter=64,
        kernel=(3, 3),
        stride=(1, 1),
    )
    sliced = mx.symbol.slice(
        data=mx.symbol.flatten(data=conv1), begin=0, end=1
    )

    shape = (2, 16, 16, 16)
    val = np.random.rand(*shape).astype(np.float32)

    exe = sliced._simple_bind(context.current_context(), data=shape)
    exe.arg_arrays[0][:] = val
    exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape)
    exe.arg_arrays[2][:] = np.random.normal(size=exe.arg_arrays[2].shape)

    out = exe.forward(is_train=False)
    out[0].wait_to_read()
    print(out[0])
def test_inference_data_loader(dataset_context):
    """Inference loader must visit every item exactly once and flag all
    batches as non-training data."""
    with dataset_context as dataset:
        dataset_length = len(list(dataset))

        loader = InferenceDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=4,
            stack_fn=partial(batchify, ctx=current_context()),
        )

        batches = list(loader)
        for batch in batches:
            assert all(x is False for x in batch["is_train"])

        counter = count_item_ids(batches)
        for entry in dataset:
            assert counter[entry[FieldName.ITEM_ID]] == 1
def test_context():
    # Verify that the default MXNet context is thread-local: a context set
    # in a spawned thread must not leak into (or read from) other threads.
    ctx_list = []
    ctx_list.append(context.current_context())

    def f():
        # Runs in a spawned thread: set that thread's default context.
        set_default_context(mx.gpu(11))
        ctx_list.append(context.current_context())

    thread = threading.Thread(target=f)
    thread.start()
    thread.join()
    # Main thread still sees the default cpu(0) context ...
    assert Context.devtype2str[ctx_list[0].device_typeid] == "cpu"
    assert ctx_list[0].device_id == 0
    # ... while the spawned thread saw its own gpu(11) context.
    assert Context.devtype2str[ctx_list[1].device_typeid] == "gpu"
    assert ctx_list[1].device_id == 11

    # Second scenario: a `with` context entered in one thread must survive
    # while another thread enters a different context scope.
    e1 = threading.Event()
    e2 = threading.Event()
    status = [False]

    def g():
        with mx.cpu(10):
            e2.set()   # signal the main thread we are inside the scope
            e1.wait()  # wait until the main thread has entered its own scope
            # The spawned thread's context must still be cpu(10).
            if context.current_context().device_id == 10:
                status[0] = True

    thread = threading.Thread(target=g)
    thread.start()
    e2.wait()
    with Context("cpu", 11):
        e1.set()
        thread.join()
    e1.clear()
    e2.clear()
    assert status[0], "Spawned thread didn't set the correct context"
def g():
    # Runs in a spawned thread; e1, e2 and status are presumably closure
    # variables from the enclosing test — verify against the caller.
    with mx.cpu(10):
        e2.set()   # signal that this thread is inside its context scope
        e1.wait()  # wait for the other thread to enter its own scope
        # The context seen here must still be the thread-local cpu(10).
        if context.current_context().device_id == 10:
            status[0] = True
def f():
    # Runs in a spawned thread; ctx_list is presumably a closure variable
    # from the enclosing test. Setting the default context here should be
    # thread-local and is recorded for the caller to inspect.
    set_default_context(mx.gpu(11))
    ctx_list.append(context.current_context())