def test_bucketing(self):
    r"""Tests bucketing.
    """
    hparams = copy.deepcopy(self._hparams)
    hparams.update({
        "bucket_boundaries": [7],
        "bucket_batch_sizes": [6, 4]})
    text_data = MonoTextData(hparams)
    iterator = DataIterator(text_data)

    hparams.update({
        "bucket_boundaries": [7],
        "bucket_batch_sizes": [7, 7],
        "allow_smaller_final_batch": False})
    text_data_1 = MonoTextData(hparams)
    iterator_1 = DataIterator(text_data_1)

    for data_batch, data_batch_1 in zip(iterator, iterator_1):
        length = data_batch['length'][0]
        if length < 7:
            last_batch_size = hparams['num_epochs'] % 6
            self.assertTrue(
                len(data_batch['text']) == 6 or
                len(data_batch['text']) == last_batch_size)
        else:
            last_batch_size = hparams['num_epochs'] % 4
            self.assertTrue(
                len(data_batch['text']) == 4 or
                len(data_batch['text']) == last_batch_size)

        self.assertEqual(len(data_batch_1['text']), 7)
def _run_and_test(self, hparams):
    # Construct database
    text_data = MonoTextData(hparams)
    self.assertEqual(
        text_data.vocab.size,
        self._vocab_size + len(text_data.vocab.special_tokens))

    iterator = DataIterator(text_data)

    for data_batch in iterator:
        # Run the logic
        self.assertEqual(set(data_batch.keys()),
                         set(text_data.list_items()))

        # Test utterance count
        utt_ind = np.sum(data_batch["text_ids"], 2) != 0
        utt_cnt = np.sum(utt_ind, 1)
        self.assertListEqual(
            data_batch[text_data.utterance_cnt_name].tolist(),
            utt_cnt.tolist())

        if text_data.hparams.dataset.pad_to_max_seq_length:
            max_l = text_data.hparams.dataset.max_seq_length
            max_l += text_data._decoder.added_length
            for x in data_batch['text']:
                for xx in x:
                    self.assertEqual(len(xx), max_l)
            for x in data_batch['text_ids']:
                for xx in x:
                    self.assertEqual(len(xx), max_l)
def test_dynamic_batching(self):
    r"""Tests dynamic batching using
    :class:`texar.torch.data.BatchingStrategy`.
    """
    sent_lengths = np.random.randint(10, 20, size=(100,))
    sentences = [['a'] * length for length in sent_lengths]
    data_source = SequenceDataSource(sentences)

    class CustomData(DataBase):
        def __init__(self, source):
            super().__init__(source)

        def process(self, raw_example):
            return raw_example

        def collate(self, examples):
            return Batch(len(examples), text=examples)

    train_data = CustomData(data_source)

    batch_size = 5
    max_tokens = 75
    strategy = TokenCountBatchingStrategy(max_tokens, batch_size, len)
    iterator = DataIterator(train_data, strategy)

    for batch in iterator:
        self.assertLessEqual(len(batch), batch_size)
        self.assertLessEqual(sum(len(s) for s in batch.text), max_tokens)
def _test_modes_with_workers(self, lazy_mode: str, cache_mode: str,
                             num_workers: int):
    from tqdm import tqdm

    gc.collect()
    with work_in_progress(f"Data loading with lazy mode '{lazy_mode}' "
                          f"and cache mode '{cache_mode}' "
                          f"with {num_workers} workers"):
        # Re-read process memory at each checkpoint so the printed
        # values reflect the current stage rather than a stale reading.
        print(f"Memory before: {get_process_memory():.2f} MB")
        with work_in_progress("Construction"):
            data = ParallelData(self.source,
                                '../../Downloads/src.vocab',
                                '../../Downloads/tgt.vocab',
                                {'batch_size': self.batch_size,
                                 'lazy_strategy': lazy_mode,
                                 'cache_strategy': cache_mode,
                                 'num_parallel_calls': num_workers,
                                 'shuffle': False,
                                 'allow_smaller_final_batch': False,
                                 'max_dataset_size': 100000})
        print(f"Memory after construction: {get_process_memory():.2f} MB")
        iterator = DataIterator(data)
        with work_in_progress("Iteration"):
            for batch in tqdm(iterator, leave=False):
                self.assertEqual(batch.batch_size, self.batch_size)
        gc.collect()
        print(f"Memory after iteration: {get_process_memory():.2f} MB")
        with work_in_progress("2nd iteration"):
            for batch in tqdm(iterator, leave=False):
                self.assertEqual(batch.batch_size, self.batch_size)
def setUp(self):
    self.batch_size = 4
    self.num_label = len(pathologies)
    data_hparams = {
        "datasource": {
            "img_root": "tests/test_iu_xray_data/iu_xray_images",
            "text_root": "tests/test_iu_xray_data/text_root",
            "vocab_path": "tests/test_iu_xray_data/test_vocab.txt",
            "transforms": transforms,
            "pathologies": pathologies,
        },
        "batch_size": self.batch_size,
        "shuffle": False,
    }
    dataset = IU_XRay_Dataset(data_hparams)
    dataset.to(torch.device('cpu'))
    self.loader = DataIterator(dataset)

    self.extractor = SimpleFusionEncoder()
    mlc_hparam = {
        'num_tags': len(pathologies),
    }
    self.mlc = MLC(mlc_hparam)
    self.mlc_trainer = MLCTrainer(mlc_hparam)
    self.loss = torch.nn.BCEWithLogitsLoss()
def _run_and_test(self, hparams):
    # Construct database
    record_data = RecordData(hparams)
    iterator = DataIterator(record_data)

    def _prod(lst):
        res = 1
        for i in lst:
            res *= i
        return res

    for idx, data_batch in enumerate(iterator):
        self.assertEqual(set(data_batch.keys()),
                         set(record_data.list_items()))

        # Check data consistency
        for key in self._unconvert_features:
            value = data_batch[key][0]
            self.assertEqual(value, self._dataset_valid[key][idx])
        self.assertEqual(list(data_batch['shape'][0]),
                         list(self._dataset_valid['shape'][idx]))

        # Check data type conversion
        for key, item in self._feature_convert_types.items():
            dtype = get_numpy_dtype(item)
            value = data_batch[key][0]
            if dtype is np.str_:
                self.assertIsInstance(value, str)
            elif dtype is np.bytes_:
                self.assertIsInstance(value, bytes)
            else:
                if isinstance(value, torch.Tensor):
                    value_dtype = get_numpy_dtype(value.dtype)
                else:
                    value_dtype = value.dtype
                dtype_matched = np.issubdtype(value_dtype, dtype)
                self.assertTrue(dtype_matched)

        # Check image decoding and resize
        if hparams["dataset"].get("image_options"):
            image_options = hparams["dataset"].get("image_options")
            if isinstance(image_options, dict):
                image_options = [image_options]
            for image_option_feature in image_options:
                image_key = image_option_feature.get("image_feature_name")
                if image_key is None:
                    continue
                image_gen = data_batch[image_key][0]
                image_valid_shape = self._dataset_valid["shape"][idx]
                resize_height = image_option_feature.get("resize_height")
                resize_width = image_option_feature.get("resize_width")
                if resize_height and resize_width:
                    self.assertEqual(
                        image_gen.shape[0] * image_gen.shape[1],
                        resize_height * resize_width)
                else:
                    self.assertEqual(_prod(image_gen.shape),
                                     _prod(image_valid_shape))
def _run_and_test(self, hparams, test_batch_size=False,
                  test_transform=False):
    # Construct database
    text_data = MonoTextData(hparams)
    self.assertEqual(
        text_data.vocab.size,
        self._vocab_size + len(text_data.vocab.special_tokens))

    iterator = DataIterator(text_data)

    for data_batch in iterator:
        self.assertEqual(set(data_batch.keys()),
                         set(text_data.list_items()))

        if test_batch_size:
            self.assertEqual(len(data_batch['text']),
                             hparams['batch_size'])

        if test_transform:
            for i in range(len(data_batch['text'])):
                text_ = data_batch['text'][i]
                self.assertTrue(text_ in self.upper_cased_text)

        max_seq_length = text_data.hparams.dataset.max_seq_length
        mode = text_data.hparams.dataset.length_filter_mode

        max_l = max_seq_length
        if max_seq_length is not None:
            if text_data.hparams.dataset.eos_token != '':
                max_l += 1
            if text_data.hparams.dataset.bos_token != '':
                max_l += 1

        if max_seq_length == 6:
            for length in data_batch['length']:
                self.assertLessEqual(length, max_l)
            if mode == "discard":
                for length in data_batch['length']:
                    self.assertEqual(length, 5)
            elif mode == "truncate":
                num_length_6 = 0
                for length in data_batch['length']:
                    num_length_6 += int(length == 6)
                self.assertGreater(num_length_6, 0)
            else:
                raise ValueError("Unknown mode: %s" % mode)

        if text_data.hparams.dataset.pad_to_max_seq_length:
            for x in data_batch['text']:
                self.assertEqual(len(x), max_l)
            for x in data_batch['text_ids']:
                self.assertEqual(len(x), max_l)
def test_iterator_single_dataset(self):
    r"""Tests iterating over a single dataset.
    """
    data = MonoTextData(self._test_hparams)
    data_iterator = DataIterator(data)
    data_iterator.switch_to_dataset(dataset_name="data")
    iterator = data_iterator.get_iterator()
    i = 1001
    for idx, batch in enumerate(iterator):
        self.assertEqual(batch.batch_size,
                         self._test_hparams['batch_size'])
        np.testing.assert_array_equal(batch['length'], [1, 1])
        for example in batch['text']:
            self.assertEqual(example[0], str(i))
            i += 1
    self.assertEqual(i, 2001)
def _run_and_test(self, hparams, proc_shr=False, test_transform=None,
                  discard_src=False):
    # Construct database
    text_data = PairedTextData(hparams)
    self.assertEqual(
        text_data.source_vocab.size,
        self._vocab_size + len(text_data.source_vocab.special_tokens))

    iterator = DataIterator(text_data)
    for data_batch in iterator:
        self.assertEqual(set(data_batch.keys()),
                         set(text_data.list_items()))

        if proc_shr:
            tgt_eos = '<EOS>'
        else:
            tgt_eos = '<TARGET_EOS>'

        # Test matching
        src_text = data_batch['source_text']
        tgt_text = data_batch['target_text']
        if proc_shr:
            for src, tgt in zip(src_text, tgt_text):
                np.testing.assert_array_equal(src[:3], tgt[:3])
        else:
            for src, tgt in zip(src_text, tgt_text):
                np.testing.assert_array_equal(src[:3], tgt[1:4])
        self.assertTrue(tgt_eos in data_batch['target_text'][0])

        if test_transform:
            for i in range(len(data_batch['source_text'])):
                text_ = data_batch['source_text'][i]
                self.assertTrue(text_ in self.src_upper_cased_text)
            for i in range(len(data_batch['target_text'])):
                text_ = data_batch['target_text'][i]
                self.assertTrue(text_ in self.tgt_upper_cased_text)

        if discard_src:
            src_hparams = text_data.hparams.source_dataset
            max_l = src_hparams.max_seq_length
            max_l += sum(int(x is not None and x != '')
                         for x in [text_data._src_bos_token,
                                   text_data._tgt_bos_token])
            for l in data_batch["source_length"]:
                self.assertLessEqual(l, max_l)
def test_auto_storage_moving(self):
    cuda_tensors = set()

    def move_tensor(tensor, device, non_blocking=False):
        if isinstance(device, torch.device) and device.type == "cuda":
            self.assertTrue(non_blocking)
            cuda_tensors.add(id(tensor))
        return tensor

    device = torch.device("cuda:0")
    with patch.object(torch.Tensor, "to", move_tensor):
        train = MonoTextData(self._train_hparams, device=device)
        iterator = DataIterator(train)
        for batch in iterator:
            self.assertTrue(id(batch.text_ids) in cuda_tensors)
            self.assertTrue(id(batch.length) in cuda_tensors)
def setUp(self):
    self.batch_size = 4
    self.num_label = len(pathologies)
    hparams = {
        "datasource": {
            "img_root": "tests/test_iu_xray_data/iu_xray_images",
            "text_root": "tests/test_iu_xray_data/text_root",
            "vocab_path": "tests/test_iu_xray_data/test_vocab.txt",
            "transforms": transforms,
            "pathologies": pathologies,
        },
        "batch_size": self.batch_size,
        "shuffle": False,
    }
    dataset = IU_XRay_Dataset(hparams)
    dataset.to(torch.device('cpu'))
    self.vocab = dataset.source.vocab

    self.ground_truth_keys = [
        'img_tensor', 'label', 'token_tensor', 'stop_prob']
    self.ground_truth_findings = [
        'cardiac and mediastinal contours '
        'are within normal limits <EOS> the '
        'lungs are clear <EOS> bony structures '
        'are intact <EOS>']
    self.ground_truth_token_tensors = torch.Tensor([
        [61, 10, 36, 55, 7, 25, 8, 28, 2, 0, 0, 0],
        [5, 19, 7, 21, 2, 0, 0, 0, 0, 0, 0, 0],
        [58, 52, 7, 73, 2, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]).to(torch.long)

    self.loader = DataIterator(dataset)
def test_iterator_multi_datasets(self):
    r"""Tests iterating over multiple datasets.
    """
    train = MonoTextData(self._train_hparams)
    test = MonoTextData(self._test_hparams)
    train_batch_size = self._train_hparams["batch_size"]
    test_batch_size = self._test_hparams["batch_size"]
    data_iterator = DataIterator({"train": train, "test": test})

    data_iterator.switch_to_dataset(dataset_name="train")
    iterator = data_iterator.get_iterator()
    for idx, val in enumerate(iterator):
        self.assertEqual(len(val), train_batch_size)
        number = idx * train_batch_size + 1
        self.assertEqual(val.text[0], [str(number)])
        # numbers: 1 - 2000, first 4 vocab entries are special tokens
        self.assertEqual(val.text_ids[0], torch.tensor(number + 3))

    data_iterator.switch_to_dataset(dataset_name="test")
    iterator = data_iterator.get_iterator()
    for idx, val in enumerate(iterator):
        self.assertEqual(len(val), test_batch_size)
        number = idx * test_batch_size + 1001
        self.assertEqual(val.text[0], [str(number)])
        self.assertEqual(val.text_ids[0], torch.tensor(number + 3))

    # test `get_iterator` interface
    for idx, val in enumerate(data_iterator.get_iterator('train')):
        self.assertEqual(len(val), train_batch_size)
        number = idx * train_batch_size + 1
        self.assertEqual(val.text[0], [str(number)])
        self.assertEqual(val.text_ids[0], torch.tensor(number + 3))

    # test exception for invalid dataset name
    with self.assertRaises(ValueError) as context:
        data_iterator.switch_to_dataset('val')
    self.assertTrue('not found' in str(context.exception))
def _run_and_test(self, hparams, discard_index=None):
    # Construct database
    text_data = MultiAlignedData(hparams)
    self.assertEqual(
        text_data.vocab(0).size,
        self._vocab_size + len(text_data.vocab(0).special_tokens))

    iterator = DataIterator(text_data)
    for batch in iterator:
        self.assertEqual(set(batch.keys()),
                         set(text_data.list_items()))

        text_0 = batch['0_text']
        text_1 = batch['1_text']
        text_2 = batch['2_text']
        int_3 = batch['label']
        number_1 = batch['4_number1']
        number_2 = batch['4_number2']
        text_4 = batch['4_text']

        for t0, t1, t2, i3, n1, n2, t4 in zip(
                text_0, text_1, text_2, int_3, number_1, number_2, text_4):
            np.testing.assert_array_equal(t0[:2], t1[1:3])
            np.testing.assert_array_equal(t0[:3], t2[1:4])
            if t0[0].startswith('This'):
                self.assertEqual(i3, 0)
            else:
                self.assertEqual(i3, 1)
            self.assertEqual(n1, 128)
            self.assertEqual(n2, 512)
            self.assertTrue(isinstance(n1, torch.Tensor))
            self.assertTrue(isinstance(n2, torch.Tensor))
            self.assertTrue(isinstance(t4, str))

        if discard_index is not None:
            hpms = text_data._hparams.datasets[discard_index]
            max_l = hpms.max_seq_length
            max_l += sum(int(x is not None and x != '')
                         for x in [text_data.vocab(discard_index).bos_token,
                                   text_data.vocab(discard_index).eos_token])
            for i in range(2):
                for length in batch[text_data.length_name(i)]:
                    self.assertLessEqual(length, max_l)

            # TODO(avinash): Add this back once variable utterance is added
            # for lengths in batch[text_data.length_name(2)]:
            #     for length in lengths:
            #         self.assertLessEqual(length, max_l)

        for i, hpms in enumerate(text_data._hparams.datasets):
            if hpms.data_type != "text":
                continue
            max_l = hpms.max_seq_length
            mode = hpms.length_filter_mode
            if max_l is not None and mode == "truncate":
                max_l += sum(int(x is not None and x != '')
                             for x in [text_data.vocab(i).bos_token,
                                       text_data.vocab(i).eos_token])
                for length in batch[text_data.length_name(i)]:
                    self.assertLessEqual(length, max_l)
def _test_modes_with_workers(self, lazy_mode: str, cache_mode: str,
                             num_workers: int,
                             parallelize_processing: bool = True,
                             support_random_access: bool = False,
                             shuffle: bool = False,
                             **kwargs):
    hparams = {
        'batch_size': self.batch_size,
        'lazy_strategy': lazy_mode,
        'cache_strategy': cache_mode,
        'num_parallel_calls': num_workers,
        'shuffle': shuffle,
        'shuffle_buffer_size': self.size // 5,
        'parallelize_processing': parallelize_processing,
        'allow_smaller_final_batch': False,
        **kwargs,
    }
    numbers_data = [[x] * self.seq_len for x in range(self.size)]
    string_data = [' '.join(map(str, range(self.seq_len)))
                   for _ in range(self.size)]
    if not support_random_access:
        source = ZipDataSource(  # type: ignore
            IterDataSource(numbers_data),
            SequenceDataSource(string_data))
    else:
        source = ZipDataSource(SequenceDataSource(numbers_data),
                               SequenceDataSource(string_data))
    data = MockDataBase(source, hparams)  # type: ignore
    iterator = DataIterator(data)

    if data._hparams.allow_smaller_final_batch:
        total_examples = self.size
        total_batches = (self.size + self.batch_size - 1) // self.batch_size
    else:
        total_examples = self.size // self.batch_size * self.batch_size
        total_batches = self.size // self.batch_size

    def check_batch(idx, batch):
        if idx == total_batches - 1:
            batch_size = (total_examples - 1) % self.batch_size + 1
        else:
            batch_size = self.batch_size
        self.assertEqual(batch.numbers.shape,
                         (batch_size, self.seq_len))
        if not shuffle:
            numbers = np.asarray([idx * self.batch_size + x + 1
                                  for x in range(batch_size)])
            self.assertTrue(np.all(batch.numbers == numbers[:, np.newaxis]))

    # check laziness
    if parallelize_processing:
        if lazy_mode == 'none':
            self.assertEqual(len(data._processed_cache), self.size)
        else:
            self.assertEqual(len(data._processed_cache), 0)
            if not support_random_access:
                if lazy_mode == 'process':
                    self.assertEqual(len(data._cached_source._cache),
                                     self.size)
                else:
                    self.assertEqual(len(data._cached_source._cache), 0)

    # first epoch
    cnt = 0
    for idx, batch in enumerate(iterator):
        check_batch(idx, batch)
        cnt += 1
    self.assertEqual(cnt, total_batches)

    # check cache
    if parallelize_processing:
        if cache_mode == 'none':
            self.assertEqual(len(data._processed_cache), 0)
        elif cache_mode == 'loaded':
            self.assertEqual(len(data._processed_cache), 0)
        else:
            self.assertEqual(len(data._processed_cache), self.size)
    if lazy_mode != 'none' and not support_random_access:
        if cache_mode == 'none':
            self.assertEqual(len(data._cached_source._cache), 0)
        elif cache_mode == 'loaded':
            self.assertEqual(len(data._cached_source._cache), self.size)
        else:
            self.assertEqual(len(data._cached_source._cache), 0)

    # second epoch
    cnt = 0
    for idx, batch in enumerate(iterator):
        check_batch(idx, batch)
        cnt += 1
    self.assertEqual(cnt, total_batches)

    # check again
    if parallelize_processing:
        if cache_mode == 'none':
            self.assertEqual(len(data._processed_cache), 0)
        elif cache_mode == 'loaded':
            self.assertEqual(len(data._processed_cache), 0)
        else:
            self.assertEqual(len(data._processed_cache), self.size)
    if lazy_mode != 'none' and not support_random_access:
        if cache_mode == 'none':
            self.assertEqual(len(data._cached_source._cache), 0)
        elif cache_mode == 'loaded':
            self.assertEqual(len(data._cached_source._cache), self.size)
        else:
            self.assertEqual(len(data._cached_source._cache), 0)
    len(examples),
    img_tensor=img_tensor,
    label=label,
    token_tensor=token_tensor,
    stop_prob=stop_prob,
)

@staticmethod
def default_hparams():
    r"""Returns a dictionary of hyperparameters with default values.

    Returns:
        (Dict) default hyperparameters
    """
    hparams = DatasetBase.default_hparams()
    hparams.update({
        "datasource": None,
    })
    return hparams


if __name__ == "__main__":
    dataset_hparams = config.dataset
    dataset = IU_XRay_Dataset(dataset_hparams["train"])

    # Dataloader
    dataset.to(torch.device('cpu'))
    train_loader = DataIterator(dataset)

    for batch in train_loader:
        print(batch)
        break