def test_merge_with_not_resizable(self):
    """Merging into a dataset flagged non-resizable must raise."""
    target_dset = ds.NumpyDataset(self.name, self.packet_shape)
    source_dset = ds.NumpyDataset(self.name, self.packet_shape)
    target_dset.resizable = False
    source_dset.add_data_item(self.items['raw'][0], self.mock_targets[0],
                              self.mock_meta[0])
    self.assertRaises(Exception, target_dset.merge_with, source_dset)
def test_merge_with_incompatible_dataset(self):
    """Merging datasets with mismatched packet shapes must raise ValueError."""
    mismatched_shape = (self.n_f + 1, self.f_h, self.f_h)
    target_dset = ds.NumpyDataset(self.name, mismatched_shape)
    source_dset = ds.NumpyDataset(self.name, self.packet_shape)
    source_dset.add_data_item(self.items['raw'][0], self.mock_targets[0],
                              self.mock_meta[0])
    self.assertRaises(ValueError, target_dset.merge_with, source_dset)
def test_is_compatible_with_bad_item_types(self):
    """Datasets whose item type flags differ are not compatible."""
    flipped_types = self.item_types.copy()
    flipped_types['raw'] = not flipped_types['raw']
    first = ds.NumpyDataset(self.name, self.packet_shape,
                            item_types=flipped_types)
    second = ds.NumpyDataset(self.name, self.packet_shape,
                             item_types=self.item_types)
    self.assertFalse(first.is_compatible_with(second))
def test_merge_with_new_metafields(self):
    """Merging picks up metadata fields unseen in the target dataset."""
    dset_a = ds.NumpyDataset(self.name, self.packet_shape)
    dset_b = ds.NumpyDataset(self.name, self.packet_shape)
    packet = self.items['raw'][0]
    extra_meta = self.mock_meta[0].copy()
    extra_meta['test'] = 'value'
    expected_fields = self.metafields.union(extra_meta.keys())
    dset_a.add_data_item(packet, self.mock_targets[0], self.mock_meta[0])
    dset_b.add_data_item(packet, self.mock_targets[0], extra_meta)
    dset_a.merge_with(dset_b)
    self.assertSetEqual(dset_a.metadata_fields, expected_fields)
def load_dataset(self, name, item_types=None): """ Load a dataset from secondary storage. This function assumes that the relevant dataset files are located in the same directory (loaddir). Parameters ---------- :param name: the dataset name. :type name: str :param item_types: (optional) types of dataset items to load. :type item_types: typing.Mapping[str, bool] """ # TODO: Think of a way to load dataset with items that does not depend # on knowledge of NumpyDataset internals # currently excluded from unit tests for that very reason self._check_before_read() config = self.load_dataset_config(name) itypes = item_types or config['item_types'] dataset = ds.NumpyDataset(name, config['packet_shape'], item_types=itypes) data = self._data_handler.load_data(name, dataset.item_types) targets = self._target_handler.load_targets(name) metadata = self._meta_handler.load_metadata(name) dataset._data.extend(data) dataset._targ.extend({'classification': targets}) dataset._meta.extend(metadata) dataset._num_data = config['num_data'] return dataset
def get_output_dataset_and_handler(output_packet_shape, **dataset_args):
    """Build an empty output dataset plus a persistency handler for it.

    dataset_args must supply 'name', 'item_types', 'dtype' and 'outdir'.
    Returns the (dataset, handler) pair.
    """
    out_dataset = ds.NumpyDataset(
        dataset_args['name'],
        output_packet_shape,
        item_types=dataset_args['item_types'],
        dtype=dataset_args['dtype'],
    )
    handler = fs_io.DatasetFsPersistencyHandler(
        save_dir=dataset_args['outdir'])
    return out_dataset, handler
def test_add_item_wrong_packet_shape(self):
    """A packet whose shape disagrees with the dataset's is rejected."""
    dset = ds.NumpyDataset(self.name, self.packet_shape,
                           item_types=self.item_types)
    bad_packet = np.ones((1, *self.packet_shape))
    target, meta = self.mock_targets[0], self.mock_meta[0]
    self.assertRaises(ValueError, dset.add_data_item, bad_packet,
                      target, meta)
def test_get_targets(self):
    """get_targets returns the targets of every added item."""
    packet = self.items['raw'][0]
    target, meta = self.mock_targets[0], self.mock_meta[0]
    dset = ds.NumpyDataset(self.name, self.packet_shape)
    dset.add_data_item(packet, target, meta)
    self._assertDatasetTargets(dset.get_targets(), [self.mock_targets[0]])
def test_add_item_non_resizable_dataset(self):
    """Adding an item to a non-resizable dataset must raise."""
    dset = ds.NumpyDataset(self.name, self.packet_shape,
                           item_types=self.item_types)
    dset.resizable = False
    target, meta = self.mock_targets[0], self.mock_meta[0]
    self.assertRaises(Exception, dset.add_data_item,
                      self.items['raw'][0], target, meta)
def test_add_item(self):
    """Adding an item increments num_data by exactly one."""
    dset = ds.NumpyDataset(self.name, self.packet_shape,
                           item_types=self.item_types)
    packet = self.items['raw'][0]
    # (removed an unused `exp_data` local that was built but never asserted)
    num_data = dset.num_data
    dset.add_data_item(packet, self.mock_targets[0], self.mock_meta[0])
    self.assertEqual(dset.num_data, num_data + 1)
def test_merge_with_only_subset_of_items(self):
    """Merging with a slice copies only the sliced items' metadata fields.

    (Removed a duplicate `exp_metafields` assignment — the identical value
    was computed twice, once before the merge and once after.)
    """
    dset1 = ds.NumpyDataset(self.name, self.packet_shape)
    dset2 = ds.NumpyDataset(self.name, self.packet_shape)
    packet = self.items['raw'][0]
    meta2 = self.mock_meta[0].copy()
    meta2['test'] = 'value'
    meta3 = self.mock_meta[0].copy()
    meta3['test2'] = 'value'
    dset1.add_data_item(packet, self.mock_targets[0], self.mock_meta[0])
    dset2.add_data_item(packet, self.mock_targets[0], meta2)
    dset2.add_data_item(packet, self.mock_targets[0], meta2)
    dset2.add_data_item(packet, self.mock_targets[0], meta3)
    # add items 0 and 1 from dset2 to dset1
    dset1.merge_with(dset2, slice(2))
    # metadata fields from item 2 of dset2 should not be added
    exp_metafields = self.metafields.union(meta2.keys())
    self.assertEqual(dset1.num_data, 3)
    self.assertSetEqual(dset1.metadata_fields, exp_metafields)
def test_implicit_dtype_conversion_when_adding_items(self):
    """Items added to a float16 dataset are stored as float16."""
    dset = ds.NumpyDataset(self.name, self.packet_shape, dtype='float16',
                           item_types=self.item_types)
    dset.add_data_item(self.items['raw'][0], self.mock_targets[0],
                       self.mock_meta[0])
    stored = dset.get_data_as_dict()
    for itype, present in dset.item_types.items():
        if present:
            self.assertEqual(stored[itype][0].dtype.name, 'float16')
def test_get_metadata(self):
    """get_metadata returns the metadata of every added item.

    Fixes the failure message, which previously formatted `meta` (the
    single input dict) as the "actual" value instead of the `metadata`
    list actually being asserted on, and had stray colons.
    """
    packet, target = self.items['raw'][0], self.mock_targets[0]
    meta = self.mock_meta[0]
    exp_metadata = [self.mock_meta[0]]
    dset = ds.NumpyDataset(self.name, self.packet_shape)
    dset.add_data_item(packet, target, meta)
    metadata = dset.get_metadata()
    msg = "Metadata not equal: expected: {}, actual: {}".format(
        exp_metadata, metadata)
    self.assertListEqual(metadata, exp_metadata, msg)
def test_get_data_as_dict(self):
    """Only item types enabled in item_types appear in the returned dict."""
    item_types = {'raw': True, 'yx': True, 'gtux': False, 'gtuy': False}
    expected = {itype: [self.items[itype][0]]
                for itype, enabled in item_types.items() if enabled}
    dset = ds.NumpyDataset(self.name, self.packet_shape,
                           item_types=item_types)
    dset.add_data_item(self.items['raw'][0], self.mock_targets[0],
                       self.mock_meta[0])
    self._assertDatasetData(dset.get_data_as_dict(), expected,
                            expected.keys())
def test_merge_with(self):
    """Merging appends the other dataset's data, targets and metadata."""
    dset1 = ds.NumpyDataset(self.name, self.packet_shape,
                            item_types=self.item_types)
    dset2 = ds.NumpyDataset(self.name, self.packet_shape,
                            item_types=self.item_types)
    packet = self.items['raw'][0]
    target, meta = self.mock_targets[0], self.mock_meta[0]
    dset1.add_data_item(packet, target, meta)
    dset2.add_data_item(packet, target, meta)
    num_data = dset1.num_data
    # after the merge each dataset item appears twice, once per source
    exp_data = {itype: [self.items[itype][0], self.items[itype][0]]
                for itype in ('raw', 'yx', 'gtux', 'gtuy')}
    exp_targets = [target, target]
    exp_metadata = [meta, meta]
    dset1.merge_with(dset2)
    self._assertDatasetItems(dset1, exp_data, exp_targets, exp_metadata,
                             self.metafields, num_data + dset2.num_data,
                             cons.ALL_ITEM_TYPES)
def test_get_data_as_arraylike(self):
    """Returned tuple holds the enabled item types in ('raw', 'yx') order."""
    item_types = {'raw': True, 'yx': True, 'gtux': False, 'gtuy': False}
    dset = ds.NumpyDataset(self.name, self.packet_shape,
                           item_types=item_types)
    dset.add_data_item(self.items['raw'][0], self.mock_targets[0],
                       self.mock_meta[0])
    items = dset.get_data_as_arraylike()
    expected = ([self.items['raw'][0]], [self.items['yx'][0]])
    for idx, key in enumerate(('raw', 'yx')):
        nptest.assert_array_equal(
            items[idx], expected[idx],
            "items of type '{}' are not equal".format(key))
def main(**settings):
    """Copy a (slice of a) stored dataset into a new dataset and save it.

    Expected keys in settings: 'srcdir', 'outdir', 'name', 'outname' and
    'items_slice'. outname defaults to name; outdir defaults to srcdir.

    Fixes a NameError: the original read 'name', 'outname' and
    'items_slice' from an undefined `args` variable instead of the
    `settings` mapping the function actually receives.
    """
    srcdir, outdir = settings['srcdir'], settings['outdir']
    name, outname = settings['name'], settings['outname']
    if outname is None:
        outname = name
    if outdir is None:
        outdir = srcdir
    io_handler = io_utils.DatasetFsPersistencyHandler(load_dir=srcdir,
                                                      save_dir=outdir)
    items_slice = settings['items_slice']
    old_dataset = io_handler.load_dataset(name)
    # mirror the source dataset's configuration in the output dataset
    new_dataset = ds.NumpyDataset(outname,
                                  old_dataset.accepted_packet_shape,
                                  item_types=old_dataset.item_types,
                                  dtype=old_dataset.dtype)
    new_dataset.merge_with(old_dataset, items_slice)
    io_handler.save_dataset(new_dataset)
def test_add_metafield(self):
    """add_metafield fills the new field with its default on all items."""
    dset = ds.NumpyDataset(self.name, self.packet_shape)
    packet = self.items['raw'][0]
    # copies of the fixture metadata become the dicts actually stored,
    # so the shared fixtures are never mutated
    stored_meta = self.mock_meta.copy()
    stored_meta[0] = stored_meta[0].copy()
    stored_meta[1] = stored_meta[1].copy()
    dset.add_data_item(packet, self.mock_targets[0], stored_meta[0])
    dset.add_data_item(packet, self.mock_targets[1], stored_meta[1])
    # expected metadata: fresh copies (distinct from the stored dicts,
    # which add_metafield mutates) with the new field set to the default
    exp_meta = stored_meta.copy()
    exp_meta[0] = exp_meta[0].copy()
    exp_meta[1] = exp_meta[1].copy()
    exp_meta[0]['random_metafield'] = 'default'
    exp_meta[1]['random_metafield'] = 'default'
    exp_metafields = self.metafields.union(['random_metafield'])
    dset.add_metafield('random_metafield', default_value='default')
    self.assertListEqual(dset.get_metadata(), exp_meta)
    self.assertSetEqual(dset.metadata_fields, exp_metafields)
def setUpClass(cls, num_items=2, name='test', item_types=None):
    """Build and cache a dataset pre-filled with mock items on the class."""
    items_mixin = DatasetItemsMixin()
    items_mixin.setUpClass(num_items=num_items)
    targets_mixin = DatasetTargetsMixin()
    targets_mixin.setUpClass(num_items=num_items)
    meta_mixin = DatasetMetadataMixin()
    meta_mixin.setUpClass(num_items=num_items)
    if item_types is None:
        item_types = items_mixin.item_types
    dset = ds.NumpyDataset(name, items_mixin.packet_shape,
                           item_types=item_types)
    # populate the dataset with the first num_items mock fixtures
    for idx in range(num_items):
        dset.add_data_item(items_mixin.items['raw'][idx],
                           targets_mixin.mock_targets[idx],
                           meta_mixin.mock_meta[idx])
    cls.dset = dset
def load_empty_dataset(self, name, item_types=None):
    """ Create a dataset from configuration stored in secondary storage
    without loading any of its actual contents (data, targets, metadata).

    Parameters
    ----------
    :param name:   the dataset name/config filename prefix.
    :type name:    str
    :param item_types:  (optional) types of dataset items to load.
    :type item_types:   typing.Mapping[str, bool]
    """
    config = self.load_dataset_config(name)
    # caller-supplied item_types takes precedence over the stored config
    selected_types = item_types or config['item_types']
    return ds.NumpyDataset(name, config['packet_shape'],
                           item_types=selected_types,
                           dtype=config['dtype'])
def test_is_compatible_with(self):
    """Two datasets built with identical packet shapes are compatible."""
    first = ds.NumpyDataset(self.name, self.packet_shape)
    second = ds.NumpyDataset(self.name, self.packet_shape)
    self.assertTrue(first.is_compatible_with(second))
def create_dataset(self, name, num_data, item_types, dtype='uint8'):
    """ Generate and return a numpy dataset containing simulated showers
    and corresponding targets for them, for use in training neural
    networks for classification tasks.

    The data returned is divided into equal-sized quarters as follows:

    1/4: shower data (possibly with malfunctioned EC units)
    2/4: shower data (without malfunctioned EC units)
    3/4: noise data (possibly with malfunctioned EC units)
    4/4: noise data (without malfunctioned EC units)

    Whether there are any data items with malfunctioning ECs depends on
    the property bad_ECs_range.

    Parameters
    ----------
    num_data : int
        The number of data items to create in total.
    item_types : dict of str to bool
        The requested item types, where the keys are from the
        utils.dataset_utils.item_types module-level constant.

    Returns
    -------
    dataset : utils.dataset_utils.NumpyDataset
        A numpy dataset with capacity and num_items both equal to
        num_data.
    """
    # create output data holders as needed
    template_shape = self._bg_template.packet_template.packet_shape
    dataset = ds.NumpyDataset(name, template_shape,
                              item_types=item_types, dtype=dtype)
    # output and target generation
    ec_gen = self._bg_template.get_new_bad_ECs
    num_showers = int(num_data / 2)
    shower_creator = self.create_shower_packet
    noise_creator = self.create_noise_packet
    shower_target = cons.CLASSIFICATION_TARGETS['shower']
    noise_target = cons.CLASSIFICATION_TARGETS['noise']
    # one handler per quarter of the output; 'start'/'stop' are item
    # index ranges, 'packet_handler' builds the (packet, meta) pair.
    # NOTE(review): the noise handlers accept the angle argument but
    # ignore it — only shower packets use the angle.
    iteration_handlers = ({
        'target': shower_target,
        'start': 0,
        'stop': int(num_showers / 2),
        'packet_handler': lambda angle: shower_creator(angle, ec_gen())
    }, {
        'target': shower_target,
        'start': int(num_showers / 2),
        'stop': num_showers,
        'packet_handler': lambda angle: shower_creator(angle)
    }, {
        'target': noise_target,
        'start': num_showers,
        'stop': num_data - int(num_showers / 2),
        'packet_handler': lambda angle: noise_creator(ec_gen())
    }, {
        'target': noise_target,
        'start': num_data - int(num_showers / 2),
        'stop': num_data,
        'packet_handler': lambda angle: noise_creator()
    })
    # main loop
    for handler in iteration_handlers:
        start, stop = handler['start'], handler['stop']
        packet_handler = handler['packet_handler']
        target = handler['target']
        # idx serves as both an index into targets and data, as well as
        # shower angle in xy projection
        for idx in range(start, stop):
            packet, meta = packet_handler(idx)
            dataset.add_data_item(packet, target, meta)
    return dataset
def test_is_compatible_with_bad_packet_shape(self):
    """Datasets whose packet shapes differ are not compatible."""
    mismatched_shape = (self.n_f + 1, self.f_h, self.f_h)
    first = ds.NumpyDataset(self.name, mismatched_shape)
    second = ds.NumpyDataset(self.name, self.packet_shape)
    self.assertFalse(first.is_compatible_with(second))