def balance_class_count_hdf5(fpath, keys, key_label='label', other_clname=CLNAME_OTHER):
    """Resample keys in an HDF5 file to generate a near-balanced dataset.

    Returns a dictionary with resampled features and ground truth, plus the
    indices into the original label array that were sampled.
    Not suitable for very large datasets (each dataset is loaded fully into memory).

    fpath -- path to HDF5 file
    keys -- keys to resample (e.g. features)

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    # Context manager fixes the original's file-handle leak (h_src was never closed).
    with h5py.File(fpath, 'r') as h_src:
        labls = h_src[key_label][:]
        bal = Balancer(np.squeeze(labls))
        class_count = bal.get_class_count(other_clname=other_clname)
        # list() so a Python 3 dict view is materialized before numpy sees it.
        idxs = bal.get_idxs_to_balance_class_count(list(class_count.values()))
        # Shuffle along the first axis, in-place, so rows are not grouped by class.
        np.random.shuffle(idxs)
        dict_balanced = {key_label: labls[idxs]}
        for k in keys:
            # Read the full dataset, then fancy-index with the balanced indices.
            dict_balanced[k] = h_src[k][:][idxs]
    return dict_balanced, idxs
def test_get_idxs_to_balance_class_count_other_highest(self):
    """Balance indices when the catch-all 'other' class has the highest count."""
    # Rewrite the fixture so class 0 has 10 rows, class 1 has 20, 'other' 70.
    self.l[10:60, 1] = 0
    self.l[10:30, 1] = 1
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 20)
    assert_equals(counts[CLNAME_OTHER], 70)
    # Fixes: list() so np.max works on Python 3 dict views, and the original
    # '%s' placeholder was never interpolated -- fill it with the class name.
    assert_equals(counts[CLNAME_OTHER], np.max(list(counts.values())),
                  "this test requires class count for %s to be highest!" % CLNAME_OTHER)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    # Each minority class is expected to be oversampled up to ~the majority count (70).
    assert_almost_equal(np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
                        10 + (70 - 10), tolerance_order)
    assert_almost_equal(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 30)),
                        20 + (70 - 20), tolerance_order)
    assert_equals(np.count_nonzero(idxs >= 30), 70, tolerance_order)
def balance_class_count_hdf5(fpath, keys, key_label='label', other_clname=CLNAME_OTHER):
    """Resample keys in an HDF5 file to generate a near-balanced dataset.

    Returns a dictionary with resampled features and ground truth, plus the
    indices into the original label array that were sampled.
    Not suitable for very large datasets (each dataset is loaded fully into memory).

    fpath -- path to HDF5 file
    keys -- keys to resample (e.g. features)

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    # Context manager fixes the original's file-handle leak (h_src was never closed).
    with h5py.File(fpath, 'r') as h_src:
        labls = h_src[key_label][:]
        bal = Balancer(np.squeeze(labls))
        class_count = bal.get_class_count(other_clname=other_clname)
        # list() so a Python 3 dict view is materialized before numpy sees it.
        idxs = bal.get_idxs_to_balance_class_count(list(class_count.values()))
        # Shuffle along the first axis, in-place, so rows are not grouped by class.
        np.random.shuffle(idxs)
        dict_balanced = {key_label: labls[idxs]}
        for k in keys:
            # Read the full dataset, then fancy-index with the balanced indices.
            dict_balanced[k] = h_src[k][:][idxs]
    return dict_balanced, idxs
def get_class_count_hdf5(fpath, key_label='label', other_clname=CLNAME_OTHER):
    """Count per-class instances in an HDF5 file.

    Returns a dictionary of class ids and per-class counts.

    fpath -- path to HDF5 file

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    # Context manager fixes the original's file-handle leak (h was never closed).
    with h5py.File(fpath, 'r') as h:
        # Materialize labels before the file closes; h[key_label] reads lazily.
        labls = h[key_label][:]
    b = Balancer(np.squeeze(labls))
    return b.get_class_count(other_clname=other_clname)
def save_balanced_sampled_class_count_hdf5(fpath, keys, fpath_dst, key_label='label', other_clname=CLNAME_OTHER, chunks=None, target_count=None):
    """Resample keys in an HDF5 to a near-balanced dataset and save to a new HDF5.

    Returns the indices from the original label array that were sampled.
    Not suitable for very large datasets.
    Classes with count < target_count will be sub-sampled without replacement.
    Classes with count > target_count will get over-sampled.
    Classes with count equal to target_count will be copied.

    fpath -- path to source HDF5 file
    keys -- keys to resample (e.g. features)
    fpath_dst -- path to destination HDF5 file

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    chunks -- forward chunks parameter to use during hdf5 writing
    target_count -- per-class count to target when sampling
    """
    # Guard against clobbering the source while it is still being read.
    if os.path.abspath(fpath) == os.path.abspath(fpath_dst):
        raise IOError("Cannot read and write to the same file (%s) (%s)" % (fpath, fpath_dst))
    with h5py.File(fpath, 'r') as h_src:
        labels = h_src[key_label][:]
        balancer = Balancer(np.squeeze(labels))
        counts = balancer.get_class_count(other_clname=other_clname)
        sampled = balancer.get_idxs_to_balance_class_count(counts.values(), target_count)
        # In-place shuffle along the first axis so output rows are not class-ordered.
        np.random.shuffle(sampled)
        with h5py.File(fpath_dst, 'w') as h_dst:
            h_dst[key_label] = labels[sampled]
            for key in keys:
                src_dset = h_src[key]
                # Same trailing shape as the source, resized to the sampled row count.
                dst_shape = (len(sampled),) + tuple(src_dset.shape[1:])
                dst_dset = h_dst.create_dataset(key, dst_shape, src_dset.dtype, chunks=chunks)
                # Copy row by row to avoid loading the whole source dataset at once.
                for row, src_idx in enumerate(sampled):
                    dst_dset[row] = src_dset[src_idx]
    return sampled
def get_class_count_hdf5(fpath, key_label='label', other_clname=CLNAME_OTHER):
    """Count per-class instances in an HDF5 file.

    Returns a dictionary of class ids and per-class counts.

    fpath -- path to HDF5 file

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    # Context manager fixes the original's file-handle leak (h was never closed).
    with h5py.File(fpath, 'r') as h:
        # Materialize labels before the file closes; h[key_label] reads lazily.
        labls = h[key_label][:]
    b = Balancer(np.squeeze(labls))
    return b.get_class_count(other_clname=other_clname)
def save_balanced_sampled_class_count_hdf5(fpath, keys, fpath_dst, key_label='label', other_clname=CLNAME_OTHER, chunks=None, target_count=None):
    """Resample keys in an HDF5 to a near-balanced dataset and save to a new HDF5.

    Returns the indices from the original label array that were sampled.
    Not suitable for very large datasets.
    Classes with count < target_count will be sub-sampled without replacement.
    Classes with count > target_count will get over-sampled.
    Classes with count equal to target_count will be copied.

    fpath -- path to source HDF5 file
    keys -- keys to resample (e.g. features)
    fpath_dst -- path to destination HDF5 file

    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    chunks -- forward chunks parameter to use during hdf5 writing
    target_count -- per-class count to target when sampling
    """
    # Refuse to overwrite the file that is about to be read.
    if os.path.abspath(fpath) == os.path.abspath(fpath_dst):
        raise IOError("Cannot read and write to the same file (%s) (%s)" % (fpath, fpath_dst))
    with h5py.File(fpath, 'r') as h_src:
        labels = h_src[key_label][:]
        balancer = Balancer(np.squeeze(labels))
        counts = balancer.get_class_count(other_clname=other_clname)
        chosen = balancer.get_idxs_to_balance_class_count(counts.values(), target_count)
        # In-place shuffle along the first axis so output rows are not class-ordered.
        np.random.shuffle(chosen)
        with h5py.File(fpath_dst, 'w') as h_dst:
            h_dst[key_label] = labels[chosen]
            for key in keys:
                src_dset = h_src[key]
                # Source shape with the leading dimension replaced by the sample count.
                dst_shape = (len(chosen),) + tuple(src_dset.shape[1:])
                dst_dset = h_dst.create_dataset(key, dst_shape, src_dset.dtype, chunks=chunks)
                # Per-row copy keeps peak memory low for large feature datasets.
                for dst_row, src_row in enumerate(chosen):
                    dst_dset[dst_row] = src_dset[src_row]
    return chosen
def test_get_idxs_to_balance_class_count_other_not_highest(self):
    """Balance indices when a labelled class (not 'other') dominates."""
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    # Fixture layout: rows 0-9 -> class 0, 10-59 -> class 1, 60+ -> 'other'.
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[CLNAME_OTHER], 40)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    in_class0 = np.logical_and(idxs >= 0, idxs < 10)
    in_class1 = np.logical_and(idxs >= 10, idxs < 60)
    # Minority classes get oversampled toward the majority count (50).
    assert_almost_equal(np.count_nonzero(in_class0), 10 + (50 - 10), tolerance_order)
    assert_equals(np.count_nonzero(in_class1), 50, 1)
    assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40), tolerance_order)
def test_get_idxs_to_balance_class_count_other_not_highest(self):
    """Sampling each class to an explicit per-class target count."""
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    # Fixture layout: rows 0-9 -> class 0, 10-59 -> class 1, 60+ -> 'other'.
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[CLNAME_OTHER], 40)
    # Only the oversampling target is exercised; targets 10 and 20 were
    # disabled in the original (left as commented-out candidates).
    for target_count in [500]:
        idxs = bal.sample_idxs_to_target_count(counts.values(), target_count)
        # Every class, including 'other', contributes exactly target_count rows.
        assert_equals(idxs.size, (self.num_classes + 1) * target_count)
        assert_equals(np.count_nonzero(idxs < 10), target_count)
        in_class1 = np.logical_and(idxs >= 10, idxs < 60)
        assert_equals(np.count_nonzero(in_class1), target_count)
        assert_equals(np.count_nonzero(idxs >= 60), target_count)
def test_get_idxs_to_balance_class_count_other_not_highest(self):
    """Sampling each class to an explicit per-class target count."""
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    # Fixture layout: rows 0-9 -> class 0, 10-59 -> class 1, 60+ -> 'other'.
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[CLNAME_OTHER], 40)
    # Only the oversampling target is exercised; targets 10 and 20 were
    # disabled in the original (left as commented-out candidates).
    for target_count in [500]:
        idxs = bal.sample_idxs_to_target_count(counts.values(), target_count)
        # Every class, including 'other', contributes exactly target_count rows.
        assert_equals(idxs.size, (self.num_classes + 1) * target_count)
        assert_equals(np.count_nonzero(idxs < 10), target_count)
        in_class1 = np.logical_and(idxs >= 10, idxs < 60)
        assert_equals(np.count_nonzero(in_class1), target_count)
        assert_equals(np.count_nonzero(idxs >= 60), target_count)
def test_get_idxs_to_balance_class_count_other_not_highest(self):
    """Balance indices when a labelled class (not 'other') dominates."""
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    # Fixture layout: rows 0-9 -> class 0, 10-59 -> class 1, 60+ -> 'other'.
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[CLNAME_OTHER], 40)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    in_class0 = np.logical_and(idxs >= 0, idxs < 10)
    in_class1 = np.logical_and(idxs >= 10, idxs < 60)
    # Minority classes get oversampled toward the majority count (50).
    assert_almost_equal(np.count_nonzero(in_class0), 10 + (50 - 10), tolerance_order)
    assert_equals(np.count_nonzero(in_class1), 50, 1)
    assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40), tolerance_order)
def test_get_idxs_to_balance_class_count_no_other(self):
    """Balancing when no catch-all 'other' class is requested."""
    # Append a third label column and mark rows 60+ as members of class 2.
    extra_col = np.zeros((len(self.l), 1))
    labls = np.hstack((self.l, extra_col))
    labls[60:, -1] = 1
    bal = Balancer(labls)
    counts = bal.get_class_count(other_clname=None)
    assert_not_in(CLNAME_OTHER, counts.keys())
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[2], 40)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    in_class0 = np.logical_and(idxs >= 0, idxs < 10)
    in_class1 = np.logical_and(idxs >= 10, idxs < 60)
    # Minority classes get oversampled toward the majority count (50).
    assert_almost_equal(np.count_nonzero(in_class0), 10 + (50 - 10), tolerance_order)
    assert_equals(np.count_nonzero(in_class1), 50, 1)
    assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40), tolerance_order)
def test_get_idxs_to_balance_class_count_other_highest(self):
    """Balance indices when the catch-all 'other' class has the highest count."""
    # Rewrite the fixture so class 0 has 10 rows, class 1 has 20, 'other' 70.
    self.l[10:60, 1] = 0
    self.l[10:30, 1] = 1
    bal = Balancer(np.copy(self.l))
    counts = bal.get_class_count(other_clname=CLNAME_OTHER)
    assert_in(CLNAME_OTHER, counts.keys())
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 20)
    assert_equals(counts[CLNAME_OTHER], 70)
    # Fixes: list() so np.max works on Python 3 dict views, and the original
    # '%s' placeholder was never interpolated -- fill it with the class name.
    assert_equals(counts[CLNAME_OTHER], np.max(list(counts.values())),
                  "this test requires class count for %s to be highest!" % CLNAME_OTHER)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    # Each minority class is expected to be oversampled up to ~the majority count (70).
    assert_almost_equal(np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
                        10 + (70 - 10), tolerance_order)
    assert_almost_equal(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 30)),
                        20 + (70 - 20), tolerance_order)
    assert_equals(np.count_nonzero(idxs >= 30), 70, tolerance_order)
def test_get_idxs_to_balance_class_count_no_other(self):
    """Balancing when no catch-all 'other' class is requested."""
    # Append a third label column and mark rows 60+ as members of class 2.
    extra_col = np.zeros((len(self.l), 1))
    labls = np.hstack((self.l, extra_col))
    labls[60:, -1] = 1
    bal = Balancer(labls)
    counts = bal.get_class_count(other_clname=None)
    assert_not_in(CLNAME_OTHER, counts.keys())
    assert_equals(counts[0], 10)
    assert_equals(counts[1], 50)
    assert_equals(counts[2], 40)
    tolerance_order = 1
    idxs = bal.get_idxs_to_balance_class_count(counts.values())
    in_class0 = np.logical_and(idxs >= 0, idxs < 10)
    in_class1 = np.logical_and(idxs >= 10, idxs < 60)
    # Minority classes get oversampled toward the majority count (50).
    assert_almost_equal(np.count_nonzero(in_class0), 10 + (50 - 10), tolerance_order)
    assert_equals(np.count_nonzero(in_class1), 50, 1)
    assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40), tolerance_order)