def get_nsamples_per_attr(dataset, attr):
    """Returns the number of samples per unique value of a sample attribute.

    Parameters
    ----------
    dataset : Dataset
      The dataset to query.
    attr : str
      Name of the sample attribute.

    Returns
    -------
    dict with the number of samples (value) per unique attribute value (key).
    """
    return get_nelements_per_value(dataset.sa[attr])
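
# For orientation, a plain-numpy sketch of the counting that
# get_nelements_per_value is assumed to perform (an illustrative stand-in,
# not the PyMVPA implementation; count_per_value is a hypothetical name):
import numpy as np


def count_per_value(values):
    # map each unique value to its number of occurrences
    uniques, counts = np.unique(np.asarray(values), return_counts=True)
    return dict(zip(uniques, counts))

# count_per_value([1, 1, 2, 3, 3, 3]) -> {1: 2, 2: 1, 3: 3}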
def _call(self, ds):
    # local binding
    amount = self._amount
    attr, collection = ds.get_attr(self._attr)

    # get filter if not set already (maybe from generate())
    if self._limit_filter is None:
        limit_filter = get_limit_filter(self._limit, collection)
    else:
        limit_filter = self._limit_filter

    # ids of elements that are part of the balanced set
    balanced_set = []
    full_limit_set = []
    # for each chunk in the filter (might be just the selected ones)
    for limit_value in np.unique(limit_filter):
        if limit_filter.dtype == np.bool:
            # simple boolean filter -> do nothing on False
            if not limit_value:
                continue
            # otherwise get indices of "selected ones"
            limit_idx = limit_filter.nonzero()[0]
        else:
            # non-boolean limiter -> determine "chunk" and balance within
            limit_idx = (limit_filter == limit_value).nonzero()[0]
        full_limit_set += list(limit_idx)

        # apply the current limit to the target attribute
        # need list to index properly
        attr_limited = attr[list(limit_idx)]
        uattr_limited = np.unique(attr_limited)

        # handle all types of supported arguments
        if amount == 'equal':
            # go for the maximum possible number of samples provided
            # by each label in this dataset, i.e. the minimum number
            # of samples per class
            epa = get_nelements_per_value(attr_limited)
            min_epa = min(epa.values())
            for k in epa:
                epa[k] = min_epa
        elif isinstance(amount, float):
            epa = get_nelements_per_value(attr_limited)
            for k in epa:
                epa[k] = int(round(epa[k] * amount))
        elif isinstance(amount, int):
            epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
        else:
            raise ValueError("Unknown type of amount argument '%s'" % amount)

        # select the determined number of elements per unique attribute value
        selected = []
        for ua in uattr_limited:
            selected += random.sample(
                list((attr_limited == ua).nonzero()[0]), epa[ua])

        # determine the final indices of selected elements and store
        # as part of the balanced set
        balanced_set += list(limit_idx[selected])

    # make a full-sized boolean selection attribute and put it into
    # the right collection of the output dataset
    if self._include_offlimit:
        # start with all-in
        battr = np.ones(len(attr), dtype=np.bool)
        # throw out all samples that could have been limited
        battr[full_limit_set] = False
        # put back the ones that got into the balanced set
        battr[balanced_set] = True
    else:
        # start with nothing
        battr = np.zeros(len(attr), dtype=np.bool)
        # only keep the balanced set
        battr[balanced_set] = True

    if self._apply_selection:
        if collection is ds.sa:
            return ds[battr]
        elif collection is ds.fa:
            return ds[:, battr]
        else:
            # paranoid
            raise RuntimeError("Don't know where this collection comes from. "
                               "This should never happen!")
    else:
        # shallow copy of the dataset for output
        out = ds.copy(deep=False)
        if collection is ds.sa:
            out.sa[self.get_space()] = battr
        elif collection is ds.fa:
            out.fa[self.get_space()] = battr
        else:
            # paranoid
            raise RuntimeError("Don't know where this collection comes from. "
                               "This should never happen!")
        return out
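
# The per-value selection core of _call() above, distilled into a standalone
# sketch (balance_indices is a hypothetical helper, not PyMVPA API; plain
# numpy/random only, and amounts > 1.0 are not guarded against):
import random
import numpy as np


def balance_indices(attr, amount='equal'):
    # return indices that keep `amount` elements per unique value of `attr`
    attr = np.asarray(attr)
    uniques, counts = np.unique(attr, return_counts=True)
    counts = dict(zip(uniques, counts))
    if amount == 'equal':
        # keep as many per value as the rarest value provides
        n_per_value = dict((k, min(counts.values())) for k in counts)
    elif isinstance(amount, float):
        # keep a (rounded) fraction of each value's elements
        n_per_value = dict((k, int(round(n * amount)))
                           for k, n in counts.items())
    else:
        # keep a fixed number per value
        n_per_value = dict((k, amount) for k in counts)
    selected = []
    for value, n in n_per_value.items():
        # sample without replacement among this value's positions
        selected += random.sample(list(np.flatnonzero(attr == value)), n)
    return sorted(selected)

# balance_indices(['a', 'a', 'a', 'b', 'b']) keeps two 'a's and both 'b's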
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # rerunning produces a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))
    # but creating a balancer with an integer rng seed makes repeated
    # __call__s produce identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))
    # results should still differ across the multiple balanced datasets
    # produced by .generate
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))
    # and a second .generate pass should reproduce them exactly
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)
    # Contribution by Chris Markiewicz
    # interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)
    seq1, seq2, seq3 = [], [], []
    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))
    # produces the expected sequences
    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])
    # and all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same, but include all off-limit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three is still balanced, but the rest is not, i.e. all samples
    # are included
    assert_equal(
        get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
        [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(get_nelements_per_value(ds.sa.targets).values())
                 * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on a feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
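
# A minimal end-to-end sketch of the generator usage exercised above. The
# import paths and normal_feature_dataset() are assumptions about the mvpa2
# package layout (believed correct, but verify against your installation);
# the numbers are purely illustrative.
from mvpa2.generators.resampling import Balancer
from mvpa2.misc.data_generators import normal_feature_dataset

# 4 labels x 20 samples each, spread over 5 chunks -> 4 per label per chunk
ds = normal_feature_dataset(perlabel=20, nlabels=4, nchunks=5)
bal = Balancer(amount=0.5, count=3, apply_selection=True)
for balanced in bal.generate(ds):
    # each draw keeps half of each label within each chunk -> 40 of 80 samples
    print(len(balanced))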