def _get_call_kwargs(self, ds):
    """Assemble the keyword arguments handed to _call()."""
    attr, collection = ds.get_attr(self._attr)
    # _call might need to operate on the dedicated instantiated rng,
    # e.g. if an int seed was provided
    kwargs = {}
    kwargs['limit_filter'] = get_limit_filter(self._limit, collection)
    kwargs['rng'] = get_rng(self._rng)
    return kwargs
def _get_call_kwargs(self, ds):
    """Assemble the keyword arguments handed to _call()."""
    attr, collection = ds.get_attr(self._attr)
    # _call might need to operate on the dedicated instantiated rng,
    # e.g. if an int seed was provided
    kwargs = {}
    kwargs['limit_filter'] = get_limit_filter(self._limit, collection)
    kwargs['rng'] = get_rng(self._rng)
    return kwargs
def _get_pcfg(self, ds):
    """Return the limit filter for the collection holding the permuted attr."""
    pattr = self._pattr
    # a non-string must be a sequence of attrs; any element locates the
    # collection, so the first one suffices (we only need the shape)
    attr_name = pattr if isinstance(pattr, str) else pattr[0]
    pattr, collection = ds.get_attr(attr_name)
    return get_limit_filter(self._limit, collection)
def _get_pcfg(self, ds):
    """Return the limit filter for the collection holding the permuted attr."""
    pattr = self._pattr
    # a non-string must be a sequence of attrs; any element locates the
    # collection, so the first one suffices (we only need the shape)
    attr_name = pattr if isinstance(pattr, str) else pattr[0]
    pattr, collection = ds.get_attr(attr_name)
    return get_limit_filter(self._limit, collection)
def generate(self, ds):
    """Generate the desired number of balanced datasets.

    The limit filter is computed once up-front and reused for all
    yielded datasets; it is always cleared when the generator
    terminates, so a subsequent direct call to the object recomputes
    it from scratch.
    """
    # figure out filter for all runs at once
    attr, collection = ds.get_attr(self._attr)
    self._limit_filter = get_limit_filter(self._limit, collection)
    try:
        # permute as often as requested
        for i in xrange(self.count):
            yield self(ds)
    finally:
        # reset filter to do the right thing upon next call to object;
        # the finally-clause also fires when the consumer abandons the
        # generator early, which previously left a stale filter behind
        self._limit_filter = None
def generate(self, ds):
    """Generate the desired number of balanced datasets.

    The limit filter is computed once up-front and reused for all
    yielded datasets; it is always cleared when the generator
    terminates, so a subsequent direct call to the object recomputes
    it from scratch.
    """
    # figure out filter for all runs at once
    attr, collection = ds.get_attr(self._attr)
    self._limit_filter = get_limit_filter(self._limit, collection)
    try:
        # permute as often as requested
        for i in xrange(self.count):
            yield self(ds)
    finally:
        # reset filter to do the right thing upon next call to object;
        # the finally-clause also fires when the consumer abandons the
        # generator early, which previously left a stale filter behind
        self._limit_filter = None
def _get_call_kwargs(self, ds):
    """Assemble the keyword arguments handed to _call()."""
    # determine the to-be-permuted attribute in order to find the collection
    pattr = self._pattr
    # a non-string must be a sequence of attrs; take the first since we
    # only need the shape
    attr_name = pattr if isinstance(pattr, str) else pattr[0]
    pattr, collection = ds.get_attr(attr_name)
    # _call might need to operate on the dedicated instantiated rng,
    # e.g. if an int seed was provided
    # NOTE(review): a sibling method reads `self._rng` while this one reads
    # `self.rng` -- confirm the enclosing class exposes both
    return dict(limit_filter=get_limit_filter(self._limit, collection),
                rng=get_rng(self.rng))
def _get_call_kwargs(self, ds):
    """Assemble the keyword arguments handed to _call()."""
    # determine the to-be-permuted attribute in order to find the collection
    pattr = self._pattr
    # a non-string must be a sequence of attrs; take the first since we
    # only need the shape
    attr_name = pattr if isinstance(pattr, str) else pattr[0]
    pattr, collection = ds.get_attr(attr_name)
    # _call might need to operate on the dedicated instantiated rng,
    # e.g. if an int seed was provided
    # NOTE(review): a sibling method reads `self._rng` while this one reads
    # `self.rng` -- confirm the enclosing class exposes both
    return dict(limit_filter=get_limit_filter(self._limit, collection),
                rng=get_rng(self.rng))
def _call(self, ds):
    """Derive one balanced selection from ``ds``.

    Within each "chunk" defined by the limit filter, a number of
    elements per unique value of the balanced attribute is drawn at
    random (via ``random.sample``).  The resulting boolean selection is
    either applied directly (returning a sliced dataset) or stored as a
    new attribute in a shallow copy of ``ds``.
    """
    # local binding
    amount = self._amount
    attr, collection = ds.get_attr(self._attr)
    # get filter if not set already (maybe from generate())
    if self._limit_filter is None:
        limit_filter = get_limit_filter(self._limit, collection)
    else:
        limit_filter = self._limit_filter
    # ids of elements that are part of the balanced set
    balanced_set = []
    # ids of all elements that were subject to limiting/balancing at all
    full_limit_set = []
    # for each chunk in the filter (might be just the selected ones)
    for limit_value in np.unique(limit_filter):
        # NOTE(review): np.bool is removed in NumPy >= 1.24; plain `bool`
        # behaves identically here -- consider updating
        if limit_filter.dtype == np.bool:
            # simple boolean filter -> do nothing on False
            if not limit_value:
                continue
            # otherwise get indices of "selected ones"
            limit_idx = limit_filter.nonzero()[0]
        else:
            # non-boolean limiter -> determine "chunk" and balance within
            limit_idx = (limit_filter == limit_value).nonzero()[0]
        # NOTE(review): indentation reconstructed from a collapsed source --
        # assumed to accumulate for every processed chunk (both filter
        # types); confirm against upstream
        full_limit_set += list(limit_idx)
        # apply the current limit to the target attribute
        # need list to index properly
        attr_limited = attr[list(limit_idx)]
        uattr_limited = np.unique(attr_limited)
        # handle all types of supported arguments
        if amount == 'equal':
            # go for maximum possible number of samples provided
            # by each label in this dataset
            # determine the min number of samples per class
            epa = get_nelements_per_value(attr_limited)
            min_epa = min(epa.values())
            for k in epa:
                epa[k] = min_epa
        elif isinstance(amount, float):
            # fraction of the available elements per unique value
            epa = get_nelements_per_value(attr_limited)
            for k in epa:
                epa[k] = int(round(epa[k] * amount))
        elif isinstance(amount, int):
            # fixed absolute count per unique value
            epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
        else:
            raise ValueError("Unknown type of amount argument '%s'" % amount)
        # select determined number of elements per unique attribute value
        selected = []
        for ua in uattr_limited:
            selected += random.sample(
                list((attr_limited == ua).nonzero()[0]), epa[ua])
        # determine the final indices of selected elements and store
        # as part of the balanced set
        balanced_set += list(limit_idx[selected])
    # make full-sized boolean selection attribute and put it into
    # the right collection of the output dataset
    if self._include_offlimit:
        # start with all-in
        battr = np.ones(len(attr), dtype=np.bool)
        # throw out all samples that could have been limited
        battr[full_limit_set] = False
        # put back the ones that got into the balanced set
        battr[balanced_set] = True
    else:
        # start with nothing
        battr = np.zeros(len(attr), dtype=np.bool)
        # only keep the balanced set
        battr[balanced_set] = True
    if self._apply_selection:
        # slice along the axis matching the attribute's collection
        if collection is ds.sa:
            return ds[battr]
        elif collection is ds.fa:
            return ds[:, battr]
        else:
            # paranoid
            raise RuntimeError(
                "Don't know where this collection comes from. "
                "This should never happen!")
    else:
        # shallow copy of the dataset for output
        out = ds.copy(deep=False)
        if collection is ds.sa:
            out.sa[self.get_space()] = battr
        elif collection is ds.fa:
            out.fa[self.get_space()] = battr
        else:
            # paranoid
            raise RuntimeError(
                "Don't know where this collection comes from. "
                "This should never happen!")
        return out
def _call(self, ds):
    """Derive one balanced selection from ``ds``.

    Within each "chunk" defined by the limit filter, a number of
    elements per unique value of the balanced attribute is drawn at
    random (via ``random.sample``).  The resulting boolean selection is
    either applied directly (returning a sliced dataset) or stored as a
    new attribute in a shallow copy of ``ds``.
    """
    # local binding
    amount = self._amount
    attr, collection = ds.get_attr(self._attr)
    # get filter if not set already (maybe from generate())
    if self._limit_filter is None:
        limit_filter = get_limit_filter(self._limit, collection)
    else:
        limit_filter = self._limit_filter
    # ids of elements that are part of the balanced set
    balanced_set = []
    # ids of all elements that were subject to limiting/balancing at all
    full_limit_set = []
    # for each chunk in the filter (might be just the selected ones)
    for limit_value in np.unique(limit_filter):
        # NOTE(review): np.bool is removed in NumPy >= 1.24; plain `bool`
        # behaves identically here -- consider updating
        if limit_filter.dtype == np.bool:
            # simple boolean filter -> do nothing on False
            if not limit_value:
                continue
            # otherwise get indices of "selected ones"
            limit_idx = limit_filter.nonzero()[0]
        else:
            # non-boolean limiter -> determine "chunk" and balance within
            limit_idx = (limit_filter == limit_value).nonzero()[0]
        # NOTE(review): indentation reconstructed from a collapsed source --
        # assumed to accumulate for every processed chunk (both filter
        # types); confirm against upstream
        full_limit_set += list(limit_idx)
        # apply the current limit to the target attribute
        # need list to index properly
        attr_limited = attr[list(limit_idx)]
        uattr_limited = np.unique(attr_limited)
        # handle all types of supported arguments
        if amount == 'equal':
            # go for maximum possible number of samples provided
            # by each label in this dataset
            # determine the min number of samples per class
            epa = get_nelements_per_value(attr_limited)
            min_epa = min(epa.values())
            for k in epa:
                epa[k] = min_epa
        elif isinstance(amount, float):
            # fraction of the available elements per unique value
            epa = get_nelements_per_value(attr_limited)
            for k in epa:
                epa[k] = int(round(epa[k] * amount))
        elif isinstance(amount, int):
            # fixed absolute count per unique value
            epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
        else:
            raise ValueError("Unknown type of amount argument '%s'" % amount)
        # select determined number of elements per unique attribute value
        selected = []
        for ua in uattr_limited:
            selected += random.sample(list((attr_limited == ua).nonzero()[0]),
                                      epa[ua])
        # determine the final indices of selected elements and store
        # as part of the balanced set
        balanced_set += list(limit_idx[selected])
    # make full-sized boolean selection attribute and put it into
    # the right collection of the output dataset
    if self._include_offlimit:
        # start with all-in
        battr = np.ones(len(attr), dtype=np.bool)
        # throw out all samples that could have been limited
        battr[full_limit_set] = False
        # put back the ones that got into the balanced set
        battr[balanced_set] = True
    else:
        # start with nothing
        battr = np.zeros(len(attr), dtype=np.bool)
        # only keep the balanced set
        battr[balanced_set] = True
    if self._apply_selection:
        # slice along the axis matching the attribute's collection
        if collection is ds.sa:
            return ds[battr]
        elif collection is ds.fa:
            return ds[:, battr]
        else:
            # paranoid
            raise RuntimeError(
                "Don't know where this collection comes from. "
                "This should never happen!")
    else:
        # shallow copy of the dataset for output
        out = ds.copy(deep=False)
        if collection is ds.sa:
            out.sa[self.get_space()] = battr
        elif collection is ds.fa:
            out.fa[self.get_space()] = battr
        else:
            # paranoid
            raise RuntimeError(
                "Don't know where this collection comes from. "
                "This should never happen!")
        return out