def get_scores(y_test, y_probs, t=0.5, beta=2):
    """Threshold predicted scores at ``t`` and score the resulting hard labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_probs : array
        Predicted scores; must support ``>=`` and ``.astype`` (e.g. a NumPy
        array).
    t : float, default=0.5
        Decision threshold; scores ``>= t`` become label 1, others 0.
    beta : float, default=2
        ``beta`` forwarded to ``mr.fbeta_score``.

    Returns
    -------
    list
        ``[roc_auc, f1, precision, recall, -false_positive_rate, fbeta]``,
        in that order.
    """
    prob_to_label = (y_probs >= t).astype("int")

    # NOTE: ROC AUC is computed on the thresholded labels (not on the raw
    # scores) and, unlike the other sklearn-style metrics below, without
    # sample weights.
    roc_auc = mr.roc_auc_score(y_test, prob_to_label)

    # The balanced weights depend only on y_test, so compute them once and
    # share them between the weighted metrics instead of recomputing the
    # same vector for every metric.
    balanced_weights = compute_sample_weight(class_weight="balanced", y=y_test)

    return [
        roc_auc,
        mr.f1_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
        ),
        mr.precision_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
            zero_division=0,  # default="warn", which is same as 0
        ),
        mr.recall_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
        ),
        # Negated so that, like the other entries, larger is better.
        -1 * false_positive_rate_scorer(y_test, prob_to_label),
        mr.fbeta_score(
            y_test,
            prob_to_label,
            beta=beta,
            sample_weight=balanced_weights,
        ),
    ]
def __getitem__(self, idx):
    """Assemble batch ``idx``: known samples plus silence and unknown extras.

    Returns ``(batch, ohe_batch)`` when ``self.balance == 0``; otherwise
    ``(batch, ohe_batch, weights)`` where ``weights`` are balanced sample
    weights computed from the label ids of this batch.
    """
    lo = idx * self.batch_size
    hi = (idx + 1) * self.batch_size
    x = self.known[lo:hi]
    y = self.labels[lo:hi]

    # NOTE(review): ``self._get_silence`` is appended without being called;
    # presumably it is a property - confirm against the class definition.
    silence_count = self.n_silence
    x.extend(self._get_silence for _ in range(silence_count))
    y.extend(['silence'] * silence_count)

    # Randomly drawn (with repetition) samples from the "unknown" pool.
    picks = np.random.randint(0, len(self.unknown), self.n_unknown)
    x.extend(self.unknown[pick] for pick in picks)
    y.extend(['unknown'] * len(picks))

    label_ids = [LABEL2ID[label] for label in y]

    transform = self._pad_sample if self.augment == 0 else self._augment_sample
    batch = [transform(sample) for sample in x]

    def _smoothed_target(class_id):
        # Label smoothing: 1 - eps on the true class, eps spread evenly
        # over the remaining classes.
        target = np.ones(N_CLASS) * self.eps / (N_CLASS - 1)
        target[class_id] = 1 - self.eps
        return target

    ohe_batch = [_smoothed_target(class_id) for class_id in label_ids]

    batch = np.array(batch)
    ohe_batch = np.array(ohe_batch)
    batch = batch.reshape((-1, 1, L))

    if self.balance != 0:
        weights = compute_sample_weight('balanced', label_ids)
        return batch, ohe_batch, weights
    return batch, ohe_batch
def initialize_class_weights(y_data, n_classes=None, name=None,
                             return_numpy_object=True, class_weight=None):
    """Build a normalised (n_samples, n_classes) weight matrix from labels.

    For 1-D ``y_data`` (class indices) the matrix starts as ones with a zero
    at each sample's own class; for 2-D ``y_data`` it starts as
    ``1 - y_data``. It is then scaled by ``1 / (n_samples * (n_classes - 1))``,
    multiplied row-wise by ``compute_sample_weight(class_weight, classes)``
    and finally normalised so that all entries sum to 1.

    Parameters
    ----------
    y_data : ndarray
        1-D class indices or a 2-D per-class array; must be non-negative.
    n_classes : int, optional
        Number of classes. Defaults to the number of distinct labels (1-D)
        or ``y_data.shape[1]`` (2-D, where it must match if given).
    name : str, optional
        Name given to the Theano shared variable.
    return_numpy_object : bool, default=True
        Return a NumPy array when true, else a ``theano.shared`` variable.
    class_weight : optional
        Forwarded to ``compute_sample_weight``.
    """
    assert y_data.min() >= 0

    if y_data.ndim != 1:
        class_ = np.argmax(y_data, axis=1)
        if n_classes is None:
            n_classes = y_data.shape[1]
        else:
            assert n_classes == y_data.shape[1]
        w = 1 - y_data
    else:
        class_ = y_data
        if n_classes is None:
            n_classes = len(np.unique(y_data))
        n_samples = y_data.shape[0]
        w = np.ones((n_samples, n_classes))
        # Zero out the entry of each sample's own class.
        w[np.arange(n_samples), y_data.astype(int)] = 0

    assert n_classes >= 2

    rows, cols = w.shape[0], w.shape[1]
    w = w / (rows * (cols - 1))

    per_sample = compute_sample_weight(class_weight, class_)
    w = w * per_sample.reshape((-1, 1))
    w = w / w.sum()

    if not return_numpy_object:
        return theano.shared(w.astype(theano.config.floatX), name=name)
    return w
def __getitem__(self, idx):
    """Assemble batch ``idx`` of prepared images, manipulation flags and labels.

    Returns ``([images, manip_flags], labels)`` and, when ``self.balance``
    is truthy, a third element holding balanced sample weights computed
    from the label ids of this batch.
    """
    first = idx * self.batch_size
    last = (idx + 1) * self.batch_size
    x = self.images[first:last]
    y = self.labels[first:last]
    label_ids = [LABEL2ID[label] for label in y]

    # Prepare every image through the worker pool; each result is an
    # (image, manip_flag) pair.
    prepare_args = list(zip(x, [self.center] * len(x)))
    prepared = list(self.p.imap(self._prepare_image, prepare_args))
    images_batch = [image for image, _ in prepared]
    manip_flags = [manip_flag for _, manip_flag in prepared]

    if self.augmentation:
        augment_args = list(zip(images_batch, label_ids))
        images_batch = list(self.p.imap(self._augment_image, augment_args))

    def _one_hot(class_id):
        encoded = np.zeros(N_CLASS)
        encoded[class_id] = 1
        return encoded

    labels_batch = [_one_hot(class_id) for class_id in label_ids]

    images_batch = self._preprocess_batch(
        np.array(images_batch).astype(np.float32))
    manip_flags = np.array(manip_flags)
    labels_batch = np.array(labels_batch)

    inputs = [images_batch, manip_flags]
    if not self.balance:
        return inputs, labels_batch
    weights = compute_sample_weight('balanced', label_ids)
    return inputs, labels_batch, weights
def fit(self, X_train, Y_train):
    """Fit every classifier in ``self.clf_list`` using balanced sample weights."""
    for clf in self.clf_list:
        # A fresh weight vector is built for each classifier, so no array
        # is shared between successive ``fit`` calls.
        balanced_weights = compute_sample_weight("balanced", Y_train)
        clf.fit(X_train, Y_train, sample_weight=balanced_weights)
def roc_auc_binary_scorer(y_true, y_pred):
    """Return the weighted-average ROC AUC using balanced sample weights.

    The sample weights are derived from ``y_true`` with
    ``class_weight="balanced"``.
    """
    balanced_weights = compute_sample_weight(class_weight="balanced", y=y_true)
    return mr.roc_auc_score(
        y_true,
        y_pred,
        average="weighted",
        sample_weight=balanced_weights,
    )
def recall_binary_scorer(y_true, y_pred):
    """Return binary recall using balanced sample weights.

    The sample weights are derived from ``y_true`` with
    ``class_weight="balanced"``.
    """
    balanced_weights = compute_sample_weight(class_weight="balanced", y=y_true)
    return mr.recall_score(
        y_true,
        y_pred,
        average="binary",
        sample_weight=balanced_weights,
    )
def test_compute_sample_weight():
    # Exercise (and illustrate) compute_sample_weight in each of its modes.
    # The "auto" mode is expected to emit a DeprecationWarning throughout.
    ones_6 = [1., 1., 1., 1., 1., 1.]

    # Balanced classes
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, ones_6)
    weights = compute_sample_weight("balanced", labels)
    assert_array_almost_equal(weights, ones_6)

    # User-defined weights
    weights = compute_sample_weight({1: 2, 2: 1}, labels)
    assert_array_almost_equal(weights, [2., 2., 2., 1., 1., 1.])

    # Column vector of balanced classes
    labels = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, ones_6)
    weights = compute_sample_weight("balanced", labels)
    assert_array_almost_equal(weights, ones_6)

    # Unbalanced classes
    labels = np.asarray([1, 1, 1, 2, 2, 2, 3])
    expected_auto = np.asarray([.6, .6, .6, .6, .6, .6, 1.8])
    expected_balanced = np.array([0.7777, 0.7777, 0.7777, 0.7777,
                                  0.7777, 0.7777, 2.3333])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, expected_auto)
    weights = compute_sample_weight("balanced", labels)
    assert_array_almost_equal(weights, expected_balanced, decimal=4)

    # `None` weights
    weights = compute_sample_weight(None, labels)
    assert_array_almost_equal(weights, [1., 1., 1., 1., 1., 1., 1.])

    # Multi-output of balanced classes
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, ones_6)
    weights = compute_sample_weight("balanced", labels)
    assert_array_almost_equal(weights, ones_6)

    # Multi-output with user-defined weights
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], labels)
    assert_array_almost_equal(weights, [2., 2., 2., 2., 2., 2.])

    # Multi-output of unbalanced classes: per-column weights are multiplied
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1],
                         [3, -1]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, expected_auto ** 2)
    weights = compute_sample_weight("balanced", labels)
    assert_array_almost_equal(weights, expected_balanced ** 2, decimal=3)
def accumulated(self):
    """Greedily accumulate rules and write the running performance to a file.

    Loads the CSV at ``self.args.data``, draws a 30% bootstrap sample of it,
    and computes balanced sample weights from its ``'Y'`` column. Rules from
    ``self.rule_perf`` (excluding indices in ``self.redudant``) are then
    combined with ``|`` one at a time: on each round up to four remaining
    rules are evaluated in combination with the rule accumulated so far, the
    candidate ranked first by ``self.sortby[self.args.sort]`` is taken, and
    it is kept only if the combined recall exceeds
    ``prev_recall * self.args.dev_ratio``.

    The per-step log is printed and written next to ``self.rulef`` as
    ``<rulef stem>-<sort>-<dev_ratio>-accumulated.txt``.
    """
    print("accumulated")
    self.alldata = pd.read_csv(self.args.data)
    # Bootstrap sample (with replacement) of 30% of the rows, re-indexed from 0.
    self.pddata = self.alldata.sample(frac=0.3, replace=True)
    self.pddata = self.pddata.reset_index(drop=1)
    #embed()
    sample_weight = class_weight.compute_sample_weight(
        "balanced", self.pddata['Y'])
    # Start from ``False`` so that the first ``acculated_r | r`` yields the
    # first rule alone - presumably relies on the rule type's reflected
    # ``|`` operator; TODO confirm.
    acculated_r = False
    acculated_p = 0
    acculated_recall = 0
    prev_p = 0
    prev_recall = 0
    out = ""
    self.rule_perf = sorted(self.rule_perf,
                            key=self.sortby[self.args.sort])
    #self.rule_perf=sorted(self.rule_perf,key = lambda x: -x[1]*x[2]/(x[1]+x[2]))
    # Indices of rules still available, minus those flagged as redundant.
    index_set = set(range(len(self.rule_perf))) - self.redudant
    print(index_set)
    count = 0
    while index_set:
        pick_index = 0
        candidates = []
        # Only the first four indices of the set's (arbitrary) iteration
        # order are considered on each round.
        for index in list(index_set)[:4]:
            r, _, _ = self.rule_perf[index]
            tmp_r = acculated_r | r
            precision, recall = xgbtree_rule_perf(str(tmp_r), self.pddata,
                                                  self.pddata['Y'],
                                                  sample_weight)
            candidates.append((index, precision, recall, tmp_r))
        candidates = sorted(candidates, key=self.sortby[self.args.sort])
        print("candidates", candidates)
        i, acculated_p, acculated_recall, tmp_r = candidates[0]
        # The chosen index is consumed whether or not the rule is kept.
        index_set.remove(i)
        r, precision, recall = self.rule_perf[i]
        # Reject rules that do not raise recall enough over the last kept step.
        if acculated_recall <= prev_recall * self.args.dev_ratio:
            continue
        acculated_r = acculated_r | r
        prev_p = acculated_p
        prev_recall = acculated_recall
        out = out + f"{i}:{r}, {acculated_p}, {acculated_recall},{precision},{recall}\n"
    # The string literal below is a no-op expression holding an earlier,
    # disabled version of the accumulation loop.
    """ for r, precision, recall in self.rule_perf: if not acculated_r: acculated_r= r else: acculated_r = acculated_r | r acculated_p, acculated_recall = xgbtree_rule_perf(str(acculated_r),self.pddata,self.pddata['Y'],sample_weight) out =out + f"{r}, {acculated_p}, {acculated_recall},{precision},{recall}\n" """
    print(out)
    with open(
            os.path.join(
                os.path.dirname(self.rulef),
                os.path.splitext(os.path.basename(self.rulef))[0] +
                f"-{self.args.sort}-{self.args.dev_ratio}-accumulated.txt"
            ), "w") as f:
        f.write(out)
def test_compute_sample_weight_with_subsample():
    # Exercise compute_sample_weight when a subsample (indices) is given.
    # The "auto" mode is expected to emit a DeprecationWarning throughout.
    ones_6 = [1., 1., 1., 1., 1., 1.]
    ones_6_then_zero = [1., 1., 1., 1., 1., 1., 0.]
    bootstrap = [0, 1, 1, 2, 2, 3]

    # Balanced classes, all samples present
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, ones_6)
    weights = compute_sample_weight("balanced", labels, range(6))
    assert_array_almost_equal(weights, ones_6)

    # Column vector of balanced classes, all samples present
    labels = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels)
    assert_array_almost_equal(weights, ones_6)
    weights = compute_sample_weight("balanced", labels, range(6))
    assert_array_almost_equal(weights, ones_6)

    # Plain subsample
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels, range(4))
    assert_array_almost_equal(weights, [.5, .5, .5, 1.5, 1.5, 1.5])
    weights = compute_sample_weight("balanced", labels, range(4))
    assert_array_almost_equal(weights,
                              [2. / 3, 2. / 3, 2. / 3, 2., 2., 2.])

    # Bootstrap subsample
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    expected_auto = np.asarray([1 / 3., 1 / 3., 1 / 3.,
                                5 / 3., 5 / 3., 5 / 3.])
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels, bootstrap)
    assert_array_almost_equal(weights, expected_auto)
    weights = compute_sample_weight("balanced", labels, bootstrap)
    assert_array_almost_equal(weights, expected_balanced)

    # Bootstrap subsample for multi-output: per-column weights are multiplied
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels, bootstrap)
    assert_array_almost_equal(weights, expected_auto ** 2)
    weights = compute_sample_weight("balanced", labels, bootstrap)
    assert_array_almost_equal(weights, expected_balanced ** 2)

    # A class missing from the subsample gets weight zero
    labels = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)
    weights = compute_sample_weight("balanced", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)

    # Missing class for multi-output
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1],
                         [2, 2]])
    weights = assert_warns(DeprecationWarning,
                           compute_sample_weight, "auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)
    weights = compute_sample_weight("balanced", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)
def test_compute_sample_weight():
    # Exercise (and illustrate) compute_sample_weight in each of its modes.
    ones_6 = [1., 1., 1., 1., 1., 1.]

    # Balanced classes
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", labels)
    assert_array_almost_equal(weights, ones_6)

    # User-defined weights
    weights = compute_sample_weight({1: 2, 2: 1}, labels)
    assert_array_almost_equal(weights, [2., 2., 2., 1., 1., 1.])

    # Column vector of balanced classes
    labels = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = compute_sample_weight("auto", labels)
    assert_array_almost_equal(weights, ones_6)

    # Unbalanced classes
    labels = np.asarray([1, 1, 1, 2, 2, 2, 3])
    expected = np.asarray([.6, .6, .6, .6, .6, .6, 1.8])
    weights = compute_sample_weight("auto", labels)
    assert_array_almost_equal(weights, expected)

    # `None` weights
    weights = compute_sample_weight(None, labels)
    assert_array_almost_equal(weights, [1., 1., 1., 1., 1., 1., 1.])

    # Multi-output of balanced classes
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight("auto", labels)
    assert_array_almost_equal(weights, ones_6)

    # Multi-output with user-defined weights
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], labels)
    assert_array_almost_equal(weights, [2., 2., 2., 2., 2., 2.])

    # Multi-output of unbalanced classes: per-column weights are multiplied
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1],
                         [3, -1]])
    weights = compute_sample_weight("auto", labels)
    assert_array_almost_equal(weights, expected ** 2)
def test_use_sample_weights(self):
    """Fitting with sample weights must not score worse than fitting without."""
    x, y = self.multinomial[1]

    # Drop all but the last three class-0 rows to unbalance the data.
    class_0_rows = np.where(y == 0)[0]
    keep_mask = np.ones(len(y), dtype=bool)
    keep_mask[class_0_rows[:-3]] = False
    y = y[keep_mask]
    x = x[keep_mask, :]

    sample_weight = class_weight.compute_sample_weight('balanced', y)
    sample_weight[0] = 0.

    unweighted = LogitNet(random_state=2, scoring='f1_micro').fit(x, y)
    unweighted_acc = f1_score(
        y,
        unweighted.predict(x),
        sample_weight=sample_weight,
        average='micro',
    )

    weighted = LogitNet(random_state=2, scoring='f1_micro').fit(
        x, y, sample_weight=sample_weight)
    weighted_acc = f1_score(
        y,
        weighted.predict(x),
        sample_weight=sample_weight,
        average='micro',
    )

    self.assertTrue(weighted_acc >= unweighted_acc)
def test_compute_sample_weight_with_subsample():
    """Exercise compute_sample_weight when a subsample (indices) is given."""
    ones_6 = [1., 1., 1., 1., 1., 1.]
    ones_6_then_zero = [1., 1., 1., 1., 1., 1., 0.]
    bootstrap = [0, 1, 1, 2, 2, 3]

    # Balanced classes, all samples present
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6)

    # Column vector of balanced classes, all samples present
    labels = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = compute_sample_weight("auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6)

    # Plain subsample
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", labels, range(4))
    assert_array_almost_equal(weights, [.5, .5, .5, 1.5, 1.5, 1.5])

    # Bootstrap subsample
    labels = np.asarray([1, 1, 1, 2, 2, 2])
    expected = np.asarray([1/3., 1/3., 1/3., 5/3., 5/3., 5/3.])
    weights = compute_sample_weight("auto", labels, bootstrap)
    assert_array_almost_equal(weights, expected)

    # Bootstrap subsample for multi-output: per-column weights are multiplied
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight("auto", labels, bootstrap)
    assert_array_almost_equal(weights, expected ** 2)

    # A class missing from the subsample gets weight zero
    labels = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = compute_sample_weight("auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)

    # Missing class for multi-output
    labels = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1],
                         [2, 2]])
    weights = compute_sample_weight("auto", labels, range(6))
    assert_array_almost_equal(weights, ones_6_then_zero)
def __call__(self, class_ratio, fv_train, class_train):
    """Train and return a randomly seeded gradient-boosting classifier.

    Class 0 samples get weight 1 and class 1 samples get weight
    ``self.base_class_ratio / class_ratio``.
    """
    seed = random.randint(0, 2**31)
    model = ensemble.GradientBoostingClassifier(random_state=seed,
                                                **self.cfier_params)
    per_class = {0: 1, 1: self.base_class_ratio/class_ratio}
    weights = compute_sample_weight(per_class, class_train)
    model.fit(fv_train, class_train, sample_weight=weights)
    return model
def test_compute_sample_weight_more_than_32():
    # Non-regression smoke test for #12146
    n_labels = 50  # more than 32 distinct classes
    y = np.arange(n_labels)
    indices = np.arange(n_labels)  # use subsampling
    weight = compute_sample_weight('balanced', y, indices=indices)
    expected = np.ones(y.shape[0])
    assert_array_almost_equal(weight, expected)
def resample_data(arr, weights, sample_fraction=1.0, rng_seed=None):
    """Resample indices of ``arr`` according to ``weights``.

    This can be used to resample unbalanced datasets (either by label, or by
    some external grouping). It wraps
    ``sklearn.utils.class_weight.compute_sample_weight``.

    Parameters
    ----------
    arr : array-like, shape = [n_samples] or [n_samples, outputs]
        Array of labels/classes/groups to balance.
    weights : dict, list of dicts, "balanced", or None
        Weights associated with ``arr`` in the form ``{label: weight}``.
        When a dict is given, its keys must be exactly the unique values
        present in ``arr`` and its values must sum to 1. ``None`` gives every
        class a weight of 1. For multi-output problems a list of dicts can be
        provided, in the same order as the columns of ``arr``.

        The "balanced" mode uses the values of ``arr`` to automatically
        adjust weights inversely proportional to class frequencies.

        For multi-output, the weights of each column are multiplied.
    sample_fraction : float, default=1.0
        Fraction of ``len(arr)`` to return. For example
        ``sample_fraction=2.0`` returns ``2 * len(arr)`` indices.
    rng_seed : int, default=None
        Seed for the random state used to draw the indices. When ``None``
        the global NumPy random state is used.

    Returns
    -------
    idxs : ndarray, shape = (int(sample_fraction * len(arr)),)
        Indices into ``arr``, drawn with replacement with probability
        proportional to each sample's weight.

    Raises
    ------
    RuntimeError
        If ``weights`` is a dict whose keys differ from the unique values of
        ``arr``, or whose values do not sum to 1.
    """
    # Use the global NumPy RNG unless a seed was supplied.
    if rng_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(seed=rng_seed)

    # Validate dictionary weights before handing them to scikit-learn.
    if isinstance(weights, dict):
        uniques = np.unique(arr)
        # Materialise the keys first: on Python 3 ``dict.keys()`` is a view
        # that NumPy cannot sort directly.
        lbl_weights = np.unique(list(weights.keys()))
        # ``array_equal`` also copes with a differing number of labels,
        # where an element-wise comparison would fail to broadcast.
        if not np.array_equal(lbl_weights, uniques):
            raise RuntimeError("Unique labels of `arr` are not all contained "
                               "within keys of `weights`. Found \n"
                               "unique(arr) = %s\n"
                               "weights.keys() = %s"
                               % (list(uniques), list(lbl_weights)))

        # The weights are probabilities and must add up to 1. Compare with a
        # tolerance so that e.g. 0.1 + 0.2 + 0.7 is not rejected because of
        # floating-point rounding.
        weight_vals = float(sum(weights.values()))
        if not np.isclose(weight_vals, 1.0):
            raise RuntimeError("Weight values (probabilities) do not add to "
                               "1.0. Found sum(weight.values()) = %0.5f"
                               % weight_vals)

    # Per-sample weights, normalised into a probability vector.
    sample_weights = compute_sample_weight(weights, arr)
    sample_weights = sample_weights / float(np.sum(sample_weights))

    # Draw the requested number of indices with replacement.
    idxs = np.arange(len(arr))
    n_data = int(sample_fraction * len(arr))
    ret = rng.choice(idxs, size=n_data, replace=True, p=sample_weights)
    return ret
def __call__(self, class_ratio, fv_train, class_train):
    """Train and return a gradient-boosting classifier with class weighting.

    Class 0 samples get weight 1 and class 1 samples get weight
    ``self.base_class_ratio / class_ratio``.
    """
    model = ensemble.GradientBoostingClassifier(**self.cfier_params)
    per_class = {0: 1, 1: self.base_class_ratio/class_ratio}
    weights = compute_sample_weight(per_class, class_train)
    model.fit(fv_train, class_train, sample_weight=weights)
    return model