示例#1
0
def get_scores(y_test, y_probs, t=0.5, beta=2):
    """Score thresholded predictions with several classification metrics.

    ``y_probs`` is converted to hard 0/1 labels with ``y_probs >= t`` and
    every metric below is evaluated on those hard labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_probs : numpy.ndarray
        Predicted scores; must support ``>=`` and ``.astype``.
    t : float, default=0.5
        Decision threshold; scores ``>= t`` become label 1.
    beta : float, default=2
        ``beta`` forwarded to ``mr.fbeta_score``.

    Returns
    -------
    list
        ``[roc_auc, f1, precision, recall, negated_fpr, fbeta]`` where
        ``negated_fpr`` is ``-1 * false_positive_rate_scorer(...)``.
        F1, precision, recall and F-beta use "balanced" sample weights
        derived from ``y_test``; the other two entries are unweighted.
    """
    prob_to_label = (y_probs >= t).astype("int")
    # The balanced weights depend only on y_test, so compute them once and
    # share them between the four weighted metrics.
    balanced_weights = compute_sample_weight(class_weight="balanced",
                                             y=y_test)
    return [
        # NOTE(review): AUC is computed on the thresholded labels rather
        # than on y_probs -- confirm this is intended.
        mr.roc_auc_score(y_test, prob_to_label),
        mr.f1_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
        ),
        mr.precision_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
            zero_division=0,  # default="warn", which is same as 0
        ),
        mr.recall_score(
            y_test,
            prob_to_label,
            sample_weight=balanced_weights,
        ),
        -1 * false_positive_rate_scorer(y_test, prob_to_label),
        mr.fbeta_score(
            y_test,
            prob_to_label,
            beta=beta,
            sample_weight=balanced_weights,
        ),
    ]
示例#2
0
 def __getitem__(self, idx):
     """Assemble batch number ``idx``.

     The batch is the ``idx``-th slice of ``self.known`` / ``self.labels``,
     extended with ``self.n_silence`` silence entries and ``self.n_unknown``
     randomly drawn entries of ``self.unknown``. Labels are turned into
     smoothed one-hot rows (``1 - eps`` on the true class, ``eps`` spread
     over the others).

     Returns ``(batch, ohe_batch)`` when ``self.balance == 0``, otherwise
     ``(batch, ohe_batch, weights)`` with balanced per-sample weights.
     """
     start = idx * self.batch_size
     stop = (idx + 1) * self.batch_size
     x = self.known[start:stop]
     y = self.labels[start:stop]

     # NOTE(review): `_get_silence` is read without being called --
     # presumably a property; confirm.
     x.extend(self._get_silence for _ in range(self.n_silence))
     y.extend('silence' for _ in range(self.n_silence))

     picks = np.random.randint(0, len(self.unknown), self.n_unknown)
     x.extend(self.unknown[pick] for pick in picks)
     y.extend('unknown' for _ in picks)

     label_ids = [LABEL2ID[name] for name in y]

     if self.augment == 0:
         prepare = self._pad_sample
     else:
         prepare = self._augment_sample
     batch = [prepare(sample) for sample in x]

     def _smoothed_row(label_id):
         # eps is shared evenly by the N_CLASS - 1 "wrong" classes.
         row = np.ones(N_CLASS) * self.eps / (N_CLASS - 1)
         row[label_id] = 1 - self.eps
         return row

     ohe_batch = np.array([_smoothed_row(label_id) for label_id in label_ids])
     batch = np.array(batch).reshape((-1, 1, L))

     if self.balance == 0:
         return batch, ohe_batch
     return batch, ohe_batch, compute_sample_weight('balanced', label_ids)
示例#3
0
def initialize_class_weights(y_data, n_classes=None, name=None,
                             return_numpy_object=True,
                             class_weight=None):
    """Build a normalised (n_samples, n_classes) weight matrix.

    Each row is zero at the sample's own class and non-zero elsewhere.
    Rows are scaled by ``compute_sample_weight(class_weight, labels)`` and
    the whole matrix is normalised to sum to one.

    Parameters
    ----------
    y_data : numpy.ndarray
        Either a 1-D array of non-negative class indices or a 2-D array
        with one column per class (the label is taken as ``argmax``).
    n_classes : int, optional
        Number of classes. For 1-D input it defaults to the number of
        unique values; for 2-D input it must equal ``y_data.shape[1]``.
    name : str, optional
        Name given to the theano shared variable.
    return_numpy_object : bool, default=True
        When truthy return the numpy array, otherwise wrap it in
        ``theano.shared`` cast to ``theano.config.floatX``.
    class_weight : optional
        Forwarded to ``compute_sample_weight``.
    """
    assert y_data.min() >= 0

    if y_data.ndim != 1:
        labels = np.argmax(y_data, axis=1)
        if n_classes is None:
            n_classes = y_data.shape[1]
        else:
            assert n_classes == y_data.shape[1]
        weights = 1 - y_data
    else:
        labels = y_data
        if n_classes is None:
            n_classes = len(np.unique(y_data))
        n_samples = y_data.shape[0]
        weights = np.ones((n_samples, n_classes))
        # Zero out the entry of each sample's own class.
        weights[np.arange(n_samples), y_data.astype(int)] = 0

    assert n_classes >= 2

    # Uniform mass over the "other" classes of every sample.
    weights = weights / (weights.shape[0] * (weights.shape[1] - 1))

    per_sample = compute_sample_weight(class_weight, labels)
    weights = weights * per_sample.reshape((-1, 1))
    weights = weights / weights.sum()

    if not return_numpy_object:
        return theano.shared(weights.astype(theano.config.floatX), name=name)
    return weights
示例#4
0
 def __getitem__(self, idx):
     """Assemble batch number ``idx`` of images, manipulation flags and labels.

     Images are prepared through ``self.p.imap(self._prepare_image, ...)``
     and, when ``self.augmentation`` is truthy, augmented the same way
     with ``self._augment_image``. Labels become one-hot rows.

     Returns ``([images, manip_flags], labels)`` and, when ``self.balance``
     is truthy, a third element holding balanced per-sample weights.
     """
     lo = idx * self.batch_size
     hi = (idx + 1) * self.batch_size
     x = self.images[lo:hi]
     y = self.labels[lo:hi]
     label_ids = [LABEL2ID[name] for name in y]

     center = self.center
     prepare_args = [(item, center) for item in x]
     images_batch, manip_flags = [], []
     for image, manip_flag in self.p.imap(self._prepare_image, prepare_args):
         images_batch.append(image)
         manip_flags.append(manip_flag)

     if self.augmentation:
         augment_args = list(zip(images_batch, label_ids))
         images_batch = list(self.p.imap(self._augment_image, augment_args))

     def _one_hot(label_id):
         row = np.zeros(N_CLASS)
         row[label_id] = 1
         return row

     one_hot_rows = [_one_hot(label_id) for label_id in label_ids]

     images_batch = self._preprocess_batch(
         np.array(images_batch).astype(np.float32))
     manip_flags = np.array(manip_flags)
     labels_batch = np.array(one_hot_rows)

     inputs = [images_batch, manip_flags]
     if not self.balance:
         return inputs, labels_batch
     return inputs, labels_batch, compute_sample_weight('balanced', label_ids)
示例#5
0
 def fit(self, X_train, Y_train):
     """Fit every classifier in ``self.clf_list`` on the same training data.

     Each classifier receives "balanced" sample weights computed from
     ``Y_train``. Nothing is returned.
     """
     for clf in self.clf_list:
         balanced = compute_sample_weight("balanced", Y_train)
         clf.fit(X_train, Y_train, sample_weight=balanced)
示例#6
0
def roc_auc_binary_scorer(y_true, y_pred):
    """Return the weighted-average ROC AUC with balanced sample weights.

    The sample weights are derived from ``y_true`` with the "balanced"
    preset of ``compute_sample_weight``.
    """
    balanced = compute_sample_weight(class_weight="balanced", y=y_true)
    return mr.roc_auc_score(
        y_true, y_pred, average="weighted", sample_weight=balanced
    )
示例#7
0
def recall_binary_scorer(y_true, y_pred):
    """Return binary recall computed with balanced sample weights.

    The sample weights are derived from ``y_true`` with the "balanced"
    preset of ``compute_sample_weight``.
    """
    balanced = compute_sample_weight(class_weight="balanced", y=y_true)
    return mr.recall_score(
        y_true, y_pred, average="binary", sample_weight=balanced
    )
def test_compute_sample_weight():
    # Test (and demo) compute_sample_weight for the deprecated "auto"
    # preset, the "balanced" preset, explicit dicts and None.

    def deprecated_auto(labels):
        # "auto" must emit a DeprecationWarning; the weights are returned.
        return assert_warns(DeprecationWarning,
                            compute_sample_weight, "auto", labels)

    six_ones = [1., 1., 1., 1., 1., 1.]

    # Balanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2])
    assert_array_almost_equal(deprecated_auto(y), six_ones)
    assert_array_almost_equal(compute_sample_weight("balanced", y), six_ones)

    # User-defined weights
    weights = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(weights, [2., 2., 2., 1., 1., 1.])

    # Column vector of balanced classes
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    assert_array_almost_equal(deprecated_auto(y), six_ones)
    assert_array_almost_equal(compute_sample_weight("balanced", y), six_ones)

    # Unbalanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = deprecated_auto(y)
    expected_auto = np.asarray([.6, .6, .6, .6, .6, .6, 1.8])
    assert_array_almost_equal(weights, expected_auto)
    weights = compute_sample_weight("balanced", y)
    expected_balanced = np.array(
        [0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333])
    assert_array_almost_equal(weights, expected_balanced, decimal=4)

    # `None` weights
    weights = compute_sample_weight(None, y)
    assert_array_almost_equal(weights, [1., 1., 1., 1., 1., 1., 1.])

    # Multi-output of balanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    assert_array_almost_equal(deprecated_auto(y), six_ones)
    assert_array_almost_equal(compute_sample_weight("balanced", y), six_ones)

    # Multi-output with user-defined weights
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(weights, [2., 2., 2., 2., 2., 2.])

    # Multi-output of unbalanced classes: per-column weights multiply.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    assert_array_almost_equal(deprecated_auto(y), expected_auto ** 2)
    weights = compute_sample_weight("balanced", y)
    assert_array_almost_equal(weights, expected_balanced ** 2, decimal=3)
示例#9
0
 def accumulated(self):
     """Greedily accumulate rules and write the running performance to a file.

     Steps performed:

     * Load ``self.args.data`` as CSV into ``self.alldata`` and keep a 30%
       sample drawn with replacement (index reset) in ``self.pddata``.
     * Compute "balanced" sample weights from the ``'Y'`` column.
     * Sort ``self.rule_perf`` with the key ``self.sortby[self.args.sort]``.
     * Repeatedly take up to four of the remaining rule indices (those not
       in ``self.redudant``), OR each candidate rule into the accumulated
       rule, score it with ``xgbtree_rule_perf`` and pick the candidate
       that sorts first under the same key.
     * The picked rule is kept only when the resulting recall exceeds
       ``prev_recall * self.args.dev_ratio``; kept rules append one line
       to the report.
     * Print the report and write it to
       ``<rulef stem>-<sort>-<dev_ratio>-accumulated.txt`` in the directory
       of ``self.rulef``.

     Returns None. Overwrites ``self.alldata``, ``self.pddata`` and
     ``self.rule_perf``.
     """
     print("accumulated")
     self.alldata = pd.read_csv(self.args.data)
     # Bootstrap-style 30% sample (with replacement) used for all scoring below.
     self.pddata = self.alldata.sample(frac=0.3, replace=True)
     self.pddata = self.pddata.reset_index(drop=1)
     #embed()
     sample_weight = class_weight.compute_sample_weight(
         "balanced", self.pddata['Y'])
     # Starts as False and is OR-ed with rule objects below -- presumably the
     # rule type supports `bool | rule`; confirm.
     acculated_r = False
     acculated_p = 0
     acculated_recall = 0
     # NOTE(review): prev_p is assigned but never read in this method.
     prev_p = 0
     prev_recall = 0
     out = ""
     self.rule_perf = sorted(self.rule_perf,
                             key=self.sortby[self.args.sort])
     #self.rule_perf=sorted(self.rule_perf,key = lambda x: -x[1]*x[2]/(x[1]+x[2]))
     # Indices of rules still available, excluding the redundant ones.
     index_set = set(range(len(self.rule_perf))) - self.redudant
     print(index_set)
     # NOTE(review): count and pick_index are assigned but never read.
     count = 0
     while index_set:
         pick_index = 0
         candidates = []
         # Only the first four indices of the (unordered) set are considered
         # per round.
         for index in list(index_set)[:4]:
             r, _, _ = self.rule_perf[index]
             tmp_r = acculated_r | r
             precision, recall = xgbtree_rule_perf(str(tmp_r), self.pddata,
                                                   self.pddata['Y'],
                                                   sample_weight)
             candidates.append((index, precision, recall, tmp_r))
         # Same sort key as for rule_perf, applied here to 4-tuples.
         candidates = sorted(candidates, key=self.sortby[self.args.sort])
         print("candidates", candidates)
         i, acculated_p, acculated_recall, tmp_r = candidates[0]
         # The chosen index is consumed whether or not the rule is kept.
         index_set.remove(i)
         r, precision, recall = self.rule_perf[i]
         # Skip rules that do not raise recall above the required ratio.
         if acculated_recall <= prev_recall * self.args.dev_ratio:
             continue
         acculated_r = acculated_r | r
         prev_p = acculated_p
         prev_recall = acculated_recall
         out = out + f"{i}:{r}, {acculated_p}, {acculated_recall},{precision},{recall}\n"
     # The string literal below is an inert expression statement holding an
     # earlier, non-greedy version of the loop; it is never executed.
     """
 for r, precision, recall in self.rule_perf:
   if not acculated_r:
     acculated_r= r
   else:
     acculated_r = acculated_r | r
   acculated_p, acculated_recall = xgbtree_rule_perf(str(acculated_r),self.pddata,self.pddata['Y'],sample_weight)
   out =out + f"{r}, {acculated_p}, {acculated_recall},{precision},{recall}\n"
 """
     print(out)
     # Report file lives next to the rule file and encodes the sort key and
     # dev_ratio in its name.
     with open(
             os.path.join(
                 os.path.dirname(self.rulef),
                 os.path.splitext(os.path.basename(self.rulef))[0] +
                 f"-{self.args.sort}-{self.args.dev_ratio}-accumulated.txt"
             ), "w") as f:
         f.write(out)
def test_compute_sample_weight_with_subsample():
    # Test compute_sample_weight with subsamples specified, for both the
    # deprecated "auto" preset and the "balanced" preset.

    def deprecated_auto(labels, *extra):
        # "auto" must emit a DeprecationWarning; the weights are returned.
        return assert_warns(DeprecationWarning, compute_sample_weight,
                            "auto", labels, *extra)

    six_ones = [1., 1., 1., 1., 1., 1.]
    six_ones_then_zero = [1., 1., 1., 1., 1., 1., 0.]

    # Balanced classes, all samples present
    y = np.asarray([1, 1, 1, 2, 2, 2])
    assert_array_almost_equal(deprecated_auto(y), six_ones)
    weights = compute_sample_weight("balanced", y, range(6))
    assert_array_almost_equal(weights, six_ones)

    # Column vector of balanced classes, all samples present
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    assert_array_almost_equal(deprecated_auto(y), six_ones)
    weights = compute_sample_weight("balanced", y, range(6))
    assert_array_almost_equal(weights, six_ones)

    # A plain subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = deprecated_auto(y, range(4))
    assert_array_almost_equal(weights, [.5, .5, .5, 1.5, 1.5, 1.5])
    weights = compute_sample_weight("balanced", y, range(4))
    assert_array_almost_equal(weights, [2. / 3, 2. / 3,
                                        2. / 3, 2., 2., 2.])

    # A bootstrap subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = deprecated_auto(y, [0, 1, 1, 2, 2, 3])
    expected_auto = np.asarray(
        [1 / 3., 1 / 3., 1 / 3., 5 / 3., 5 / 3., 5 / 3.])
    assert_array_almost_equal(weights, expected_auto)
    weights = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
    assert_array_almost_equal(weights, expected_balanced)

    # A bootstrap subsample for multi-output: per-column weights multiply.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = deprecated_auto(y, [0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(weights, expected_auto ** 2)
    weights = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(weights, expected_balanced ** 2)

    # A class missing from the subsample gets weight zero
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    assert_array_almost_equal(deprecated_auto(y, range(6)),
                              six_ones_then_zero)
    weights = compute_sample_weight("balanced", y, range(6))
    assert_array_almost_equal(weights, six_ones_then_zero)

    # A missing class for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    assert_array_almost_equal(deprecated_auto(y, range(6)),
                              six_ones_then_zero)
    weights = compute_sample_weight("balanced", y, range(6))
    assert_array_almost_equal(weights, six_ones_then_zero)
示例#11
0
def test_compute_sample_weight():
    # Test (and demo) compute_sample_weight with the "auto" preset,
    # explicit dicts and None.
    six_ones = [1., 1., 1., 1., 1., 1.]

    # Balanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2])
    assert_array_almost_equal(compute_sample_weight("auto", y), six_ones)

    # User-defined weights
    weights = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(weights, [2., 2., 2., 1., 1., 1.])

    # Column vector of balanced classes
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    assert_array_almost_equal(compute_sample_weight("auto", y), six_ones)

    # Unbalanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = compute_sample_weight("auto", y)
    expected = np.asarray([.6, .6, .6, .6, .6, .6, 1.8])
    assert_array_almost_equal(weights, expected)

    # `None` weights
    weights = compute_sample_weight(None, y)
    assert_array_almost_equal(weights, [1., 1., 1., 1., 1., 1., 1.])

    # Multi-output of balanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    assert_array_almost_equal(compute_sample_weight("auto", y), six_ones)

    # Multi-output with user-defined weights
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(weights, [2., 2., 2., 2., 2., 2.])

    # Multi-output of unbalanced classes: per-column weights multiply.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    weights = compute_sample_weight("auto", y)
    assert_array_almost_equal(weights, expected ** 2)
    def test_use_sample_weights(self):
        """Fitting with sample weights must not lower the weighted micro-F1.

        The data set is made imbalanced by dropping all but the last three
        class-0 rows; both models are then scored on the training data
        using the same balanced weights (first sample zeroed).
        """
        x, y = self.multinomial[1]

        # Drop every class-0 row except the last three.
        class_0_rows = np.nonzero(y == 0)[0]
        keep_mask = np.ones(len(y), dtype=bool)
        keep_mask[class_0_rows[:-3]] = False
        y = y[keep_mask]
        x = x[keep_mask, :]

        sample_weight = class_weight.compute_sample_weight('balanced', y)
        sample_weight[0] = 0.

        def weighted_micro_f1(model):
            return f1_score(y, model.predict(x), sample_weight=sample_weight,
                            average='micro')

        unweighted = LogitNet(random_state=2, scoring='f1_micro').fit(x, y)
        unweighted_acc = weighted_micro_f1(unweighted)

        weighted = LogitNet(random_state=2, scoring='f1_micro').fit(
            x, y, sample_weight=sample_weight)
        weighted_acc = weighted_micro_f1(weighted)

        self.assertTrue(weighted_acc >= unweighted_acc)
示例#13
0
def test_compute_sample_weight_with_subsample():
    """Check compute_sample_weight("auto", ...) when indices are supplied."""
    six_ones = [1., 1., 1., 1., 1., 1.]
    six_ones_then_zero = [1., 1., 1., 1., 1., 1., 0.]

    # Balanced classes, all samples present
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", y, range(6))
    assert_array_almost_equal(weights, six_ones)

    # Column vector of balanced classes, all samples present
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = compute_sample_weight("auto", y, range(6))
    assert_array_almost_equal(weights, six_ones)

    # A plain subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", y, range(4))
    assert_array_almost_equal(weights, [.5, .5, .5, 1.5, 1.5, 1.5])

    # A bootstrap subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("auto", y, [0, 1, 1, 2, 2, 3])
    expected = np.asarray([1/3., 1/3., 1/3., 5/3., 5/3., 5/3.])
    assert_array_almost_equal(weights, expected)

    # A bootstrap subsample for multi-output: per-column weights multiply.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight("auto", y, [0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(weights, expected ** 2)

    # A class missing from the subsample gets weight zero
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = compute_sample_weight("auto", y, range(6))
    assert_array_almost_equal(weights, six_ones_then_zero)

    # A missing class for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    weights = compute_sample_weight("auto", y, range(6))
    assert_array_almost_equal(weights, six_ones_then_zero)
示例#14
0
 def __call__(self, class_ratio, fv_train, class_train):
     """Train and return a randomly seeded gradient-boosting classifier.

     Class 0 samples get weight 1 and class 1 samples get weight
     ``self.base_class_ratio / class_ratio``.
     """
     seed = random.randint(0,2**31)
     model = ensemble.GradientBoostingClassifier(random_state=seed,
                                                 **self.cfier_params)
     per_class = {0: 1, 1: self.base_class_ratio/class_ratio}
     train_weights = compute_sample_weight(per_class, class_train)
     model.fit(fv_train, class_train, sample_weight=train_weights)
     return model
def test_compute_sample_weight_more_than_32():
    # Non-regression smoke test for #12146: more than 32 distinct classes
    # combined with subsampling indices.
    n_classes = 50
    y = np.arange(n_classes)
    indices = np.arange(n_classes)
    expected = np.ones(y.shape[0])
    assert_array_almost_equal(
        compute_sample_weight('balanced', y, indices=indices), expected)
示例#16
0
def resample_data(arr, weights, sample_fraction=1.0, rng_seed=None):
    """Draw resampled indices of ``arr`` with probabilities set by ``weights``.

    This can be used to resample unbalanced datasets (either by label, or
    by some external grouping). It wraps
    ``sklearn.utils.class_weight.compute_sample_weight``: the per-sample
    weights are normalised to sum to one and used as selection
    probabilities for sampling with replacement.

    Parameters
    ----------
    arr : array-like, shape = [n_samples] or [n_samples, outputs]
        Array of labels/class/group to balance.

    weights : dict, list of dicts, "balanced", or None
        Weights associated with `arr` in the form ``{label: weight}``, where
        the keys `label` are the unique values present in arr and weights
        are the fraction of which to sample. When a dict is given its keys
        must match the unique values of `arr` exactly and its values must
        sum to 1. For multi-output problems, a list of dicts can be
        provided in the same order as the columns of arr.

        The "balanced" mode uses the values of arr to automatically adjust
        weights inversely proportional to class frequencies in the data.

        For multi-output, the weights of each column will be multiplied.

    sample_fraction : float, default=1.0
        Fraction of len(arr) to return when resampling. For example,
        ``sample_fraction=2.0`` returns ``2 * len(arr)`` indices.

    rng_seed : int, default=None
        Seed of the ``RandomState`` used to select indices. When None the
        global ``np.random`` state is used.

    Returns
    -------
    idxs : ndarray, shape = (int(sample_fraction * len(arr)),)
        Resampled indices of arr according to weights.

    Raises
    ------
    RuntimeError
        If `weights` is a dict whose keys differ from the unique values of
        `arr`, or whose values do not sum to 1.
    """
    # Use a private RandomState only when a seed is supplied.
    if rng_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(seed=rng_seed)

    # Validate a dictionary specification before computing anything.
    if isinstance(weights, dict):
        uniques = np.unique(arr)
        # dict views must be materialised before numpy can sort/sum them.
        lbl_weights = np.sort(list(weights.keys()))
        # array_equal also copes with differing lengths, where an
        # element-wise comparison would fail to broadcast.
        if not np.array_equal(lbl_weights, uniques):
            raise RuntimeError("Unique labels of `arr` are not all contained "
                               "within keys of `weights`. Found \n"
                               "unique(arr) = %s\n"
                               "weights.keys() = %s"
                               % (list(uniques), list(lbl_weights)))

        # The probabilities must add to 1, up to floating-point rounding.
        weight_vals = float(np.sum(list(weights.values())))
        if not np.isclose(weight_vals, 1.0):
            raise RuntimeError("Weight values (probabilities) do not add to "
                               "1.0. Found sum(weight.values()) = %0.5f"
                               % weight_vals)

    # Per-sample weights, normalised into a probability vector.
    sample_weights = compute_sample_weight(weights, arr)
    sample_weights = sample_weights / float(np.sum(sample_weights))

    # Sample indices with replacement according to those probabilities.
    idxs = np.arange(len(arr))
    n_data = int(sample_fraction * len(arr))
    return rng.choice(idxs, size=n_data, replace=True, p=sample_weights)
 def __call__(self, class_ratio, fv_train, class_train):
     """Train and return a gradient-boosting classifier.

     Class 0 samples get weight 1 and class 1 samples get weight
     ``self.base_class_ratio / class_ratio``.
     """
     model = ensemble.GradientBoostingClassifier(**self.cfier_params)
     per_class = {0: 1, 1: self.base_class_ratio/class_ratio}
     train_weights = compute_sample_weight(per_class, class_train)
     model.fit(fv_train, class_train, sample_weight=train_weights)
     return model