Example #1
def test_balance_weights():
    weights = balance_weights([0, 0, 1, 1])
    assert_array_equal(weights, [1., 1., 1., 1.])

    weights = balance_weights([0, 1, 1, 1, 1])
    assert_array_equal(weights, [1., 0.25, 0.25, 0.25, 0.25])

    weights = balance_weights([0, 0])
    assert_array_equal(weights, [1., 1.])
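The assertions above pin down the contract of balance_weights: each sample is weighted inversely to the size of its class, rescaled so that samples from the smallest class receive weight 1.0. A minimal sketch that satisfies these tests (the real, long-deprecated sklearn.preprocessing.balance_weights may differ in implementation details):

import numpy as np

def balance_weights_sketch(y):
    # Illustrative re-implementation: weight = min_class_count / count(own class).
    y = np.asarray(y)
    _, encoded = np.unique(y, return_inverse=True)   # map labels to 0..n_classes-1
    counts = np.bincount(encoded)                    # samples per class
    return counts.min() / counts[encoded].astype(float)

Passing such weights to fit(..., sample_weight=...) gives every class the same total weight in the training loss, which is what the remaining examples rely on.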
Example #2
def weighted_randomforest(data, targets, tree_num=TREE_NUM):
    model = RandomForestClassifier(n_estimators=tree_num,
                                   n_jobs=4,
                                   max_features=data.shape[1] // 2 + 1,
                                   verbose=0,
                                   compute_importances=True)
    model.fit(data, targets, balance_weights(targets))
    return model
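The wrapper above trains a RandomForestClassifier with the balancing weights applied at fit time. A hedged usage sketch on an imbalanced toy dataset (TREE_NUM, the toy data, and the balance_weights import are assumptions here; compute_importances and sklearn.preprocessing.balance_weights only exist in old scikit-learn releases):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import balance_weights  # removed from modern scikit-learn

TREE_NUM = 100  # assumed default for the tree_num parameter

# Imbalanced toy problem: 90 samples of class 0, 10 samples of class 1.
rng = np.random.RandomState(0)
data = rng.rand(100, 6)
targets = np.array([0] * 90 + [1] * 10)

model = weighted_randomforest(data, targets)
print(model.predict(data[:5]))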
Example #3
def test_unbalanced_iris():
    """Check class rebalancing."""
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = balance_weights(unbalanced_y)

    clf = tree.DecisionTreeClassifier()
    clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
    assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
Example #4
    def __init__(self, stories=None, df=None):
        self.stories = stories
        if self.stories is not None:  # training set
            candidates = df[['candidate', 'story_id']].to_dict('records')
            y = np.array([get_relevance(candidate, self.stories)
                          for candidate in candidates])
            self.target = pd.get_dummies(y)
            self.y = y
            self.sample_weight = balance_weights(y)
Example #5
    def _attempt(clf, X_train, y_train, X_test, y_test, weighted=True):
        weights = None
        if weighted:
            weights = balance_weights(y_train)

        clf.fit(X_train, y_train, sample_weight=weights)

        pred = clf.predict(X_test)
        print(metrics.classification_report(y_test, pred, target_names=['high', 'low']))
Example #6
def test_unbalanced_iris():
    """Check class rebalancing."""
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = balance_weights(unbalanced_y)

    for name, TreeClassifier in CLF_TREES.items():
        clf = TreeClassifier(random_state=0)
        clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
        assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
Example #7
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func,
                                                       self.score_func,
                                                       self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(n_jobs=self.n_jobs,
                       verbose=self.verbose,
                       pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                           X, y, base_estimator, parameters, train, test,
                           self.scorer_, self.verbose, **{
                               'sample_weight': balance_weights(y[train])
                           }) for parameters in parameter_iterable
                                                  for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X,
                                   y,
                                   sample_weight=balance_weights(y),
                                   **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
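The method above is a patched copy of the old GridSearchCV._fit: balance_weights(y[train]) is injected as sample_weight for every cross-validation fold, and balance_weights(y) again for the final refit. In current scikit-learn a similar class-rebalancing effect is usually achieved without touching the search internals, for example via class_weight='balanced', which also weights classes inversely to their frequency (a sketch under that assumption, not the original author's code; the estimator and grid below are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 300]}  # illustrative grid
search = GridSearchCV(RandomForestClassifier(class_weight='balanced'),
                      param_grid, cv=5)
# search.fit(X, y)  # X, y as in the surrounding examples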
Example #8
data_y = train["target"]

train_X, val_X, train_y, val_y = train_test_split(data_X, data_y,
                                                  test_size = 0.33, random_state = 42)

test_X = test.drop(["id"], axis = 1)

# <codecell>

# rf = GridSearchCV(rfClassifier(), [{'n_estimators': [10, 50, 100, 150, 200, 300, 500]},
#                                    {'max_features': ["sqrt", "log2", None]}])
rf = rfClassifier(n_estimators = 500, max_features = 18, verbose = 1)

# <codecell>

rf_fit = rf.fit(train_X, train_y, sample_weight = balance_weights(train_y))

# <codecell>

rf_fit

# <codecell>

rf_prob = rf_fit.score(val_X, val_y)

# <codecell>

rf_prob

# <codecell>
Example #9
    def calculate_weights(self, classes):
        return balance_weights(classes)
Example #10
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(
            self.loss_func, self.score_func, self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point)(
                    X, y, base_estimator, parameters, train, test, self.scorer_,
                    self.verbose, **{'sample_weight': balance_weights(y[train])}) for parameters in
                parameter_iterable for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, sample_weight=balance_weights(y),
                                   **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self