def test_balance_weights():
    weights = balance_weights([0, 0, 1, 1])
    assert_array_equal(weights, [1., 1., 1., 1.])

    weights = balance_weights([0, 1, 1, 1, 1])
    assert_array_equal(weights, [1., 0.25, 0.25, 0.25, 0.25])

    weights = balance_weights([0, 0])
    assert_array_equal(weights, [1., 1.])

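# For reference: a minimal sketch of what balance_weights computes, written to
# be consistent with the expectations in the test above. This is an
# illustrative reimplementation (an assumption), not the scikit-learn source:
# each sample is weighted by the inverse frequency of its class, rescaled so
# that samples of the rarest class receive weight 1.0.
import numpy as np

def balance_weights_sketch(y):
    y = np.asarray(y)
    y = np.searchsorted(np.unique(y), y)   # map labels onto 0..n_classes-1
    bins = np.bincount(y)                  # per-class sample counts
    weights = 1.0 / bins.take(y)           # inverse class frequency per sample
    weights *= bins.min()                  # rarest class -> weight 1.0
    return weights

# e.g. balance_weights_sketch([0, 1, 1, 1, 1]) -> [1., 0.25, 0.25, 0.25, 0.25]
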
def weighted_randomforest(data, targets, tree_num=TREE_NUM):
    model = RandomForestClassifier(n_estimators=tree_num,
                                   n_jobs=4,
                                   max_features=data.shape[1] / 2 + 1,
                                   verbose=0,
                                   compute_importances=True)
    model.fit(data, targets, balance_weights(targets))
    return model

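# Note: compute_importances and balance_weights come from old scikit-learn
# releases; in current versions feature_importances_ is always available after
# fitting and balance_weights was replaced by compute_sample_weight. A rough
# modern equivalent of the helper above, sketched under that assumption
# (TREE_NUM is replaced by a literal default here):
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight

def weighted_randomforest_modern(data, targets, tree_num=100):
    model = RandomForestClassifier(n_estimators=tree_num,
                                   n_jobs=4,
                                   max_features=data.shape[1] // 2 + 1)
    # 'balanced' weights are proportional to the old balance_weights output
    sample_weight = compute_sample_weight(class_weight='balanced', y=targets)
    model.fit(data, targets, sample_weight=sample_weight)
    return model
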
def test_unbalanced_iris():
    """Check class rebalancing."""
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = balance_weights(unbalanced_y)

    clf = tree.DecisionTreeClassifier()
    clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
    assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)

def __init__(self, stories=None, df=None):
    self.stories = stories
    if self.stories is not None:
        # training set
        candidates = df[['candidate', 'story_id']].to_dict('records')
        y = np.array([get_relevance(candidate, self.stories)
                      for candidate in candidates])
        self.target = pd.get_dummies(y)
        self.y = y
        self.sample_weight = balance_weights(y)

def _attempt(clf, X_train, y_train, X_test, y_test, weighted=True):
    weights = None
    if weighted:
        weights = balance_weights(y_train)
    clf.fit(X_train, y_train, sample_weight=weights)
    pred = clf.predict(X_test)
    print(metrics.classification_report(y_test, pred,
                                        target_names=['high', 'low']))

def test_unbalanced_iris():
    """Check class rebalancing."""
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = balance_weights(unbalanced_y)

    for name, TreeClassifier in CLF_TREES.items():
        clf = TreeClassifier(random_state=0)
        clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
        assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)

def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""

    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
    self.scorer_ = _deprecate_loss_and_score_funcs(
        self.loss_func, self.score_func, self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point)(
                X, y, base_estimator, parameters, train, test,
                self.scorer_, self.verbose,
                **{'sample_weight': balance_weights(y[train])})
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, sample_weight=balance_weights(y),
                               **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self

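# The _fit above overrides an old GridSearchCV's internals solely to pass
# balance_weights(y[train]) to every cross-validation fold. With a recent
# scikit-learn the usual way to get the same effect, without a custom search
# class, is to let the estimator rebalance classes itself on each fold's fit.
# A minimal sketch under that assumption (param_grid values are illustrative):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 300], 'max_features': ['sqrt', None]}
search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),  # reweights per fit
    param_grid,
    cv=5)
# search.fit(X, y)  # X, y as in the surrounding examples
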
data_y = train["target"] train_X, val_X, train_y, val_y = train_test_split(data_X, data_y, test_size = 0.33, random_state = 42) test_X = test.drop(["id"], axis = 1) # <codecell> #rf = GridSearchCV(rfClassifier(), [{'n_estimators': [10, 50, 100, 150, 200, 300, 500]}, #{'max_features': ["sqrt", "log2", None]}]) rf = rfClassifier(n_estimators = 500, max_features = 18, verbose = 1) # <codecell> rf_fit = rf.fit(train_X, train_y, sample_weight = balance_weights(train_y)) # <codecell> rf_fit # <codecell> rf_prob = rf_fit.score(val_X, val_y) # <codecell> rf_prob # <codecell>
def calculate_weights(self, classes):
    return balance_weights(classes)
