def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing with two main classification losses.
    Also testing copying.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # checking that predict proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)), \
            'copied classifier is different'
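# NOTE (hedged): the tests and classes in this file rely on a shared set of helpers. A sketch of the
# imports they appear to assume is given below; the hep_ml module paths are my assumption based on that
# package's layout and are not confirmed by this file.
import copy

import numpy
import pandas
import scipy.special
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import roc_auc_score

from hep_ml import losses                                  # assumed path
from hep_ml.commonutils import generate_sample             # assumed path
from hep_ml.gradientboosting import UGradientBoostingClassifier, UGradientBoostingRegressor  # assumed path
from hep_ml.losses import (AdaLossFunction, BinFlatnessLossFunction, CompositeLossFunction,  # assumed path
                           KnnAdaLossFunction, KnnFlatnessLossFunction, LogLossFunction)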
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions
    """
    # Generating some samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']
    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
class Logistic_My:
    def __init__(self, regularization, n_iterations=10):
        self.regularization = regularization
        self.n_iterations = n_iterations

    def fit(self, X, y):
        X = numpy.array(X)
        self.loss = LogLossFunction()
        self.loss.fit(X, y, sample_weight=y * 0 + 1)
        max_cats = numpy.max(X) + 1
        self.cat_biases = numpy.zeros([max_cats, X.shape[1]], dtype='float')
        predictions = numpy.zeros(len(X))
        for stage in range(self.n_iterations):
            for column in range(X.shape[1]):
                grads = self.loss.negative_gradient(predictions)
                hesss = self.loss.hessian(predictions)
                inds = X[:, column]
                nominator = (numpy.bincount(inds, weights=grads, minlength=max_cats)
                             - self.regularization * self.cat_biases[:, column])
                denominator = numpy.bincount(inds, weights=hesss, minlength=max_cats) + self.regularization
                predictions -= self.cat_biases[inds, column]
                self.cat_biases[:, column] += nominator / denominator
                predictions += self.cat_biases[inds, column]
            print(stage, self.loss(predictions))
        return self

    def predict_proba(self, X):
        X = numpy.array(X)
        predictions = numpy.zeros(len(X))
        for column in range(X.shape[1]):
            predictions += self.cat_biases[X[:, column], column]
        return predictions

    def predict_train(self, X):
        X = numpy.array(X)
        predictions = self.predict_proba(X)
        grads = self.loss.negative_gradient(predictions)
        hesss = self.loss.hessian(predictions)
        prediction_shift = numpy.zeros(len(X))
        for column in range(X.shape[1]):
            inds = X[:, column]
            cum_grads = numpy.bincount(inds, weights=grads)[inds]
            cum_hess = numpy.bincount(inds, weights=hesss)[inds] + self.regularization
            prediction_shift += -grads / cum_hess
        return predictions + prediction_shift
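# Hedged aside (illustrative helper, not part of Logistic_My): the inner loop of Logistic_My.fit is a
# regularized one-dimensional Newton step per category code, aggregated with numpy.bincount. The helper
# below re-derives that step for a plain logistic loss; all names here are illustrative.
import numpy


def _category_newton_step(codes, predictions, y, regularization=10.0):
    """One regularized Newton update of per-category biases under a logistic loss.

    codes       -- integer category code of every sample (one column of X)
    predictions -- current additive score of every sample
    y           -- binary labels in {0, 1}
    """
    p = 1.0 / (1.0 + numpy.exp(-predictions))   # sigmoid of the current score
    grads = y - p                               # negative gradient of the log-loss
    hesss = p * (1.0 - p)                       # hessian of the log-loss
    n_cats = codes.max() + 1
    numerator = numpy.bincount(codes, weights=grads, minlength=n_cats)
    denominator = numpy.bincount(codes, weights=hesss, minlength=n_cats) + regularization
    return numerator / denominator              # additive update of each category bias


# toy sanity check of the helper
assert _category_newton_step(numpy.array([0, 0, 1, 1, 2]), numpy.zeros(5), numpy.array([1, 1, 0, 0, 1])).shape == (3,)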
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions
    """
    # Generating some samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']
    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)
def fit(self, X_cat, y):
    X_cat = numpy.array(X_cat)
    self.unit_weights = numpy.random.normal(size=self.n_units)
    self.cat_weights = []
    for column in X_cat.T:
        self.cat_weights.append(numpy.random.normal(size=[numpy.max(column) + 1, self.n_units]) * 0.1)

    loss = LogLossFunction()
    loss.fit(X_cat, y, y * 0 + 1.)
    unit_predictions, predictions = self.compute_all(X_cat)

    # Training process
    for iteration in range(self.n_iterations):
        new_unit_predictions, new_predictions = self.compute_all(X_cat)
        assert numpy.allclose(predictions, new_predictions)
        predictions = new_predictions
        assert numpy.allclose(unit_predictions, new_unit_predictions)
        unit_predictions = new_unit_predictions

        for unit in range(self.n_units):
            # updating coefficient for unit
            for updated_unit in [unit]:
                grads = loss.negative_gradient(predictions)
                hesss = loss.hessian(predictions)
                unit_outputs = self.activation(unit_predictions[:, updated_unit])
                nom = numpy.dot(grads, unit_outputs)
                denom = numpy.dot(hesss, unit_outputs ** 2) + self.regularization
                step = 0.5 * nom / denom
                self.unit_weights[updated_unit] += step
                predictions += step * unit_outputs

            for column in range(X_cat.shape[1]):
                inds = X_cat[:, column]
                # updating with respect to column and unit
                unit_outputs, unit_derivs, unit_hesss = self.act_grad_hess(unit_predictions[:, unit])
                unit_weight = self.unit_weights[unit]
                grads = loss.negative_gradient(predictions) * unit_weight
                hesss = loss.hessian(predictions) * unit_weight ** 2
                cat_grads = grads * unit_derivs
                cat_hesss = hesss * (unit_derivs ** 2) + grads * unit_hesss

                max_cats = self.cat_weights[column].shape[0]
                nominator = numpy.bincount(inds, weights=cat_grads, minlength=max_cats)
                nominator -= self.regularization * self.cat_weights[column][:, unit]
                cat_steps = nominator / (numpy.bincount(inds, weights=cat_hesss.clip(0), minlength=max_cats)
                                         + self.regularization)
                cat_steps *= 1.5
                self.cat_weights[column][:, unit] += cat_steps

                predictions -= self.unit_weights[unit] * unit_outputs
                unit_predictions[:, unit] += cat_steps[inds]
                unit_outputs = self.activation(unit_predictions[:, unit])
                predictions += self.unit_weights[unit] * unit_outputs

                print(iteration, unit, column, loss(predictions))
    return self
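# Hedged aside (illustrative, not code from the method above): the cat_grads / cat_hesss lines push the
# loss derivatives through the unit activation with the chain rule. For g(z) = L(a(z)) the textbook
# identities are g'(z) = L'(a) * a'(z) and g''(z) = L''(a) * a'(z)**2 + L'(a) * a''(z); note the method
# above works with the *negative* gradient, so its signs differ from these. A small numerical check with
# a sigmoid activation and a squared-error loss (both stand-ins, not the class's own choices):
import numpy


def _act(z):
    return 1.0 / (1.0 + numpy.exp(-z))      # sigmoid stand-in for self.activation


def _loss_of(u, target=1.0):
    return 0.5 * (u - target) ** 2          # squared-error stand-in for the boosting loss


_z, _eps = 0.3, 1e-4
_a = _act(_z)
_da = _a * (1 - _a)                         # a'(z)
_d2a = _da * (1 - 2 * _a)                   # a''(z)
_dL, _d2L = _a - 1.0, 1.0                   # L'(a) and L''(a) of the squared-error stand-in
_grad = _dL * _da                           # chain rule, first derivative
_hess = _d2L * _da ** 2 + _dL * _d2a        # chain rule, second derivative
_num_grad = (_loss_of(_act(_z + _eps)) - _loss_of(_act(_z - _eps))) / (2 * _eps)
_num_hess = (_loss_of(_act(_z + _eps)) - 2 * _loss_of(_act(_z)) + _loss_of(_act(_z - _eps))) / _eps ** 2
assert numpy.isclose(_grad, _num_grad) and numpy.isclose(_hess, _num_hess, atol=1e-5)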
class StallsFM(BaseEstimator, ClassifierMixin):
    def __init__(self, learning_rate=1.0, regularization=100., n_units=10, iterations=30,
                 n_thresholds=10, max_overlap=20, sign=+1):
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.bias_regularization = regularization * 0.1
        self.n_units = n_units
        self.iterations = iterations
        self.n_thresholds = n_thresholds
        self.loss = LogLossFunction()
        self.max_overlap = max_overlap
        self.sign = sign
        self.unit_signs = numpy.ones(n_units) * sign

    def decompose_data(self, X, fit=False):
        # hack to support both pandas and numpy.arrays
        X = pandas.DataFrame(X)
        if fit:
            self.is_sequential = numpy.array([column.dtype == 'float' for name, column in X.iteritems()])
            self.codings = []
            self.codings.append([0])
            for name, column in X.iteritems():
                if column.dtype == 'float':
                    self.codings.append(
                        numpy.percentile(column, numpy.linspace(0, 100, self.n_thresholds + 1)[1:-1]))
                else:
                    self.codings.append(numpy.unique(column))

        X_categoricals = []
        X_categoricals.append(numpy.zeros(len(X), dtype=int))
        for is_seq, coding, (name, column) in zip(self.is_sequential, self.codings[1:], X.iteritems()):
            if is_seq:
                X_categoricals.append(numpy.searchsorted(coding, column))
            else:
                X_categoricals.append(
                    (numpy.searchsorted(coding, column) + 1) * numpy.in1d(column, coding))
        return numpy.array(X_categoricals).T

    def compute_grad_hess(self, predictions):
        return self.loss.negative_gradient(predictions), self.loss.hessian(predictions)

    def fit(self, X, y):
        self.classes_, y = numpy.unique(y, return_inverse=True)
        assert len(self.classes_) == 2, 'only two classes supported'
        X_cat = self.decompose_data(X, fit=True)
        self.cat_biases = [numpy.zeros(len(coding) + 1) for coding in self.codings]
        self.cat_representations = [
            numpy.random.normal(size=[len(coding) + 1, self.n_units]) * 0.1 for coding in self.codings
        ]
        self.connections = numpy.zeros([X_cat.shape[1], self.n_units])
        max_overlap = min(self.max_overlap, X_cat.shape[1])
        self.connections[:] = generate_connections(X_cat.shape[1], self.n_units, n_overlap=max_overlap)
        return self.partial_fit(X, y, restart=True)

    def partial_fit(self, X, y, restart=False):
        assert isinstance(X, pandas.DataFrame), 'only pandas.DataFrames are accepted'
        assert numpy.in1d(y, self.classes_).all()
        y = numpy.searchsorted(self.classes_, y)
        assert len(X) == len(y)
        self.loss.fit(X, y, sample_weight=numpy.ones_like(y))
        X_cat = self.decompose_data(X, fit=False)
        unit_signs = self.unit_signs
        self.losses = []
        for iteration in range(self.iterations):
            if iteration % 1 == 0:
                biases, representations, representations_sq = self.compute_representations(X_cat)
                new_predictions = self.compute_prediction(biases, representations, representations_sq,
                                                          unit_signs)
                if iteration > 0:
                    assert numpy.allclose(predictions, new_predictions)
                predictions = new_predictions

            for category_biases, category_representations, column, connection in \
                    zip(self.cat_biases, self.cat_representations, X_cat.T, self.connections):
                # fitting biases with exact step
                minlen = len(category_biases)
                grads, hesss = self.compute_grad_hess(predictions)
                total_grads = numpy.bincount(column, weights=grads, minlength=minlen)
                total_hesss = numpy.bincount(column, weights=hesss, minlength=minlen)
                updates = (total_grads - self.bias_regularization * category_biases) / \
                          (total_hesss + self.bias_regularization)
                category_biases[:] += updates
                biases += updates[column]
                predictions += updates[column]

                for unit in numpy.arange(self.n_units):
                    unit_sign = unit_signs[unit]
                    if unit_sign == 0 or connection[unit] == 0:
                        continue
                    grads, hesss = self.compute_grad_hess(predictions)
                    predictions -= unit_sign * representations[:, unit] ** 2
                    predictions += unit_sign * category_representations[column, unit] ** 2
                    representations[:, unit] -= category_representations[column, unit]

                    total_grads = numpy.bincount(column,
                                                 weights=(2 * unit_sign) * representations[:, unit] * grads,
                                                 minlength=minlen)
                    total_hesss = numpy.bincount(column,
                                                 weights=4 * representations[:, unit] ** 2 * hesss,
                                                 minlength=minlen)
                    nominator = total_grads - self.regularization * category_representations[:, unit]
                    denominator = total_hesss + self.regularization
                    # TODO iterative update here with penalty for is_seq
                    unit_update = self.learning_rate * nominator / denominator
                    category_representations[:, unit] += unit_update
                    category_representations[:, unit] = numpy.clip(category_representations[:, unit], -1, 1)

                    representations[:, unit] += category_representations[column, unit]
                    predictions += unit_sign * representations[:, unit] ** 2
                    predictions -= unit_sign * category_representations[column, unit] ** 2

            self.losses.append(self.loss(predictions))
            print(iteration, self.losses[-1])
        return self

    def compute_prediction(self, biases, representations, representations_sq, unit_signs):
        return biases + (representations ** 2).dot(unit_signs) - representations_sq.dot(unit_signs)

    def compute_representations(self, X_cat):
        biases = numpy.zeros(len(X_cat), dtype='float')
        representations = numpy.zeros([len(X_cat), self.n_units], dtype='float')
        representations_sq = numpy.zeros([len(X_cat), self.n_units], dtype='float')
        for cat_biases, cat_representations, column, connection in \
                zip(self.cat_biases, self.cat_representations, X_cat.T, self.connections):
            biases += cat_biases[column]
            representations += cat_representations[column] * connection[None, :]
            representations_sq += (cat_representations ** 2)[column] * connection[None, :]
        return biases, representations, representations_sq

    def decision_function(self, X):
        X_cat = self.decompose_data(X, fit=False)
        biases, representations, representations_sq = self.compute_representations(X_cat)
        return self.compute_prediction(biases, representations, representations_sq, self.unit_signs)

    def predict_proba(self, X):
        result = numpy.zeros([len(X), 2])
        result[:, 1] = scipy.special.expit(self.decision_function(X))
        result[:, 0] = 1 - result[:, 1]
        return result

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
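# Hedged aside (illustrative): compute_prediction above uses the standard factorization-machine identity:
# per unit, (sum_f v_f)**2 - sum_f v_f**2 equals twice the sum of all pairwise products v_f * v_g (f < g),
# so the quadratic interaction term is evaluated in linear time. A quick numpy check of that identity:
import numpy

_v = numpy.array([0.3, -0.2, 0.5, 0.1])    # illustrative per-feature factors of a single unit
_pairwise = sum(_v[i] * _v[j] for i in range(len(_v)) for j in range(i + 1, len(_v)))
assert numpy.isclose(_v.sum() ** 2 - (_v ** 2).sum(), 2 * _pairwise)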