def reduce_data(self, X, y):
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    X, y = check_arrays(X, y, sparse_format="csr")

    classes = np.unique(y)
    self.classes_ = classes

    if self.n_neighbors >= len(X):
        self.X_ = np.array(X)
        self.y_ = np.array(y)
        self.reduction_ = 0.0
        return self.X_, self.y_

    mask = np.zeros(y.size, dtype=bool)
    tmp_m = np.ones(y.size, dtype=bool)
    for i in xrange(y.size):
        # leave-one-out editing: temporarily drop sample i, fit on the rest,
        # and keep i only if the classifier still predicts its label
        tmp_m[i] = not tmp_m[i]
        self.classifier.fit(X[tmp_m], y[tmp_m])
        sample, label = X[i], y[i]
        if self.classifier.predict(sample) == [label]:
            mask[i] = not mask[i]
        tmp_m[i] = not tmp_m[i]

    self.X_ = np.asarray(X[mask])
    self.y_ = np.asarray(y[mask])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    y_true, y_score = check_arrays(y_true, y_score)
    n_samples, n_labels = y_true.shape
    score = np.empty((n_samples, ))
    for i in range(n_samples):
        # The best rank corresponds to 1. Ranks higher than 1 are worse.
        # The best inverse ranking corresponds to n_labels.
        unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
        n_ranks = unique_rank.size
        rank = n_ranks - inv_rank

        # Ranks need to be corrected to take ties into account:
        # e.g. two labels tied for rank 1 both get rank 2.
        corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
        rank = corr_rank[rank]

        relevant = y_true[i].nonzero()[0]
        if relevant.size == 0 or relevant.size == n_labels:
            score[i] = 1
            continue

        score[i] = 0.
        for label in relevant:
            # Count the number of relevant labels with better (smaller) rank.
            n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)
            # Weight by the rank of the actual label.
            score[i] += n_ranked_above / rank[label]
        score[i] /= relevant.size

    return score.mean()
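# Sanity-check sketch (not from the original source): on a toy problem _my_lrap
# should agree with scikit-learn's reference implementation. Assumes the
# deprecated check_arrays helper used above is importable (sklearn < 0.15).
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
assert np.isclose(_my_lrap(y_true, y_score),
                  label_ranking_average_precision_score(y_true, y_score))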
def calc_hist_with_errors(x, weight=None, bins=60, normed=True, x_range=None, ignored_sideband=0.0):
    """
    Calculate data for an error-bar plot (for plotting a pdf with errors)

    :param x: data
    :type x: list or numpy.array
    :param weight: weights
    :type weight: None or list or numpy.array

    :return: tuple (x-points (list), y-points (list), y-point errors (list), x-point errors (list))
    """
    weight = numpy.ones(len(x)) if weight is None else weight
    x, weight = check_arrays(x, weight)

    if x_range is None:
        x_range = numpy.percentile(x, [100 * ignored_sideband, 100 * (1 - ignored_sideband)])

    ans, bins = numpy.histogram(x, bins=bins, normed=normed, weights=weight, range=x_range)
    yerr = []
    normalization = 1.0
    if normed:
        normalization = float(len(bins) - 1) / float(sum(weight)) / (x_range[1] - x_range[0])
    for i in range(len(bins) - 1):
        weight_bin = weight[(x > bins[i]) * (x <= bins[i + 1])]
        yerr.append(numpy.sqrt(sum(weight_bin * weight_bin)) * normalization)

    bins_mean = [0.5 * (bins[i] + bins[i + 1]) for i in range(len(ans))]
    xerr = [0.5 * (bins[i + 1] - bins[i]) for i in range(len(ans))]
    return bins_mean, ans, yerr, xerr
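# Hypothetical usage sketch for calc_hist_with_errors: the returned tuple maps
# directly onto matplotlib's errorbar arguments. The toy data is illustrative.
import numpy
import matplotlib.pyplot as plt

data = numpy.random.normal(size=10000)
x, y, yerr, xerr = calc_hist_with_errors(data, bins=40, ignored_sideband=0.01)
plt.errorbar(x, y, yerr=yerr, xerr=xerr, fmt='none')
plt.show()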
def reduce_data(self, X, y):
    X, y = check_arrays(X, y, sparse_format="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

    prots_s = []
    labels_s = []

    classes = np.unique(y)
    self.classes_ = classes

    for cur_class in classes:
        mask = y == cur_class
        insts = X[mask]
        prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
        labels_s = labels_s + [cur_class]

    self.classifier.fit(prots_s, labels_s)
    for sample, label in zip(X, y):
        if self.classifier.predict(sample) != [label]:
            prots_s = prots_s + [sample]
            labels_s = labels_s + [label]
            self.classifier.fit(prots_s, labels_s)

    self.X_ = np.asarray(prots_s)
    self.y_ = np.asarray(labels_s)
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
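# Usage sketch for the instance-reduction methods in this file, assuming the
# method above belongs to a condensed-nearest-neighbour-style class exposed as
# CNN (the class name and constructor signature are assumptions):
import numpy as np

X = np.random.randn(200, 2)
y = (X[:, 0] > 0).astype(int)
reducer = CNN(n_neighbors=1)            # assumed constructor
X_red, y_red = reducer.reduce_data(X, y)
print(reducer.reduction_)               # fraction of samples discarded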
def reduce_data(self, X, y):
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(
            n_neighbors=self.n_neighbors, algorithm='brute')
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    X, y = check_arrays(X, y, sparse_format="csr")

    classes = np.unique(y)
    self.classes_ = classes

    self.classifier.fit(X, y)
    nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
    nn_idx = nn_idx.T[1]

    # a pair (i, j) is marked when i and j are each other's nearest neighbour
    # and carry different labels; such samples are dropped
    mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in xrange(nn_idx.shape[0])]
    mask = ~np.asarray(mask)
    if self.keep_class is not None and self.keep_class in self.classes_:
        mask[y == self.keep_class] = True

    self.X_ = np.asarray(X[mask])
    self.y_ = np.asarray(y[mask])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def transform(self, X, y=None):
    """Project the data by using matrix product with the random matrix

    Parameters
    ----------
    X : numpy array or scipy.sparse of shape [n_samples, n_features]
        The input data to project into a smaller dimensional space.

    y : is not used: placeholder to allow for usage in a Pipeline.

    Returns
    -------
    X_new : numpy array or scipy sparse of shape [n_samples, n_components]
        Projected array.
    """
    X, y = check_arrays(X, y)

    if self.components_ is None:
        raise ValueError('No random projection matrix had been fit.')

    if X.shape[1] != self.components_.shape[1]:
        raise ValueError(
            'Impossible to perform projection: '
            'X at fit stage had a different number of features. '
            '(%s != %s)' % (X.shape[1], self.components_.shape[1]))

    if not sp.issparse(X):
        X = np.atleast_2d(X)

    X_new = safe_sparse_dot(X, self.components_.T,
                            dense_output=self.dense_output)
    return X_new
def reduce_data(self, X, y):
    X, y = check_arrays(X, y, sparse_format="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    classes = np.unique(y)
    self.classes_ = classes

    # loading initial groups
    self.groups = []
    for label in classes:
        mask = y == label
        self.groups = self.groups + [_Group(X[mask], label)]

    self._main_loop()
    self._generalization_step()
    self._merge()
    self._pruning()

    self.X_ = np.asarray([g.rep_x for g in self.groups])
    self.y_ = np.asarray([g.label for g in self.groups])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def fit(self, X, y, sample_weight=None):
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense",
                        check_ccontiguous=True)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight)
    self.random_state = check_random_state(self.random_state)

    self.estimators = []
    score = numpy.zeros(len(X), dtype=float)
    y_signed = 2 * y - 1
    self.w_sig = []
    self.w_bck = []

    for _ in range(self.n_estimators):
        residual = y_signed  # numpy.exp(- y_signed * score)
        # residual[y > 0.5] /= numpy.mean(residual[y > 0.5])
        # residual[y < 0.5] /= -numpy.mean(residual[y < 0.5])

        trainX, testX, trainY, testY, trainW, testW, trainR, testR, trainS, testS = \
            train_test_split(X, y, sample_weight, residual, score,
                             train_size=self.train_part, test_size=self.test_size,
                             random_state=self.random_state)

        tree = DecisionTreeRegressor(criterion=self.criterion,
                                     splitter=self.splitter,
                                     max_depth=self.max_depth,
                                     min_samples_leaf=self.min_samples_leaf,
                                     max_features=self.max_features,
                                     random_state=self.random_state)
        # fitting
        tree.fit(trainX, trainR, sample_weight=trainW, check_input=False)
        # post-pruning
        self.update_terminal_regions(tree.tree_, testX, testY, testW, testS)
        # updating score
        # score += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
    # follow the sklearn convention of returning self from fit
    return self
def reduce_data(self, X, y):
    X, y = check_arrays(X, y, sparse_format="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    classes = np.unique(y)
    self.classes_ = classes

    minority_class = self.pos_class
    if self.pos_class is None:
        minority_class = min(set(y), key=list(y).count)

    # loading initial groups
    self.groups = []
    for label in classes:
        mask = y == label
        self.groups = self.groups + [_Group(X[mask], label)]

    self._main_loop()
    self._generalization_step()
    min_groups = [g for g in self.groups if g.label == minority_class]
    self._merge()
    self._pruning()
    max_groups = [g for g in self.groups if g.label != minority_class]
    self.groups = min_groups + max_groups

    self.X_ = np.asarray([g.rep_x for g in self.groups])
    self.y_ = np.asarray([g.label for g in self.groups])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ The simplest way to compute Cramer-von Mises flatness; this is, however,
    very slow if you need to compute it many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    signal_mask = y == label
    groups_indices = computeSignalKnnIndices(uniform_variables=uniform_variables,
                                             dataframe=X, is_signal=signal_mask,
                                             n_neighbors=knn)
    groups_indices = groups_indices[signal_mask, :]

    return ut.group_based_cvm(proba[:, label], mask=signal_mask,
                              groups_indices=groups_indices,
                              sample_weight=sample_weight)
def plot_score_variable_correlation(y_true, y_pred, correlation_values, cuts,
                                    sample_weight=None, classifier_name="",
                                    var_name="", score_function=efficiency_score,
                                    bins_number=20):
    """
    Different score functions are available: Efficiency, Precision, Recall, F1Score,
    and other things from sklearn.metrics

    :param y_true: numpy.array of shape [n_samples], true labels
    :param y_pred: numpy.array of shape [n_samples] with float predictions
    :param correlation_values: numpy.array of shape [n_samples]; usually these are the masses of events
    :param cuts: array-like of cuts; a separate curve is plotted for each cut
    :param sample_weight: numpy.array or None, shape = [n_samples]
    :param classifier_name: str, used only in the label
    :param var_name: str, e.g. 'mass'
    :param score_function: any function with signature (y_true, y_pred, sample_weight=None)
    :param bins_number: int, the number of bins
    """
    y_true, y_pred, correlation_values = check_arrays(y_true, y_pred, correlation_values)
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)

    binner = Binner(correlation_values, n_bins=bins_number)
    bins_data = binner.split_into_bins(correlation_values, y_true, y_pred, sample_weight)
    for cut in cuts:
        x_values = []
        y_values = []
        for bin_data in bins_data:
            bin_masses, bin_y_true, bin_proba, bin_weight = bin_data
            y_values.append(score_function(bin_y_true, bin_proba[:, 1] > cut,
                                           sample_weight=bin_weight))
            x_values.append(numpy.mean(bin_masses))
        pylab.plot(x_values, y_values, '.-', label="cut = %0.3f" % cut)

    pylab.title("Correlation with results of " + classifier_name)
    pylab.xlabel(var_name)
    pylab.ylabel(score_function.__name__)
    pylab.legend(loc="lower right")
def reorder_by_first(*arrays):
    """
    Applies the same permutation to all passed arrays;
    the permutation sorts the first passed array.
    """
    arrays = check_arrays(*arrays)
    order = numpy.argsort(arrays[0])
    return [arr[order] for arr in arrays]
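# Minimal usage sketch: sort several parallel arrays by the first one.
import numpy

xs, ys = reorder_by_first(numpy.array([3, 1, 2]), numpy.array([30, 10, 20]))
# xs -> [1, 2, 3], ys -> [10, 20, 30]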
def fit(self, X, y, sample_weight=None):
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths of X and y'
    X = pandas.DataFrame(X)
    y = numpy.array(column_or_1d(y), dtype=int)
    assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
    self.check_params()

    self.estimators = []
    self.scores = []

    n_samples = len(X)
    n_inbag = int(self.subsample * len(X))
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)

    # preparing for fitting in trees
    X = self.get_train_vars(X)
    self.n_features = X.shape[1]
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    y_pred = numpy.zeros(len(X), dtype=float)

    if self.init_estimator is not None:
        y_signed = 2 * y - 1
        self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
        y_pred += numpy.ravel(self.init_estimator.predict(X))

    for stage in range(self.n_estimators):
        # tree creation
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            random_state=self.random_state,
            max_leaf_nodes=self.max_leaf_nodes)

        # tree learning
        residual = self.loss.negative_gradient(y_pred)
        train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

        tree.fit(X[train_indices], residual[train_indices],
                 sample_weight=sample_weight[train_indices], check_input=False)
        # update tree leaves
        if self.update_tree:
            self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred,
                                  sample_weight=sample_weight,
                                  update_mask=numpy.ones(len(X), dtype=bool),
                                  residual=residual)

        y_pred += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
        self.scores.append(self.loss(y_pred))
    return self
def fit(self, X):
    """Creates a biclustering for X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float64)
    check_array_ndim(X)
    self._check_parameters()
    self._fit(X)
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """ A shorter and simpler version of log_loss which supports sample_weight """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = numpy.append(1 - T, T, axis=1)

    # Clipping
    Y = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    T, Y = check_arrays(T, Y)

    # Renormalize
    Y /= Y.sum(axis=1)[:, numpy.newaxis]
    loss = -(T * numpy.log(Y) * sample_weight[:, numpy.newaxis]).sum() \
        / numpy.sum(sample_weight)
    return loss
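# Sanity-check sketch (assumes the check_arrays/check_sample_weight helpers used
# above are importable): with unit weights _log_loss should match
# sklearn.metrics.log_loss.
import numpy
from sklearn.metrics import log_loss

y = numpy.array([0, 1, 1])
p = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
assert numpy.isclose(_log_loss(y, p), log_loss(y, p))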
def plot_roc(y_true, y_pred, sample_weight=None, classifier_name="", is_cut=False, mask=None): """Plots ROC curve in the way physicists like it :param y_true: numpy.array, shape=[n_samples] :param y_pred: numpy.array, shape=[n_samples] :param sample_weight: numpy.array | None, shape = [n_samples] :param classifier_name: str, the name of classifier for label :param is_cut: predictions are binary :param mask: plot ROC curve only for events that have mask=True """ if is_cut: assert len(numpy.unique(y_pred)) == 2, 'Cut assumes that prediction are 0 and 1 (or True/False)' MAX_STEPS = 500 y_true, y_pred = check_arrays(y_true, y_pred) if mask is not None: mask = numpy.array(mask, dtype=bool) # converting to bool, just in case y_true = y_true[mask] y_pred = y_pred[mask] if sample_weight is not None: sample_weight = sample_weight[mask] fpr, tpr, thresholds = check_arrays(*roc_curve(y_true, y_pred, sample_weight=sample_weight)) roc_auc = auc(fpr, tpr) # tpr = recall = isSasS / isS = signal efficiency # fpr = isBasS / isB = 1 - specificity = 1 - backgroundRejection bg_rejection = 1. - fpr if len(fpr) > MAX_STEPS: # decreasing the number of points in plot targets = numpy.linspace(0, 1, MAX_STEPS) x_ids = numpy.searchsorted(tpr, targets) y_ids = numpy.searchsorted(fpr, targets) indices = numpy.concatenate([x_ids, y_ids, [0, len(tpr) - 1]], ) indices = numpy.unique(indices) tpr = tpr[indices] bg_rejection = bg_rejection[indices] if not is_cut: pylab.plot(tpr, bg_rejection, label='%s (area = %0.3f)' % (classifier_name, roc_auc)) else: pylab.plot(tpr[1:2], bg_rejection[1:2], 'o', label='%s' % classifier_name)
def fit(self, X, y, sample_weight=None):
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    X, y = check_arrays(X, y)
    assert X.shape[1] == 1
    X = numpy.ravel(X)
    indices = numpy.argsort(X)
    self.sorted_x = X[indices]
    self.sorted_y = y[indices]
    self.sorted_w = sample_weight[indices]
    window = numpy.hamming(2 * self.knn + 1)
    window[self.knn] = 0
    self.sig_w = numpy.convolve(self.sorted_w * self.sorted_y, window, mode='same')
    self.bck_w = numpy.convolve(self.sorted_w * (1 - self.sorted_y), window, mode='same')
    assert len(self.sig_w) == len(self.bck_w) == len(self.sorted_y) == len(X)
def plot_roc(y_true, y_pred, sample_weight=None, classifier_name=""):
    """Plots the ROC curve in the way physicists like it

    :param y_true: numpy.array, shape=[n_samples]
    :param y_pred: numpy.array, shape=[n_samples]
    :param sample_weight: numpy.array | None, shape = [n_samples]
    :param classifier_name: str, the name of the classifier for the label
    """
    y_true, y_pred = check_arrays(y_true, y_pred)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred, sample_weight=sample_weight)
    # tpr = recall = isSasS / isS = signal efficiency
    # fpr = isBasS / isB = 1 - specificity = 1 - backgroundRejection
    bg_rejection = 1. - numpy.array(fpr)
    roc_auc = auc(fpr, tpr)
    pylab.plot(tpr, bg_rejection, label='%s (area = %0.3f)' % (classifier_name, roc_auc))
def reduce_data(self, X, y): X, y = check_arrays(X, y, sparse_format="csr") classes = np.unique(y) self.classes_ = classes self.main_loop(X, y) best_index = np.argmax(self.evaluations) mask = np.asarray(self.chromosomes[best_index], dtype=bool) self.X_ = X[mask] self.y_ = y[mask] self.reduction_ = 1.0 - float(len(self.y_))/len(y) return self.X_, self.y_
def staged_predict_proba(self, X):
    results = numpy.zeros([len(X), self.n_estimators], dtype=float)
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    for i, (estimator, e_ws, e_wb) in enumerate(zip(self.estimators, self.w_sig, self.w_bck)):
        indices = estimator.predict(X).astype(int)
        assert numpy.all(indices == estimator.tree_.apply(X))
        results[:, i] = e_ws[indices] / e_wb[indices]
    for i in range(1, self.n_estimators):
        score = numpy.median(results[:, :i], axis=1)
        proba = numpy.zeros([len(X), 2], dtype=float)
        proba[:, 1] = expit(score)
        proba[:, 0] = expit(-score)
        yield proba
def squared_error(y_true, y_pred):
    """Compute the squared error.

    Parameters
    ----------
    y_true : array-like of shape = [n_samples]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples]
        Estimated target values.

    Returns
    -------
    errors : array of shape = [n_samples]
        Squared error of each sample.
    """
    y_true, y_pred = check_arrays(y_true, y_pred)
    return (y_pred - y_true) ** 2
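# Tiny usage sketch:
import numpy as np

print(squared_error(np.array([1.0, 2.0]), np.array([1.5, 1.0])))  # -> [0.25, 1.0]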
def reduce_data(self, X, y):
    X, y = check_arrays(X, y, sparse_format="csr")

    classes = np.unique(y)
    self.classes_ = classes

    enn = ENN(n_neighbors=self.n_neighbors)

    p_, l_, r_ = X, y, 1.0
    while r_ != 0:
        enn.reduce_data(p_, l_)
        p_ = enn.X_
        l_ = enn.y_
        r_ = enn.reduction_

    self.X_ = p_
    self.y_ = l_
    self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]
    return self.X_, self.y_
def predict_proba(self, X, percentile=50):
    results = numpy.zeros([len(X), self.n_estimators], dtype=float)
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    w_s = numpy.zeros(len(X))
    w_b = numpy.zeros(len(X))
    for i, (estimator, e_ws, e_wb) in enumerate(zip(self.estimators, self.w_sig, self.w_bck)):
        indices = estimator.predict(X).astype(int)
        assert numpy.all(indices == estimator.tree_.apply(X))
        results[:, i] = e_ws[indices] / e_wb[indices]
        w_s += e_ws[indices]
        w_b += e_wb[indices]
    score = numpy.percentile(results, percentile, axis=1)
    # score = w_s / (w_s + w_b + 0.01)
    proba = numpy.zeros([len(X), 2], dtype=float)
    proba[:, 1] = expit(score)
    proba[:, 0] = expit(-score)
    return proba
def gain_curve(y_true, y_score, pos_label=None):
    y_true, y_score = check_arrays(y_true, y_score)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if (pos_label is None and
        not (np.all(classes == [0, 1]) or
             np.all(classes == [-1, 1]) or
             np.all(classes == [0]) or
             np.all(classes == [-1]) or
             np.all(classes == [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # sort scores and corresponding truth values; [::-1] inverts the order
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # accumulate the true positives with decreasing threshold
    total = len(y_true)
    _, total_target = np.bincount(y_true)
    tps_cum = np.cumsum(y_true)

    percentages = np.asarray(range(10, 110, 10))
    # integer (floor) division keeps these usable as indices
    percentages_indices = np.multiply(total, percentages) // 100 - 1
    tps_cum_percentages = np.divide(
        np.multiply(100, tps_cum[percentages_indices]), total_target)

    # Add an extra threshold position if necessary
    percentages = np.r_[0, percentages]
    tps_cum_percentages = np.r_[0, tps_cum_percentages]

    # print('Indices of customers in the first percentile:')
    # for idx in range(0, percentages_indices[0]):
    #     if y_true[idx]:
    #         print(desc_score_indices[idx])

    return percentages, tps_cum_percentages
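# Illustrative sketch: cumulative gain at each decile for a toy ranking where
# the scores are already sorted in descending order.
import numpy as np

y = np.array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0])
scores = np.linspace(1.0, 0.1, 10)
percentages, gains = gain_curve(y, scores)
# percentages -> [0, 10, ..., 100]; gains reach 100 once all positives are found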
def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
    X, y = check_arrays(X, y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    if loss is None:
        loss = BinomialDevianceLossFunction()
    loss.fit(X, y, sample_weight=sample_weight)
    self.n_features = X.shape[1]
    self.coeffs = numpy.zeros([self.n_features, self.max_categories], dtype='float')
    assert numpy.max(X) < self.max_categories
    assert numpy.min(X) >= 0

    for iteration in range(iterations):
        # this line could be skipped, but recomputing the predictions
        # avoids accumulating numerical errors over many steps
        y_pred = self.decision_function(X)
        print(iteration, loss(y_pred))
        for feature in range(self.n_features):
            ngradient = loss.negative_gradient(y_pred)
            nominator = numpy.bincount(X[:, feature], weights=ngradient,
                                       minlength=self.max_categories)
            nominator -= self.l2_reg * self.coeffs[feature, :] \
                + self.l1_reg * numpy.sign(self.coeffs[feature, :])

            denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
            denominator = numpy.bincount(X[:, feature], weights=denominator,
                                         minlength=self.max_categories)
            denominator += 2 * self.l2_reg + 5

            gradients = nominator / denominator
            right_gradients = gradients
            # coefficients that are already zero should not become nonzero
            mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
            right_gradients[mask] = 0
            # coefficients that are already nonzero
            old_coeffs = self.coeffs[feature, :]
            new_coeffs = old_coeffs + self.learning_rate * right_gradients
            new_coeffs[new_coeffs * old_coeffs < 0] = 0
            y_pred += numpy.take(new_coeffs - old_coeffs, X[:, feature])
            self.coeffs[feature, :] = new_coeffs
    return self
def reduce_data(self, X, y):
    X, y = check_arrays(X, y, sparse_format="csr")

    classes = np.unique(y)
    self.classes_ = classes

    edited_nn = ENN(n_neighbors=1)

    p_, l_, r_ = X, y, 1.0
    for k in range(1, self.n_neighbors + 1):
        if l_.shape[0] > k + 1:
            edited_nn.n_neighbors = k
            edited_nn.fit(p_, l_)
            p_ = edited_nn.X_
            l_ = edited_nn.y_
            r_ = edited_nn.reduction_

    self.X_ = p_
    self.y_ = l_
    self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]
    return self.X_, self.y_
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""

    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if not self.dataset_filenames:
        self.save_dataset_filename(X, y, cv)

    dataset_filenames = self.dataset_filenames

    client = Client()
    lb_view = client.load_balanced_view()

    if self.verbose > 0:
        print("Number of CPU cores: %d" % len(client.ids))

    self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                    for dataset_filename in dataset_filenames], params)
                  for params in parameter_iterable]

    if self.sync:
        self.wait()
        self.set_grid_scores()
        self.set_best_score_params()

        if self.refit:
            self.set_best_estimator(estimator)
    return self
def calc_ROC(prediction, signal, sample_weight=None, max_points=10000):
    """
    Calculate roc curve

    :param prediction: predictions
    :type prediction: array or list
    :param signal: true labels
    :type signal: array or list
    :param sample_weight: weights
    :type sample_weight: None or array or list
    :param int max_points: maximum number of points used on the roc curve

    :return: (tpr, tnr), (err_tnr, err_tpr), thresholds
    """
    sample_weight = numpy.ones(len(signal)) if sample_weight is None else sample_weight
    prediction, signal, sample_weight = check_arrays(prediction, signal, sample_weight)

    assert set(signal) == {0, 1}, "the labels should be 0 and 1, labels are " + str(set(signal))
    fpr, tpr, thresholds = roc_curve(signal, prediction, sample_weight=sample_weight)
    tpr = numpy.insert(tpr, 0, 0.)
    fpr = numpy.insert(fpr, 0, 0.)
    thresholds = numpy.insert(thresholds, 0, thresholds[0] + 1.)
    tnr = 1 - fpr

    weight_bck = sample_weight[signal == 0]
    weight_sig = sample_weight[signal == 1]
    err_tnr = numpy.sqrt(tnr * (1 - tnr) * numpy.sum(weight_bck ** 2)) / numpy.sum(weight_bck)
    err_tpr = numpy.sqrt(tpr * (1 - tpr) * numpy.sum(weight_sig ** 2)) / numpy.sum(weight_sig)

    if len(prediction) > max_points:
        sum_weights = numpy.cumsum((fpr + tpr) / 2.)
        sum_weights /= sum_weights[-1]
        positions = numpy.searchsorted(sum_weights, numpy.linspace(0, 1, max_points))
        tpr, tnr = tpr[positions], tnr[positions]
        err_tnr, err_tpr = err_tnr[positions], err_tpr[positions]
        thresholds = thresholds[positions]
    return (tpr, tnr), (err_tnr, err_tpr), thresholds
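# Usage sketch with toy data (the error bands could then be drawn with
# matplotlib's fill_between):
import numpy

rng = numpy.random.RandomState(0)
labels = rng.randint(0, 2, size=1000)
predictions = labels + rng.normal(scale=0.8, size=1000)
(tpr, tnr), (err_tnr, err_tpr), thresholds = calc_ROC(predictions, labels)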
def get_efficiencies(prediction, spectator, sample_weight=None, bins_number=20,
                     thresholds=None, errors=False, ignored_sideband=0.0):
    """
    Construct the efficiency function dependent on the spectator for each threshold

    :param prediction: list of probabilities
    :param spectator: list of spectator's values
    :param bins_number: int, count of bins for the plot
    :param thresholds: list of prediction thresholds (default=prediction cuts for
        which the efficiency will be [0.2, 0.4, 0.5, 0.6, 0.8])

    :return: if errors=False
        OrderedDict threshold -> (x_values, y_values);
        if errors=True
        OrderedDict threshold -> (x_values, y_values, y_err, x_err).
        All the parts (x_values, y_values, y_err, x_err) are numpy.arrays of the same length.
    """
    prediction, spectator = check_arrays(prediction, spectator)

    spectator_min, spectator_max = numpy.percentile(
        spectator, [100 * ignored_sideband, 100 * (1. - ignored_sideband)])
    mask = (spectator >= spectator_min) & (spectator <= spectator_max)
    spectator = spectator[mask]
    prediction = prediction[mask]
    bins_number = min(bins_number, len(prediction))
    sample_weight = sample_weight if sample_weight is None else numpy.array(sample_weight)[mask]

    if thresholds is None:
        thresholds = [weighted_percentile(prediction, percentiles=1 - eff,
                                          sample_weight=sample_weight)
                      for eff in [0.2, 0.4, 0.5, 0.6, 0.8]]

    binner = Binner(spectator, bins_number=bins_number)
    bins_data = binner.split_into_bins(spectator, prediction)

    bin_edges = numpy.array([spectator_min] + list(binner.limits) + [spectator_max])
    xerr = numpy.diff(bin_edges) / 2.
    result = OrderedDict()
    for threshold in thresholds:
        x_values = []
        y_values = []
        for num, (masses, probabilities) in enumerate(bins_data):
            y_values.append(numpy.mean(probabilities > threshold))
            if errors:
                x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)
            else:
                x_values.append(numpy.mean(masses))
        x_values, y_values = check_arrays(x_values, y_values)
        if errors:
            result[threshold] = (x_values, y_values,
                                 numpy.sqrt(y_values * (1 - y_values) / len(y_values)), xerr)
        else:
            result[threshold] = (x_values, y_values)
    return result
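# Usage sketch (assumes the module's Binner and weighted_percentile helpers are
# importable): efficiency versus a spectator variable for several thresholds.
import numpy
import pylab

rng = numpy.random.RandomState(1)
mass = rng.uniform(0., 10., size=5000)
prediction = rng.uniform(0., 1., size=5000)
for threshold, curve in get_efficiencies(prediction, mass, errors=True).items():
    x, y, yerr, xerr = curve
    pylab.errorbar(x, y, yerr=yerr, xerr=xerr, label='cut = %.2f' % threshold)
pylab.legend()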
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""

    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
    self.scorer_ = _deprecate_loss_and_score_funcs(
        self.loss_func, self.score_func, self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point_extended)(
                X, y, base_estimator, parameters, train, test,
                self.scorer_, self.verbose, **self.fit_params)
            for parameters in parameter_iterable
            for train, test in cv)

    # out = []
    # for parameters in parameter_iterable:
    #     fold = 1
    #     for train, test in cv:
    #         print "Processing fold", fold, self.fit_params
    #         out.append(fit_grid_point_extended(X, y, base_estimator, parameters,
    #                                            train, test, self.scorer_,
    #                                            self.verbose, **self.fit_params))
    #         fold += 1

    # Out is a list of triplets: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_extras = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        all_extras = list()
        for this_score, parameters, this_n_test_samples, extra in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            all_extras.append(extra)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
        grid_extras.append(all_extras)
    # Store the computed scores
    self.grid_scores_ = grid_scores
    self.extras_ = grid_extras

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, sample_weight=None):
    """Build a decision tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples. Use ``dtype=np.float32`` for maximum
        efficiency.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).
        Use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    check_input : boolean, (default=True)
        Allow to bypass several input checking.
        Don't use this parameter unless you know what you do.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    # Deprecations
    if sample_mask is not None:
        warn("The sample_mask parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    if X_argsorted is not None:
        warn("The X_argsorted parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    # Convert data
    if check_input:
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                          check_ccontiguous=True)

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = isinstance(self, ClassifierMixin)

    y = np.atleast_1d(y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        for k in xrange(self.n_outputs_):
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])

    else:
        self.classes_ = [None] * self.n_outputs_
        self.n_classes_ = [1] * self.n_outputs_

    self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

    if isinstance(self.max_features, six.string_types):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if self.min_samples_split <= 0:
        raise ValueError("min_samples_split must be greater than zero.")
    if self.min_samples_leaf <= 0:
        raise ValueError("min_samples_leaf must be greater than zero.")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if sample_weight is not None:
        if (getattr(sample_weight, "dtype", None) != DOUBLE or
                not sample_weight.flags.contiguous):
            sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
        if len(sample_weight.shape) > 1:
            raise ValueError("Sample weights array has more "
                             "than one dimension: %d" %
                             len(sample_weight.shape))
        if len(sample_weight) != n_samples:
            raise ValueError("Number of weights=%d does not match "
                             "number of samples=%d" %
                             (len(sample_weight), n_samples))

    # Set min_samples_split sensibly
    min_samples_split = max(self.min_samples_split,
                            2 * self.min_samples_leaf)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            max_features,
                                            self.min_samples_leaf,
                                            random_state)

    self.criterion_ = criterion
    self.splitter_ = splitter
    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                      splitter, max_depth, min_samples_split,
                      self.min_samples_leaf, random_state)

    self.tree_.build(X, y, sample_weight=sample_weight)

    if self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True):
    """Build a decision tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples. Use ``dtype=np.float32``
        and ``order='F'`` for maximum efficiency.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).
        Use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency.

    sample_mask : array-like, shape = [n_samples], dtype = bool or None
        A bit mask that encodes the rows of ``X`` that should be
        used to build the decision tree. It can be used for bagging
        without the need to create a copy of ``X``.
        If None a mask will be created that includes all samples.

    X_argsorted : array-like, shape = [n_samples, n_features] or None
        Each column of ``X_argsorted`` holds the row indices of ``X``
        sorted according to the value of the corresponding feature
        in ascending order.
        I.e. ``X[X_argsorted[i, k], k] <= X[X_argsorted[j, k], k]``
        for each j > i.
        If None, ``X_argsorted`` is computed internally.
        The argument is supported to enable multiple decision trees
        to share the data structure and to avoid re-computation in
        tree ensembles. For maximum efficiency use dtype np.int32.

    Returns
    -------
    self : object
        Returns self.
    """
    if check_input:
        X, y = check_arrays(X, y)
    self.random_state = check_random_state(self.random_state)

    # set min_samples_split sensibly
    self.min_samples_split = max(self.min_samples_split,
                                 2 * self.min_samples_leaf)

    # Convert data
    if (getattr(X, "dtype", None) != DTYPE or
            X.ndim != 2 or not X.flags.fortran):
        X = array2d(X, dtype=DTYPE, order="F")

    n_samples, self.n_features_ = X.shape
    is_classification = isinstance(self, ClassifierMixin)

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    self.classes_ = []
    self.n_classes_ = []
    self.n_outputs_ = y.shape[1]

    if is_classification:
        y = np.copy(y)

        for k in xrange(self.n_outputs_):
            unique = np.unique(y[:, k])
            self.classes_.append(unique)
            self.n_classes_.append(unique.shape[0])
            y[:, k] = np.searchsorted(unique, y[:, k])

    else:
        self.classes_ = [None] * self.n_outputs_
        self.n_classes_ = [1] * self.n_outputs_

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if is_classification:
        criterion = CLASSIFICATION[self.criterion](self.n_outputs_,
                                                   self.n_classes_)
    else:
        criterion = REGRESSION[self.criterion](self.n_outputs_)

    # Check parameters
    max_depth = np.inf if self.max_depth is None else self.max_depth

    if isinstance(self.max_features, basestring):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    else:
        max_features = self.max_features

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if self.min_samples_split <= 0:
        raise ValueError("min_samples_split must be greater than zero.")
    if self.min_samples_leaf <= 0:
        raise ValueError("min_samples_leaf must be greater than zero.")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if self.min_density < 0.0 or self.min_density > 1.0:
        raise ValueError("min_density must be in [0, 1]")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if sample_mask is not None:
        sample_mask = np.asarray(sample_mask, dtype=np.bool)

        if sample_mask.shape[0] != n_samples:
            raise ValueError("Length of sample_mask=%d does not match "
                             "number of samples=%d"
                             % (sample_mask.shape[0], n_samples))

    if X_argsorted is not None:
        X_argsorted = np.asarray(X_argsorted, dtype=np.int32, order='F')
        if X_argsorted.shape != X.shape:
            raise ValueError("Shape of X_argsorted does not match "
                             "the shape of X")

    # Build tree
    self.tree_ = _tree.Tree(self.n_features_, self.n_classes_,
                            self.n_outputs_, criterion, max_depth,
                            self.min_samples_split, self.min_samples_leaf,
                            self.min_density, max_features,
                            self.find_split_, self.random_state)

    self.tree_.build(X, y,
                     sample_mask=sample_mask, X_argsorted=X_argsorted)

    if self.compute_importances:
        self.feature_importances_ = \
            self.tree_.compute_feature_importances()

    return self
def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
    """Build a boosted classifier from the training set (X, y).

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.

    y : array-like of shape = [n_samples]
        The target values (integers that correspond to classes).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights. If None, the sample weights are initialized
        to ``1 / n_samples``.

    neighbours_matrix: array-like of shape [n_samples, n_neighbours],
        each row contains indices of signal neighbours
        (neighbours should be computed for background too);
        if None, this matrix is computed.

    Returns
    -------
    self : object
        Returns self.
    """
    if self.smoothing < 0:
        raise ValueError("Smoothing must be non-negative")
    if not isinstance(self.base_estimator, BaseEstimator):
        raise TypeError("estimator must be a subclass of BaseEstimator")
    if self.n_estimators <= 0:
        raise ValueError("n_estimators must be greater than zero.")
    if self.learning_rate <= 0:
        raise ValueError("learning_rate must be greater than zero")

    # Check that the algorithm is supported
    if self.algorithm not in ('SAMME', 'SAMME.R'):
        raise ValueError("algorithm %s is not supported" % self.algorithm)
    if self.algorithm == 'SAMME.R':
        if not hasattr(self.base_estimator, 'predict_proba'):
            raise TypeError(
                "uBoostBDT with algorithm='SAMME.R' requires "
                "that the weak learner have a predict_proba method.\n"
                "Please change the base estimator or set "
                "algorithm='SAMME' instead.")

    assert np.in1d(y, [0, 1]).all(), \
        "only two-class classification is implemented"

    if neighbours_matrix is not None:
        assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
            "Wrong shape of neighbours_matrix"
        self.knn_indices = neighbours_matrix
    else:
        assert self.uniform_variables is not None, \
            "uniform_variables should be set"
        self.knn_indices = computeSignalKnnIndices(
            self.uniform_variables, X, y > 0.5, self.n_neighbors)

    if sample_weight is None:
        # Initialize weights to 1 / n_samples
        sample_weight = np.ones(len(X), dtype=np.float) / len(X)
    else:
        # Normalize existing weights
        assert np.all(sample_weight >= 0.), \
            'the weights should be non-negative'
        sample_weight /= np.sum(sample_weight)

    # Clear any previous fit results
    self.estimators_ = []
    self.estimator_weights_ = []
    # score cuts correspond to
    # global efficiency == target_efficiency on each iteration.
    self.score_cuts_ = []

    X_train_variables = self.get_train_vars(X)
    y = np.ravel(y)
    X_train_variables, y = check_arrays(X_train_variables, y,
                                        sparse_format="dense")

    # A dictionary to keep all intermediate weights, efficiencies and so on
    if self.keep_debug_info:
        self.debug_dict = defaultdict(list)

    self.random_generator = check_random_state(self.random_state)

    self._boost(X_train_variables, y, sample_weight)

    self.score_cut = compute_bdt_cut(
        self.target_efficiency, y, self.predict_score(X))
    assert abs(self.score_cut - self.score_cuts_[-1]) < 1e-4, \
        "score cut doesn't appear to coincide with the staged one"
    assert len(self.estimators_) == \
        len(self.estimator_weights_) == len(self.score_cuts_)
    return self
def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    a_rows, a_cols = check_arrays(*a)
    b_rows, b_cols = check_arrays(*b)
    return a_rows, a_cols, b_rows, b_cols
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""

    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
    self.scorer_ = _deprecate_loss_and_score_funcs(
        self.loss_func, self.score_func, self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point)(
                X, y, base_estimator, parameters, train, test,
                self.scorer_, self.verbose,
                **{'sample_weight': balance_weights(y[train])})
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of triplets: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, sample_weight=balance_weights(y),
                               **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _prepare_data_for_fitting(self, X, y, sample_weight):
    """By default the same format is used as for trees."""
    X = self.get_train_vars(X)
    X, y = check_arrays(X, y, dtype=self.dtype, sparse_format="dense",
                        check_ccontiguous=True)
    return X, y, sample_weight
def _fit(self, X, y, sample_weight, parameter_iterable):
    """Actual fitting, performing the search over parameters."""

    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y, sample_weight = check_arrays(X, y, sample_weight,
                                       allow_lists=True,
                                       sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    # first fit at each grid point using the maximum n_estimators
    param_grid = self.param_grid.copy()
    param_grid['n_estimators'] = [self.max_n_estimators]
    grid = ParameterGrid(param_grid)

    pre_dispatch = self.pre_dispatch
    clfs = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point)(
                base_estimator, clf_params, X, y, sample_weight,
                train, test, self.verbose, **self.fit_params)
            for clf_params in grid
            for train, test in cv)

    # now use the already fitted ensembles but truncate to N estimators for
    # N from 1 to n_estimators_max - 1 (inclusive)
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(score_each_boost)(
                clf, clf_params, self.min_n_estimators,
                X, y, sample_weight,
                self.score_func, train, test, self.verbose)
            for clf, clf_params, train, test in clfs)

    out = reduce(operator.add, [zip(*stage) for stage in out])
    # out is now a list of triplets: score, estimator_params, n_test_samples
    n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
    n_fits = len(out)
    n_folds = len(cv)

    grid_scores = list()
    for block in range(0, n_fits, n_folds * n_estimators_points):
        for grid_start in range(block, block + n_estimators_points):
            n_test_samples = 0
            score = 0
            all_scores = list()
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:
                        grid_start + n_folds * n_estimators_points:
                        n_estimators_points]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                score += this_score
                n_test_samples += this_n_test_samples
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        fit_params = self.fit_params
        if sample_weight is not None:
            fit_params = fit_params.copy()
            fit_params['sample_weight'] = sample_weight
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y, sample_weight=None):
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense",
                        check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        if iteration % self.recount_step == 0:
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE,
                                       sparse_format="dense",
                                       check_ccontiguous=True)
                y_pred = numpy.zeros(len(y))
                y_pred += sum(cl.predict(X) * rate for rate, cl in
                              zip(self.learning_rates, self.classifiers))

        self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration,
                                         self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
        loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
        tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))

        residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :],
                 residual[sample_mask],
                 sample_weight=tree_weight[sample_mask], check_input=False)

        # update tree leaves
        if self.update_tree:
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y,
                                              residual=residual, pred=y_pred,
                                              sample_mask=update_mask,
                                              sample_weight=sample_weight)

        iter_learning_rate = interpolate(self.learning_rate, iteration,
                                         self.n_estimators, use_log=True)
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
def fit(self, X, y=None):
    """Generate a sparse random projection matrix

    Parameters
    ----------
    X : numpy array or scipy.sparse of shape [n_samples, n_features]
        Training set: only the shape is used to find optimal random
        matrix dimensions based on the theory referenced in the
        aforementioned papers.

    y : is not used: placeholder to allow for usage in a Pipeline.

    Returns
    -------
    self
    """
    X, y = check_arrays(X, y)

    if not sp.issparse(X):
        X = np.atleast_2d(X)

    n_samples, n_features = X.shape

    if self.n_components == 'auto':
        self.n_components_ = johnson_lindenstrauss_min_dim(
            n_samples=n_samples, eps=self.eps)

        if self.n_components_ <= 0:
            raise ValueError(
                'eps=%f and n_samples=%d lead to a target dimension of '
                '%d which is invalid' % (
                    self.eps, n_samples, self.n_components_))

        elif self.n_components_ > n_features:
            raise ValueError(
                'eps=%f and n_samples=%d lead to a target dimension of '
                '%d which is larger than the original space with '
                'n_features=%d' % (self.eps, n_samples, self.n_components_,
                                   n_features))
    else:
        if self.n_components <= 0:
            raise ValueError("n_components must be greater than 0, got %s"
                             % self.n_components)

        elif self.n_components > n_features:
            warnings.warn(
                "The number of components is higher than the number of"
                " features: n_features < n_components (%s < %s). "
                "The dimensionality of the problem will not be reduced."
                % (n_features, self.n_components))

        self.n_components_ = self.n_components

    # Generate a projection matrix of size [n_components, n_features]
    self.components_ = self._make_random_matrix(self.n_components_, n_features)

    # Check contract
    assert_equal(
        self.components_.shape,
        (self.n_components_, n_features),
        err_msg=('An error has occurred: the self.components_ matrix has '
                 'not the proper shape.'))

    return self
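# Side note (a sketch, not part of the method above): the 'auto' branch relies
# on the Johnson-Lindenstrauss lemma, which scikit-learn exposes publicly:
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# minimal safe dimensionality for 10000 samples at 10% distortion
print(johnson_lindenstrauss_min_dim(n_samples=10000, eps=0.1))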