def _fit_leaf(self, X, Y, fit_keywords):
    if self.verbose:
        print("Fitting leaf")
    model = deepcopy(self.leaf_model)
    # randomize the leaf model's hyperparameters using the given generators
    for field, gen in self.randomize_leaf_params.items():
        setattr(model, field, gen())
    model.fit(X, Y, **fit_keywords)
    clear_sklearn_fields(model)
    return model
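# Hedged sketch (not part of the original source): randomize_leaf_params is
# assumed to map attribute names on the leaf model to zero-argument sampling
# functions, so each leaf is fit with freshly drawn hyperparameters. The
# parameter name 'C' below is chosen to match sklearn's LogisticRegression
# and is purely illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression

leaf_model = LogisticRegression()
randomize_leaf_params = {'C': lambda: 10.0 ** np.random.uniform(-3, 3)}
for field, gen in randomize_leaf_params.items():
    setattr(leaf_model, field, gen())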
def fit(self, X, Y, **fit_keywords):
    self.models = {}
    if self.verbose:
        print("Clustering X")
    # fit the clusterer and get back the labels so we can use them
    # to train one regressor per cluster
    self.clusters.fit(X)
    labels = self.clusters.labels_
    # clear this field so that it doesn't get serialized later
    self.clusters.labels_ = None
    for label in np.unique(labels):
        if self.verbose:
            print("Fitting model for cluster", label)
        model = deepcopy(self.base_model)
        mask = (labels == label)
        X_slice = X[mask, :]
        Y_slice = Y[mask]
        model.fit(X_slice, Y_slice, **fit_keywords)
        # clear sklearn's leftover junk to make pickled strings smaller
        clear_sklearn_fields(model)
        self.models[label] = model
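# Hedged usage sketch (not from the original source): fit above assumes
# self.clusters is a scikit-learn style clusterer exposing labels_ and
# self.base_model is a regressor prototype. This standalone snippet mirrors
# the same data flow, one regressor per cluster, without the class wrapper.
import numpy as np
from copy import deepcopy
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

X = np.random.randn(200, 3)
Y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(200)

clusters = KMeans(n_clusters=4, n_init=10).fit(X)
models = {}
for label in np.unique(clusters.labels_):
    mask = (clusters.labels_ == label)
    models[label] = deepcopy(LinearRegression()).fit(X[mask, :], Y[mask])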
def fit(self, X, Y, **fit_keywords):
    n_samples, n_features = X.shape
    if self.verbose:
        print("Depth", self.depth, ": Fitting model for", n_samples, "vectors")
    # stop splitting once we hit the depth limit or run low on data
    if self.depth >= self.max_depth or n_samples <= self.min_leaf_size:
        self.model = self._fit_leaf(X, Y, fit_keywords)
    else:
        # if we've been passed a limit to the number of features,
        # train the current model on a random subspace of that size
        if self.num_features_per_node:
            feature_indices = np.random.permutation(n_features)
            self.subspace = feature_indices[:self.num_features_per_node]
            X_reduced = X[:, self.subspace]
        else:
            X_reduced = X
        self.model = deepcopy(self.split_classifier)
        # randomize the split classifier's hyperparameters using the given generators
        for field, gen in self.randomize_split_params.items():
            setattr(self.model, field, gen())
        self.model.fit(X_reduced, Y, **fit_keywords)
        clear_sklearn_fields(self.model)
        # route each training sample to the child named by the predicted class
        pred = self.model.predict(X_reduced)
        for c in self.classes:
            mask = (pred == c)
            count = np.sum(mask)
            if count == 0:
                # no samples reached this branch, so freeze it at a constant prediction
                self.children[c] = ConstantLeaf(int(c))
            else:
                X_slice = X[mask, :]
                Y_slice = Y[mask]
                self.children[c] = self._fit_child(X_slice, Y_slice, fit_keywords)
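# ConstantLeaf is used above but not defined in this section. A minimal
# sketch of what it presumably looks like (an assumption, not copied from
# the library): a degenerate child that predicts the same class for every
# row, covering branches the split classifier sent no training samples to.
import numpy as np

class ConstantLeaf(object):
    def __init__(self, v):
        self.v = v

    def predict(self, X):
        # one copy of the constant per input row
        return np.repeat(self.v, X.shape[0])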
def fit(self, X, Y, **fit_keywords):
    assert self.base_model is not None
    assert self.bagging_percent is not None
    assert self.bagging_replacement is not None
    assert self.num_models is not None
    assert self.verbose is not None

    self.need_to_fit = False
    self.models = []
    X = np.atleast_2d(X)
    Y = np.atleast_1d(Y)
    n_rows, total_features = X.shape
    bagsize = int(math.ceil(self.bagging_percent * n_rows))

    # additive ensembles sum their members' outputs; averaging ensembles
    # weight each member by 1 / num_models
    if self.additive:
        self.weights = np.ones(self.num_models, dtype='float')
    else:
        self.weights = np.ones(self.num_models, dtype='float') / self.num_models

    # each derived class needs to implement this
    self._init_fit(X, Y)

    if self.feature_subset_percent < 1:
        n_features = int(math.ceil(self.feature_subset_percent * total_features))
        self.feature_subsets = []
    else:
        n_features = total_features
        self.feature_subsets = None

    for i in range(self.num_models):
        if self.verbose:
            print("Training iteration", i)
        if self.bagging_replacement:
            indices = np.random.randint(0, n_rows, bagsize)
        else:
            p = np.random.permutation(n_rows)
            indices = p[:bagsize]
        data_subset = X[indices, :]
        # optionally restrict this member to a random subset of the features
        if n_features < total_features:
            feature_indices = np.random.permutation(total_features)[:n_features]
            self.feature_subsets.append(feature_indices)
            data_subset = data_subset[:, feature_indices]
        label_subset = Y[indices]
        model = deepcopy(self.base_model)
        # randomize parameters using given functions
        for param_name, fn in self.randomize_params.items():
            setattr(model, param_name, fn())
        model.fit(data_subset, label_subset, **fit_keywords)
        self.models.append(model)
        self._created_model(X, Y, indices, i, model)
        if self.additive:
            # in additive mode each subsequent model fits the residual
            # left by the models trained so far
            if n_features < total_features:
                Y -= model.predict(X[:, feature_indices])
            else:
                Y -= model.predict(X)
        clear_sklearn_fields(model)

    # stacking works by treating the outputs of each base classifier as the
    # inputs to an additional meta-classifier
    if self.stacking_model:
        transformed_data = self.transform(X)
        self.stacking_model.fit(transformed_data, Y)
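# self.transform is called by the stacking step above but not shown in this
# section. A minimal sketch of what it presumably does (an assumption, not
# copied from the library): evaluate every base model on X, restricted to its
# feature subset when one was drawn, and stack the per-model predictions as
# columns so the meta-classifier sees one input feature per base model.
# Assumes numpy is imported as np at module level.
def transform(self, X):
    X = np.atleast_2d(X)
    outputs = np.zeros((X.shape[0], len(self.models)))
    for i, model in enumerate(self.models):
        if self.feature_subsets is not None:
            outputs[:, i] = model.predict(X[:, self.feature_subsets[i]])
        else:
            outputs[:, i] = model.predict(X)
    return outputs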