def __init__(self, weak_params, weak_classifier, meta_params, meta_classifier,
             undersampler=None, cv=5, n_jobs=12, scoring='f1_micro', iid=True):
    self.cv = cv
    self.n_jobs = n_jobs
    self.undersampler = undersampler
    self.weak_clf = clone_estimator(weak_classifier)
    self.weak_params = weak_params
    self.weak_gs = GridSearchCV(self.weak_clf, self.weak_params, cv=self.cv,
                                n_jobs=self.n_jobs, scoring=scoring, iid=iid)
    self.meta_clf = clone_estimator(meta_classifier)
    self.meta_params = meta_params
    self.meta_gs = GridSearchCV(self.meta_clf, self.meta_params, cv=self.cv,
                                n_jobs=self.n_jobs, scoring=scoring, iid=iid)
    self.clf_by_class = {}
    self._fitted_ = False
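# The snippets in this section all rely on `clone_estimator`; in
# scikit-learn code this is typically `sklearn.base.clone` imported under
# an alias (an assumption on our part, not confirmed by the source). A
# minimal standalone sketch of why cloning matters here: it yields a
# fresh, unfitted estimator with identical hyperparameters, so each
# search or fit starts from a clean slate.
from sklearn.base import clone as clone_estimator  # assumed alias
from sklearn.linear_model import LogisticRegression

base = LogisticRegression(C=10.0)
copy = clone_estimator(base)
assert copy.get_params()['C'] == 10.0   # hyperparameters are copied over
assert copy is not base                 # but the object is brand new and unfitted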
def _create_huffman_tree(self, y: csr_matrix, k: int) -> HuffmanTree:
    """
    Create a Huffman tree based on the given labels and their probabilities.

    :param y: sparse binary representation of the label vectors
    :param k: maximum number of children merged per node (branching factor)
    :return: Huffman label tree
    """
    if self.verbose:
        print('Building tree')
    label_probs = self._compute_label_probabilities(y)
    priority_queue = []
    for label_id, prob in label_probs.items():
        new_node = HuffmanNode(probability=prob,
                               clf=clone_estimator(self.node_clf),
                               label_idx=[label_id],
                               children=[])
        priority_queue.append(new_node)
    heapify(priority_queue)
    # repeatedly merge the k least probable nodes until a single root remains
    while len(priority_queue) > 1:
        n_children = [heappop(priority_queue)
                      for _ in range(min(k, len(priority_queue)))]
        new_node = HuffmanNode(
            probability=sum(node.probability for node in n_children),
            clf=clone_estimator(self.node_clf),
            label_idx=list(chain.from_iterable(node.label_idx for node in n_children)),
            children=n_children)
        heappush(priority_queue, new_node)
    if len(priority_queue) != 1:
        raise Exception('Building tree failed (priority queue must contain exactly one element)')
    return HuffmanTree(root=priority_queue[0])
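# `heapify`/`heappush` above require HuffmanNode objects to be orderable.
# A minimal runnable sketch of a compatible node type, assuming ordering
# by probability (the actual HuffmanNode definition is not shown in the
# source):
from dataclasses import dataclass, field
from heapq import heapify, heappop

@dataclass
class HuffmanNode:
    probability: float
    clf: object = None
    label_idx: list = field(default_factory=list)
    children: list = field(default_factory=list)

    def __lt__(self, other):
        # lowest-probability nodes sort first, so they are merged first
        return self.probability < other.probability

queue = [HuffmanNode(0.5), HuffmanNode(0.1), HuffmanNode(0.4)]
heapify(queue)
assert heappop(queue).probability == 0.1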
def get_feature_count(features, pipeline):
    if pipeline:
        pipeline_clone = clone_estimator(pipeline)
        feature_count = pipeline_clone.fit_transform(features).shape[1]
    else:
        feature_count = features.shape[1]
    return feature_count
def get_feature_count(dataset, pipeline):
    if pipeline:
        pipeline_clone = clone_estimator(pipeline)
        feature_count = pipeline_clone.fit_transform(X=dataset.features,
                                                     y=dataset.target).shape[1]
    else:
        feature_count = dataset.features.shape[1]
    return feature_count
def _get_transformed_feature_count(conduit):
    clone_steps = []
    if 'feature_eng' in conduit.named_steps:
        clone_steps.append(('feature_eng',
                            clone_estimator(conduit.named_steps['feature_eng'])))
    if 'dimensionality_transformer' in conduit.named_steps:
        clone_steps.append(('estimator',
                            clone_estimator(conduit.named_steps['dimensionality_transformer'])))
    if clone_steps:
        feature_count = Pipeline(steps=clone_steps).fit_transform(
            conduit.dataset.features).shape[1]
    else:
        feature_count = conduit.dataset.features.shape[1]
    return feature_count
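# The fixes above hinge on scikit-learn's Pipeline API: `steps` must be a
# list of (name, transformer) pairs, which is why each append wraps the
# cloned step in a tuple. A minimal standalone sketch of counting features
# after transformation (the step names mirror the function above):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = np.random.rand(20, 10)
steps = [('feature_eng', StandardScaler()), ('estimator', PCA(n_components=3))]
print(Pipeline(steps=steps).fit_transform(X).shape[1])  # -> 3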
def get_ga_param(self):
    param = {
        'estimator': clone_estimator(self.model),
        'ngen': self.NGEN,
        'pop_size': self.pop_size,
        'prior': self.prior,
        'mut_rate': self.MUTPB,
        'cross_prob': self.CXPB,
        'cv': self.cv,
        'n_jobs': self.n_jobs,
        'verbose': self.verbose,
        'random_state': self.rs,
    }
    return param
def check_svm_model_equal(queue, svm, X_train, y_train, X_test, decimal=6):
    sparse_svm = clone_estimator(svm)
    dense_svm = clone_estimator(svm)
    dense_svm.fit(X_train.toarray(), y_train, queue=queue)
    if sp.isspmatrix(X_test):
        X_test_dense = X_test.toarray()
    else:
        X_test_dense = X_test
    sparse_svm.fit(X_train, y_train, queue=queue)

    assert sp.issparse(sparse_svm.support_vectors_)
    assert sp.issparse(sparse_svm.dual_coef_)
    assert_array_almost_equal(dense_svm.support_vectors_,
                              sparse_svm.support_vectors_.toarray(), decimal)
    assert_array_almost_equal(dense_svm.dual_coef_,
                              sparse_svm.dual_coef_.toarray(), decimal)
    assert_array_almost_equal(dense_svm.support_, sparse_svm.support_)
    assert_array_almost_equal(dense_svm.predict(X_test_dense, queue=queue),
                              sparse_svm.predict(X_test, queue=queue))
    if is_classifier(svm):
        assert_array_almost_equal(dense_svm.decision_function(X_test_dense, queue=queue),
                                  sparse_svm.decision_function(X_test, queue=queue),
                                  decimal)
def fit(self, X_multiclass, y):
    self.classes_ = sorted(list(X_multiclass.keys()))
    self.classes_mapper_ = {k: i for (i, k) in enumerate(self.classes_)}
    X_probs = []
    for c in self.classes_:
        X_class = X_multiclass[c]
        y_transformed = self.transform_y(y, c)
        X_resampled, y_resampled = self.get_undersamplig(X_class, y_transformed)
        # tune the weak classifier for this class on the resampled data
        weak_gs_atual = clone_estimator(self.weak_gs)
        weak_gs_atual.fit(X_resampled, y_resampled)
        clf = clone_estimator(self.weak_clf).set_params(**weak_gs_atual.best_params_)
        if 'n_jobs' in clf.get_params():
            clf.set_params(n_jobs=self.n_jobs)
        # wrap classifiers lacking predict_proba in a probability calibrator
        if hasattr(clf, 'predict_proba'):
            cccv = clf
        else:
            cccv = CalibratedClassifierCV(clf, cv=self.cv)
        self.clf_by_class[c] = cccv.fit(X_resampled, y_resampled)
        X_probs.append(cccv.predict_proba(X_class))
    # stack the per-class probabilities as meta-features for the meta-classifier
    X_probs = np.concatenate(X_probs, axis=1)
    self.meta_gs.fit(X_probs, y)
    self.meta_clf.set_params(**self.meta_gs.best_params_)
    self.meta_clf.fit(X_probs, y)
    self._fitted_ = True
    return self
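# The branch above falls back to CalibratedClassifierCV only when the
# tuned classifier cannot emit probabilities. A standalone sketch of that
# decision (LinearSVC has no predict_proba, LogisticRegression does;
# the cv value is illustrative):
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

for clf in (LogisticRegression(), LinearSVC()):
    if hasattr(clf, 'predict_proba'):
        cccv = clf                                 # already probabilistic
    else:
        cccv = CalibratedClassifierCV(clf, cv=5)   # calibrate to get probabilities
    print(type(clf).__name__, '->', type(cccv).__name__)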
def _optimize(self, X, y, fit_params=None):
    if fit_params is None:
        fit_params = {}
    best_params = {}
    best_loss = np.inf
    for params in ParameterGrid(self.search_params):
        # fresh clone per candidate so no fitted state leaks between runs
        estimator = clone_estimator(self.estimator)
        estimator.set_params(**params)
        estimator.fit(X, y=y, **fit_params)
        # scores are "greater is better"; negate them so we always minimize,
        # unless the estimator already scores with a loss function
        loss = estimator.score(X, y)
        loss = loss if self.has_loss_function else -loss
        if loss < best_loss:
            best_loss = loss
            best_params = params
    self._best_score = best_loss
    self._best_params = best_params
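# `ParameterGrid` above expands a dict of candidate lists into every
# hyperparameter combination; a standalone sketch of the iteration the
# loop performs (the grid values are illustrative):
from sklearn.model_selection import ParameterGrid

grid = {'C': [0.1, 1.0], 'penalty': ['l1', 'l2']}
for params in ParameterGrid(grid):
    print(params)  # {'C': 0.1, 'penalty': 'l1'}, ... 4 combinations in total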
def _groups(self, xyz):
    """Galaxies clustering.

    Finds groups of galaxies.
    """
    # set weights for clustering
    weights = self.z * 100

    # clustering of galaxies
    # we never use the instance-level clusterer; we always clone
    # and use it internally
    clustering = clone_estimator(self._halo_clustering)
    clustering.fit(xyz, sample_weight=weights)

    # select only galaxies in groups
    unique_elements, counts_elements = np.unique(clustering.labels_,
                                                 return_counts=True)
    return clustering.labels_, unique_elements
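# A standalone sketch of the clustering call above, assuming the cloned
# `_halo_clustering` estimator is something like scikit-learn's DBSCAN,
# which accepts per-sample weights in fit (the source does not say which
# clusterer is used):
import numpy as np
from sklearn.cluster import DBSCAN

xyz = np.random.rand(100, 3)            # mock galaxy positions
weights = np.random.rand(100) * 100     # stands in for z * 100
clustering = DBSCAN(eps=0.2, min_samples=5)
clustering.fit(xyz, sample_weight=weights)
labels, counts = np.unique(clustering.labels_, return_counts=True)
print(labels)  # -1 marks noise; non-negative labels are galaxy groups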
def _calc_error(self, search_vals, X, y, fit_params=None):
    """Calculate the estimator's error.

    The error is calculated using the estimator's scoring function
    (assumes a true scoring function, i.e. greater == better).
    """
    # Pair these positional values with the names of the search params
    # to build a dict
    search_params = {}
    for k, v in zip(list(self.search_params.keys()), search_vals):
        search_params[k] = v

    # Clone the estimator to make sure we have a clean slate, then fit:
    if fit_params is None:
        fit_params = {}
    estimator = clone_estimator(self.estimator)
    estimator.set_params(**search_params)
    estimator.fit(X, y=y, **fit_params)

    # If score is not a loss function, we need to invert here:
    loss = estimator.score(X, y)
    loss = loss if self.has_loss_function else -loss
    return loss
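# The zip above pairs positional search values (as a numerical optimizer
# would deliver them) back with their parameter names. A standalone
# sketch of that reconstruction (the parameter names and values here are
# hypothetical):
search_params_spec = {'alpha': (0.0, 1.0), 'l1_ratio': (0.0, 1.0)}  # name -> bounds
search_vals = [0.5, 0.2]  # one candidate point proposed by the optimizer
params = dict(zip(search_params_spec.keys(), search_vals))
print(params)  # {'alpha': 0.5, 'l1_ratio': 0.2}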