Example #1
    def __init__(self,
                 weak_params,
                 weak_classifier,
                 meta_params,
                 meta_classifier,
                 undersampler=None,
                 cv=5,
                 n_jobs=12,
                 scoring='f1_micro',
                 iid=True):
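        # NOTE: the 'iid' argument was deprecated in scikit-learn 0.22 and
        # removed in 0.24, so this snippet targets an older release.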
        self.cv = cv
        self.n_jobs = n_jobs
        self.undersampler = undersampler

        self.weak_clf = clone_estimator(weak_classifier)
        self.weak_params = weak_params
        self.weak_gs = GridSearchCV(self.weak_clf,
                                    self.weak_params,
                                    cv=self.cv,
                                    n_jobs=self.n_jobs,
                                    scoring=scoring,
                                    iid=iid)

        self.meta_clf = clone_estimator(meta_classifier)
        self.meta_params = meta_params
        self.meta_gs = GridSearchCV(self.meta_clf,
                                    self.meta_params,
                                    cv=self.cv,
                                    n_jobs=self.n_jobs,
                                    scoring=scoring,
                                    iid=iid)

        self.clf_by_class = {}
        self._fitted_ = False
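Throughout these examples, clone_estimator is assumed to be an alias for sklearn.base.clone (the snippets show no imports, so the alias is an assumption): it returns a fresh, unfitted estimator carrying the same hyper-parameters. A minimal sketch:

# Minimal sketch, assuming clone_estimator aliases sklearn.base.clone.
from sklearn.base import clone as clone_estimator
from sklearn.linear_model import LogisticRegression

base = LogisticRegression(C=0.5)
copy = clone_estimator(base)
print(copy is base)            # False: a new, unfitted instance
print(copy.get_params()['C'])  # 0.5: hyper-parameters are preserved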
Example #2
    def _create_huffman_tree(self, y: csr_matrix, k: int) -> HuffmanTree:
        """
        Create a huffman tree_ based on the given labels and their probabilities.
        :param y: sparse binary representation label vectors
        :return: huffman label tree_
        """

        if self.verbose:
            print('Building tree_')

        label_probs = self._compute_label_probabilities(y)

        priority_queue = []
        for label_id, prob in label_probs.items():
            new_node = HuffmanNode(probability=prob,
                                   clf=clone_estimator(self.node_clf),
                                   label_idx=[label_id],
                                   children=[])
            priority_queue.append(new_node)
        heapify(priority_queue)

        while len(priority_queue) > 1:
            n_children = [heappop(priority_queue) for _ in range(min(k, len(priority_queue)))]
            new_node = HuffmanNode(probability=sum(map(lambda node: node.probability, n_children)),
                                   clf=clone_estimator(self.node_clf),
                                   label_idx=list(chain.from_iterable(map(lambda node: node.label_idx, n_children))),
                                   children=n_children)

            heappush(priority_queue, new_node)

        if len(priority_queue) != 1:
            raise Exception('Building tree failed (Priority Queue must only contain one element)')

        return HuffmanTree(root=priority_queue[0])
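heapify and heappush above require HuffmanNode instances to be mutually comparable. A hypothetical sketch of a compatible node type (the project's real class is defined elsewhere):

from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class HuffmanNode:
    probability: float
    clf: Any = None
    label_idx: List[int] = field(default_factory=list)
    children: List['HuffmanNode'] = field(default_factory=list)

    def __lt__(self, other):
        # heapq pops the lowest-probability node first
        return self.probability < other.probability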
Example #3
def get_feature_count(features, pipeline):
    if pipeline:
        pipeline_clone = clone_estimator(pipeline)
        feature_count = pipeline_clone.fit_transform(features).shape[1]
    else:
        feature_count = features.shape[1]
    return feature_count
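A quick usage sketch, again assuming clone_estimator is sklearn.base.clone:

import numpy as np
from sklearn.base import clone as clone_estimator
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

features = np.random.rand(20, 10)
pipeline = Pipeline([('pca', PCA(n_components=3))])
print(get_feature_count(features, pipeline))  # 3: width after the transform
print(get_feature_count(features, None))      # 10: raw feature width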
Example #4
def get_feature_count(dataset, pipeline):
    if pipeline:
        pipeline_clone = clone_estimator(pipeline)
        feature_count = pipeline_clone.fit_transform(X=dataset.features, y=dataset.target).shape[1]
    else:
        feature_count = dataset.features.shape[1]
    return feature_count
Example #5
def _get_transformed_feature_count(conduit):
    clone_steps = []
    if 'feature_eng' in conduit.named_steps:
        clone_steps.append(
            ('feature_eng', clone_estimator(conduit.named_steps['feature_eng'])))
    if 'dimensionality_transformer' in conduit.named_steps:
        clone_steps.append(
            ('estimator', clone_estimator(conduit.named_steps['dimensionality_transformer'])))
    if clone_steps:
        feature_count = Pipeline(steps=clone_steps).fit_transform(
            conduit.dataset.features).shape[1]
    else:
        feature_count = conduit.dataset.features.shape[1]
    return feature_count
Example #6
    def get_ga_param(self):
        param = {}
        param['estimator'] = clone_estimator(self.model)
        param['ngen'] = self.NGEN
        param['pop_size'] = self.pop_size
        param['prior'] = self.prior
        param['mut_rate'] = self.MUTPB
        param['cross_prob'] = self.CXPB
        param['cv'] = self.cv
        param['n_jobs'] = self.n_jobs
        param['verbose'] = self.verbose
        param['random_state'] = self.rs
        return param
Example #7
def check_svm_model_equal(queue, svm, X_train, y_train, X_test, decimal=6):
    sparse_svm = clone_estimator(svm)
    dense_svm = clone_estimator(svm)
    dense_svm.fit(X_train.toarray(), y_train, queue=queue)
    if sp.isspmatrix(X_test):
        X_test_dense = X_test.toarray()
    else:
        X_test_dense = X_test
    sparse_svm.fit(X_train, y_train, queue=queue)
    assert sp.issparse(sparse_svm.support_vectors_)
    assert sp.issparse(sparse_svm.dual_coef_)
    assert_array_almost_equal(dense_svm.support_vectors_,
                              sparse_svm.support_vectors_.toarray(), decimal)
    assert_array_almost_equal(dense_svm.dual_coef_,
                              sparse_svm.dual_coef_.toarray(), decimal)
    assert_array_almost_equal(dense_svm.support_, sparse_svm.support_)
    assert_array_almost_equal(dense_svm.predict(X_test_dense, queue=queue),
                              sparse_svm.predict(X_test, queue=queue))

    if is_classifier(svm):
        assert_array_almost_equal(dense_svm.decision_function(X_test_dense, queue=queue),
                                  sparse_svm.decision_function(X_test, queue=queue),
                                  decimal)
Example #8
    def fit(self, X_multiclass, y):
        self.classes_ = sorted(list(X_multiclass.keys()))
        self.classes_mapper_ = {k: i for (i, k) in enumerate(self.classes_)}

        X_probs = []
        for c in self.classes_:
            X_class = X_multiclass[c]
            y_transformed = self.transform_y(y, c)
            X_resampled, y_resampled = self.get_undersamplig(
                X_class, y_transformed)

            weak_gs_atual = clone_estimator(self.weak_gs)

            weak_gs_atual.fit(X_resampled, y_resampled)

            clf = clone_estimator(
                self.weak_clf).set_params(**weak_gs_atual.best_params_)
            if "n_jobs" in clf.get_params():
                clf.set_params(n_jobs=self.n_jobs)
            if "predict_proba" in dir(clf):
                cccv = clf
            else:
                cccv = CalibratedClassifierCV(clf, cv=self.cv)

            self.clf_by_class[c] = cccv.fit(X_resampled, y_resampled)
            X_probs.append(cccv.predict_proba(X_class))
        #print(X_probs)
        X_probs = np.concatenate(X_probs, axis=1)
        #print(X_probs.shape, X_probs[0])
        self.meta_gs.fit(X_probs, y)
        self.meta_clf.set_params(**self.meta_gs.best_params_)
        self.meta_clf.fit(X_probs, y)

        self._fitted_ = True

        return self
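The CalibratedClassifierCV fallback above matters because some estimators (e.g. LinearSVC) expose no predict_proba; wrapping them calibrates their decision scores into probabilities, so the meta-features stay uniform. A brief illustration:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
cccv = CalibratedClassifierCV(LinearSVC(), cv=3).fit(X, y)
print(cccv.predict_proba(X[:2]))  # each row sums to 1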
Example #9
    def _optimize(self, X, y, fit_params=None):
        if fit_params is None:
            fit_params = {}
        best_params = {}
        best_loss = np.inf
        for params in ParameterGrid(self.search_params):
            estimator = clone_estimator(self.estimator)
            estimator.set_params(**params)
            estimator.fit(X, y=y, **fit_params)
            loss = estimator.score(X, y)
            loss = loss if self.has_loss_function else -loss
            if loss < best_loss:
                best_loss = loss
                best_params = params
        self._best_score = best_loss
        self._best_params = best_params
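ParameterGrid expands a dict of value lists into every combination, which is what the loop above iterates over; for instance:

from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({'C': [0.1, 1.0], 'kernel': ['linear', 'rbf']})
print(len(list(grid)))  # 4: the Cartesian product of both value lists
for params in grid:
    print(params)       # e.g. {'C': 0.1, 'kernel': 'linear'}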
Example #10
    def _groups(self, xyz):
        """Galaxies clustering.

        Finds groups of galaxies.

        """
        # set weights for clustering
        weights = self.z * 100

        # clustering of galaxies
        # we never use the instance level cluster, we always clone
        # and use it internally
        clustering = clone_estimator(self._halo_clustering)
        clustering.fit(xyz, sample_weight=weights)

        # unique group labels found by the clustering, with member counts
        unique_elements, counts_elements = np.unique(clustering.labels_,
                                                     return_counts=True)

        return clustering.labels_, unique_elements
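Any clusterer whose fit accepts sample_weight fits the pattern above; DBSCAN is one such choice (an assumption here, since _halo_clustering is configured elsewhere):

import numpy as np
from sklearn.cluster import DBSCAN

xyz = np.random.rand(50, 3)          # stand-in galaxy positions
weights = np.random.rand(50) * 100   # stand-in redshift-based weights
labels = DBSCAN(eps=0.2).fit(xyz, sample_weight=weights).labels_
print(np.unique(labels, return_counts=True))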
Example #11
    def _calc_error(self, search_vals, X, y, fit_params=None):
        """Calculates the estimator's error

        The error is calculated using the estimator's scoring function (assumes
        a true scoring function, i.e. greater == better).
        """
        # Need to pair these values with the names of the search params
        # to build a dict
        search_params = {}
        for k, v in zip(list(self.search_params.keys()), search_vals):
            search_params[k] = v
        # Clone the estimator to make sure we have a clean slate, then fit:
        if fit_params is None:
            fit_params = {}
        estimator = clone_estimator(self.estimator)
        estimator.set_params(**search_params)
        estimator.fit(X, y=y, **fit_params)
        # If score is not a loss function, we need to invert here:
        loss = estimator.score(X, y)
        loss = loss if self.has_loss_function else -loss
        return loss
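Since _calc_error takes a flat vector of search values, the surrounding class presumably hands it to a continuous optimizer. A hypothetical driver (searcher, X, y, and the starting point are placeholders, not part of the source):

from scipy.optimize import minimize

result = minimize(
    lambda vals: searcher._calc_error(vals, X, y),
    x0=[1.0, 0.1],              # one entry per key in searcher.search_params
    method='Nelder-Mead')
print(result.x, result.fun)     # best parameter vector and its loss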