# Node-editor callback (Ryven-style API); assumes sklearn's RFE is imported
def update_event(self, input_called=-1):
    if input_called == 0:
        # Wrap the estimator arriving on input 2 in an RFE selector
        estimator = self.input(2)
        selector = RFE(estimator)
        # Optional parameter dict on input 1
        if self.input(1) is not None:
            selector.set_params(**self.input(1))
        try:
            X = self.input(3)
            y = self.input(4)
            selector.fit(X, y)
        except Exception:
            pass
        self.set_output_val(1, selector)
        self.exec_output(0)
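A minimal standalone sketch of what the node body above does, using scikit-learn's RFE directly; `estimator`, `params`, `X`, and `y` are stand-ins for the node's inputs 1-4, and the synthetic data is only for illustration.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=20, random_state=0)

estimator = LogisticRegression(max_iter=1000)    # stand-in for input 2
params = {"n_features_to_select": 5, "step": 2}  # stand-in for input 1 (optional)

selector = RFE(estimator)
if params is not None:
    selector.set_params(**params)
selector.fit(X, y)  # stand-ins for inputs 3 and 4

print(selector.support_)  # boolean mask of selected features
print(selector.ranking_)  # rank 1 marks selected features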
# Legacy scikit-learn script: class_weight="auto" was later renamed "balanced",
# and sklearn.metrics.auc_score was later renamed roc_auc_score.
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc_score

# Recursive feature elimination wrapped around a logistic regression
rfe = RFE(
    LogisticRegression(
        penalty="l2",
        dual=True,
        C=1.0,
        fit_intercept=True,  # approx 2mm starting out
        intercept_scaling=10.0,
        class_weight="auto",
        verbose=1,
    )
)  ## logistic regression
rfe.set_params(n_features_to_select=50000, step=0.1)
rfe.fit(model_mat_train[:27000], ACTION[:27000])

# Holdout AUC using only the RFE-selected columns
lr = LogisticRegression(penalty="l2", dual=True, C=10.0, intercept_scaling=10.0, class_weight="auto")
lr.fit(model_mat_train[:27000, np.where(rfe.support_)[0]], ACTION[:27000])
pred = lr.predict_proba(model_mat_train[27000:, np.where(rfe.support_)[0]])
auc_score(ACTION[27000:], pred[:, 1])

# Holdout AUC on the full feature matrix, for comparison
lr = LogisticRegression(penalty="l2", dual=True, C=10.0, intercept_scaling=10.0, class_weight="auto")
lr.fit(model_mat_train[:27000], ACTION[:27000])
pred = lr.predict_proba(model_mat_train[27000:])
auc_score(ACTION[27000:], pred[:, 1])

# Refit on all training rows with the selected columns and write the submission
lr.fit(model_mat_train[:, np.where(rfe.support_)[0]], ACTION)
pred = lr.predict_proba(model_mat_test[:, np.where(rfe.support_)[0]])
pd.DataFrame({"Id": test_data.index, "Action": pred[:, 1]}).to_csv(
    "submission.csv", index=False  # output path truncated in the source; placeholder filename
)
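The column-selection idiom in the script above, `np.where(rfe.support_)[0]`, is equivalent to boolean masking with `rfe.support_` or to `rfe.transform`; a small self-contained check on synthetic data (names and sizes here are illustrative only):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=4).fit(X, y)

a = X[:, np.where(rfe.support_)[0]]  # integer indexing, as in the script above
b = X[:, rfe.support_]               # boolean masking
c = rfe.transform(X)                 # built-in helper
assert np.array_equal(a, b) and np.array_equal(a, c)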
def fit(self, X, y=None):
    """
    Fits the RFECV with the wrapped model to the specified data and draws
    the rfecv curve with the optimal number of features found.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression.

    Returns
    -------
    self : instance
        Returns the instance of the RFECV visualizer.
    """
    X, y = check_X_y(X, y, "csr")
    n_features = X.shape[1]

    # This check is kind of unnecessary since RFE will do it, but it's
    # nice to get it out of the way ASAP and raise a meaningful error.
    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)

    if step <= 0:
        raise YellowbrickValueError("step must be >0")

    # Create the RFE model
    rfe = RFE(self.estimator, step=step)
    n_feature_subsets = np.arange(1, n_features + 1)

    # Create the cross validation params
    # TODO: handle random state
    cv_params = {
        key: self.get_params()[key]
        for key in ('groups', 'cv', 'scoring')
    }

    # Perform cross-validation for each feature subset
    scores = []
    for n_features_to_select in n_feature_subsets:
        rfe.set_params(n_features_to_select=n_features_to_select)
        scores.append(cross_val_score(rfe, X, y, **cv_params))

    # Convert scores to array
    self.cv_scores_ = np.array(scores)

    # Find the best RFE model
    bestidx = self.cv_scores_.mean(axis=1).argmax()
    self.n_features_ = n_feature_subsets[bestidx]

    # Fit the final RFE model for the number of features
    self.rfe_estimator_ = rfe
    self.rfe_estimator_.set_params(n_features_to_select=self.n_features_)
    self.rfe_estimator_.fit(X, y)

    # Rewrap the visualizer to use the rfe estimator
    self._wrapped = self.rfe_estimator_

    # Hoist the RFE params to the visualizer
    self.support_ = self.rfe_estimator_.support_
    self.ranking_ = self.rfe_estimator_.ranking_

    self.draw()
    return self
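For context, a hedged usage sketch of the method above; the import path and `show()` call assume this `fit` belongs to Yellowbrick's RFECV visualizer in `yellowbrick.model_selection`, and the data is synthetic.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from yellowbrick.model_selection import RFECV

X, y = make_classification(n_samples=200, n_features=15, random_state=0)

viz = RFECV(LogisticRegression(max_iter=1000), cv=3)
viz.fit(X, y)  # runs the cross-validated RFE loop above and draws the curve
viz.show()

print(viz.n_features_)  # best subset size found by cross-validation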