def _select_features(self, problem, percent_features_to_select, algorithm,
                     features_to_keep=None):
    # Initialize FeatureSelector.
    fs = FeatureSelector(problem=problem, algorithm=algorithm,
                         random_state=self._random_state)
    fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
    num_features_to_select = int(percent_features_to_select *
                                 len(self._X_train.columns.values))

    # Parse features_to_keep.
    if features_to_keep is None:
        features_to_keep = []

    # Select features.
    fs.select(k=num_features_to_select)

    # Enumerate eliminated features pre-transformation.
    feature_ranks = fs.compute_ranks()
    for i in range(len(feature_ranks)):
        if feature_ranks[i] > num_features_to_select:
            # If in features_to_keep, pretend it wasn't eliminated.
            if self._X_train.columns[i] not in features_to_keep:
                self._eliminated_features.append(self._X_train.columns[i])

    # Hack: rather than making FeatureSelector handle the concept of
    # kept features, copy the kept columns here before transforming and
    # merge back any that selection dropped. (Copying before the
    # transform matters: once a kept column has been eliminated from the
    # matrix, it can no longer be read out of it.)
    kept_X_train_features = self._X_train[features_to_keep].copy()
    kept_X_test_features = self._X_test[features_to_keep].copy()
    log.debug('kept_X_train_features.shape: %s' % str(kept_X_train_features.shape))
    log.debug('kept_X_test_features.shape: %s' % str(kept_X_test_features.shape))

    # Even if there are no features to keep, still need to perform
    # transform_matrix to drop the low-rank features.
    self._X_train = fs.transform_matrix(self._X_train)
    self._X_test = fs.transform_matrix(self._X_test)

    for feature in features_to_keep:
        # Skip the merge if the transformed X already has the feature.
        if feature not in self._X_train:
            self._X_train = self._X_train.merge(kept_X_train_features[[feature]],
                                                left_index=True, right_index=True)
        if feature not in self._X_test:
            self._X_test = self._X_test.merge(kept_X_test_features[[feature]],
                                              left_index=True, right_index=True)
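
# Usage sketch (illustrative, not from the source): assumes a predictor
# object wired up as in the method above, with _X_train/_X_test DataFrames,
# _y_train, _random_state, and an _eliminated_features list already
# initialized. The column names passed as features_to_keep are hypothetical.
#
#   predictor._select_features(
#       problem=FeatureSelector.CLASSIFICATION,
#       percent_features_to_select=0.05,
#       algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
#       features_to_keep=['age', 'sex'])
#
# Afterwards _X_train/_X_test contain the top-ranked columns plus the kept
# ones, and _eliminated_features lists everything that was dropped.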
def _get_test_feature_ranks(self, algorithm, problem, X, y, k=None, percentile=None):
    # Set input features and values.
    fs = FeatureSelector(algorithm=algorithm, problem=problem, random_state=12345)
    fs.set_input_matrix(X, y)

    # Select k best features.
    fs.select(k=k, percentile=percentile)
    feature_ranks = fs.compute_ranks()
    return feature_ranks
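
# Usage sketch (illustrative; the matrix and expected ordering are made up):
#
#   ranks = self._get_test_feature_ranks(
#       algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
#       problem=FeatureSelector.CLASSIFICATION,
#       X=test_X, y=test_y, k=2)
#
# Ranks are 1-based, with rank 1 the most informative feature, so a k-best
# selection keeps exactly the features with rank <= k; that is the invariant
# the selection loops elsewhere in this module check against.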
class Select_Features(TransformerMixin):
    def __init__(self, random_state=0, features_by_type=None):
        '''
        TODO: if features_by_type is None, assume all features are numeric.

        Args:
            random_state: seed passed through to FeatureSelector.
            features_by_type: dict mapping feature types (e.g.
                'numeric_features') to lists of column names.
        '''
        self.fs = FeatureSelector(
            problem=FeatureSelector.CLASSIFICATION,
            algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
            random_state=random_state)
        self.features_by_type = features_by_type
        self.selected_features = []

    def fit(self, X, y=None, features_to_keep=None, select_percent=0.05):
        '''
        TODO: Does this "select_percent" include those pre-set to keep?

        features_to_keep includes both features wanted regardless of rank
        and non-numeric features, which the selector never sees.

        Args:
            X: feature DataFrame.
            y: target values.
            features_to_keep: columns to retain regardless of selection.
            select_percent: fraction of the numeric features to select.

        Returns:
            self
        '''
        if not features_to_keep:
            features_to_keep = []
        X_numeric = X[X.columns[X.columns.isin(
            self.features_by_type['numeric_features'])]]
        self.fs.set_input_matrix(X_numeric.values, column_or_1d(y.values))
        num_features_to_select = int(
            round(select_percent * len(X_numeric.columns.values)))
        self.fs.select(k=num_features_to_select)

        feature_ranks = self.fs.compute_ranks()
        for i in range(len(feature_ranks)):
            if feature_ranks[i] <= num_features_to_select:
                # Feature was selected; add it to the kept list.
                features_to_keep.append(X_numeric.columns[i])

        self.selected_features = features_to_keep[:]
        return self

    def transform(self, X):
        return X[self.selected_features]
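
# Usage sketch (illustrative; the column names and features_by_type dict are
# assumptions, not from the source). Note that fit() takes extra keyword
# arguments, so the transformer is called directly rather than inside a
# plain sklearn Pipeline:
#
#   features_by_type = {'numeric_features': ['age', 'heart_rate', 'sodium']}
#   selector = Select_Features(random_state=0, features_by_type=features_by_type)
#   selector.fit(X_train, y_train,
#                features_to_keep=['sex'],  # e.g. a non-numeric column
#                select_percent=0.05)
#   X_train_selected = selector.transform(X_train)
#   X_test_selected = selector.transform(X_test)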
def _select_features(self):
    # Use FeatureSelector to prune all but the top 1% of features.
    fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                         problem=FeatureSelector.CLASSIFICATION)
    fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
    num_features_to_select = int(0.01 * len(self._X_train.columns.values))
    fs.select(k=num_features_to_select)

    # Enumerate eliminated features pre-transformation.
    self._feature_ranks = fs.compute_ranks()
    for i in range(len(self._feature_ranks)):
        if self._feature_ranks[i] > num_features_to_select:
            self._eliminated_features.append(self._X_train.columns[i])

    self._X_train = fs.transform_matrix(self._X_train)
    self._X_test = fs.transform_matrix(self._X_test)
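
# Worked example (illustrative): with a 10,000-column _X_train,
# int(0.01 * 10000) == 100, so the selector keeps the 100 top-ranked
# features. Note that int() truncates rather than rounds: a 150-column
# matrix keeps int(1.5) == 1 feature, and anything under 100 columns
# would request k=0.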
def select_features(matrix, features, random_state=0):
    select_params = features['select_params']
    fs = FeatureSelector(problem=select_params['selection_problem'],
                         algorithm=select_params['selection_algorithm'],
                         random_state=random_state)

    X, y = split_Xy(matrix, features['ylabel'])
    fs.set_input_matrix(X, y)
    num_features_to_select = int(select_params['percent_features_to_select'] *
                                 len(matrix.columns.values))
    fs.select(k=num_features_to_select)

    feature_ranks = fs.compute_ranks()
    features_to_keep = []
    for i in range(len(feature_ranks)):
        if feature_ranks[i] <= num_features_to_select:
            # Feature survived selection; keep its column.
            features_to_keep.append(X.columns[i])

    return matrix[features_to_keep].copy()
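
# Usage sketch (illustrative; the key names follow the lookups above, but
# the concrete values are assumptions, not from the source):
#
#   features = {
#       'ylabel': 'outcome',
#       'select_params': {
#           'selection_problem': FeatureSelector.CLASSIFICATION,
#           'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
#           'percent_features_to_select': 0.05,
#       },
#   }
#   selected_matrix = select_features(matrix, features, random_state=0)
#
# The returned frame contains only the selected feature columns; note that
# the ylabel column is split off for fitting and is not included unless it
# happens to rank among the selected features.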