def fit(self, X, y):
    """Greedily drop one column at a time until ``k_features`` remain.

    The data is split once with ``T_T_S`` using ``self.test_size`` and
    ``self.random_state``. Starting from every column of ``X``, each round
    scores all subsets that leave out exactly one of the currently kept
    columns (via ``self._calc_score``) and keeps the highest-scoring one.

    Attributes set on the instance:
        indices_:  column indices kept after the last round.
        subsets_:  kept indices after each round, starting with all columns.
        scores_:   score recorded for each entry of ``subsets_``.
        k_score_:  the last recorded score.

    Returns:
        self
    """
    X_train, X_test, y_train, y_test = T_T_S(
        X, y, test_size=self.test_size, random_state=self.random_state)

    # Begin with every column index and record its score as the baseline.
    n_kept = X_train.shape[1]
    self.indices_ = tuple(range(n_kept))
    self.subsets_ = [self.indices_]
    self.scores_ = [
        self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
    ]

    while n_kept > self.k_features:
        # Every way of removing exactly one of the currently kept indices.
        candidates = list(combinations(self.indices_, r=n_kept - 1))
        candidate_scores = [
            self._calc_score(X_train, y_train, X_test, y_test, subset)
            for subset in candidates
        ]

        # np.argmax returns the first maximum, so ties go to the earliest
        # candidate in combinations() order.
        winner = np.argmax(candidate_scores)
        self.indices_ = candidates[winner]
        self.subsets_.append(self.indices_)
        self.scores_.append(candidate_scores[winner])
        n_kept -= 1

    self.k_score_ = self.scores_[-1]
    return self
'Class label', 'Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoids phenols', 'Proanthocyanins', 'Color Intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline' ] print('Wine data') print() print(df_wine.head()) print() print(df_wine.tail()) print() X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values X_train, X_test, y_train, y_test =\ T_T_S(X,y,test_size = .3, random_state = 0, stratify = y) # stratify ensures same class proportions of training and test data sets print('Training Data Size = ', len(X_train)) print('Test Data Size = ', len(X_test)) print() pause() feat_labels = df_wine.columns[1:] forest = RFC(n_estimators=500, random_state=1) forest.fit(X_train, y_train) tic_fwd = time() sfs_forward = SequentialFeatureSelector(forest, n_features_to_select=5, direction='forward').fit(