import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


def featureSelectionTree(data):
    # column 1 holds the label; feature values start at column 2
    label = data[:, 1]
    datanew = np.abs(data[:, 2:])

    # rank features by importance with an extra-trees ensemble, then keep
    # only the selected columns (ExtraTreesClassifier itself has no
    # get_support, so the selection is wrapped in SelectFromModel)
    clf = ExtraTreesClassifier()
    clf.fit(datanew, label)
    selector = SelectFromModel(clf, prefit=True)
    X_new = selector.transform(datanew)

    size = len(X_new[0])
    data[:, 2:size + 2] = X_new

    # log which feature indices were kept
    fd = open('History.txt', 'a')
    history = ('Feature Selection: Tree' + '\n' +
               'Selected Feature: ' + str(selector.get_support(True)) + '\n')
    fd.write(history)
    fd.close()
    return data[:, :size + 2], size
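For context, a minimal sketch of how this helper might be invoked. The array layout (ids in column 0, labels in column 1, features from column 2 onward) is an assumption inferred from the slicing above, and the toy data is hypothetical:

    import numpy as np

    # hypothetical toy matrix: column 0 = id, column 1 = label, columns 2+ = features
    data = np.hstack([
        np.arange(100).reshape(-1, 1),       # ids
        np.random.randint(0, 2, (100, 1)),   # binary labels
        np.random.randn(100, 10),            # 10 raw feature columns
    ]).astype(float)

    reduced, n_selected = featureSelectionTree(data)
    print reduced.shape, n_selected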
import numpy as np
from scipy import sparse
from sklearn import linear_model, svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler


class Model:
    def __init__(self):
        # earlier selector experiments, kept for reference:
        # self.features_selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
        # self.features_selector = SelectKBest(k="all")
        # self.features_selector = SelectPercentile(score_func=SelectFpr, percentile=16)
        self.features_selector = ExtraTreesClassifier(n_estimators=250,
                                                      max_features=20)
        self.dict_vectorizer = DictVectorizer()
        self.scaler = StandardScaler(copy=True)

    def vectorize(self, X, y, fit=True):
        # digitize categories
        if fit:
            self.dict_vectorizer.fit(X)
        X = self.dict_vectorizer.transform(X).toarray()
        return X, y

    def scale(self, X, y, fit=True):
        # scale numbers
        if fit:
            self.scaler.fit(X)
        X = self.scaler.transform(X)
        return X, y

    def all_feature_names(self):
        return self.dict_vectorizer.get_feature_names()

    def selected_feature_names(self):
        # names of the features that survived the ensemble vote
        all_names = np.array(self.all_feature_names())
        return all_names[self.feats['ensemble']]

    def save_features(self, X, y):
        feats = dict()

        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_clf.fit(X, y)
        pvalues_clf = selector_clf.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        # put feature masks into the dictionary
        feats['univ_sub01'] = (pvalues_clf < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

        print "randomized logistic regression feature selector"
        sel_log = linear_model.RandomizedLogisticRegression(
            random_state=42, n_jobs=4).fit(X, y)
        # put the randomized-logreg picks into the feature dict
        feats['rand_logreg'] = sel_log.get_support()

        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(X)
        sel_svc = svm.LinearSVC(C=0.1, penalty="l1", dual=False,
                                random_state=42).fit(X, y)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = linear_model.LogisticRegression(C=0.01,
                                                  random_state=42).fit(X_sp, y)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        tree_max_features = 20
        print "ExtraTrees feature selectors (%s)" % tree_max_features
        feats['tree'] = np.zeros(len(feats['LogReg']))
        tree = ExtraTreesClassifier(n_estimators=250,
                                    max_features=tree_max_features)
        tree.fit(X, y)
        feature_importance = tree.feature_importances_
        feature_importance = 100.0 * (feature_importance /
                                      feature_importance.max())
        sorted_idx = np.argsort(feature_importance)[::-1]
        # mark the top tree_max_features features by importance
        for i in xrange(tree_max_features):
            feats['tree'][sorted_idx[i]] = 1

        # tally the votes; keep features selected by at least four methods
        feat_sums = np.zeros(len(feats['LogReg']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        feats['ensemble'] = feat_sums >= 4

        joblib.dump(feats, 'features/feats.pkl', compress=3)
        return feats

    def load_features(self):
        return joblib.load('features/feats.pkl')

    def select_features(self, X, y, fit=True):
        if fit:
            self.feats = self.save_features(X, y)
            # alternatively, reuse a previously saved vote:
            # self.feats = self.load_features()
        print "Selected Features:"
        print self.selected_feature_names()
        print
        return X[:, self.feats['ensemble']], y

    def split_data(self, X, y, ids, cross_validate):
        if not cross_validate:
            return X, [], y, [], ids, []
        # append ids so we can identify who is in the test and who is in the train set
        X = np.c_[X, ids]
        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.3)  # , random_state=0
        # store ids
        train_ids = X_train[:, -1]
        test_ids = X_test[:, -1]
        # remove ids
        X_train = np.delete(X_train, -1, 1).astype(np.float)
        X_test = np.delete(X_test, -1, 1).astype(np.float)
        return X_train, X_test, y_train, y_test, train_ids, test_ids

    def get_columns_from_selected_features(self, featureNames):
        # note: assumes features_selector exposes get_support
        # (e.g. a SelectKBest-style selector)
        all_names = self.all_feature_names()
        featureNames = set(featureNames)
        columns = []
        for i, j in enumerate(
                self.features_selector.get_support(indices=True)):
            if all_names[j] in featureNames:
                columns.append(i)
        return columns

    def get_columns_for_features(self, featureNames):
        all_names = self.all_feature_names()
        cols = []
        for feature in featureNames:
            cols.append(all_names.index(feature))
        return cols

    def standard_prepare(self, X, y, fit=True, cross_validate=True):
        X, y = self.vectorize(X, y, fit)
        X, y = self.select_features(X, y, fit)
        X = np.array(X)
        y = np.array(y)
        self.X_unscaled = X
        self.y_unscaled = y
        X, y = self.scale(X, y, fit)
        self.X_scaled = X
        self.y_scaled = y
        return X, y

    def prepare(self, X, y, ids, fit=True, cross_validate=True):
        X, y = self.standard_prepare(X, y, fit, cross_validate)
        self.ids = ids
        (self.X_train, self.X_test, self.y_train, self.y_test,
         self.train_ids, self.test_ids) = self.split_data(
             X, y, ids, cross_validate)

    def apply_set(self, bidders):
        # TODO: first apply filtering - then split the data
        self.uX_scaled = []
        self.uy_scaled = []
        n = len(self.ids)
        for i in xrange(n):
            if self.ids[i] in bidders:
                self.uX_scaled.append(self.X_scaled[i])
                self.uy_scaled.append(self.y_scaled[i])
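A minimal usage sketch of the pipeline above. The dict-of-features input is what DictVectorizer expects; the synthetic data, the label rule tied to `f0`, and the up-front creation of the `features/` directory (needed by the joblib dump in `save_features`) are all assumptions for the sake of a runnable example:

    import os
    import numpy as np

    if not os.path.exists('features'):
        os.makedirs('features')

    # hypothetical toy data: 200 samples with 30 numeric features each,
    # labels driven by feature f0 so the selectors have something to find
    rng = np.random.RandomState(0)
    X_raw = [dict(('f%d' % j, rng.randn()) for j in range(30))
             for _ in range(200)]
    y = list((np.array([x['f0'] for x in X_raw]) > 0).astype(int))
    ids = range(200)

    model = Model()
    model.prepare(X_raw, y, ids, fit=True, cross_validate=True)
    print model.X_train.shape, model.X_test.shape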
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# this part is used if we want to fit an svm to find important variables;
# commented out as it was decided that logistic regression gave a better result
# lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(Xx, yy)
# model = SelectFromModel(lsvc, prefit=True)
# feat_list_3 = Xx.iloc[:, list(model.get_support(indices=True))].columns

# this part is used if we want to fit logistic regression to find important variables
lr = LogisticRegression(C=0.000000001, penalty='l2', dual=False,
                        solver='lbfgs').fit(Xx, yy)
model = SelectFromModel(lr, prefit=True)

# take the column names found to be important according to the
# logistic regression coefficients
feat_list_3 = Xx.iloc[:, list(model.get_support(indices=True))].columns

# this is a list of the variables common to all three variable selection methods;
# these can likely be considered quite important, as all three methods picked them up
# list(set(feat_list_1) & set(feat_list_2) & set(feat_list_3))

# this 'master' list combines all the elements from the three variable selection
# methods; overall it reduces our dimensionality from 79 down to 33, which
# generally makes for a more robust model
important_cols = list(
    set(list(feat_list_1) + list(feat_list_2) + list(feat_list_3)))

# as we have reduced the number of features used for training/testing,
# we must also reduce our final test set
X_test_final = df_test.loc[:, important_cols].values

# this is the main split; taking all the important columns from our
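For reference, a self-contained sketch of the same union-of-selectors idea on synthetic data. `feat_list_1` and `feat_list_2` are not defined in this excerpt, so the sketch assumes they come from two other selectors (here, tree importances and a univariate F-test); those stand-ins and all names below are hypothetical:

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
    from sklearn.linear_model import LogisticRegression

    X_arr, yy = make_classification(n_samples=300, n_features=20,
                                    n_informative=5, random_state=0)
    Xx = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(20)])

    # selector 1: tree-based importances (assumed stand-in for feat_list_1)
    rf = SelectFromModel(RandomForestClassifier(n_estimators=100,
                                                random_state=0)).fit(Xx, yy)
    feat_list_1 = Xx.columns[rf.get_support()]

    # selector 2: univariate F-test (assumed stand-in for feat_list_2)
    kb = SelectKBest(f_classif, k=5).fit(Xx, yy)
    feat_list_2 = Xx.columns[kb.get_support()]

    # selector 3: logistic regression coefficients, as in the snippet above
    lr = LogisticRegression(solver='lbfgs').fit(Xx, yy)
    feat_list_3 = Xx.columns[SelectFromModel(lr, prefit=True).get_support()]

    # union of all three selections
    important_cols = list(set(list(feat_list_1) + list(feat_list_2) +
                              list(feat_list_3)))
    print(sorted(important_cols))

Taking the union rather than the intersection keeps any feature at least one method considers useful, trading a little dimensionality for lower risk of discarding signal.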