def _select_features(self, X, y, best, save_path, method='rfecv', min_features=1): if best: if method == 'rfecv': print('Select best') rfecv = RFECV(estimator=LinearRegression(self.fit_intercept), min_features_to_select=min_features, cv=KFold(3), scoring='neg_mean_squared_error', n_jobs=-1) rfecv.fit_transform(X=X, y=y) self.bf_support_ = rfecv.support_ self.bf_n_features_ = rfecv.n_features_ # Results print("Optimal number of features : %d" % rfecv.n_features_) # Plot number of features VS. cross-validation scores plt.figure(figsize=figsize) plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (neg mean squared error)") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) if save_path != None: plt.savefig(save_path + 'rfecv_feature_selection.png') plt.show() if method == 'chi2': pass else: self.bf_support_ = [True] * X.shape[1]
def feature_selection(X, Y, outcome, method, imp_method, data_dir, verbose=0):
    """Reduce the columns of ``X`` with RFE, ElasticNet or PCA, caching results.

    Parameters
    ----------
    X : ndarray
        Feature matrix, indexed as ``X[:, subset]``.
    Y : ndarray
        Target. A dtype of ``np.int8`` selects the classification estimator
        for RFE; any other dtype selects the regression estimator.
    outcome, imp_method : str
        Only used to build the cache file names.
    method : {'RFE', 'PCA', 'ElasticNet'}
        Reduction strategy.
    data_dir : str
        Directory holding the cache files (``feature_subset_*.h5`` for the
        selector mask, ``pca_comp_*.pkl`` for the fitted PCA).
    verbose : int
        Truthy values enable progress messages and are forwarded to ``RFECV``.

    Returns
    -------
    ndarray
        The reduced feature matrix.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('RFE', 'PCA', 'ElasticNet'):
        raise ValueError("{} not supported.".format(method))

    is_classf = Y.dtype == np.int8
    feature_subset_path = os.path.join(
        data_dir,
        'feature_subset_{}_{}_{}.h5'.format(outcome, method, imp_method))

    # A cached support mask short-circuits everything else.
    if os.path.exists(feature_subset_path):
        if verbose:
            print("Feature subset already exists. Loading {}...".format(
                feature_subset_path))
        with h5py.File(feature_subset_path, 'r') as hf:
            subset = hf[method][:]
        return X[:, subset]

    selector = None
    if method == 'RFE':
        base = LinearSVC() if is_classf else LinearSVR()
        selector = RFECV(base, step=0.1, cv=5, n_jobs=-1, verbose=verbose)
        X_refined = selector.fit_transform(X, Y)
    elif method == 'ElasticNet':
        selector = SelectFromModel(ElasticNetCV(cv=10, n_jobs=-1))
        X_refined = selector.fit_transform(X, Y)
    else:
        pca_path = os.path.join(
            data_dir, 'pca_comp_{}_{}.pkl'.format(outcome, imp_method))
        if os.path.exists(pca_path):
            print("PCA components already exist. Loading {}...".format(
                pca_path))
            pca = joblib.load(pca_path)
            X_refined = pca.transform(X)
        else:
            var_thr = 0.99
            cumulative = PCA().fit(X).explained_variance_ratio_.cumsum()
            exceeds = cumulative > var_thr
            if exceeds.any():
                # argmax gives the zero-based index of the first component
                # that crosses the threshold, so the count is index + 1.
                n_components = int(np.argmax(exceeds)) + 1
            else:
                # Rounding kept the cumulative sum below the threshold.
                n_components = len(cumulative)
            if verbose:
                print("Number of selected features:", n_components)
            pca = PCA(n_components=n_components)
            X_refined = pca.fit_transform(X)
            joblib.dump(pca, pca_path)

    # PCA has no support mask; only RFE / ElasticNet are cached in HDF5.
    if selector is not None:
        with h5py.File(feature_subset_path, 'w') as hf:
            hf.create_dataset(method, data=selector.get_support())
    return X_refined
def select_features_univariate(X, y, method='Decision_Tree'):
    """Keep only the features of ``X`` that a chosen criterion finds useful.

    With high dimensional datasets it aids classifier performance to select
    features of interest. This function rejects features below a certain
    threshold.

    Parameters
    ----------
    X : ndarray
        repetitions by features
    y : ndarray
        vector of labels of each repetition
    method : string
        function used for data reduction, matched case-insensitively:
        {'decision_tree', 'decision_tree_RFECV', 'mutual_information',
        'univariate_select'}

    Returns
    -------
    X_transformed : ndarray
        repetitions by features (reduced)
    weights : ndarray
        relative importance of every feature (``decision_tree``,
        ``mutual_information``) or a boolean keep-mask
        (``decision_tree_RFECV``, ``univariate_select``)

    Raises
    ------
    ValueError
        If ``method`` names none of the supported strategies.
    """
    # Case-insensitive match so that the historical default 'Decision_Tree'
    # actually selects the decision-tree branch.
    key = method.lower()

    if key == 'decision_tree_rfecv':
        trans = RFECV(DecisionTreeClassifier())
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    elif key == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        importances = clf.feature_importances_
        # choose features with an importance that is more than avg.
        X_transformed = X[:, importances > importances.mean()]
        weights = importances  # continuous
    elif key == 'mutual_information':
        mutual_info = mutual_info_classif(X, y)
        # choose features above the avg mutual information threshold.
        X_transformed = X[:, mutual_info > mutual_info.mean()]
        weights = mutual_info  # continuous
    elif key == 'univariate_select':
        # select features with more univariate activity than avg.
        trans = GenericUnivariateSelect(
            score_func=lambda X, y: X.mean(axis=0),
            mode='percentile',
            param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    else:
        raise ValueError("Unknown feature selection method: %r" % (method,))

    return X_transformed, weights
def figs_of_RFE(self, model=None):
    """Plot the cross-validated score against the number of features kept.

    Runs ``RFECV`` (backward elimination: start from all features and
    repeatedly drop the weakest remaining ones) on ``self.train_X`` /
    ``self.train_y`` with ``self.score`` as the scoring, then shows the curve.

    Parameters
    ----------
    model : estimator
        Base model handed to ``RFECV`` as ``estimator``.
    """
    selector = RFECV(estimator=model, scoring=self.score)
    # Only the fitted scores are plotted; the transformed matrix is not needed.
    selector.fit(self.train_X, self.train_y)

    # ``grid_scores_`` is gone from newer scikit-learn; ``cv_results_`` holds
    # the same mean scores there.
    if hasattr(selector, 'cv_results_'):
        cv_scores = selector.cv_results_['mean_test_score']
    else:
        cv_scores = selector.grid_scores_

    # The estimator's repr looks like "Name(params)"; keep just the name.
    model_name = str(model).split('(')[0]
    plt.figure()
    plt.title('RFECV of {}'.format(model_name))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.grid()
    plt.show()
def transformer(train, test, train_y):
    """Scale the data and select features with cross-validated RFE.

    Rows containing missing values are dropped from both frames, a
    ``RobustScaler`` is fitted on the training rows, and an ``RFECV`` built on
    a decision tree (5-fold) picks the feature subset.

    Parameters
    ----------
    train : DataFrame
        Training features.
    test : DataFrame
        Test features.
    train_y : array-like
        Target for ``train``, one entry per row of the *original* ``train``.

    Returns
    -------
    train, test : numpy arrays
        Transformed features. Rows with missing values have been removed, so
        callers must drop the same rows from their own targets.
    """
    import numpy as np

    # Drop incomplete training rows and the matching targets together;
    # dropping only the rows would leave X and y with different lengths.
    complete = train.notnull().all(axis=1).values
    train = train[complete]
    train_y = np.asarray(train_y)[complete]
    test = test.dropna()

    scaler = RobustScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    rfexv = RFECV(DecisionTreeClassifier(), cv=5)
    train = rfexv.fit_transform(train, train_y)
    test = rfexv.transform(test)
    return train, test
def RFEtrain(self, data):
    """Select features with RFECV on ``self.svm``, then tune and train.

    The reduced matrix is passed to ``self._grid_search`` to obtain the
    tuned model, the selection curve is plotted to ``self.fsSavepath`` and
    ``self.trainproc`` is run on the reduced data with the tuned model.

    Two earlier variants (grid search before RFECV, and RFECV without any
    grid search) were tried and abandoned in favour of this ordering.
    """
    # NOTE(review): KFold(len(data)) yields one fold per sample only under
    # the model_selection API - confirm which KFold this module imports.
    folds = KFold(len(data))
    selector = RFECV(
        estimator=self.svm,
        step=1,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
    )
    reduced = selector.fit_transform(data, self.label)
    tuned_model = self._grid_search(reduced, self.label)
    plot_feature_selected(selector, self.fsSavepath)
    self.trainproc(reduced, tuned_model)
def plot_RFE(X, y):
    """Run RFECV with an L1 linear SVM, plot its CV curve and report stats.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels; also used to build the 2-fold stratified splitter.

    Returns
    -------
    (X_RFE, rfecv) : tuple
        The reduced feature matrix and the fitted ``RFECV`` object.

    Notes
    -----
    Written against the legacy ``sklearn.cross_validation`` API, where
    ``StratifiedKFold`` takes the labels as its first argument.
    ``LinearSVC`` and ``CV_multi_stats`` are not imported here and are
    expected at module scope.
    """
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(penalty='l1', loss='l2', dual=False,
                    class_weight='auto', multi_class='ovr')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2),
                  scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)

    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    # The curve shows the f1 score requested above, not a misclassification
    # count.
    pl.ylabel("Cross validation score (f1)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()

    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
def featureSelectAndClassifyRFECV(X_train, X_test, y_train, y_test):
    """Min-max scale, select features with a random-forest RFECV and report.

    Prints the selected feature count, the test-set score of the fitted
    ``RFECV`` and the shapes involved, then plots the cross-validation curve.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Matching class labels.
    """
    min_features = 5

    scaler = MinMaxScaler()
    X_train_minmax = scaler.fit_transform(X_train)
    X_test_minmax = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=50, max_depth=20)
    rfecv = RFECV(estimator=rf, step=1, min_features_to_select=min_features,
                  cv=StratifiedKFold(5), scoring='accuracy')
    X_train_transformed = rfecv.fit_transform(X_train_minmax, y_train)
    X_test_transformed = rfecv.transform(X_test_minmax)
    score = rfecv.score(X_test_minmax, y_test)

    print('Optimal no. of features are ' + str(rfecv.n_features_))
    print('Score for test set is ' + str(score))
    print(rfecv.ranking_.shape)
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)

    # ``grid_scores_`` is gone from newer scikit-learn; ``cv_results_`` holds
    # the same mean scores there.
    if hasattr(rfecv, 'cv_results_'):
        cv_scores = rfecv.cv_results_['mean_test_score']
    else:
        cv_scores = rfecv.grid_scores_

    plt.figure()
    plt.xlabel('no. of features')
    plt.ylabel('cv score')
    # Elimination stops at ``min_features``, so the first score belongs to
    # that many features rather than to a single one.
    plt.plot(range(min_features, min_features + len(cv_scores)), cv_scores)
    plt.show()
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels, test_data, number_of_features):
    """Fit an RFECV on the training data and apply it to train and test sets.

    Parameters
    ----------
    classifier_model : estimator
        Base estimator handed to ``RFECV``.
    train_data, train_labels : array-like
        Data the selector is fitted on.
    test_data : array-like
        Data reduced with the already fitted selector.
    number_of_features : int or float
        Despite its name this value has always landed in ``RFECV``'s second
        parameter, ``step`` (features removed per iteration). It does not fix
        the number of selected features, which RFECV chooses by
        cross-validation.

    Returns
    -------
    (transformed_train_data, transformed_test_data) : tuple
    """
    # Pass by keyword: current scikit-learn accepts only the estimator
    # positionally, and the keyword makes the real meaning explicit.
    rfe = RFECV(classifier_model, step=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data
def plot_RFE(X, y):
    """Run RFECV with an L1 linear SVM, plot its CV curve and report stats.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels; also used to build the 2-fold stratified splitter.

    Returns
    -------
    (X_RFE, rfecv) : tuple
        The reduced feature matrix and the fitted ``RFECV`` object.

    Notes
    -----
    Written against the legacy ``sklearn.cross_validation`` API, where
    ``StratifiedKFold`` takes the labels as its first argument.
    ``LinearSVC`` and ``CV_multi_stats`` are not imported here and are
    expected at module scope.
    """
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(penalty='l1', loss='l2', dual=False,
                    class_weight='auto', multi_class='ovr')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2),
                  scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)

    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    # The curve shows the f1 score requested above, not a misclassification
    # count.
    pl.ylabel("Cross validation score (f1)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()

    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels,
                               test_data, number_of_features):
    """Fit an RFECV on the training data and apply it to train and test sets.

    Parameters
    ----------
    classifier_model : estimator
        Base estimator handed to ``RFECV``.
    train_data, train_labels : array-like
        Data the selector is fitted on.
    test_data : array-like
        Data reduced with the already fitted selector.
    number_of_features : int or float
        Despite its name this value has always landed in ``RFECV``'s second
        parameter, ``step`` (features removed per iteration). It does not fix
        the number of selected features, which RFECV chooses by
        cross-validation.

    Returns
    -------
    (transformed_train_data, transformed_test_data) : tuple
    """
    # Pass by keyword: current scikit-learn accepts only the estimator
    # positionally, and the keyword makes the real meaning explicit.
    rfe = RFECV(classifier_model, step=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data
def selectFeatures(self, select_model):
    """Run RFECV over ``self.train`` and report what survived.

    The column named ``self.label`` is the target; every other column of
    ``self.train`` is a candidate feature.

    Returns
    -------
    (select_X, select_columns) : tuple
        The reduced feature matrix and the labels of the kept columns.
    """
    target = self.train[self.label]
    candidates = self.train.drop(self.label, axis=1)

    rfecv = RFECV(estimator=select_model, step=self.step, cv=self.cv)
    reduced = rfecv.fit_transform(candidates, target)

    # get_support(True) yields integer positions rather than a boolean mask.
    kept_positions = rfecv.get_support(True)
    return reduced, candidates.columns[kept_positions]
def SelectRFE_DTCV(dataf, targetf):
    """Select columns of ``dataf`` with a decision-tree RFECV (3-fold).

    Parameters
    ----------
    dataf : DataFrame
        Candidate features.
    targetf : DataFrame or Series
        Target; flattened with ``values.ravel()`` before fitting.

    Returns
    -------
    DataFrame
        The kept columns, labelled with their original names and carrying a
        fresh default index.
    """
    rfecv = RFECV(DecisionTreeClassifier(), cv=3)
    reduced = rfecv.fit_transform(dataf.values, targetf.values.ravel())

    # Map the surviving integer positions back to the original column names.
    all_names = dataf.columns.values
    kept_names = [all_names[position] for position in rfecv.get_support(True)]
    return pd.DataFrame(reduced, columns=kept_names)
def _feature_selection(self, data_matrix, target):
    """Reduce ``data_matrix`` with RFECV, falling back to the input on error.

    On success the fitted selector is appended to ``self.feature_selectors``
    and the reduced matrix is returned. Any exception is logged at debug
    level and the unmodified ``data_matrix`` is returned instead.
    """
    try:
        # perform recursive feature elimination
        rfecv = RFECV(self.estimator, step=self.step, cv=self.cv)
        reduced = rfecv.fit_transform(data_matrix, target)
        self.feature_selectors.append(rfecv)
    except Exception as err:
        logger.debug(err)
        return data_matrix
    return reduced
def feature_selection(data_matrix, target):
    """Filter features by SGD model weights, then refine them with RFECV.

    Parameters
    ----------
    data_matrix : array-like
        Feature matrix.
    target : array-like
        Class labels.

    Returns
    -------
    array-like
        ``data_matrix`` restricted to the features surviving both stages.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import SGDClassifier

    estimator = SGDClassifier(average=True, shuffle=True, penalty='elasticnet')

    # First pass: keep features whose elastic-net weights pass the default
    # threshold. Estimators no longer expose ``fit_transform`` themselves;
    # ``SelectFromModel`` is the supported way to select by model weights.
    data_matrix = SelectFromModel(estimator).fit_transform(data_matrix, target)

    # Second pass: recursive feature elimination, dropping 10% per step.
    selector = RFECV(estimator, step=0.1, cv=10)
    data_matrix = selector.fit_transform(data_matrix, target)
    return data_matrix
def selectBestFeaturesRFECV(samples, classifications, featureNames, classifierClass):
    """Select features with RFECV and keep the matching feature names.

    Parameters
    ----------
    samples : array-like or scipy sparse matrix
        Feature matrix; sparse input is densified before fitting.
    classifications : array-like
        Target labels.
    featureNames : sequence
        One name per column of ``samples``.
    classifierClass : object
        Must provide ``getEstimator()`` returning the base estimator.

    Returns
    -------
    list
        ``[reduced_samples, kept_feature_names]``.
    """
    fs = RFECV(classifierClass.getEstimator())

    # Densify only when the input really is sparse; dense input used to be
    # converted to CSR and straight back, copying the data twice for nothing.
    if sprs.issparse(samples):
        samples = samples.toarray()

    samples = fs.fit_transform(samples, classifications)
    support = fs.get_support()
    featureNames = [featureNames[i] for i, keep in enumerate(support) if keep]
    return [samples, featureNames]
def pre_process(data_fname=None,
                target_fname=None,
                correlation_transformation=None,
                normalization=None,
                feature_selection=None,
                min_threshold=None,
                max_threshold=None,
                random_state=1):
    """Load, filter and optionally transform a data matrix and its targets.

    Parameters
    ----------
    data_fname, target_fname :
        Passed to ``_loaddata_matrix`` and ``_load_target`` respectively.
    correlation_transformation : bool-like
        When truthy the matrix is replaced by ``np.corrcoef`` of itself.
    normalization : bool-like
        When truthy ``normalize`` is applied to the matrix.
    feature_selection : bool-like
        When truthy an ``RFECV`` over an ``SGDClassifier`` (step 20,
        shuffled 5-fold stratified CV) reduces the columns.
    min_threshold, max_threshold :
        Forwarded to ``_select_targets``.
    random_state : int
        Seed for the classifier and the CV splitter.

    Returns
    -------
    (data_matrix, y, target_dict) : tuple
        The processed matrix, the label-encoded targets and a dict mapping
        each encoded label to ``target_names[original_label]``.
    """
    # load data
    data_matrix, gene_names, instance_names = _loaddata_matrix(data_fname)

    # prepare target
    y_orig, target_names = _load_target(target_fname)
    y_sel = _select_targets(y_orig,
                            min_threshold=min_threshold,
                            max_threshold=max_threshold)
    logger.info('original num classes: %d', len(set(y_orig)))
    # %s rather than %d: min_threshold defaults to None, which %d rejects.
    logger.info('selected %d classes with more than %s instances',
                len(y_sel), min_threshold)
    data_matrix, y_orig_sel = _filter_dataset(data_matrix, y_orig, y_sel)
    rows, cols = data_matrix.shape
    logger.info('num instances:%d num features:%d', rows, cols)

    lenc = LabelEncoder()
    y = np.array(lenc.fit_transform(y_orig_sel))
    target_dict = {i: target_names[c] for i, c in enumerate(lenc.classes_)}

    # normalization
    if normalization:
        logger.info('Normalization')
        data_matrix = normalize(data_matrix)

    # feature selection
    if feature_selection:
        estimator = SGDClassifier(random_state=random_state)
        cv = StratifiedKFold(n_splits=5, shuffle=True,
                             random_state=random_state)
        selector = RFECV(estimator, step=20, cv=cv)
        data_matrix = selector.fit_transform(data_matrix, y)
        logger.info('Feature selection')
        rows, cols = data_matrix.shape
        logger.info('num instances:%d num features:%d', rows, cols)

    # prepare data matrix
    if correlation_transformation:
        data_matrix = np.corrcoef(data_matrix)
        logger.info('Correlation coefficient transformation')
        rows, cols = data_matrix.shape
        logger.info('num instances:%d num features:%d', rows, cols)

    return data_matrix, y, target_dict
class RFECVFeatureSelection:
    """Pipeline step that reduces ``dataset['features']`` with RFECV."""

    def __init__(self, estimator):
        """Build the selector: 5-fold stratified CV scored on recall."""
        folds = StratifiedKFold(5)
        self._rfecv = RFECV(estimator=estimator, cv=folds, scoring='recall')

    def execute(self, dataset):
        """Replace ``dataset['features']`` with the selected columns.

        ``dataset['features']`` must offer ``toarray()``;
        ``dataset['categories']`` supplies the labels. The same dict is
        returned after being updated in place.
        """
        print('===== Feature selection - RFECV =====')
        dense_features = dataset['features'].toarray()
        labels = dataset['categories']
        dataset['features'] = self._rfecv.fit_transform(dense_features, labels)
        print(dataset['features'].shape)
        return dataset
def _feature_selection(self, data_matrix, target):
    """Reduce ``data_matrix`` with RFECV, falling back to the input on error.

    ``self.step`` is treated as a fraction of the column count and turned
    into an absolute number of features removed per iteration (at least 1).
    On success the fitted selector is appended to ``self.feature_selectors``
    and the reduced matrix is returned. Any exception is logged at debug
    level and the unmodified ``data_matrix`` is returned instead.
    """
    try:
        # perform recursive feature elimination
        n_columns = data_matrix.shape[1]
        per_iteration = max(int(n_columns * self.step), 1)
        rfecv = RFECV(self.estimator, step=per_iteration, cv=self.cv)
        reduced = rfecv.fit_transform(data_matrix, target)
        self.feature_selectors.append(rfecv)
    except Exception as err:
        logger.debug(err)
        return data_matrix
    return reduced
def rfe_filter(feature_filter, finger_feature):
    """Select fingerprint features with a linear-SVC RFECV scored on ROC AUC.

    Parameters
    ----------
    feature_filter :
        Unused; kept for call compatibility.
    finger_feature : array-like
        Candidate feature matrix.

    Returns
    -------
    array-like
        ``finger_feature`` reduced to the selected columns.

    Side effects
    ------------
    Rebinds the module-level ``label`` to ``data["n_np"]`` with ``"p"``
    mapped to 1 and ``"n"`` mapped to 0. ``data`` is read from module scope.
    """
    from sklearn.svm import SVC
    global label
    label = data["n_np"].replace({"p": 1, "n": 0})

    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(5),
                  scoring='roc_auc')
    finger_three = rfecv.fit_transform(finger_feature, label)

    # Single-argument print calls with %-formatting give the same output on
    # Python 2 and Python 3 (the old statement form only parsed on Python 2).
    print("  %s" % (finger_three.shape,))
    print("Optimal number of features : %d" % rfecv.n_features_)
    return finger_three
def trainModel(model, db, indexes, tests, goal):
    """Search test sizes and RFECV step sizes for the best random forest.

    For every test fraction in ``tests`` and every step ``n`` from 1 up to
    (but excluding) the number of feature columns, features are selected
    with ``RFECV`` (10-fold), a 100-tree ``RandomForestRegressor`` is fitted
    on the reduced training data and scored on the reduced test data. The
    best-scoring run that selects between 10 and 39 features is remembered.

    Parameters
    ----------
    model :
        Ignored; a fresh ``RandomForestRegressor`` is built for every run.
        Kept for call compatibility.
    db : DataFrame
        Full data set including the ``goal`` column.
    indexes :
        Row labels of ``db`` to use.
    tests : iterable of float
        Candidate ``test_size`` values for ``train_test_split``.
    goal : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(topModel, topFeatsRF, topFeatsPosRF, topX_train, topX_test,
        topy_train, topy_test)``. When no run qualifies, ``topModel`` is an
        unfitted regressor and the remaining entries are ``0``.
    """
    X = db.drop(goal, axis=1).loc[indexes, :]
    y = db[goal][indexes]

    high_score = 0
    nof = 0  # defined up front so the progress print never hits an unbound name
    topFeatsRF = 0
    topFeatsPosRF = 0
    topModel = RandomForestRegressor(n_estimators=100)
    topX_train = 0
    topX_test = 0
    topy_train = 0
    topy_test = 0

    for t in tests:
        print(t)
        # The split depends only on ``t`` (fixed random_state), so it is
        # computed once per test size instead of once per step.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=t, random_state=0)
        for n in range(1, len(X.columns)):
            model = RandomForestRegressor(n_estimators=100)
            # ``n`` is RFECV's step (features removed per iteration); it is
            # passed by keyword because current scikit-learn accepts only
            # the estimator positionally.
            rfe = RFECV(model, step=n, cv=10)
            X_train_rfe = rfe.fit_transform(X_train, y_train)
            X_test_rfe = rfe.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            if score > high_score and 9 < rfe.n_features_ < 40:
                high_score = score
                nof = rfe.n_features_
                topFeatsPosRF = rfe.support_
                topFeatsRF = X.columns[topFeatsPosRF]
                topModel.fit(X_train_rfe, y_train)
                topX_train = X_train
                topX_test = X_test
                topy_train = y_train
                topy_test = y_test
        print("Score with %d features: %f" % (nof, high_score))

    return topModel, topFeatsRF, topFeatsPosRF, topX_train, topX_test, topy_train, topy_test
def rfecv_fc(X, y, estimator):
    """Select features with RFECV (5-fold, step 1), report them, run ``ml_alg``.

    Parameters
    ----------
    X : array-like
        Feature matrix; wrapped in a DataFrame so columns can be indexed.
    y : array-like
        Target.
    estimator : estimator
        Base estimator for ``RFECV``.

    Returns
    -------
    list
        Column labels of the selected features.

    Notes
    -----
    ``tmp`` (looked up per selected column) and ``ml_alg`` come from module
    scope.
    """
    print('RFECV FEATURE SELECTION:')
    frame = pd.DataFrame(X)
    rfecv = RFECV(estimator, step=1, cv=5)
    X = rfecv.fit_transform(frame, y)
    print('Optimal number of features :', rfecv.n_features_)

    chosen = frame.columns[rfecv.support_]
    print('Best features index :', chosen)
    print('Best features:')
    for column in chosen:
        print(tmp[column])

    ml_alg(X, y)
    return chosen.tolist()
def apply(self, X_mat, y_train):
    """Run RFECV with a linear SVC and return the reduced data as a DataFrame.

    Every entry of ``self.rfe_settings['kwargs']`` except ``'estimator'`` is
    forwarded to ``RFECV``. The configured estimator name is ignored: a
    linear-kernel ``SVC`` is always used.
    """
    configured = self.rfe_settings['kwargs']
    rfecv_options = {
        name: value
        for name, value in configured.items()
        if name != 'estimator'
    }
    rfecv = RFECV(SVC(kernel='linear'), **rfecv_options)
    return pd.DataFrame(rfecv.fit_transform(X_mat, y_train))
def feature_selection(train_data, train_target, test_data, unknown_data):
    """Select features with a Lasso-based RFECV (3-fold) fitted on the train set.

    This is the routine that determined which columns are removed elsewhere;
    it is not called on every run because it is slow.

    Returns
    -------
    (train, test, unknown) : tuple
        The three inputs reduced to the selected columns.
    """
    rfecv = RFECV(Lasso(), cv=3)
    train = rfecv.fit_transform(train_data, train_target)
    test, unknown = [rfecv.transform(part) for part in (test_data, unknown_data)]
    # mask of used and deleted columns
    print(rfecv.support_)
    return (train, test, unknown)
def RFECV_DT(df, test_size=0.3, cv=5, min_features_to_select=7, max_depth=4):
    """Return ``df`` restricted to the features a decision-tree RFECV ranks first.

    Parameters
    ----------
    df : DataFrame
        Data with the target in a column named ``'class'``.
    test_size : float
        Fraction held out by the stratified split; the selector is fitted on
        the remaining training part only.
    cv : int
        Number of stratified folds used by ``RFECV``.
    min_features_to_select : int
        Forwarded to ``RFECV``.
    max_depth : int
        Depth limit of the decision tree.

    Returns
    -------
    DataFrame
        All rows of ``df``: the rank-1 feature columns followed by ``'class'``.
    """
    X = df.drop(['class'], axis=1)
    y = df['class']
    # Only the training part is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=1, stratify=y)

    rfecv = RFECV(DecisionTreeClassifier(max_depth=max_depth),
                  min_features_to_select=min_features_to_select,
                  cv=StratifiedKFold(cv))
    # Only the ranking is used, so the train and test matrices are not
    # transformed.
    rfecv.fit(X_train, y_train)

    rank = pd.DataFrame({'feature': X.columns, 'rank': list(rfecv.ranking_)})
    rank = rank.sort_values(by=['rank'], ascending=True)
    selected = rank['feature'][rank['rank'] == 1]
    return pd.concat([df[selected], df['class']], axis=1)
def recursive_feature_elimination_cv(input_data, feature_names, step=0.1, cv=3, estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with cross
    validation, uses scikit-learn's RFECV see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

    If feature_names is given it is also returned with any features from
    x_train and x_test also removed from feature_names.

    Three-dimensional inputs are flattened with ``flatten`` before selection
    and restored with ``make3D`` afterwards.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        feature_names: The names of all features before feature selection
            or None. Must support boolean-mask indexing when given.
        estimator (object): Passed to RFECV, see documentation
        step (int or float): Passed to RFECV, see documentation
        cv (int): Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, args
            where args is a dict recording step, cv and estimator.
    """
    x_train, y_train, x_test, y_test = input_data[:4]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Keywords are required: positionally, ``cv`` can land in
    # ``min_features_to_select`` or be rejected outright, depending on the
    # scikit-learn version.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}
    return output_data, feature_names, args
def select_by_RFECV(self, model=None):
    """Plot the cross-validated score against the number of features kept.

    Runs ``RFECV`` (backward elimination: start from all features and
    repeatedly drop the weakest remaining ones) on ``self.train_X`` /
    ``self.train_y`` with the default scoring, then shows the curve.

    Parameters
    ----------
    model : estimator
        Base model handed to ``RFECV`` as ``estimator``.
    """
    selector = RFECV(estimator=model)
    # Only the fitted scores are plotted; the transformed matrix is not needed.
    selector.fit(self.train_X, self.train_y)

    # ``grid_scores_`` is gone from newer scikit-learn; ``cv_results_`` holds
    # the same mean scores there.
    if hasattr(selector, 'cv_results_'):
        cv_scores = selector.cv_results_['mean_test_score']
    else:
        cv_scores = selector.grid_scores_

    # The estimator's repr looks like "Name(params)"; keep just the name.
    model_name = str(model).split('(')[0]
    plt.figure()
    plt.title('RFECV of {}'.format(model_name))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.grid()
    plt.show()
def performFS(texto, clfFS):
    """Reduce the module-level ``X`` three ways with ``clfFS`` as estimator.

    Two ``HmbFS`` configurations and one ``RFECV`` (the competing baseline)
    share the same 3-fold stratified splitter built from the module-level
    ``y``.

    Parameters
    ----------
    texto :
        Unused label; kept for call compatibility.
    clfFS : estimator
        Estimator driving every selector.

    Returns
    -------
    (X_withHmbFS, X_withHmbFSCV, X_withRFECV) : tuple
    """
    skf = StratifiedKFold(y, random_state=0, n_folds=3)

    X_withHmbFS = HmbFS(clfFS, skf, 1.5, 1.5).fit_transform(X, y)
    X_withHmbFSCV = HmbFS(clfFS, skf, 1.0, 4.0).fit_transform(X, y)

    # Remove roughly a tenth of the columns per RFECV iteration.
    n_columns = len(X[0])
    stepSize = int(np.ceil(n_columns / 10.0))
    competencia = RFECV(clfFS, step=stepSize, cv=skf, scoring='accuracy',
                        verbose=0)
    X_withRFECV = competencia.fit_transform(X, y)

    return X_withHmbFS, X_withHmbFSCV, X_withRFECV
def feature_selection(train_x, train_y, test_x):
    """
    Choose a subset of features with Recursive Feature Elimination.

    RFE is a wrapper-style feature selection technique; it is used here to
    reduce the dimension of the samples and so avoid the curse of
    dimensionality. The selector is a linear SVC evaluated on 10 shuffled
    splits (25% held out each), scored on accuracy.

    Parameters
    ----------
    train_x: features of training data
    train_y: labels of training data
    test_x: features of testing data

    Returns
    -------
    (reduced_train_x, reduced_test_x): both inputs restricted to the
    selected features
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
    selector = RFECV(
        estimator=SVC(kernel="linear"),
        step=1,
        cv=splitter,
        n_jobs=-1,
        scoring='accuracy',
    )
    reduced_train_x = selector.fit_transform(train_x, train_y)
    return reduced_train_x, selector.transform(test_x)
def test_refcv():
    """Compare LinearSVC accuracy on iris with and without RFECV selection.

    Prints the test score on the full feature set first, then the score on
    the RFECV-reduced feature set.
    """
    # Load the data
    iris = load_iris()
    x, y = iris.data, iris.target

    # Feature selection
    x_t = RFECV(LinearSVC(), cv=5).fit_transform(x, y)

    # Split both variants into training and test sets the same way
    split_options = dict(test_size=0.25, random_state=0, stratify=y)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, **split_options)
    x_train_t, x_test_t, y_train_t, y_test_t = model_selection.train_test_split(
        x_t, y, **split_options)

    # Train and evaluate
    baseline = LinearSVC()
    reduced = LinearSVC()
    baseline.fit(x_train, y_train)
    reduced.fit(x_train_t, y_train_t)
    print(baseline.score(x_test, y_test))
    print(reduced.score(x_test_t, y_test_t))
def main(args):
    """Trim features, cross-validate a random forest, then compare with RFECV.

    Steps, in order:
      1. Change into ``args.train_dir`` (a hard-coded default is used when it
         is ``None``) and read ``trainingSetFeatures.csv``.
      2. Use every column except ``classname``, ``Id`` and ``proteinname`` as
         a feature; label-encode ``classname`` as the target.
      3. Filter features with ``SelectFwe(alpha=0.01)`` and then
         ``SelectKBest(k=255)``, keeping ``feature_cols`` in sync.
      4. Cross-validate the random forest (default scoring and f1).
      5. Run ``RFECV`` (step 20, 2-fold, f1), print the surviving feature
         names and the f1 ratio of the reduced set versus the full set.
      6. Plot feature importances of the reduced set.

    Parameters
    ----------
    args :
        Object with a ``train_dir`` attribute; it is overwritten when ``None``.
    """
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)
    # Keep pandas output compact when frames are printed.
    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)
    # All file names below are relative to the training directory.
    os.chdir(args.train_dir)
    # Label handed to the plotting helper.
    # NOTE(review): the default directory above points at a "Thermophiles"
    # benchmark while this says 'Neuropeptides' - confirm which is intended.
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    # Everything that is not an identifier or the class label is a feature.
    feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']]
    # Converted to an array so it can be indexed with boolean support masks.
    feature_cols=numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values
    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    print("F-test -> ",X.shape)
    feature_cols=feature_cols[Fwe.get_support()]
    # The string below is disabled code kept as a bare string literal; it has
    # no runtime effect.
    ''' FeatSelection_SVM = True if FeatSelection_SVM == True: svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y) X = svc_L1.transform(X, y) print ("L1 SVM Transformed X:",X_L1.shape) feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))] '''

    k = SelectKBest(k=255).fit(X,y)
    X=k.transform(X)
    feature_cols=feature_cols[k.get_support()]

    # Hyper-parameter grid; it is not referenced again in this function.
    param_dist = {"max_depth": [6,9, None], "max_features": ['auto',0.4], "min_samples_leaf": [1,2,3], "bootstrap": [True, False], 'min_samples_split':[2,3], "criterion": [ "gini"], "n_estimators":[100], "n_jobs":[-1]}

    # NOTE(review): min_samples_split= 1 may be rejected by recent
    # scikit-learn releases - confirm against the installed version.
    rf = RandomForestClassifierWithCoef(max_depth= 7, min_samples_split= 1, min_samples_leaf= 2, n_estimators= 50, n_jobs= 2, max_features=
                                        "auto")

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    # Baseline: 8 stratified shuffle splits with 20% held out each, default
    # scoring of the estimator.
    scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1') #average_precision , recall
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # f1 on the reduced feature set, as a percentage of the full-set f1.
    RFE_ScoreRatio = 100*(cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1').mean())/scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Script: for a growing window of feature columns, select features with RFECV
# and tune a logistic regression on the selected columns.
df1 = pd.read_csv('EventDetectionData.csv')
# Not appended to in the visible part of this script.
scores = []
for i in range(150, 200):
    # Reset on every iteration; not filled in the visible part of the loop.
    score = []
    # Columns 1 .. i-1 are the features; 'target' is the label.
    X_train, X_test, y_train, y_test = train_test_split(
        df1.iloc[:, 1:i], df1['target'], test_size=0.3,
        random_state=69)  # 70% training and 30% test
    log = LogisticRegression()
    rfecv = RFECV(estimator=log, step=1, cv=5, scoring='roc_auc')
    X_train_new = rfecv.fit_transform(X_train, y_train)
    X_test_new = rfecv.transform(X_test)
    # Number of features RFECV kept for this window.
    j = rfecv.n_features_
    # C = 1e-5, 1e-4, ..., 1e0
    C_range = 10.**np.arange(-5, 1)
    # NOTE(review): whether 'l1' is accepted depends on the solver used by
    # the installed LogisticRegression default - confirm.
    penalty_options = ['l1', 'l2']
    param_grid = dict(C=C_range, penalty=penalty_options)
    grid = GridSearchCV(log, param_grid, cv=5, scoring='roc_auc')
    grid.fit(X_train_new, y_train)
    y_train_pred = grid.predict(X_train_new)
# Script fragment: for each of 4 windows ("vent"), select features, project
# them with LDA and cross-validate a KNN classifier. ``Datos``, ``grupos``,
# ``selector_rfecv`` and ``lda`` are defined outside this fragment.
Caracteristicas = Datos['CaracteristicasD']
etiqueta = Datos['etiquetas']
# Flatten the labels to a 1-D vector with one entry per row.
etiquetas = etiqueta.reshape((etiqueta.shape[0]))
# Result tables indexed [window, column]; only column 0 (KNN) is written in
# this fragment.
Resultados = np.zeros(shape=(4, 5))
ResultadosSTD = np.zeros(shape=(4, 5))
# Per-fold scores, one slot per cross-validation group.
ResultadosCompletos = np.zeros(shape=(4, 5, grupos))
start_time = time.time()
for vent in range(4):
    # Fixed second index into the 'vent' field (same value on every pass).
    desc = 1
    Caracteristicas_vent = Caracteristicas['vent'][0, desc][
        0, vent]  #[0,desc,0,vent]
    Car_dff = pd.DataFrame(Caracteristicas_vent)
    # Feature selection first, then the LDA projection.
    Car_dfS = selector_rfecv.fit_transform(Car_dff, etiquetas)
    Car_df = lda.fit_transform(Car_dfS, etiquetas)
    ############ KNN
    clasificador_knn = KNeighborsClassifier(n_neighbors=10,
                                            weights="uniform")
    accuracy_knn = cross_val_score(clasificador_knn,
                                   X=Car_df,
                                   y=etiquetas,
                                   scoring='accuracy',
                                   cv=grupos,
                                   n_jobs=-1)
    # Store the per-fold scores plus their mean and standard deviation.
    ResultadosCompletos[vent, 0, :] = accuracy_knn
    Resultados[vent, 0] = accuracy_knn.mean()
    ResultadosSTD[vent, 0] = accuracy_knn.std()
class MyApp(QtGui.QMainWindow, Ui_MainWindow):
    """Main window offering three feature-selection actions.

    The actions are variance threshold (``rfwlv``), univariate selection
    (``ufs``) and recursive feature elimination (``rfe``). Each action
    standardises the data, splits it into train and test parts, fits the
    selector and shows the keep/drop mask in a table widget.
    """

    def __init__(self):
        """Create the data holders, build the UI and wire up the widgets."""
        self.x_data = list()
        self.y_data = list()
        QtGui.QMainWindow.__init__(self)
        Ui_MainWindow.__init__(self)
        self.setupUi(self)
        self.rfwlv_action.clicked.connect(self.rfwlv)
        self.ufs_action.clicked.connect(self.ufs)
        self.rfe_action.clicked.connect(self.rfe)
        # Put the standardisation radio buttons into button group bg01
        self.bg01 = QtGui.QButtonGroup()
        self.bg01.addButton(self.s_radio_1,1)
        self.bg01.addButton(self.s_radio_2,2)
        # s_radio_1 is checked by default
        self.s_radio_1.setChecked(True)
        # Put the data-split radio buttons into button group bg02
        self.bg02 = QtGui.QButtonGroup()
        self.bg02.addButton(self.d_radio_1,1)
        self.bg02.addButton(self.d_radio_2,2)
        # d_radio_1 is checked by default
        self.d_radio_1.setChecked(True)

    def rfwlv(self):
        """Run the variance-threshold workflow."""
        self.bz() # standardise
        self.stt() # split the data set
        self.dtc01() #

    def ufs(self):
        """Run the univariate feature-selection workflow."""
        self.bz() # standardise
        self.stt() # split the data set
        self.dtc03() #

    def rfe(self):
        """Run the recursive-feature-elimination workflow."""
        self.bz() # standardise
        self.stt() # split the data set
        self.dtc04() #

    # Data standardisation
    def bz(self):
        """Scale ``self.x_data`` into ``self.x`` (standard or min-max scaling).

        NOTE(review): ``stt`` splits ``self.x_data``, not ``self.x``, so the
        scaled values do not appear to reach the selectors - confirm.
        """
        if self.bg01.checkedId() == 1:
            self.x = preprocessing.scale(self.x_data)
        else:
            min_max_scaler = preprocessing.MinMaxScaler()
            self.x = min_max_scaler.fit_transform(self.x_data)

    # Split into training data and test data
    def stt(self):
        """Fill ``x_train`` / ``x_test`` / ``y_train`` / ``y_test`` from the data.

        With the first split option checked, the text of ``tt_box`` is read
        as ``"a:b"`` and the part before the colon, in tenths, sets the
        training share. Otherwise explicit row counts are read from the
        ``train`` and ``test`` widgets.
        """
        # Split the data; both the independent and the dependent variables
        # are split, which yields four data sets:
        # x_train, x_test, y_train, y_test
        self.x_train = list()
        self.x_test = list()
        self.y_train = list()
        self.y_test = list()
        if self.bg02.checkedId() == 1:
            strte = self.tt_box.itemText(self.tt_box.currentIndex())
            s01 = str(strte).split(':')
            if len(s01) == 2:
                xnum = math.ceil((int(s01[0])*1.0/10)*len(self.x_data))
                for i in range(len(self.x_data)):
                    # NOTE(review): ``<=`` puts xnum + 1 rows into the
                    # training set - confirm this is intended.
                    if i <= xnum:
                        self.x_train.append(self.x_data[i])
                        self.y_train.append(self.y_data[i])
                    else:
                        self.x_test.append(self.x_data[i])
                        self.y_test.append(self.y_data[i])
        else:
            ts01 = int(self.train.text())
            ts02 = int(self.test.text())
            for i in range(ts01+ts02):
                if i < ts01:
                    self.x_train.append(self.x_data[i])
                    self.y_train.append(self.y_data[i])
                else:
                    self.x_test.append(self.x_data[i])
                    self.y_test.append(self.y_data[i])

    ''' 主函数 '''

    def dtc01(self):
        """Fit a ``VarianceThreshold`` and show its mask in ``rfwlv_dtable``."""
        # Flatten y to one dimension: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
        # Collect the distinct labels found in the test targets
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)
        # VarianceThreshold implementation
        # Read the parameters
        # NOTE(review): the emptiness check reads ``th_edit`` but the value
        # is taken from ``md_edit`` - confirm which widget is intended.
        if not self.th_edit.text().strip():
            self.max_depth = 0.0
        else:
            self.max_depth = float(self.md_edit.text())
        # Define the model
        self.clf = VarianceThreshold(threshold=self.max_depth)
        self.clf.fit_transform(self.x_train)
        self.f_c = self.clf.get_support()
        ''' 该模块是对dtable01模块进行设置,即显示训练集的训练结果 '''
        # Show the VarianceThreshold result: a header row spanning every
        # column, then one cell per feature with its keep flag
        self.rfwlv_dtable.setRowCount(2)
        self.rfwlv_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.rfwlv_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.rfwlv_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.rfwlv_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))

    def dtc03(self):
        """Fit the chosen univariate selector and show its mask in ``ufs_dtable``."""
        # Flatten y to one dimension: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
        # Collect the distinct labels found in the test targets
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)
        # SelectKBest implementation
        # Read the parameters (defaults: k = 10, param = 1e-05)
        if not self.kedit.text().strip():
            self.k = 10
        else:
            self.k = int(self.kedit.text())
        if not self.pedit.text().strip():
            self.param = 1e-05
        else:
            self.param = float(self.pedit.text())
        self.mode = self.mo_box.itemText(self.mo_box.currentIndex())
        # Define the model according to the entry chosen in sp_box
        if self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectKBest':
            self.clf = SelectKBest(score_func= f_classif, k=self.k)
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        elif self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectPercentile':
            # self.k doubles as the percentile here
            self.clf = SelectPercentile(score_func= f_classif, percentile= self.k)
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        else:
            self.clf = GenericUnivariateSelect(score_func= f_classif, mode= self.mode, param=self.param)
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        #
        ''' 该模块是对dtable01模块进行设置,即显示训练集的训练结果 '''
        # Show the selection result: a header row spanning every column,
        # then one cell per feature with its keep flag
        self.ufs_dtable.setRowCount(2)
        self.ufs_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.ufs_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.ufs_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.ufs_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))

    def dtc04(self):
        """Fit an ``RFECV`` over a linear SVR and show its mask in ``rfe_dtable``."""
        # Flatten y to one dimension: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
        # Collect the distinct labels found in the test targets
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)
        # RFECV implementation
        # Read the parameters (defaults: step = 1, cv = 5)
        if not self.stepedit.text().strip():
            self.step = 1
        else:
            self.step = int(self.stepedit.text())
        if not self.cvedit.text().strip():
            self.cv = 5
        else:
            self.cv = int(self.cvedit.text())
        # Define the model
        estimator = SVR(kernel="linear")
        self.clf = RFECV(estimator, step=self.step, cv=self.cv)
        self.clf.fit(self.x_train,self.y01_train)
        self.f_c = self.clf.get_support()
        ''' 该模块是对dtable01模块进行设置,即显示训练集的训练结果 '''
        # Show the selection result: a header row spanning every column,
        # then one cell per feature with its keep flag
        self.rfe_dtable.setRowCount(2)
        self.rfe_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.rfe_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.rfe_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.rfe_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))

    # Save the model
    def out_model(self):
        """Ask for a file name and dump the current selector there with joblib."""
        self.filepath=str(QtGui.QFileDialog.getSaveFileName(self,"文件保存","F:/","Model Files (*.model)"))
        joblib.dump(self.clf, self.filepath.decode('GB2312'))
def main(args):
    """Compare random-forest F1 on the filtered features vs. an RFECV subset.

    Changes the working directory to ``args.train_dir`` (a hard-coded default
    is assigned when it is ``None``), reads ``trainingSetFeatures.csv`` from
    there, filters the feature columns with ``SelectFwe`` and ``SelectKBest``,
    cross-validates a random forest, runs ``RFECV`` on it and prints how much
    of the F1 score the reduced feature set retains.  Finally calls
    ``altPlotFeaturesImportance`` on the reduced data.

    Args:
        args: object with a ``train_dir`` attribute; it is overwritten with
            the default path when it is ``None``.
    """
    if args.train_dir is None:
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    feature_cols = numpy.array(
        [col for col in df.columns
         if col not in ['classname', 'Id', 'proteinname']])
    X = df[feature_cols].values
    y = LabelEncoder().fit_transform(df.classname.values)

    # Initial feature selection trimming
    print(X.shape)
    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[Fwe.get_support()]

    # SelectKBest rejects a k larger than the number of columns, so cap it at
    # whatever the F-test filter left over.
    k = SelectKBest(k=min(255, X.shape[1])).fit(X, y)
    X = k.transform(X)
    feature_cols = feature_cols[k.get_support()]

    rf = RandomForestClassifierWithCoef(max_depth=7, min_samples_split=1,
                                        min_samples_leaf=2, n_estimators=50,
                                        n_jobs=2, max_features="auto")

    def shuffle_split():
        # A fresh splitter per evaluation, as every call site used before.
        return cross_validation.StratifiedShuffleSplit(y, n_iter=8,
                                                       test_size=0.2)

    # WARNING! F1 score as implemented by default in binary classification
    # (two classes) gives the score for 1 class.
    scores = cross_validation.cross_val_score(rf, X, y, n_jobs=-1,
                                              cv=shuffle_split())
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Instead of scores_f1, we could also use precision, sensitivity,
    # MCC (if binary), etc.
    scores_f1 = cross_validation.cross_val_score(rf, X, y, n_jobs=-1,
                                                 cv=shuffle_split(),
                                                 scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    rfeSelect = RFECV(estimator=rf, step=20, cv=2, scoring='f1')
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # F1 of the reduced set as a percentage of the F1 on the filtered set.
    reduced_f1 = cross_validation.cross_val_score(rf, X_RFE, y, n_jobs=-1,
                                                  cv=shuffle_split(),
                                                  scoring='f1')
    RFE_ScoreRatio = 100 * (reduced_f1.mean()) / scores_f1.mean()
    print(
        "Even with just", X_RFE.shape[1],
        " features, we have %f performance! (f1 score ratio)" %
        (RFE_ScoreRatio))

    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
def preProcess(theFileName):
    """Analyse a labelled CSV file and produce the report artefacts.

    The file is read with pandas, its ``y`` column is binarised and the other
    columns are one-hot encoded and standardised.  The function then
    undersamples when the minority class is below one third of the rows,
    picks the number of attributes with ``RFECV``, ranks attributes by
    random-forest importance, writes three JPG plots under
    ``flask_files/static/`` plus ``flask_files/static/Report.html``, and
    scores four classifiers on the RFECV-reduced data.

    Args:
        theFileName: path of the CSV file; it must contain a ``y`` column.

    Returns:
        tuple: ``(introReport, tableOne, optimalNumberOfFeatures,
        classifierComparisons, theJPGs)`` where ``introReport`` is a list of
        two strings, ``tableOne`` a list of ``Feature``/``Weight``/``Rank``
        dicts, ``classifierComparisons`` a list of ``Classifier``/``Score``
        dicts and ``theJPGs`` the three image paths relative to
        ``flask_files/``.
    """
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis = 1)

    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)
    X = dp.drop('y', axis = 1)
    y = dp[['y']]

    # get the features
    theFeatures = X.columns

    # convert the dataframes to arrays
    X = X.values
    y = y.values
    y.shape = np.shape(y)[0]
    # Labels aligned with the rows of ``dp``; ``y`` itself is replaced below
    # when undersampling kicks in.
    yOrig = y.copy()

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================
    # apply random undersampling if labels are imbalanced
    labelSkewness = 100*np.sum(y)*1./np.shape(y)[0]
    if np.min([labelSkewness, 100-labelSkewness]) < (100./3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_sample(X, y)

    #=======================================================================
    # select optimal number of features
    thisModel = LogisticRegression(penalty='l1', C=1)
    rfecv = RFECV(estimator=thisModel, step=1, cv=StratifiedKFold(y, n_folds=3),
                  scoring='f1')
    Xt = rfecv.fit_transform(X, y)

    optimalNumberOfFeatures = rfecv.n_features_
    introReport = ['Optimal Number of Attributes: ' + str(optimalNumberOfFeatures),
                   'The following attributes are the most influential to the outcome']

    #=======================================================================
    # plot number of selected features VS cross-validation scores
    plt.figure(figsize=(12, 8))
    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/'+imgOne, dpi=300)
    plt.close()  # release the figure once it is on disk

    #=======================================================================
    # get the feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X,y)
    theImportances = model.feature_importances_

    # Column positions sorted by descending importance.  A stable sort keeps
    # tied importances in column order, so every position appears exactly
    # once (looking a value up with list.index() returns the first tied
    # column again and again).
    ranking = np.argsort(-theImportances, kind='mergesort')
    topRanked = ranking[:optimalNumberOfFeatures]

    # ...and list the selected features along with their weights and ranks
    tableOne = [dict(Feature = str(theFeatures[col]),
                     Weight = str(theImportances[col]),
                     Rank = str(rank))
                for rank, col in enumerate(topRanked, 1)]

    #=======================================================================
    # plot histogram of the most important feature
    topFeature = theFeatures[ranking[0]]
    allThoseFeatures = dp[topFeature]

    plt.figure(figsize=(12, 8))
    combinedOutcomes = plt.hist(allThoseFeatures, bins=10)
    plt.xlabel('Attribute: ' + topFeature, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)

    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/'+imgTwo, dpi=300)
    plt.close()

    #=======================================================================
    # plot impact of the most important feature
    midPoints, posImpact, negImpact = _relativeImpact(
        allThoseFeatures, yOrig, combinedOutcomes[1])

    plt.figure(figsize=(12, 8))
    plt.plot(midPoints, posImpact,'.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size':20})
    plt.xlabel(topFeature, fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()

    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/'+imgThree, dpi=300)
    plt.close()

    #=======================================================================
    # generate plots for report (this is saved to an "html" file)
    from bokeh.charts import Histogram, output_file, show, save, gridplot
    from bokeh.plotting import figure

    plotList=[]
    for col in topRanked:
        thisFeatureIs = theFeatures[col]
        featureValues = dp[thisFeatureIs]

        # Ten equal-width bins over the attribute's range; only the counts
        # are needed here, so nothing is drawn with matplotlib.
        binEdges = np.histogram(np.asarray(featureValues), bins=10)[1]
        midPoints, posImpact, negImpact = _relativeImpact(
            featureValues, yOrig, binEdges)

        hist0 = Histogram(dp, values=thisFeatureIs, color='blue',
                          title="Impact of " + thisFeatureIs, bins=10)
        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        plot0.circle(midPoints, list(negImpact), size=10, color="red",
                     alpha=0.9, legend='Negative')
        plot0.circle(midPoints, list(posImpact), size=10, color="green",
                     alpha=0.9, legend='Positive')
        plotList.append([hist0,plot0])

    output_file("flask_files/static/Report.html", title = "Report")
    hist = gridplot(plotList)
    save(hist)

    #=======================================================================
    # specify the models to run tests with
    theModels = {'Logistic Regression':LogisticRegression(penalty='l1'),
                 'LDA':LinearDiscriminantAnalysis(),
                 'SVM':SVC(kernel='linear'),
                 'Random Forest':RandomForestClassifier(n_estimators=300)}

    # ...then collect the results of the tests
    # NOTE(review): the reported score is the best fold (np.max), not the
    # mean over folds -- confirm that this is intended.
    classifierComparisons=[]
    for aModel, candidate in theModels.items():
        results = cross_validation.cross_val_score(candidate, Xt, y, scoring='f1',
                                                   cv=StratifiedKFold(y, n_folds=3))
        classifierComparisons.append(dict(Classifier = aModel,
                                          Score = np.max(results)))

    #=======================================================================
    # the plots to display
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================
    return introReport, tableOne, optimalNumberOfFeatures, classifierComparisons, theJPGs


def _relativeImpact(values, labels, binEdges):
    """Return bin midpoints and the per-bin share of each class.

    Args:
        values: one attribute value per sample.
        labels: array of 0/1 labels aligned with ``values``.
        binEdges: increasing bin edges, as returned by a histogram call.

    Returns:
        tuple: ``(midPoints, posImpact, negImpact)``.  ``midPoints`` is a
        list of bin centres; the other two are float arrays holding, per bin,
        the fraction of samples labelled 1 and 0.  Bins without any sample
        get 0 for both.
    """
    values = np.asarray(values)
    binEdges = np.asarray(binEdges, dtype=float)

    total = np.histogram(values, bins=binEdges)[0].astype(float)
    occupied = total > 0

    shares = []
    for label in (1, 0):
        counts = np.histogram(values[labels == label], bins=binEdges)[0]
        share = np.zeros_like(total)
        # Divide only where the bin holds samples; empty bins stay at 0.
        share[occupied] = counts[occupied] / total[occupied]
        shares.append(share)

    midPoints = ((binEdges[1:] + binEdges[:-1]) / 2.).tolist()
    return midPoints, shares[0], shares[1]
if FeatSelection_RFECV==True: rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.33), scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1'..." else: rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.1) rfecv.fit(X, y) if FeatSelection_RFECV==True: print("RFEcv selected %d number of Optimal features : " % (rfecv.n_features_)) print("RFE (%d Features) scorer : \n" % (rfecv.n_features_),rfecv.score(X, y) ) print("RFE selected feature names:") featureNames=featureNames[rfecv.get_support()] rfe_featnames = featureNames[rfecv.get_support()] print (rfe_featnames) X_RFE = rfecv.fit_transform(X, y) print(X_RFE.shape,"X_RFE \n") 'Set GetRFEPerf To true or by user, if perf. of reduced set wanted' GetRFEPerf=False print("\n X: \n") ModelParam_GridSearch(X,y,cv=4) if GetRFEPerf==True: print("\n X-RFE: \n") ModelParam_GridSearch(X_RFE,y,cv=4) GetPCAPerf=False if GetPCAPerf==True:
def GetAllPerf (filePaths=None):
    """Benchmark every training-set file and save one summary row per file.

    For each path the data is loaded with ``load_data``, filtered with
    ``SelectKBest`` and ``SelectFwe``, compared against a most-frequent-class
    dummy classifier, and handed to ``ModelParam_GridSearch`` once for F1 and
    once for accuracy.  The two returned estimators are cross-validated and
    the scores are stored in a table that is finally written to
    ``OutputData.tsv`` in the working directory.

    Args:
        filePaths: iterable of feature-file paths.  When ``None`` the paths
            come from ``find_files`` under ``./test_seq`` with the pattern
            ``trainingSetFeatures.csv``.
    """
    # make_scorer is part of the public sklearn.metrics namespace.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        '# Classes', 'Array-Acc-Scores' ,'Array-f1-Scores'
        ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    def record(row, column, value):
        # Write the cell through the frame itself; chained indexing such as
        # ``frame[column][row] = value`` may end up writing to a temporary.
        resDict.at[row, column] = value

    # Custom F1 scorer for the grid search (weighted, micro, macro and None
    # are the available averaging modes).
    f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro")

    for filePath, rawName in zip(filePaths, fileNames):
        # Normalise separators, see
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(rawName)
        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        # 100.0 forces true division regardless of the interpreter version.
        MajorityPercent = round(100.0*y_inv.most_common(1)[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        record(fileName, 'LargestClassPercent', MajorityPercent)
        record(fileName, 'Classes', str(lb_encoder.classes_))
        record(fileName, '# Classes', len(lb_encoder.classes_))

        # This is just temporary for the outputs - saves computation time.
        # Barely filters compared to the model itself.
        KFilt=350

        if KFilt is not None:
            # SelectKBest rejects a k above the number of columns.
            k = SelectKBest(k=min(KFilt, X.shape[1])).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)

        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)

        # EG - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')
            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the selector fitted above rather than fitting it again.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)

            # .loc adds the column, which is not among the initial ones.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True (or let the user set it) if the performance
        # of the reduced set is wanted.
        GetRFEPerf=False

        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Baseline: always predict the most frequent class.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        # A scorer object is called as scorer(estimator, X, y), so the
        # baseline F1 is computed with the metric function directly.
        # NOTE(review): the weighted average is stored because the column is
        # named "..._weighted"; f1_scorer itself averages "micro" -- confirm
        # which value is wanted.
        dummy_freq_f1_weighted = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))
        # Unweighted mean of the per-class F1 scores.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        print()

        record(fileName, 'dummy_freq:Accuracy', dummy_freq_acc)
        record(fileName, 'dummy_freq:f1', dummy_freq_f1_mean)
        record(fileName, 'dummy_freq_f1_weighted', dummy_freq_f1_weighted)

        # We can get separately the best model for Acc, and the best for f1!
        # WARNING!? In the binary case the default F1 works for the 1 class,
        # in sklearn 0.15 and lower; f1_scorer is the temporary workaround.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)
        print("bestEst (acc):",bestEst_acc)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # Modified to get 2 estimators
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        record(fileName, 'Accuracy', round(scores_acc.mean(),4))
        record(fileName, 'Accuracy_SD', round(scores_acc.std(),4))
        record(fileName, 'f1', round(scores_f1.mean(),4))
        record(fileName, 'f1_SD', round(scores_f1.std(),4))
        record(fileName, 'Array-f1-Scores', scores_f1)
        record(fileName, 'Array-Acc-Scores', scores_acc)
        record(fileName, 'bestML-f1', str(bestEst_f1))
        record(fileName, 'bestML-Acc', str(bestEst_acc))

        print()

    print("Saving results to file")
    # NOTE(review): the file is named .tsv but written comma-separated; left
    # unchanged because readers of the file may depend on it.
    resDict.to_csv("OutputData.tsv", sep=',')
X_test = vectorizer.transform(test_features) #scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True) scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True) X_train = scaler.fit_transform(X_train.toarray()) X_test = scaler.transform(X_test.toarray()) do_feature_elimination = False if do_feature_elimination: estimator = RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None) selector = RFECV(estimator, step=1, cv=5, scoring='log_loss') X_train = selector.fit_transform(X_train, train_labels) print 'after feature elimination', X_train.shape X_test = selector.transform(X_test) do_feature_selection = False if do_feature_selection: ch2 = SelectKBest(chi2, k=4000) X_train = ch2.fit_transform(X_train, train_labels) X_test = ch2.transform(X_test) do_pca = False if do_pca: k = 100 add_pca_to_original = True X_train = X_train.toarray()