class f_regressionFWEPrim(primitive):
    """Feature-selection primitive wrapping sklearn's SelectFwe.

    Keeps the features whose f_regression p-values pass a family-wise
    error rate threshold (alpha=0.05) for regression tasks.
    """

    def __init__(self, random_state=0):
        super(f_regressionFWEPrim, self).__init__(name='f_regressionFWE')
        self.id = 39
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Select the p-values corresponding to Family-wise error rate with F-value between label/feature for regression tasks."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        # Delegate to the shared acceptance check for regression data.
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # Selection is pointless with fewer than three features.
        return data['X'].shape[1] >= 3

    def fit(self, data):
        prepared = handle_data(data)
        self.selector = SelectFwe(f_regression, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        output = handle_data(data)
        column_names = list(output['X'].columns)
        try:
            keep_mask = self.selector.get_support(indices=False)
            kept_columns = list(compress(column_names, keep_mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                       columns=kept_columns)
        except Exception as err:
            print(err)
        return {0: output}
def test_boundary_case_ch2():
    """Boundary case: every selection strategy should keep exactly the
    first of the two features."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    expected_support = np.array([True, False])
    strategies = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for strategy in strategies:
        strategy.fit(X, y)
        assert_array_equal(strategy.get_support(), expected_support)
def test_select_fwe_classif():
    """SelectFwe and GenericUnivariateSelect(mode='fwe') must agree on an
    easy classification problem, recovering (roughly) the informative
    features."""
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2, n_repeated=0,
                               n_classes=8, n_clusters_per_class=1,
                               flip_y=0.0, class_sep=10, shuffle=False,
                               random_state=0)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_reduced = fwe_filter.fit(X, y).transform(X)
    X_generic = GenericUnivariateSelect(f_classif, mode='fwe',
                                        param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_reduced, X_generic)

    support = fwe_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # At most one feature may disagree with the ground truth.
    assert np.sum(np.abs(support - gtruth)) < 2
def test_select_heuristics_classif():
    """The fdr/fpr/fwe modes of GenericUnivariateSelect must reproduce
    SelectFwe on an easy classification problem."""
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2, n_repeated=0,
                               n_classes=8, n_clusters_per_class=1,
                               flip_y=0.0, class_sep=10, shuffle=False,
                               random_state=0)

    reference_filter = SelectFwe(f_classif, alpha=0.01)
    X_reference = reference_filter.fit(X, y).transform(X)

    gtruth = np.zeros(20)
    gtruth[:5] = 1

    for mode in ['fdr', 'fpr', 'fwe']:
        X_mode = GenericUnivariateSelect(f_classif, mode=mode,
                                         param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_reference, X_mode)
        support = reference_filter.get_support()
        assert_array_almost_equal(support, gtruth)
def test_select_heuristics_classif():
    """fdr/fpr/fwe GenericUnivariateSelect heuristics should select the
    same features as a plain SelectFwe filter."""
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0,
    )

    fwe = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe.fit(X, y).transform(X)

    expected = np.zeros(20)
    expected[:5] = 1

    for heuristic in ["fdr", "fpr", "fwe"]:
        generic = GenericUnivariateSelect(f_classif, mode=heuristic, param=0.01)
        assert_array_equal(X_fwe, generic.fit(X, y).transform(X))
        assert_array_almost_equal(fwe.get_support(), expected)
def test_boundary_case_ch2():
    """Boundary case: each selector strategy keeps exactly the first of
    two candidate features."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    chi2_scores, chi2_pvalues = chi2(X, y)
    assert_array_almost_equal(chi2_scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(chi2_pvalues, np.array([0.04550026, 0.39802472]))

    def fitted_support(selector):
        # Fit in place and return the boolean support mask.
        return selector.fit(X, y).get_support()

    want = np.array([True, False])
    assert_array_equal(fitted_support(SelectFdr(chi2, alpha=0.1)), want)
    assert_array_equal(fitted_support(SelectKBest(chi2, k=1)), want)
    assert_array_equal(fitted_support(SelectPercentile(chi2, percentile=50)), want)
    assert_array_equal(fitted_support(SelectFpr(chi2, alpha=0.1)), want)
    assert_array_equal(fitted_support(SelectFwe(chi2, alpha=0.1)), want)
def test_select_fwe_classif():
    """SelectFwe(alpha=0.01) and GenericUnivariateSelect in 'fwe' mode must
    keep the same features on an easy classification task."""
    X, Y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0,
    )

    direct = SelectFwe(f_classif, alpha=0.01)
    reduced_direct = direct.fit(X, Y).transform(X)
    generic = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    reduced_generic = generic.fit(X, Y).transform(X)
    assert_array_equal(reduced_direct, reduced_generic)

    expected = np.zeros(20)
    expected[:5] = 1
    # No more than one feature may differ from the expected support.
    mismatches = np.sum(np.abs(direct.get_support() - expected))
    assert mismatches < 2
def SelectFwe_selector(data, target, sf):
    """Filter ``data`` down to the columns kept by sklearn's SelectFwe.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; column names are preserved in the result.
    target : pandas.DataFrame or pandas.Series
        Target values; flattened with ``.ravel()`` before fitting.
    sf : callable
        Score function for SelectFwe (e.g. ``f_classif``, ``f_regression``).

    Returns
    -------
    pandas.DataFrame
        The selected feature columns, with their original names.
    """
    selector = SelectFwe(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    kept_indices = selector.get_support(True)
    # Map the selected column indices back to their original names.
    # (The original comment called these the "K best" features, but SelectFwe
    # keeps every feature passing the FWE p-value test, not a fixed K.)
    new_features = [data.columns.values[ind] for ind in kept_indices]
    return pd.DataFrame(data_new, columns=new_features)
def test_select_fwe_regression():
    """SelectFwe and GenericUnivariateSelect(mode='fwe') must agree on a
    simple regression problem and keep the 5 informative features."""
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe",
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the drop-in replacement.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    excluded = ['class', 'group', 'guess']
    train_rows = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[train_rows].drop(excluded, axis=1)
    training_class_vals = training_testing_data.loc[train_rows, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=0.042)
        selector.fit(training_features, training_class_vals)

    kept = selector.get_support(True)
    expected_cols = list(training_features.iloc[:, kept].columns) + excluded
    assert np.array_equal(tpot_obj._select_fwe(training_testing_data, 0.042),
                          training_testing_data[expected_cols])
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    is_training = training_testing_data['group'] == 'training'
    training_features = (training_testing_data
                         .loc[is_training]
                         .drop(non_feature_columns, axis=1))
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    with warnings.catch_warnings():
        # Constant features can raise UserWarnings we do not care about here.
        warnings.simplefilter('ignore', category=UserWarning)
        sklearn_selector = SelectFwe(f_classif, alpha=0.042)
        sklearn_selector.fit(training_features, training_class_vals)

    support_indices = sklearn_selector.get_support(True)
    mask_cols = list(training_features.iloc[:, support_indices].columns) + non_feature_columns

    result_tpot = tpot_obj._select_fwe(training_testing_data, 0.042)
    assert np.array_equal(result_tpot, training_testing_data[mask_cols])
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the drop-in replacement.
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
def test_select_fwe_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fwe heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    direct_filter = SelectFwe(f_regression, alpha=0.01)
    X_direct = direct_filter.fit(X, Y).transform(X)
    X_generic = (GenericUnivariateSelect(f_regression, mode='fwe', param=0.01)
                 .fit(X, Y)
                 .transform(X))
    assert_array_equal(X_direct, X_generic)

    support = direct_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    # All five informative features must be kept ...
    assert (support[:5] == 1).all()
    # ... and at most one spurious feature may slip through.
    assert (np.sum(support[5:] == 1) < 2)
def _select_fwe(self, input_df, alpha):
    """ Uses Scikit-learn's SelectFwe feature selection to filter the subset of features
    according to p-values corresponding to Family-wise error rate

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    alpha: float in the range [0.001, 0.05]
        The highest uncorrected p-value for features to keep

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the 'best' features
    """
    training_rows = input_df['group'] == 'training'
    training_features = input_df.loc[training_rows].drop(['class', 'group', 'guess'], axis=1)
    training_class_vals = input_df.loc[training_rows, 'class'].values

    # Clamp alpha into the supported [0.001, 0.05] range.
    alpha = min(alpha, 0.05)
    if alpha <= 0.001:
        alpha = 0.001

    # Nothing to select from: return the input untouched.
    if len(training_features.columns.values) == 0:
        return input_df.copy()

    with warnings.catch_warnings():
        # Ignore warnings about constant features
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=alpha)
        selector.fit(training_features, training_class_vals)

    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
    return input_df[mask_cols].copy()
# Impute missing values with the previously configured imputer.
imputer = imputer.fit(X)
X = imputer.transform(X)

#feature scaling
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
# NOTE(review): X_norm is computed but the unscaled X is what gets fed to
# SelectFwe below — confirm whether scaling was meant to be applied.
X_norm = mms.fit_transform(X)

# Univariate feature selection using family wise error
from sklearn.feature_selection import SelectFwe, f_classif
X_fwe = SelectFwe(f_classif, alpha=0.05).fit(X, y)

# Get indices of selected features
# NOTE(review): the returned indices are discarded here.
X_fwe.get_support(indices=True)

# select features using family wise error method
X_fwe = SelectFwe(f_classif, alpha=0.05).fit_transform(X, y)
print(X_fwe.shape)

# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_fwe, y, test_size=0.2, random_state=0)

# fitting logistic regression to Training Set
from sklearn.linear_model import LogisticRegression
#Set by user input: fileName = r'/trainingSetFeatures.csv' filePath = str(argv[1]) X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels print(X.shape,"= (samples, features)") y_inv = Counter(lb_encoder.inverse_transform(y)) print("Classes:", y_inv) # 'Normalize/Scale features if needed. Our data is standardized by default' # X = StandardScaler(copy=False).fit_transform(X) Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) featureNames=featureNames[Fwe.get_support()] print("F-test filter ->",X.shape) FeatSelection_SVM=True FeatSelection_RandLogReg=False if FeatSelection_RandLogReg == True: LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5, sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1) X = LogRegFeats.fit_transform(X,y) featureNames=featureNames[LogRegFeats.get_support()] print("RandomizedLogisticRegression Feature Selection ->:",X.shape) elif FeatSelection_SVM == True: X= LinearSVC(C=1, penalty="l1", dual=False,class_weight='auto').fit_transform(X, y) # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
def GetAllPerf (filePaths=None):
    """Benchmark classifiers on every training-set CSV found under the given
    paths and write a per-file results table to 'OutputData.tsv'.

    For each file: load features/labels, trim features (SelectKBest +
    SelectFwe), optionally apply extra feature-selection strategies
    (disabled by default flags), score a most-frequent dummy baseline,
    grid-search the best models for accuracy and f1, and record the
    cross-validated scores in a results DataFrame.

    :param filePaths: list of paths to process; defaults to every
        'trainingSetFeatures.csv' found under './test_seq'.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)
    # One row per input file; columns are filled in as results are computed.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy','Accuracy_SD',
                                    'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
                                    'LargestClassPercent','Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes',
                                    'Array-Acc-Scores' ,'Array-f1-Scores' ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])
    #redDict holds results for each file/class, for saving to output-file
    i=-1
    for filePath in filePaths:
        i +=1
        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1
        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"
        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        # Size of the largest class, as a percentage of all samples.
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)
        KFilt=None
        KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
        if KFilt is not None:
            # Univariate K-best trimming before the FWE filter.
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]
        # Family-wise-error filter on top of the K-best trimming.
        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False
        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
            # X=X_L1
        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''
        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False
        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)
            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3)) #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)
            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)
            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False
        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'
        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()
        "Make custom F1 scorer. May not have fixed problem!"
        # NOTE(review): 'sklearn.metrics.score' looks wrong — make_scorer
        # lives in sklearn.metrics; confirm this import actually resolves.
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none
        # print("Dummy classifiers output:")
        # Most-frequent-class dummy baseline for accuracy/f1 comparison.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))
        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred)) #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2
        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)
        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)
        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')
        "Modified to get 2 estimators"
        # Cross-validated scores for the two selected models.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))
        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst
        print()
        # print(fileName," Done")
        # Save after every file so partial results survive a crash.
        print("Saving results to file")
        resDict.to_csv("OutputData.tsv", sep=',')
# Report the columns whose univariate (KBest) scores are weak (< 2).
# Converted from Python-2 print statements to print() calls, matching the
# rest of the file.
for idx in range(len(kbest.scores_)):
    if kbest.scores_[idx] < 2:
        print(columns[idx], kbest.scores_[idx])

kbest_result = [
    columns[idx] for idx in range(len(columns) - 1) if kbest.scores_[idx] < 2
]

# perform regression without those

# In[48]:

# Family-wise-error filter: report the columns it would drop.
fwe = SelectFwe(f_regression, alpha=0.7)
fwe.fit(converted_train_array[:, :-1], converted_train_array[:, -1])
# Build the support set once; O(1) membership instead of scanning the
# index array for every column.
fwe_support = set(fwe.get_support(indices=True))
for idx in range(len(columns) - 1):
    if idx not in fwe_support:
        print(columns[idx])

# In[49]:

# Variance threshold: report and collect the near-constant columns.
variance = VarianceThreshold(threshold=1)
variance.fit(converted_train_array[:, :-1])
print(len(variance.get_support(indices=True)))
variance_support = set(variance.get_support(indices=True))
for idx in range(len(columns) - 1):
    if idx not in variance_support:
        print(columns[idx])

variance_result = [
    columns[idx] for idx in range(len(columns) - 1)
    if idx not in variance_support
]
# In[ ]: X=df[feature_cols].values y=df.classname.values # In[ ]: le = LabelEncoder() y = le.fit_transform(y) # In[ ]: print("Orig X -> ",X.shape) Fwe = SelectFwe(alpha=0.001).fit(X,y) X=Fwe.transform(X) print("F-test -> ",X.shape) feature_cols=feature_cols[Fwe.get_support()] # In[ ]: rf = RandomForestClassifierWithCoef(max_depth= 9, min_samples_split= 3, min_samples_leaf= 3, n_estimators= 650, n_jobs= -1, max_features= "auto") # In[ ]: scores = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.3)) print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2)) # scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1') # print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) # In[ ]: svc = LinearSVC(C=20, penalty='l1', dual=False)
def preprocess_dataset(X, y, features, exploration_results, fs_example=False):
    """ Preprocess the data according to earlier performed exploration results with found issues.
    These issues are based on:
    - feature types,
    - feature dimensionality,
    - missing values,
    - output imbalance,
    - irrelevant features,
    - normalisation,
    - multicollinearity
    Since feature selection can be very dataset specific, it can also be removed from the preprocessing list.
    :param X: A numpy matrix of the data. First axis corresponding to instances, second axis corresponding to samples
    :param y: A numpy array of the output. The length of the array should correspond to the size of the first axis of X
    :param features: A numpy array of the feature names. The length of the array should correspond to the size of the second axis of X
    :param exploration_results: A dict with the results of the earlier exploration, corresponding to the aforementioned issues
    :param fs_example: Whether also an example of feature selection should be done.
        Default: False
    :return: The preprocessed X, y and features
    """
    # Test the input to be according to the standards
    robustness_methods.check_input_arrays(X, y, features)

    # First change data for missing values
    if exploration_results['mv']:
        print("\nStarting missing value handling...")
        old_features = np.copy(features)
        if exploration_results['cca']:
            X, y = LDM.cca(X, y, missing_values='')
        elif exploration_results['aca']:
            X, features = LDM.aca(X, features, missing_values='')
        else:
            # Drop the worst features, then impute what is left.
            X, features = LDM.aca(X, features, missing_values='', removal_fraction=0.15)
            X = impute.mean_imputation(X, missing_values='')
        removed_features = _return_removed_features(features, old_features)
        print("These features are removed due to having too many missing values: %s" % removed_features)

    if exploration_results['irrelevance'] > 0:
        print("\nRemoving irrelevant features...")
        # Remove irrelevant
        irr_feat_loc = exploration_results['irrelevant_features']
        X = np.delete(X, irr_feat_loc, axis=1)
        old_features = np.copy(features)
        features = np.delete(features, irr_feat_loc)
        removed_features = _return_removed_features(features, old_features)
        print("These features are removed due to having no information: %s" % removed_features)

    if exploration_results['norm_means'] or exploration_results['norm_stdev']:
        print("\nNormalising numeric features...")
        # Normalise or standardise values (in place)
        NS.normalise_numeric_features(X, exploration_results['stand'],
                                      exploration_results['norm_means'],
                                      exploration_results['norm_stdev'])

    # Than change categorical to numeric values
    if exploration_results['cat']:
        print("\nHot encoding categorical values...")
        X, features = HE.hot_encode_categorical_features(X, features)

    if exploration_results['fs'] and fs_example:
        print("\nDoing an example of feature selection...")
        # Feature selection if multicollinearity
        if exploration_results['mc']:
            # Remove multicollinearity
            feature_selector = WM.ForwardSelector(threshold=0.0001)
            # Order to have more relevant features first
            feature_orderer = OM.FeatureOrderer(f_classif)
            X = feature_orderer.fit_transform(X, y)
            features = features[np.argsort(-feature_orderer.scores_)]
        else:
            feature_selector = SF(f_classif, alpha=0.05)
        # Transform data to feature_selection
        X = feature_selector.fit_transform(X, y)
        old_features = np.copy(features)
        features = features[feature_selector.get_support()]
        # Remove extra features as only 200 are needed.
        if features.shape[0] > 200:
            print("Extra feature selection is done to reduce the number of features to 200...")
            extra_feature_selector = SelectKBest(f_classif, k=200)
            X = extra_feature_selector.fit_transform(X, y)
            # BUG FIX: the names must be filtered by the NEW selector's
            # support; reusing feature_selector.get_support() applied the
            # stale mask (wrong features and wrong length).
            features = features[extra_feature_selector.get_support()]
        removed_features = _return_removed_features(features, old_features)
        print("These features are removed due to feature selection: %s" % removed_features)

    return X, y, features
def main(args):
    """Train and evaluate a RandomForest on 'trainingSetFeatures.csv' in
    args.train_dir, after univariate feature trimming (SelectFwe +
    SelectKBest), then run RFECV and plot the importance of the
    surviving features.
    """
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    # Everything except label/id columns is a feature.
    feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']]
    feature_cols=numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values

    # Encode the string class labels as integers.
    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    # Family-wise-error filter; keep column names in sync with X.
    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    print("F-test -> ",X.shape)
    feature_cols=feature_cols[Fwe.get_support()]
    '''
    FeatSelection_SVM = True
    if FeatSelection_SVM == True:
        svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
        X = svc_L1.transform(X, y)
        print ("L1 SVM Transformed X:",X_L1.shape)
        feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
    '''

    # Cap the feature count at 255 via univariate K-best.
    k = SelectKBest(k=255).fit(X,y)
    X=k.transform(X)
    feature_cols=feature_cols[k.get_support()]

    # NOTE(review): param_dist looks intended for a grid/randomized search
    # but is never used below — confirm whether the search step was
    # removed on purpose.
    param_dist = {"max_depth": [6,9, None],
                  "max_features": ['auto',0.4],
                  "min_samples_leaf": [1,2,3],
                  "bootstrap": [True, False],
                  'min_samples_split':[2,3],
                  "criterion": [ "gini"],
                  "n_estimators":[100],
                  "n_jobs":[-1]}

    rf = RandomForestClassifierWithCoef(max_depth= 7, min_samples_split= 1, min_samples_leaf= 2, n_estimators= 50, n_jobs= 2, max_features= "auto")

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    # Stratified shuffle-split CV: accuracy, then f1.
    scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # Recursive feature elimination with cross-validation.
    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1') #average_precision , recall

    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # f1 of the RFE-reduced model relative to the full model, in percent.
    RFE_ScoreRatio = 100*(cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1').mean())/scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
# 75/25 stratified split of the row index into train/test indices.
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Family-wise-error feature selection, fitted on the training split only.
training_features = result1.loc[training_indices].drop(['class', 'group', 'guess'], axis=1)
training_class_vals = result1.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
    # No features to select from: keep the frame unchanged.
    result1 = result1.copy()
else:
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)

result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
def main(args):
    """Train and evaluate a RandomForest on trainingSetFeatures.csv found in
    args.train_dir, after FWE + KBest feature trimming, then run RFECV and
    plot the importances of the surviving features."""
    if args.train_dir is None:
        # Fall back to a hard-coded benchmark directory when none was given.
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    # Display / plotting cosmetics only.
    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'
    df = pandas.read_csv('trainingSetFeatures.csv')

    # Every column except the identifier/label columns is a feature.
    feature_cols = numpy.array(
        [col for col in df.columns
         if col not in ['classname', 'Id', 'proteinname']])
    X = df[feature_cols].values
    y = df.classname.values
    label_enc = LabelEncoder()
    y = label_enc.fit_transform(y)

    # Initial feature-selection trimming: F-test with FWE control.
    print(X.shape)
    fwe_filter = SelectFwe(alpha=0.01).fit(X, y)
    X = fwe_filter.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[fwe_filter.get_support()]

    # (An L1-LinearSVC selection alternative existed here but is disabled.)

    kbest = SelectKBest(k=255).fit(X, y)
    X = kbest.transform(X)
    feature_cols = feature_cols[kbest.get_support()]

    # Hyper-parameter grid kept for reference; not searched in this function.
    param_dist = {
        "max_depth": [6, 9, None],
        "max_features": ['auto', 0.4],
        "min_samples_leaf": [1, 2, 3],
        "bootstrap": [True, False],
        'min_samples_split': [2, 3],
        "criterion": ["gini"],
        "n_estimators": [100],
        "n_jobs": [-1],
    }

    rf = RandomForestClassifierWithCoef(max_depth=7, min_samples_split=1,
                                        min_samples_leaf=2, n_estimators=50,
                                        n_jobs=2, max_features="auto")

    # NOTE: sklearn's default binary F1 reports the score of class 1 only.
    scores = cross_validation.cross_val_score(
        rf, X, y, n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Precision, sensitivity, MCC (if binary), etc. could be used instead.
    scores_f1 = cross_validation.cross_val_score(
        rf, X, y, n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # Recursive feature elimination with cross-validation, scored by F1.
    rfeSelect = RFECV(estimator=rf, step=20, cv=2, scoring='f1')
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # Performance of the RFE-reduced set relative to the full set (percent).
    RFE_ScoreRatio = 100 * (cross_validation.cross_val_score(
        rf, X_RFE, y, n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1').mean()) / scores_f1.mean()
    print("Even with just", X_RFE.shape[1],
          " features, we have %f performance! (f1 score ratio)" % (RFE_ScoreRatio))

    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
# Load the pre-extracted feature matrix, then apply an FWE-controlled F-test
# filter and (optionally) randomized-logistic-regression feature selection.
fileName = r'/trainingSetFeatures.csv'
filePath = str(argv[1])

# X, y = features, labels
X, y, lb_encoder, featureNames = load_data(filePath + fileName, 'file')
print(X.shape, "= (samples, features)")

y_inv = Counter(lb_encoder.inverse_transform(y))
print("Classes:", y_inv)

# Features are standardized upstream by default; re-enable if that changes:
# X = StandardScaler(copy=False).fit_transform(X)

Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

# Feature-selection strategy switches (read further downstream as well).
FeatSelection_SVM = True
FeatSelection_RandLogReg = False

if FeatSelection_RandLogReg:
    LogRegFeats = RandomizedLogisticRegression(
        C=5, scaling=0.5, sample_fraction=0.8, n_resampling=60,
        selection_threshold=0.2, n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)
# Notebook-style cells (collapsed onto one line by extraction): build X/y
# from the dataframe, label-encode y, run an FWE-controlled F-test filter
# (alpha=0.001) keeping feature_cols in sync with the surviving columns,
# then configure a RandomForest (650 trees) for cross-validated scoring.
# NOTE(review): this cell block is truncated mid-call at
# "cross_val_score(rf, X, y," — the remainder is outside this view.
# In[ ]: X = df[feature_cols].values y = df.classname.values # In[ ]: le = LabelEncoder() y = le.fit_transform(y) # In[ ]: print("Orig X -> ", X.shape) Fwe = SelectFwe(alpha=0.001).fit(X, y) X = Fwe.transform(X) print("F-test -> ", X.shape) feature_cols = feature_cols[Fwe.get_support()] # In[ ]: rf = RandomForestClassifierWithCoef(max_depth=9, min_samples_split=3, min_samples_leaf=3, n_estimators=650, n_jobs=-1, max_features="auto") # In[ ]: scores = cross_val_score(rf, X, y,
# GetAllPerf: batch-evaluate classification performance for every
# trainingSetFeatures.csv found (or explicitly listed), accumulating per-file
# metrics into a DataFrame (resDict) that is finally written to disk.
# Per file: load X/y, record class balance, trim features (SelectKBest then
# SelectFwe), optionally run RandLogReg/L1-SVM/RFE selection, score dummy
# baselines, grid-search best models for accuracy and F1, and cross-validate
# both. (Source lines below are kept byte-identical; comments only added.)
def GetAllPerf (filePaths=None): if filePaths is None: filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv')) #Sanity check: # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile'] # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv'] print("FilePaths: \n",filePaths) fileNames=fileNameFromPaths (filePaths) print("FileNames:",fileNames) resDict = pd.DataFrame(index=fileNames, columns=['Accuracy','Accuracy_SD', 'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1', 'LargestClassPercent','Classes', # 'TopRFE-Features','Best (f1) Model parameters', '# Classes', 'Array-Acc-Scores' ,'Array-f1-Scores' ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted']) #redDict holds results for each file/class, for saving to output-file i=-1 for filePath in filePaths: i +=1 'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/' filePath = os.path.normpath(filePath) print(filePath) fileName=str(fileNames[i]) #Str added now 14.1 print("fileName: %s" %(fileName)) "resDict['Name']= fileName" # filePath = str(argv[1]) # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels print(X.shape,"= (samples, features)") y_inv = Counter(lb_encoder.inverse_transform(y)) MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1) print("Classes:", lb_encoder.classes_) print("MajorityClassPercent:", MajorityPercent) resDict.LargestClassPercent[fileName] = MajorityPercent resDict.Classes[fileName] = str(lb_encoder.classes_) resDict["# Classes"][fileName]=len(lb_encoder.classes_) KFilt=None KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself. 
# Feature trimming: KBest (KFilt=350) then an FWE F-test filter, keeping
# featureNames aligned with the surviving columns. The RandLogReg / L1-SVM
# branches below are disabled by the flags and write X_L1 without
# reassigning X (see the "# X=X_L1" remnant). RFE/RFECV is likewise gated
# off by default.
# NOTE(review): the L1-SVM branch says penalty="l2" despite the "L1 SVM"
# print text, and svc_L1.transform(X, y) passes y where older sklearn
# expected transform(X) — confirm against the sklearn version in use.
if KFilt is not None: k = SelectKBest(k=KFilt).fit(X,y) X=k.transform(X) featureNames=featureNames[k.get_support()] Fwe = SelectFwe(alpha=0.01).fit(X,y) X=Fwe.transform(X) featureNames=featureNames[Fwe.get_support()] print("X reduced to K best features: ",X.shape) FeatSelection_SVM=False #Feature Names need updating!! FeatSelection_RandLogReg=False if FeatSelection_RandLogReg == True: LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y) X_L1 = LogRegFeats.transform(X) featureNames=featureNames[LogRegFeats.get_support()] print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape) elif FeatSelection_SVM == True: svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y) X_L1 = svc_L1.transform(X, y) featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))] print ("L1 SVM Transformed X:",X_L1.shape) # X=X_L1 ''' print("Performance as a function of percent of features used:") PlotPerfPercentFeatures(X,y,est=LinearSVC()) ''' 'EG - graph best features; feature selection using RF, ensemble classifiers..' 'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb' RFE_FeatsToKeep = 16 FeatSelection_RFE=False FeatSelection_RFECV=False if (FeatSelection_RFE or FeatSelection_RFECV) == True: 'RFE + - best feats' 'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html ' svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False) # svc = LogisticRegression(class_weight='auto')#,C=1) if FeatSelection_RFECV==True: rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision') # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3)) #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..." 
# RFE fallback (fixed number of features), then dummy-classifier baselines.
# A custom micro-averaged F1 scorer is built via make_scorer.
# NOTE(review): "from sklearn.metrics.score import make_scorer" is not a
# real module path — make_scorer lives in sklearn.metrics (historically
# sklearn.metrics.scorer); this import likely raises ImportError — confirm.
# NOTE(review): f1_scorer(y, y_dummyPred) calls the scorer with labels as
# the first argument where sklearn scorers expect (estimator, X, y) —
# verify this works with the project's Get_yPred/scorer setup.
else: rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03) rfecv.fit(X, y) if FeatSelection_RFECV==True: print("RFE-CV selected %d features : " % (rfecv.n_features_)) print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) ) rfe_featnames = featureNames[rfecv.get_support()] featureNames = featureNames[rfecv.get_support()] print("RFE selected feature names:",rfe_featnames) X_RFE = rfecv.fit_transform(X, y) print("X_RFE",X_RFE.shape) resDict['TopRFE-Features'][fileName]=str(rfe_featnames) 'Set GetRFEPerf To true or by user, if perf. of reduced set wanted' GetRFEPerf=False # print("lb_encoder.classes_",lb_encoder.classes_) 'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb ' 'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/' 'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators' "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html" print() "Make custom F1 scorer. May not have fixed problem!" from sklearn.metrics.score import make_scorer f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none # print("Dummy classifiers output:") dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0) y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent) dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred )) dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted')) dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred)) #Get from ALL classes f1.. 
# Record dummy-baseline metrics, then grid-search two "best" models — one
# optimized for F1 (via the custom scorer workaround) and one for accuracy —
# and cross-validate both with StratifiedShuffleSplit (13 iterations).
dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean() # print("Dummy, most frequent acc:",dummy_freq_acc) # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0) # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y))) # 'print("Dummy, Stratified Random:",dummy_strat2)' print() resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted # resDict.dummy_Stratfreq[fileName]=dummy_strat2 "We can get seperately the best model for Acc, and the best for f1!" "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1') "Temporary workaround until next SKlearn update of F1 metric:" # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer) bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy') print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1) print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc) #Temp # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1) if GetRFEPerf==True: bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1') "Modified to get 2 estimators" scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2)) scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1') print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), 
# Persist all per-file metrics; arrays and stringified estimators included.
# NOTE(review): the output file is named OutputData.tsv but written with
# sep=',' (i.e. CSV content in a .tsv file) — confirm intent.
scores_f1.std() * 2)) resDict['Accuracy'][fileName]=round(scores_acc.mean(),4) resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4) resDict['f1'][fileName]=round(scores_f1.mean(),4) resDict['f1_SD'][fileName]=round(scores_f1.std(),4) resDict['Array-f1-Scores'][fileName]=(scores_f1) resDict['Array-Acc-Scores'][fileName]=(scores_acc) resDict['bestML-f1'][fileName]=(str(bestEst_f1)) resDict['bestML-Acc'][fileName]=(str(bestEst_acc)) #ORIG # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15) # resDict['Accuracy'][fileName]=round(Acc,4) # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4) # resDict['f1 score'][fileName]=round(f1,4) # resDict['f1_SD'][fileName]=round(f1_SD,4) # resDict['Best (f1) Model parameters'][fileName]= bestEst print() # print(fileName," Done") print("Saving results to file") resDict.to_csv("OutputData.tsv", sep=',')