def preprocessingCV(foldK=2, dataSize=2000, fileName="logstash.csv"):
    print("loading data ...")
    dataDf = dh.loadLogData(fileName=fileName).loc[:dataSize]
    print(foldK, "-fold cross validation ...")
    dataArr = np.array(dataDf)
    dataSKF = skf(dataArr[:, 1], n_folds=foldK, shuffle=True)
    dataIndexList = list(dataSKF)
    print("transforming dataIndexList into dataList ...")
    # note: each fold tuple pairs column-0 values from the train indices (row[0])
    # with column-1 values from the test indices (row[1]); compare with
    # preprocessingCVFromDf below, which keeps full (x, y) pairs for both splits
    dataList = [([dataArr[row0Item][0] for row0Item in row[0]],
                 [dataArr[row0Item][1] for row0Item in row[1]])
                for row in dataIndexList]
    return dataList
def preprocessingCVFromDf(dataDf, foldK=2, dataSize=2000):
    print("-transforming df into arr to fit sklearn...")
    dataArr = np.array(dataDf)
    print("-folding ...")
    yArr = dataArr[:, 1]
    mySkf = skf(yArr, n_folds=foldK, shuffle=True)
    dataIndexList = list(mySkf)
    print("transforming dataIndexList into dataList ...")
    # each fold becomes a (trainPairs, testPairs) tuple of (x, y) samples
    dataList = [([(dataArr[row0Item][0], dataArr[row0Item][1]) for row0Item in row[0]],
                 [(dataArr[row0Item][0], dataArr[row0Item][1]) for row0Item in row[1]])
                for row in dataIndexList]
    return dataList
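# A minimal usage sketch for the two helpers above, assuming dh.loadLogData
# returns a two-column DataFrame (text in column 0, label in column 1), as both
# functions index dataArr[:, 0] and dataArr[:, 1]:
if __name__ == "__main__":
    foldList = preprocessingCVFromDf(dh.loadLogData(fileName="logstash.csv").loc[:2000], foldK=5)
    for foldIndex, (trainPairs, testPairs) in enumerate(foldList):
        print("fold", foldIndex, ":", len(trainPairs), "train pairs /", len(testPairs), "test pairs")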
print white_corr_rho
print white_corr_pval

#RANDOM FOREST MODELING: RED---------------------------------------------------

#set iterations
iterations = 20

#create empty data frames for prediction results and feature importances
red_results = pd.DataFrame(index=dfr_exp.index, columns=range(0, iterations))
red_features = pd.DataFrame(index=range(0, 11), columns=range(0, iterations))

#fit model using StratifiedKFold
rf = rfc(n_estimators=360, max_features=5, criterion='gini')
for j in range(0, iterations):
    folds = skf(dfr_res, 5, shuffle=True)
    for train, test in folds:
        model = rf.fit(dfr_exp.ix[train, ], dfr_res[train])
        red_results.ix[test, j] = pd.Series(model.predict(dfr_exp.ix[test, ]),
                                            index=test, name=[j])
        red_features[j] = pd.Series(model.feature_importances_)
    print j

#write results to file
red_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt',
                   sep='\t', header=True)
red_features.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt',
                    sep='\t', header=True)

#retrieve results as needed
#red_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=False, names=range(0,iterations))
#red_features=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=False, names=range(0,iterations))

#transform results to calculate accuracy, sensitivity (TPR) and precision (PPV)
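# A hedged sketch of the transformation named in the last comment above, assuming
# dfr_res holds binary 0/1 true labels aligned with red_results' index (the
# actual transformation code is not shown in the source):
accuracy = red_results.apply(lambda col: (col == dfr_res).mean())
tpr = red_results.apply(lambda col: ((col == 1) & (dfr_res == 1)).sum() / float((dfr_res == 1).sum()))
ppv = red_results.apply(lambda col: ((col == 1) & (dfr_res == 1)).sum() / float((col == 1).sum()))
print pd.DataFrame({'accuracy': accuracy, 'TPR': tpr, 'PPV': ppv})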
#Compare models: Accuracy
print round(kn_accuracy.best_score_ * 100, 2)
print round(nb_accuracy.mean() * 100, 2)
print round(dt_accuracy.best_score_ * 100, 2)

#Compare models: F1
print round(kn_f1.best_score_ * 100, 2)
print round(nb_f1.mean() * 100, 2)
print round(dt_f1.best_score_ * 100, 2)

#Compare models: ROC AUC
print round(kn_auc.best_score_ * 100, 2)
print round(nb_auc.mean() * 100, 2)
print round(dt_auc.best_score_ * 100, 2)

#Confusion Matrix, ROC Curve by K-fold Slice
folds = skf(df_res, 10, indices=False)

#produce confusion matrices for each fold
for i, (train, test) in enumerate(folds):
    preds = dtree.fit(df_exp.ix[train, ], df_res[train]).predict(df_exp.ix[test, ])
    print '----FOLD #%d----' % i
    print pd.crosstab(df_res[test], preds, rownames=['Actual'],
                      colnames=['Predicted'], margins=True)

#produce ROC curves for each fold
dtree = tr.DecisionTreeClassifier()
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for i, (train, test) in enumerate(folds):
    preds = dtree.fit(df_exp.ix[train, ], df_res[train]).predict(df_exp.ix[test, ])
    fpr, tpr, thresholds = mt.roc_curve(df_res[test], preds)
    plt.plot(fpr, tpr, lw=1,
             label='ROC fold %d (area = %0.2f)' % (i, mt.auc(fpr, tpr)))
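# A hedged completion of the mean-ROC pattern set up above: mean_tpr/mean_fpr are
# initialized but never accumulated in the source, so this follows the classic
# sklearn cross-validated-ROC example rather than the author's (unseen) code.
mean_tpr = 0.0
n_folds_seen = 0
for i, (train, test) in enumerate(folds):
    preds = dtree.fit(df_exp.ix[train, ], df_res[train]).predict(df_exp.ix[test, ])
    fpr, tpr, thresholds = mt.roc_curve(df_res[test], preds)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)  # interpolate onto the common FPR grid
    n_folds_seen += 1
mean_tpr /= n_folds_seen
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
plt.plot(mean_fpr, mean_tpr, 'k--', lw=2,
         label='Mean ROC (area = %0.2f)' % mt.auc(mean_fpr, mean_tpr))
plt.legend(loc='lower right')
plt.show()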
def train_model_bagging(features, labels):
    base_model = rfc(n_estimators=80, max_features=20, max_depth=6,
                     random_state=30, criterion='entropy')
    # model = BaggingClassifier(base_estimator = base_model)
    params_dict = {'max_features': [0.5, 0.8],
                   'max_samples': [0.5, 0.8, 1],
                   'n_estimators': [25, 50, 75]}
    # grid-search the bagging wrapper around the random forest, scored by ROC AUC
    clf = GridSearchCV(BaggingClassifier(random_state=30, n_jobs=-1,
                                         base_estimator=base_model),
                       params_dict, scoring='roc_auc',
                       cv=skf(labels, n_folds=5, random_state=30))
    clf.fit(features, labels)
    print("Best estimator: ", clf.best_estimator_)
    print("Best score: %.4f" % clf.best_score_)
    return clf
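# A minimal usage sketch with synthetic data; make_classification stands in for
# the project's real features/labels, which are loaded elsewhere (assumption):
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=300, n_features=40, random_state=30)
clf = train_model_bagging(X, y)
print("Best cross-validated AUC: %.4f" % clf.best_score_)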
shuffle(data)
ys = data[:, 4]   # class labels
xs = data[:, :4]  # features

# standardize the features
means = np.mean(xs, axis=0)
stdevs = np.std(xs, axis=0)
xs = (xs - means) / stdevs

xr, xt, yr, yt = tts(xs, ys, test_size=0.33, stratify=ys)

errs = []
# number of folds to iterate over before averaging the errors
folds = 10
kf = skf(yr, n_folds=folds)

''' Logistic Regression '''
menorC_va_err = 200000
# specific parameter
C = 1
# store at which parameter index we got the lowest va_err (kept above)
bestNumberofC = 0
# plot the errors against the logarithm of the C value
arrayC = []
for idx in range(1, 21):
    tr_err = va_err = 0
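    # Hedged continuation of the truncated loop body: LogisticRegression, the
    # doubling of C, and the bookkeeping below are assumptions inferred from the
    # surrounding names (menorC_va_err, bestNumberofC, arrayC), not the original code.
    for tr_ix, va_ix in kf:
        reg = LogisticRegression(C=C)  # assumes sklearn.linear_model.LogisticRegression is imported
        reg.fit(xr[tr_ix], yr[tr_ix])
        tr_err += 1 - reg.score(xr[tr_ix], yr[tr_ix])  # training error on this fold
        va_err += 1 - reg.score(xr[va_ix], yr[va_ix])  # validation error on this fold
    tr_err /= folds
    va_err /= folds
    arrayC.append(C)
    if va_err < menorC_va_err:  # remember the setting with the lowest validation error
        menorC_va_err = va_err
        bestNumberofC = idx
    C = C * 2  # double C each step so errors can be plotted against log(C) (assumption)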