def doSFS(runDict, save=True):
    """Run forward and backward sequential feature selection for every run.

    For each entry of ``runDict`` a ``SequentialFeatureSelector`` is fitted
    twice (forward, then backward) on the module-level ``x`` / ``y`` data,
    and the results are stored back into that entry's sub-dictionary:

    * ``'Ffeatures'`` / ``'Bfeatures'`` -- the fitted selector.
    * ``'FfilteredFeatures'`` / ``'BfilteredFeatures'`` -- the columns of
      ``x`` picked out by the selector's ``k_feature_idx_``.

    Args:
        runDict: Mapping of run name -> dict holding at least a ``'clf'``
            estimator. The sub-dicts are mutated in place.
        save: When true, each selector's ``subsets_`` is written with
            ``pickling.save_dill`` under the name
            ``"<runName>_Ffeatures"`` / ``"<runName>_Bfeatures"``.

    Returns:
        The same ``runDict`` object that was passed in.

    Note:
        Relies on the globals ``x`` and ``y`` being defined before the call.
        NOTE(review): the selector looks like mlxtend's
        ``SequentialFeatureSelector`` -- confirm against the file's imports.
    """
    for runName, subDict in runDict.items():
        for forward in [True, False]:
            print(runName, forward)
            featureSelector = SequentialFeatureSelector(
                subDict['clf'],
                k_features=(1, 50),
                forward=forward,
                verbose=2,
                scoring="accuracy",
                cv=5,
                n_jobs=-1,
            )

            # One key drives both directions: 'F...' for forward selection,
            # 'B...' for backward selection.
            resultKey = 'Ffeatures' if forward else 'Bfeatures'
            fitted = featureSelector.fit(x, y)
            subDict[resultKey] = fitted
            subDict[resultKey[0] + 'filteredFeatures'] = x.columns[
                list(fitted.k_feature_idx_)
            ]

            if save:
                # Imported here (as postProcess does) so the function also
                # works when this module is imported rather than run as a
                # script, where `pickling` is only imported under __main__.
                import pickling

                pickling.save_dill(fitted.subsets_, runName + '_' + resultKey)
    return runDict
def postProcess(modelDataDict, modelDict, pickleIt=False, emailIt=True, ROC=False):
    """Summarise a finished batch of models and optionally persist/announce it.

    Steps, in order:

    1. Build averaged results via ``makeAvgResults``.
    2. Pick the run with the highest ``'acc'`` and print it.
    3. Optionally dill the whole ``modelDict`` (``pickleIt``).
    4. Export the results with ``exportResultsDF``.
    5. Optionally plot every model's ROC curve (``ROC``).
    6. Optionally send a summary e-mail (``emailIt``); failures are reported
       on stdout and otherwise ignored.

    Args:
        modelDataDict: Currently unused by this function.
        modelDict: Mapping of the trained model objects; passed to
            ``makeAvgResults`` / ``exportResultsDF`` and, when ``ROC`` is set,
            each value's ``plotROC()`` is called.
        pickleIt: Save ``modelDict`` with ``pickling.save_dill``.
        emailIt: Send a summary through ``emailing.sendEmail``.
        ROC: Draw all ROC curves on one figure and show it.

    Returns:
        The ``(modelAvgDict, modelScoresDict)`` pair returned by
        ``makeAvgResults``.

    Note:
        Reads the module-level globals ``fileName`` (appended to output
        names) and, for the e-mail text, ``start`` and ``modelsAtStart``.
        NOTE(review): ``start`` is presumably the batch start time set by the
        calling script -- confirm.
    """
    now = datetime.datetime.now()
    # Built once and reused for every output name. Components are not
    # zero-padded, matching the names produced by earlier runs.
    stamp = (
        str(now.day) + "_" + str(now.month) + "_" + str(now.hour) + str(now.minute)
    )

    modelAvgDict, modelScoresDict = makeAvgResults(
        modelDict, 'AVG' + stamp + fileName
    )

    # Find 'best' model in dict. Only strictly positive accuracies can win;
    # if none does, the name stays '' and the accuracy stays 0.
    maxAcc = 0
    currModAcc = ''
    for runName, avgResults in modelAvgDict.items():
        if avgResults['acc'] > maxAcc:
            maxAcc = avgResults['acc']
            currModAcc = runName

    print(f"Best model for accuracy from run is:\n{currModAcc}")
    print(f"Accuracy: {maxAcc}")

    if pickleIt:
        import pickling

        pickling.save_dill(modelDict, "modelDict" + stamp)

    # Save results in .csv file
    exportResultsDF(modelDict, "modelDict" + stamp + fileName)

    if ROC:
        # Print out ROC curve: every model draws onto the same figure.
        plt.figure(figsize=(15, 11))
        for item in modelDict.values():
            item.plotROC()
        plt.show()

    if emailIt:
        # Best effort: a failed notification must not lose the results.
        # `except Exception` (not a bare except) lets Ctrl-C / SystemExit
        # propagate instead of being reported as an e-mail failure.
        try:
            # Imported here so this also works when the module is imported
            # rather than run as a script; an import failure is treated like
            # any other e-mail failure.
            import emailing

            content = f"finished genericModelClass - took {datetime.datetime.now() - start} and ran {len(modelDict)-modelsAtStart} models\n{currModAcc} {maxAcc}"
            emailing.sendEmail(subject=f"{maxAcc}, {currModAcc}", content=content)
        except Exception:
            print("\nEmail sending failed, carrying on..\n")

    return modelAvgDict, modelScoresDict
def _appendUnseenFeatures(subsets, collected, limit, printOut):
    """Append each not-yet-collected feature name from ``subsets`` to ``collected``.

    ``subsets`` is iterated in its own order; while ``collected`` holds fewer
    than ``limit`` names, each newly added name is printed with its subset's
    ``'avg_score'`` (only when ``printOut`` is true).
    """
    for subset in subsets.values():
        for name in subset['feature_names']:
            if name in collected:
                continue
            collected.append(name)
            if printOut and len(collected) < limit:
                print(subset['avg_score'], name)


def showBestFeaturesOfRunDict(runDict, printOut=True, save=False):
    """Collect, and optionally print/save, the features chosen by each run.

    For every run, and for each direction key (``'Ffeatures'`` then
    ``'Bfeatures'``), a list ``'FbestFeatures'`` / ``'BbestFeatures'`` is
    stored in the run's dict. The list holds the distinct feature names in
    first-seen order, taken from the forward selector's ``subsets_`` followed
    by the backward selector's ``subsets_``. Afterwards the overlap of the two
    selectors' ``k_feature_names_`` is reported.

    Args:
        runDict: Mapping of run name -> dict containing fitted selectors
            under ``'Ffeatures'`` and ``'Bfeatures'``. Mutated in place.
        printOut: Print the per-feature listing and the overlap summary.
            The ``run name:`` line is printed regardless.
        save: Write each selector's ``subsets_`` with ``pickling.save_dill``
            under ``"<runName>_<direction key>"``.

    Returns:
        The same ``runDict`` object that was passed in.
    """
    for runName, run in runDict.items():
        print('run name:', runName)
        for forwardsOrBackwards in ['Ffeatures', 'Bfeatures']:
            selector = run[forwardsOrBackwards]
            if save:
                # Function-scope import so this works when the module is
                # imported (the script only imports `pickling` under
                # __main__).
                import pickling

                pickling.save_dill(
                    selector.subsets_, runName + '_' + forwardsOrBackwards
                )

            numberOfFeatures = len(selector.k_feature_names_)
            listName = forwardsOrBackwards[0] + 'bestFeatures'
            bestFeatures = run[listName] = []
            if printOut:
                print(f'\n{listName} with {numberOfFeatures} features:')

            # NOTE(review): both directions walk the forward subsets and then
            # the backward subsets, so the two lists end up identical; only
            # the print cutoff (numberOfFeatures) differs. Kept as-is --
            # confirm this is intended.
            _appendUnseenFeatures(
                run['Ffeatures'].subsets_, bestFeatures, numberOfFeatures, printOut
            )
            if printOut:
                print("------CUTOFF------ ")
            _appendUnseenFeatures(
                run['Bfeatures'].subsets_, bestFeatures, numberOfFeatures, printOut
            )

        forwardSet = set(run['Ffeatures'].k_feature_names_)
        backwardSet = set(run['Bfeatures'].k_feature_names_)
        if printOut:
            common = forwardSet & backwardSet
            print(
                f"\n{runName} has {len(forwardSet)} in forwardSet, {len(backwardSet)} in backwardSet, {len(common)} common to both:"
            )
            print(common)
    return runDict
'KNN':{'cols':CS.KNNcols2, 'model':KNN}, 'RF' :{'cols':CS.RFcols2, 'model':RandomForest}, 'LR':{'cols':CS.LRcols, 'model':LogReg}, 'GNB':{'cols':CS.GNBcols, 'model':GaussianBayes}, # 'AB':{'cols':CS.RFcols2, 'model':AdaBoost} } if __name__ == "__main__": import emailing import pickling # doneRuns=[] files = ['bbbbVgsbbbsdf7.csv']#['stuckForDF7.csv']*10#'bbbbVgsbbbsdf7.csv']#*1000 for fileName in files:#['bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv']:#,'bbbbVgsbbbsAllCols.csv']:#,'bbbVgsbbsLessCols.csv','bbbVgbbLessCols.csv', 'bbbbVgbbbLessCols.csv']: # for cols in [CS.KNNcols]:#[SFS1Cols,SFS2Cols,chosenCols1, lessCols, cols]: for modelType in colDict.keys():#['GNB','LR','KNN','SVM','NN','RF']: pickling.save_dill(doneRuns, f"doneRuns_{len(doneRuns)}") # modelDict={} modelDataDict={} df = pd.read_csv(sf.addFolderPath( fileName)) # cols = colDict[modelType]['cols'] cols = list(df.columns) # xCols = [x for x in (set(df.columns) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1'})] # cols.append('PerformancePctRank') xCols = [x for x in (set(cols) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1','PTRWM_EXP__18','GOR_Not Applicable'})] x = df[xCols] print(x.columns) y = df["Class"] try: modelsAtStart = len(modelDict) except NameError: modelsAtStart = 0