def makeCSVlistFromFolderName(folderName, basePath=sf.folderPath):
    '''Return a setFolder-built path for every entry of a data folder.

    folderName: folder to scan; it is resolved with sf.addFolderPath.
    basePath: kept for existing callers; the body does not use it.

    Every name returned by listdir is included (there is no filtering by
    extension), in listdir order.
    '''
    folderContents = listdir(sf.addFolderPath(folderName))
    pathsInFolder = []
    for entryName in folderContents:
        pathsInFolder.append(
            sf.addFolderPath(entryName, folderName=folderName))
    return pathsInFolder
def processListOfPickles(listOfPickles, folderName='', printOut=True):
    '''Load each '.pik' file and summarise it with showBestFeaturesOfLoadedDict.

    listOfPickles: file names, joined onto sf.addFolderPath(folderName).
    folderName: folder passed to sf.addFolderPath.
    printOut: forwarded to showBestFeaturesOfLoadedDict; when true the best
        (accuracy, number of features, file) over all files is also printed.

    Returns a dict mapping each opened file name to the list returned by
    showBestFeaturesOfLoadedDict.  Names not ending in '.pik' are reported
    and skipped.
    '''
    pickleFolder = sf.addFolderPath(folderName)
    resultsByPickle = {}
    for pickleName in listOfPickles:
        picklePath = os.path.join(pickleFolder, pickleName)
        print(picklePath)
        if not pickleName.endswith('.pik'):
            print(pickleName, 'not opened')
            continue
        loadedDict = pickling.load_dill(picklePath)
        print('\nunpacking', pickleName)
        resultsByPickle[pickleName] = showBestFeaturesOfLoadedDict(
            loadedDict, printOut)

    if not printOut:
        return resultsByPickle

    # Each group is indexed as (numFeatures, accuracy, ...); keep the first
    # group with the strictly highest accuracy.
    topAcc, topNumFeatures, topRun = 0, 0, ''
    for runName, outList in resultsByPickle.items():
        for group in outList:
            if group[1] > topAcc:
                topAcc, topNumFeatures, topRun = group[1], group[0], runName
    print('\nBest Results:')
    print('best accuracy, num features, best run')
    print(topAcc, topNumFeatures, topRun)
    print()
    return resultsByPickle
def makeNewDoneRunListFromOutFile(outFile):
    '''Build the list of run names from the column headers of a combined csv.

    outFile: csv (resolved with sf.addFolderPath) in which every column is
        one run.

    The 'Unnamed: 0' column is ignored and the fold tag '_<digit>of5_' in
    each header is replaced by '_'.  Asserts that more than 100 names were
    found, on the assumption that fewer means the file needs transposing.
    '''
    import re
    foldTag = re.compile('_[0-9]of5_')
    combined = pd.read_csv(sf.addFolderPath(outFile))
    runColumns = set(combined.columns) - {'Unnamed: 0'}
    cols = []
    for header in runColumns:
        cols.append(foldTag.sub('_', header))
    print('made newRunDoneList with', len(cols), 'items')
    assert (len(cols) >
            100), 'df needs to be transposed - need col names to be run names'
    return cols
def load_dill(name):
    '''Unpickle and return the object stored in the dill file `name`.

    `name` is tried as given first; if that raises FileNotFoundError the
    file is looked up again at sf.addFolderPath(name).
    '''
    try:
        with open(name, 'rb') as source:
            loaded = dill.load(source)
    except FileNotFoundError:
        with open(sf.addFolderPath(name), 'rb') as source:
            loaded = dill.load(source)
    return loaded


#if __name__ == "__main__":
#    save_dill(modelDict, 'modelDictAllFeatures0908')
#    aReloaded = load_dill('modelDictWithDill')
#    pass
def runAll():
    '''Run the whole school-inspection pipeline from bigDFnoDups1.csv.

    Resets the module globals predsThatAreNotInDF and inspList, rebuilds the
    global SchoolDict step by step, and writes clusterDF.csv and dfOut.csv
    through clusterDF / makeURNvsYearInspCats.

    Returns (SchoolDict, openSchoolDict, groupDict); groupDict is always an
    empty dict because the grouping step is switched off.
    '''
    global predsThatAreNotInDF, inspList, SchoolDict
    predsThatAreNotInDF = []
    inspList = []

    inspectionsDF = pd.read_csv(sf.addFolderPath("bigDFnoDups1.csv"))
    # presumably loadInspections fills the global inspList row by row -
    # confirm against its definition
    inspectionsDF = inspectionsDF.apply(loadInspections, axis=1)
    SchoolDict, _allInspNos, _dupInsps = assignInspectionsToSchools(inspList)
    inspectionsDF = inspectionsDF.apply(addPredecessorURNsFromDF, axis=1)
    SchoolDict = addAllPredecessors(SchoolDict)
    stuckSchools = calcStuck(SchoolDict)
    SchoolDict, _openAndUninspected = setAllStatuses(SchoolDict)
    whichStuckAreOpen(stuckSchools)
    SchoolDict = feedToSort(SchoolDict)
    clusterDF(SchoolDict, sf.addFolderPath("clusterDF.csv"))
    SchoolDict = makeGoodsAndBadsLists(SchoolDict)
    makeURNvsYearInspCats(SchoolDict, sf.addFolderPath("dfOut.csv"))
    #    finalPt, steps = makeMatrices(SchoolDict)
    openSchoolDict = findOpenSchools(SchoolDict)
    findGrandParents(SchoolDict)
    #    groupDict = grouping(openSchoolDict)
    return SchoolDict, openSchoolDict, {}
def makeDFofDiffsWithTrueValues(bigDF, write=''):
    '''Subtract the 'TheTruth' column from every column of bigDF.

    bigDF: one column of predictions per run plus a 'TheTruth' column.
    write: if not empty, the difference frame is saved to
        sf.addFolderPath(write).

    A difference of 0 means the run predicted that row correctly.  The run
    with the most zero differences is printed (first one wins on ties).

    Returns the frame of differences (it includes 'TheTruth', which is all
    zeros).  Earlier versions returned None, so callers that ignore the
    result are unaffected.
    '''
    accdf = pd.DataFrame()
    for col in bigDF.columns:
        accdf[col] = bigDF[col] - bigDF['TheTruth']
    if write != '':
        accdf.to_csv(sf.addFolderPath(write))

    bestScore, bestRun = 0, None
    for col in accdf:
        if col == 'TheTruth':
            continue
        # Count exact matches directly: value_counts()[0] raises KeyError
        # for a run that never matches the truth.
        runScore = int((accdf[col] == 0).sum())
        if runScore > bestScore:
            bestScore, bestRun = runScore, col
    if bestRun is not None:
        print('best run:', bestRun, 'with', bestScore, 'exact matches')
    return accdf
def setAllStatuses(SchoolDict):
    '''Mark every school listed in the edubase extract as "open".

    SchoolDict: dict of URN -> School; it is updated in place.

    The extract read depends on the module-level `where` setting.  A URN in
    the extract that is missing from SchoolDict gets a new School(URN),
    which is also collected in the returned list.

    Returns (SchoolDict, openAndUninspected).
    '''
    openAndUninspected = []
    if where in ["ONS", "Cdrive"]:
        # Backslash escaped explicitly: "\e" is an invalid escape sequence.
        # The resulting string is unchanged.
        file = "Data\\edubaseallstatefunded20190704.csv"
    else:
        file = "edubaseallstatefunded20190627.csv"
    openSchools = pd.read_csv(sf.addFolderPath(file), encoding="latin-1")
    openSchoolsSet = set(openSchools["URN"])
    for URN in openSchoolsSet:
        # Test membership instead of catching KeyError, so a KeyError raised
        # inside setStatus is no longer mistaken for a missing school.
        if URN not in SchoolDict:
            SchoolDict[URN] = School(URN)
            openAndUninspected.append(SchoolDict[URN])
        SchoolDict[URN].setStatus("open")
    print("school statuses set")
    return SchoolDict, openAndUninspected
def exportResultsDF(dic, write=''):
    '''Tabulate one column of results per model in `dic`.

    dic: dict of run name -> model object.
    write: if not empty, the table is saved to sf.addFolderPath(write).

    Each row is a key of the module-level entryInput dict, whose value is
    called with the model to produce the cell.

    Returns the DataFrame.
    '''
    print('saving..')
    resultsByRun = {
        name: {key: func(mod) for key, func in entryInput.items()}
        for name, mod in dic.items()
    }
    outDF = pd.DataFrame(resultsByRun)
    if len(write) > 0:
        outDF.to_csv(sf.addFolderPath(write))
        print(write, 'file written')
    return outDF
def makeAvgResults(modelDict, write=''):
    '''Average the k-fold scores of each run setup.

    modelDict: dict of run name -> model object.  Only names containing
        'of' after position 0 are used; the fold tag around it (such as
        '_1of5') is cut out to get the name shared by all folds.
    write: if not empty, the averages are saved to sf.addFolderPath(write).

    Scores come from the module-level entryInput dict (score name ->
    function of a model).  Nothing is returned; the result is only written.
    '''
    modelScoresDict, modelAvgDict = {}, {}

    # Collect, per averaged run name, a list of fold scores for every entry
    for run, model in modelDict.items():
        loc = run.find('of')
        if loc <= 0:
            continue
        avgName = run[:loc - 2] + run[loc + 3:]
        foldScores = modelScoresDict.get(avgName)
        if foldScores is None:
            modelScoresDict[avgName] = {
                key: [func(model)] for key, func in entryInput.items()
            }
        else:
            for score, values in foldScores.items():
                values.append(entryInput[score](model))

    # Reduce each list of fold scores to its mean (non-numeric entries are
    # left out) and add the variance of the accuracy
    notAveraged = {
        'cr', 'longName', 'params', 'runCode', 'runName', 'tpr', 'fpr', 'cm'
    }
    for runName, runResultsDict in modelScoresDict.items():
        averages = {}
        for score in (set(runResultsDict.keys()) - notAveraged):
            averages[score] = np.mean(runResultsDict[score])
        averages['acc variance'] = np.var(runResultsDict['acc'])
        modelAvgDict[runName] = averages

    if len(write) > 0:
        outDF = pd.DataFrame(modelAvgDict)
        outDF.to_csv(sf.addFolderPath(write))
        print(write, 'file written')


#    return modelAvgDict, modelScoresDict
def makeDFofResults(write=''):
    '''Collect every run's test-set predictions into one DataFrame.

    Reads the module-level `df` and `modelDict`.  The models of one run
    setup (names differing only in the '_<digit>of<digit>' fold tag) must be
    adjacent in modelDict; their predictions are stacked and, once 5 folds
    have been seen, joined to the result as one column named after the
    setup.  A 'TheTruth' column holding df['Class'] is added at the end.

    write: if not empty, the frame is saved to sf.addFolderPath(write).

    Returns the combined DataFrame.
    Raises ValueError if the stacked folds end up with more than one column.
    '''
    bigDF = pd.DataFrame({'tester': np.zeros(len(df))},
                         index=list(range(len(df))))
    oldName = ''
    for modelName, modelInstance in modelDict.items():
        #        if modelName[-11:]=='_42_brute_1':
        newName = ''.join(re.split('_[0-9]of[0-9]', modelName))
        modData = modelInstance.getData()
        xTest = modData.getxTest()
        yTest = modData.getyTest()  # not used below; call kept as it was
        xTestIndices = list(xTest.index)
        clf = modelInstance.getCLF()
        y_pred = clf.predict(xTest)  # numpy ndarray of 0s and 1s

        # turn array of predictions for this test set into a dataframe
        y_predDFOneRun = pd.DataFrame({newName: y_pred}, index=xTestIndices)

        # combine the 5 dataframes into a single one
        if oldName != newName:  # First fold of this set of 5 runs
            DF5runs = y_predDFOneRun
            numOfDFsAdded = 1
        else:
            # DataFrame.append was removed in pandas 2.0
            DF5runs = pd.concat([DF5runs, y_predDFOneRun])
            numOfDFsAdded += 1
        if DF5runs.shape[1] > 1:
            # raising a plain string is itself a TypeError in Python 3
            raise ValueError(
                'modelDict not in order so change the way bigDF is made')
        if numOfDFsAdded == 5:
            bigDF = bigDF.join(DF5runs)
        oldName = newName

    bigDF.drop('tester', axis=1, inplace=True)
    bigDF['TheTruth'] = df['Class'][:]
    if write != '':
        bigDF.to_csv(sf.addFolderPath(write))
    return bigDF
def makeLabelledSubsets(dictOfURNGroups, cat1, cat2, df, write=''):
    """Add a 'Class' column labelling each row by the group its URN is in.

    dictOfURNGroups: dict of group name -> list of URNs.
    cat1: name of the positive group; its URNs get Class 1.
    cat2: name of the negative group; its URNs get Class 0.
    df: frame with a 'URN' column.  The 'Class' column is added to this
        frame in place.
    write: if not empty, the result is saved to sf.addFolderPath(write).

    Rows whose URN is in neither group get Class 2.  The frame is then
    passed through cam.dropColsFromList(df, ['Stuck']) and returned.
    """
    import creatingAMonster as cam
    print("Adding/updating stuck column in df...")
    posURNs, negURNs = dictOfURNGroups[cat1], dictOfURNGroups[cat2]
    allURNs = posURNs + negURNs
    URNsToDrop = set(df['URN']) - set(allURNs)
    # Built once so each row costs a hash lookup instead of a list scan.
    posURNSet = set(posURNs)

    def classOfURN(urn):
        urn = int(urn)
        if urn in posURNSet:
            return 1
        return 2 if urn in URNsToDrop else 0

    #    df = df[~df['URN'].isin(URNsToDrop)]
    df["Class"] = df["URN"].map(classOfURN)
    print(df['Class'].value_counts())
    df = cam.dropColsFromList(df, ['Stuck'])
    if len(write) > 0:
        df.to_csv(sf.addFolderPath(write))
    return df
def combineIntermediateResultsCSVs(listOfCSVFilenames, outFile=''):
    '''Stitch the intermediate result csv files into one DataFrame.

    Runs take too much memory, so results are dumped to csv at intervals;
    this joins those files side by side so all results are in one place.

    listOfCSVFilenames: paths passed to pd.read_csv as given.  The first
        file is the base; from each later file the columns already present
        in the result, and 'Unnamed: 0', are dropped before it is joined.
    outFile: if not empty, the result is saved (without the index) to
        sf.addFolderPath(outFile).

    The module global dupLists is reset on every call and records, per
    file, result columns ending in '_DUP' first seen after joining it.

    Returns the combined DataFrame.
    '''
    global dupLists
    #    global droppedCols
    dupLists = {}
    seenDups = []
    bigDF = pd.read_csv(listOfCSVFilenames[0])
    # NOTE(review): the result of set_index is discarded, so the index is
    # not changed here - confirm whether that is intended.
    bigDF.set_index(keys='Unnamed: 0')
    for fileName in listOfCSVFilenames[1:]:
        nextDFtoJoin = pd.read_csv(fileName)
        alreadyPresent = (set(bigDF.columns)
                          & set(nextDFtoJoin.columns)) | {'Unnamed: 0'}
        redundant = [
            col for col in alreadyPresent if col in nextDFtoJoin.columns
        ]
        nextDFtoJoin = nextDFtoJoin.drop(columns=redundant)
        bigDF = bigDF.join(
            nextDFtoJoin)  #, rsuffix='_'+fileName[:-4]+'_DUP')
        for col in bigDF.columns:
            if col[-4:] != '_DUP' or col in seenDups:
                continue
            seenDups.append(col)
            dupLists.setdefault(fileName, []).append(col)
    if outFile != '':
        bigDF.to_csv(sf.addFolderPath(outFile), index=False)
    return bigDF
import setFolder as sf import genericModelClass as gmc import pandas as pd from mlxtend.feature_selection import SequentialFeatureSelector from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.svm import SVC from sklearn.neural_network import MLPClassifier from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.linear_model import LogisticRegression import pickling import os import matplotlib.pyplot as plt from math import ceil csv = sf.addFolderPath( 'bbbbVgsbbbsdf7.csv' ) #:#,'bbbbVgsbbbsAllCols.csv']:#,'bbbVgsbbsLessCols.csv','bbbVgbbLessCols.csv', 'bbbbVgbbbLessCols.csv']: df = pd.read_csv(csv) xCols = [ x for x in (set(df.columns) - { "URN", "Stuck", "Class", "Unnamed: 0", 'Unnamed: 0.1', 'GOR_Not Applicable' }) ] x = df[xCols] y = df["Class"] runDict = { # 'NN_original_2_5_adam_0.0001':{'clf':MLPClassifier(hidden_layer_sizes=(5,5), solver='adam', max_iter=1000)}, # 'RF_original_200_11_gini_True':{'clf':RandomForestClassifier(n_estimators=200, max_depth=11, criterion='gini', bootstrap=True)}, # 'SVM_original_4_rbf_2_0.016':{'clf':SVC(C=4, gamma=0.016)},
}, 'RFE': { 30: 'No\nRFE', 5: '5', 10: '10', 15: '15', 20: '20', 25: '25' } } for item in paramDict.keys(): paramDict[item] = {('p' + str(i)): paramDict[item][i - 1] for i in range(1, 5)} #df = pd.read_csv(sf.addFolderPath('paramsearch3forDF7Added.csv')) df = pd.read_csv(sf.addFolderPath('paramSearch2forOldStuckAdded.csv')) measureList = ['auc', 'acc', 'recall1', 'recall0', 'precision1', 'precision0'] #paramScatterPlots(df, 'auc', subplots=True) #scores= bestModelsBarPlot(df, mins=mins) #makeSubplots(df, measureList, mins=mins, figsize=(15,7), ymax=1, chosenMeasure='precision1') # for score in measureList: RFEBarPlot(df, score=score, OS=True, subPlots=True, mins=mins, barwidth=0.3) #scores = findParamsOfBestRuns(df, mins=mins)
'GNB':{'cols':CS.GNBcols, 'model':GaussianBayes}, # 'AB':{'cols':CS.RFcols2, 'model':AdaBoost} } if __name__ == "__main__": import emailing import pickling # doneRuns=[] files = ['bbbbVgsbbbsdf7.csv']#['stuckForDF7.csv']*10#'bbbbVgsbbbsdf7.csv']#*1000 for fileName in files:#['bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv']:#,'bbbbVgsbbbsAllCols.csv']:#,'bbbVgsbbsLessCols.csv','bbbVgbbLessCols.csv', 'bbbbVgbbbLessCols.csv']: # for cols in [CS.KNNcols]:#[SFS1Cols,SFS2Cols,chosenCols1, lessCols, cols]: for modelType in colDict.keys():#['GNB','LR','KNN','SVM','NN','RF']: pickling.save_dill(doneRuns, f"doneRuns_{len(doneRuns)}") # modelDict={} modelDataDict={} df = pd.read_csv(sf.addFolderPath( fileName)) # cols = colDict[modelType]['cols'] cols = list(df.columns) # xCols = [x for x in (set(df.columns) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1'})] # cols.append('PerformancePctRank') xCols = [x for x in (set(cols) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1','PTRWM_EXP__18','GOR_Not Applicable'})] x = df[xCols] print(x.columns) y = df["Class"] try: modelsAtStart = len(modelDict) except NameError: modelsAtStart = 0 # Generate data and model instances, run the models modelDataDict, modelDict = runAGroup( [
def save_dill(obj, name):
    '''Pickle `obj` with dill to "<name>.pik" in the data folder.

    The target path is sf.addFolderPath(name + '.pik'); 'pickled' is
    printed once the file has been written.
    '''
    targetPath = sf.addFolderPath(name + '.pik')
    with open(targetPath, 'wb') as sink:
        dill.dump(obj, sink)
    print('pickled')
0 if not stuck If write != False then write to .csv """ import creatingAMonster as cam print("Adding/updating stuck column in df...") posURNs, negURNs = dictOfURNGroups[cat1], dictOfURNGroups[cat2] allURNs = posURNs + negURNs URNsToDrop = set(df['URN']) - set(allURNs) # df = df[~df['URN'].isin(URNsToDrop)] df["Class"] = df.apply(lambda row: np.where( (int(row["URN"]) in posURNs), 1, np.where(int(row["URN"]) in URNsToDrop, 2, 0)), axis=1) print(df['Class'].value_counts()) df = cam.dropColsFromList(df, ['Stuck']) if len(write) > 0: df.to_csv(sf.addFolderPath(write)) return df dictOfURNs = makeURNListFromGroupDict(newGrouping(openSchoolDict, True)) inputDF = pd.read_csv( sf.addFolderPath('AllDatanotNormedForFeaturePlots_bbbbVgsbbbsImputed.csv')) #dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbb','gbb',inputDF, 'bbbVgbbLessCols.csv') #dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbbb','gbbb',inputDF, 'bbbbVgbbbLessCols.csv') #dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbb','gsbbs', inputDF, 'bbbVgsbbsLessCols.csv') dfWithCats = makeLabelledSubsets( dictOfURNs, 'bbbb', 'gsbbbs', inputDF, sf.addFolderPath( 'AllDatanotNormedForFeaturePlots_bbbbVgsbbbsImputedWithClass.csv'))
roc_curve, roc_auc_score, auc, classification_report, ) import re import matplotlib.pyplot as plt from imblearn.over_sampling import SMOTE from sklearn.feature_selection import RFE import itertools from random import sample import datetime import colSubsets as CS import setFolder as sf df = pd.read_csv(sf.addFolderPath('bbbbVgsbbbsdf7.csv')) def makeDFofResults(write=''): numSchools = len(df) bigDF = pd.DataFrame({'tester': np.zeros(len(df))}, index=list(range(len(df)))) oldName = '' for modelName, modelInstance in modelDict.items(): # if modelName[-11:]=='_42_brute_1': newName = ''.join(re.split('_[0-9]of[0-9]', modelName)) # print(modelName) modData = modelInstance.getData() xTest = modData.getxTest() yTest = modData.getyTest()
"""
Created on Sat Aug 31 12:10:27 2019

Plot a histogram of each variable, comparing class1 with class0.
Class1 (bad next) is blue
Class0 (good next) is orange

@author: Chris
"""
from os import listdir
import matplotlib.pyplot as plt
import pandas as pd
import setFolder as sf

#df = pd.read_csv(sf.addFolderPath( 'notNormedForFeaturePlots_bbbbVgsbbbs.csv'))
df = pd.read_csv(
    sf.addFolderPath('AllDatanotNormedForFeaturePlots_bbbbVgsbbbs.csv'))

# Every column except the identifier and label columns ...
plotCols = list(
    set(df.columns) - {"URN", "Stuck", "Class", "Unnamed: 0", 'Unnamed: 0.1'})
# ... which is replaced straight away by this hand-picked selection.
plotCols = [
    'PNUMFSM',
    'Total revenue balance (1) as a % of total revenue income (6) 2017-18',
    'ISPRIMARY',
    'Premises.2018',
    'PTKS1GROUP_L__18',
    'PNUMEAL',
    'ISSECONDARY',
    'PERCTOT',
    'AcademyNew',
    'Pupil: Teacher Ratio',
    'Supply.Staff_4yrDiff',
    'Mean Gross FTE Salary of All Teachers (£s)',
    'TOTPUPS__18',
    'PerformancePctRank',
    'AGEL',
]

plt.figure(figsize=(16, 20))
i = 0
# First two letters of a run name -> label stored in the 'Model' column.
_RUN_PREFIX_TO_MODEL = {
    'SV': 'SVM',
    'NN': 'NN',
    'RF': 'RF',
    'GN': 'GNB',
    'LR': 'LR',
    'KN': 'KNN',
}
# Minimum scores a run needs to be considered in the "Best ..." report.
_MIN_SCORES = {
    'acc': 0.5,
    'recall1': 0.3,
    'F0': 0.1,
    'F1': 0.1,
    'precision0': 0.1,
}
_REPORTED_MEASURES = [
    'F0', 'F1', 'acc', 'auc', 'precision0', 'precision1', 'recall0', 'recall1'
]
_PARAM_COLS = ['p1', 'p2', 'p3', 'p4']


def _parseRunParams(paramText):
    '''Split the '_'-separated parameter tail of a run name into a list.'''
    *leading, last = paramText.split('_')
    bits = []
    for token in leading:
        if token == 'None':
            # NOTE(review): 20 is the stand-in the original code used for a
            # 'None' parameter - confirm what it is meant to represent.
            token = 20
        try:
            bits.append(float(token))
        except ValueError:
            # Non-numeric parameters are moved to the front of the list
            bits.insert(0, token)
    if last == 'False':
        last = 0
    elif last == 'True':
        last = 1
    bits.append(float(last))
    return bits


def _parseRunName(runName):
    '''Return (model, RFE count, oversampled flag, params) for a run name.

    params is None for model types that carry no parameters in the name.
    '''
    prefix = runName[:2]
    model = _RUN_PREFIX_TO_MODEL[prefix]
    # `end` is the offset at which the parameter section of the name starts
    end = 4 if prefix in ('GN', 'SV', 'KN') else 3

    rfeAt = runName.find('RFE')
    if rfeAt >= 0:
        digits = runName[rfeAt + 3:rfeAt + 5]
        try:
            rfe = int(digits)
            end += 6
        except ValueError:
            # Single digit: the second character is the separator
            rfe = int(digits[0])
            end += 5
    else:
        rfe = 0

    if 'OS' in runName:
        oversampled = 1
        end += 3
    else:
        oversampled = 0

    # Fix for 'original'
    if end < 5:
        end += 9

    params = None
    if prefix in ('SV', 'NN', 'RF', 'KN'):
        params = _parseRunParams(runName[end:])
    return model, rfe, oversampled, params


def _reportBestRuns(df):
    '''Print the best run per measure among runs meeting every minimum.'''
    qualifying = df
    for measure, score in _MIN_SCORES.items():
        # Filter cumulatively; refiltering from df each time would leave
        # only the last threshold in force.
        qualifying = qualifying[qualifying[measure] > score]
    for col in _REPORTED_MEASURES:
        print(f"Best {col}:")
        if qualifying.empty:
            print('no run meets the minimum scores')
            continue
        scores = qualifying.loc[:, col]
        runName = scores.idxmax()
        # Skip quietly when idxmax gives a label that cannot be looked up
        # (the original swallowed the KeyError from that lookup).
        if runName not in qualifying.index:
            continue
        print(scores.max(), 'for', runName)


def processCSV(csv, write=False, addCols=True):
    ''' Takes in avg results csv
    Reads run name and extracts run info, putting into cols to use later
    for plotting etc.

    csv: file name (resolved with sf.addFolderPath) or an already loaded
        DataFrame.  It must have an 'Unnamed: 0' column holding the measure
        names and one column per run.  A DataFrame passed in is not
        modified.
    write: if true, save the result next to the input as
        '<csv without its last 4 characters>Added.csv'; needs csv to be a
        file name.
    addCols: if true, add the columns
        RFE - number of RFE features, 0 if RFE not used
        OS - 1 if the run was oversampled, else 0
        p1/2/3/4 - the param values for that run
        and one-hot columns for the model type.

    Also prints the best run for each measure among the runs that meet all
    the minimum scores.

    Returns the DataFrame with one row per run.
    Raises TypeError if write is true and csv is not a file name.
    '''
    if isinstance(csv, str):
        df = pd.read_csv(sf.addFolderPath(csv))
    else:
        df = csv
    df = df.set_index('Unnamed: 0')
    df = df.sort_values(by='acc', axis=1, ascending=False)
    df = df.transpose()
    print(f'Analysing {csv}:')

    _reportBestRuns(df)

    df.columns.rename('Run', inplace=True)
    #    df=df.head(100)
    if addCols:
        for newCol in ['Model', 'RFE', 'OS'] + _PARAM_COLS:
            df[newCol] = None
        for runName in df.index:
            model, rfe, oversampled, params = _parseRunName(runName)
            df.loc[runName, 'Model'] = model
            df.loc[runName, 'RFE'] = rfe
            df.loc[runName, 'OS'] = oversampled
            if params is None:
                continue
            for i, param in enumerate(_PARAM_COLS):
                if (runName[:2] == 'KN') and i == 3:
                    # KNN names carry three params, so p4 is set to 0
                    df.loc[runName, param] = 0
                else:
                    df.loc[runName, param] = params[i]
        # Make model type one hot
        df = pd.get_dummies(df, columns=['Model'], prefix='', prefix_sep='')
    if write:
        if not isinstance(csv, str):
            raise TypeError('write=True needs csv to be a file name')
        df.to_csv(sf.addFolderPath(csv[:-4] + 'Added.csv'))
    return df