예제 #1
0
def makeCSVlistFromFolderName(folderName, basePath=sf.folderPath):
    '''Return a path for every entry found inside ``folderName``.

    The folder is located with ``sf.addFolderPath(folderName)`` and each
    entry name returned by ``listdir`` is passed back through
    ``sf.addFolderPath(entry, folderName=folderName)``.

    Every directory entry is included; nothing filters on the ``.csv``
    extension. ``basePath`` is accepted but not used by the body.
    '''
    entryNames = listdir(sf.addFolderPath(folderName))
    return [
        sf.addFolderPath(entryName, folderName=folderName)
        for entryName in entryNames
    ]
예제 #2
0
파일: SFS.py 프로젝트: crees00/SchoolsData
def processListOfPickles(listOfPickles, folderName='', printOut=True):
    '''Load each ``.pik`` file and pass it to showBestFeaturesOfLoadedDict.

    listOfPickles: file names, looked up inside ``folderName`` after it has
        been expanded with ``sf.addFolderPath``.
    printOut: forwarded to showBestFeaturesOfLoadedDict; when true a summary
        of the single best (accuracy, num features, run) is printed as well.

    Returns a dict mapping each loaded file name to the list returned by
    showBestFeaturesOfLoadedDict. Names not ending in '.pik' are reported
    and skipped.
    '''
    folderName = sf.addFolderPath(folderName)
    dictOfPickleNamesAndOutLists = {}
    for pickleName in listOfPickles:
        picklePath = os.path.join(folderName, pickleName)
        print(picklePath)
        if not pickleName.endswith('.pik'):
            print(pickleName, 'not opened')
            continue
        loadedDict = pickling.load_dill(picklePath)
        print('\nunpacking', pickleName)
        dictOfPickleNamesAndOutLists[pickleName] = (
            showBestFeaturesOfLoadedDict(loadedDict, printOut))

    if not printOut:
        return dictOfPickleNamesAndOutLists

    # Each group is indexed as (num features, accuracy, ...); keep the first
    # group that strictly beats the running best accuracy.
    bestAcc, numFeatures, bestRun = 0, 0, ''
    allGroups = ((runName, group)
                 for runName, outList in dictOfPickleNamesAndOutLists.items()
                 for group in outList)
    for runName, group in allGroups:
        if group[1] > bestAcc:
            numFeatures, bestAcc = group[0], group[1]
            bestRun = runName
    print('\nBest Results:')
    print('best accuracy, num features, best run')
    print(bestAcc, numFeatures, bestRun)
    print()
    return dictOfPickleNamesAndOutLists
예제 #3
0
def makeNewDoneRunListFromOutFile(outFile):
    '''Build the list of finished run names from a combined results csv.

    outFile is read via ``sf.addFolderPath``; every column except
    'Unnamed: 0' is treated as a run name, and the fold marker
    ('_<digit>of5_') inside each name is collapsed to a single '_'.

    Asserts that more than 100 names were found, as a check that runs are
    stored as columns rather than rows.
    '''
    import re
    resultsDF = pd.read_csv(sf.addFolderPath(outFile))
    runColumns = set(resultsDF.columns) - {'Unnamed: 0'}
    cols = [re.sub('_[0-9]of5_', '_', runColumn) for runColumn in runColumns]
    print('made newRunDoneList with', len(cols), 'items')
    assert (len(cols) >
            100), 'df needs to be transposed - need col names to be run names'
    return cols
예제 #4
0
def load_dill(name):
    '''Load and return a dill-serialised object.

    ``name`` is tried exactly as given first; if that raises
    FileNotFoundError the load is retried on ``sf.addFolderPath(name)``.
    A failure of the second attempt propagates to the caller.
    '''

    def _readFrom(path):
        with open(path, 'rb') as handle:
            return dill.load(handle)

    try:
        return _readFrom(name)
    except FileNotFoundError:
        return _readFrom(sf.addFolderPath(name))


#if __name__ == "__main__":
#    save_dill(modelDict, 'modelDictAllFeatures0908')
#    aReloaded = load_dill('modelDictWithDill')
#    pass
예제 #5
0
def runAll():
    '''Run the whole school-inspection pipeline from "bigDFnoDups1.csv".

    Resets the module globals ``predsThatAreNotInDF`` and ``inspList``,
    rebinds the global ``SchoolDict``, and calls the processing helpers in
    a fixed order. Two helpers are handed output paths ("clusterDF.csv" and
    "dfOut.csv", both expanded with ``sf.addFolderPath``).

    Returns (SchoolDict, openSchoolDict, groupDict); ``groupDict`` is always
    an empty dict because the grouping call is commented out.
    '''
    global predsThatAreNotInDF
    predsThatAreNotInDF = []
    global inspList
    global SchoolDict
    inspList = []
    df = pd.read_csv(sf.addFolderPath("bigDFnoDups1.csv"))
    # Row-wise pass; presumably loadInspections fills the global inspList
    # as a side effect -- TODO confirm against its definition.
    df = df.apply(loadInspections, axis=1)
    SchoolDict, allInspNos, dupInsps = assignInspectionsToSchools(inspList)
    # Second row-wise pass; presumably records predecessor URNs (and the
    # global predsThatAreNotInDF) -- TODO confirm.
    df = df.apply(addPredecessorURNsFromDF, axis=1)
    SchoolDict = addAllPredecessors(SchoolDict)
    stuck = calcStuck(SchoolDict)
    SchoolDict, openAndUninspected = setAllStatuses(SchoolDict)
    # openStuck is computed but not used later in this function.
    openStuck = whichStuckAreOpen(stuck)
    SchoolDict = feedToSort(SchoolDict)
    # dfForClustering and dfOut are not used below; the calls are handed
    # csv paths, so they are presumably kept for their file output.
    dfForClustering = clusterDF(SchoolDict, sf.addFolderPath("clusterDF.csv"))
    SchoolDict = makeGoodsAndBadsLists(SchoolDict)
    dfOut = makeURNvsYearInspCats(SchoolDict, sf.addFolderPath("dfOut.csv"))
    #    finalPt, steps = makeMatrices(SchoolDict)
    openSchoolDict = findOpenSchools(SchoolDict)
    # threeGenerations is computed but neither returned nor used here.
    threeGenerations = findGrandParents(SchoolDict)
    #    groupDict = grouping(openSchoolDict)
    groupDict = {}
    return SchoolDict, openSchoolDict, groupDict
예제 #6
0
def makeDFofDiffsWithTrueValues(bigDF, write=''):
    '''Subtract the 'TheTruth' column from every column of ``bigDF``.

    bigDF: DataFrame with one column per run plus a 'TheTruth' column.
    write: if not '', the frame of differences is saved to
        ``sf.addFolderPath(write)``.

    A difference of 0 means the prediction matched the truth, so the run
    with the most zeros is the most accurate one.

    Returns None.
    '''
    accdf = pd.DataFrame(
        {col: bigDF[col] - bigDF['TheTruth'] for col in bigDF.columns})
    if write != '':
        accdf.to_csv(sf.addFolderPath(write))

    bestScore = 0
    bestRun = None
    for col in accdf:
        if col == 'TheTruth':
            continue
        # Count exact matches directly; value_counts()[0] raised KeyError
        # for any run that had no correct prediction at all.
        runScore = int((accdf[col] == 0).sum())
        if runScore > bestScore:
            bestScore, bestRun = runScore, col
    # NOTE(review): bestRun / bestScore are computed but never returned or
    # printed; the None return is kept so existing callers are unaffected.
    return
예제 #7
0
def setAllStatuses(SchoolDict):
    '''Mark every school listed in the edubase csv as "open".

    The csv file name depends on the module-level ``where`` setting; the
    file is read via ``sf.addFolderPath`` with latin-1 encoding and its
    "URN" column supplies the set of open schools.

    URNs that are not yet in ``SchoolDict`` get a new ``School`` created,
    marked open, and collected in the second return value.

    Returns (SchoolDict, openAndUninspected); ``SchoolDict`` is modified in
    place.
    '''
    openAndUninspected = []
    if where in ["ONS", "Cdrive"]:
        # Raw string: "\e" is not a valid escape sequence, so the plain
        # literal only worked by accident and warns on current Python.
        # The runtime value is unchanged.
        file = r"Data\edubaseallstatefunded20190704.csv"
    else:
        file = "edubaseallstatefunded20190627.csv"
    openSchools = pd.read_csv(sf.addFolderPath(file), encoding="latin-1")
    openSchoolsSet = set(openSchools["URN"])
    for URN in openSchoolsSet:
        try:
            SchoolDict[URN].setStatus("open")
        except KeyError:
            # Open according to edubase but never seen before.
            SchoolDict[URN] = School(URN)
            SchoolDict[URN].setStatus("open")
            openAndUninspected.append(SchoolDict[URN])
    print("school statuses set")
    return SchoolDict, openAndUninspected
예제 #8
0
def exportResultsDF(dic, write=''):
    '''Tabulate every model in ``dic`` into one DataFrame.

    For each (name, model) pair, every function in the module-level
    ``entryInput`` mapping is applied to the model; the results form one
    column of the output (columns are model names, rows are the
    ``entryInput`` keys).

    write: if non-empty, the frame is also saved to
        ``sf.addFolderPath(write)``.

    Returns the DataFrame.
    '''
    print('saving..')
    dictToDF = {
        name: {key: func(mod) for key, func in entryInput.items()}
        for name, mod in dic.items()
    }
    outDF = pd.DataFrame(dictToDF)
    if len(write) == 0:
        return outDF
    outDF.to_csv(sf.addFolderPath(write))
    print(write, 'file written')
    return outDF
예제 #9
0
def makeAvgResults(modelDict, write=''):
    '''Average the k-fold results of each run setup.

    Run names containing 'of' (not at position 0) are treated as folds: the
    span from two characters before 'of' to three characters after its
    start is cut out (e.g. '_1of5') and the remainder is the name shared by
    all folds of that setup. Other run names are ignored.

    Every function in the module-level ``entryInput`` is applied to each
    fold and the values are gathered per setup. Non-numeric entries ('cr',
    'longName', ...) are skipped when averaging, and the variance of 'acc'
    is stored under 'acc variance'.

    write: if non-empty, the averages are saved to
        ``sf.addFolderPath(write)``.

    Returns (modelAvgDict, modelScoresDict).
    '''
    notAveraged = {
        'cr', 'longName', 'params', 'runCode', 'runName', 'tpr', 'fpr', 'cm'
    }
    modelScoresDict, modelAvgDict = {}, {}

    # Pass 1: collect a list of per-fold scores for every setup.
    for run, model in modelDict.items():
        loc = run.find('of')
        if loc <= 0:
            continue
        avgName = run[:loc - 2] + run[loc + 3:]
        if avgName in modelScoresDict:
            foldScores = modelScoresDict[avgName]
            for score in foldScores.keys():
                foldScores[score].append(entryInput[score](model))
        else:
            modelScoresDict[avgName] = {
                key: [func(model)]
                for key, func in entryInput.items()
            }

    # Pass 2: reduce each list of fold scores to its mean.
    for runName, runResultsDict in modelScoresDict.items():
        averages = {}
        modelAvgDict[runName] = averages
        for score in set(runResultsDict.keys()) - notAveraged:
            averages[score] = np.mean(runResultsDict[score])
            averages['acc variance'] = np.var(runResultsDict['acc'])

    if len(write) > 0:
        pd.DataFrame(modelAvgDict).to_csv(sf.addFolderPath(write))
        print(write, 'file written')

    return modelAvgDict, modelScoresDict
예제 #10
0
def makeDFofResults(write=''):
    '''Collect the test-set predictions of every run into one DataFrame.

    Reads the module-level ``modelDict`` (run name -> model instance) and
    ``df`` (the full data set). For each model the classifier predicts on
    that model's own test split; the folds of one run setup are stacked
    into a single column (named after the run with the '_<n>of<m>' marker
    removed) and joined to the result once all 5 folds have been seen.

    write: if not '', the frame is saved to ``sf.addFolderPath(write)``.

    Returns the DataFrame, indexed 0..len(df)-1, with one column per run
    setup plus 'TheTruth' (copied from df['Class']).

    Raises ValueError if the folds of one setup are not consecutive in
    ``modelDict``.
    '''
    # 'tester' is a throwaway column that only gives bigDF its full index.
    bigDF = pd.DataFrame({'tester': np.zeros(len(df))},
                         index=list(range(len(df))))
    oldName = ''
    for modelName, modelInstance in modelDict.items():
        newName = ''.join(re.split('_[0-9]of[0-9]', modelName))

        xTest = modelInstance.getData().getxTest()
        y_pred = modelInstance.getCLF().predict(xTest)
        # Keep the original row labels so the folds line up with df.
        y_predDFOneRun = pd.DataFrame({newName: y_pred},
                                      index=list(xTest.index))

        if oldName != newName:  # First fold of this set of 5 runs
            DF5runs = y_predDFOneRun
            numOfDFsAdded = 1
        else:
            # DataFrame.append was removed in pandas 2.0; concat stacks the
            # rows the same way.
            DF5runs = pd.concat([DF5runs, y_predDFOneRun])
            numOfDFsAdded += 1
        if DF5runs.shape[1] > 1:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError(
                'modelDict not in order so change the way bigDF is made')
        if numOfDFsAdded == 5:
            bigDF = bigDF.join(DF5runs)
        oldName = newName

    bigDF.drop('tester', axis=1, inplace=True)

    bigDF['TheTruth'] = df['Class'][:]
    if write != '':
        bigDF.to_csv(sf.addFolderPath(write))

    return bigDF
예제 #11
0
def makeLabelledSubsets(dictOfURNGroups, cat1, cat2, df, write=''):
    """Add a 'Class' column to df from two URN groups.

    Class is
        1 if the row's URN is in dictOfURNGroups[cat1],
        2 if the URN is in neither group,
        0 otherwise (i.e. it is in dictOfURNGroups[cat2]).

    The 'Class' column is written onto the df that was passed in; the
    'Stuck' column is then removed with cam.dropColsFromList.

    If write is a non-empty string the result is saved to
    sf.addFolderPath(write).

    Returns the labelled DataFrame.
    """
    import creatingAMonster as cam
    print("Adding/updating stuck column in df...")
    posURNs, negURNs = dictOfURNGroups[cat1], dictOfURNGroups[cat2]
    allURNs = posURNs + negURNs
    URNsToDrop = set(df['URN']) - set(allURNs)
    # Set membership is O(1) per row; the list lookup was O(len(posURNs)).
    posURNSet = set(posURNs)

    def classify(urn):
        # Same precedence as before: the positive group wins, then
        # "in neither group", then the negative group.
        urn = int(urn)
        if urn in posURNSet:
            return 1
        if urn in URNsToDrop:
            return 2
        return 0

    df["Class"] = df["URN"].apply(classify)
    print(df['Class'].value_counts())
    df = cam.dropColsFromList(df, ['Stuck'])
    if len(write) > 0:
        df.to_csv(sf.addFolderPath(write))
    return df
예제 #12
0
def combineIntermediateResultsCSVs(listOfCSVFilenames, outFile=''):
    ''' Stitch intermediate results csvs into a single DataFrame.

    Long runs dump their results to csv at intervals to free memory; this
    joins those files side by side. The first file is kept whole. From each
    later file, 'Unnamed: 0' and any column already present are removed
    before the remaining columns are joined on the row position.

    The module global ``dupLists`` is reset and filled with any joined
    columns ending in '_DUP', keyed by the file being joined when they were
    first seen.

    outFile: if not '', the result is written (without its index) to
        ``sf.addFolderPath(outFile)``.

    Returns the combined DataFrame.
    '''
    global dupLists
    dupLists = {}
    seenDups = []
    bigDF = pd.read_csv(listOfCSVFilenames[0])
    # NOTE(review): the returned frame is discarded, so the index is NOT
    # changed; the call only raises if 'Unnamed: 0' is missing.
    bigDF.set_index(keys='Unnamed: 0')
    for fileName in listOfCSVFilenames[1:]:
        incomingDF = pd.read_csv(fileName)
        unwanted = (set(bigDF.columns)
                    & set(incomingDF.columns)) | {'Unnamed: 0'}
        incomingDF = incomingDF.drop(
            columns=[col for col in incomingDF.columns if col in unwanted])
        bigDF = bigDF.join(incomingDF)

        for col in bigDF.columns:
            if col.endswith('_DUP') and col not in seenDups:
                seenDups.append(col)
                dupLists.setdefault(fileName, []).append(col)
    if outFile != '':
        bigDF.to_csv(sf.addFolderPath(outFile), index=False)
    return bigDF
예제 #13
0
파일: SFS.py 프로젝트: crees00/SchoolsData
import setFolder as sf
import genericModelClass as gmc
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import pickling
import os
import matplotlib.pyplot as plt
from math import ceil
# Load the data set and split it into the feature matrix x and target y.
# Alternative input files used previously: 'bbbbVgsbbbsAllCols.csv',
# 'bbbVgsbbsLessCols.csv', 'bbbVgbbLessCols.csv', 'bbbbVgbbbLessCols.csv'.
csv = sf.addFolderPath('bbbbVgsbbbsdf7.csv')
df = pd.read_csv(csv)
# Every column is a feature except the identifiers, the labels and the
# leftover index columns.
xCols = list(
    set(df.columns) - {
        "URN", "Stuck", "Class", "Unnamed: 0", 'Unnamed: 0.1',
        'GOR_Not Applicable'
    })
x, y = df[xCols], df["Class"]

runDict = {
    #         'NN_original_2_5_adam_0.0001':{'clf':MLPClassifier(hidden_layer_sizes=(5,5), solver='adam', max_iter=1000)},
    #     'RF_original_200_11_gini_True':{'clf':RandomForestClassifier(n_estimators=200, max_depth=11, criterion='gini', bootstrap=True)},
    #              'SVM_original_4_rbf_2_0.016':{'clf':SVC(C=4, gamma=0.016)},
예제 #14
0
    },
    'RFE': {
        30: 'No\nRFE',
        5: '5',
        10: '10',
        15: '15',
        20: '20',
        25: '25'
    }
}
# Re-key each paramDict entry from positions 0..3 to the labels 'p1'..'p4'.
for item, paramValues in paramDict.items():
    paramDict[item] = {f'p{i}': paramValues[i - 1] for i in range(1, 5)}

#df = pd.read_csv(sf.addFolderPath('paramsearch3forDF7Added.csv'))
df = pd.read_csv(sf.addFolderPath('paramSearch2forOldStuckAdded.csv'))

measureList = [
    'auc',
    'acc',
    'recall1',
    'recall0',
    'precision1',
    'precision0',
]
#paramScatterPlots(df, 'auc', subplots=True)
#scores= bestModelsBarPlot(df, mins=mins)
#makeSubplots(df, measureList, mins=mins, figsize=(15,7), ymax=1, chosenMeasure='precision1')
#
# One RFE bar plot per measure.
for score in measureList:
    RFEBarPlot(
        df, score=score, OS=True, subPlots=True, mins=mins, barwidth=0.3)

#scores = findParamsOfBestRuns(df, mins=mins)
예제 #15
0
           'GNB':{'cols':CS.GNBcols, 'model':GaussianBayes},
#           'AB':{'cols':CS.RFcols2, 'model':AdaBoost}
           }

if __name__ == "__main__":
    import emailing
    import pickling
#    doneRuns=[]
    files = ['bbbbVgsbbbsdf7.csv']#['stuckForDF7.csv']*10#'bbbbVgsbbbsdf7.csv']#*1000
    for fileName in files:#['bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv','bbbbVgsbbbs6.csv']:#,'bbbbVgsbbbsAllCols.csv']:#,'bbbVgsbbsLessCols.csv','bbbVgbbLessCols.csv', 'bbbbVgbbbLessCols.csv']:
#        for cols in [CS.KNNcols]:#[SFS1Cols,SFS2Cols,chosenCols1, lessCols, cols]:   
        for modelType in colDict.keys():#['GNB','LR','KNN','SVM','NN','RF']:
            pickling.save_dill(doneRuns, f"doneRuns_{len(doneRuns)}")
#            modelDict={}
            modelDataDict={}
            df = pd.read_csv(sf.addFolderPath( fileName))
#            cols = colDict[modelType]['cols']
            cols = list(df.columns)
#            xCols = [x for x in (set(df.columns) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1'})]
#            cols.append('PerformancePctRank')
            xCols = [x for x in (set(cols) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1','PTRWM_EXP__18','GOR_Not Applicable'})]
            x = df[xCols]
            print(x.columns)
            y = df["Class"]
            try:
                modelsAtStart = len(modelDict)
            except NameError:
                modelsAtStart = 0
            # Generate data and model instances, run the models
            modelDataDict, modelDict = runAGroup(
                [
예제 #16
0
def save_dill(obj, name):
    '''Serialise ``obj`` with dill to ``<name>.pik``.

    The '.pik' suffix is appended and the result is passed through
    ``sf.addFolderPath``, so callers give a bare name. Prints 'pickled'
    when done.
    '''
    targetPath = sf.addFolderPath(name + '.pik')
    with open(targetPath, 'wb') as handle:
        dill.dump(obj, handle)
    print('pickled')
예제 #17
0
    0 if not stuck
    If write != False then write to .csv
    """
    import creatingAMonster as cam
    print("Adding/updating stuck column in df...")
    posURNs, negURNs = dictOfURNGroups[cat1], dictOfURNGroups[cat2]
    allURNs = posURNs + negURNs
    URNsToDrop = set(df['URN']) - set(allURNs)
    #    df = df[~df['URN'].isin(URNsToDrop)]
    df["Class"] = df.apply(lambda row: np.where(
        (int(row["URN"]) in posURNs), 1,
        np.where(int(row["URN"]) in URNsToDrop, 2, 0)),
                           axis=1)
    print(df['Class'].value_counts())
    df = cam.dropColsFromList(df, ['Stuck'])
    if len(write) > 0:
        df.to_csv(sf.addFolderPath(write))
    return df


# Script section: build the URN groups from the open schools, load the
# imputed (not normalised) feature data and label it bbbb (class 1) vs
# gsbbbs (class 0).
dictOfURNs = makeURNListFromGroupDict(newGrouping(openSchoolDict, True))
inputDF = pd.read_csv(
    sf.addFolderPath('AllDatanotNormedForFeaturePlots_bbbbVgsbbbsImputed.csv'))
# Earlier labelling variants, kept for reference:
#dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbb','gbb',inputDF, 'bbbVgbbLessCols.csv')
#dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbbb','gbbb',inputDF, 'bbbbVgbbbLessCols.csv')
#dfWithCats = makeLabelledSubsets(dictOfURNs, 'bbb','gsbbs', inputDF, 'bbbVgsbbsLessCols.csv')
# NOTE(review): makeLabelledSubsets already passes `write` through
# sf.addFolderPath, so this path goes through it twice (the variants above
# pass a bare file name) -- confirm that addFolderPath tolerates that.
dfWithCats = makeLabelledSubsets(
    dictOfURNs, 'bbbb', 'gsbbbs', inputDF,
    sf.addFolderPath(
        'AllDatanotNormedForFeaturePlots_bbbbVgsbbbsImputedWithClass.csv'))
예제 #18
0
    roc_curve,
    roc_auc_score,
    auc,
    classification_report,
)
import re
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
import itertools
from random import sample
import datetime
import colSubsets as CS
import setFolder as sf

# Module-level data set, read once when the module is loaded; functions in
# this module read it as a global.
df = pd.read_csv(sf.addFolderPath('bbbbVgsbbbsdf7.csv'))


def makeDFofResults(write=''):
    numSchools = len(df)
    bigDF = pd.DataFrame({'tester': np.zeros(len(df))},
                         index=list(range(len(df))))
    oldName = ''
    for modelName, modelInstance in modelDict.items():
        #    if modelName[-11:]=='_42_brute_1':
        newName = ''.join(re.split('_[0-9]of[0-9]', modelName))

        #        print(modelName)
        modData = modelInstance.getData()
        xTest = modData.getxTest()
        yTest = modData.getyTest()
예제 #19
0
"""
Created on Sat Aug 31 12:10:27 2019

Plot histograms of each variable with class1 vs class0
Class1 (bad next) is blue
Class0 (good next) is orange

@author: Chris
"""
from os import listdir
import matplotlib.pyplot as plt
import pandas as pd
import setFolder as sf
#df = pd.read_csv(sf.addFolderPath( 'notNormedForFeaturePlots_bbbbVgsbbbs.csv'))
# Un-normalised data used for the per-feature histograms.
df = pd.read_csv(
    sf.addFolderPath('AllDatanotNormedForFeaturePlots_bbbbVgsbbbs.csv'))

# Default: every column except identifiers, labels and leftover indexes.
plotCols = list(
    set(df.columns) - {"URN", "Stuck", "Class", "Unnamed: 0", 'Unnamed: 0.1'})

# The default above is immediately replaced by this hand-picked subset.
plotCols = [
    'PNUMFSM',
    'Total revenue balance (1) as a % of total revenue income (6) 2017-18',
    'ISPRIMARY',
    'Premises.2018',
    'PTKS1GROUP_L__18',
    'PNUMEAL',
    'ISSECONDARY',
    'PERCTOT',
    'AcademyNew',
    'Pupil:     Teacher Ratio',
    'Supply.Staff_4yrDiff',
    'Mean Gross FTE Salary of All Teachers (£s)',
    'TOTPUPS__18',
    'PerformancePctRank',
    'AGEL',
]
plt.figure(figsize=(16, 20))
i = 0
예제 #20
0
def _splitRunParams(paramText):
    '''Split the '_'-separated parameter tail of a run name into values.

    Numeric pieces are appended as floats in order; any non-numeric piece
    is moved to the front of the list. 'None' is read as 20, and a trailing
    'True' / 'False' becomes 1.0 / 0.0. The last piece must be numeric or
    boolean, otherwise ValueError is raised.
    '''
    bits = []
    string = ''
    for char in paramText:
        if char == '_':
            if string == 'None':
                # NOTE(review): 20 presumably stands in for an unlimited
                # setting (e.g. max depth) -- confirm.
                string = 20
            try:
                bits.append(float(string))
            except ValueError:
                bits.insert(0, string)
            string = ''
        else:
            string += char
    if string == 'False':
        string = 0
    elif string == 'True':
        string = 1
    bits.append(float(string))
    return bits


def processCSV(csv, write=False, addCols=True):
    ''' Takes in avg results csv
    Reads run name and extracts run info, putting into cols to use later
    for plotting etc.
    Adds one hot cols for RFE & OS
    Adds p1/2/3/4 cols with the param values for that run

    csv: file name (read via sf.addFolderPath) or an already loaded
        DataFrame. Either way it needs an 'Unnamed: 0' column holding the
        measure names, with one column per run. A DataFrame passed in is
        not modified.
    write: if true (and addCols is true) the result is saved next to the
        input as '<csv without extension>Added.csv'; needs csv to be a
        file name.
    addCols: add the Model/RFE/OS/p1..p4 columns and one-hot the model.

    Prints the best run for each measure among the runs that pass every
    minimum score.

    Returns the DataFrame with one row per run.
    '''
    if isinstance(csv, str):
        df = pd.read_csv(sf.addFolderPath(csv))
    else:
        df = csv
    # Non-inplace, so a DataFrame handed in by the caller is left untouched.
    df = df.set_index('Unnamed: 0')
    df = df.sort_values(by='acc', axis=1, ascending=False)
    df = df.transpose()
    print(f'Analysing {csv}:')
    # Make subset of df that has the minimum scores in the dict
    dfWithMins = df.copy()
    minScores = {
        'acc': 0.5,
        'recall1': 0.3,
        'F0': 0.1,
        'F1': 0.1,
        'precision0': 0.1
    }
    for measure, score in minScores.items():
        # Filter the running subset; filtering df again on each pass kept
        # only the last threshold.
        dfWithMins = dfWithMins[dfWithMins[measure] > score]

    best = {}
    for col in [
            'F0', 'F1', 'acc', 'auc', 'precision0', 'precision1', 'recall0',
            'recall1'
    ]:
        print(f"Best {col}:")
        try:
            runName = dfWithMins.loc[:, col].idxmax()
            best[col] = (dfWithMins.loc[:, col].max(), runName,
                         dfWithMins.loc[runName, :])
        except (KeyError, ValueError):
            # KeyError: the measure is not in the file.
            # ValueError: no run passed the minimum scores.
            continue
        print(best[col][0], 'for', best[col][1])

    df.columns.name = 'Run'

    if addCols:
        df['Model'] = None
        df['RFE'] = None
        df['OS'] = None
        df['p1'], df['p2'], df['p3'], df['p4'] = None, None, None, None
        nameDict = {
            'SV': 'SVM',
            'NN': 'NN',
            'RF': 'RF',
            'GN': 'GNB',
            'LR': 'LR',
            'KN': 'KNN'
        }
        for runName in df.index:
            # Identify model type in 'Model' col
            prefix = runName[:2]
            df.loc[runName, 'Model'] = nameDict[prefix]
            # `end` tracks where the parameter part of the name starts:
            # past the model name and its trailing '_'.
            end = 4 if prefix in ['GN', 'SV', 'KN'] else 3

            # Put no. of RFE vals in RFE col - 0 if RFE not used
            if 'RFE' in runName:
                rfeStart = runName.find('RFE') + 3
                RFE = runName[rfeStart:rfeStart + 2]
                try:
                    RFE = int(RFE)
                    end += 6
                except ValueError:
                    # Single-digit count: the 2nd character is not a digit.
                    RFE = int(RFE[0])
                    end += 5
            else:
                RFE = 0
            df.loc[runName, 'RFE'] = RFE

            # Put 1 in 'OS' col if oversampled
            if 'OS' in runName:
                df.loc[runName, 'OS'] = 1
                end += 3
            else:
                df.loc[runName, 'OS'] = 0
            # Fix for 'original'
            if end < 5:
                end += 9

            # Sort out params
            if prefix in ['SV', 'NN', 'RF', 'KN']:
                bits = _splitRunParams(runName[end:])
                for i, param in enumerate(['p1', 'p2', 'p3', 'p4']):
                    if (prefix == 'KN') and i == 3:
                        df.loc[runName, param] = 0
                    else:
                        df.loc[runName, param] = bits[i]
        # Make model type one hot
        df = pd.get_dummies(df, columns=['Model'], prefix='', prefix_sep='')

        if write:
            if not isinstance(csv, str):
                raise TypeError(
                    'write=True needs csv to be a file name, not a DataFrame')
            df.to_csv(sf.addFolderPath(csv[:-4] + 'Added.csv'))

    return df