Пример #1
0
def doSFS(runDict, save=True):
    """Fit a forward and a backward sequential feature selector for each run.

    For every ``runName -> subDict`` entry a ``SequentialFeatureSelector`` is
    built around ``subDict['clf']`` and fitted on the module-level ``x`` and
    ``y``, first with ``forward=True`` and then with ``forward=False``.

    The fitted selector is stored in ``subDict`` under ``'Ffeatures'`` /
    ``'Bfeatures'`` and the matching ``x.columns`` labels under
    ``'FfilteredFeatures'`` / ``'BfilteredFeatures'``.

    Args:
        runDict: mapping of run name to a dict holding at least ``'clf'``.
        save: when true, the selector's ``subsets_`` is handed to
            ``pickling.save_dill`` under the name
            ``<runName>_Ffeatures`` / ``<runName>_Bfeatures``.

    Returns:
        The same ``runDict`` object, updated in place.
    """
    # (fitted-selector key, selected-columns key) for each search direction;
    # insertion order gives forward first, then backward.
    keysByDirection = {
        True: ('Ffeatures', 'FfilteredFeatures'),
        False: ('Bfeatures', 'BfilteredFeatures'),
    }
    for runName, subDict in runDict.items():
        for forward, (selectorKey, columnsKey) in keysByDirection.items():
            print(runName, forward)
            selector = SequentialFeatureSelector(
                subDict['clf'],
                k_features=(1, 50),
                forward=forward,
                verbose=2,
                scoring="accuracy",
                cv=5,
                n_jobs=-1,
            )
            fitted = selector.fit(x, y)
            subDict[selectorKey] = fitted
            chosenIdx = list(fitted.k_feature_idx_)
            subDict[columnsKey] = x.columns[chosenIdx]
            if not save:
                continue
            pickling.save_dill(fitted.subsets_, runName + '_' + selectorKey)
    return runDict
Пример #2
0
def postProcess(modelDataDict, modelDict, pickleIt=False, emailIt=True, ROC=False):
    """Summarise a batch of model runs, export the results and notify by e-mail.

    Steps, in order:
      1. ``makeAvgResults`` is called with ``modelDict`` and a name of the form
         ``AVG<day>_<month>_<hour><minute><fileName>``.
      2. The run with the highest ``'acc'`` entry is printed (first one wins
         on ties; if no accuracy exceeds 0 the name stays ``''``).
      3. Optionally ``modelDict`` is dilled via ``pickling.save_dill``.
      4. ``exportResultsDF`` is called with ``modelDict`` and a name of the
         form ``modelDict<day>_<month>_<hour><minute><fileName>``.
      5. Optionally every model's ``plotROC()`` is drawn on one figure.
      6. Optionally a summary e-mail is sent (best effort).

    Reads the module-level names ``fileName``, ``start`` and
    ``modelsAtStart``.

    Args:
        modelDataDict: not used by this function; kept for interface
            compatibility.
        modelDict: mapping of run name to model object.
        pickleIt: dill ``modelDict`` when true.
        emailIt: send the summary e-mail when true.
        ROC: plot ROC curves when true.

    Returns:
        The ``(modelAvgDict, modelScoresDict)`` pair produced by
        ``makeAvgResults``.
    """
    now = datetime.datetime.now()
    # day_month_hourminute stamp shared by every artefact name below.
    stamp = f"{now.day}_{now.month}_{now.hour}{now.minute}"

    modelAvgDict, modelScoresDict = makeAvgResults(
        modelDict, "AVG" + stamp + fileName
    )

    # Find 'best' model in dict (strict '>' keeps the first run on ties).
    maxAcc = 0
    currModAcc = ''
    for runName, avgResult in modelAvgDict.items():
        if avgResult['acc'] > maxAcc:
            maxAcc = avgResult['acc']
            currModAcc = runName
    print(f"Best model for accuracy from run is:\n{currModAcc}")
    print(f"Accuracy: {maxAcc}")

    if pickleIt:
        import pickling
        pickling.save_dill(modelDict, "modelDict" + stamp)

    # Save results in .csv file
    exportResultsDF(modelDict, "modelDict" + stamp + fileName)

    if ROC:
        # Draw every model's ROC curve on one shared figure.
        plt.figure(figsize=(15, 11))
        for item in modelDict.values():
            item.plotROC()
        plt.show()

    if emailIt:
        # Best-effort notification: a failure here (including missing
        # module-level `start` / `modelsAtStart`) must not lose the results
        # computed above. Only Exception is caught so that Ctrl-C and
        # interpreter shutdown still propagate.
        try:
            content = f"finished genericModelClass - took {datetime.datetime.now() - start} and ran {len(modelDict)-modelsAtStart} models\n{currModAcc} {maxAcc}"
            emailing.sendEmail(subject=f"{maxAcc}, {currModAcc}", content=content)
        except Exception as err:
            print("\nEmail sending failed, carrying on..\n")
            print(f"Reason: {err!r}")
    return modelAvgDict, modelScoresDict
Пример #3
0
def showBestFeaturesOfRunDict(runDict, printOut=True, save=False):
    """Build ordered feature lists for each run and report them.

    For every run and for each of ``'Ffeatures'`` and ``'Bfeatures'`` a list
    is stored in the run under ``'FbestFeatures'`` / ``'BbestFeatures'``.
    The list holds each feature name once, in order of first appearance while
    walking ``run['Ffeatures'].subsets_`` and then ``run['Bfeatures'].subsets_``.

    The list contents do not depend on ``printOut``; that flag only controls
    what is printed.

    Args:
        runDict: mapping of run name to a dict holding fitted selectors under
            ``'Ffeatures'`` and ``'Bfeatures'``; each must expose
            ``subsets_`` (values with ``'feature_names'`` and ``'avg_score'``)
            and ``k_feature_names_``.
        printOut: print the per-feature scores, the cutoff marker and the
            forward/backward overlap summary. The ``run name:`` line is
            printed regardless.
        save: hand each selector's ``subsets_`` to ``pickling.save_dill``
            under the name ``<runName>_<Ffeatures|Bfeatures>``.

    Returns:
        The same ``runDict`` object, updated in place.
    """
    for runName, run in runDict.items():
        print('run name:', runName)
        for forwardsOrBackwards in ['Ffeatures', 'Bfeatures']:
            if save:
                saveName = runName + '_' + forwardsOrBackwards
                pickling.save_dill(run[forwardsOrBackwards].subsets_, saveName)
            numberOfFeatures = len(run[forwardsOrBackwards].k_feature_names_)
            listName = forwardsOrBackwards[0] + 'bestFeatures'
            ranked = run[listName] = []
            if printOut:
                print(f'\n{listName} with {numberOfFeatures} features:')

            # NOTE(review): both result lists are filled from the forward
            # subsets followed by the backward subsets, whichever direction
            # is being reported - confirm this is intended rather than
            # walking only run[forwardsOrBackwards].subsets_.
            for sourceKey in ('Ffeatures', 'Bfeatures'):
                for subset in run[sourceKey].subsets_.values():
                    for name in subset['feature_names']:
                        if name in ranked:
                            continue
                        ranked.append(name)
                        # Scores are shown only while the list is still
                        # shorter than the selected feature count.
                        if printOut and len(ranked) < numberOfFeatures:
                            print(subset['avg_score'], name)
                if printOut and sourceKey == 'Ffeatures':
                    print("------CUTOFF------ ")

        forwardSet = set(run['Ffeatures'].k_feature_names_)
        backwardSet = set(run['Bfeatures'].k_feature_names_)
        if printOut:
            common = forwardSet & backwardSet
            print(
                f"\n{runName} has {len(forwardSet)} in forwardSet, {len(backwardSet)} in backwardSet, {len(common)} common to both:"
            )
            print(common)
    return runDict
Пример #4
0
           'KNN':{'cols':CS.KNNcols2, 'model':KNN},
           'RF' :{'cols':CS.RFcols2, 'model':RandomForest},
           'LR':{'cols':CS.LRcols, 'model':LogReg},
           'GNB':{'cols':CS.GNBcols, 'model':GaussianBayes},
#           'AB':{'cols':CS.RFcols2, 'model':AdaBoost}
           }

# Script entry point: for each input CSV and each model type in colDict,
# load the data and prepare the feature frame x and target y.
if __name__ == "__main__":
    import emailing
    import pickling
#    doneRuns=[]
    # Input CSV file names; each is resolved through sf.addFolderPath below.
    files = ['bbbbVgsbbbsdf7.csv']
    for fileName in files:
#        for cols in [CS.KNNcols]:#[SFS1Cols,SFS2Cols,chosenCols1, lessCols, cols]:   
        for modelType in colDict.keys():#['GNB','LR','KNN','SVM','NN','RF']:
            # NOTE(review): doneRuns is not assigned in the visible code (its
            # initialisation above is commented out) - confirm it is defined
            # elsewhere, otherwise this line raises NameError.
            pickling.save_dill(doneRuns, f"doneRuns_{len(doneRuns)}")
#            modelDict={}
            modelDataDict={}
            df = pd.read_csv(sf.addFolderPath( fileName))
#            cols = colDict[modelType]['cols']
            # Every column of the file is a candidate; the per-model column
            # lists from colDict are currently bypassed.
            cols = list(df.columns)
#            xCols = [x for x in (set(df.columns) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1'})]
#            cols.append('PerformancePctRank')
            # Drop the listed identifier/target/index columns from the
            # features. The result comes from a set difference, so the column
            # order of x is arbitrary and may differ between runs.
            xCols = [x for x in (set(cols) - {"URN", "Stuck","Class", "Unnamed: 0",'Unnamed: 0.1','PTRWM_EXP__18','GOR_Not Applicable'})]
            # x and y are bound at module scope here, so functions in this
            # file that read the globals x / y see these values.
            x = df[xCols]
            print(x.columns)
            y = df["Class"]
            # modelDict may not exist yet (its initialisation above is
            # commented out); treat that as zero models so far.
            try:
                modelsAtStart = len(modelDict)
            except NameError:
                modelsAtStart = 0