def getPCATraingAndTesting(thresh):
    """Build PCA-reduced training and testing sets from the loaded data.

    The data returned by ``loadData()`` is split by ``do.partionData`` with a
    ratio argument of 0.8. PCA is fitted on the training features only, and
    that fitted projection is then applied to the testing features.

    Args:
        thresh: Forwarded to ``findBestPCAFeatures`` together with the
            explained-variance ratios to decide how many components to keep
            (presumably a cumulative-variance threshold -- confirm against
            that helper).

    Returns:
        A tuple ``(trainingX, trainingY, testingX, testingY)``. The X frames
        contain the leading principal-component columns, positions 0 through
        ``numPcaComponents`` inclusive; the Y values are the unmodified
        ``'label'`` columns of each split.
    """
    allData = loadData()
    trainSplit, testSplit = do.partionData(allData, .8)

    rawTrainX = trainSplit.loc[:, features]
    trainingY = trainSplit.loc[:, 'label']
    rawTestX = testSplit.loc[:, features]
    testingY = testSplit.loc[:, 'label']

    # NOTE: the features are fed to PCA unscaled; no standardization step runs.
    pca = PCA()
    projectedTrain = pca.fit_transform(rawTrainX)

    # Decide how many components to keep and report the value.
    numPcaComponents = findBestPCAFeatures(pca.explained_variance_ratio_, thresh)
    print(f'Components: {numPcaComponents}')

    # The same column window is applied to both splits so they stay aligned.
    keptColumns = slice(0, numPcaComponents + 1)
    trainingX = pd.DataFrame(projectedTrain).iloc[:, keptColumns]
    testingX = pd.DataFrame(pca.transform(rawTestX)).iloc[:, keptColumns]

    return trainingX, trainingY, testingX, testingY
def _withLabelColumn(frame, featureColumns):
    """Return the ``featureColumns`` of ``frame`` with ``'label'`` appended as the last column."""
    selected = frame.loc[:, featureColumns]
    selected.insert(loc=len(selected.columns), column='label', value=frame['label'])
    return selected


def doBestFeatureSelection(clf, numFeatures, classifierName="Random Forests"):
    """Select the best features for ``clf``, then fit and evaluate it on them.

    Reads ``data/TrainData_Labeled.csv`` from the directory containing this
    file, splits it with ``do.partionData`` (ratio argument 0.8), asks
    ``fs.getBestFeaturesForHigherOrderTerms`` for ``numFeatures`` features
    scored by ``'accuracy'``, and then fits and tests ``clf`` on frames that
    hold only those features plus the ``'label'`` column.

    Args:
        clf: Classifier object handed to the feature search, to
            ``do.fitTrainingData`` and to ``do.testClassifier``.
        numFeatures: Forwarded to ``fs.getBestFeaturesForHigherOrderTerms``.
        classifierName: Third argument given to ``do.testClassifier``
            (presumably the display name used in its report -- confirm
            against that helper). Defaults to ``"Random Forests"``, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        None. The selected features and the testing frame are printed.
    """
    # Build the path portably instead of concatenating with a literal '/'.
    dataPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'TrainData_Labeled.csv'
    )
    multDf = pd.read_csv(dataPath)
    multTraining, multTesting = do.partionData(multDf, .8)

    bestFeatures = fs.getBestFeaturesForHigherOrderTerms(
        clf, multTraining, numFeatures, 'accuracy'
    )
    # A fixed feature list can be substituted for the search above, e.g.:
    # bestFeatures = list(['alcohol',
    #     'volatile acidity*total sulfur dioxide*density*',
    #     'volatile acidity*chlorides*free sulfur dioxide*pH*',
    #     'fixed acidity*volatile acidity*free sulfur dioxide*pH*sulphates*'])
    print(bestFeatures)

    # Both splits get the same treatment: chosen features first, label last.
    trainingData = _withLabelColumn(multTraining, bestFeatures)
    testingData = _withLabelColumn(multTesting, bestFeatures)
    print(testingData)

    do.fitTrainingData(clf, trainingData)
    do.testClassifier(clf, testingData, classifierName)