Exemplo n.º 1
0
def getPCATraingAndTesting(thresh):
    """Partition the loaded data and project both parts onto PCA components.

    The data returned by loadData() is partitioned with do.partionData
    (ratio .8). PCA is fitted on the training features only, and the testing
    features are projected with that same fitted model.

    Args:
        thresh: Forwarded to findBestPCAFeatures together with the fitted
            explained-variance ratios to decide how many components to keep.

    Returns:
        Tuple (trainingX, trainingY, testingX, testingY). The X frames hold
        the leading PCA columns; the Y values are the 'label' column of the
        corresponding partition.
    """
    trainPart, testPart = do.partionData(loadData(), .8)

    rawTrainX, trainingY = trainPart.loc[:, features], trainPart.loc[:, 'label']
    rawTestX, testingY = testPart.loc[:, features], testPart.loc[:, 'label']

    # Fit an unrestricted PCA on the training features. The features are used
    # as-is: no standardization step is applied in this function.
    decomposer = PCA()
    trainScores = decomposer.fit_transform(rawTrainX)

    # Decide how many components to retain and report the value.
    numPcaComponents = findBestPCAFeatures(decomposer.explained_variance_ratio_, thresh)
    print(f'Components: {numPcaComponents}')

    # The slice end is numPcaComponents + 1, i.e. one more column than the
    # printed value is kept.
    # NOTE(review): confirm whether findBestPCAFeatures returns a zero-based
    # index (then this is right) or a count (then this keeps one extra).
    sliceEnd = numPcaComponents + 1

    # NOTE(review): these frames are built from the transform output without an
    # explicit index, so presumably they get a default index while the label
    # values keep whatever index do.partionData produced - verify before
    # combining X and Y with index-aligned operations.
    trainingX = pd.DataFrame(trainScores).iloc[:, :sliceEnd]
    testingX = pd.DataFrame(decomposer.transform(rawTestX)).iloc[:, :sliceEnd]

    return trainingX, trainingY, testingX, testingY
Exemplo n.º 2
0
def doBestFeatureSelection(clf, numFeatures, classifierName="Random Forests"):
    """Select the best features for clf, then fit and evaluate it on them.

    Reads data/TrainData_Labeled.csv (relative to this file), partitions it
    with do.partionData (ratio .8), asks
    fs.getBestFeaturesForHigherOrderTerms for the best features using the
    'accuracy' criterion on the training partition, and then fits and tests
    clf on frames restricted to those features plus the 'label' column.

    Args:
        clf: Classifier object handed to the feature selector,
            do.fitTrainingData and do.testClassifier.
        numFeatures: Forwarded to fs.getBestFeaturesForHigherOrderTerms.
        classifierName: Name passed to do.testClassifier for the report.
            Defaults to "Random Forests", the value that was previously
            hard-coded, so existing callers behave the same.

    Returns:
        None. Results are produced by do.testClassifier; the selected
        features and the testing frame are printed.
    """
    # Build the path with os.path.join instead of concatenating with '/'.
    dataPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'data',
        'TrainData_Labeled.csv',
    )
    multDf = pd.read_csv(dataPath)
    multTraining, multTesting = do.partionData(multDf, .8)

    bestFeatures = fs.getBestFeaturesForHigherOrderTerms(clf, multTraining, numFeatures, 'accuracy')
    print(bestFeatures)

    trainingData = _selectFeaturesWithLabel(multTraining, bestFeatures)
    testingData = _selectFeaturesWithLabel(multTesting, bestFeatures)
    print(testingData)

    do.fitTrainingData(clf, trainingData)
    do.testClassifier(clf, testingData, classifierName)


def _selectFeaturesWithLabel(frame, featureNames):
    """Return a new frame with the featureNames columns and 'label' last."""
    # Work on an explicit copy so the in-place insert never touches (or warns
    # about) the partition frame it was sliced from.
    subset = frame.loc[:, featureNames].copy()
    subset.insert(loc=len(subset.columns), column='label', value=frame['label'])
    return subset