def featureSelectionUsingExtraTreesClassifier(dataSetForFeatureSelection):
    """Keep the features an ExtraTreesClassifier ranks above the mean importance.

    The dataset is passed through featureEncodingUsingLabelEncoder, an
    ExtraTreesClassifier with 700 estimators is fitted on every column except
    the label, and only the features whose importance is strictly greater
    than the mean importance are retained.

    Args:
        dataSetForFeatureSelection: DataFrame containing the feature columns
            and the column named by getLabelName().

    Returns:
        A new DataFrame holding the retained feature columns (in their
        original relative order) with the label column, as it appears in the
        encoded dataset, assigned last.
    """
    print(
        "\n****** Start performing feature selection using ExtraTreesClassifier *****"
    )
    print("****** Falls under wrapper methods (feature importance) *****")

    targetColumn = getLabelName()

    # Feature encoding is applied before fitting the ExtraTreesClassifier.
    encodedDataSet = featureEncodingUsingLabelEncoder(dataSetForFeatureSelection)
    predictors = encodedDataSet.drop([targetColumn], axis=1)
    target = encodedDataSet[targetColumn]
    encodedTarget = LabelEncoder().fit_transform(target)

    print("****** ExtraTreesClassification is in progress *****")
    # NOTE: no random_state is passed, so the ranking may differ between runs.
    forest = ExtraTreesClassifier(n_estimators=700)
    forest.fit(predictors, encodedTarget)
    importanceScores = forest.feature_importances_

    # Number of features whose importance is strictly above the mean.
    aboveAverageCount = int(
        np.count_nonzero(importanceScores > np.mean(importanceScores)))
    rankedImportances = pd.Series(importanceScores, index=predictors.columns)
    selectedFeatures = rankedImportances.nlargest(aboveAverageCount)
    print("\n selectedFeatures after ExtraTreesClassification: ",
          selectedFeatures)
    print("****** Completed ExtraTreesClassification *****")

    # Everything that was not selected (this includes the label column) is
    # removed first; the label is then put back as the last column.
    unselectedColumns = encodedDataSet.drop(
        selectedFeatures.keys(), axis=1).columns
    reducedDataSet = encodedDataSet.drop(unselectedColumns, axis=1)
    reducedDataSet[targetColumn] = target

    print('\n***** Number of columns in the dataSet after feature selection: ',
          len(reducedDataSet.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          reducedDataSet.columns)
    print(
        "****** End performing feature selection using ExtraTreesClassifier *****"
    )
    return reducedDataSet
def featureSelectionUsingChisquaredTest(dataSetForFeatureSelection,
                                        numberOfFeatureToBeSelected=10):
    """Keep the best-scoring features according to a Chi-squared test.

    The dataset is passed through featureEncodingUsingLabelEncoder, every
    column except the label is scored with SelectKBest(chi2), and the columns
    that SelectKBest does not support are dropped.

    Args:
        dataSetForFeatureSelection: DataFrame containing the feature columns
            and the column named by getLabelName().
        numberOfFeatureToBeSelected: Maximum number of features to keep
            (integer, defaults to 10). It is capped at the number of
            available features so small datasets do not make SelectKBest
            reject the request.

    Returns:
        A new DataFrame with the selected feature columns and the label
        column, in their original order. The encoded dataset itself is not
        modified.
    """
    print(
        "\n****** Start performing feature selection using ChisquaredTest *****"
    )
    print("****** Falls under filter methods (univariate selection) *****")

    labelName = getLabelName()

    # Encoding is applied to be able to run the Chi-squared test.
    encodedDataSet = featureEncodingUsingLabelEncoder(dataSetForFeatureSelection)
    features = encodedDataSet.drop([labelName], axis=1)
    label = encodedDataSet[labelName]

    # Never ask SelectKBest for more features than the dataset has.
    featureCount = min(numberOfFeatureToBeSelected, features.shape[1])
    fitBestfeatures = SelectKBest(score_func=chi2, k=featureCount).fit(
        features, label)

    # Pair every feature name with its score for a readable report.
    scoresOfBestFeatures = pd.DataFrame({
        'Features': list(features.columns),
        'Score': fitBestfeatures.scores_,
    })
    print("\n***** Scores for each feature in the dataset are *****")
    print(scoresOfBestFeatures.nlargest(featureCount, 'Score'))

    # Drop the rejected columns on a new frame instead of popping them from
    # the encoded dataset, which would mutate it in place.
    rejectedColumns = [
        columnName
        for columnName, isSelected in zip(features.columns,
                                          fitBestfeatures.get_support())
        if not isSelected
    ]
    dataSetAfterFeatuerSelection = encodedDataSet.drop(rejectedColumns, axis=1)

    print('***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using ChisquaredTest *****")
    return dataSetAfterFeatuerSelection
def performPreprocessingBuildModelsAndEvaluateAccuracy(trainingDataSet, testingDataSet, arrayOfModels):
    """Preprocess the data, train a classifier and record its accuracy per model entry.

    Each processed entry of arrayOfModels is a list whose first four items
    name, in order, the feature selection, feature encoding, feature scaling
    and classification technique. For every processed entry the training
    accuracy, testing accuracy, model name and model file name are appended
    to that entry in place, and the fitted classifier is written to the model
    file with joblib.dump.

    Args:
        trainingDataSet: Not read; the training data is reloaded from the CSV
            path returned by getPathToTrainingAndTestingDataSets() for every
            entry.
        testingDataSet: Not read; reloaded in the same way.
        arrayOfModels: List of model entries as described above.

    Returns:
        None. Results are reported by mutating arrayOfModels.

    Raises:
        ValueError: If an entry names a technique that is not recognised.
            Previously this surfaced as an UnboundLocalError or silently
            reused the previous entry's intermediate result.
    """
    # NOTE(review): iteration starts at index 1, so arrayOfModels[0] is never
    # built. Presumably that entry is a header/placeholder - confirm against
    # the callers before changing it.
    for i in range(1, len(arrayOfModels)):
        modelSpec = arrayOfModels[i]
        featureSelectionName = modelSpec[0]
        featureEncodingName = modelSpec[1]
        featureScalingName = modelSpec[2]
        classifierName = modelSpec[3]

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', featureSelectionName, ' \n\t -- Feature Encoding: \t ', featureEncodingName, ' \n\t -- Feature Scaling: \t ', featureScalingName, ' \n\t -- Classification: \t ', classifierName, '\n')

        # The datasets are reloaded from disk for every entry, so each model
        # starts from freshly loaded data.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Training and testing data are preprocessed together because in some
        # datasets the categorical columns hold different values in the two
        # sets, which causes issues when applying classification techniques.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet))
        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        # Feature selection
        if featureSelectionName == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError("Unknown feature selection technique: {!r}".format(featureSelectionName))

        # Feature encoding: convert categorical values into numeric features
        if featureEncodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError("Unknown feature encoding technique: {!r}".format(featureEncodingName))

        # Feature scaling: bring the features of the dataset into the same range
        if featureScalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError("Unknown feature scaling technique: {!r}".format(featureScalingName))

        # Split the complete dataset back into training and testing parts
        featuresInPreProcessedTrainingDataSet, featuresInPreProcessedTestingDataSet, labelInPreProcessedTrainingDataSet, labelInPreProcessedTestingDataSet = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)
        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)

        # Classification (the option names are matched exactly as callers spell them)
        if classifierName == 'DecisonTree':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingDecisionTreeClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'RandomForestClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingRandomForestClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'ExtraTreesClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingExtraTreesClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'LogisticRegressionRegression':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLogisticRegression(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'LinearDiscriminantAnalysis':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLinearDiscriminantAnalysis(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'GuassianNaiveBayes':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingGaussianNB(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'KNN':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingKNNClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        else:
            raise ValueError("Unknown classification technique: {!r}".format(classifierName))

        modelSpec.append(trainingAccuracyScore)
        modelSpec.append(testingAccuracyScore)

        modelName = featureSelectionName + "_" + featureEncodingName + "_" + featureScalingName + "_" + classifierName
        modelFileName = getPathToGenerateModels() + modelName + ".pkl"
        modelSpec.append(modelName)
        modelSpec.append(modelFileName)

        # Save the model to file
        joblib.dump(classifier, modelFileName)
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    """Run feature selection, encoding and scaling for each model entry.

    Each entry of arrayOfModels is a list whose first three items name, in
    order, the feature selection, feature encoding and feature scaling
    technique to apply to the combined training and testing data.

    Args:
        trainingDataSet: Not read; the training data is reloaded from the CSV
            path returned by getPathToTrainingAndTestingDataSets() for every
            entry.
        testingDataSet: Not read; reloaded in the same way.
        arrayOfModels: List of model entries as described above.

    Returns:
        The encoded and scaled complete dataset produced for the last entry
        processed, or None when arrayOfModels is empty.

    Raises:
        ValueError: If an entry names a technique that is not recognised.
            Previously this surfaced as an UnboundLocalError or silently
            reused the previous entry's intermediate result.
    """
    completeEncodedAndScaledDataset = None

    for i in range(0, len(arrayOfModels)):
        modelSpec = arrayOfModels[i]
        featureSelectionName = modelSpec[0]
        featureEncodingName = modelSpec[1]
        featureScalingName = modelSpec[2]

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', featureSelectionName, ' \n\t -- Feature Encoding: \t ', featureEncodingName, ' \n\t -- Feature Scaling: \t ', featureScalingName, '\n')

        # The datasets are reloaded from disk for every entry, so each entry
        # starts from freshly loaded data.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Training and testing data are preprocessed together because in some
        # datasets the categorical columns hold different values in the two
        # sets, which causes issues when applying classification techniques.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet))
        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        # Feature selection
        if featureSelectionName == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError("Unknown feature selection technique: {!r}".format(featureSelectionName))

        # Feature encoding: convert categorical values into numeric features
        if featureEncodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError("Unknown feature encoding technique: {!r}".format(featureEncodingName))

        # Feature scaling: bring the features of the dataset into the same range
        if featureScalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError("Unknown feature scaling technique: {!r}".format(featureScalingName))

        # NOTE(review): the split result is not used by this function. The
        # call is kept because the helper is defined elsewhere and may have
        # side effects on the dataset - confirm whether it can be removed.
        splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

    return completeEncodedAndScaledDataset