Пример #1
0
def predictHeartDiseaseUsingML():
    """Predict the class of the single record sent in the request's JSON body.

    The JSON payload is wrapped in a one-row DataFrame, one-hot encoded with
    ``featureEncodingUsingOneHotEncoder`` and scaled with
    ``featureScalingUsingNormalizer``.  Every column except the last one is
    then passed to ``classifier.predict``.

    ``classifier`` is not created or loaded here; it must already exist in an
    enclosing scope when this function is called.

    Returns:
        The result of ``jsonify({'prediction': <int>})``, where the integer is
        the prediction for the first (and only) row.
    """
    # One JSON object in the request body becomes one DataFrame row.
    jsonRequest = request.get_json()
    testDataFrame = pd.DataFrame([jsonRequest])

    # Perform the same preprocessing as performed on the training dataset
    # (per the original author's note): one-hot encoding, then normalization.
    testDataFrameEncoded = featureEncodingUsingOneHotEncoder(testDataFrame)
    testDataFrameEncodedAndScaledDataset = featureScalingUsingNormalizer(
        testDataFrameEncoded)

    # The last column is excluded from the feature matrix.
    # NOTE(review): this assumes the payload carries the label as its last
    # column after preprocessing -- confirm against the client contract.
    xtest = testDataFrameEncodedAndScaledDataset.iloc[:, :-1].values

    # The previous version also extracted the last column and fitted a
    # LabelEncoder on it, but the encoded value was never read.  That dead
    # work has been removed so it can no longer fail for reasons unrelated
    # to producing the prediction.
    ytestpred = classifier.predict(xtest)
    return jsonify({'prediction': int(ytestpred[0])})
def performPreprocessingBuildModelsAndEvaluateAccuracy(trainingDataSet, testingDataSet, arrayOfModels):
    """Build, evaluate and save one classifier per model configuration.

    Each processed entry ``arrayOfModels[i]`` is read as
    ``[featureSelection, featureEncoding, featureScaling, classification]``.
    For every such entry the training and testing CSV files are loaded,
    concatenated, run through the named selection / encoding / scaling steps,
    split back into training and testing parts and handed to the named
    classifier routine.

    Side effects:
        * Progress information is printed.
        * Four values are appended, in place, to each processed entry of
          ``arrayOfModels``: training accuracy, testing accuracy, model name
          and model file path.
        * The fitted classifier is written to the model file path with
          ``joblib.dump``.

    Args:
        trainingDataSet: Not read; it is overwritten by the freshly loaded
            training CSV on every iteration.
        testingDataSet: Not read; it is overwritten by the freshly loaded
            testing CSV on every iteration.
        arrayOfModels: Sequence of mutable lists describing the models to
            build.  Entry 0 is skipped (see the note on the loop below).

    Raises:
        ValueError: If an entry names an unknown selection, encoding, scaling
            or classification technique.  Previously such an entry raised
            ``UnboundLocalError`` on the first iteration and, on later
            iterations, silently reused the previous model's results.
    """
    # NOTE(review): iteration starts at index 1, so arrayOfModels[0] is never
    # built -- presumably a header row; confirm against the caller.
    for i in range(1, len(arrayOfModels)):
        modelConfig = arrayOfModels[i]

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i, ' As Below *************************************************')
        selectionName = modelConfig[0]
        encodingName = modelConfig[1]
        scalingName = modelConfig[2]
        classifierName = modelConfig[3]
        print('\t -- Feature Selection: \t ', selectionName, ' \n\t -- Feature Encoding: \t ', encodingName, ' \n\t -- Feature Scaling: \t ', scalingName, ' \n\t -- Classification: \t ', classifierName, '\n')

        # The datasets are re-loaded for every model so each pipeline starts
        # from the untouched CSV contents.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # NOTE(review): `label` is never read below; the lookup is retained
        # unchanged so this edit does not alter when a missing label column
        # is reported.
        labelName = getLabelName()
        label = trainingDataSet[labelName]

        # Combine the test and training datasets and preprocess them
        # together: in some datasets the categorical columns hold different
        # values in the test and training parts, which causes issues when
        # applying the classification techniques.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet))

        print("completeDataSet.shape: ", completeDataSet.shape)
        print("completeDataSet.head: ", completeDataSet.head())

        # Feature selection
        if selectionName == 'TheilsU':
            completeDataSetAfterFeatureSelection = featureSelectionUsingTheilU(completeDataSet)
        elif selectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatureSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif selectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatureSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif selectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatureSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            # Fail loudly instead of reusing the previous iteration's result.
            raise ValueError('Unknown feature selection technique: %r' % (selectionName,))

        # Feature encoding: convert categorical values into numeric features.
        if encodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatureSelection)
        else:
            raise ValueError('Unknown feature encoding technique: %r' % (encodingName,))

        # Feature scaling: bring the features of the dataset into the same range.
        if scalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif scalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif scalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif scalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError('Unknown feature scaling technique: %r' % (scalingName,))

        # Split the complete dataset back into training and testing parts.
        (featuresInPreProcessedTrainingDataSet,
         featuresInPreProcessedTestingDataSet,
         labelInPreProcessedTrainingDataSet,
         labelInPreProcessedTestingDataSet) = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

        # Re-attach the label column to each part; the classifier routines
        # take one combined frame per part.
        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)

        # Classification.  The option strings (including their spelling) are
        # part of the caller-facing configuration and are matched verbatim.
        if classifierName == 'DecisonTree':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingDecisionTreeClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'RandomForestClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingRandomForestClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'ExtraTreesClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingExtraTreesClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'LogisticRegressionRegression':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLogisticRegression(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'LinearDiscriminantAnalysis':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLinearDiscriminantAnalysis(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'GuassianNaiveBayes':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingGaussianNB(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classifierName == 'KNN':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingKNNClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        else:
            raise ValueError('Unknown classification technique: %r' % (classifierName,))

        # Record the results on the configuration entry itself.
        modelConfig.append(trainingAccuracyScore)
        modelConfig.append(testingAccuracyScore)

        modelName = "_".join([selectionName, encodingName, scalingName, classifierName])
        modelFileName = getPathToGenerateModels() + modelName + ".pkl"
        modelConfig.append(modelName)
        modelConfig.append(modelFileName)

        # Save the model to file
        joblib.dump(classifier, modelFileName)
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    """Run the selection / encoding / scaling pipeline for every configuration.

    Each entry ``arrayOfModels[i]`` is read as
    ``[featureSelection, featureEncoding, featureScaling]``.  For every entry
    the training and testing CSV files are loaded, concatenated and run
    through the named steps.  Progress information is printed along the way.

    Args:
        trainingDataSet: Not read; it is overwritten by the freshly loaded
            training CSV on every iteration.
        testingDataSet: Not read; it is overwritten by the freshly loaded
            testing CSV on every iteration.
        arrayOfModels: Sequence of model configurations to preprocess.

    Returns:
        The encoded and scaled complete dataset produced for the LAST entry
        of ``arrayOfModels``; results for earlier entries are discarded.

    Raises:
        ValueError: If ``arrayOfModels`` is empty (previously an
            ``UnboundLocalError`` at the ``return``), or if an entry names an
            unknown selection, encoding or scaling technique (previously an
            ``UnboundLocalError`` or a silent reuse of the previous
            iteration's dataset).
    """
    if len(arrayOfModels) == 0:
        raise ValueError('arrayOfModels must contain at least one model configuration')

    for i, modelConfig in enumerate(arrayOfModels):
        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i, ' As Below *************************************************')
        selectionName = modelConfig[0]
        encodingName = modelConfig[1]
        scalingName = modelConfig[2]
        print('\t -- Feature Selection: \t ', selectionName, ' \n\t -- Feature Encoding: \t ', encodingName, ' \n\t -- Feature Scaling: \t ', scalingName, '\n')

        # The datasets are re-loaded for every configuration so each pipeline
        # starts from the untouched CSV contents.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # NOTE(review): `label` is never read below; the lookup is retained
        # unchanged so this edit does not alter when a missing label column
        # is reported.
        labelName = getLabelName()
        label = trainingDataSet[labelName]

        # Combine the test and training datasets and preprocess them
        # together: in some datasets the categorical columns hold different
        # values in the test and training parts, which causes issues when
        # applying the classification techniques.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet))

        print("completeDataSet.shape: ", completeDataSet.shape)
        print("completeDataSet.head: ", completeDataSet.head())

        # Feature selection
        if selectionName == 'TheilsU':
            completeDataSetAfterFeatureSelection = featureSelectionUsingTheilU(completeDataSet)
        elif selectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatureSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif selectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatureSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif selectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatureSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            # Fail loudly instead of reusing the previous iteration's result.
            raise ValueError('Unknown feature selection technique: %r' % (selectionName,))

        # Feature encoding: convert categorical values into numeric features.
        if encodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatureSelection)
        elif encodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatureSelection)
        else:
            raise ValueError('Unknown feature encoding technique: %r' % (encodingName,))

        # Feature scaling: bring the features of the dataset into the same range.
        if scalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif scalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif scalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif scalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError('Unknown feature scaling technique: %r' % (scalingName,))

        # The split helper is still invoked because it is not visible from
        # here whether it has side effects (e.g. printing or touching the
        # dataset it is given).  Its four results were previously only
        # concatenated into two frames that were never read, so that dead
        # work has been dropped.
        splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

    return completeEncodedAndScaledDataset