示例#1
0
    def dataProcessing(self, inputFilePath, outputFilePath):
        """
        Load a CSV, clean it, save the cleaned copy and return it.

        Steps, each delegated to a project helper:
            1. MyIO.inputCSVFromURL reads ``inputFilePath`` into a dataframe.
            2. DataPreprocess.provideHeaders supplies column names that are
               assigned to the dataframe (data from
               https://archive.ics.uci.edu/ml/datasets comes without headers).
            3. removeNullValues, categoricalToNumericalConversion and
               scaleData are applied, in that order.
            4. MyIO.writeCSV stores the result at ``outputFilePath``.

        Returns the dataframe produced by the scaling step.
        """
        io = MyIO()
        frame = io.inputCSVFromURL(filePath=inputFilePath)

        preprocessor = DataPreprocess()

        # the raw data has no header row, so attach generated column names
        frame.columns = preprocessor.provideHeaders(inputDataFrame=frame)

        # cleaning stages; each one consumes the output of the previous one
        withoutNulls = preprocessor.removeNullValues(inputDataFrame=frame)
        allNumeric = preprocessor.categoricalToNumericalConversion(
            dataFrame=withoutNulls)
        scaled = preprocessor.scaleData(inputDataFrame=allNumeric)

        # persist the refined data before handing it back to the caller
        io.writeCSV(inputDataFrame=scaled, outputFilePath=outputFilePath)
        return scaled
    def preprocessTestingData(self, testingDirPath):
        """
        Tokenize every file found under ``testingDirPath``.

        input: testingDirPath - directory that is walked recursively
        output: (fileTokenDict, fileActualClassDict)
            fileTokenDict       : file name -> token list returned by
                                  self._tokenizationFilter (the list is stored
                                  as returned, it is not de-duplicated here)
            fileActualClassDict : file name -> name of the directory holding
                                  the file, which serves as its actual class

        Note: both dictionaries are keyed by the bare file name, so when the
        same file name occurs in several directories only the entry visited
        last is kept.
        """
        tokensByFile = {}
        classByFile = {}

        for folder, _subDirs, fileNames in os.walk(testingDirPath):
            # the last path component of the folder is the class label
            folderLabel = folder.split(os.path.sep)[-1]

            for fileName in fileNames:
                docPath = os.path.join(folder, fileName)
                reader = MyIO()
                text = reader.readDoc(docPath=docPath)

                tokensByFile[fileName] = self._tokenizationFilter(rowStr=text)
                classByFile[fileName] = folderLabel

        return tokensByFile, classByFile
    def preprocessTrainingData(self, dirPath):
        """
        Tokenize a training directory laid out as one sub-directory per class.

        Input: dirPath
        output: classTokenList, uniqueTokenList, nDocsInClassArr, dirNameList
            classTokenList  : one list per walked directory (root excluded)
                              holding the tokens of all files in it
            uniqueTokenList : distinct tokens over every file that was read
            nDocsInClassArr : numpy array with the number of files of each
                              walked directory (root excluded)
            dirNameList     : names of the immediate sub-directories of
                              dirPath, i.e. the class names

        os.walk visits dirPath itself first; that first entry carries no
        class information and is removed from classTokenList and
        nDocsInClassArr. Tokens of files lying directly in dirPath still
        end up in uniqueTokenList. Directories nested deeper than one level
        get entries of their own, so the per-class outputs line up with
        dirNameList only when class directories contain no sub-directories.
        """
        classNames = next(os.walk(dirPath))[1]

        tokensPerDir = []
        docCountPerDir = []
        everyToken = []

        for folder, _subDirs, fileNames in os.walk(dirPath):
            docCountPerDir.append(len(fileNames))

            folderTokens = []
            for fileName in fileNames:
                docPath = os.path.join(folder, fileName)
                reader = MyIO()
                text = reader.readDoc(docPath=docPath)
                tokens = self._tokenizationFilter(rowStr=text)

                # the same tokens feed the per-class and the global pool
                folderTokens.extend(tokens)
                everyToken.extend(tokens)
            tokensPerDir.append(folderTokens)

        uniqueTokens = list(set(everyToken))

        # drop the entry recorded for dirPath itself (first one walked)
        del tokensPerDir[0]
        del docCountPerDir[0]

        return tokensPerDir, uniqueTokens, np.array(docCountPerDir), classNames
    def myUI(self, w1, h1, w2, h2, name_input, name_output):
        '''
        Run histogram equalization in the Luv domain on one image.

        Stages:
        1. read the image ``name_input`` in BGR format
        2. map the window (w1, h1, w2, h2) to pixel coordinates
        3. convert the BGR image to Luv
        4. equalize the histogram in the Luv domain, using the window from
           stage 2
        5. convert the equalized Luv image back to BGR
        6. write the result to ``name_output``

        Intermediate images are printed and shown on screen for debugging;
        the call blocks on cv2.waitKey(0) before the output is written.
        Nothing is returned.
        '''
        separator = "-----------------------------------------------------"

        # stage 1: load the source image
        imageIO = MyIO()
        sourceBgr = imageIO.readImage(name_input)
        print("bgrImg =\n {}".format(sourceBgr))
        imageIO.showImage(sourceBgr, "BGR Image")

        # stage 2: window corners expressed in pixels
        left, top, right, bottom = imageIO.windowsSizeMapping(
            inputImage=sourceBgr, w1=w1, h1=h1, w2=w2, h2=h2)
        print("W1 = {}, H1={}, W2={}, H2={}".format(left, top, right, bottom))

        # stage 3: BGR -> Luv
        colorTool = ColorProcess()
        luv = colorTool.bgrToLuv(bgrImg=sourceBgr)
        print(separator)
        print("\nLuvImg = \n{}".format(luv))

        # stage 4: histogram equalization driven by the window
        equalizer = ImageProcess()
        equalizedLuv = equalizer.histogramEqualizationInLuv(
            luv, left, top, right, bottom)
        print(separator)
        print("HELuvImg = \n{}".format(equalizedLuv))

        # stage 5: Luv -> BGR, then display until a key is pressed
        equalizedBgr = colorTool.LuvToBGR(LuvImage=equalizedLuv)
        imageIO.showImage(equalizedBgr, "Histogram Equalized BGR Image")
        cv2.waitKey(0)
        print(separator)
        print("HEBGRImage =\n {}".format(equalizedBgr))

        # stage 6: store the equalized image
        imageIO.writeImage(outputImage=equalizedBgr, name_output=name_output)
    def createNeuralNetwork(self, inputFilePath, trainingPercent, maxItr,
                            nHiddenLayers, nNeurons):
        """
        Build, train and evaluate a neural network on a processed CSV file,
        then print its weights and its training / testing errors.

        Flow:
        0. read the dataset from ``inputFilePath``
        1. split it into training and testing parts (``trainingPercent``)
        2. create a NeuralNetwork with one input per non-class column and
           one output per distinct value of the 'class' column
        3. train it on the training rows for ``maxItr`` iterations with a
           learning rate of 0.5
        4. predict both the training and the testing rows
        5. compute the mean square error of both predictions and print them
           together with the network weights

        The value returned by trainNetwork is not used; the reported training
        error comes from meanSquareError. Nothing is returned.
        """
        # 0. dataset and its column names
        dataFrame = MyIO().inputProcessedCSV(filePath=inputFilePath)
        columnNames = dataFrame.columns.values

        # 1. training / testing split
        helper = MyUtility()
        trainFrame, testFrame = helper.splitDataset(
            inputDataFrame=dataFrame, trainingPercent=trainingPercent)

        # 2. network dimensions come from the data
        classCount = dataFrame['class'].unique().size

        trainRows = trainFrame.values
        testRows = testFrame.values

        # only the class columns and the training attribute header are needed
        _, trainTargets, trainAttrNames = helper.segregateAttributesAndClass(
            inputArr=trainRows, inputHeader=columnNames)
        _, testTargets, _ = helper.segregateAttributesAndClass(
            inputArr=testRows, inputHeader=columnNames)

        _, columnCount = trainRows.shape

        # one column holds the class, the remaining ones are inputs
        network = NeuralNetwork(nInputs=columnCount - 1,
                                nHiddenLayers=nHiddenLayers,
                                nNeurons=nNeurons,
                                nOutputs=classCount)

        # 3. training
        network.trainNetwork(trainingDataArr=trainRows,
                             nIteration=maxItr,
                             numOfUniqueClasses=classCount,
                             learningRate=0.5)

        # 4. predictions for both splits
        trainPredictions = network.predictDataset(testingDataSet=trainRows)
        testPredictions = network.predictDataset(testingDataSet=testRows)

        # 5. errors and report
        trainingError = network.meanSquareError(
            targetArr=trainTargets, predictedOutputArr=trainPredictions)
        testingError = network.meanSquareError(
            targetArr=testTargets, predictedOutputArr=testPredictions)

        print("\nAfter training neural network:\n")
        network.printNeuralNetworkWeights(headerList=trainAttrNames)
        print('\ntrainingError = {}'.format(trainingError))
        print('testingError = {} '.format(testingError))
示例#6
0
    def createAlarmBayes(self, inputParam, queryParam):
        """
        UI entry point: answer each query of ``queryParam`` on the AlarmBayes
        network, given the evidence in ``inputParam``, and print the results.

        Both arguments are parsed with MyIO (parse_evidence_input and
        parse_query_input). For every parsed query the following are run
        through EnumerationUtil and printed, in this order:
            1. enumeration
            2. sampling
            3. sampling with rejection
            4. likelihood weighting
        The three sample-based methods use the sample sizes
        10, 50, 100, 200, 500, 1000, 10000 and 100000.

        Nothing is returned.
        """
        io = MyIO()
        evidence = io.parse_evidence_input(input_value=inputParam)
        queries = io.parse_query_input(input_value=queryParam)
        print('query_params = {}'.format(queries))

        network = AlarmBayes()
        for query in queries:
            # banner naming the node this query refers to
            print("\n###################################################")
            nodeName = network.find_node(query).name.upper()
            print ("\t RESULT FOR QUERY: {}".format(nodeName))
            print("###################################################")

            solver = EnumerationUtil()

            # 1. enumeration
            solver.result_for_enumeration(query, evidence, network)

            # 2. + 3. sampling, without and with rejection
            sampleSizes = [10, 50, 100, 200, 500, 1000, 10000, 100000]
            samplingResult = solver.result_for_sampling(
                query, evidence, network, sampleSizes)
            rejectionResult = solver.result_for_sampling_rejection(
                query, evidence, network, sampleSizes)

            sizeCount = len(sampleSizes)
            print(
                "\n------------------- sampling (positive samples / total samples) ---------------------------"
            )
            io.print_sample_output(samplingResult, sizeCount)

            print(
                "\n------------------ sample - rejection (positive samples / total samples)-------------------"
            )
            io.print_sample_rejection_output(rejectionResult)

            # 4. likelihood weighting; the LikelihoodUtil instance is created
            # here but not used afterwards, the computation goes through solver
            likelihoodHelper = LikelihoodUtil()
            likelihoodResult = solver.result_for_likelihood_weight(
                query, evidence, network, sampleSizes)

            print(
                "\n------------- likelihood (query sample weight / total weight) ----------------------------"
            )
            io.print_likelihood_output(likelihoodResult, sizeCount)
示例#7
0
    def decisionTreeUI(self, trainingPath, validationPath, testingPath,\
                                    pruningFactor):
        """
        UI entry point for the decision tree workflow:
        1. read the training, validation and testing CSV files
        2. build a decision tree from the training data (per the original
           notes, using the ID3 algorithm) and print it
        3. compute and print the accuracy on all three datasets
        4. prune the tree with ``pruningFactor`` against the validation data
        5. compute and print the accuracy of the pruned tree on all three
           datasets

        Returns (treeNodeList, prunedTreeNodeList): the node list of the tree
        as built and the node list returned by the pruning step.
        """
        # 1. inputs: each file yields data, header and class column
        io = MyIO()
        trainData, trainHeader, trainLabels = io.inputCSV(trainingPath)
        validData, validHeader, validLabels = io.inputCSV(validationPath)
        testData, testHeader, testLabels = io.inputCSV(testingPath)

        # fixed evaluation order: training, validation, testing
        datasets = (
            ("training", trainData, trainHeader, trainLabels),
            ("validation", validData, validHeader, validLabels),
            ("testing", testData, testHeader, testLabels),
        )

        # 2. tree construction starts from the entropy of the class column
        builder = TreeGeneration()
        classEntropy = builder.findEntropyOfClass(trainLabels)
        treeNodeList = builder.createDecisionTree(dataArr=trainData,
                                                  headerList=trainHeader,
                                                  classArr=trainLabels,
                                                  classEntropy=classEntropy,
                                                  treeNode=[],
                                                  rootNodeCounter=0,
                                                  parentNode=None)
        io.printTree(treeNodeList)

        scorer = AccuracyCalculation()

        def accuraciesFor(tree):
            # accuracy of `tree` on every dataset, in the fixed order
            return [scorer.findAccuracy(dataArr=data,
                                        headerList=header,
                                        classArr=labels,
                                        treeNodeList=tree)
                    for _label, data, header, labels in datasets]

        def printReport(title, tree, accuracies):
            # framed title followed by one accuracy report per dataset
            print ("-------------------------")
            print (title)
            print ("-------------------------")
            for (label, data, _header, _labels), accuracy in zip(datasets,
                                                                 accuracies):
                # only the training report is handed the tree itself
                extra = {"treeNodeList": tree} if label == "training" else {}
                io.printAccuracyReport(dataArr=data,
                                       accuracy=accuracy,
                                       dataTypeStr=label,
                                       **extra)

        # 3. accuracy before pruning
        preAccuracies = accuraciesFor(treeNodeList)
        printReport("pre-Pruning accuracy", treeNodeList, preAccuracies)

        # 4. pruning is judged against the pre-pruning validation accuracy
        pruner = PruningTree()
        prunedTreeNodeList = pruner.findPrunedTree(
            pruningFactor=pruningFactor,
            treeNodeList=treeNodeList,
            validationData=validData,
            validationHeader=validHeader,
            validationClassArr=validLabels,
            initialvalidationAccuracy=preAccuracies[1])

        # 5. accuracy after pruning
        postAccuracies = accuraciesFor(prunedTreeNodeList)
        printReport("post-Pruning accuracy", prunedTreeNodeList,
                    postAccuracies)

        return treeNodeList, prunedTreeNodeList