Exemplo n.º 1
0
    def run(self, k=3, useOnlyRF=True):
        """Pick a model by k-fold cross-validation and score it on a held-out split.

        The merged train/test files are shuffled and divided into ``k + 1``
        splits.  The last split is kept aside for the final evaluation; the
        first ``k`` splits take turns as the validation fold.  For every fold
        a random-forest pipeline is fitted.  When ``useOnlyRF`` is false the
        forest only serves to rank features, and a ``LinearSVC`` pipeline is
        trained on the columns whose importance exceeds 0.0001.  The model
        with the highest validation score (sixth value returned by
        ``Predictor.finalResults``, used here as the F-measure) is finally
        applied to the held-out split.

        Args:
            k: number of cross-validation folds; one extra split is held out.
            useOnlyRF: if true the random forest is the model itself, if false
                it acts as a feature selector for a linear SVM.

        Raises:
            ValueError: if ``k`` is smaller than 1 (no model could be trained).
        """
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))

        featureGetter = FeatureGetter()
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        # The same permutation is used for all three arrays so rows stay aligned.
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)

        # Leave the last split for testing; the first k are used for validation.
        testNamesObs = splittedNamesObs[k]
        testCoords = splittedCoords[k]
        testDataset = splittedData[k]

        splittedNamesObs = splittedNamesObs[:k]
        splittedCoords = splittedCoords[:k]
        splittedData = splittedData[:k]

        # Drop the references to the full arrays before training starts.
        del dataset, coordinates, namesObservations, indexesChanged

        bestModel = None
        bestFeatures = None  # column filter that belongs to bestModel (None = all columns)
        bestFmeasure = None

        for i in range(k - 1, -1, -1):  # i is the index of the validation fold
            print("Doing cross-validation for i=%d" % i)
            namesObservationsValid = splittedNamesObs[i]
            coordinatesValid = splittedCoords[i]
            datasetValid = splittedData[i]
            namesObservationsValid = np.reshape(namesObservationsValid, namesObservationsValid.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")
            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)
            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])

            selectedFeatures = None
            if not useOnlyRF:
                # Keep only the columns the forest found informative and
                # train the actual model on that subset.
                importances = classifier.feature_importances_
                selectedFeatures = np.where(importances > 0.0001)[0]
                print(len(selectedFeatures))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, selectedFeatures], target[indexes])

            print("Making predictions")
            if selectedFeatures is None:
                predictions = model.predict(datasetValid)
            else:
                predictions = model.predict(datasetValid[:, selectedFeatures])
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating validation results")
            [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(namesObservationsValid, predictions, coordinatesValid)

            # The first fold is always accepted, so a model exists even when
            # every fold scores 0; later folds must be strictly better.
            if bestModel is None or fmeasure > bestFmeasure:
                bestFmeasure = fmeasure
                bestModel = model
                bestFeatures = selectedFeatures

            del datasetTrain, datasetValid
            del coordinatesTrain, coordinatesValid
            del namesObservationsTrain, namesObservationsValid

        print("Calculating final results")
        # The held-out data must go through the same column filter the
        # selected model was trained with.
        if bestFeatures is None:
            predictions = bestModel.predict(testDataset)
        else:
            predictions = bestModel.predict(testDataset[:, bestFeatures])
        # Same column-vector shape that is passed to finalResults during validation.
        predictions = predictions.reshape(len(predictions), 1)
        print("The final score is: ")
        testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
        Predictor.finalResults(testNamesObs, predictions, testCoords)
Exemplo n.º 2
0
    def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
        """Cross-validate the classifier and print pooled and per-patient scores.

        The merged train/test files are divided into folds, each fold is used
        once as the evaluation set, and the true-positive / false-positive /
        false-negative counts returned by ``Predictor.finalResults`` are
        summed over all folds.  Precision, recall and F-measure are then
        printed for the pooled counts and for each of the 12 per-patient
        slots.

        Args:
            k: number of folds for the shuffled split.  It is overridden when
                ``patientSplit`` is true: 12 folds from ``self.getSplits``,
                or 2 folds from ``self.getNewSplits`` if ``breakin2`` is set.
            patientSplit: use the splits produced by ``self.getSplits``
                instead of a random shuffle.
            useOnlyRF: if true the random forest is the model itself, if false
                it only selects features for a ``LinearSVC``.
            breakin2: with ``patientSplit``, regroup the splits into 2 folds
                through ``self.getNewSplits``.
        """

        def _scores(tp, fp, fn):
            # Precision, recall and F-measure, with 0 whenever a denominator is 0.
            precision = 0 if tp + fp == 0 else (tp + 0.0) / (tp + fp + 0.0)
            recall = 0 if tp + fn == 0 else (tp + 0.0) / (tp + fn + 0.0)
            fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
            return precision, recall, fmeasure

        featureGetter = FeatureGetter()
        overallTP = 0
        overallFP = 0
        overallFN = 0
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)

        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        # NOTE(review): the permutation is only used by the shuffled split, but
        # it is still drawn unconditionally so the global NumPy random state
        # advances exactly as before in both branches.
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        if patientSplit:
            k = 12
            (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(namesObservations, coordinates, dataset)
            if breakin2:
                k = 2
                (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(splittedNamesObs, splittedCoords, splittedData)
        else:
            splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
            splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
            splittedData = self.getShuffledSplits(dataset, indexesChanged, k)

        # Drop the references to the full arrays before training starts.
        del dataset, coordinates, namesObservations, indexesChanged

        # 12 slots, reported below as "patient number 1..12".
        overallArrayTP = np.zeros(12)
        overallArrayFP = np.zeros(12)
        overallArrayFN = np.zeros(12)

        for i in range(k - 1, -1, -1):  # i is the index of the evaluation fold
            print("Doing cross-validation for i=%d" % i)
            namesObservationsTest = splittedNamesObs[i]
            coordinatesTest = splittedCoords[i]
            datasetTest = splittedData[i]
            namesObservationsTest = np.reshape(namesObservationsTest, namesObservationsTest.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")

            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)

            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])

            selectedFeatures = None
            if not useOnlyRF:
                # Keep only the columns the forest found informative and
                # train the actual model on that subset.
                importances = classifier.feature_importances_
                selectedFeatures = np.where(importances > 0.0001)[0]
                print(len(selectedFeatures))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, selectedFeatures], target[indexes])

            print("Making predictions")
            if selectedFeatures is None:
                predictions = model.predict(datasetTest)
            else:
                predictions = model.predict(datasetTest[:, selectedFeatures])
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating final results")
            [truePositives, falsePositives, falseNegatives, _, _, _, (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(namesObservationsTest, predictions, coordinatesTest)
            print(arrayTP)
            print(arrayFP)
            print(arrayFN)

            overallArrayTP += arrayTP
            overallArrayFP += arrayFP
            overallArrayFN += arrayFN
            overallTP += truePositives
            overallFP += falsePositives
            overallFN += falseNegatives

            del datasetTrain, datasetTest
            del coordinatesTrain, coordinatesTest
            del namesObservationsTrain, namesObservationsTest

        precision, recall, fmeasure = _scores(overallTP, overallFP, overallFN)

        print("Overall results for k=%d" % k)
        print(overallTP)
        print(overallFP)
        print(overallFN)
        print(precision)
        print(recall)
        print(fmeasure)

        patientCounts = zip(overallArrayTP, overallArrayFP, overallArrayFN)
        for patient, (patientTP, patientFP, patientFN) in enumerate(patientCounts, 1):
            # The original line lacked the print, so this header was never shown.
            print("Results for patient number %d:" % patient)
            precision, recall, fmeasure = _scores(patientTP, patientFP, patientFN)
            print(precision)
            print(recall)
            print(fmeasure)