Exemplo n.º 1
0
 def prepareEnvironment(self):
     """Reset the training workspace before a new run.

     The current positive/negative training folders are handed to
     Utils.shift together with their archived ("old") counterparts and
     then recreated empty.  Unless cached results are about to be
     reloaded (self.load / self.loadWndchrm), the saved feature archive
     and the wndchrm dataset are shifted aside the same way.
     """
     oldRoot = data_io.get_training_old_folder()
     currentRoot = data_io.get_training_folder()
     posFolder = data_io.get_positive_folder()
     negFolder = data_io.get_negative_folder()
     posPath = os.path.join(currentRoot, posFolder)
     negPath = os.path.join(currentRoot, negFolder)
     # Archive the previous run's folders (presumably a move -- see Utils.shift).
     Utils.shift(oldRoot, os.path.join(oldRoot, posFolder), posFolder, posPath)
     Utils.shift(oldRoot, os.path.join(oldRoot, negFolder), negFolder, negPath)
     # Start the new run with empty folders.
     os.mkdir(posPath)
     os.mkdir(negPath)
     if not self.load:
         # Same name is used for source and destination: shift the file aside.
         savezName = data_io.get_savez_name()
         Utils.shift('.', savezName, savezName, savezName)
     if not self.loadWndchrm:
         datasetName = data_io.get_wndchrm_dataset()
         Utils.shift('.', datasetName, datasetName, datasetName)
Exemplo n.º 2
0
 def run(self):
     """Train and persist the wndchrm-based classifier.

     Pipeline: prepare the workspace, compute (or reload) candidate
     features, build the target vector, save image patches, run wndchrm
     to extract features, then fit a random forest and save it through
     data_io.save_model.
     """
     # Single-argument print(...) is valid in both Python 2 and Python 3;
     # the original `print x` statements are a syntax error under Python 3.
     print("Preparing the environment")
     self.prepareEnvironment()
     print("Reading in the training data")
     imageCollections = data_io.get_train_df()
     wndchrmWorker = WndchrmWorkerTrain()
     print("Getting features")
     if not self.loadWndchrm: #Last wndchrm set of features
         featureGetter = FeatureGetter()
         fileName = data_io.get_savez_name()
         if not self.load: #Last features calculated from candidates
             (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
         else:
             (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
         print("Getting target vector")
         (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
         print("Saving images")
         imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes],
                                 imageCollections, featureGetter.patchSize, target[indexes])
         imageSaver.saveImages()
         print("Executing wndchrm algorithm and extracting features")
         (train, target) = wndchrmWorker.executeWndchrm()
     else:
         (train, target) = wndchrmWorker.loadWndchrmFeatures()
     print("Training the model")
     # NOTE(review): compute_importances is accepted only by old scikit-learn
     # releases; newer versions expose feature_importances_ unconditionally
     # -- confirm the targeted scikit-learn version.
     model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=30, random_state=1, compute_importances=True)
     model.fit(train, target)
     print(model.feature_importances_)
     print("Saving the classifier")
     data_io.save_model(model)
Exemplo n.º 3
0
 def runWithoutWndchrm(self):
     """Train and persist a classifier directly on candidate features.

     Unlike run(), this skips the wndchrm feature-extraction step and
     fits a MinMaxScaler + RandomForestClassifier pipeline on the
     (possibly reloaded) candidate features.
     """
     # Single-argument print(...) is valid in both Python 2 and Python 3;
     # the original `print x` statements are a syntax error under Python 3.
     print("Reading in the training data")
     imageCollections = data_io.get_train_df()
     print("Getting features")
     featureGetter = FeatureGetter()
     fileName = data_io.get_savez_name()
     if not self.load:  # Last features calculated from candidates
         (namesObservations, coordinates,
          train) = Utils.calculateFeatures(fileName, featureGetter,
                                           imageCollections)
     else:
         (namesObservations, coordinates,
          train) = Utils.loadFeatures(fileName)
     print("Getting target vector")
     (indexes, target,
      obs) = featureGetter.getTargetVector(coordinates, namesObservations,
                                           train)
     print("Training the model")
     # NOTE(review): compute_importances exists only in old scikit-learn
     # releases -- confirm the targeted version.
     classifier = RandomForestClassifier(n_estimators=500,
                                         verbose=2,
                                         n_jobs=1,
                                         min_samples_split=10,
                                         random_state=1,
                                         compute_importances=True)
     model = Pipeline([('scaling', MinMaxScaler()),
                       ('classifying', classifier)])
     model.fit(obs[indexes], target[indexes])
     print("Saving the classifier")
     data_io.save_model(model)
Exemplo n.º 4
0
 def prepareEnvironment(self):
     """Archive previous training output and recreate empty work folders.

     The positive/negative training folders are handed to Utils.shift
     together with their "old" counterparts (presumably a move -- verify
     Utils.shift semantics), then recreated empty.  Unless cached results
     are being reloaded (self.load / self.loadWndchrm), the saved feature
     archive and the wndchrm dataset are shifted aside the same way.
     """
     # People want to save time
     trainingPathPositive = os.path.join(data_io.get_training_folder(),
                                         data_io.get_positive_folder())
     trainingPathOldPositive = os.path.join(
         data_io.get_training_old_folder(), data_io.get_positive_folder())
     Utils.shift(data_io.get_training_old_folder(), trainingPathOldPositive,
                 data_io.get_positive_folder(), trainingPathPositive)
     trainingPathNegative = os.path.join(data_io.get_training_folder(),
                                         data_io.get_negative_folder())
     trainingPathOldNegative = os.path.join(
         data_io.get_training_old_folder(), data_io.get_negative_folder())
     Utils.shift(data_io.get_training_old_folder(), trainingPathOldNegative,
                 data_io.get_negative_folder(), trainingPathNegative)
     # Recreate the (now archived) folders empty for the new run.
     os.mkdir(trainingPathPositive)
     os.mkdir(trainingPathNegative)
     if not self.load:
         # Same name for source and destination: shift the archive aside.
         Utils.shift('.', data_io.get_savez_name(),
                     data_io.get_savez_name(), data_io.get_savez_name())
     if not self.loadWndchrm:
         Utils.shift('.', data_io.get_wndchrm_dataset(),
                     data_io.get_wndchrm_dataset(),
                     data_io.get_wndchrm_dataset())
Exemplo n.º 5
0
 def runWithoutWndchrm(self):
     """Train and persist a classifier directly on candidate features.

     Skips the wndchrm feature-extraction step: fits a MinMaxScaler +
     RandomForestClassifier pipeline on the (possibly reloaded) candidate
     features and saves it via data_io.save_model.
     (Python 2 print statements -- left untouched.)
     """
     print "Reading in the training data"
     imageCollections = data_io.get_train_df()
     print "Getting features"
     featureGetter = FeatureGetter()
     fileName = data_io.get_savez_name()
     if not self.load: #Last features calculated from candidates
         (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
     else:
         (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
     print "Getting target vector"
     (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
     print "Training the model"
     # NOTE(review): compute_importances exists only in old scikit-learn
     # releases -- confirm the targeted version.
     classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, compute_importances=True)
     #classifier = KNeighborsClassifier(n_neighbors=50)
     model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
     model.fit(obs[indexes], target[indexes])
     print "Saving the classifier"
     data_io.save_model(model)
Exemplo n.º 6
0
 def run(self):
     """Train and persist the wndchrm-based classifier.

     Pipeline: prepare the workspace, compute (or reload) candidate
     features, build the target vector, save image patches, run wndchrm
     to extract features, then fit a random forest and save it through
     data_io.save_model.
     (Python 2 print statements -- left untouched.)
     """
     print "Preparing the environment"
     self.prepareEnvironment()
     print "Reading in the training data"
     imageCollections = data_io.get_train_df()
     wndchrmWorker = WndchrmWorkerTrain()
     print "Getting features"
     if not self.loadWndchrm:  #Last wndchrm set of features
         featureGetter = FeatureGetter()
         fileName = data_io.get_savez_name()
         if not self.load:  #Last features calculated from candidates
             (namesObservations, coordinates,
              train) = Utils.calculateFeatures(fileName, featureGetter,
                                               imageCollections)
         else:
             (namesObservations, coordinates,
              train) = Utils.loadFeatures(fileName)
         print "Getting target vector"
         (indexes, target,
          obs) = featureGetter.getTargetVector(coordinates,
                                               namesObservations, train)
         print "Saving images"
         imageSaver = ImageSaver(coordinates[indexes],
                                 namesObservations[indexes],
                                 imageCollections, featureGetter.patchSize,
                                 target[indexes])
         imageSaver.saveImages()
         print "Executing wndchrm algorithm and extracting features"
         (train, target) = wndchrmWorker.executeWndchrm()
     else:
         (train, target) = wndchrmWorker.loadWndchrmFeatures()
     print "Training the model"
     # NOTE(review): compute_importances exists only in old scikit-learn
     # releases -- confirm the targeted version.
     model = RandomForestClassifier(n_estimators=500,
                                    verbose=2,
                                    n_jobs=1,
                                    min_samples_split=30,
                                    random_state=1,
                                    compute_importances=True)
     model.fit(train, target)
     print model.feature_importances_
     print "Saving the classifier"
     data_io.save_model(model)
Exemplo n.º 7
0
    def run(self, k=3, useOnlyRF=True):
        """Cross-validate, keep the best model by f-measure, and evaluate it.

        The merged train+test feature files are shuffled and cut into k+1
        splits: k are rotated through as validation folds and the last is
        held out for the final evaluation.

        Parameters:
            k: number of cross-validation folds (one extra split is held
                out for testing).
            useOnlyRF: when True the random forest itself is the model;
                when False it only ranks features and a LinearSVC is
                trained on the important ones.
        """
        # Single-argument print(...) is valid in both Python 2 and Python 3;
        # the original `print x` statements are a syntax error under Python 3.
        featureGetter = FeatureGetter()
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)

        # Leave the last split for testing.
        testNamesObs = splittedNamesObs[k]
        testCoords = splittedCoords[k]
        testDataset = splittedData[k]

        splittedNamesObs = splittedNamesObs[:k]
        splittedCoords = splittedCoords[:k]
        splittedData = splittedData[:k]

        # Release the merged arrays early to cap peak memory usage.
        del dataset
        del coordinates
        del namesObservations
        del indexesChanged

        # NOTE(review): if every fold scores fmeasure == 0, bestModel stays
        # None and the final predict below will fail.
        bestModel = None
        bestFmeasure = 0

        for i in range(k - 1, -1, -1):  # i is the index of the validation split
            print("Doing cross-validation for i=%d" % i)
            namesObservationsValid = splittedNamesObs[i]
            coordinatesValid = splittedCoords[i]
            datasetValid = splittedData[i]
            namesObservationsValid = np.reshape(namesObservationsValid, namesObservationsValid.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")
            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)
            print("Selecting features")
            # NOTE(review): compute_importances and min_samples_split=1 are
            # only accepted by old scikit-learn releases -- confirm version.
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])
            if not useOnlyRF:
                # Use the forest only to select features, then train a LinearSVC on them.
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])
            print("Making predictions")
            if not useOnlyRF:
                predictions = model.predict(datasetValid[:, filterImportances])
            else:
                predictions = model.predict(datasetValid)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating validation results")
            [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(namesObservationsValid, predictions, coordinatesValid)
            if fmeasure > bestFmeasure:
                bestFmeasure = fmeasure
                bestModel = model
            del datasetTrain
            del datasetValid
            del coordinatesTrain
            del coordinatesValid
            del namesObservationsTrain
            del namesObservationsValid

        print("Calculating final results")
        predictions = bestModel.predict(testDataset)
        print("The final score is: ")
        testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
        Predictor.finalResults(testNamesObs, predictions, testCoords)
Exemplo n.º 8
0
    def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
        """Cross-validate and report overall and per-patient metrics.

        Accumulates true/false positives/negatives across folds, then
        prints precision, recall and f-measure overall and for each of
        the 12 patients.

        Parameters:
            k: number of folds when patientSplit is False (overridden to
                12, or to 2 when breakin2 is also True).
            patientSplit: split by patient instead of by shuffled rows.
            useOnlyRF: when True the random forest itself is the model;
                when False it only ranks features and a LinearSVC is
                trained on the important ones.
            breakin2: collapse the 12 patient splits into 2 larger ones.
        """
        # Single-argument print(...) is valid in both Python 2 and Python 3;
        # the original `print x` statements are a syntax error under Python 3.
        featureGetter = FeatureGetter()
        overallTP = 0
        overallFP = 0
        overallFN = 0
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)

        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        if patientSplit:
            k = 12
            (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(namesObservations, coordinates, dataset)
            if breakin2:
                k = 2
                (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(splittedNamesObs, splittedCoords, splittedData)
        else:
            splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
            splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
            splittedData = self.getShuffledSplits(dataset, indexesChanged, k)

        # Release the merged arrays early to cap peak memory usage.
        del dataset
        del coordinates
        del namesObservations
        del indexesChanged

        # Per-patient accumulators (12 patients).
        overallArrayTP = np.zeros(12)
        overallArrayFP = np.zeros(12)
        overallArrayFN = np.zeros(12)

        for i in range(k - 1, -1, -1):  # i is the index of the validation split
            print("Doing cross-validation for i=%d" % i)
            namesObservationsTest = splittedNamesObs[i]
            coordinatesTest = splittedCoords[i]
            datasetTest = splittedData[i]
            namesObservationsTest = np.reshape(namesObservationsTest, namesObservationsTest.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")
            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)
            print("Selecting features")
            # NOTE(review): compute_importances and min_samples_split=1 are
            # only accepted by old scikit-learn releases -- confirm version.
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])
            if not useOnlyRF:
                # Use the forest only to select features, then train a LinearSVC on them.
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])
            print("Making predictions")
            if not useOnlyRF:
                predictions = model.predict(datasetTest[:, filterImportances])
            else:
                predictions = model.predict(datasetTest)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating final results")
            [truePositives, falsePositives, falseNegatives, _, _, _, (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(namesObservationsTest, predictions, coordinatesTest)
            print(arrayTP)
            print(arrayFP)
            print(arrayFN)

            overallArrayTP += arrayTP
            overallArrayFP += arrayFP
            overallArrayFN += arrayFN
            overallTP += truePositives
            overallFP += falsePositives
            overallFN += falseNegatives
            del datasetTrain
            del datasetTest
            del coordinatesTrain
            del coordinatesTest
            del namesObservationsTrain
            del namesObservationsTest

        # The +0.0 keeps the divisions in floating point under Python 2.
        precision = 0 if overallTP + overallFP == 0 else (overallTP + 0.0) / (overallTP + overallFP + 0.0)
        recall = 0 if overallTP + overallFN == 0 else (overallTP + 0.0) / (overallTP + overallFN + 0.0)
        fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)

        print("Overall results for k=%d" % k)
        print(overallTP)
        print(overallFP)
        print(overallFN)
        print(precision)
        print(recall)
        print(fmeasure)

        for i in range(len(overallArrayTP)):
            # BUG FIX: the original built this header string but never printed it.
            print("Results for patient number %d:" % (i + 1))
            overallTP = overallArrayTP[i]
            overallFP = overallArrayFP[i]
            overallFN = overallArrayFN[i]
            precision = 0 if overallTP + overallFP == 0 else (overallTP + 0.0) / (overallTP + overallFP + 0.0)
            recall = 0 if overallTP + overallFN == 0 else (overallTP + 0.0) / (overallTP + overallFN + 0.0)
            fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
            print(precision)
            print(recall)
            print(fmeasure)