# Example #1
    def classificationValidation(self,test_list, kmeans_path, kernel, C, gamma):
        '''
        Main classification Validation function to validate model on
        :param true_val_Vector: true values of test set.
        :param test_list: list of paths where test images are held.
        :param KmeansName: Load the Kmeans classifier for Binerization Task.
        '''
        if gamma == None:
            clf = SVC(C=C,kernel=kernel)
        else: 
            clf = SVC(C=C,gamma=gamma,kernel=kernel)

        print "kernel: " + kernel
        print "gamma: " + str(gamma)
        print "C: " + str(C)

        clf.fit(self.X,self.y)

        results_vector = []
        y_true = []
        cl=0

        k_means = joblib.load(kmeans_name)

        [m,num_of_clusters] = np.shape(self.X)

        for path in test_list:
            for item in os.listdir(path): 
                p = path + "/" + item
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = np.zeros(num_of_clusters)
                raw_vector = fe.computeFeatureVector()
                Km_vector = k_means.predict(raw_vector) 
                for k in range(len(Km_vector)):
                    feature_vector[Km_vector[k]] = feature_vector[Km_vector[k]] + 1 

                res = clf.predict(feature_vector)
                
                # Debugging                    
                if res[0] == 1:
                    print p + " is not a foram!"
                if res[0] == 0:
                    print p + " is a foram!"

                y_true.append(cl)
                results_vector.append(res[0])
            cl = cl + 1

        print "confusion_matrix"
        print confusion_matrix(y_true,results_vector)
    def createClassificationTrainingFromDataset(self, dataset_name, labels_list, path_list):
        '''
        Creates a new training set to work on from given path list and labels.
        Notice path_list and path_labels are intended to be lists of the same length. see tests in __main__ for examples.
        :param dataset_name: the name of the data set
        :param path_list: a list of pathes frome which the images are collected.
        :param labels_list: a list of labels to use for the images collected from corresponding path. (i.e. first label correspond to first path in the path list.)
        '''

        base_path = "binData/"

        labels = []
        trainingData = []
        classes = []
        cl = 0

        ### Building the feature matrix.
        for i, path in enumerate(path_list):

            labels.append(labels_list[i])
            print labels_list[i]

            for item in os.listdir(path):
                p = path + "/" + item
                print p # DEBUG
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = fe.computeFeatureVector()
                if len(trainingData) == 0:
                    trainingData = feature_vector
                else:
                    np.vstack((trainingData, feature_vector))           
                classes.append(cl)
            
            print "vstack Kmeans Classifier: "
            print np.shape(trainingData)

            classes = np.array(classes)
            cl = cl + 1

        ### DEBUG 
        print np.shape(trainingData)
        print np.shape(classes)

        ### SAVING THE DATASETS TO NPZ FORMAT
        np.savez(os.path.join(base_path, dataset_name), trainingData, labels, classes)
    def createKmeansTrainingDataset(self,kmeans_data, dataset_name, kmeans_name, path_list, labels_list, num_of_clusters):
        '''
        Create Training for Kmeans With regression.
        :param: KmeansData: the training matrix obtained using createClassificationTrainingFromDataset method on the HOLDOUT set.
        :param: kmeansName: the name of the Kmeans classifier to be saved and pickeled.
        :param: dataset_name: the name of the NEW dataset created using the Kmeans classifier on the training. i.e. clustering the feature vector.
        :param: path_list: list of paths where the training set is at.
        :param: labels_list: the list of labels for the samples in the training set.
        :param: num_of_clusters: the number of clusters for the Kmeans classifier.        
        '''
        npzfile = np.load(kmeans_data)
        KmeansData = npzfile['arr_0']
        Kmeanslabels = npzfile['arr_1']
        Kmeansclasses = npzfile['arr_2']

        k_means = cluster.KMeans(n_clusters=num_of_clusters)
        k_means.fit(kmeans_data)

        base_path = "binData/"

        labels = labels_list
        trainingData = []
        classes = []
        cl=0

        ### Building the feature matrix.
        for i, path in enumerate(path_list):
            
            print labels_list[i]

            for item in os.listdir(path):
                p = path + "/" + item
                print p # DEBUG
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = np.zeros(num_of_clusters)
                raw_vector = fe.computeFeatureVector()
                Km_vector = k_means.predict(raw_vector) 
                for j in range(len(Km_vector)):
                    feature_vector[Km_vector[j]] = feature_vector[Km_vector[j]] + 1 
                trainingData.append(feature_vector)
                classes.append(cl)
            
            # Here we multiply the number of POSITIVE samples in the training set so that the 'unbalanced' problem of "Foram vs. Not-Foram"
            # 'becomes balanced'.
            if i == 0:
                print "working on positive samples"
                print "Original training size: (should be 68 by 10)"
                print np.shape(trainingData)
                print np.shape(classes)

                for k in range(9):
                    trainingData = np.vstack((trainingData, trainingData))
                    classes = np.hstack((classes,classes))
                
                print "After Multipling Positive Samples by 8"
                print np.shape(trainingData)
                print np.shape(classes)
                
                trainingData = trainingData.tolist()
                classes = classes.tolist()
            
            cl = cl + 1
            
        ### DEBUG 
        print "final shape: (should be 54,000~ by 10):"
        print np.shape(trainingData)

        ### SAVING THE DATASETS TO NPZ FORMAT
        joblib.dump(k_means, os.path.join(base_path, kmeans_name), compress=9)
        np.savez(os.path.join(base_path, dataset_name), trainingData, labels_list, classes)