def clusteringParamSearch(trainPath, trainClasses, trainCount, clusterRange,
                          ptRange, etRange, fisherRange):
    """
    Grid search over SIFT and clustering parameters. For every parameter
    combination a Learner is evaluated via its average silhouette
    coefficient and the result is stored in the database.
    """
    trainImgs = im.loadImages(trainPath, trainClasses, trainCount)
    trainLabels = im.loadLabels(trainPath, trainClasses, trainCount)
    for peakVal in ptRange:
        for edgeVal in etRange:
            # Extract SIFT features once per threshold pair; they do not
            # depend on the clustering parameters.
            rec.logger.info("SIFT parameters. Peak: %f Edge: %f", peakVal, edgeVal)
            trainDesc, numTrainingDesc = im.extractFeatures(trainImgs, peakVal, edgeVal)
            for clusterParam in clusterRange:
                for fDist in fisherRange:
                    learner = cl.Learner()
                    learner.peakThreshold = peakVal
                    learner.edgeThreshold = edgeVal
                    learner.numTrainingDesc = numTrainingDesc
                    rec.logger.info("Cluster parameters. Cluster: %i MinDist: %f",
                                    clusterParam, fDist)
                    learner.averageSilhouetteCoefficient(trainLabels, trainDesc,
                                                         clusterParam, fDist)
                    learner.saveToDB()
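# Illustrative sketch, not part of the original module: one way the grid
# search above might be invoked. The dataset path, class names, and range
# values are hypothetical placeholders; the threshold ranges merely follow
# the sensible ranges documented in classificationAccuracy() below
# (peak: 0..30, edge: 0..10).
def _exampleParamSearch():
    trainPath = "/path/to/train"        # hypothetical dataset location
    trainClasses = ["cup", "book"]      # hypothetical class names
    clusteringParamSearch(trainPath, trainClasses, trainCount=20,
                          clusterRange=[100, 200, 300],
                          ptRange=[5.0, 10.0, 15.0],
                          etRange=[5.0, 10.0],
                          fisherRange=[0.5, 1.0])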
def train(self, featOrImgs, labels, numCluster, peakThreshold=10.0,
          edgeThreshold=10.0, maxiter=0, numruns=20):
    """
    Perform k-means clustering and vector quantize all features.

    The codebook and the histograms of visual words are stored as members.
    If L{featOrImgs} is a list of images, SIFT features are extracted
    before all other operations are done.

    @param featOrImgs: Either a list of SIFT descriptor arrays or a list of images.
    @type featOrImgs: [numpy.ndarray] or [Image.Image]
    @param labels: A list of labels corresponding to the list of descriptors/images.
    @type labels: [string]
    @param numCluster: The number of clusters used for k-means clustering.
    @type numCluster: int
    @param peakThreshold: The peak threshold used during SIFT feature extraction.
        This argument is ignored if L{featOrImgs} is a list of SIFT features.
    @type peakThreshold: float
    @param edgeThreshold: The edge threshold used during SIFT feature extraction.
        This argument is ignored if L{featOrImgs} is a list of SIFT features.
    @type edgeThreshold: float
    @param maxiter: The maximum number of iterations of one k-means run.
        "0" means there is no iteration limit.
    @type maxiter: int
    @param numruns: The number of k-means runs.
    @type numruns: int
    """
    self.numCluster = numCluster
    if isinstance(featOrImgs[0], Image.Image):
        self.peakThreshold = peakThreshold
        self.edgeThreshold = edgeThreshold
        descriptors = im.extractFeatures(featOrImgs, peakThreshold, edgeThreshold)
    else:
        descriptors = featOrImgs
    starttime = time.time()
    # Compute the codebook from all SIFT descriptors of all images
    self.codebook = self._cluster(descriptors, numCluster, maxiter, numruns)
    # Vector quantize against the codebook to get the histograms as a list
    # of numpy arrays (the codebook argument was missing here)
    tempHistograms = _buildHistograms(self.codebook, descriptors)
    # Create the orange domain
    classes = _getAllClasses(labels)
    self.strClasses = str(classes)
    histogramDomain = [orange.FloatVariable('a%i' % x)
                       for x in xrange(len(tempHistograms[0]))]
    classDomain = [orange.EnumVariable("class", values=orange.StringList(classes))]
    self.domain = orange.Domain(histogramDomain + classDomain)
    # Create the orange ExampleTable
    self.histograms = _convertToOrangeDataSet(tempHistograms, self.domain, labels)
    endtime = time.time()
    self.trainingTime = endtime - starttime
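# Illustrative sketch, assuming train() is a method of cl.Learner (the class
# that saveToDB() belongs to in clusteringParamSearch above). The dataset
# path and class names are hypothetical placeholders.
def _exampleTrain():
    imgs = im.loadImages("/path/to/train", ["cup", "book"], 20)
    labels = im.loadLabels("/path/to/train", ["cup", "book"], 20)
    learner = cl.Learner()
    # imgs is a list of Image.Image, so SIFT features are extracted
    # implicitly; build a 200-word codebook with the default 20 k-means runs
    learner.train(imgs, labels, numCluster=200,
                  peakThreshold=10.0, edgeThreshold=10.0)
    learner.saveToDB()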
def classificationAccuracy(self, histOrImgs, labels=None, confThr=0.0,
                           peakThr=None, edgeThr=None, nu=0.6, gamma=2.0,
                           doCrossVal=False):
    """
    Classify test data and (optionally) perform a cross validation on the
    training data.

    @param histOrImgs: Either a list of SIFT descriptor arrays or an iterator of images.
    @type histOrImgs: numpy.ndarray or [numpy.ndarray] or Image.Image or [Image.Image]
    @param labels: A list of labels corresponding to the list of descriptors/images.
    @type labels: [string]
    @param confThr: All classifications with a confidence lower than this
        threshold are rejected. 1.0: everything is rejected, 0.0: nothing is rejected.
    @type confThr: float
    @param peakThr: A SIFT parameter. Sensible values: 0.0 < x < 30.0.
    @type peakThr: float
    @param edgeThr: A SIFT parameter. Sensible values: 0.0 < x < 10.0.
    @type edgeThr: float
    @param nu: The S{nu}-parameter of the support vector machine.
    @type nu: float
    @param gamma: The S{gamma}-parameter of the RBF kernel.
    @type gamma: float
    @rtype: (float, float)
    @return: The cross validation accuracy and the test data classification accuracy.
    """
    if self.learner is None:
        raise ValueError("Learner has to be loaded before classification can be done.")
    # Set SIFT member variables (so they get stored in the DB if requested)
    if peakThr is None:
        self.peakThreshold = self.learner.peakThreshold
    else:
        self.peakThreshold = peakThr
    if edgeThr is None:
        self.edgeThreshold = self.learner.edgeThreshold
    else:
        self.edgeThreshold = edgeThr
    # If we've been given an image iterator, extract features and vector quantize
    if isinstance(histOrImgs, collections.Iterator):
        if labels is None:
            raise ValueError("If argument 'histOrImgs' is an iterator of images, "
                             "the argument 'labels' must not be None.")
        desc, self.numTestDesc = im.extractFeatures(histOrImgs, self.peakThreshold,
                                                    self.edgeThreshold)
        recognosco.logger.info("Found %i features/image on average.",
                               self.numTestDesc / len(desc))
        tmpHistograms = _buildHistograms(self.learner.codebook, desc)
        histograms = _convertToOrangeDataSet(tmpHistograms, self.learner.domain, labels)
    else:
        histograms = histOrImgs
    values = histograms.domain.classVar.values
    self.values = values
    length = len(values)
    self.confusion = numpy.zeros((length, length), int)
    starttime = time.time()
    self.nu = nu
    self.gamma = gamma
    svm = orange.SVMLearner()
    svm.svm_type = orange.SVMLearner.Nu_SVC
    svm.nu = nu
    svm.gamma = gamma
    svm.kernel_type = orange.SVMLearner.RBF
    svm.probability = True
    recognosco.logger.debug("Training support vector machine...")
    self.classifier = svm(self.learner.histograms)
    recognosco.logger.debug("Done...")
    crossVal = 0.0
    if doCrossVal:
        crossVal = orngTest.crossValidation([svm], self.learner.histograms, folds=10)
    numCorrectClassified = 0.0
    numClassified = 0.0
    for i in range(len(histograms)):
        c = self.classifier(histograms[i])
        recognosco.logger.info("Has the class: %s", histograms[i].getclass())
        recognosco.logger.info("Classified as: %s", c)
        prob = self.classifier(histograms[i], self.classifier.GetProbabilities)
        conf = self.__getConfidence(prob)
        recognosco.logger.info("Confidence: %f", conf)
        if conf < confThr:
            recognosco.logger.info("Rejected classification (threshold: %.2f)", confThr)
            continue
        numClassified += 1.0
        predicted = values.index(str(c))
        actual = values.index(str(histograms[i].getclass()))
        self.confusion[predicted][actual] += 1
        if c == histograms[i].getclass():
            numCorrectClassified += 1.0
    endtime = time.time()
    self.confusion = str(self.confusion)
    self.clAccuracy = numCorrectClassified / numClassified
    self.fracClassified = numClassified / len(histograms)
    if doCrossVal:
        self.cvAccuracy = orngStat.CA(crossVal)[0]
        recognosco.logger.info("Cross validation accuracy: %s", self.cvAccuracy)
    else:
        self.cvAccuracy = -1.0
    recognosco.logger.info("Classification accuracy of test data: %s", self.clAccuracy)
    self.testTime = endtime - starttime
    # Return the computed accuracy, not the method object itself
    return (self.cvAccuracy, self.clAccuracy)
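# Illustrative sketch: evaluating held-out test data with the method above.
# How the recognizer instance is created and how its learner gets loaded is
# not shown in this section, so "recognizer" is an assumed, already-loaded
# instance; paths and class names are hypothetical placeholders.
def _exampleEvaluate(recognizer):
    testImgs = iter(im.loadImages("/path/to/test", ["cup", "book"], 10))
    testLabels = im.loadLabels("/path/to/test", ["cup", "book"], 10)
    # histOrImgs must be an *iterator* of images (not a list) when labels
    # are supplied, because the method checks collections.Iterator
    cvAcc, testAcc = recognizer.classificationAccuracy(
        testImgs, labels=testLabels, confThr=0.1, nu=0.6, gamma=2.0,
        doCrossVal=True)
    recognosco.logger.info("Cross validation: %f, test: %f", cvAcc, testAcc)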
def classify(self, images, nu=0.6, gamma=4.0, confidenceThreshold=0.03, minFeatures=5):
    """
    Classify test data and return the label(s).

    @param images: A single image or a list of images.
    @type images: Image.Image or [Image.Image]
    @param nu: The S{nu}-parameter of the support vector machine.
    @type nu: float
    @param gamma: The S{gamma}-parameter of the RBF kernel.
    @type gamma: float
    @param confidenceThreshold: All classifications with a confidence lower
        than this threshold are rejected. 1.0: everything is rejected,
        0.0: nothing is rejected.
    @type confidenceThreshold: float
    @param minFeatures: The minimum number of features that should be
        extracted from every image.
    @type minFeatures: int
    @return: The labels which result from the classification.
    @rtype: [string]
    """
    if self.learner is None:
        raise ValueError("Learner has to be loaded before classification can be done.")
    if not isinstance(images, collections.Iterable):
        images = [images]
    # We have no labels, so set them to None
    inputLabels = [None for i in range(len(images))]
    # Use the same SIFT parameters as during training, but relax them if
    # they yield too few features
    self.numTestDesc = 0
    peak = self.learner.peakThreshold
    edge = self.learner.edgeThreshold
    peakDecr = 1.0
    edgeInc = 1.0
    descIter1, descIter2 = itertools.tee(im.extractFeatures(images, peak, edge))
    for desc in descIter1:
        self.numTestDesc += numpy.size(desc, 0)
    recognosco.logger.info("Found %i features.", self.numTestDesc)
    while self.numTestDesc < minFeatures and peak >= 0.0 and edge <= 10.0:
        # Lower the peak threshold and raise the edge threshold so that
        # more keypoints survive
        peakDecr = peakDecr - 0.2
        peak = self.learner.peakThreshold * peakDecr
        edgeInc = edgeInc + 0.2
        edge = self.learner.edgeThreshold * edgeInc
        recognosco.logger.warning("Less than %i SIFT features have been extracted. "
                                  "Adjusting SIFT parameters (peak: %f, edge: %f).",
                                  minFeatures, peak, edge)
        descIter1, descIter2 = itertools.tee(im.extractFeatures(images, peak, edge))
        self.numTestDesc = 0
        for desc in descIter1:
            self.numTestDesc += numpy.size(desc, 0)
        recognosco.logger.info("Found %i features.", self.numTestDesc)
    if self.numTestDesc < minFeatures:
        recognosco.logger.error("Less than %i SIFT features have been extracted. "
                                "Aborting.", minFeatures)
        return ["<not enough features>"]
    tmpHistograms = _buildHistograms(self.learner.codebook, descIter2)
    histograms = _convertToOrangeDataSet(tmpHistograms, self.learner.domain, inputLabels)
    self.nu = nu
    self.gamma = gamma
    svm = orange.SVMLearner()
    svm.svm_type = orange.SVMLearner.Nu_SVC
    svm.nu = nu
    svm.gamma = gamma
    svm.kernel_type = orange.SVMLearner.RBF
    svm.probability = True
    classifier = svm(self.learner.histograms)
    outputLabels = []
    for i in range(len(histograms)):
        c = classifier(histograms[i])
        prob = classifier(histograms[i], classifier.GetProbabilities)
        conf = self.__getConfidence(prob)
        if conf < confidenceThreshold:
            recognosco.logger.info("Rejected classification. Nearest match: "
                                   "%s (Confidence: %.2f%%)", c, conf * 100.0)
            continue
        recognosco.logger.info("Classified as: %s. Confidence: %.2f%%", c, conf * 100.0)
        outputLabels.append(str(c))
    return outputLabels
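# Illustrative sketch: classifying a single query image with the method
# above. "recognizer" is assumed to be an instance whose learner is already
# loaded; the image path is a hypothetical placeholder. A single PIL image
# is not iterable, so classify() wraps it in a list internally.
def _exampleClassify(recognizer):
    img = Image.open("/path/to/query.png")
    labels = recognizer.classify(img, nu=0.6, gamma=4.0,
                                 confidenceThreshold=0.03, minFeatures=5)
    if labels:
        recognosco.logger.info("Best label: %s", labels[0])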