def evaluate(classifier, dataset, groundTruth, confusion=None, verbose=True): """Evaluate the classifier on the given dataset and returns the confusion matrix. Uses only the points that are in the groundTruth parameter for the evaluation. Parameters ---------- classifier : a function which given a point returns its class dataset : the dataset from which to get the points groundTruth : a map from the points to classify to their respective class """ progress = TextProgress(len(groundTruth)) done = 0 confusion = confusion or ConfusionMatrix() for pointId, expected in groundTruth.items(): try: found = classifier(dataset.point(pointId)) confusion.add(expected, found, pointId) except Exception as e: log.warning('Could not classify point "%s" because %s' % (pointId, str(e))) raise done += 1 if verbose: progress.update(done) return confusion
def evaluate(classifier, dataset, groundTruth, confusion = None, verbose = True): """Evaluate the classifier on the given dataset and returns the confusion matrix. Uses only the points that are in the groundTruth parameter for the evaluation. Parameters ---------- classifier : a function which given a point returns its class dataset : the dataset from which to get the points groundTruth : a map from the points to classify to their respective class """ progress = TextProgress(len(groundTruth)) done = 0 confusion = confusion or ConfusionMatrix() for pointId, expected in groundTruth.items(): try: found = classifier(dataset.point(pointId)) confusion.add(expected, found, pointId) except Exception, e: log.warning('Could not classify point "%s" because %s' % (pointId, str(e))) raise done += 1 if verbose: progress.update(done)
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs): """Evaluate the classifier on the given dataset and returns the confusion matrix. The evaluation is performed using n-fold cross validation. Uses only the points that are in the groundTruth parameter for the evaluation. Parameters ---------- nfold : the number of folds to use for the cross-validation dataset : the dataset from which to get the points groundTruth : a map from the points to classify to their respective class trainingFunc : a function which will train and return a classifier given a dataset, the groundtruth, and the *args and **kwargs arguments """ log.info('Doing %d-fold cross validation' % nfold) classes = set(groundTruth.values()) progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d') # get map from class to point names iclasses = {} for c in classes: iclasses[c] = [ p for p in groundTruth.keys() if groundTruth[p] == c ] random.shuffle(iclasses[c]) # get folds folds = {} for i in range(nfold): folds[i] = [] for c in iclasses.values(): foldsize = (len(c)-1)//nfold + 1 # -1/+1 so we take all instances into account, last fold might have fewer instances folds[i] += c[ foldsize * i : foldsize * (i+1) ] # build sub-datasets and run evaluation on them confusion = None pnames = [ p.name() for p in dataset.points() ] for i in range(nfold): if log.isEnabledFor(logging.INFO): progress.update(i+1) trainds = DataSet() trainds.addPoints([ dataset.point(pname) for pname in pnames if pname not in folds[i] ]) traingt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p not in folds[i] ])) testds = DataSet() testds.addPoints([ dataset.point(str(pname)) for pname in folds[i] ]) testgt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p in folds[i] ])) classifier = trainingFunc(trainds, traingt, *args, **kwargs) confusion = evaluate(classifier, testds, testgt, confusion, verbose = False) return confusion
def readResults(self, dir): """Reads all the results file contained in the given directory, and generates the associated ConfusionMatrix for each one.""" resultFiles = glob.glob(join(dir, '*.result')) progress = TextProgress(len(resultFiles)) for i, filename in enumerate(resultFiles): cm = ConfusionMatrix() cm.load(filename) paramFile = splitext(filename)[0] + '.param' params = yaml.load(open(paramFile).read()) self.results += [ (filename, cm, params) ] progress.update(i+1)