def testKullbackLeibler(self):
    """Check Kullback-Leibler nearest-neighbour distances on a large dataset.

    The fixed-length test database is copied `ncopy` times into one dataset,
    a 'kullbackleibler' metric on the 'mfcc' descriptor is created for it,
    and the distances of the first 6*ncopy neighbours of a sample point are
    compared (to 5 decimal places) with the known reference values.
    """
    source = transform(testdata.loadTestDB(), 'fixlength')

    # creates a test with more than 1000 points otherwise the test is useless
    # because we split the workload in chunks of 1000 points when computing
    # the distance
    ncopy = 20
    dstest = DataSet()
    for cidx in range(ncopy):
        suffix = '-%d' % cidx
        batch = list(source.points())
        for pt in batch:
            pt.setName(pt.name() + suffix)
        dstest.addPoints(batch)

    # test whether KL doesn't break with multithreading (did in 2.2.1)
    view = View(dstest)
    metric = MetricFactory.create('kullbackleibler',
                                  dstest.layout(),
                                  {'descriptorName': 'mfcc'})
    results = view.nnSearch(source.samplePoint(), metric).get(6 * ncopy)

    # (reference distance, number of consecutive hits per copy)
    reference = (
        (0.0, 2),
        (6.1013755798339844, 1),
        (6.4808731079101562, 2),
        (6.7828292846679688, 1),
    )
    expected = []
    for distance, per_copy in reference:
        expected.extend([distance] * (per_copy * ncopy))

    for found, wanted in zip(results, expected):
        self.assertAlmostEqual(found[1], wanted, 5)
def train_SVM(dataset, groundTruth, descriptorNames, exclude=[], svmtype='c-svc', kernel='rbf', c=1, gamma=1):
    """Train an SVM on `dataset` and return a function that classifies a point.

    The points of `dataset` are first copied into a new DataSet, which is then
    run through the 'normalize' and 'svmtrain' transforms.

    Parameters
    ----------
    dataset         : the dataset providing the training points
    groundTruth     : object whose `className` attribute names the class
                      descriptor to train on
    descriptorNames : descriptors to use (forwarded to both transforms)
    exclude         : descriptors to leave out (forwarded as 'except');
                      the default list is not modified by this function
    svmtype, kernel, c, gamma : forwarded to 'svmtrain' as 'type', 'kernel',
                      'c' and 'gamma'

    Returns
    -------
    A callable taking a point and returning, as a string, the value of the
    `groundTruth.className` descriptor after mapping the point through the
    trained dataset's history.
    """
    # recreate a copy of the given dataset without history
    fresh = DataSet()
    fresh.addPoints(list(dataset.points()))

    normalize_params = {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True,
    }
    normalized = transform(fresh, 'normalize', normalize_params)

    svm_params = {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'className': groundTruth.className,
        'type': svmtype,
        'kernel': kernel,
        'c': c,
        'gamma': gamma,
    }
    trained = transform(normalized, 'svmtrain', svm_params)

    h = trained.history()

    def classify(p):
        return str(h.mapPoint(p)[groundTruth.className])

    return classify
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs):
    """Evaluate a classifier with n-fold cross validation and return the confusion matrix.

    The point names found in `groundTruth` are grouped by class, shuffled
    within each class, and dealt into `nfold` folds so that every fold gets a
    slice of every class. For each fold a classifier is trained on the rest
    and evaluated on the fold. Only points listed in `groundTruth` are ever
    used for testing; the training dataset of a fold contains every point of
    `dataset` that is not in that fold.

    Parameters
    ----------
    nfold        : the number of folds to use for the cross-validation
    dataset      : the dataset from which to get the points
    groundTruth  : a map from the points to classify to their respective
                   class; must also expose a `className` attribute
    trainingFunc : a function which will train and return a classifier given
                   a dataset, the groundtruth, and the *args and **kwargs
                   arguments

    Returns
    -------
    The confusion object accumulated through successive `evaluate` calls, or
    None when no fold is evaluated (`range(nfold)` is empty).
    """
    log.info('Doing %d-fold cross validation' % nfold)
    classes = set(groundTruth.values())
    progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d')

    # get map from class to point names
    iclasses = {}
    for c in classes:
        iclasses[c] = [p for p in groundTruth.keys() if groundTruth[p] == c]
        random.shuffle(iclasses[c])

    # get folds
    folds = {}
    for i in range(nfold):
        folds[i] = []
        for c in iclasses.values():
            # -1/+1 so we take all instances into account, last fold might
            # have fewer instances
            foldsize = (len(c) - 1) // nfold + 1
            folds[i] += c[foldsize * i:foldsize * (i + 1)]

    # build sub-datasets and run evaluation on them
    confusion = None
    pnames = [p.name() for p in dataset.points()]

    for i in range(nfold):
        if log.isEnabledFor(logging.INFO):
            progress.update(i + 1)

        fold = folds[i]
        # Set copy of the fold for O(1) membership tests; the list itself is
        # still used below where the order of the test points matters.
        held_out = set(fold)

        trainds = DataSet()
        trainds.addPoints([dataset.point(pname)
                           for pname in pnames if pname not in held_out])
        traingt = GroundTruth(groundTruth.className,
                              dict((p, c) for p, c in groundTruth.items()
                                   if p not in held_out))

        testds = DataSet()
        testds.addPoints([dataset.point(str(pname)) for pname in fold])
        testgt = GroundTruth(groundTruth.className,
                             dict((p, c) for p, c in groundTruth.items()
                                  if p in held_out))

        classifier = trainingFunc(trainds, traingt, *args, **kwargs)
        confusion = evaluate(classifier, testds, testgt, confusion, verbose=False)

    return confusion
def train_SVM(dataset, groundTruth, descriptorNames, exclude = [], svmtype = 'c-svc', kernel = 'rbf', c = 1, gamma = 1):
    """Build an SVM classifier function from the points of `dataset`.

    A new DataSet holding the points of `dataset` is normalized with the
    'normalize' transform and then trained with the 'svmtrain' transform.

    Parameters
    ----------
    dataset         : dataset whose points are used for training
    groundTruth     : provides `className`, the name of the class descriptor
    descriptorNames : passed as 'descriptorNames' to both transforms
    exclude         : passed as 'except' to both transforms (the default list
                      is not modified by this function)
    svmtype         : passed as 'type' to 'svmtrain'
    kernel, c, gamma: passed under the same names to 'svmtrain'

    Returns
    -------
    A one-argument callable: given a point, it maps the point through the
    history of the trained dataset and returns the `groundTruth.className`
    descriptor of the result converted with str().
    """
    # recreate a copy of the given dataset without history
    copied = DataSet()
    copied.addPoints(list(dataset.points()))

    copied = transform(copied, 'normalize', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True,
    })

    copied = transform(copied, 'svmtrain', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'className': groundTruth.className,
        'type': svmtype,
        'kernel': kernel,
        'c': c,
        'gamma': gamma,
    })

    history = copied.history()

    def predict(p):
        mapped = history.mapPoint(p)
        return str(mapped[groundTruth.className])

    return predict
def PCA(x):
    """Apply the 'pca' transform to the rows of `x` and return the results.

    Each element of `x` becomes the 'x' descriptor of a point named 'p<i>'.
    The resulting dataset goes through the 'fixlength' transform and then the
    'pca' transform, keeping `len(x[0])` dimensions and storing the output in
    the 'pca' descriptor.

    Returns a list with the 'pca' descriptor of every point of the
    transformed dataset.
    """
    layout = PointLayout()
    layout.add('x', RealType)

    def make_point(index, row):
        pt = Point()
        pt.setName('p%d' % index)
        pt.setLayout(layout)
        pt['x'] = row
        return pt

    points = [make_point(i, row) for i, row in enumerate(x)]

    dataset = DataSet()
    dataset.addPoints(points)

    dataset = transform(dataset, 'fixlength')
    pca_params = {'dimension': len(x[0]), 'resultName': 'pca'}
    dataset = transform(dataset, 'pca', pca_params)

    return [pt['pca'] for pt in dataset.points()]
def PCA(x):
    """Return the 'pca' descriptors computed from the rows of `x`.

    Every row of `x` is stored as the 'x' descriptor of its own point
    ('p0', 'p1', ...). The points are gathered in a DataSet which is passed
    through 'fixlength' and then 'pca', with the dimension set to the length
    of the first row and the result stored under 'pca'.
    """
    row_layout = PointLayout()
    row_layout.add('x', RealType)

    row_points = []
    for index, row in enumerate(x):
        point = Point()
        point.setName('p%d' % index)
        point.setLayout(row_layout)
        point['x'] = row
        row_points += [point]

    ds = DataSet()
    ds.addPoints(row_points)

    fixed = transform(ds, 'fixlength')
    projected = transform(fixed, 'pca', {'dimension': len(x[0]),
                                         'resultName': 'pca'})

    return [point['pca'] for point in projected.points()]
def readLibSVMDataSet(filename):
    """Read a file of '<class> <index>:<value> ...' lines into a DataSet.

    Each non-blank line is split on whitespace. The first token is stored as
    the string descriptor 'class'; every following token must have the form
    'index:value' (int index, float value). All points share a dense 'value'
    descriptor of size `maxidx - minidx + 1`, zero-filled, where the values
    of a line are written at position `index - minidx`. `minidx` and `maxidx`
    both start at 1 and are widened by the indices found in the file.

    Points are named 'instance_000000', 'instance_000001', ... in file order.

    Parameters
    ----------
    filename : path of the file to read

    Returns
    -------
    A DataSet containing one point per non-blank line.

    Raises
    ------
    ValueError if a feature token is not of the form 'int:float'.
    """
    # Use a context manager so the file is closed even if parsing fails
    # (the handle used to be left open).
    with open(filename) as f:
        # Blank lines carry no class label and used to crash with an
        # IndexError further down; skip them.
        data = [fields for fields in (line.split() for line in f) if fields]

    # First pass: parse the 'index:value' tokens in place and find the range
    # of indices used in the file.
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            dim = int(dim)
            l[i] = (dim, float(value))
            minidx = min(minidx, dim)
            maxidx = max(maxidx, dim)

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()

    # Second pass: build one point per line with a dense, zero-filled vector.
    points = []
    for n, l in enumerate(data):
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)

        p['class'] = l[0]
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)

    return ds
def createDataSet():
    """Build the six-point test dataset 'p0'..'p5' with a 2-D descriptor 'a'.

    After the points are added, the dataset is passed through
    `testdata.fixLength` when `testdata.useFixedLength` is set and through
    `testdata.enumerateStrings` when `testdata.useEnumerate` is set.
    """
    ds = DataSet()

    # (point name, value of 'a'); the α annotations are carried over from the
    # original comments (presumably the polar angle of the vector).
    specs = [
        ('p0', (0.0, 0.0)),    # α = undefined
        ('p1', (1.0, 0.0)),    # α = 0
        ('p2', (0.0, 1.0)),    # α = π/2
        ('p3', (-1.0, 0.0)),   # α = π
        ('p4', (1.0, 1.0)),    # α = π/4
        ('p5', (1.0, -1.0)),   # α = -π/4
    ]

    points = []
    for name, coords in specs:
        pt = newPoint(name)
        pt['a'] = coords
        points.append(pt)

    ds.addPoints(points)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds