Example #1
    def testKullbackLeibler(self):
        ds = transform(testdata.loadTestDB(), 'fixlength')

        # create a test dataset with more than 1000 points, otherwise the test is
        # useless, because the workload is split into chunks of 1000 points when
        # computing the distance
        dstest = DataSet()
        ncopy = 20
        for cidx in range(ncopy):
            points = list(ds.points())
            for p in points:
                p.setName(p.name() + '-%d' % cidx)
            dstest.addPoints(points)

        # check that the KL distance doesn't break with multithreading (it did in 2.2.1)
        v = View(dstest)
        dist = MetricFactory.create('kullbackleibler',
                                    dstest.layout(),
                                    { 'descriptorName': 'mfcc' })

        results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
        expected = [ 0.0 ]*2*ncopy + [ 6.1013755798339844 ]*ncopy
        expected += [ 6.4808731079101562 ]*2*ncopy + [ 6.7828292846679688 ]*ncopy

        for r, e in zip(results, expected):
            self.assertAlmostEqual(r[1], e, 5)
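
For reference, each entry returned by nnSearch(...).get(n) is a (point name, distance) pair, which is why the test compares r[1] against the expected distance. A minimal sketch of inspecting the results directly, reusing v, ds and dist from the test above:

results = v.nnSearch(ds.samplePoint(), dist).get(10)
for name, distance in results:
    # name is the matched point's name, distance its KL distance to the query
    print('%s: %.5f' % (name, distance))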
Example #2
def train_SVM(dataset,
              groundTruth,
              descriptorNames,
              exclude=[],
              svmtype='c-svc',
              kernel='rbf',
              c=1,
              gamma=1):
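    """Train an SVM on the given descriptors of a dataset and return a
    classifier function mapping a Point to its predicted class label
    (as a string)."""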
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([p for p in dataset.points()])

    ds = transform(ds, 'normalize', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True
    })

    ds = transform(
        ds, 'svmtrain', {
            'descriptorNames': descriptorNames,
            'except': exclude,
            'className': groundTruth.className,
            'type': svmtype,
            'kernel': kernel,
            'c': c,
            'gamma': gamma
        })

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
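
A minimal usage sketch, assuming dataset is a loaded DataSet and groundTruth a GroundTruth object whose labels match its className (the descriptor name and parameters are illustrative):

classify = train_SVM(dataset, groundTruth, descriptorNames=['mfcc'])
print(classify(dataset.samplePoint()))  # -> predicted class label for that point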
Example #3
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs):
    """Evaluate the classifier on the given dataset and return the confusion matrix.

    The evaluation is performed using n-fold cross validation.
    Only the points present in the groundTruth parameter are used for the evaluation.

    Parameters
    ----------

    nfold        : the number of folds to use for the cross-validation
    dataset      : the dataset from which to get the points
    groundTruth  : a map from the points to classify to their respective class
    trainingFunc : a function which will train and return a classifier given a dataset,
                   the groundtruth, and the *args and **kwargs arguments
    """
    log.info('Doing %d-fold cross validation' % nfold)
    classes = set(groundTruth.values())
    progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d')

    # get map from class to point names
    iclasses = {}
    for c in classes:
        iclasses[c] = [ p for p in groundTruth.keys() if groundTruth[p] == c ]
        random.shuffle(iclasses[c])

    # get folds
    folds = {}
    for i in range(nfold):
        folds[i] = []
        for c in iclasses.values():
            foldsize = (len(c)-1)//nfold + 1 # ceiling division, so every instance lands in a fold; the last fold may be smaller
            folds[i] += c[ foldsize * i : foldsize * (i+1) ]

    # build sub-datasets and run evaluation on them
    confusion = None
    pnames = [ p.name() for p in dataset.points() ]

    for i in range(nfold):
        if log.isEnabledFor(logging.INFO):
            progress.update(i+1)

        trainds = DataSet()
        trainds.addPoints([ dataset.point(pname) for pname in pnames if pname not in folds[i] ])
        traingt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p not in folds[i] ]))

        testds = DataSet()
        testds.addPoints([ dataset.point(str(pname)) for pname in folds[i] ])
        testgt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p in folds[i] ]))

        classifier = trainingFunc(trainds, traingt, *args, **kwargs)
        confusion = evaluate(classifier, testds, testgt, confusion, verbose = False)

    return confusion
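
A minimal usage sketch combining this with the train_SVM helper from Example #2 (the fold count and SVM parameters are illustrative); the extra keyword arguments are forwarded to the training function:

confusion = evaluateNfold(10, dataset, groundTruth, train_SVM,
                          descriptorNames=['mfcc'], kernel='rbf', c=1, gamma=1)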
Example #4
File: pca.py Project: DomT4/gaia
def PCA(x):
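    """Project a list of equal-length real vectors onto their principal
    components via the 'pca' transform; the output keeps the full input
    dimensionality (see the 'dimension' parameter below)."""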
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', { 'dimension': len(x[0]), 'resultName': 'pca' })

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
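
A minimal usage sketch (the input vectors are made up for illustration):

data = [[1.0, 2.0, 3.0],
        [2.0, 4.1, 5.9],
        [0.5, 1.1, 1.4],
        [3.0, 5.8, 9.2]]
for v in PCA(data):
    print(v)  # one projected vector per input vector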
Example #5
def readLibSVMDataSet(filename):
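    """Read a file in the LibSVM sparse text format (one 'label dim:value ...'
    instance per line) and return it as a DataSet with a string 'class'
    descriptor and a fixed-size real 'value' descriptor."""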
    with open(filename) as f:
        data = [l.split() for l in f]
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            l[i] = (int(dim), float(value))
            minidx = min(minidx, int(dim))
            maxidx = max(maxidx, int(dim))

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    n = 0
    points = []

    for l in data:
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)
        n += 1

        p['class'] = l[0]
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)

    return ds
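
For reference, the parser expects the standard LibSVM sparse text format, one instance per line: a class label followed by dim:value pairs. A made-up two-instance file and call:

# contents of 'train.libsvm' (made up for illustration):
#   +1 1:0.7 3:1.0
#   -1 2:0.5 3:-0.2
ds = readLibSVMDataSet('train.libsvm')
print(len(list(ds.points())))  # -> 2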
Example #6
def createDataSet():
    ds = DataSet()

    # p0.a = (0.0, 0.0) (α = undefined)
    p0 = newPoint('p0')
    p0['a'] = (0.0, 0.0)

    # p1.a = (1.0, 0.0) (α = 0)
    p1 = newPoint('p1')
    p1['a'] = (1.0, 0.0)

    # p2.a = (0.0, 1.0) (α = π/2)
    p2 = newPoint('p2')
    p2['a'] = (0.0, 1.0)

    # p3.a = (-1.0, 0.0) (α = π)
    p3 = newPoint('p3')
    p3['a'] = (-1.0, 0.0)

    # p4.a = (1.0, 1.0) (α = π/4)
    p4 = newPoint('p4')
    p4['a'] = (1.0, 1.0)

    # p5.a = (1.0, -1.0) (α = -π/4)
    p5 = newPoint('p5')
    p5['a'] = (1.0, -1.0)

    ds.addPoints([ p0, p1, p2, p3, p4, p5 ])

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
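
The α values in the comments above are the polar angles of each point's 'a' descriptor; a quick sanity check using only the standard library:

import math
for x, y in [(1.0, 0.0), (0.0, 1.0), (-1.0, 0.0), (1.0, 1.0), (1.0, -1.0)]:
    print('atan2(%g, %g) = %.4f rad' % (y, x, math.atan2(y, x)))
# -> 0.0000, 1.5708 (= π/2), 3.1416 (= π), 0.7854 (= π/4), -0.7854 (= -π/4)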