Example No. 1
    def testLayout(self):
        layout = testdata.createSimpleLayout()

        (tmpFile, tmpName) = tempfile.mkstemp()
        os.close(tmpFile)

        # write dataset with layout
        p = Point()
        p.setName('p1')
        p.setLayout(layout)
        p2 = Point()
        p2.setName('p2')
        p2.setLayout(layout)

        p['a.1'] = 23
        self.assertEqual(p['a.1'], 23)
        self.assertRaises(Exception, p.setValue, 'a.4', 34)

        ds1 = DataSet()
        ds1.setName('ds1')
        ds1.addPoint(p)
        ds1.addPoint(p2)
        ds1.save(tmpName)

        # reload dataset
        ds2 = DataSet()
        ds2.load(tmpName)
        self.assertEqual(layout, ds2.layout())
        self.assertEqual(ds2.point('p1')['a.1'], 23)

        # remove temp file
        os.remove(tmpName)
Example No. 2
    def testComplete2(self):
        # have a transformed 2.0 dataset, load it, and have gaia 2.1 transform
        # a point using the history.
        ds = DataSet()

        # loading a gaia 2.0 dataset is expected to fail in this version,
        # so the early return below effectively disables the rest of the test
        self.assertRaises(Exception, ds.load, testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)
        return

        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)

        ds21 = DataSet()
        ds21.load(testdata.TEST_DATABASE)
        p = ds21.point("17 Blue Monday ['88 12' Version].mp3")

        ds21 = ds.history().mapDataSet(ds21)

        self.assertEqual(ds.history().mapPoint(p),
                         ds21.history().mapPoint(p))

        ds = transform(ds, 'fixlength')
        ds21 = transform(ds21, 'fixlength')

        def search(ds, p):
            p = ds.history().mapPoint(p)
            dist = MetricFactory.create('euclidean', ds.layout())
            return View(ds).nnSearch(p, dist).get(5)

        self.compareResults(search(ds, p),
                            search(ds21, p))
Example No. 3
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs):
    """Evaluate the classifier on the given dataset and returns the confusion matrix.

    The evaluation is performed using n-fold cross validation.
    Uses only the points that are in the groundTruth parameter for the evaluation.

    Parameters
    ----------

    nfold        : the number of folds to use for the cross-validation
    dataset      : the dataset from which to get the points
    groundTruth  : a map from the points to classify to their respective class
    trainingFunc : a function which will train and return a classifier given a dataset,
                   the groundtruth, and the *args and **kwargs arguments
    """
    log.info('Doing %d-fold cross validation' % nfold)
    classes = set(groundTruth.values())
    progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d')

    # get map from class to point names
    iclasses = {}
    for c in classes:
        iclasses[c] = [ p for p in groundTruth.keys() if groundTruth[p] == c ]
        random.shuffle(iclasses[c])

    # get folds
    folds = {}
    for i in range(nfold):
        folds[i] = []
        for c in iclasses.values():
            foldsize = (len(c)-1)//nfold + 1 # -1/+1 so we take all instances into account, last fold might have fewer instances
            folds[i] += c[ foldsize * i : foldsize * (i+1) ]

    # build sub-datasets and run evaluation on them
    confusion = None
    pnames = [ p.name() for p in dataset.points() ]

    for i in range(nfold):
        if log.isEnabledFor(logging.INFO):
            progress.update(i+1)

        trainds = DataSet()
        trainds.addPoints([ dataset.point(pname) for pname in pnames if pname not in folds[i] ])
        traingt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p not in folds[i] ]))

        testds = DataSet()
        testds.addPoints([ dataset.point(str(pname)) for pname in folds[i] ])
        testgt = GroundTruth(groundTruth.className, dict([ (p, c) for p, c in groundTruth.items() if p in folds[i] ]))

        classifier = trainingFunc(trainds, traingt, *args, **kwargs)
        confusion = evaluate(classifier, testds, testgt, confusion, verbose = False)

    return confusion
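
A hedged usage sketch for evaluateNfold, reusing the train_SVM helper shown in Example No. 6 below as the training function (file names, descriptor patterns and SVM parameters are hypothetical):

ds = DataSet()
ds.load('features.db')                     # hypothetical merged dataset
gt = GroundTruth.fromFile('genre.yaml')    # hypothetical groundtruth file

# 10-fold cross-validation; extra keyword arguments are forwarded to the training function
confusion = evaluateNfold(10, ds, gt, train_SVM,
                          descriptorNames='lowlevel.*',
                          exclude='metadata.*',
                          c=1, gamma=1)
confusion.save('eval_0.result')            # hypothetical output file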
Example No. 4
    def get_miximized_tracks(self, filenames):
        """Get list of tracks in ideal order."""
        for filename in filenames:
            self.queue.put((ADD, filename))
        while self.queue.qsize():
            print("waiting for analysis")
            sleep(10)
        encoded = [f.encode('utf-8') for f in filenames]
        dataset = DataSet()
        number_of_tracks = len(filenames)
        for filename in encoded:
            if not self.gaia_db.contains(filename):
                continue
            point = self.gaia_db.point(filename)
            dataset.addPoint(point)
        dataset = self.transform(dataset)
        matrix = {}
        for filename in encoded:
            matrix[filename] = {
                name: score
                for score, name in self.get_neighbours(dataset, filename,
                                                       number_of_tracks)
            }
        clusterer = Clusterer(encoded, lambda f1, f2: matrix[f1][f2])
        clusterer.cluster()
        result = []
        for cluster in clusterer.clusters:
            result.extend([encoded.index(filename) for filename in cluster])
        return result
Example No. 5
    def testMergePointsWithDifferentEnumerationMaps(self):
        # ticket #74: when changing the layout of a point, we must also make sure
        # that the enum maps are correctly mapped

        p1 = Point()
        p1.setName('p1')
        p1.setLayout(self.l1)
        p1['d'] = 'hello'

        p2 = Point()
        p2.setName('p2')
        p2.setLayout(self.l1)
        p2['d'] = 'world'

        ds = DataSet()
        ds.addPoint(p1)
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')

        ds.removePoint('p2')
        ds = transform(ds, 'enumerate', { 'descriptorNames': 'd' })
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')
Example No. 6
def train_SVM(dataset,
              groundTruth,
              descriptorNames,
              exclude=[],
              svmtype='c-svc',
              kernel='rbf',
              c=1,
              gamma=1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([p for p in dataset.points()])

    ds = transform(ds, 'normalize', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True
    })

    ds = transform(
        ds, 'svmtrain', {
            'descriptorNames': descriptorNames,
            'except': exclude,
            'className': groundTruth.className,
            'type': svmtype,
            'kernel': kernel,
            'c': c,
            'gamma': gamma
        })

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
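
A minimal usage sketch for the train_SVM helper above, assuming a dataset and a groundtruth file have already been prepared (all names and parameter values are hypothetical):

ds = DataSet()
ds.load('features.db')                     # hypothetical dataset file
gt = GroundTruth.fromFile('genre.yaml')    # hypothetical groundtruth file

# train on all low-level descriptors, leaving metadata out
classify = train_SVM(ds, gt,
                     descriptorNames='lowlevel.*',
                     exclude='metadata.*',
                     c=10, gamma=0.01)

# the returned callable maps a point through the training history and yields its predicted class
p = ds.point('some_track.mp3')             # hypothetical point name
print(classify(p))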
Example No. 7
def train_svm_history(project, params, output_file_path):
    params_model = params["model"]
    if params_model.get("classifier") != "svm":
        raise GaiaWrapperException(
            "Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(
        os.path.join(
            project["datasetsDirectory"], "%s-%s.db" %
            (project["className"], params_model["preprocessing"])))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(
        ds,
        gt,
        type=params_model["type"],
        kernel=params_model["kernel"],
        C=params_model["C"],
        gamma=params_model["gamma"])  # doing the whole training
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
Example No. 8
    def testKullbackLeibler(self):
        ds = transform(testdata.loadTestDB(), 'fixlength')

        # create a dataset with more than 1000 points, otherwise the test is useless
        # because the workload is split into chunks of 1000 points when computing distances
        dstest = DataSet()
        ncopy = 20
        for cidx in range(ncopy):
            points = list(ds.points())
            for p in points:
                p.setName(p.name() + '-%d' % cidx)
            dstest.addPoints(points)

        # test that the KL distance doesn't break with multithreading (it did in 2.2.1)
        v = View(dstest)
        dist = MetricFactory.create('kullbackleibler',
                                    dstest.layout(),
                                    { 'descriptorName': 'mfcc' })


        results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
        expected = [ 0.0 ]*2*ncopy + [ 6.1013755798339844 ]*ncopy
        expected += [ 6.4808731079101562 ]*2*ncopy + [ 6.7828292846679688 ]*ncopy

        for r, e in zip(results, expected):
            self.assertAlmostEqual(r[1], e, 5)
Example No. 9
def createDataSet():
 
    l = PointLayout()
    l.add('a', RealType)

    ds = DataSet()

    # p1.a = (0.0, 0.0)
    p = Point()
    p.setName('p1')
    p.setLayout(l)
    p['a'] = (0.0, 0.0)
    ds.addPoint(p)

    # p2.a = (0.5, 1.0)
    p = Point()
    p.setName('p2')
    p.setLayout(l)
    p['a'] = (0.5, 1.0)
    ds.addPoint(p)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
Example No. 10
def trainSVMHistory(configFilename, paramsFilename, outputHistoryFilename,
                    className):
    config = yaml.load(open(configFilename).read())
    params = yaml.load(open(paramsFilename).read())['model']

    if params.pop('classifier') != 'svm':
        raise Exception('Can only use this script on SVM config parameters.')

    preproc = params.pop('preprocessing')

    ds = DataSet()
    ds.load(
        join(
            split(configFilename)[0],  # base dir
            config['datasetsDirectory'],  # datasets dir
            '%s-%s.db' % (config['className'], preproc)))  # dataset name

    gt = GroundTruth.fromFile(config['groundtruth'])

    if className:
        gt.className = className

    # add 'highlevel.' in front of the descriptor; this is what will appear in the final Essentia sigfile
    gt.className = 'highlevel.' + gt.className

    # do the whole training
    h = trainSVM(ds, gt, **params)

    h.save(outputHistoryFilename)
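
A short sketch of calling trainSVMHistory, assuming a project config and a parameter file with the layout the function expects (all file names are hypothetical):

trainSVMHistory('project_config.yaml',   # hypothetical config with datasetsDirectory, className, groundtruth
                'params_0.param',        # hypothetical yaml file with a 'model' section
                'genre.history',         # where to save the resulting transformation history
                'genre')                 # class name; 'highlevel.' is prepended automatically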
Example No. 11
    def __init__(self, indexing_only_mode=False):
        self.indexing_only_mode = indexing_only_mode
        self.index_path = INDEX_DIR
        self.original_dataset = DataSet()
        self.pca_dataset = DataSet()
        if not self.indexing_only_mode:
            self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)
        else:
            self.original_dataset_path = self.__get_dataset_path(
                INDEXING_SERVER_INDEX_NAME)
        self.descriptor_names = {}
        self.metrics = {}
        self.view = None
        self.view_pca = None
        self.transformations_history = None

        self.__load_dataset()
Example No. 12
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):

        try:
            classifier = param['classifier']
            gt = GroundTruth(classifier)
            gt.load(gtname)

            # force the GroundTruth class name to be the one specified by our project file, not
            # the one in the original groundtruth file
            gt.className = className

            ds = DataSet()
            ds.load(dsname)

            # some points may have failed to be analyzed, remove those from the GroundTruth
            pnames = ds.pointNames()
            for pid in list(gt.keys()):
                if pid not in pnames:
                    log.warning(
                        'Removing %s from GroundTruth as it could not be found in the merged dataset'
                        % pid)
                    del gt[pid]

            trainerFun, trainingparam, newds = getTrainer(
                classifier, param, ds)

            # run all the evaluations specified in the evaluation config
            for i, evalparam in enumerate(evalconfig):
                # if we already ran this evaluation, no need to run it again...
                resultFilename = outfilename + '_%d.result' % i
                if exists(resultFilename):
                    log.info('File %s already exists. Skipping evaluation...' %
                             resultFilename)
                    continue

                log.info(
                    'Running evaluation %d for: %s with classifier %s and dataset %s'
                    % (i, outfilename, param['classifier'],
                       param['preprocessing']))
                log.info('    PID: %d, parameters: %s' %
                         (os.getpid(), json.dumps(param)))

                # run evaluation
                confusion = evaluateNfold(evalparam['nfold'], ds, gt,
                                          trainerFun, **trainingparam)

                # write evaluation params & result
                with open(outfilename + '_%d.param' % i, 'w') as f:
                    yaml.dump({'model': param, 'evaluation': evalparam}, f)

                confusion.save(resultFilename)

        except Exception:
            log.error(
                'While doing evaluation with param = %s\nevaluation = %s' %
                (param, evalconfig))
            raise
Example No. 13
    def initialize_gaia_db(self):
        """Load or initialize the gaia database."""
        if not os.path.isfile(self.gaia_db_path):
            dataset = DataSet()
        else:
            dataset = self.load_gaia_db()
            self.transformed = True
        print("songs in db: %d" % dataset.size())
        return dataset
Example No. 14
    def testDataSet(self):
        # load 2.0 dataset, check some values are correct
        ds = DataSet()
        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

        self.assertAlmostEqual(ds.point('01 Message - Grandmaster Flash.mp3').value('tempotap_bpm'),
            101.05792999)

        self.assertEqual(ds.point('04 Blue Skies.mp3').label('key_key'),
            'G#')
Example No. 15
    def testAddToDataSetWithDifferentLayouts(self):
        p1 = Point()
        p1.setLayout(self.l1) # +1, ref = 2
        p2 = Point()

        ds = DataSet()
        ds.addPoint(p1) # +2 (dataset+pointcopy), ref = 4
        self.assertRaises(Exception, ds.addPoint, p2)
        self.assertEqual(p1.layout().ref(), 4)
        self.assertEqual(p2.layout().ref(), 1)
Example No. 16
    def __init__(self):
        self.as_dataset = DataSet()
        self.tag_dataset = DataSet()
        self.fs_dataset = DataSet()
        self.ac_dataset = DataSet()
        self.gaia_similiarity = None

        self.index_path = clust_settings.INDEX_DIR

        self.as_view = None
        self.as_metric = None
        self.tag_view = None
        self.tag_metric = None
        self.fs_view = None
        self.fs_metric = None
        self.ac_view = None
        self.ac_metric = None

        self.__load_datasets()
Example No. 17
    def testHistory(self):
        ds = testdata.loadTestDB()
        ignored_descs = testdata.TEST_DATABASE_VARLENGTH_REAL

        testdata.resetSettings()
        ds_orig = testdata.loadTestDB()

        # cleaning, mandatory step
        ds = transform(ds, 'fixlength', {'except': ignored_descs})
        cleaned_db = transform(ds, 'cleaner', {'except': ignored_descs})

        # removing annoying descriptors, like mfcc.cov & mfcc.icov, which don't
        # like to be normalized like the other ones (constant value: dimension)
        no_mfcc_db = transform(cleaned_db, 'remove',
                               {'descriptorNames': '*mfcc*'})

        # normalize, to have everyone change values
        normalized_db = transform(no_mfcc_db, 'normalize',
                                  {'except': ignored_descs})

        testPoints = [
            '01 Oye Como Va - Santana.mp3', '02 Carmen Burana- O Fortuna.mp3',
            '07 Romeo and Juliet- the Knights\' Dance.mp3', '11 Lambada.mp3'
        ]

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])

        (tmpFile, tmpName) = tempfile.mkstemp()
        os.close(tmpFile)
        normalized_db.save(tmpName)
        reloaded_db = DataSet()
        reloaded_db.load(tmpName)

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))
            p3 = reloaded_db.point(pointName)
            p4 = reloaded_db.history().mapPoint(ds_orig.point(pointName))

            self.assert_(p1.layout() == p2.layout())
            self.assert_(p2.layout() == p3.layout())
            self.assert_(p3.layout() == p4.layout())

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])
                self.assertEqual(p2[name], p3[name])
                self.assertEqual(p3[name], p4[name])

        # remove temp file
        os.remove(tmpName)
Example No. 18
def loadSmallDB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(TEST_SMALLDB)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example No. 19
def loadGaia20DB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(GAIA20_DB)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example No. 20
def loadTestDB():
    global useFixedLength, useEnumerate
    ds = DataSet()
    ds.load(TEST_DATABASE)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds, exclude='chords_progression_hash.value')

    return ds
Example No. 21
    def testSecondChanceForLayoutEquality(self):
        '''ticket #21: points try to morph to adapt to dataset if they cannot be naturally inserted'''
        ds = DataSet()
        p = Point()

        p.setName('Paris Hilton')
        p.load('data/04 - Cansei de Ser Sexy - Meeting Paris Hilton.mp3.sig')
        ds.addPoint(p)

        p.setName('2005')
        p.load('data/11_2005-fwyh.mp3.sig')
        ds.addPoint(p)

        self.assertEqual(ds.point('2005')['title'], '2005')
Example No. 22
    def testComplete(self):
        # load 2.0 dataset, history, apply history to dataset
        # check nn-search results are the same as the ones we get when doing it from gaia 2.0
        ds = DataSet()
        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

        h = TransfoChain()

        # loading a gaia 2.0 history is expected to fail in this version,
        # so the early return below effectively disables the rest of the test
        self.assertRaises(Exception, h.load, testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)
        return

        h.load(testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)

        ds = h.mapDataSet(ds)
        v = View(ds)
        dist = MetricFactory.create('euclidean', ds.layout())

        results = v.nnSearch('01 Respect.mp3', dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)

        ds21 = DataSet()
        ds21.load(testdata.TEST_DATABASE)
        results = v.nnSearch(h.mapPoint(ds21.point('01 Respect.mp3')), dist).get(5)
        self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)
Example No. 23
def train_svm_history(project, params, output_file_path):
    params_model = params["model"]
    if params_model.pop("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], params_model.pop("preprocessing"))
    ))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(ds, gt, **params_model)  # doing the whole training
    history.save(output_file_path)
Example No. 24
def createSimpleDataSet():
    global useFixedLength, useEnumerate
    l = createSimpleLayout()
    ds = DataSet()
    p = Point()
    p.setName('p')
    p.setLayout(l)
    ds.addPoint(p)

    if useFixedLength:
        ds = fixLength(ds)

    if useEnumerate:
        ds = enumerateStrings(ds)

    return ds
Example No. 25
def transformDataSet(inputFilename, outputFilename, transfoFile=None):
    """Apply the list of transformations given as a yaml sequence to the specified dataset."""
    print('Preprocessing dataset chunk for %s...' % outputFilename)
    gaia2.cvar.verbose = False

    transfoList = '''
    - transfo: removevl
    - transfo: fixlength
    - transfo: cleaner
    '''

    if transfoFile is not None:
        transfoList = open(transfoFile).read()

    ds = DataSet()
    ds.load(inputFilename)

    ds = applyTransfoChain(ds, transfoList)

    ds.save(outputFilename)
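
A usage sketch with hypothetical file names; when transfoFile is omitted, the default removevl/fixlength/cleaner chain above is applied:

# apply the default preprocessing chain
transformDataSet('raw_chunk.db', 'clean_chunk.db')

# or drive it with a custom yaml transformation list
transformDataSet('raw_chunk.db', 'normalized_chunk.db', transfoFile='transfo.yaml')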
Example No. 26
def harmonizeChunks(partfiles):
    # TODO: check all histories are the same; if not, try to do something about it
    # find the GCLD (greatest common layout divisor :-) )
    ds = DataSet()
    ds.load(partfiles[0])
    origLayout = ds.layout().copy()
    gcld = ds.layout().copy()

    for pfile in partfiles[1:]:
        ds.load(pfile)
        gcld = gcld & ds.layout()

    # keep some stats about which descriptors got removed and the reason why before throwing
    # away the original history and simplifying it
    vldescs = set()
    nandescs = set()

    # now that we have our GCLD, transform all the chunks so they have the same layout (our GCLD)
    # and simplify their histories so that they also have the same history (the minimum history
    # required to arrive at this target layout).
    for pfile in partfiles:
        ds.load(pfile)

        for t in ds.history().toPython():
            tname = t['Analyzer name']
            descs = t['Applier parameters']['descriptorNames']
            if tname == 'cleaner': nandescs.update(descs)
            elif tname == 'removevl': vldescs.update(descs)

        toremove = ds.layout().differenceWith(gcld)
        if toremove:
            ds = transform(ds, 'remove', {'descriptorNames': toremove})

        ds.simplifyHistory()
        ds.save(pfile)

    # also get the other descriptors that got removed (because of a select or remove transfo)
    rdescs = set(origLayout.differenceWith(gcld)) - (vldescs | nandescs)

    return vldescs, nandescs, rdescs
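
A hedged usage sketch for harmonizeChunks, assuming the chunk files were produced by transformDataSet above (file names are hypothetical):

partfiles = ['chunk_0.db', 'chunk_1.db', 'chunk_2.db']   # hypothetical dataset chunks
vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
print('removed as variable-length:', sorted(vldescs))
print('removed by the cleaner:    ', sorted(nandescs))
print('removed by select/remove:  ', sorted(rdescs))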
Example No. 27
    def testComplexReferenceCounting(self):
        ds = DataSet()
        self.assertEqual(ds.layout().ref(), 2) # 1 + 1 from temp object

        p = Point()
        p.setName('p1')
        lext = PointLayout(p.layout()) # +1, {lext,p}.ref = 2
        self.assertEqual(lext.ref(), 2)

        lext = p.layout().copy() # copy, lext.ref = 1; p.ref -= 1, = 1
        self.assertEqual(lext.ref(), 1)

        ds.addPoint(p) # +2 (dataset + pointcopy), ref = 3

        self.assertEqual(lext.ref(), 1)
        self.assertEqual(ds.layout().ref(), 4) # 3 + 1 temp object

        p2 = Point(p) # +1, {p,p2}.ref = 5
        p2.setName('p2')
        self.assertEqual(ds.layout().ref(), 5)
        ds.addPoint(p2)
        self.assertEqual(ds.layout().ref(), 6) # +1 pointcopy, ref = 6
Example No. 28
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
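
A small worked call of the PCA helper above; the input is a list of equal-length real vectors and each point's 'pca' projection is returned (the values below are made up):

vectors = [[1.0, 2.0, 3.0],
           [2.0, 4.1, 6.0],
           [0.5, 1.0, 1.6]]

for projected in PCA(vectors):
    print(projected)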
Example No. 29
def readLibSVMDataSet(filename):
    data = [l.split() for l in open(filename).readlines()]
    minidx = maxidx = 1
    for l in data:
        for i in range(1, len(l)):
            dim, value = l[i].split(':')
            l[i] = (int(dim), float(value))
            minidx = min(minidx, int(dim))
            maxidx = max(maxidx, int(dim))

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    n = 0
    points = []

    for l in data:
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)
        n += 1

        p['class'] = l[0]
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in l[1:]:
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)

    return ds
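
A usage sketch for readLibSVMDataSet; 'train.libsvm' is a hypothetical file in the usual libsvm sparse format, i.e. lines such as "1 1:0.5 3:2.25":

ds = readLibSVMDataSet('train.libsvm')
p = ds.point('instance_000000')
print(p.label('class'), p['value'])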
Example No. 30
def createDataSet():
    ds = DataSet()

    # p0.a = (0.0, 0.0) (α = undefined)
    p0 = newPoint('p0')
    p0['a'] = (0.0, 0.0)

    # p1.a = (1.0, 0.0) (α = 0)
    p1 = newPoint('p1')
    p1['a'] = (1.0, 0.0)

    # p2.a = (0.0, 1.0) (α = π/2)
    p2 = newPoint('p2')
    p2['a'] = (0.0, 1.0)

    # p3.a = (-1.0, 0.0) (α = π)
    p3 = newPoint('p3')
    p3['a'] = (-1.0, 0.0)

    # p4.a = (1.0, 1.0) (α = π/4)
    p4 = newPoint('p4')
    p4['a'] = (1.0, 1.0)

    # p5.a = (1.0, -1.0) (α = -π/4)
    p5 = newPoint('p5')
    p5['a'] = (1.0, -1.0)

    ds.addPoints([ p0, p1, p2, p3, p4, p5 ])

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)

    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds