Example #1
    def testRegressionGaia14(self):
        ds = testdata.loadSmallDB()
        ds = transform(ds, 'fixlength')

        to_remove = testdata.TEST_SMALLDB_VARLENGTH
        dsr = transform(ds, 'remove', {'descriptorNames': to_remove})

        self.compareResults(search(dsr, '1_ethno.wav', 5),
                            testdata.SMALL_DB_RAW_RESULTS)

        dsc = transform(dsr, 'cleaner')
        self.compareResults(search(dsc, '1_ethno.wav', 5),
                            testdata.SMALL_DB_CLEAN_RESULTS)

        dsn = transform(dsc, 'normalize')
        self.compareResults(search(dsn, '1_ethno.wav', 5),
                            testdata.SMALL_DB_NORM_RESULTS)

        dspca = transform(dsn, 'pca', {
            'resultName': 'pca30',
            'dimension': 30,
            'descriptorNames': '*'
        })
        self.compareResults(search(dspca, '1_ethno.wav', 5),
                            testdata.SMALL_DB_PCA_RESULTS)
Example #2
def train_SVM(dataset,
              groundTruth,
              descriptorNames,
              exclude=[],
              svmtype='c-svc',
              kernel='rbf',
              c=1,
              gamma=1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([p for p in dataset.points()])

    ds = transform(ds, 'normalize', {
        'descriptorNames': descriptorNames,
        'except': exclude,
        'independent': True
    })

    ds = transform(
        ds, 'svmtrain', {
            'descriptorNames': descriptorNames,
            'except': exclude,
            'className': groundTruth.className,
            'type': svmtype,
            'kernel': kernel,
            'c': c,
            'gamma': gamma
        })

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
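
A minimal usage sketch for the function above (hypothetical dataset, ground-truth object and point; the descriptor patterns are illustrative only):

    classify = train_SVM(ds, groundTruth,
                         descriptorNames='*.mean',
                         exclude=['*cov*'])
    print(classify(some_point))   # predicted class label, as a string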
Example #3
    def testMerge(self):
        #setDebugLevel(GAlgorithms)
        ds = testdata.loadTestDB()
        ds1 = transform(ds, 'select', { 'descriptorNames': '*.mean' })
        ds2 = transform(ds, 'select', { 'descriptorNames': '*.var' })
        ds12 = transform(ds, 'select', { 'descriptorNames': [ '*.mean', '*.var'] })

        ds_merged = mergeDataSets(ds1, ds2)

        # we need to do this because to add a Point we need it with the
        # original layout, not the FixedLength one
        testdata.resetSettings()
        ds_orig = testdata.loadTestDB()
        sp = ds_orig.samplePoint()

        # test if we can add points normally
        ds_merged.removePoint(sp.name())
        ds_merged.addPoint(sp)

        # compare datasets contents
        self.compareDataSets(ds12, ds_merged)

        # test the mapDataSet function of the Merge applier
        ds_remapped = ds_merged.history().mapDataSet(ds_orig)

        self.compareDataSets(ds12, ds_remapped)
        self.compareDataSets(ds_merged, ds_remapped)
Example #4
    def testComplete2(self):
        # have a transformed 2.0 dataset, load it, and have gaia 2.1 transform
        # a point using the history.
        ds = DataSet()

        self.assertRaises(Exception, ds.load, testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)
        return  # loading the 2.0 dataset raises, so the rest of this test is effectively disabled

        ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_PCA_DATASET)

        ds21 = DataSet()
        ds21.load(testdata.TEST_DATABASE)
        p = ds21.point("17 Blue Monday ['88 12' Version].mp3")

        ds21 = ds.history().mapDataSet(ds21)

        self.assertEqual(ds.history().mapPoint(p),
                         ds21.history().mapPoint(p))

        ds = transform(ds, 'fixlength')
        ds21 = transform(ds21, 'fixlength')

        def search(ds, p):
            p = ds.history().mapPoint(p)
            dist = MetricFactory.create('euclidean', ds.layout())
            return View(ds).nnSearch(p, dist).get(5)

        self.compareResults(search(ds, p),
                            search(ds21, p))
Example #5
 def testGaussianize(self):
     ds = testdata.loadTestDB()
     ds = transform(ds, 'removevl')
     ds = transform(ds, 'fixlength')
     self.assertEqual(ds[0]['tempotap_bpm'], 104.28208160400391)
     ds = transform(ds, 'gaussianize')
     self.assertEqual(ds[0]['tempotap_bpm'], -0.1928621232509613)
Example #6
def train_SVM(dataset, groundTruth, descriptorNames, exclude=[], svmtype='c-svc',
              kernel='rbf', c=1, gamma=1, balanceClasses=False):
    # recreate a copy of the given dataset without history
    ds = dataset.copy()
    ds.forgetHistory()

    ds = transform(ds, 'select', { 'descriptorNames': descriptorNames,
                                   'except': exclude })

    ds = transform(ds, 'cleaner')

    ds = transform(ds, 'normalize', { 'independent': True })

    ds = transform(ds, 'addfield',  { 'string': groundTruth.className })

    for p in ds.points():
        p[groundTruth.className] = groundTruth[p.name()]

    ds = transform(ds, 'svmtrain', { 'className': groundTruth.className,
                                     'type': svmtype,
                                     'kernel': kernel,
                                     'c': c,
                                     'gamma': gamma,
                                     'balanceClasses': balanceClasses})

    h = ds.history()
    return lambda p: h.mapPoint(p)[groundTruth.className]
Example #7
def train_SVM(dataset, groundTruth, descriptorNames, exclude = [], svmtype = 'c-svc', kernel = 'rbf', c = 1, gamma = 1):
    # recreate a copy of the given dataset without history
    ds = dataset.copy()
    ds.forgetHistory()

    ds = transform(ds, 'select', { 'descriptorNames': descriptorNames,
                                   'except': exclude })

    ds = transform(ds, 'cleaner')

    ds = transform(ds, 'normalize', { 'independent': True })

    ds = transform(ds, 'addfield',  { 'string': groundTruth.className })

    for p in ds.points():
        p[groundTruth.className] = groundTruth[p.name()]

    ds = transform(ds, 'svmtrain', { 'className': groundTruth.className,
                                     'type': svmtype,
                                     'kernel': kernel,
                                     'c': c,
                                     'gamma': gamma })

    h = ds.history()
    return lambda p: h.mapPoint(p)[groundTruth.className]
Example #8
    def prepare_original_dataset_helper(ds):
        proc_ds1 = transform(ds, "RemoveVL")
        proc_ds2 = transform(proc_ds1, "FixLength")
        proc_ds1 = None
        prepared_ds = transform(proc_ds2, "Cleaner")
        proc_ds2 = None

        return prepared_ds
Example #9
    def normalize_dataset_helper(ds):
        # Remove ['.lowlevel.mfcc.cov','.lowlevel.mfcc.icov'] (they give errors when normalizing)
        ds = transform(ds, "remove", {"descriptorNames": [".lowlevel.mfcc.cov", ".lowlevel.mfcc.icov"]})
        # Add normalization
        normalization_params = {"descriptorNames": "*", "independent": True, "outliers": -1}
        normalized_ds = transform(ds, "normalize", normalization_params)
        ds = None

        return normalized_ds
Example #10
    def testNormalize(self):
        ds = createDataSet()
        ds = transform(ds, 'fixlength')
        dsn = transform(ds, 'normalize')

        # by default, vector-normalization is used
        self.assertEqual(dsn.point('p2').value('a'), (0.5, 1.0))

        dsn2 = transform(ds, 'normalize', { 'independent': True })
        self.assertEqual(dsn2.point('p2').value('a'), (1.0, 1.0))
Example #11
 def testQt46FloatParameterBug(self):
     # Note: this was triggered by Qt 4.6 introducing a QVariant(float) constructor, which caused
     #       pmapToPython to fail with an unknown type error (followed by a segfault)...
     ds = testdata.loadTestDB()
     ds = transform(ds, 'fixlength')
     ds = transform(ds, 'removevl')
     ds = transform(ds, 'normalize')
     self.assertEqual(
         ds.history().toPython()[-1]['Applier parameters']['coeffs']
         ['.barkbands.mean']['a'][0], 24.922689437866211)
Example #12
    def testHistory(self):
        ds = testdata.loadTestDB()
        ignored_descs = testdata.TEST_DATABASE_VARLENGTH_REAL

        testdata.resetSettings()
        ds_orig = testdata.loadTestDB()

        # cleaning, mandatory step
        ds = transform(ds, 'fixlength', {'except': ignored_descs})
        cleaned_db = transform(ds, 'cleaner', {'except': ignored_descs})

        # removing annoying descriptors, like mfcc.cov & mfcc.icov, which don't
        # like to be normalized like the other ones (constant value: dimension)
        no_mfcc_db = transform(cleaned_db, 'remove',
                               {'descriptorNames': '*mfcc*'})

        # normalize, to have everyone change values
        normalized_db = transform(no_mfcc_db, 'normalize',
                                  {'except': ignored_descs})

        testPoints = [
            '01 Oye Como Va - Santana.mp3', '02 Carmen Burana- O Fortuna.mp3',
            '07 Romeo and Juliet- the Knights\' Dance.mp3', '11 Lambada.mp3'
        ]

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])

        (tmpFile, tmpName) = tempfile.mkstemp()
        os.close(tmpFile)
        normalized_db.save(tmpName)
        reloaded_db = DataSet()
        reloaded_db.load(tmpName)

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))
            p3 = reloaded_db.point(pointName)
            p4 = reloaded_db.history().mapPoint(ds_orig.point(pointName))

            self.assert_(p1.layout() == p2.layout())
            self.assert_(p2.layout() == p3.layout())
            self.assert_(p3.layout() == p4.layout())

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])
                self.assertEqual(p2[name], p3[name])
                self.assertEqual(p3[name], p4[name])

        # remove temp file
        os.remove(tmpName)
Example #13
 def prepare_original_dataset_helper(ds):
     ds = transform(
         ds, 'FixLength'
     )  # this transformation marks which descriptors are of fixed length, which optimizes things
     ds = transform(ds, 'Cleaner')
     try:
         ds = transform(ds, 'enumerate',
                        {'descriptorNames': ['.tonal.chords_progression']})
     except:
         logger.info(
             'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
         )
     return ds
Example #14
def gaia_transform(points):
    """
        Takes a dict of point names and filepaths.
        Creates a DataSet and performs the standard transformations 
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        except Exception, e:
            log.error("Problem removing this descriptor: %s" % e)
    ds = transform(ds, 'normalize')
    return ds
Example #15
    def testRCA(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'removevl')
        ds = transform(ds, 'fixlength')
        ds = transform(ds, 'remove', { 'descriptorNames': '*cov' })
        ds = transform(ds, 'cleaner')
        ds = transform(ds, 'normalize')
        ds = transform(ds, 'pca', { 'resultName': 'pca15',
                                    'dimension': 15 })
        ds_rca = transform(ds, 'rca', { 'resultName': 'rca10',
                                        'dimension': 10,
                                        'classFile': testdata.RCA_GENRE_GT })

        v = View(ds_rca)
        dist = MetricFactory.create('euclidean', ds_rca.layout())
        self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                            testdata.RCA_GENRE_RESULTS)

        # try by passing directly the groundtruth map
        import gaia2.fastyaml as yaml
        ds_rca = transform(ds, 'rca', { 'resultName': 'rca10',
                                        'dimension': 10,
                                        'classMap': yaml.load(open(testdata.RCA_GENRE_GT).read()) })

        v = View(ds_rca)
        dist = MetricFactory.create('euclidean', ds_rca.layout())
        self.compareResults(v.nnSearch('01 Cigarettes And Alcohol - Oasis.mp3', dist).get(10),
                            testdata.RCA_GENRE_RESULTS)
Example #16
 def prepare_original_dataset_helper(ds):
     ds = transform(
         ds, 'FixLength'
     )  # Needed to optimize use of fixed-length descriptors and save memory
     ds = transform(
         ds, 'Cleaner'
     )  # Remove descriptors that will cause problems in further transformations
     try:
         ds = transform(ds, 'enumerate',
                        {'descriptorNames': ['.tonal.chords_progression']})
     except:  # TODO: exception too broad here...
         logger.info(
             'WARNING: enumerate transformation to .tonal.chords_progression could not be performed.'
         )
     return ds
Example #17
    def testDeleteUnderlyingDataSet(self):
        ds = testdata.loadTestDB()

        params = {'descriptorNames': ['*.mean', '*.var']}

        ds = transform(ds, 'fixlength', params)
        ds = transform(ds, 'cleaner', params)
        ds = transform(ds, 'normalize', params)
        dist = MetricFactory.create('euclidean', ds.layout(), params)

        v = View(ds)
        del ds

        #self.assertRaises(Exception, v.nnSearch, '01 Respect.mp3')
        # this doesn't throw anymore, as the View keeps a ref to the dataset
        v.nnSearch('01 Respect.mp3', dist)
Example #18
    def testEnumerateKey(self):
        db = testdata.loadTestDB()

        testdata.useEnumerate = True
        dbe = testdata.loadTestDB()

        # also make sure we can map single points correctly
        # we need to load it separately and not take it from the dataset to ensure
        # that it'll have a different enum map
        p = Point()
        p.load('data/dataset_small/Vocal and Acapella/04 Blue Skies.mp3.sig')
        print(p.name())

        #also create a transfo that forwards enums after we did the enumerate transfo
        dbes = transform(dbe, 'select', { 'descriptorNames': '*key*' })
        pe = dbes.history().mapPoint(p)

        self.assertEqual(p['key_mode'], pe['key_mode'])
        self.assertEqual(p['key_key'],  pe['key_key'])

        self.assertNotEqual(db.layout(), dbe.layout())

        for p in db.points():
            pe = dbe.point(p.name())

            self.assertEqual(p.label('key_key'),
                             pe.label('key_key'))

            self.assertEqual(p.label('key_mode'),
                             pe.label('key_mode'))
Example #19
    def __load_dataset(self):
        """
        Loads the dataset and does all the necessary steps to make it available for similarity queries, creating the PCA
        version of it. If the dataset does not exist, creates a new empty one.
        NOTE: we assume that loaded datasets will have been prepared and normalized (see
        __prepare_original_dataset() and __normalize_original_dataset()) in due time (see the add_point() method below);
        therefore this function does not prepare or normalize loaded datasets.
        """

        if not os.path.exists(sim_settings.INDEX_DIR):
            os.makedirs(sim_settings.INDEX_DIR)

        # load original dataset
        if os.path.exists(self.original_dataset_path):
            self.original_dataset.load(self.original_dataset_path)
            self.__calculate_descriptor_names()

            if (self.original_dataset.size() >= sim_settings.SIMILARITY_MINIMUM_POINTS
                    and not self.indexing_only_mode):

                # Save transformation history so we do not need to compute it every time we need it
                self.transformations_history = self.original_dataset.history().toPython()

                # Build metrics for the different similarity presets, create a Gaia view
                self.__build_metrics()
                view = View(self.original_dataset)
                self.view = view

                # Compute PCA and create pca view and metric
                # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
                # when the similarity server is loaded.
                self.pca_dataset = transform(
                    self.original_dataset, 'pca', {
                        'descriptorNames': sim_settings.PCA_DESCRIPTORS,
                        'dimension': sim_settings.PCA_DIMENSIONS,
                        'resultName': 'pca'
                    })
                self.pca_dataset.setReferenceDataSet(self.original_dataset)
                self.view_pca = View(self.pca_dataset)
                self.__build_pca_metric()

            if self.original_dataset.history().size() <= 0:
                logger.info('Dataset loaded, size: %s points' %
                            (self.original_dataset.size()))
            else:
                logger.info(
                    'Dataset loaded, size: %s points (%i fixed-length desc., %i variable-length desc.)'
                    % (self.original_dataset.size(),
                       len(self.descriptor_names['fixed-length']),
                       len(self.descriptor_names['variable-length'])))

        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be possible until
            # the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            self.__calculate_descriptor_names()
            logger.info('Created new dataset, size: %s points (should be 0)' %
                        (self.original_dataset.size()))
Example #20
    def testKullbackLeibler(self):
        ds = transform(testdata.loadTestDB(), 'fixlength')

        # create a test dataset with more than 1000 points; otherwise the test is useless because
        # we split the workload in chunks of 1000 points when computing the distance
        dstest = DataSet()
        ncopy = 20
        for cidx in range(ncopy):
            points = list(ds.points())
            for p in points:
                p.setName(p.name() + '-%d' % cidx)
            dstest.addPoints(points)

        # test that KL doesn't break with multithreading (it did in 2.2.1)
        v = View(dstest)
        dist = MetricFactory.create('kullbackleibler',
                                    dstest.layout(),
                                    { 'descriptorName': 'mfcc' })


        results = v.nnSearch(ds.samplePoint(), dist).get(6*ncopy)
        expected = [ 0.0 ]*2*ncopy + [ 6.1013755798339844 ]*ncopy
        expected += [ 6.4808731079101562 ]*2*ncopy + [ 6.7828292846679688 ]*ncopy

        for r, e in zip(results, expected):
            self.assertAlmostEqual(r[1], e, 5)
Example #21
    def testMergePointsWithDifferentEnumerationMaps(self):
        #'''ticket #74: when changing the layout of a point, we must also make sure that the enum maps are correctly mapped'''

        p1 = Point()
        p1.setName('p1')
        p1.setLayout(self.l1)
        p1['d'] = 'hello'

        p2 = Point()
        p2.setName('p2')
        p2.setLayout(self.l1)
        p2['d'] = 'world'

        ds = DataSet()
        ds.addPoint(p1)
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')

        ds.removePoint('p2')
        ds = transform(ds, 'enumerate', { 'descriptorNames': 'd' })
        ds.addPoint(p2)

        self.assertEqual(ds.point('p1').label('d'), 'hello')
        self.assertEqual(ds.point('p2').label('d'), 'world')
Example #22
    def select(self, dbfile, pca_covered_variance=75, highlevel=True):
        '''
        dbfile: the path to the gaia dataset
        pca_covered_variance: the pca transformation should keep at least this variance
        highlevel: include high-level descriptors
        '''
        if not os.path.exists("transformed_dbs"):
            os.mkdir("transformed_dbs")
        prefix = dbfile[dbfile.rfind("/") + 1:dbfile.rfind(".")]
        print dbfile
        ds = gaia2.DataSet()
        ds.load(dbfile)
        cleaner = gaia2.AnalyzerFactory.create('cleaner')
        cleanDB = cleaner.analyze(ds).applyToDataSet(ds)

        if highlevel:
            to_remove = ['*.dmean2', '*.dvar2', '*.min', '*.max', '*cov']
        else:
            to_remove = [
                '.highlevel.*', '*.dmean2', '*.dvar2', '*.min', '*.max', '*cov'
            ]

        fselectDB = gaia2.transform(cleanDB, 'remove',
                                    {'descriptorNames': to_remove})

        # NORMALIZE, PCA & Friends
        normalize = gaia2.AnalyzerFactory.create('normalize')
        normalizedDB = normalize.analyze(fselectDB).applyToDataSet(fselectDB)

        pcavar = gaia2.AnalyzerFactory.create(
            'pca', {
                'coveredVariance': pca_covered_variance,
                'resultName': 'pca%dtransform' % pca_covered_variance
            })
        pcaDB = pcavar.analyze(normalizedDB).applyToDataSet(normalizedDB)

        mfccDB = gaia2.transform(
            cleanDB, 'select', {
                'descriptorNames': [
                    '*mfcc*', '.highlevel.*', '.rhythm.bpm',
                    '.rhythm.onset_rate'
                ]
            })

        finalDB = gaia2.mergeDataSets(mfccDB, pcaDB)
        outfile = "transformed_dbs/" + prefix + ".db"
        finalDB.save(outfile)
Example #23
    def testSVM(self):
        trainingDS = testdata.readLibSVMDataSet(testdata.SVM_TRAINING_SET)
        trainingDS = transform(trainingDS, 'fixlength')

        trained = transform(trainingDS, 'svmtrain', { 'descriptorNames': 'value',
                                                      'className': 'class',
                                                      # setting this to True makes the results
                                                      # different... bug or libsvm feature?
                                                      #'probability': True
                                                      })

        testDS = testdata.readLibSVMDataSet(testdata.SVM_TESTING_SET)
        predicted = trained.history().mapDataSet(testDS)

        expected = [ l.strip() for l in open(testdata.SVM_RESULT).readlines() ]
        for p, expectedClass in zip(predicted.points(), expected):
            self.assertEqual(p.label('class'), expectedClass)
Example #24
    def testExponentialCompress(self):
        ds = createDataSet()
        ds = transform(ds, 'fixlength')
        dist = MetricFactory.create('ExponentialCompress', ds.layout(), { 'distance': 'euclidean' })

        self.assertEqual(dist(ds.point('p1'), ds.point('p1')), 0.0)
        self.assertAlmostEqual(dist(ds.point('p1'), ds.point('p0')), 0.63212056) # 1-exp(-1)
        self.assertAlmostEqual(dist(ds.point('p1'), ds.point('p3')), 0.86466472) # 1-exp(-2)
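
The expected values in the assertions above follow from compressing a raw Euclidean distance d into 1 - exp(-d), as the inline comments suggest; a quick sanity check:

    import math
    print(1 - math.exp(-1))   # 0.6321205588285577
    print(1 - math.exp(-2))   # 0.8646647167633873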
Example #25
    def testAddFieldFixedLength(self):
        ds = testdata.loadTestDB()
        ds_fl = transform(ds, 'fixlength')
        ds_addvl = transform(ds, 'addfield', { 'real': 'hello' })
        ds_fl_addvl = transform(ds_fl, 'addfield', { 'real': 'hello' })

        self.assertEqual(ds_addvl.layout().descriptorLocation('hello').lengthType(), VariableLength)
        self.assertEqual(ds_fl_addvl.layout().descriptorLocation('hello').lengthType(), VariableLength)

        ds_addvl_fl = transform(ds_addvl, 'fixlength')
        ds_fl_addvl_fl = transform(ds_fl_addvl, 'fixlength')

        self.assertEqual(ds_addvl_fl.layout(), ds_fl_addvl_fl.layout())

        ds_fl_addfl = transform(ds_fl, 'addfield', { 'real': 'hello', 'size': { 'hello': 1 } })
        self.assertEqual(ds_fl_addfl.layout(), ds_fl_addvl_fl.layout())
        self.assertEqual(ds_fl_addfl[0]['hello'], 0)

        ds_fl_addfl2 = transform(ds_fl, 'addfield', { 'real': 'hello',
                                                      'string': 'youhou',
                                                      'size': { 'hello': 3, 'youhou': 6 },
                                                      'default': { 'hello': [ 2, 5, 3 ],
                                                                   'youhou': [ 'a', 'b', 'c', 'd', 'e', 'f' ] }
                                                      })

        self.assertEqual(ds_fl_addfl2.layout().descriptorLocation('hello').dimension(), 3)
        self.assertEqual(ds_fl_addfl2.layout().descriptorLocation('youhou').dimension(), 6)

        self.assertEqual(ds_fl_addfl2[0]['hello'], (2, 5, 3))
Example #26
 def normalize_dataset_helper(ds, descriptor_names):
     # Add normalization
     normalization_params = {
         "descriptorNames": descriptor_names,
         "independent": True,
         "outliers": -1
     }
     ds = transform(ds, 'normalize', normalization_params)
     return ds
Example #27
def mergeDataSet(eqloud):

    if eqloud == 'eqloud': ext = 'sig'
    else: ext = 'neq.sig'

    datasetName = datasetdir + className + '_%s.db' % eqloud

    if os.path.exists(datasetName):
        print('Dataset already exists:', datasetName)
        return

    if className == 'genre_itms':
        mergelist, groundTruth = getSignatureData_iTMS(str(basedir), ext,
                                                       metafile)
    elif className == 'mood':
        mergelist, groundTruth = getSignatureData_CyrilMoods(str(basedir), ext)
    elif className == 'mood_mirex':
        mergelist, groundTruth = getSignatureData_MirexMoods(str(basedir), ext)
    elif className == 'artist':
        mergelist, groundTruth = getSignatureData_MirexArtist(
            str(basedir), ext)
    else:  # genre
        mergelist, groundTruth = getSignatureData(str(basedir), ext)

    # merge dataset
    import gaia2
    ds = gaia2.DataSet.mergeFiles(mergelist)

    # preprocessing common to all tests:
    ds = gaia2.transform(ds, 'removevl')
    ds = gaia2.transform(ds, 'fixlength')
    ds = gaia2.transform(ds, 'cleaner')

    # transform the dataset to add the class information
    ds = gaia2.transform(ds, 'addfield', {'string': className})

    for p in ds.points():
        p[className] = groundTruth[p.name()]

    ds.save(datasetName)

    # write groundTruth as a pickled file (binary mode is required by pickle)
    import pickle
    pickle.dump(groundTruth, open(datasetName + '.groundtruth', 'wb'))
Example #28
def train_SVM(dataset, groundTruth, descriptorNames, exclude = [], svmtype = 'c-svc', kernel = 'rbf', c = 1, gamma = 1):
    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints([ p for p in dataset.points() ])

    ds = transform(ds, 'normalize', { 'descriptorNames': descriptorNames,
                                      'except': exclude,
                                      'independent': True })

    ds = transform(ds, 'svmtrain', { 'descriptorNames': descriptorNames,
                                     'except': exclude,
                                     'className': groundTruth.className,
                                     'type': svmtype,
                                     'kernel': kernel,
                                     'c': c,
                                     'gamma': gamma})

    h = ds.history()
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
Example #29
 def transform(dataset):
     """Transform dataset for distance computations."""
     dataset = transform(dataset, 'fixlength')
     dataset = transform(dataset, 'cleaner')
     # dataset = transform(dataset, 'remove', {'descriptorNames': '*mfcc*'})
     for field in ('*beats_position*', '*bpm_estimates*', '*bpm_intervals*',
                   '*onset_times*', '*oddtoevenharmonicenergyratio*'):
         try:
             dataset = transform(dataset, 'remove',
                                 {'descriptorNames': field})
         except Exception as ex:
             print(repr(ex))
     dataset = transform(dataset, 'normalize')
     dataset = transform(dataset, 'pca', {
         'dimension': 30,
         'descriptorNames': ['*'],
         'resultName': 'pca30'
     })
     return dataset
Example #30
 def transform(dataset):
     """Transform dataset for distance computations."""
     dataset = transform(dataset, 'fixlength')
     dataset = transform(dataset, 'cleaner')
     # dataset = transform(dataset, 'remove', {'descriptorNames': '*mfcc*'})
     for field in ('*beats_position*', '*bpm_estimates*', '*bpm_intervals*',
                   '*onset_times*', '*oddtoevenharmonicenergyratio*'):
         try:
             dataset = transform(
                 dataset, 'remove', {'descriptorNames': field})
         except Exception as ex:
             print(repr(ex))
     dataset = transform(dataset, 'normalize')
     dataset = transform(
         dataset, 'pca', {
             'dimension': 30,
             'descriptorNames': ['*'],
             'resultName': 'pca30'})
     return dataset
Example #31
def mergeDataSet(eqloud):

    if eqloud == 'eqloud': ext = 'sig'
    else: ext = 'neq.sig'

    datasetName = datasetdir + className + '_%s.db' % eqloud

    if os.path.exists(datasetName):
        print 'Dataset already exists:', datasetName
        return

    if className == 'genre_itms':
        mergelist, groundTruth = getSignatureData_iTMS(str(basedir), ext, metafile)
    elif className == 'mood':
        mergelist, groundTruth = getSignatureData_CyrilMoods(str(basedir), ext)
    elif className == 'mood_mirex':
        mergelist, groundTruth = getSignatureData_MirexMoods(str(basedir), ext)
    elif className == 'artist':
        mergelist, groundTruth = getSignatureData_MirexArtist(str(basedir), ext)
    else: # genre
        mergelist, groundTruth = getSignatureData(str(basedir), ext)

    # merge dataset
    import gaia2
    ds = gaia2.DataSet.mergeFiles(mergelist)

    # preprocessing common to all tests:
    ds = gaia2.transform(ds, 'removevl')
    ds = gaia2.transform(ds, 'fixlength')
    ds = gaia2.transform(ds, 'cleaner')

    # transform the dataset to add the class information
    ds = gaia2.transform(ds, 'addfield', { 'string': className })

    for p in ds.points():
        p[className] = groundTruth[p.name()]

    ds.save(datasetName)

    # write groundTruth as pickled file
    import cPickle
    cPickle.dump(groundTruth, open(datasetName + '.groundtruth', 'w'))
Example #32
    def testWrongArgument(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'fixlength')
        ds = transform(ds, 'removevl')
        ds = transform(ds, 'cleaner')
        ds = transform(ds, 'normalize')
        # missing param: className
        self.assertRaises(Exception, transform, ds, 'svmtrain',
                          {'descriptorNames': '*.mean'})
        # wrong param: descriptorName
        self.assertRaises(Exception, transform, ds, 'svmtrain', {
            'className': 'kloug',
            'descriptorName': '*.mean'
        })

        # missing param: resultName
        self.assertRaises(Exception, transform, ds, 'pca', {
            'dimension': 15,
            'resultName': ''
        })
Example #33
def addRCA(ds, groundTruth, dim, selectConfig = {}):
    #ds_rca = transform(ds, 'fixlength') # should be unnecessary
    ds_rca = ds
    if selectConfig:
        ds_rca = transform(ds_rca, 'select', selectConfig)

    ds_rca = transform(ds_rca, 'gaussianize')

    # if the dimension is too high, we need to preprocess with a PCA first, otherwise RCA doesn't work
    l = ds_rca.layout()
    descdim = l.descriptorLocation(l.descriptorNames()).dimension(RealType)
    if descdim > 80:
        ds_rca = transform(ds_rca, 'pca', { 'resultName': 'pca%d' % 80,
                                            'dimension': 80 })

    ds_rca = transform(ds_rca, 'rca', { 'resultName': 'rca%d' % dim,
                                        'dimension': dim,
                                        'classMap': pmap(groundTruth) })

    return mergeDataSets(ds, ds_rca)
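
A minimal usage sketch for addRCA (hypothetical dataset and ground-truth map; groundTruth is assumed to map point names to class labels, as required by the 'classMap' parameter above):

    ds_with_rca = addRCA(ds, groundTruth, dim=10,
                         selectConfig={'descriptorNames': '*.mean'})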
Example #34
    def testParsedVsConstructedFilters(self):
        ds = testdata.loadTestDB()
        ds = transform(ds, 'fixlength')

        p = ds.samplePoint()
        p2 = ds.point('Higher State of Consciousness.mp3')

        queries = [
            (p, '', ''), (p2, '', ''),
            (p2, 'WHERE value.tempotap_bpm.value > 140',
             Filter('tempotap_bpm.value', '>', 140)),
            (p, 'WHERE value.tempotap_bpm > 110',
             Filter('tempotap_bpm', '>', 110)),
            (p, 'WHERE value.tempotap_bpm > -10',
             Filter('tempotap_bpm', '>', -10)),
            (p, 'WHERE value.tempotap_bpm > 23000',
             Filter('tempotap_bpm', '>', 23000)),
            (p, 'WHERE value.tempotap_bpm > 120 AND value.tempotap_bpm < 130',
             AndFilter([
                 Filter('tempotap_bpm', '>', 120),
                 Filter('tempotap_bpm', '<', 130)
             ])),
            (p, 'WHERE value.tempotap_bpm BETWEEN 130 AND 120',
             Filter('tempotap_bpm', 'between', [130, 120])),
            (p, 'WHERE label.key_key = "C"', Filter('key_key', '==', 'C')),
            (p2,
             '''WHERE ((label.key_key = "A" AND label.key_mode = "major") OR
                                   (label.key_key = "E" AND label.key_mode = "minor"))
                                  AND value.tempotap_bpm < 90''',
             AndFilter([
                 OrFilter([
                     AndFilter([
                         Filter('key_key', '==', 'A'),
                         Filter('key_mode', '==', 'major')
                     ]),
                     AndFilter([
                         Filter('key_key', '==', 'E'),
                         Filter('key_mode', '==', 'minor')
                     ])
                 ]),
                 Filter('tempotap_bpm', '<', 90)
             ]))
        ]

        dist = MetricFactory.create('euclidean', ds.layout(),
                                    {'descriptorNames': '*.mean'})
        v = View(ds)

        for (pt, filtstr, filt) in queries:
            self.assertEqualSearchSpace(v.nnSearch(pt, dist, filtstr),
                                        v.nnSearch(pt, dist, filt))
Example #35
    def testCenter(self):
        ds = testdata.createSimpleDataSet()
        l = testdata.createSimpleLayout()
        for i in range(4):
            p = Point()
            p.setName('p%d' % i)
            p.setLayout(l)
            ds.addPoint(p)
        ds.removePoint('p')

        ds.point('p0')['a.1'] = [ 0, 1 ]
        ds.point('p1')['a.1'] = [ 4, 3 ]
        ds.point('p2')['a.1'] = [ 6, 9 ]
        ds.point('p3')['a.1'] = [ 2, 27 ]
        # mean = [ 3, 10 ]

        ds = transform(ds, 'fixlength')
        dsc = transform(ds, 'center', { 'descriptorNames': 'a.1' })

        self.assertEqual(dsc.point('p0')['a.1'], (-3, -9))
        self.assertEqual(dsc.point('p1')['a.1'], ( 1, -7))
        self.assertEqual(dsc.point('p2')['a.1'], ( 3, -1))
        self.assertEqual(dsc.point('p3')['a.1'], (-1, 17))
Example #36
File: pca.py Project: DomT4/gaia
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', { 'dimension': len(x[0]), 'resultName': 'pca' })

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
Example #37
def PCA(x):
    points = []
    layout = PointLayout()
    layout.add('x', RealType)

    for i, l in enumerate(x):
        p = Point()
        p.setName('p%d' % i)
        p.setLayout(layout)
        p['x'] = l
        points.append(p)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'pca', {'dimension': len(x[0]), 'resultName': 'pca'})

    result = []
    for p in ds.points():
        result.append(p['pca'])

    return result
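
A minimal usage sketch (hypothetical input; every row must have the same length, since the projection dimension is taken from len(x[0])):

    data = [[1.0, 2.0, 3.0],
            [2.0, 4.1, 5.9],
            [0.5, 1.1, 1.4],
            [3.0, 6.2, 8.8]]
    projected = PCA(data)
    print(len(projected))   # one projected vector per input row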
Example #38
    def testWeightedPearson(self):
        ds = testdata.createSimpleDataSet()
        ds.point('p')['a.1'] = [ 0, 0 ] # need to have 2 values before fixing length
        p1 = transform(ds, 'fixlength').point('p')
        p2 = Point(p1)

        dist = MetricFactory.create('WeightedPearson', p1.layout(), { 'weights': { '1': 0.3,
                                                                                   'c': 0.7 }
                                                                      })
        p1['a.1'] = [ 0.12, 2.71 ]
        p1['c'] = 4.32
        p2['1'] = [ 0.46, 1.12 ]
        p2['c'] = 2.4242

        self.assertAlmostEqual(dist(p1, p2), 0.038222129799, 6)
Example #39
 def normalize_dataset_helper(ds, descriptor_names):
     # NOTE: The "except" list of descriptors below should be reviewed if a new extractor is used. The point is to
     # remove descriptors that can potentially break the normalize transform (e.g. descriptors with value = 0)
     normalization_params = {
         "descriptorNames": descriptor_names,
         "except": [
             "*.min",
             "*.max",
             "tonal.chords_histogram",
         ],
         "independent": True,
         "outliers": -1
     }
     ds = transform(ds, 'normalize', normalization_params)
     return ds
Example #40
def addVarFromCov(ds, desc):
    '''Adds the .var aggregate descriptor to the specified descriptor using its
    .cov aggregate, for all the points in the dataset.'''

    ds = transform(ds, 'addfield', { 'real': desc + '.var' })

    # add the .var descriptor using .cov (it's the diagonal of the matrix)
    for p in ds.points():
        m = utils.toMatrix(p.value(desc + '.cov'))
        dim = len(m)
        diag = RealDescriptor(dim, 0.0)
        for i in range(dim):
            diag[i] = m[i][i]
        p.setValue(desc + '.var', diag)

    return ds
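
A minimal usage sketch (hypothetical descriptor name; assumes every point in the dataset carries a '<desc>.cov' aggregate, as the function requires):

    ds = addVarFromCov(ds, 'lowlevel.mfcc')
    # each point now also carries a 'lowlevel.mfcc.var' descriptor holding
    # the diagonal of its 'lowlevel.mfcc.cov' matrix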
Example #41
def harmonizeChunks(partfiles):
    # TODO: check all histories are the same, if not, try to do sth about it
    # find the GCLD (greatest common layout divisor :-) )
    ds = DataSet()
    ds.load(partfiles[0])
    origLayout = ds.layout().copy()
    gcld = ds.layout().copy()

    for pfile in partfiles[1:]:
        ds.load(pfile)
        gcld = gcld & ds.layout()

    # keep some stats about which descriptors got removed and the reason why before throwing
    # away the original history and simplifying it
    vldescs = set()
    nandescs = set()

    # now that we have our GCLD, transform all the chunks so they have the same layout (our GCLD)
    # and simplify their histories so that they also have the same history (the minimum history
    # required to arrive at this target layout).
    for pfile in partfiles:
        ds.load(pfile)

        for t in ds.history().toPython():
            tname = t['Analyzer name']
            descs = t['Applier parameters']['descriptorNames']
            if   tname == 'cleaner':  nandescs.update(descs)
            elif tname == 'removevl': vldescs.update(descs)

        toremove = ds.layout().differenceWith(gcld)
        if toremove:
            ds = transform(ds, 'remove', { 'descriptorNames': toremove })

        ds.simplifyHistory()
        ds.save(pfile)

    # also get the other descriptors that got removed (because of a select or remove transfo)
    rdescs = set(origLayout.differenceWith(gcld)) - (vldescs | nandescs)

    return vldescs, nandescs, rdescs
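
A minimal usage sketch (hypothetical chunk files produced by a chunked analysis run; note that the function rewrites each file in place with the common layout):

    vldescs, nandescs, rdescs = harmonizeChunks(['chunk_000.db', 'chunk_001.db'])
    print('removed as variable-length:', sorted(vldescs))
    print('removed by the cleaner:', sorted(nandescs))
    print('removed by other transformations:', sorted(rdescs))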
Example #42
def highlevel_mosaic(target, tcorpus, scorpus, scope=5):
    """
        This will be used to test the highlevel mosaicing process.
        The scope variable controls the number of results which are returned 
        for each target unit which is sought.

    """
    # Create a temporary file for the mosaic audio
    filepath = os.path.join(os.getcwd(), 'temp_mosaic.wav')
    if os.path.isfile(filepath):
        os.remove(filepath)
    mosaic = Mosaic(filepath)
    cost = RepeatUnitCost()
    context = Context()
    gridder = Gridder()
    units = tcorpus.list_audio_units(audio_filename=target, chop='highlevel')
    hdb = scorpus.get_gaia_unit_db(chop='highlevel_%s' % self.chop)  # NOTE: 'self' is undefined in this standalone function
    distance = get_mood_distance(hdb)
    v = View(hdb, distance)
    results = {}
    for f in units:
        p = Point()
        p.load(switch_ext(f, '.yaml'))
        unit_name = switch_ext(os.path.basename(f), '')
        p.setName(unit_name)
        p_m = hdb.history().mapPoint(p)
        results.update({f:v.nnSearch(p_m).get(scope)})
    log.debug("Ok, now we have a dict with each target segment, along with its corresponding nearest matches in source db")
    log.debug("Check to see that we have every second of target audio accounted for - I think not!") 
    #return results
    #new_results = results.copy()
    ds = DataSet()
    for r in results:
        units = []
        for u in results[r]:
            ds.load(switch_ext(u[0], '.db'))
            for n in ds.pointNames():
                units.append(n)
        new_ds = gaia_transform(dict(zip(units, units)))
        results.update({r:new_ds})
    #return results
    # Very important - target units must be in correct order
    index = 0
    index_skip = 0
    for r in sorted(results.keys()):
        tds = DataSet()
        tds.load(switch_ext(r, '.db'))
        #return tds, results
        sds = results[r]
        source_set = set(sds.layout().descriptorNames())
        target_set = set(tds.layout().descriptorNames())
        remove_from_source = source_set.difference(target_set)
        remove_from_target = target_set.difference(source_set)
        if len(remove_from_source) > 0:
            log.debug("Will try to remove %s from the source DataSet" % remove_from_source)
            try:
                sds = transform(results[r], 'remove', {'descriptorNames':list(remove_from_source)})
            except Exception, e:
                log.error("Failed to remove %s from source DataSet" % list(remove_from_source))
                return results[r], tds
        if len(remove_from_target) > 0:
            log.debug("Will try to remove %s from the target DataSet" % remove_from_target)
            try:
                tds = transform(tds, 'remove', {'descriptorNames':list(remove_from_target)})
            except Exception, e:
                log.error("Failed to remove %s from target DataSet" % list(remove_from_target))
                return results[r], tds
Example #43
from gaia2 import DataSet, transform

def gaia_transform(points):
    """
        Takes a dict of point names and filepaths.
        Creates a DataSet and performs the standard transformations 
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:   
            ds = transform(ds, 'remove', desc)
        except Exception, e:
            log.error("Problem removing this descriptor: %s" % e)
    ds = transform(ds, 'normalize')
    return ds
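
# A minimal usage sketch (hypothetical point names and .sig file paths):
#   points = {'track1': '/data/track1.sig', 'track2': '/data/track2.sig'}
#   ds = gaia_transform(points)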

def get_unused_descriptors():
    """
        Gets some descriptors which are not commonly used in order to remove
        them from the analysis
    """
    for d in ['rhythm.beats_position', 'rhythm.bpm_estimates', 
                 'rhythm.bpm_intervals', 'rhythm.onset_times', 
                 'rhythm.rubato_start', 'rhythm.rubato_stop', 
                ]:
        yield {'descriptorNames': [d]}


def process_highlevel(corpus, filepath, chop):