def get_miximized_tracks(self, filenames):
    """Get list of tracks in ideal order.

    Every filename is queued for analysis and the method polls until the
    queue is empty. The tracks are then clustered using the scores reported
    by ``self.get_neighbours``.

    Returns a list of indices into ``filenames``, emitted cluster by cluster.
    """
    for track in filenames:
        self.queue.put((ADD, track))

    # Poll until the queue reports no pending items.
    while self.queue.qsize():
        print("waiting for analysis")
        sleep(10)

    encoded = [track.encode('utf-8') for track in filenames]
    dataset = DataSet()
    number_of_tracks = len(filenames)

    # Only names the database contains are added to the working dataset.
    for key in encoded:
        if self.gaia_db.contains(key):
            dataset.addPoint(self.gaia_db.point(key))
    dataset = self.transform(dataset)

    # matrix[a][b] is the score get_neighbours reported for b when queried
    # with a.
    matrix = {}
    for key in encoded:
        neighbours = self.get_neighbours(dataset, key, number_of_tracks)
        matrix[key] = dict((name, score) for score, name in neighbours)

    def distance(first, second):
        return matrix[first][second]

    clusterer = Clusterer(encoded, distance)
    clusterer.cluster()

    result = []
    for cluster in clusterer.clusters:
        for key in cluster:
            result.append(encoded.index(key))
    return result
def get_miximized_tracks(self, filenames):
    """Get list of tracks in ideal order.

    Queues the files for analysis, waits for the queue to empty, clusters
    the tracks by their neighbour scores and returns the positions of the
    clustered tracks within ``filenames``.
    """
    for path in filenames:
        self.queue.put((ADD, path))
    while self.queue.qsize():
        print("waiting for analysis")
        sleep(10)

    encoded = [path.encode('utf-8') for path in filenames]
    dataset = DataSet()
    number_of_tracks = len(filenames)
    for name in encoded:
        if not self.gaia_db.contains(name):
            # Names the database does not contain are left out.
            continue
        dataset.addPoint(self.gaia_db.point(name))
    dataset = self.transform(dataset)

    # Pairwise score table, one row per track.
    matrix = {
        name: {
            other: score
            for score, other in self.get_neighbours(
                dataset, name, number_of_tracks)
        }
        for name in encoded
    }

    clusterer = Clusterer(encoded, lambda f1, f2: matrix[f1][f2])
    clusterer.cluster()

    return [
        encoded.index(name)
        for cluster in clusterer.clusters
        for name in cluster
    ]
def testKullbackLeibler(self):
    # Runs a Kullback-Leibler nearest-neighbour search over a dataset made
    # of several renamed copies of the test database and checks the
    # returned distances.
    base = transform(testdata.loadTestDB(), 'fixlength')

    # creates a test with more than 1000 points otherwise the test is useless because
    # we split the workload in chunks of 1000 points when computing the distance
    enlarged = DataSet()
    ncopy = 20
    for copy_index in range(ncopy):
        batch = list(base.points())
        for point in batch:
            point.setName(point.name() + '-%d' % copy_index)
        enlarged.addPoints(batch)

    # test whether KL doesn't break with multithreading (did in 2.2.1)
    view = View(enlarged)
    metric = MetricFactory.create('kullbackleibler', enlarged.layout(),
                                  {'descriptorName': 'mfcc'})
    results = view.nnSearch(base.samplePoint(), metric).get(6 * ncopy)

    # Expected distances, each repeated (multiplier * ncopy) times.
    expected = []
    for distance, multiplier in ((0.0, 2),
                                 (6.1013755798339844, 1),
                                 (6.4808731079101562, 2),
                                 (6.7828292846679688, 1)):
        expected.extend([distance] * (multiplier * ncopy))

    for found, wanted in zip(results, expected):
        self.assertAlmostEqual(found[1], wanted, 5)
def train_SVM(dataset, groundTruth, descriptorNames, exclude=None, svmtype='c-svc', kernel='rbf', c=1, gamma=1):
    """Train an SVM on a copy of *dataset* and return a classifying function.

    The points are copied into a fresh DataSet, run through the 'normalize'
    transformation and then through 'svmtrain'.

    Parameters
    ----------
    dataset : dataset providing the training points
    groundTruth : object whose ``className`` names the class descriptor
    descriptorNames : passed as 'descriptorNames' to both transformations
    exclude : passed as 'except' to both transformations (default: empty list)
    svmtype, kernel, c, gamma : passed to the 'svmtrain' transformation

    Returns
    -------
    A function mapping a point to ``str`` of the value the resulting history
    assigns to ``groundTruth.className``.
    """
    # A None sentinel replaces the former shared mutable default ([]).
    if exclude is None:
        exclude = []

    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints(list(dataset.points()))

    ds = transform(ds, 'normalize', {'descriptorNames': descriptorNames,
                                     'except': exclude,
                                     'independent': True})

    ds = transform(ds, 'svmtrain', {'descriptorNames': descriptorNames,
                                    'except': exclude,
                                    'className': groundTruth.className,
                                    'type': svmtype,
                                    'kernel': kernel,
                                    'c': c,
                                    'gamma': gamma})

    h = ds.history()

    # className is looked up at call time, as before.
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
def train_svm_history(project, params, output_file_path):
    """Train an SVM for *project* and save the result to *output_file_path*.

    Reads the model settings from ``params["model"]``, loads the dataset
    ``<className>-<preprocessing>.db`` from the project's datasets
    directory, and calls ``train_svm`` with the configured type, kernel, C
    and gamma.

    Raises GaiaWrapperException when the configured classifier is not "svm".
    """
    model = params["model"]
    if model.get("classifier") != "svm":
        raise GaiaWrapperException(
            "Can only use this script on SVM config parameters.")

    ds = DataSet()
    db_name = "%s-%s.db" % (project["className"], model["preprocessing"])
    ds.load(os.path.join(project["datasetsDirectory"], db_name))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    svm_options = {
        "type": model["type"],
        "kernel": model["kernel"],
        "C": model["C"],
        "gamma": model["gamma"],
    }
    # doing the whole training
    history = train_svm(ds, gt, **svm_options)

    # A unicode path is encoded to UTF-8 bytes before saving.
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
def trainSVMHistory(configFilename, paramsFilename, outputHistoryFilename, className):
    """Train an SVM from a project config and save the resulting history.

    Parameters
    ----------
    configFilename : YAML project file; its directory is the base dir for
        the 'datasetsDirectory' entry
    paramsFilename : YAML file whose 'model' entry holds the SVM parameters
    outputHistoryFilename : where the trained history is saved
    className : if truthy, overrides the class name of the ground truth

    Raises
    ------
    Exception if the configured classifier is not 'svm'.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if these files are plain data.
    with open(configFilename) as configFile:
        config = yaml.load(configFile.read())
    with open(paramsFilename) as paramsFile:
        params = yaml.load(paramsFile.read())['model']

    if params.pop('classifier') != 'svm':
        raise Exception('Can only use this script on SVM config parameters.')

    preproc = params.pop('preprocessing')

    ds = DataSet()
    ds.load(join(split(configFilename)[0],  # base dir
                 config['datasetsDirectory'],  # datasets dir
                 '%s-%s.db' % (config['className'], preproc)))  # dataset name

    gt = GroundTruth.fromFile(config['groundtruth'])

    if className:
        gt.className = className

    # add 'highlevel.' in front of the descriptor, this is what will appear in the final Essentia sigfile
    gt.className = 'highlevel.' + gt.className

    # do the whole training; the remaining entries of params are the
    # keyword arguments of trainSVM
    h = trainSVM(ds, gt, **params)

    h.save(outputHistoryFilename)
def testMergePointsWithDifferentEnumerationMaps(self):
    # ticket #74: when changing the layout of a point, we must also make
    # sure that the enum maps are correctly mapped
    def make_point(name, label):
        point = Point()
        point.setName(name)
        point.setLayout(self.l1)
        point['d'] = label
        return point

    def check_labels(dataset):
        self.assertEqual(dataset.point('p1').label('d'), 'hello')
        self.assertEqual(dataset.point('p2').label('d'), 'world')

    first = make_point('p1', 'hello')
    second = make_point('p2', 'world')

    ds = DataSet()
    ds.addPoint(first)
    ds.addPoint(second)
    check_labels(ds)

    # Enumerate with only p1 present, then add p2 back and check that both
    # labels still read correctly.
    ds.removePoint('p2')
    ds = transform(ds, 'enumerate', {'descriptorNames': 'd'})
    ds.addPoint(second)
    check_labels(ds)
def initialize_gaia_db(self):
    """Load or initialize the gaia database.

    Returns the dataset loaded through ``self.load_gaia_db()`` when
    ``self.gaia_db_path`` is an existing file, otherwise a new empty
    DataSet. Prints the number of points either way.
    """
    if os.path.isfile(self.gaia_db_path):
        dataset = self.load_gaia_db()
        # Set only when an existing database was loaded.
        self.transformed = True
    else:
        dataset = DataSet()

    song_count = dataset.size()
    print("songs in db: %d" % song_count)
    return dataset
def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Run every evaluation of *evalconfig* for one classifier configuration.

    Loads the ground truth from *gtname* and the dataset from *dsname*,
    drops ground-truth entries that have no point in the dataset, then for
    each entry ``i`` of *evalconfig* runs an n-fold evaluation and writes
    ``<outfilename>_<i>.param`` and ``<outfilename>_<i>.result``.
    Evaluations whose result file already exists are skipped.

    Any exception is logged together with the parameters and re-raised.
    """
    try:
        classifier = param['classifier']
        gt = GroundTruth(classifier)
        gt.load(gtname)

        # force the GroundTruth class name to be the one specified by our project file, not
        # the one in the original groundtruth file
        gt.className = className

        ds = DataSet()
        ds.load(dsname)

        # some points may have failed to be analyzed, remove those from the GroundTruth
        # (a set keeps each membership test O(1) instead of scanning all names)
        pnames = set(ds.pointNames())
        for pid in list(gt.keys()):
            if pid not in pnames:
                log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
                del gt[pid]

        # NOTE(review): newds is never used below; the evaluation runs on ds.
        trainerFun, trainingparam, newds = getTrainer(classifier, param, ds)

        # run all the evaluations specified in the evaluation config
        for i, evalparam in enumerate(evalconfig):
            # if we already ran this evaluation, no need to run it again...
            resultFilename = outfilename + '_%d.result' % i
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info('Running evaluation %d for: %s with classifier %s and dataset %s' % (
                i, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], ds, gt, trainerFun, **trainingparam)

            # write evaluation params & result
            with open(outfilename + '_%d.param' % i, 'w') as f:
                yaml.dump({'model': param, 'evaluation': evalparam}, f)

            confusion.save(resultFilename)

    except Exception:
        log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
        raise
def testAddToDataSetWithDifferentLayouts(self):
    # Adding p2 (which never gets self.l1 as its layout) to a dataset that
    # already holds p1 must raise, and the layout reference counts must
    # match the values asserted below.
    # NOTE(review): the counts depend on the exact temporaries created
    # here; do not introduce extra variables holding a layout.
    p1 = Point()
    p1.setLayout(self.l1) # +1, ref = 2
    p2 = Point()

    ds = DataSet()
    ds.addPoint(p1) # +2 (dataset+pointcopy), ref = 4
    self.assertRaises(Exception, ds.addPoint, p2)
    self.assertEqual(p1.layout().ref(), 4)
    self.assertEqual(p2.layout().ref(), 1)
def loadTestDB():
    """Load TEST_DATABASE and apply the transformations the flags enable.

    ``fixLength`` is applied when ``useFixedLength`` is set, then
    ``enumerateStrings`` (excluding 'chords_progression_hash.value') when
    ``useEnumerate`` is set.
    """
    # The module-level flags are only read here, so no ``global``
    # declaration is needed.
    dataset = DataSet()
    dataset.load(TEST_DATABASE)

    if useFixedLength:
        dataset = fixLength(dataset)
    if useEnumerate:
        dataset = enumerateStrings(dataset,
                                   exclude='chords_progression_hash.value')

    return dataset
def loadSmallDB():
    """Load TEST_SMALLDB and apply the transformations the flags enable.

    ``fixLength`` is applied when ``useFixedLength`` is set, then
    ``enumerateStrings`` when ``useEnumerate`` is set.
    """
    # The module-level flags are only read, never assigned.
    small = DataSet()
    small.load(TEST_SMALLDB)

    if useFixedLength:
        small = fixLength(small)
    if useEnumerate:
        small = enumerateStrings(small)

    return small
def loadGaia20DB():
    """Load GAIA20_DB and apply the transformations the flags enable.

    ``fixLength`` is applied when ``useFixedLength`` is set, then
    ``enumerateStrings`` when ``useEnumerate`` is set.
    """
    # The module-level flags are only read, never assigned.
    loaded = DataSet()
    loaded.load(GAIA20_DB)

    if useFixedLength:
        loaded = fixLength(loaded)
    if useEnumerate:
        loaded = enumerateStrings(loaded)

    return loaded
def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Evaluate one classifier configuration against every evaluation setup.

    Parameters
    ----------
    className : class name forced onto the loaded ground truth
    outfilename : prefix of the ``_<i>.param`` / ``_<i>.result`` output files
    param : model parameters; 'classifier' and 'preprocessing' are read here
    dsname : file the dataset is loaded from
    gtname : file the ground truth is loaded from
    evalconfig : sequence of evaluation settings, each providing 'nfold'

    An evaluation is skipped when its result file already exists. Any
    exception is logged with the parameters and re-raised.
    """
    try:
        classifier = param['classifier']
        gt = GroundTruth(classifier)
        gt.load(gtname)

        # force the GroundTruth class name to be the one specified by our project file, not
        # the one in the original groundtruth file
        gt.className = className

        ds = DataSet()
        ds.load(dsname)

        # some points may have failed to be analyzed, remove those from the GroundTruth
        # Membership is tested against a set so pruning stays linear.
        known_points = set(ds.pointNames())
        missing = [pid for pid in gt.keys() if pid not in known_points]
        for pid in missing:
            log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
            del gt[pid]

        # NOTE(review): newds is unpacked but unused; ds is what gets evaluated.
        trainerFun, trainingparam, newds = getTrainer(classifier, param, ds)

        # run all the evaluations specified in the evaluation config
        for i, evalparam in enumerate(evalconfig):
            # if we already ran this evaluation, no need to run it again...
            resultFilename = outfilename + '_%d.result' % i
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info('Running evaluation %d for: %s with classifier %s and dataset %s' % (
                i, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], ds, gt, trainerFun, **trainingparam)

            # write evaluation params & result
            with open(outfilename + '_%d.param' % i, 'w') as f:
                yaml.dump({'model': param, 'evaluation': evalparam}, f)

            confusion.save(resultFilename)

    except Exception:
        log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
        raise
def createSimpleDataSet():
    """Build a DataSet holding one point 'p' with the simple layout.

    The layout comes from ``createSimpleLayout()``. ``fixLength`` and
    ``enumerateStrings`` are applied when the corresponding module flags
    are set.
    """
    # The module-level flags are only read, never assigned.
    layout = createSimpleLayout()

    dataset = DataSet()
    point = Point()
    point.setName('p')
    point.setLayout(layout)
    dataset.addPoint(point)

    if useFixedLength:
        dataset = fixLength(dataset)
    if useEnumerate:
        dataset = enumerateStrings(dataset)

    return dataset
def __init__(self):
    """Set up empty state, resolve the dataset path and load the dataset."""
    # Nothing is built yet: no view, no metrics.
    self.view = None
    self.metrics = {}

    self.index_path = INDEX_DIR
    self.original_dataset = DataSet()
    self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)

    # Must run last: every attribute above is in place by now.
    self.__load_dataset()
def testValues(self):
    # Merges the signature files listed in TEST_DATABASE_FILES, checks a
    # few known values and labels, then saves the merged dataset as
    # TEST_DATABASE.

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if this file is plain data.
    with open(testdata.TEST_DATABASE_FILES, 'r') as listing:
        collection = yaml.load(listing.read())

    # prepend 'data/' to the filenames
    for pid, filename in list(collection.items()):
        collection[pid] = 'data/' + filename

    # Silence the merge, restoring verbosity even if the merge raises.
    cvar.verbose = False
    try:
        ds = DataSet.mergeFiles(collection)
    finally:
        cvar.verbose = True

    self.assertAlmostEqual(ds.point('Panic-The Smiths.mp3').value('danceability'),
                           0.5691167712)

    self.assertAlmostEqual(ds.point('11 Go.mp3').value('energy.mean'),
                           0.0231081359)

    self.assertAlmostEqual(ds.point('03 The Chopper [Shy FX Remix].mp3').value('chords_number_rate'),
                           0.0551007539)

    self.assertEqual(ds.point('08 I Can\'t Dance - Genesis.mp3').label('key_key'),
                     'D#')

    self.assertEqual(ds.point('06 Booo!.mp3').label('chords_mode'),
                     'major')

    ds.save(testdata.TEST_DATABASE)
def train_svm_history(project, params, output_file_path):
    """Train an SVM for *project* and save the result to *output_file_path*.

    ``params["model"]`` must contain 'classifier' (which has to be "svm")
    and 'preprocessing'. Every other entry is forwarded to ``train_svm`` as
    a keyword argument.

    Raises GaiaWrapperException when the configured classifier is not "svm".
    """
    # Work on a copy: popping from the caller's dict would strip
    # 'classifier' and 'preprocessing' from it and break a second call with
    # the same params.
    params_model = dict(params["model"])

    if params_model.pop("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    ds.load(os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], params_model.pop("preprocessing"))
    ))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    history = train_svm(ds, gt, **params_model)  # doing the whole training
    history.save(output_file_path)
def __init__(self, indexing_only_mode=False):
    """Set up empty state, pick the dataset path for the mode and load it.

    The dataset path is derived from INDEXING_SERVER_INDEX_NAME when
    *indexing_only_mode* is set, and from INDEX_NAME otherwise.
    """
    self.indexing_only_mode = indexing_only_mode
    self.index_path = INDEX_DIR
    self.original_dataset = DataSet()
    self.pca_dataset = DataSet()

    if self.indexing_only_mode:
        index_name = INDEXING_SERVER_INDEX_NAME
    else:
        index_name = INDEX_NAME
    self.original_dataset_path = self.__get_dataset_path(index_name)

    # Filled in later; nothing is computed at construction time.
    self.descriptor_names = {}
    self.metrics = {}
    self.view = self.view_pca = None
    self.transformations_history = None

    self.__load_dataset()
def train_SVM(dataset, groundTruth, descriptorNames, exclude=None, svmtype='c-svc', kernel='rbf', c=1, gamma=1):
    """Train an SVM on a history-free copy of *dataset*.

    The copy is normalized ('normalize' transformation, independent mode)
    and then trained ('svmtrain' transformation) on *descriptorNames*
    except *exclude*.

    Returns a function that maps a point through the resulting history and
    returns ``str`` of its ``groundTruth.className`` value.
    """
    # The default used to be a shared list literal; use a fresh list per
    # call instead.
    excluded = [] if exclude is None else exclude

    # recreate a copy of the given dataset without history
    ds = DataSet()
    ds.addPoints(list(dataset.points()))

    normalize_params = {
        'descriptorNames': descriptorNames,
        'except': excluded,
        'independent': True,
    }
    ds = transform(ds, 'normalize', normalize_params)

    svm_params = {
        'descriptorNames': descriptorNames,
        'except': excluded,
        'className': groundTruth.className,
        'type': svmtype,
        'kernel': kernel,
        'c': c,
        'gamma': gamma,
    }
    ds = transform(ds, 'svmtrain', svm_params)

    h = ds.history()

    # className is read when the classifier is called, as before.
    return lambda p: str(h.mapPoint(p)[groundTruth.className])
def train_svm_history(project, params, output_file_path):
    """Train an SVM from the project settings and save it.

    The model settings are read (not modified) from ``params["model"]``.
    Raises GaiaWrapperException unless its classifier is "svm".
    """
    settings = params["model"]
    classifier = settings.get("classifier")
    if classifier != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    ds = DataSet()
    dataset_file = os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], settings["preprocessing"]),
    )
    ds.load(dataset_file)

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    # doing the whole training
    training_kwargs = dict(
        (key, settings[key]) for key in ("type", "kernel", "C", "gamma"))
    history = train_svm(ds, gt, **training_kwargs)

    # A unicode path is saved under its UTF-8 encoded form.
    target = output_file_path
    if isinstance(target, unicode):
        target = target.encode("utf-8")
    history.save(target)
def evaluateNfold(nfold, dataset, groundTruth, trainingFunc, *args, **kwargs):
    """Evaluate the classifier on the given dataset and returns the confusion matrix.

    The evaluation is performed using n-fold cross validation.
    Uses only the points that are in the groundTruth parameter for the evaluation.

    Parameters
    ----------

    nfold        : the number of folds to use for the cross-validation
    dataset      : the dataset from which to get the points
    groundTruth  : a map from the points to classify to their respective class
    trainingFunc : a function which will train and return a classifier given a dataset,
                   the groundtruth, and the *args and **kwargs arguments

    Returns
    -------

    The confusion accumulated over all folds by ``evaluate``, or None when
    no fold is run.
    """
    log.info('Doing %d-fold cross validation' % nfold)
    classes = set(groundTruth.values())
    progress = TextProgress(nfold, 'Evaluating fold %(current)d/%(total)d')

    # get map from class to point names
    iclasses = {}
    for c in classes:
        iclasses[c] = [p for p in groundTruth.keys() if groundTruth[p] == c]
        random.shuffle(iclasses[c])

    # get folds
    folds = {}
    for i in range(nfold):
        folds[i] = []
        for c in iclasses.values():
            # -1/+1 so we take all instances into account, last fold might have fewer instances
            foldsize = (len(c) - 1) // nfold + 1
            folds[i] += c[foldsize * i:foldsize * (i + 1)]

    # build sub-datasets and run evaluation on them
    confusion = None
    pnames = [p.name() for p in dataset.points()]

    for i in range(nfold):
        if log.isEnabledFor(logging.INFO):
            progress.update(i + 1)

        # Set lookup avoids rescanning the fold list for every point name.
        held_out = set(folds[i])

        trainds = DataSet()
        trainds.addPoints([dataset.point(pname) for pname in pnames if pname not in held_out])
        traingt = GroundTruth(groundTruth.className,
                              dict((p, c) for p, c in groundTruth.items() if p not in held_out))

        testds = DataSet()
        testds.addPoints([dataset.point(str(pname)) for pname in folds[i]])
        testgt = GroundTruth(groundTruth.className,
                             dict((p, c) for p, c in groundTruth.items() if p in held_out))

        classifier = trainingFunc(trainds, traingt, *args, **kwargs)
        confusion = evaluate(classifier, testds, testgt, confusion, verbose=False)

    return confusion
def transformDataSet(inputFilename, outputFilename, transfoFile = None):
    """Apply the list of transformations given as a yaml sequence to the specified dataset.

    The dataset is loaded from *inputFilename* and the result saved to
    *outputFilename*. When *transfoFile* is None the built-in chain
    (removevl, fixlength, cleaner) is used; otherwise the chain is read
    from that file.
    """
    # Single-argument parenthesized form prints the same text under both
    # Python 2 and Python 3.
    print('Preprocessing dataset chunk for %s...' % outputFilename)
    gaia2.cvar.verbose = False

    transfoList = '''
    - transfo: removevl
    - transfo: fixlength
    - transfo: cleaner
    '''

    if transfoFile is not None:
        # Close the file once its contents have been read.
        with open(transfoFile) as f:
            transfoList = f.read()

    ds = DataSet()
    ds.load(inputFilename)

    ds = applyTransfoChain(ds, transfoList)

    ds.save(outputFilename)
def createDataSet():
    """Build a two-point DataSet with a single real descriptor 'a'.

    ``testdata.fixLength`` / ``testdata.enumerateStrings`` are applied when
    the corresponding ``testdata`` flags are set.
    """
    layout = PointLayout()
    layout.add('a', RealType)

    dataset = DataSet()

    fixtures = (
        ('p1', (0.0, 0.0)),  # p1.a = (0.0, 0.0)
        ('p2', (0.5, 1.0)),  # p2.a = (0.5, 1.0)
    )
    for name, coords in fixtures:
        point = Point()
        point.setName(name)
        point.setLayout(layout)
        point['a'] = coords
        dataset.addPoint(point)

    if testdata.useFixedLength:
        dataset = testdata.fixLength(dataset)
    if testdata.useEnumerate:
        dataset = testdata.enumerateStrings(dataset)

    return dataset
def gaia_transform(points):
    """
    Takes a dict of point names and filepaths.
    Creates a DataSet and performs the standard transformations:
    'fixlength', 'cleaner', then a 'remove' for each descriptor returned by
    get_unused_descriptors(). A failing 'remove' is logged and skipped.

    Returns the transformed DataSet.
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        # 'as' form is valid on Python 2.6+ and Python 3.
        except Exception as e:
            # Best effort: keep going with the remaining descriptors.
            log.error("Problem removing this descriptor: %s" % e)
    # NOTE(review): the visible code discarded the transformed dataset;
    # returning it makes the result usable — confirm against callers.
    return ds
def testComplexReferenceCounting(self):
    # Tracks the layout reference count through wrapping and copying a
    # layout, adding a point to a dataset and copy-constructing a point.
    # NOTE(review): each expected count includes the temporaries created by
    # the asserted expression itself (see the inline notes), so statements
    # must not be reordered or split into extra variables.
    ds = DataSet()
    self.assertEqual(ds.layout().ref(), 2) # 1 + 1 from temp object

    p = Point()
    p.setName('p1')
    lext = PointLayout(p.layout()) # +1, {lext,p}.ref = 2
    self.assertEqual(lext.ref(), 2)

    lext = p.layout().copy() # copy, lext.ref = 1; p.ref -= 1, = 1
    self.assertEqual(lext.ref(), 1)

    ds.addPoint(p) # +3 (dataset + pointcopy), ref = 3

    # The copied layout is asserted to still have a count of 1 here.
    self.assertEqual(lext.ref(), 1)
    self.assertEqual(ds.layout().ref(), 4) # 3 + 1 temp object

    p2 = Point(p) # +1, {p,p2}.ref = 5
    p2.setName('p2')
    self.assertEqual(ds.layout().ref(), 5)

    ds.addPoint(p2)
    self.assertEqual(ds.layout().ref(), 6) # +1 pointcopy, ref = 6
def PCA(x):
    """Run Gaia's 'pca' transformation over the rows of *x*.

    Each row becomes a point 'p<i>' with a real descriptor 'x'. The dataset
    is passed through 'fixlength' and then 'pca' with as many dimensions as
    the first row has entries.

    Returns the list of 'pca' descriptor values, one per point of the
    transformed dataset.
    """
    layout = PointLayout()
    layout.add('x', RealType)

    points = []
    for index, row in enumerate(x):
        point = Point()
        point.setName('p%d' % index)
        point.setLayout(layout)
        point['x'] = row
        points.append(point)

    ds = DataSet()
    ds.addPoints(points)

    ds = transform(ds, 'fixlength')
    pca_params = {'dimension': len(x[0]), 'resultName': 'pca'}
    ds = transform(ds, 'pca', pca_params)

    return [point['pca'] for point in ds.points()]
def PCA(x):
    """Return the 'pca' descriptors Gaia computes for the rows of *x*.

    The output dimension equals ``len(x[0])``.
    """
    def as_point(position, values, layout):
        # One point per input row, named after its position.
        point = Point()
        point.setName('p%d' % position)
        point.setLayout(layout)
        point['x'] = values
        return point

    row_layout = PointLayout()
    row_layout.add('x', RealType)
    rows = [as_point(i, row, row_layout) for i, row in enumerate(x)]

    dataset = DataSet()
    dataset.addPoints(rows)
    dataset = transform(dataset, 'fixlength')
    dataset = transform(dataset, 'pca',
                        {'dimension': len(x[0]), 'resultName': 'pca'})

    projected = []
    for point in dataset.points():
        projected.append(point['pca'])
    return projected
def readLibSVMDataSet(filename):
    """Read a LibSVM-style text file into a DataSet.

    Each non-blank line is split on whitespace: the first token is stored
    as the string descriptor 'class', the remaining ``dim:value`` tokens
    fill the real descriptor 'value' (absent dimensions stay 0.0). Points
    are named 'instance_<n>' with a zero-padded 6-digit counter.

    Blank lines are ignored.
    """
    # Read everything up front and close the file right away.
    with open(filename) as f:
        data = [line.split() for line in f if line.strip()]

    # Parse the features and find the index range in use. Both bounds start
    # at 1, so the range always includes index 1.
    minidx = maxidx = 1
    parsed = []
    for tokens in data:
        features = []
        for token in tokens[1:]:
            dim, value = token.split(':')
            dim = int(dim)
            features.append((dim, float(value)))
            minidx = min(minidx, dim)
            maxidx = max(maxidx, dim)
        parsed.append((tokens[0], features))

    dimension = maxidx - minidx + 1

    layout = PointLayout()
    layout.add('class', StringType)
    layout.add('value', RealType)

    ds = DataSet()
    points = []

    for n, (label, features) in enumerate(parsed):
        p = Point()
        p.setLayout(layout)
        p.setName('instance_%06d' % n)

        p['class'] = label
        desc = RealDescriptor(dimension, 0.0)
        for dim, value in features:
            # Shift so the smallest index lands at position 0.
            desc[dim - minidx] = value
        p['value'] = desc

        points.append(p)

    ds.addPoints(points)

    return ds
def testComplete(self):
    # load 2.0 dataset, history, apply history to dataset
    # check nn-search results are the same as the ones we get when doing it from gaia 2.0
    ds = DataSet()
    ds.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

    h = TransfoChain()
    # Loading the 2.0 history is currently expected to raise.
    self.assertRaises(Exception, h.load, testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)

    # NOTE(review): this unconditional return ends the test here, so
    # everything below is unreachable. It looks like the original
    # comparison is parked until the history can be loaded — confirm
    # before deleting.
    return

    h.load(testdata.GAIA_20_BACKWARDS_COMPAT_HISTORY)

    ds = h.mapDataSet(ds)
    v = View(ds)
    dist = MetricFactory.create('euclidean', ds.layout())

    results = v.nnSearch('01 Respect.mp3', dist).get(5)
    self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)

    ds21 = DataSet()
    ds21.load(testdata.TEST_DATABASE)
    results = v.nnSearch(h.mapPoint(ds21.point('01 Respect.mp3')), dist).get(5)
    self.compareResults(results, testdata.GAIA_20_BACKWARDS_COMPAT_RESULTS)
def createDataSet():
    """Build a six-point DataSet whose 'a' descriptors are 2-D coordinates.

    ``testdata.fixLength`` / ``testdata.enumerateStrings`` are applied when
    the corresponding ``testdata`` flags are set.
    """
    ds = DataSet()

    # (name, value of 'a'), with the angle α noted for each point
    coordinates = (
        ('p0', (0.0, 0.0)),    # α = undefined
        ('p1', (1.0, 0.0)),    # α = 0
        ('p2', (0.0, 1.0)),    # α = π/2
        ('p3', (-1.0, 0.0)),   # α = π
        ('p4', (1.0, 1.0)),    # α = π/4
        ('p5', (1.0, -1.0)),   # α = -π/4
    )

    points = []
    for name, value in coordinates:
        point = newPoint(name)
        point['a'] = value
        points.append(point)

    ds.addPoints(points)

    if testdata.useFixedLength:
        ds = testdata.fixLength(ds)
    if testdata.useEnumerate:
        ds = testdata.enumerateStrings(ds)

    return ds
def testDataSet(self):
    # load 2.0 dataset, check some values are correct
    legacy = DataSet()
    legacy.load(testdata.GAIA_20_BACKWARDS_COMPAT_DATASET)

    tempo = legacy.point('01 Message - Grandmaster Flash.mp3').value('tempotap_bpm')
    self.assertAlmostEqual(tempo, 101.05792999)

    key = legacy.point('04 Blue Skies.mp3').label('key_key')
    self.assertEqual(key, 'G#')
def testHistory(self):
    # Checks that mapping an original point through a transformed
    # dataset's history gives the same descriptors as the transformed
    # point, both before and after the dataset is saved and reloaded.
    ds = testdata.loadTestDB()
    ignored_descs = testdata.TEST_DATABASE_VARLENGTH_REAL

    testdata.resetSettings()
    ds_orig = testdata.loadTestDB()

    # cleaning, mandatory step
    ds = transform(ds, 'fixlength', {'except': ignored_descs})
    cleaned_db = transform(ds, 'cleaner', {'except': ignored_descs})

    # removing annoying descriptors, like mfcc.cov & mfcc.icov, who don't
    # like to be normalized like the other ones (constant value: dimension)
    no_mfcc_db = transform(cleaned_db, 'remove', {'descriptorNames': '*mfcc*'})

    # normalize, to have everyone change values
    normalized_db = transform(no_mfcc_db, 'normalize', {'except': ignored_descs})

    testPoints = [
        '01 Oye Como Va - Santana.mp3',
        '02 Carmen Burana- O Fortuna.mp3',
        '07 Romeo and Juliet- the Knights\' Dance.mp3',
        '11 Lambada.mp3',
    ]

    for pointName in testPoints:
        p1 = normalized_db.point(pointName)
        p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))

        for name in p1.layout().descriptorNames():
            self.assertEqual(p1[name], p2[name])

    (tmpFile, tmpName) = tempfile.mkstemp()
    os.close(tmpFile)
    try:
        normalized_db.save(tmpName)
        reloaded_db = DataSet()
        reloaded_db.load(tmpName)

        for pointName in testPoints:
            p1 = normalized_db.point(pointName)
            p2 = normalized_db.history().mapPoint(ds_orig.point(pointName))
            p3 = reloaded_db.point(pointName)
            p4 = reloaded_db.history().mapPoint(ds_orig.point(pointName))

            # assertTrue replaces the deprecated assert_ alias.
            self.assertTrue(p1.layout() == p2.layout())
            self.assertTrue(p2.layout() == p3.layout())
            self.assertTrue(p3.layout() == p4.layout())

            for name in p1.layout().descriptorNames():
                self.assertEqual(p1[name], p2[name])
                self.assertEqual(p2[name], p3[name])
                self.assertEqual(p3[name], p4[name])
    finally:
        # remove temp file, even when an assertion above fails
        os.remove(tmpName)
def testSecondChanceForLayoutEquality(self):
    '''ticket #21: points try to morph to adapt to dataset if they cannot be naturally inserted'''
    signatures = (
        ('Paris Hilton', 'data/04 - Cansei de Ser Sexy - Meeting Paris Hilton.mp3.sig'),
        ('2005', 'data/11_2005-fwyh.mp3.sig'),
    )

    ds = DataSet()

    # The same Point object is renamed, reloaded and added for each file.
    p = Point()
    for name, sigfile in signatures:
        p.setName(name)
        p.load(sigfile)
        ds.addPoint(p)

    title = ds.point('2005')['title']
    self.assertEqual(title, '2005')
def __init__(self):
    """Create the four empty datasets, clear views/metrics and load data."""
    # One empty DataSet per prefix: as, tag, fs and ac.
    self.as_dataset = DataSet()
    self.tag_dataset = DataSet()
    self.fs_dataset = DataSet()
    self.ac_dataset = DataSet()

    self.gaia_similiarity = None
    self.index_path = clust_settings.INDEX_DIR

    # No view or metric exists until something builds them.
    self.as_view = self.as_metric = None
    self.tag_view = self.tag_metric = None
    self.fs_view = self.fs_metric = None
    self.ac_view = self.ac_metric = None

    self.__load_datasets()
def transformDataSet(inputFilename, outputFilename, transfoFile=None):
    """Apply the list of transformations given as a yaml sequence to the specified dataset.

    Parameters
    ----------
    inputFilename : file the dataset is loaded from
    outputFilename : file the transformed dataset is saved to
    transfoFile : optional file holding the transformation chain; when
        None, the built-in chain (removevl, fixlength, cleaner) is used
    """
    print('Preprocessing dataset chunk for %s...' % outputFilename)
    gaia2.cvar.verbose = False

    transfoList = '''
    - transfo: removevl
    - transfo: fixlength
    - transfo: cleaner
    '''

    if transfoFile is not None:
        # The context manager closes the file as soon as it is read.
        with open(transfoFile) as f:
            transfoList = f.read()

    ds = DataSet()
    ds.load(inputFilename)

    ds = applyTransfoChain(ds, transfoList)

    ds.save(outputFilename)
class GaiaWrapper:
    """Wraps a Gaia dataset stored on disk and exposes indexing and similarity search on it.

    The wrapper keeps the "original" dataset plus a PCA-transformed dataset that
    references it. Views and distance metrics are only built once the dataset holds
    at least ``sim_settings.SIMILARITY_MINIMUM_POINTS`` points and the wrapper is not
    in indexing-only mode. Public methods that answer requests return dictionaries
    with an ``'error'`` flag and a ``'result'`` (plus ``'status_code'`` on error).
    """

    def __init__(self, indexing_only_mode=False):
        """Initialise empty state and load (or create) the dataset on disk.

        :param indexing_only_mode: when True the dataset named
            ``sim_settings.INDEXING_SERVER_INDEX_NAME`` is used and no views or
            metrics are built.
        """
        self.indexing_only_mode = indexing_only_mode
        self.index_path = sim_settings.INDEX_DIR
        self.original_dataset = DataSet()
        self.pca_dataset = DataSet()
        if not self.indexing_only_mode:
            self.original_dataset_path = self.__get_dataset_path(sim_settings.INDEX_NAME)
        else:
            self.original_dataset_path = self.__get_dataset_path(sim_settings.INDEXING_SERVER_INDEX_NAME)
        self.descriptor_names = {}
        self.metrics = {}
        self.view = None
        self.view_pca = None
        self.transformations_history = None

        self.__load_dataset()

    def __get_dataset_path(self, ds_name):
        """Return the path of the ``.db`` file for dataset ``ds_name`` inside the index directory."""
        return os.path.join(sim_settings.INDEX_DIR, ds_name + '.db')

    def __build_views_and_metrics(self):
        """Build preset metrics, the Gaia view, the PCA dataset and the PCA view/metric.

        Shared by ``__load_dataset()`` and ``add_point()``.
        """
        # Build metrics for the different similarity presets, create a Gaia view
        self.__build_metrics()
        self.view = View(self.original_dataset)

        # Compute PCA and create pca view and metric
        # NOTE: this step may take a long time if the dataset is big, but it only needs to be performed once
        # when the similarity server is loaded.
        self.pca_dataset = transform(self.original_dataset, 'pca', {
            'descriptorNames': sim_settings.PCA_DESCRIPTORS,
            'dimension': sim_settings.PCA_DIMENSIONS,
            'resultName': 'pca'
        })
        self.pca_dataset.setReferenceDataSet(self.original_dataset)
        self.view_pca = View(self.pca_dataset)
        self.__build_pca_metric()

    def __load_dataset(self):
        """
        Loads the dataset, does all the necessary steps to make it available for similarity queries and creates the PCA
        version of it. If dataset does not exist, creates a new empty one.
        NOTE: we assume that loaded datasets will have been prepared and normalized (see
        __prepare_original_dataset() and __normalize_original_dataset()) on due time (see add_point() method below),
        therefore this function does not prepare or normalize loaded datasets.
        """
        if not os.path.exists(sim_settings.INDEX_DIR):
            os.makedirs(sim_settings.INDEX_DIR)

        # load original dataset
        if os.path.exists(self.original_dataset_path):
            self.original_dataset.load(self.original_dataset_path)
            self.__calculate_descriptor_names()

            if self.original_dataset.size() >= sim_settings.SIMILARITY_MINIMUM_POINTS \
                    and not self.indexing_only_mode:
                # Save transformation history so we do not need to compute it every time we need it
                self.transformations_history = self.original_dataset.history().toPython()
                self.__build_views_and_metrics()

            if self.original_dataset.history().size() <= 0:
                logger.info('Dataset loaded, size: %s points' % (self.original_dataset.size()))
            else:
                logger.info(
                    'Dataset loaded, size: %s points (%i fixed-length desc., %i variable-length desc.)' %
                    (self.original_dataset.size(),
                     len(self.descriptor_names['fixed-length']),
                     len(self.descriptor_names['variable-length'])))
        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be possible until
            # the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            self.__calculate_descriptor_names()
            logger.info('Created new dataset, size: %s points (should be 0)' % (self.original_dataset.size()))

    def __prepare_original_dataset(self):
        """Run the preparation transforms on the original dataset and refresh descriptor names."""
        logger.info('Preparing the original dataset.')
        self.original_dataset = self.prepare_original_dataset_helper(self.original_dataset)
        self.__calculate_descriptor_names()

    def __normalize_original_dataset(self):
        """Normalize the fixed-length descriptors of the original dataset."""
        logger.info('Normalizing the original dataset.')
        self.original_dataset = self.normalize_dataset_helper(
            self.original_dataset, self.descriptor_names['fixed-length'])

    def __calculate_descriptor_names(self):
        """Classify the layout's descriptor names and store them in ``self.descriptor_names``.

        The resulting dict has the keys 'all', 'fixed-length', 'variable-length' and
        'multidimensional' (fixed-length descriptors whose dimension is > 1).
        """
        layout = self.original_dataset.layout()
        all_descriptor_names = layout.descriptorNames()
        fixed_length_descritpor_names = []
        variable_length_descritpor_names = []
        multidimensional_descriptor_names = []

        for name in all_descriptor_names:
            region = layout.descriptorLocation(name)
            if region.lengthType() == VariableLength:
                variable_length_descritpor_names.append(name)
            else:
                fixed_length_descritpor_names.append(name)
                try:
                    if region.dimension() > 1:
                        multidimensional_descriptor_names.append(name)
                except Exception:
                    # TODO: exception too broad here...
                    pass

        self.descriptor_names = {
            'all': all_descriptor_names,
            'fixed-length': fixed_length_descritpor_names,
            'variable-length': variable_length_descritpor_names,
            'multidimensional': multidimensional_descriptor_names
        }

    @staticmethod
    def prepare_original_dataset_helper(ds):
        """Return ``ds`` after the 'FixLength', 'Cleaner' and (best effort) 'enumerate' transforms."""
        # Needed to optimize use of fixed-length descriptors and save memory
        ds = transform(ds, 'FixLength')
        # Remove descriptors that will cause problems in further transformations
        ds = transform(ds, 'Cleaner')
        try:
            ds = transform(ds, 'enumerate', {'descriptorNames': ['.tonal.chords_progression']})
        except Exception:
            # TODO: exception too broad here...
            logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')
        return ds

    @staticmethod
    def normalize_dataset_helper(ds, descriptor_names):
        """Return ``ds`` with the 'normalize' transform applied to ``descriptor_names``."""
        # NOTE: The "except" list of descriptors below should be reviewed if a new extractor is used. The point is to
        # remove descriptors can potentially break normalize transform (e.g. descriptors with value = 0)
        normalization_params = {
            "descriptorNames": descriptor_names,
            "except": [
                "*.min",
                "*.max",
                "tonal.chords_histogram",
            ],
            "independent": True,
            "outliers": -1
        }
        ds = transform(ds, 'normalize', normalization_params)
        return ds

    def __build_metrics(self):
        """Create one distance metric per preset (except 'pca') from its yaml preset file."""
        for preset in sim_settings.PRESETS:
            # PCA metric is built only after pca dataset is created so it should not be built here
            if preset != 'pca':
                logger.info('Bulding metric for preset %s' % preset)
                name = preset
                path = sim_settings.PRESET_DIR + name + ".yaml"
                # context manager so the preset file handle is closed after parsing
                with open(path) as preset_fid:
                    preset_file = yaml.safe_load(preset_fid)
                distance = preset_file['distance']['type']
                parameters = preset_file['distance']['parameters']
                search_metric = DistanceFunctionFactory.create(
                    str(distance), self.original_dataset.layout(), parameters)
                self.metrics[name] = search_metric

    def __build_pca_metric(self):
        """Create the 'pca' distance metric from pca.yaml using the PCA dataset layout."""
        logger.info('Bulding metric for preset pca')
        with open(sim_settings.PRESET_DIR + "pca.yaml") as preset_fid:
            preset_file = yaml.safe_load(preset_fid)
        distance = preset_file['distance']['type']
        parameters = preset_file['distance']['parameters']
        search_metric = DistanceFunctionFactory.create(str(distance), self.pca_dataset.layout(), parameters)
        self.metrics['pca'] = search_metric

    def __get_normalization_coeffs(self):
        """Return the 'coeffs' of a 'normalize' entry of the saved transformation history, or None.

        The history is scanned from its last entry to its first without stopping at
        the first match, so if several 'normalize' entries exist the coefficients of
        the earliest one are returned.
        NOTE(review): confirm whether the most recent normalization was intended.
        """
        coeffs = None
        # the history is None until the dataset has been prepared; treat that as "no entries"
        for transfo in reversed(self.transformations_history or []):
            if transfo['Analyzer name'] == 'normalize':
                coeffs = transfo['Applier parameters']['coeffs']
        return coeffs

    def add_point(self, point_location, point_name):
        """Load the analysis file at ``point_location`` and add it to the index as ``point_name``.

        An existing point with the same name is removed first. When the dataset size
        becomes exactly SIMILARITY_MINIMUM_POINTS the dataset is prepared, normalized
        and saved, and views and metrics are built (not in indexing-only mode).

        :return: ``{'error': False, 'result': msg}`` on success, or an error dict with
            ``'status_code'`` when the file does not exist or the point cannot be added.
        """
        if self.original_dataset.contains(str(point_name)):
            self.original_dataset.removePoint(str(point_name))

        p = Point()
        if os.path.exists(str(point_location)):
            try:
                p.load(str(point_location))
                p.setName(str(point_name))
                if self.original_dataset.size() <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                    # Add point to original_dataset because PCA dataset has not been created yet
                    self.original_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points.' % \
                          (str(point_name), self.original_dataset.size())
                    logger.info(msg)
                else:
                    # Add point to PCA dataset because it has been already created.
                    # PCA dataset will take care of adding the point to the original dataset as well.
                    self.pca_dataset.addPoint(p)
                    msg = 'Added point with name %s. Index has now %i points (pca index has %i points).' % \
                          (str(point_name), self.original_dataset.size(), self.pca_dataset.size())
                    logger.info(msg)
            except Exception as e:
                msg = 'Point with name %s could NOT be added (%s).' % (str(point_name), str(e))
                logger.info(msg)
                return {
                    'error': True,
                    'result': msg,
                    'status_code': sim_settings.SERVER_ERROR_CODE
                }
        else:
            msg = 'Point with name %s could NOT be added because analysis file does not exist (%s).' % \
                  (str(point_name), str(point_location))
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS:
            # Do enumerate
            try:
                self.original_dataset = transform(
                    self.original_dataset, 'enumerate', {'descriptorNames': ['.tonal.chords_progression']})
            except Exception:
                # TODO: exception too broad here...
                logger.info('WARNING: enumerate transformation to .tonal.chords_progression could not be performed.')

        # If when adding a new point we reach the minimum points for similarity, do the needful so that the dataset
        # can be used for search. This includes preparing the dataset, normalizing it, saving it and creating view and
        # distance metrics. This will only happen once when the size of the dataset reaches SIMILARITY_MINIMUM_POINTS.
        if self.original_dataset.size() == sim_settings.SIMILARITY_MINIMUM_POINTS \
                and not self.indexing_only_mode:
            self.__prepare_original_dataset()
            self.__normalize_original_dataset()
            self.transformations_history = self.original_dataset.history().toPython()
            self.save_index(msg="(reaching %i points)" % sim_settings.SIMILARITY_MINIMUM_POINTS)
            self.__build_views_and_metrics()

        return {'error': False, 'result': msg}

    def delete_point(self, point_name):
        """Remove ``point_name`` from the index; return an error dict if it is not present."""
        if self.original_dataset.contains(str(point_name)):
            if self.original_dataset.size() <= sim_settings.SIMILARITY_MINIMUM_POINTS:
                # Remove from original dataset
                self.original_dataset.removePoint(str(point_name))
            else:
                # Remove from pca dataset (pca dataset will take care of removing from original dataset too)
                self.pca_dataset.removePoint(str(point_name))
            logger.info('Deleted point with name %s. Index has now %i points (pca index has %i points).' %
                        (str(point_name), self.original_dataset.size(), self.pca_dataset.size()))
            return {'error': False, 'result': True}
        else:
            msg = 'Can\'t delete point with name %s because it does not exist.' % str(point_name)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }

    def get_point(self, point_name):
        """Return the Gaia point named ``point_name``, or None when it is not in the dataset."""
        logger.info('Getting point with name %s' % str(point_name))
        if self.original_dataset.contains(str(point_name)):
            return self.original_dataset.point(str(point_name))

    def get_all_point_names(self):
        """Return all point names converted to int and sorted, wrapped in a result dict."""
        point_names = sorted([int(name) for name in self.original_dataset.pointNames()])
        logger.info('Getting all point names (%i points)' % len(point_names))
        return {'error': False, 'result': point_names}

    def save_index(self, filename=None, msg=""):
        """Save the original dataset to disk and return the path used in a result dict.

        :param filename: optional base name; when given the dataset is saved to
            ``sim_settings.INDEX_DIR + filename + ".db"`` instead of the default path.
        :param msg: extra text appended to the "Saving index" log line.
        """
        tic = time.time()
        path = self.original_dataset_path
        if filename:
            path = sim_settings.INDEX_DIR + filename + ".db"
        logger.info('Saving index to (%s)...' % path + msg)
        self.original_dataset.save(path)
        toc = time.time()
        logger.info('Finished saving index (done in %.2f seconds, index has now %i points).' %
                    ((toc - tic), self.original_dataset.size()))
        return {'error': False, 'result': path}

    def contains(self, point_name):
        """Return a result dict telling whether the dataset contains ``point_name``."""
        logger.info('Checking if index has point with name %s' % str(point_name))
        return {
            'error': False,
            'result': self.original_dataset.contains(point_name)
        }

    def get_sounds_descriptors(self, point_names, descriptor_names=None, normalization=True,
                               only_leaf_descriptors=False):
        """
        Returns a dict (inside a result dict) mapping each requested point name to its descriptor
        values. Points whose lookup yields an error are left out of the result.

        :param descriptor_names: names to return (a leading '.' is added if missing); all
            descriptors when empty.
        :param normalization: when False, values are converted back with the normalization
            coefficients found in the transformation history.
        :param only_leaf_descriptors: forwarded to the descriptor name expansion.
        """
        logger.info('Getting descriptors for points %s' % ','.join([str(name) for name in point_names]))

        # Add dot '.' at the beginning of descriptor names if not present
        if descriptor_names:
            descriptor_names_aux = list()
            for name in descriptor_names:
                if name[0] != '.':
                    descriptor_names_aux.append('.' + name)
                else:
                    descriptor_names_aux.append(name)
            descriptor_names = descriptor_names_aux[:]

        data = dict()
        required_descriptor_names = self.__calculate_complete_required_descriptor_names(
            descriptor_names, only_leaf_descriptors=only_leaf_descriptors)
        if isinstance(required_descriptor_names, dict):
            return required_descriptor_names  # There has been an error

        for point_name in point_names:
            sound_descriptors = self.__get_point_descriptors(point_name, required_descriptor_names, normalization)
            if 'error' not in sound_descriptors:
                data[point_name] = sound_descriptors

        return {'error': False, 'result': data}

    def __calculate_complete_required_descriptor_names(self, descriptor_names, only_leaf_descriptors=False):
        """Expand ``descriptor_names`` into a list of full descriptor names.

        Names that have nested descriptors in the layout are replaced by their nested
        names. Returns a list without duplicates, or an error dict if anything fails.
        """
        if not descriptor_names:
            descriptor_names = self.descriptor_names['all'][:]
        try:
            structured_layout = generate_structured_dict_from_layout(self.descriptor_names['all'][:])
            processed_descriptor_names = []
            for name in descriptor_names:
                nested_descriptors = get_nested_dictionary_value(name.split('.')[1:], structured_layout)
                if not nested_descriptors:
                    processed_descriptor_names.append(name)
                else:
                    if only_leaf_descriptors:
                        # only return descriptors if nested descriptors are statistics
                        if len(set(nested_descriptors.keys()).intersection(
                                ['min', 'max', 'dvar2', 'dmean2', 'dmean', 'var', 'dvar', 'mean'])) > 0:
                            for extra_name in nested_descriptors.keys():
                                processed_descriptor_names.append('%s.%s' % (name, extra_name))
                    else:
                        # Return all nested descriptor names
                        extra_names = []
                        get_nested_descriptor_names(nested_descriptors, extra_names)
                        for extra_name in extra_names:
                            processed_descriptor_names.append('%s.%s' % (name, extra_name))
            processed_descriptor_names = list(set(processed_descriptor_names))
            return processed_descriptor_names
        except Exception:
            return {
                'error': True,
                'result': 'Wrong descriptor names, unable to create layout.',
                'status_code': sim_settings.BAD_REQUEST_CODE
            }

    def __get_point_descriptors(self, point_name, required_descriptor_names, normalization=True):
        """Return a nested dict with the values of ``required_descriptor_names`` for one point.

        When ``normalization`` is False, normalization coefficients are taken from the
        transformation history and each value ``v`` is converted with ``(v - b) / a``.
        Descriptors that cannot be read as a value are read as a label; if that fails
        too they are set to None. Returns an error dict if the point does not exist.
        """
        normalization_coeffs = None
        if not normalization:
            normalization_coeffs = self.__get_normalization_coeffs()

        required_layout = generate_structured_dict_from_layout(required_descriptor_names)
        try:
            p = self.original_dataset.point(str(point_name))
        except Exception:
            return {
                'error': True,
                'result': 'Sound does not exist in gaia index.',
                'status_code': sim_settings.NOT_FOUND_CODE
            }

        for descriptor_name in required_descriptor_names:
            try:
                value = p.value(str(descriptor_name))
                if normalization_coeffs:
                    if descriptor_name in normalization_coeffs:
                        a = normalization_coeffs[descriptor_name]['a']
                        b = normalization_coeffs[descriptor_name]['b']
                        if len(a) == 1:
                            value = float(value - b[0]) / a[0]
                        else:
                            normalized_value = []
                            for i in range(0, len(a)):
                                normalized_value.append(float(value[i] - b[i]) / a[i])
                            value = normalized_value
            except Exception:
                try:
                    value = p.label(str(descriptor_name))
                except Exception:
                    value = None

            if descriptor_name[0] == '.':
                descriptor_name = descriptor_name[1:]
            set_nested_dictionary_value(descriptor_name.split('.'), required_layout, value)
        return required_layout

    # SIMILARITY SEARCH and CONTENT SEARCH
    def search_dataset(self, query_point, number_of_results, preset_name, offset=0):
        """Nearest-neighbour search for an existing point using the metric of ``preset_name``.

        :return: ``{'error': False, 'result': {'results': ..., 'count': ...}}`` or an error
            dict when the dataset is too small or the query point does not exist.
        """
        preset_name = str(preset_name)
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        query_point = str(query_point)
        logger.info('NN search for point with name %s (preset = %s)' % (query_point, preset_name))

        if not self.original_dataset.contains(query_point):
            msg = "Sound with id %s doesn't exist in the dataset." % query_point
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.NOT_FOUND_CODE
            }

        if preset_name == 'pca':
            # Search on PCA view
            search = self.view_pca.nnSearch(query_point, self.metrics[preset_name])
        else:
            # Search on original dataset view
            search = self.view.nnSearch(query_point, self.metrics[preset_name])
        results = search.get(int(number_of_results), offset=int(offset))
        count = search.size()

        return {'error': False, 'result': {'results': results, 'count': count}}

    def api_search(self, target_type, target, filter, preset_name, metric_descriptor_names, num_results, offset,
                   in_ids):
        """Similarity/content search with an optional target, filter and id restriction.

        :param target_type: 'sound_id', 'descriptor_values' or 'file'.
        :param target: the point name, a dict of descriptor values, or the parsed analysis
            file, depending on ``target_type``; an empty target searches from an empty point.
        :param filter: filter description passed to ``parse_filter_list`` (may be empty).
        :param preset_name: key of ``self.metrics``; 'pca' searches on the PCA view.
        :param metric_descriptor_names: if given, a euclidean metric on these descriptors
            replaces the preset metric.
        :param in_ids: optional list of point ids (strings) the results are restricted to.
        :return: result dict with 'results', 'count' and 'note', or an error dict.
        """
        # Check if index has sufficient points
        size = self.original_dataset.size()
        if size < sim_settings.SIMILARITY_MINIMUM_POINTS:
            msg = 'Not enough datapoints in the dataset (%s < %s).' % (size, sim_settings.SIMILARITY_MINIMUM_POINTS)
            logger.info(msg)
            return {
                'error': True,
                'result': msg,
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        # Get some dataset parameters that will be useful later
        layout = self.original_dataset.layout()
        pca_layout = self.pca_dataset.layout()
        # Get normalization coefficients
        coeffs = self.__get_normalization_coeffs()

        # Defined up front so the 'note' logic at the end never hits an unbound name
        # (e.g. target_type == 'file' with an empty target).
        target_file_parsing_type = '-'

        # Process target
        if target:
            if target_type == 'sound_id':
                query_point = str(target)
                if not self.original_dataset.contains(query_point):
                    msg = "Sound with id %s doesn't exist in the dataset and can not be set as similarity target." \
                          % query_point
                    logger.info(msg)
                    return {
                        'error': True,
                        'result': msg,
                        'status_code': sim_settings.NOT_FOUND_CODE
                    }
                else:
                    query = query_point

            elif target_type == 'descriptor_values':
                # Transform input params to the normalized feature space and add them to a query point
                # If there are no params specified in the target, the point is set as empty (probably random sounds
                # are returned)
                feature_names = []
                query = Point()
                query.setLayout(layout)
                try:
                    for param in target.keys():
                        # Only add numerical parameters. Non numerical ones (like key) are only used as filters
                        if param in coeffs.keys():
                            feature_names.append(str(param))
                            value = target[param]
                            if coeffs:
                                a = coeffs[param]['a']
                                b = coeffs[param]['b']
                                if len(a) == 1:
                                    norm_value = a[0] * value + b[0]
                                else:
                                    norm_value = []
                                    for i in range(0, len(a)):
                                        norm_value.append(a[i] * value[i] + b[i])
                                query.setValue(str(param), norm_value)
                            else:
                                query.setValue(str(param), value)
                except Exception:
                    return {
                        'error': True,
                        'result': 'Invalid target (descriptor values could not be correctly parsed)',
                        'status_code': sim_settings.BAD_REQUEST_CODE
                    }

                # Overwrite metric with present descriptors in target
                # NOTE(review): this metric is replaced again under "Set query metric" below.
                metric = DistanceFunctionFactory.create('euclidean', layout, {'descriptorNames': feature_names})

            elif target_type == 'file':
                # Target is specified as the attached file
                # Create a point with the data in 'descriptors_data' and search for it
                try:
                    # Try directly loading the file
                    p, query = Point(), Point()
                    p.loadFromString(yaml.dump(target))
                    if preset_name == 'pca':
                        query = self.pca_dataset.history().mapPoint(p)  # map point to pca dataset
                    else:
                        query = self.original_dataset.history().mapPoint(p)  # map point to original dataset
                    target_file_parsing_type = 'mapPoint'

                except Exception as e:
                    logger.info('Unable to create gaia point from uploaded file (%s). '
                                'Trying adding descriptors one by one.' % e)

                    # If does not work load descriptors one by one
                    try:
                        query = Point()

                        feature_names = []
                        get_nested_descriptor_names(target, feature_names)
                        feature_names = ['.%s' % item for item in feature_names]
                        nonused_features = []

                        for param in feature_names:
                            if param in coeffs.keys():
                                value = get_nested_dictionary_value(param[1:].split('.'), target)
                                if coeffs:
                                    try:
                                        a = coeffs[param]['a']
                                        b = coeffs[param]['b']
                                        if len(a) == 1:
                                            norm_value = a[0] * value + b[0]
                                        else:
                                            norm_value = []
                                            for i in range(0, len(a)):
                                                norm_value.append(a[i] * value[i] + b[i])
                                        query.setValue(str(param[1:]), norm_value)
                                    except Exception:
                                        nonused_features.append(param)
                                else:
                                    query.setValue(str(param[1:]), value)
                            else:
                                nonused_features.append(param)

                        if preset_name == 'pca':
                            query = self.pca_dataset.history().mapPoint(query)  # map point to pca dataset
                        else:
                            # Map the point rebuilt above: mapping ``p`` here would only repeat
                            # the direct-loading attempt that has just failed.
                            query = self.original_dataset.history().mapPoint(query)  # map point to original dataset
                        target_file_parsing_type = 'walkDict'

                    except Exception as e:
                        logger.info('Unable to create gaia point from uploaded file and adding descriptors one by '
                                    'one (%s)' % e)
                        return {
                            'error': True,
                            'result': 'Unable to create gaia point from uploaded file. Probably the '
                                      'file does not have the required layout. Are you using the '
                                      'correct version of Essentia\'s Freesound extractor?',
                            'status_code': sim_settings.SERVER_ERROR_CODE
                        }
        else:
            query = Point()  # Empty target
            if preset_name == 'pca':
                query.setLayout(pca_layout)
            else:
                query.setLayout(layout)

        # Process filter
        if filter:
            filter = parse_filter_list(filter, coeffs)
        else:
            filter = ""  # Empty filter

        # log
        log_message = 'Similarity search'
        if target:
            if target_type == 'sound_id':
                log_target = '%s (sound id)' % str(target)
            elif target_type == 'descriptor_values':
                log_target = '%s (descriptor values)' % str(target)
            elif target_type == 'file':
                log_target = 'uploaded file (%s)' % target_file_parsing_type
            log_message += ' with target: %s' % log_target
        if filter:
            log_message += ' with filter: %s' % str(filter)
        logger.info(log_message)

        # if in_ids is specified, edit the filter accordingly
        if in_ids:
            if not filter:
                filter = 'WHERE point.id IN ("' + '", "'.join(in_ids) + '")'
            else:
                filter += ' AND point.id IN ("' + '", "'.join(in_ids) + '")'

        # Set query metric
        metric = self.metrics[preset_name]
        if metric_descriptor_names:
            metric = DistanceFunctionFactory.create(
                'euclidean', layout, {'descriptorNames': metric_descriptor_names})

        # Do query!
        try:
            if target_type == 'descriptor_values' and target:
                search = self.view.nnSearch(query, metric, str(filter))
            else:
                if preset_name == 'pca':
                    search = self.view_pca.nnSearch(query, metric, str(filter))
                else:
                    search = self.view.nnSearch(query, metric, str(filter))
            results = search.get(num_results, offset=offset)
            count = search.size()
        except Exception as e:
            # keep a trace of the underlying failure instead of dropping it silently
            logger.info('Similarity server error (%s)' % e)
            return {
                'error': True,
                'result': 'Similarity server error',
                'status_code': sim_settings.SERVER_ERROR_CODE
            }

        note = None
        if target_type == 'file':
            if target_file_parsing_type == 'walkDict':
                note = 'The layout of the given analysis file differed from what we expected. Similarity results ' \
                       'might not be accurate. Was the file generated with the last version of Essentia\'s ' \
                       'Freesound extractor?'

        return {
            'error': False,
            'result': {
                'results': results,
                'count': count,
                'note': note
            }
        }
def harmonizeChunks(partfiles):
    """Rewrite every chunk file so that all chunks share the same layout and a simplified history.

    Returns a tuple of three sets of descriptor names: those listed by 'removevl'
    transformations, those listed by 'cleaner' transformations, and those missing
    from the common layout for any other reason.
    """
    # TODO: check all histories are the same, if not, try to do sth about it

    # find the GCLD (greatest common layout divisor :-) )
    chunk = DataSet()
    chunk.load(partfiles[0])
    firstLayout = chunk.layout().copy()
    commonLayout = chunk.layout().copy()

    for chunkFile in partfiles[1:]:
        chunk.load(chunkFile)
        commonLayout = commonLayout & chunk.layout()

    # keep some stats about which descriptors got removed and the reason why before throwing
    # away the original history and simplifying it
    vldescs, nandescs = set(), set()

    # now that we have our GCLD, transform all the chunks so they have the same layout (our GCLD)
    # and simplify their histories so that they also have the same history (the minimum history
    # required to arrive at this target layout).
    for chunkFile in partfiles:
        chunk.load(chunkFile)

        for transfo in chunk.history().toPython():
            analyzer = transfo['Analyzer name']
            affected = transfo['Applier parameters']['descriptorNames']
            if analyzer == 'removevl':
                vldescs.update(affected)
            elif analyzer == 'cleaner':
                nandescs.update(affected)

        extraDescs = chunk.layout().differenceWith(commonLayout)
        if extraDescs:
            chunk = transform(chunk, 'remove', {'descriptorNames': extraDescs})

        chunk.simplifyHistory()
        chunk.save(chunkFile)

    # also get the other descriptors that got removed (because of a select or remove transfo)
    rdescs = set(firstLayout.differenceWith(commonLayout)).difference(vldescs, nandescs)

    return vldescs, nandescs, rdescs
def highlevel_mosaic(target, tcorpus, scorpus, scope=5):
    """
    This will be used to test the highlevel mosaicing process.
    The scope variable controls the number of results which are returned
    for each target unit which is sought.

    For every target unit the ``scope`` nearest source units are looked up, their
    datasets are merged, and the descriptors not shared by the source and target
    datasets are removed. If such a removal fails, the function returns the
    tuple ``(source dataset, target dataset)`` for the unit being processed;
    otherwise it returns None.
    """
    # Create a temporary file for the mosaic audio
    filepath = os.path.join(os.getcwd(), 'temp_mosaic.wav')
    if os.path.isfile(filepath):
        os.remove(filepath)
    # These objects are created but not used further in this function.
    mosaic = Mosaic(filepath)
    cost = RepeatUnitCost()
    context = Context()
    gridder = Gridder()
    units = tcorpus.list_audio_units(audio_filename=target, chop='highlevel')
    # NOTE(review): ``self`` is not a parameter of this function; unless it is
    # provided by an enclosing scope this line raises NameError - confirm the
    # intended chop value.
    hdb = scorpus.get_gaia_unit_db(chop='highlevel_%s' % self.chop)
    distance = get_mood_distance(hdb)
    v = View(hdb, distance)
    results = {}
    for f in units:
        p = Point()
        p.load(switch_ext(f, '.yaml'))
        unit_name = switch_ext(os.path.basename(f), '')
        p.setName(unit_name)
        p_m = hdb.history().mapPoint(p)
        results[f] = v.nnSearch(p_m).get(scope)
    log.debug("Ok, now we have a dict with each target segment, along with its corresponding nearest matches in source db")
    log.debug("Check to see that we have every second of target audio accounted for - I think not!")
    ds = DataSet()
    for r in results:
        units = []
        for u in results[r]:
            ds.load(switch_ext(u[0], '.db'))
            units.extend(ds.pointNames())
        # replace the list of matches by a dataset built from their point names
        results[r] = gaia_transform(dict(zip(units, units)))
    # Very important - target units must be in correct order
    for r in sorted(results.keys()):
        tds = DataSet()
        tds.load(switch_ext(r, '.db'))
        sds = results[r]
        source_set = set(sds.layout().descriptorNames())
        target_set = set(tds.layout().descriptorNames())
        remove_from_source = source_set.difference(target_set)
        remove_from_target = target_set.difference(source_set)
        if len(remove_from_source) > 0:
            log.debug("Will try to remove %s from the source DataSet" % remove_from_source)
            try:
                sds = transform(results[r], 'remove', {'descriptorNames': list(remove_from_source)})
            except Exception:
                log.error("Failed to remove %s from source DataSet" % list(remove_from_source))
                return results[r], tds
        if len(remove_from_target) > 0:
            # report the descriptors actually removed from the target
            log.debug("Will try to remove %s from the target DataSet" % remove_from_target)
            try:
                tds = transform(tds, 'remove', {'descriptorNames': list(remove_from_target)})
            except Exception:
                log.error("Failed to remove %s from target DataSet" % list(remove_from_target))
                return results[r], tds
def load_gaia_db(self):
    """Return a new DataSet loaded from the file at ``self.gaia_db_path``."""
    gaia_db = DataSet()
    gaia_db.load(self.gaia_db_path)
    return gaia_db
def testDoubleLoadMixedVersions(self):
    # Loads two different database files, one after the other, into the same
    # DataSet object; the test passes as long as neither load raises.
    dataset = DataSet()
    for dbFile in (testdata.TEST_DATABASE, testdata.GAIA_20_BACKWARDS_COMPAT_DATASET):
        dataset.load(dbFile)
def mergeAll(pointList, outputFilename, chunkSize, transfoFile, select=None, exclude=None):
    """Merge the points listed in ``pointList`` into one dataset saved at ``outputFilename``.

    The points are merged in chunks of ``chunkSize`` points (each chunk goes through
    ``mergeChunk`` with ``transfoFile``, ``select`` and ``exclude``), the chunks are
    harmonized so they share layout and history, appended into a single dataset and
    saved. A summary is printed and the temporary chunk files are deleted.
    """
    # TODO: validation of the yaml file format? (ie: pre-2.3 yaml files should be rejected)
    with open(pointList) as f:
        totalPoints = len(fastyaml.load(f.read()))

    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        p = gaia2.Point()
        # list(...) keeps this working where items() returns a view instead of a list
        p.load(list(gaia2.fastyaml.loadfile(pointList).items())[0][1])
        excluded = p.layout().descriptorNames(exclude)

    # merge each chunk separately
    # this includes removevl and fixlength, which should yield smaller files than just after
    # merging, so it should then be possible to load all of them together to merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles.append(partfile)

        mergeChunk(pointList, partfile, transfoFile, begin, end, select, exclude)
        begin, end = end, end + chunkSize

        horizontalLine()

    # make sure all histories are the same, if not do whatever it takes to reach that point
    # also "simplify" the histories so that they are the minimum history representation required
    # to get to the layout of the final dataset
    print('Harmonizing chunks so that they all have the same layout & history...')
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print('Assembling full dataset together...')
    dstotal = DataSet()

    for pfile in partfiles:
        print('Merging partfile %s' % pfile)
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()

    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length:
    %s

  - because they were either constant, contained NaN or contained Inf:
    %s

  - because they were removed explicitly:
    %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted(d[1:] for d in vldescs)
    nandescs = sorted(d[1:] for d in nandescs)
    rdescs = sorted(d[1:] for d in rdescs)

    print(msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs), ', '.join(rdescs), outputFilename))

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
class GaiaWrapper:
    """Wrapper around a Gaia ``DataSet`` used as a similarity index.

    The wrapper loads (or creates) the dataset stored under ``INDEX_DIR``,
    prepares and normalizes it once it holds at least
    ``SIMILARITY_MINIMUM_POINTS`` points, builds one distance metric per
    entry of ``PRESETS`` and exposes point management and search methods.
    Most public methods report their outcome as a dict of the form
    ``{"error": bool, "result": ...}``.
    """

    def __init__(self):
        self.index_path = INDEX_DIR
        self.original_dataset = DataSet()
        self.original_dataset_path = self.__get_dataset_path(INDEX_NAME)
        # preset name -> distance function, filled in by __build_metrics()
        self.metrics = {}
        # stays None until the dataset is large enough to be searched
        self.view = None
        self.__load_dataset()

    def __get_dataset_path(self, ds_name):
        """Return the path of the ``.db`` file for dataset ``ds_name`` inside ``INDEX_DIR``."""
        return os.path.join(INDEX_DIR, ds_name + ".db")

    def __load_dataset(self):
        """Load the dataset from disk, or create and save an empty one.

        A loaded dataset with at least ``SIMILARITY_MINIMUM_POINTS`` points is
        prepared and/or normalized if its history shows that this has not been
        done yet (and saved again in that case); metrics and the search view
        are then created.
        """
        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)

        if os.path.exists(self.original_dataset_path):
            # load original dataset
            self.original_dataset.load(self.original_dataset_path)

            if self.original_dataset.size() >= SIMILARITY_MINIMUM_POINTS:
                # if we have loaded a dataset of the correct size but it is unprepared, prepare it
                if self.original_dataset.history().size() <= 0:
                    self.__prepare_original_dataset()
                    self.__normalize_original_dataset()
                    self.original_dataset.save(self.original_dataset_path)

                # if we have loaded a dataset which has not been normalized, normalize it
                normalized = any(
                    element["Analyzer name"] == "normalize"
                    for element in self.original_dataset.history().toPython()
                )
                if not normalized:
                    self.__normalize_original_dataset()
                    self.original_dataset.save(self.original_dataset_path)

                # build metrics for the different similarity presets
                self.__build_metrics()
                # create view
                self.view = View(self.original_dataset)

            logger.debug("Dataset loaded, size: %s points" % (self.original_dataset.size()))
        else:
            # If there is no existing dataset we create an empty one.
            # For the moment we do not create any distance metric nor a view because search won't be
            # possible until the DB has a minimum of SIMILARITY_MINIMUM_POINTS
            self.original_dataset.save(self.original_dataset_path)
            logger.debug("Created new dataset, size: %s points (should be 0)" % (self.original_dataset.size()))

    def __prepare_original_dataset(self):
        """Replace the dataset by its prepared version (see ``prepare_original_dataset_helper``)."""
        logger.debug("Preparing the original dataset.")
        self.original_dataset = self.prepare_original_dataset_helper(self.original_dataset)

    def __normalize_original_dataset(self):
        """Replace the dataset by its normalized version (see ``normalize_dataset_helper``)."""
        logger.debug("Normalizing the original dataset.")
        self.original_dataset = self.normalize_dataset_helper(self.original_dataset)

    @staticmethod
    def prepare_original_dataset_helper(ds):
        """Return ``ds`` after the "RemoveVL", "FixLength" and "Cleaner" transforms, in that order."""
        proc_ds1 = transform(ds, "RemoveVL")
        proc_ds2 = transform(proc_ds1, "FixLength")
        # drop intermediate datasets as soon as they are no longer needed
        proc_ds1 = None
        prepared_ds = transform(proc_ds2, "Cleaner")
        proc_ds2 = None
        return prepared_ds

    @staticmethod
    def normalize_dataset_helper(ds):
        """Return ``ds`` with the mfcc covariance descriptors removed and all descriptors normalized."""
        # Remove ['.lowlevel.mfcc.cov','.lowlevel.mfcc.icov'] (they give errors when normalizing)
        ds = transform(ds, "remove", {"descriptorNames": [".lowlevel.mfcc.cov", ".lowlevel.mfcc.icov"]})
        # Add normalization
        normalization_params = {"descriptorNames": "*", "independent": True, "outliers": -1}
        normalized_ds = transform(ds, "normalize", normalization_params)
        ds = None
        return normalized_ds

    def __build_metrics(self):
        """Create one distance function per preset and store it in ``self.metrics``.

        Each preset is described by ``<PRESET_DIR>/<preset>.yaml``, which must
        provide ``distance.type`` and ``distance.parameters``.
        """
        for preset in PRESETS:
            logger.debug("Bulding metric for preset %s" % preset)
            path = os.path.join(PRESET_DIR, preset + ".yaml")
            # safe_load: preset files are plain data, and the file is closed deterministically
            with open(path) as preset_fd:
                preset_file = yaml.safe_load(preset_fd)
            distance = preset_file["distance"]["type"]
            parameters = preset_file["distance"]["parameters"]
            self.metrics[preset] = DistanceFunctionFactory.create(
                str(distance), self.original_dataset.layout(), parameters
            )

    def add_point(self, point_location, point_name):
        """Load the point stored at ``point_location`` and add it as ``point_name``.

        An existing point with the same name is removed first. When the
        dataset reaches exactly ``SIMILARITY_MINIMUM_POINTS`` points it is
        prepared, normalized and saved, and metrics and view are created.

        Returns:
            ``{"error": False, "result": True}`` on success, otherwise
            ``{"error": True, "result": <message>}``.
        """
        name = str(point_name)
        if self.original_dataset.contains(name):
            self.original_dataset.removePoint(name)

        try:
            p = Point()
            p.load(str(point_location))
            p.setName(name)
            self.original_dataset.addPoint(p)
        except Exception:
            # The size is read here, not from a variable set inside the `try`,
            # so that a failure while loading cannot hit an unassigned local.
            msg = "Point with name %s could NOT be added. Index has now %i points." % (
                name,
                self.original_dataset.size(),
            )
            logger.debug(msg)
            return {"error": True, "result": msg}

        size = self.original_dataset.size()
        logger.debug("Added point with name %s. Index has now %i points." % (name, size))

        # If when adding a new point we reach the minimum points for similarity, prepare the dataset,
        # save and create view and distance metrics.
        # This will most never happen, only the first time we start similarity server, there is no
        # index created and we add 2000 points.
        if size == SIMILARITY_MINIMUM_POINTS:
            self.__prepare_original_dataset()
            self.__normalize_original_dataset()
            self.save_index(msg="(reaching 2000 points)")

            # build metrics for the different similarity presets
            self.__build_metrics()
            # create view
            self.view = View(self.original_dataset)

        return {"error": False, "result": True}

    def delete_point(self, point_name):
        """Remove ``point_name`` from the dataset; report an error if it is not present."""
        name = str(point_name)
        if not self.original_dataset.contains(name):
            msg = "Can't delete point with name %s because it does not exist." % name
            logger.debug(msg)
            return {"error": True, "result": msg}

        self.original_dataset.removePoint(name)
        logger.debug(
            "Deleted point with name %s. Index has now %i points." % (name, self.original_dataset.size())
        )
        return {"error": False, "result": True}

    def get_point(self, point_name):
        """Return the point named ``point_name``, or ``None`` if the dataset does not contain it."""
        name = str(point_name)
        logger.debug("Getting point with name %s" % name)
        if self.original_dataset.contains(name):
            return self.original_dataset.point(name)
        return None

    def save_index(self, filename=None, msg=""):
        """Save the dataset and return ``{"error": False, "result": <path>}``.

        Args:
            filename: optional dataset name (without extension) inside
                ``INDEX_DIR``; defaults to the path the dataset was loaded from.
            msg: text appended to the "Saving index" log line.
        """
        tic = time.time()
        path = self.original_dataset_path
        if filename:
            # built the same way as the default dataset path
            path = self.__get_dataset_path(filename)
        logger.debug("Saving index to (%s)..." % path + msg)
        self.original_dataset.save(path)
        toc = time.time()
        logger.debug(
            "Finished saving index (done in %.2f seconds, index has now %i points)."
            % ((toc - tic), self.original_dataset.size())
        )
        return {"error": False, "result": path}

    def contains(self, point_name):
        """Return ``{"error": False, "result": <bool>}`` telling whether ``point_name`` is indexed."""
        name = str(point_name)
        logger.debug("Checking if index has point with name %s" % name)
        return {"error": False, "result": self.original_dataset.contains(name)}

    # SIMILARITY SEARCH (WEB and API)
    def search_dataset(self, query_point, number_of_results, preset_name):
        """Nearest-neighbour search around ``query_point`` using the metric of ``preset_name``.

        ``query_point`` is either the name of a point in the dataset or, when
        it ends with ".yaml", the path of a point file which is loaded and
        mapped through the dataset history before searching.

        Returns:
            ``{"error": False, "result": <search results>}``, or
            ``{"error": True, "result": <message>}`` when the dataset is too
            small or the named point does not exist.
        """
        preset_name = str(preset_name)
        query_point = str(query_point)
        logger.debug("NN search for point with name %s (preset = %s)" % (query_point, preset_name))

        size = self.original_dataset.size()
        if size < SIMILARITY_MINIMUM_POINTS:
            msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
            logger.debug(msg)
            return {"error": True, "result": msg}

        if query_point.endswith(".yaml"):
            # The point doesn't exist in the dataset....
            # So, make a temporary point, add all the transformations
            # to it and search for it
            p = Point()
            p.load(query_point)
            query = self.original_dataset.history().mapPoint(p)
        else:
            if not self.original_dataset.contains(query_point):
                msg = "Sound with id %s doesn't exist in the dataset." % query_point
                logger.debug(msg)
                return {"error": True, "result": msg}
            query = query_point

        similar_sounds = self.view.nnSearch(query, self.metrics[preset_name]).get(int(number_of_results))
        return {"error": False, "result": similar_sounds}

    # CONTENT-BASED SEARCH (API)
    def query_dataset(self, query_parameters, number_of_results):
        """Content-based search from target feature values and an optional filter.

        Args:
            query_parameters: dict with a "target" dict (feature name ->
                value) and a "filter" entry that is either falsy, a ready-made
                filter string, or a list accepted by ``parse_filter_list``.
            number_of_results: maximum number of results to return.

        Returns:
            ``{"error": False, "result": <search results>}``, or
            ``{"error": True, "result": <message>}`` when the dataset is too small.
        """
        size = self.original_dataset.size()
        if size < SIMILARITY_MINIMUM_POINTS:
            msg = "Not enough datapoints in the dataset (%s < %s)." % (size, SIMILARITY_MINIMUM_POINTS)
            logger.debug(msg)
            return {"error": True, "result": msg}

        trans_hist = self.original_dataset.history().toPython()
        layout = self.original_dataset.layout()

        # Get normalization coefficients to transform the input data (get info from the last
        # transformation which has been a normalization): scan newest-first and stop at the first hit.
        coeffs = None
        for transformation in reversed(trans_hist):
            if transformation["Analyzer name"] == "normalize":
                coeffs = transformation["Applier parameters"]["coeffs"]
                break

        ##############
        # PARSE TARGET
        ##############

        # Transform input params to the normalized feature space and add them to a query point
        # If there are no params specified in the target, the point is set as empty (probably random
        # sounds are returned)
        q = Point()
        q.setLayout(layout)
        feature_names = []
        for param, value in query_parameters["target"].items():
            if coeffs is None:
                # No normalization in the history: use the raw value instead of
                # failing on the missing coefficients.
                feature_names.append(str(param))
                q.setValue(str(param), value)
                continue

            # Only add numerical parameters. Non numerical ones (like key) are only used as filters
            if param not in coeffs:
                continue

            feature_names.append(str(param))
            a = coeffs[param]["a"]
            b = coeffs[param]["b"]
            if len(a) == 1:
                norm_value = a[0] * value + b[0]
            else:
                norm_value = [a[i] * value[i] + b[i] for i in range(len(a))]
            q.setValue(str(param), norm_value)

        ##############
        # PARSE FILTER
        ##############

        filter_spec = query_parameters["filter"]
        filter_str = ""
        # If some filter has been specified...
        if filter_spec:
            if isinstance(filter_spec, str):
                filter_str = filter_spec
            else:
                filter_str = self.parse_filter_list(filter_spec, coeffs)

        #############
        # DO QUERY!!!
        #############

        logger.debug(
            "Content based search with target: " + str(query_parameters["target"]) + " and filter: " + str(filter_str)
        )
        metric = DistanceFunctionFactory.create("euclidean", layout, {"descriptorNames": feature_names})
        # Looks like that depending on the version of gaia, variable filter must go after or before the metric
        # For the gaia version we have currently (sep 2012) in freesound: nnSearch(query,filter,metric)
        results = self.view.nnSearch(q, metric, str(filter_str)).get(int(number_of_results))

        return {"error": False, "result": results}

    # UTILS for content-based search
    def prepend_value_label(self, f):
        """Return "value" for NUMBER, RANGE and ARRAY filter items and "label" for any other type."""
        if f["type"] in ("NUMBER", "RANGE", "ARRAY"):
            return "value"
        return "label"

    @staticmethod
    def _normalize_scalar(coeffs, feature, value):
        """Apply the first normalization coefficients of ``feature`` to ``value`` (no-op without coeffs)."""
        if not coeffs:
            return value
        return coeffs[feature]["a"][0] * value + coeffs[feature]["b"][0]

    def parse_filter_list(self, filter_list, coeffs):
        """Build a "WHERE ..." filter string from a parsed filter list.

        Args:
            filter_list: sequence whose items are either strings, appended
                verbatim, or dicts with "type", "feature" and "value" keys.
                Items of type NUMBER, STRING or ARRAY produce an equality
                clause; any other type is treated as a range whose "value" is
                a dict with "min" and "max" entries.
            coeffs: normalization coefficients per feature (``{"a": [...],
                "b": [...]}``), or a falsy value to leave numbers untouched.

        Returns:
            The filter string.
        """
        # TODO: eliminate this?
        filter_str = "WHERE"
        for f in filter_list:
            if not isinstance(f, dict):
                filter_str += f
                continue

            feature = f["feature"]
            prefix = self.prepend_value_label(f) + feature
            item_type = f["type"]

            if item_type in ("NUMBER", "STRING", "ARRAY"):
                if item_type == "NUMBER":
                    norm_value = self._normalize_scalar(coeffs, feature, f["value"])
                elif item_type == "ARRAY" and coeffs:
                    a = coeffs[feature]["a"]
                    b = coeffs[feature]["b"]
                    norm_value = [a[i] * v + b[i] for i, v in enumerate(f["value"])]
                else:
                    norm_value = f["value"]
                filter_str += " " + prefix + "=" + str(norm_value) + " "
            else:
                lower = f["value"]["min"]
                upper = f["value"]["max"]
                filter_str += " "
                # `is not None` rather than truthiness so that a bound of 0 is kept.
                # NOTE(review): assumes open-ended bounds are passed as None - confirm with the filter parser.
                if lower is not None:
                    filter_str += prefix + ">" + str(self._normalize_scalar(coeffs, feature, lower)) + " "
                if upper is not None:
                    if lower is not None:
                        filter_str += "AND "
                    filter_str += prefix + "<" + str(self._normalize_scalar(coeffs, feature, upper)) + " "

        return filter_str