def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Run every evaluation listed in ``evalconfig`` for one classifier config.

    Loads the groundtruth (``gtname``) and dataset (``dsname``), removes
    groundtruth entries with no matching point in the dataset, then runs an
    n-fold evaluation per entry of ``evalconfig``, writing one ``.param`` and
    one ``.result`` file per evaluation next to ``outfilename``.
    """
    try:
        classifierType = param['classifier']

        groundTruth = GroundTruth(classifierType)
        groundTruth.load(gtname)
        # force the GroundTruth class name to be the one specified by our
        # project file, not the one in the original groundtruth file
        groundTruth.className = className

        dataset = DataSet()
        dataset.load(dsname)

        # some points may have failed to be analyzed; drop those from the GroundTruth
        availablePoints = dataset.pointNames()
        for missingId in [p for p in groundTruth.keys() if p not in availablePoints]:
            log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % missingId)
            del groundTruth[missingId]

        trainerFun, trainingparam, newds = getTrainer(classifierType, param, dataset)

        # run all the evaluations specified in the evaluation config
        for evalIndex, evalparam in enumerate(evalconfig):
            resultFilename = outfilename + '_%d.result' % evalIndex
            # a result file already on disk means this evaluation already ran
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info('Running evaluation %d for: %s with classifier %s and dataset %s'
                     % (evalIndex, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], dataset, groundTruth,
                                      trainerFun, **trainingparam)

            # write evaluation params & result
            with open(outfilename + '_%d.param' % evalIndex, 'w') as paramFile:
                yaml.dump({'model': param, 'evaluation': evalparam}, paramFile)
            confusion.save(resultFilename)

    except Exception:
        log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
        raise
def trainSVMHistory(configFilename, paramsFilename, outputHistoryFilename, className):
    """Train an SVM from a project config + params file and save the history.

    Reads the project config and the model parameters (``'model'`` section of
    the params file), loads the matching preprocessed dataset and groundtruth,
    trains with :func:`trainSVM` and saves the resulting history to
    ``outputHistoryFilename``.

    Raises:
        Exception: if the params file's classifier is not ``'svm'``.
    """
    # use context managers so the file handles are closed, and safe_load so no
    # arbitrary Python objects can be constructed from the YAML.
    # NOTE(review): switched from yaml.load to yaml.safe_load — confirm no
    # config file relies on python-specific YAML tags.
    with open(configFilename) as f:
        config = yaml.safe_load(f)
    with open(paramsFilename) as f:
        params = yaml.safe_load(f)['model']

    if params.pop('classifier') != 'svm':
        raise Exception('Can only use this script on SVM config parameters.')

    preproc = params.pop('preprocessing')

    ds = DataSet()
    ds.load(join(split(configFilename)[0],                    # base dir
                 config['datasetsDirectory'],                 # datasets dir
                 '%s-%s.db' % (config['className'], preproc)))  # dataset name

    gt = GroundTruth.fromFile(config['groundtruth'])
    if className:
        gt.className = className
    # add 'highlevel.' in front of the descriptor, this is what will appear
    # in the final Essentia sigfile
    gt.className = 'highlevel.' + gt.className

    # do the whole training; remaining entries of params are the SVM kwargs
    h = trainSVM(ds, gt, **params)
    h.save(outputHistoryFilename)
def train_svm_history(project, params, output_file_path):
    """Train an SVM for ``project`` with the given parameters and save it.

    ``params['model']`` must describe an SVM (``classifier == 'svm'``); its
    ``type``/``kernel``/``C``/``gamma`` entries are forwarded to
    :func:`train_svm`. The resulting history is saved to ``output_file_path``.

    Raises:
        GaiaWrapperException: if the configured classifier is not ``'svm'``.
    """
    model_params = params["model"]
    if model_params.get("classifier") != "svm":
        raise GaiaWrapperException(
            "Can only use this script on SVM config parameters.")

    dataset_path = os.path.join(
        project["datasetsDirectory"],
        "%s-%s.db" % (project["className"], model_params["preprocessing"]))
    dataset = DataSet()
    dataset.load(dataset_path)

    groundtruth = GroundTruth.fromFile(project["groundtruth"])
    groundtruth.className = "highlevel." + project["className"]

    # doing the whole training
    history = train_svm(
        dataset,
        groundtruth,
        type=model_params["type"],
        kernel=model_params["kernel"],
        C=model_params["C"],
        gamma=model_params["gamma"])

    # presumably history.save() needs a byte-string path (Python 2 code path)
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Execute each evaluation described in ``evalconfig``.

    Prepares groundtruth and dataset, prunes groundtruth ids that are missing
    from the dataset, then performs one n-fold evaluation per config entry,
    persisting parameters and confusion results alongside ``outfilename``.
    """
    try:
        classifier = param['classifier']

        gt = GroundTruth(classifier)
        gt.load(gtname)
        # the project file's class name wins over whatever the original
        # groundtruth file recorded
        gt.className = className

        ds = DataSet()
        ds.load(dsname)

        # some points may have failed to be analyzed; those ids must not stay
        # in the GroundTruth
        pnames = ds.pointNames()
        for pid in tuple(gt.keys()):
            if pid in pnames:
                continue
            log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
            del gt[pid]

        trainerFun, trainingparam, newds = getTrainer(classifier, param, ds)

        # run all the evaluations specified in the evaluation config
        for i, evalparam in enumerate(evalconfig):
            resultFilename = '%s_%d.result' % (outfilename, i)

            # an existing result file means this evaluation already ran
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info('Running evaluation %d for: %s with classifier %s and dataset %s'
                     % (i, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], ds, gt, trainerFun,
                                      **trainingparam)

            # write evaluation params & result
            with open('%s_%d.param' % (outfilename, i), 'w') as f:
                yaml.dump({'model': param, 'evaluation': evalparam}, f)
            confusion.save(resultFilename)

    except Exception:
        log.error('While doing evaluation with param = %s\nevaluation = %s' % (param, evalconfig))
        raise
def train_svm_history(project, params, output_file_path):
    """Train an SVM model for ``project`` and save the training history.

    Everything left in ``params['model']`` after removing ``classifier`` and
    ``preprocessing`` is passed straight through to :func:`train_svm`.

    Raises:
        GaiaWrapperException: if the configured classifier is not ``'svm'``.
    """
    model_params = params["model"]

    classifier = model_params.pop("classifier")
    if classifier != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    preprocessing = model_params.pop("preprocessing")
    db_name = "%s-%s.db" % (project["className"], preprocessing)

    dataset = DataSet()
    dataset.load(os.path.join(project["datasetsDirectory"], db_name))

    groundtruth = GroundTruth.fromFile(project["groundtruth"])
    groundtruth.className = "highlevel." + project["className"]

    history = train_svm(dataset, groundtruth, **model_params)  # doing the whole training
    history.save(output_file_path)
def loadGroundTruth(self, name=None):
    """Select a ground-truth entry from the config and load its file.

    When ``name`` is None, the first configured ground truth is used (with a
    warning if several exist). An invalid ``name`` also falls back to the
    first one, with a warning. Sets ``self._groundTruthFile`` and
    ``self.groundTruth``.
    """
    # materialize the keys: dict.keys() is a non-indexable view on Python 3,
    # so gttypes[0] below would raise TypeError without list()
    gttypes = list(self._config['groundTruth'].keys())
    if name is None:
        name = gttypes[0]
        if len(gttypes) > 1:
            # print() keeps identical output and is Python 2/3 compatible
            print('WARNING: more than 1 GroundTruth file, selecting default "%s" (out of %s)' % (
                name, gttypes))
    else:
        if name not in gttypes:
            print('WARNING: invalid ground truth: "%s", selecting default one instead: "%s" (out of %s)' % (
                name, gttypes[0], gttypes))
            name = gttypes[0]

    self._groundTruthFile = self.groundTruthFilePath(name)
    self.groundTruth = GroundTruth.fromFile(self._groundTruthFile)
def loadGroundTruth(self, name=None):
    """Resolve which ground-truth file to use and load it.

    Falls back to the first configured ground truth when ``name`` is None or
    unknown, warning on stdout in both ambiguous cases. Stores the chosen
    path in ``self._groundTruthFile`` and the loaded object in
    ``self.groundTruth``.
    """
    # list() is required: on Python 3, dict.keys() returns a view that
    # cannot be indexed with [0]
    gttypes = list(self._config["groundTruth"].keys())
    if name is None:
        name = gttypes[0]
        if len(gttypes) > 1:
            # converted from a Python-2 print statement; output is unchanged
            print('WARNING: more than 1 GroundTruth file, selecting default "%s" (out of %s)' % (name, gttypes))
    else:
        if name not in gttypes:
            print('WARNING: invalid ground truth: "%s", selecting default one instead: "%s" (out of %s)' % (
                name,
                gttypes[0],
                gttypes,
            ))
            name = gttypes[0]

    self._groundTruthFile = self.groundTruthFilePath(name)
    self.groundTruth = GroundTruth.fromFile(self._groundTruthFile)
def train_svm_history(project, params, output_file_path):
    """Train an SVM from a project definition and persist the model history.

    Only the ``type``/``kernel``/``C``/``gamma`` entries of ``params['model']``
    are used for training; the dataset is looked up by class name and
    preprocessing in the project's datasets directory.

    Raises:
        GaiaWrapperException: if the configured classifier is not ``'svm'``.
    """
    svm_params = params["model"]
    if svm_params.get("classifier") != "svm":
        raise GaiaWrapperException("Can only use this script on SVM config parameters.")

    db_filename = "%s-%s.db" % (project["className"], svm_params["preprocessing"])
    ds = DataSet()
    ds.load(os.path.join(project["datasetsDirectory"], db_filename))

    gt = GroundTruth.fromFile(project["groundtruth"])
    gt.className = "highlevel." + project["className"]

    # doing the whole training
    history = train_svm(ds, gt,
                        type=svm_params["type"],
                        kernel=svm_params["kernel"],
                        C=svm_params["C"],
                        gamma=svm_params["gamma"])

    # presumably save() wants a byte-string path (Python 2 code path)
    if isinstance(output_file_path, unicode):
        output_file_path = output_file_path.encode("utf-8")
    history.save(output_file_path)
def validate(basedir):
    """Validate the directory layout and metadata of a dataset under ``basedir``.

    Checks that metadata/config.yaml exists and has a version, that every
    declared audio format has a folder and a complete filelist, and that
    every groundtruth id appears in every format's filelist.

    Raises:
        Exception: on the first inconsistency found.
    """
    # let's start with some basic check
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    # check required metadata files are there and that they're valid
    configFile = join(basedir, 'metadata', 'config.yaml')
    if not exists(configFile):
        raise Exception(
            'config.yaml could not be found in the metadata/ folder')

    # safe_load + context manager: no arbitrary object construction from the
    # YAML, and the handle is closed deterministically
    with open(configFile) as f:
        config = yaml.safe_load(f)
    if 'version' not in config:
        raise Exception('config.yaml doesn\'t have a version number')

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    audioFormats = config['audioFormats']
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [
        basename(f) for f in glob.glob(join(basedir, 'audio', '*'))
    ]
    if len(audioFolders) != len(audioFormats):
        raise Exception(
            'Some audio folders are not described in the audioFormats section of the config.yaml'
        )
    print('Found formats:', str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid filelist
    for format, desc in audioFormats.items():
        print("\nChecking format '%s':" % format)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', format)):
            raise Exception(
                '%s is listed as an audio format, but doesn\'t appear in the audio/ folder'
                % format)
        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist' % format)

        with open(join(basedir, 'metadata', desc['filelist'])) as f:
            filelist = yaml.safe_load(f)
        print(' filelist OK, lists %d files' % len(filelist))

        for pid, filename in filelist.items():
            fullpath = join(basedir, 'audio', format, filename)
            if not exists(fullpath):
                raise Exception(
                    'For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"'
                    % (format, filename, fullpath))

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth', {})
    print('Found groundtruth files:', str(list(groundTruth.keys())))

    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(basedir, 'metadata', gtfile))

        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname, af in audioFormats.items():
            with open(join(basedir, 'metadata', af['filelist'])) as f:
                flist = yaml.safe_load(f).keys()
            for gid in gt:
                if gid not in flist:
                    raise Exception(
                        "ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'"
                        % (gid, gtfile, afname))
        print(' gt filelist OK, found classes:', str(sorted(set(gt.values()))))
def validate(basedir):
    """Check a dataset directory for structural and metadata consistency.

    Verifies metadata/config.yaml (presence, version), one folder + complete
    filelist per declared audio format, and that every groundtruth id exists
    in every format's filelist.

    Raises:
        Exception: on the first problem encountered.
    """
    # let's start with some basic check
    # (print statements converted to print() calls for Python 3 compatibility;
    # output is unchanged)
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    # check required metadata files are there and that they're valid
    configFile = join(basedir, 'metadata', 'config.yaml')
    if not exists(configFile):
        raise Exception('config.yaml could not be found in the metadata/ folder')

    # safe_load refuses python-specific tags and the context manager closes
    # the file handle that open(...).read() used to leak
    with open(configFile) as f:
        config = yaml.safe_load(f)
    if 'version' not in config:
        raise Exception('config.yaml doesn\'t have a version number')

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    audioFormats = config['audioFormats']
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [ basename(f) for f in glob.glob(join(basedir, 'audio', '*')) ]
    if len(audioFolders) != len(audioFormats):
        raise Exception('Some audio folders are not described in the audioFormats section of the config.yaml')
    print('Found formats:', str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid filelist
    for format, desc in audioFormats.items():
        print("\nChecking format '%s':" % format)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', format)):
            raise Exception('%s is listed as an audio format, but doesn\'t appear in the audio/ folder' % format)
        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist' % format)

        with open(join(basedir, 'metadata', desc['filelist'])) as f:
            filelist = yaml.safe_load(f)
        print(' filelist OK, lists %d files' % len(filelist))

        for pid, filename in filelist.items():
            fullpath = join(basedir, 'audio', format, filename)
            if not exists(fullpath):
                raise Exception('For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"' % (format, filename, fullpath))

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth', {})
    # list() keeps the printed representation a plain list on Python 3
    print('Found groundtruth files:', str(list(groundTruth.keys())))

    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(basedir, 'metadata', gtfile))

        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname, af in audioFormats.items():
            with open(join(basedir, 'metadata', af['filelist'])) as f:
                flist = yaml.safe_load(f).keys()
            for gid in gt:
                if gid not in flist:
                    raise Exception("ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'" % (gid, gtfile, afname))
        print(' gt filelist OK, found classes:', str(sorted(set(gt.values()))))
def generate_std_metadata(basedir, gtname, options):
    """Generate the standard metadata layout for a dataset under ``basedir``.

    Writes one ``<format>_filelist.yaml`` per audio folder, optionally one or
    more groundtruth YAML files depending on ``options.gttype``
    ('dir' | 'txt' | 'mdir'), and finally ``metadata/config.yaml`` tying it
    all together.
    """
    audioFormats = {}

    # make sure metadata folder exists; makedirs replaces the former
    # os.system('mkdir -p ...') which was non-portable and broke on paths
    # containing quotes
    os.makedirs(join(basedir, 'metadata'), exist_ok=True)

    # generate a filelist for each audio folder
    filelists = {}
    for format in glob.glob(join(basedir, 'audio', '*')):
        format = basename(format)
        flist = generateMergeFilelist(join(basedir, 'audio', format),
                                      validFile = lambda x: True,
                                      filename2gid = lambda x: x)
        filelists[format] = flist
        # the context manager guarantees the yaml data is flushed and the
        # handle closed (the original never closed this file)
        with open(join(basedir, 'metadata', '%s_filelist.yaml' % format), 'w') as filelist:
            yaml.dump(flist, filelist)
        audioFormats[format] = { 'filelist': '%s_filelist.yaml' % format }

    # generate groundtruth, if asked
    groundTruth = {}
    if options.gttype is not None:
        # NOTE(review): every branch below reuses `flist` (and 'txt' also
        # `format`) from the last iteration of the loop above, i.e. the
        # groundtruth is derived from a single audio format — confirm intended
        if options.gttype == 'dir':
            # use the last filelist to get the GT, which should be independent of audio format
            # as it relies on points IDs only
            gt = GroundTruth(gtname)
            for pid in flist:
                gt[pid] = pid.split('/')[0]
            gt.save(join(basedir, 'metadata', 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'txt':
            gt = GroundTruth(gtname)
            for pid in flist:
                gtfile = join(basedir, 'metadata', format, pid) + '.txt'
                with open(gtfile) as f:
                    gt[pid] = f.read().strip()
            gt.save(join(basedir, 'metadata', 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'mdir':
            # look for all the directories which can be paired in a XXX / not_XXX fashion
            # and create a groundtruth file for each of those
            mdirs = set(pid.split('/')[0] for pid in flist)
            mdirs = [ c for c in mdirs if ('not_' + c in mdirs or 'not-' + c in mdirs) ]
            print('Found following possible classes', mdirs)

            for c in mdirs:
                gt = GroundTruth(gtname + '_' + c)
                for pid in flist:
                    cls = pid.split('/')[0]
                    # only keep those files which we are interested in for our specific subclass
                    if not (cls == c or cls == 'not_' + c or cls == 'not-' + c):
                        continue
                    gt[pid] = cls
                gt.save(join(basedir, 'metadata', 'groundtruth_%s.yaml' % c))
                groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

        else:
            print('WARNING: unknown groundtruth type:', str(options.gttype))
            print(' not generating any groundtruth files...')

    # write the main config file (with-block closes the handle the original leaked)
    config = { 'version': 1.0,
               'audioFormats': audioFormats,
               'groundTruth': groundTruth }
    with open(join(basedir, 'metadata', 'config.yaml'), 'w') as f:
        yaml.dump(config, f)
def generate_std_metadata(basedir, gtname, options):
    """Build the standard metadata files for the dataset rooted at ``basedir``.

    Produces a filelist YAML per audio format, groundtruth YAML file(s)
    according to ``options.gttype`` ('dir', 'txt' or 'mdir'), and the
    top-level ``metadata/config.yaml``.
    """
    audioFormats = {}

    # make sure metadata folder exists; os.makedirs is portable and immune to
    # the quoting problems of the former os.system('mkdir -p "%s"') call
    os.makedirs(join(basedir, 'metadata'), exist_ok=True)

    # generate a filelist for each audio folder
    filelists = {}
    for format in glob.glob(join(basedir, 'audio', '*')):
        format = basename(format)
        flist = generateMergeFilelist(join(basedir, 'audio', format),
                                      validFile = lambda x: True,
                                      filename2gid = lambda x: x)
        filelists[format] = flist
        # with-block closes the handle (the original left it open, so the
        # yaml data was only flushed at interpreter exit, if at all)
        with open(join(basedir, 'metadata', '%s_filelist.yaml' % format), 'w') as filelist:
            yaml.dump(flist, filelist)
        audioFormats[format] = { 'filelist': '%s_filelist.yaml' % format }

    # generate groundtruth, if asked
    groundTruth = {}
    if options.gttype is not None:
        # NOTE(review): the branches below rely on `flist`/`format` left over
        # from the last loop iteration above — confirm this single-format
        # behavior is intended
        if options.gttype == 'dir':
            # use the last filelist to get the GT, which should be independent of audio format
            # as it relies on points IDs only
            gt = GroundTruth(gtname)
            for pid in flist:
                gt[pid] = pid.split('/')[0]
            gt.save(join(basedir, 'metadata', 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'txt':
            gt = GroundTruth(gtname)
            for pid in flist:
                gtfile = join(basedir, 'metadata', format, pid) + '.txt'
                with open(gtfile) as f:
                    gt[pid] = f.read().strip()
            gt.save(join(basedir, 'metadata', 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif options.gttype == 'mdir':
            # look for all the directories which can be paired in a XXX / not_XXX fashion
            # and create a groundtruth file for each of those
            mdirs = set(pid.split('/')[0] for pid in flist)
            mdirs = [ c for c in mdirs if ('not_' + c in mdirs or 'not-' + c in mdirs) ]
            print('Found following possible classes', mdirs)

            for c in mdirs:
                gt = GroundTruth(gtname + '_' + c)
                for pid in flist:
                    cls = pid.split('/')[0]
                    # only keep those files which we are interested in for our specific subclass
                    if not (cls == c or cls == 'not_' + c or cls == 'not-' + c):
                        continue
                    gt[pid] = cls
                gt.save(join(basedir, 'metadata', 'groundtruth_%s.yaml' % c))
                groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

        else:
            print('WARNING: unknown groundtruth type:', str(options.gttype))
            print(' not generating any groundtruth files...')

    # write the main config file; context manager ensures the dump is flushed
    config = { 'version': 1.0,
               'audioFormats': audioFormats,
               'groundTruth': groundTruth }
    with open(join(basedir, 'metadata', 'config.yaml'), 'w') as f:
        yaml.dump(config, f)