def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Run all evaluations listed in `evalconfig` for one model configuration.

    Loads the groundtruth from `gtname` and the dataset from `dsname`, builds
    a trainer through `getTrainer`, then performs one n-fold evaluation per
    entry of `evalconfig`. For the evaluation at index i, the parameters are
    written to `<outfilename>_<i>.param` and the resulting confusion object is
    saved to `<outfilename>_<i>.result`. An evaluation whose result file is
    already present is skipped.

    Parameters:
        className: class name forced onto the loaded groundtruth.
        outfilename: prefix used for the `.param` and `.result` files.
        param: model parameters; the 'classifier' and 'preprocessing' keys
            are read here, and the whole mapping is dumped as JSON/YAML.
        dsname: file to load the DataSet from.
        gtname: file to load the GroundTruth from.
        evalconfig: sequence of evaluation parameters, each with an 'nfold' key.

    Raises:
        Any exception raised while evaluating is logged, together with
        `param` and `evalconfig`, and then re-raised unchanged.
    """
    try:
        classifierName = param['classifier']

        groundTruth = GroundTruth(classifierName)
        groundTruth.load(gtname)
        # The class name comes from the project file; whatever name was stored
        # in the groundtruth file itself is overridden.
        groundTruth.className = className

        dataset = DataSet()
        dataset.load(dsname)

        # Points missing from the dataset (e.g. their analysis failed) are
        # dropped from the groundtruth. Iterate over a snapshot of the keys
        # because entries are deleted along the way.
        knownPoints = dataset.pointNames()
        for pid in tuple(groundTruth.keys()):
            if pid in knownPoints:
                continue
            log.warning(
                'Removing %s from GroundTruth as it could not be found in the merged dataset'
                % pid)
            del groundTruth[pid]

        # The third returned value is not used by this method.
        trainerFun, trainingparam, _unusedDataset = getTrainer(
            classifierName, param, dataset)

        for index, evalparam in enumerate(evalconfig):
            stem = outfilename + '_%d' % index
            resultFilename = stem + '.result'

            # A result file on disk means this evaluation was already done.
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...'
                         % resultFilename)
                continue

            log.info(
                'Running evaluation %d for: %s with classifier %s and dataset %s'
                % (index, outfilename, param['classifier'],
                   param['preprocessing']))
            log.info(' PID: %d, parameters: %s'
                     % (os.getpid(), json.dumps(param)))

            confusion = evaluateNfold(evalparam['nfold'], dataset, groundTruth,
                                      trainerFun, **trainingparam)

            # Persist the parameters first, then the result itself.
            with open(stem + '.param', 'w') as paramFile:
                yaml.dump({'model': param, 'evaluation': evalparam}, paramFile)

            confusion.save(resultFilename)

    except Exception:
        log.error(
            'While doing evaluation with param = %s\nevaluation = %s'
            % (param, evalconfig))
        raise
def _load_yaml_file(path):
    """Parse the YAML file at `path` and return the loaded object.

    The file handle is closed as soon as its content has been read.
    """
    with open(path) as f:
        # NOTE(review): yaml.load can instantiate arbitrary Python objects.
        # It is kept unchanged for compatibility with existing metadata files;
        # consider yaml.safe_load if these files can come from untrusted
        # sources.
        return yaml.load(f.read())


def validate(basedir):
    """Check that `basedir` has a consistent audio/metadata layout.

    The following is verified, in order:
      - `basedir` exists and contains `metadata/config.yaml`;
      - the config has a 'version' entry and a non-empty 'audioFormats' entry;
      - the number of entries in `audio/` equals the number of audio formats;
      - each audio format has a folder in `audio/`, defines a 'filelist', and
        every file named in that filelist exists on disk;
      - every ID of every groundtruth file listed under 'groundTruth' appears
        in the filelist of every audio format.

    Progress is reported with `print`.

    Parameters:
        basedir: root directory of the collection to validate.

    Raises:
        Exception: with a descriptive message, on the first failed check.
    """
    # let's start with some basic check
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    metadataDir = join(basedir, 'metadata')

    # check required metadata files are there and that they're valid
    configFile = join(metadataDir, 'config.yaml')
    if not exists(configFile):
        raise Exception(
            'config.yaml could not be found in the metadata/ folder')

    config = _load_yaml_file(configFile)
    # An empty file loads as None and a scalar document as a non-mapping;
    # neither can hold a version number.
    if not isinstance(config, dict) or 'version' not in config:
        raise Exception('config.yaml doesn\'t have a version number')

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    # .get() so that a missing key is reported with the message below rather
    # than as a bare KeyError.
    audioFormats = config.get('audioFormats')
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [
        basename(f) for f in glob.glob(join(basedir, 'audio', '*'))
    ]
    if len(audioFolders) != len(audioFormats):
        raise Exception(
            'Some audio folders are not described in the audioFormats section of the config.yaml'
        )
    print('Found formats:', str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid
    # filelist; parsed filelists are kept so that the groundtruth check below
    # does not have to read them again
    filelists = {}
    for fmt, desc in audioFormats.items():
        print("\nChecking format '%s':" % fmt)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', fmt)):
            raise Exception(
                '%s is listed as an audio format, but doesn\'t appear in the audio/ folder'
                % fmt)

        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist'
                            % fmt)

        filelist = _load_yaml_file(join(metadataDir, desc['filelist']))
        print(' filelist OK, lists %d files' % len(filelist))

        for filename in filelist.values():
            fullpath = join(basedir, 'audio', fmt, filename)
            if not exists(fullpath):
                raise Exception(
                    'For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"'
                    % (fmt, filename, fullpath))

        filelists[fmt] = filelist

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth') or {}
    print('Found groundtruth files:', str(list(groundTruth.keys())))

    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(metadataDir, gtfile))

        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname in audioFormats:
            flist = filelists[afname]
            for gid in gt:
                if gid not in flist:
                    raise Exception(
                        "ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'"
                        % (gid, gtfile, afname))

        print(' gt filelist OK, found classes:',
              str(sorted(set(gt.values()))))
def generate_std_metadata(basedir, gtname, options):
    """Generate the standard metadata files for the collection in `basedir`.

    For every entry found in `<basedir>/audio/`, a `<format>_filelist.yaml`
    file is written to `<basedir>/metadata/`. Depending on `options.gttype`,
    groundtruth files are generated from the filelist of the last audio
    format processed:
      - 'dir':  the class of a point is the first '/'-separated component of
                its ID; written to `groundtruth.yaml`;
      - 'txt':  the class of a point is the stripped content of
                `<basedir>/metadata/<format>/<pid>.txt`; written to
                `groundtruth.yaml`;
      - 'mdir': one `groundtruth_<c>.yaml` per class `c` for which a sibling
                `not_<c>` or `not-<c>` class also exists;
      - None:   no groundtruth is generated;
      - anything else prints a warning and generates no groundtruth.
    Finally `metadata/config.yaml` is written, referencing the generated
    filelists and groundtruth files.

    Parameters:
        basedir: root directory of the collection.
        gtname: name given to the generated GroundTruth object(s).
        options: object exposing a `gttype` attribute (see above).
    """
    audioFormats = {}

    # make sure metadata folder exists (no shell involved, so paths containing
    # quotes or other special characters are handled correctly)
    metadataDir = join(basedir, 'metadata')
    if not os.path.isdir(metadataDir):
        os.makedirs(metadataDir)

    # generate a filelist for each audio folder
    # flist/fmt keep the values of the last audio folder processed; they start
    # out empty so that a collection without audio folders yields empty
    # groundtruth instead of a NameError
    flist = {}
    fmt = None
    for folder in glob.glob(join(basedir, 'audio', '*')):
        fmt = basename(folder)
        flist = generateMergeFilelist(join(basedir, 'audio', fmt),
                                      validFile=lambda x: True,
                                      filename2gid=lambda x: x)

        filelistName = '%s_filelist.yaml' % fmt
        with open(join(metadataDir, filelistName), 'w') as f:
            yaml.dump(flist, f)

        audioFormats[fmt] = {'filelist': filelistName}

    # generate groundtruth, if asked
    groundTruth = {}
    gttype = options.gttype
    if gttype is not None:
        if gttype == 'dir':
            # use the last filelist to get the GT, which should be independent of audio format
            # as it relies on points IDs only
            gt = GroundTruth(gtname)
            for pid in flist:
                gt[pid] = pid.split('/')[0]

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif gttype == 'txt':
            gt = GroundTruth(gtname)
            for pid in flist:
                # the class files are looked up under the folder named after
                # the last audio format processed
                gtfile = join(metadataDir, fmt, pid) + '.txt'
                with open(gtfile) as f:
                    gt[pid] = f.read().strip()

            gt.save(join(metadataDir, 'groundtruth.yaml'))
            groundTruth[gtname] = 'groundtruth.yaml'

        elif gttype == 'mdir':
            # look for all the directories which can be paired in a XXX / not_XXX fashion
            # and create a groundtruth file for each of those
            mdirs = set(pid.split('/')[0] for pid in flist)
            mdirs = [c for c in mdirs
                     if ('not_' + c in mdirs or 'not-' + c in mdirs)]
            print('Found following possible classes', mdirs)

            for c in mdirs:
                gt = GroundTruth(gtname + '_' + c)
                for pid in flist:
                    cls = pid.split('/')[0]
                    # only keep those files which we are interested in for our specific subclass
                    if not (cls == c or cls == 'not_' + c or cls == 'not-' + c):
                        continue
                    gt[pid] = cls

                gt.save(join(metadataDir, 'groundtruth_%s.yaml' % c))
                groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

        else:
            print('WARNING: unknown groundtruth type:', str(gttype))
            print(' not generating any groundtruth files...')

    # write the main config file
    config = {
        'version': 1.0,
        'audioFormats': audioFormats,
        'groundTruth': groundTruth
    }
    with open(join(metadataDir, 'config.yaml'), 'w') as f:
        yaml.dump(config, f)