Пример #1
0
    def run(self, className, outfilename, param, dsname, gtname, evalconfig):
        """Run every evaluation listed in `evalconfig` and save the results.

        For the evaluation at index i, two files are written:
        `<outfilename>_<i>.param` (a YAML dump of the model and evaluation
        parameters) and `<outfilename>_<i>.result` (the saved confusion
        result). An evaluation whose `.result` file already exists is skipped.

        Args:
            className: class name forced onto the loaded GroundTruth.
            outfilename: common prefix of the generated file names.
            param: model parameters; the 'classifier' and 'preprocessing'
                keys are read here, and the whole mapping is dumped as JSON
                in the log and as YAML in the `.param` file.
            dsname: name of the dataset file to load.
            gtname: name of the groundtruth file to load.
            evalconfig: iterable of evaluation parameter mappings; the
                'nfold' key of each one is read.

        Raises:
            Exception: anything raised along the way is logged together with
                `param` and `evalconfig`, then re-raised unchanged.
        """
        try:
            classifierName = param['classifier']

            groundTruth = GroundTruth(classifierName)
            groundTruth.load(gtname)
            # the class name from our project file takes precedence over the
            # one stored in the original groundtruth file
            groundTruth.className = className

            dataset = DataSet()
            dataset.load(dsname)

            # points that failed to be analyzed are absent from the dataset,
            # so drop them from the groundtruth as well (iterate over a copy
            # of the keys because entries are deleted along the way)
            knownPoints = dataset.pointNames()
            for pointId in list(groundTruth.keys()):
                if pointId in knownPoints:
                    continue
                log.warning(
                    'Removing %s from GroundTruth as it could not be found in the merged dataset'
                    % pointId)
                del groundTruth[pointId]

            # NOTE(review): the third value returned by getTrainer is never
            # used; the evaluation below runs on the dataset loaded above.
            # Confirm that this is intended.
            trainer, trainerKwargs, _unusedDataset = getTrainer(
                classifierName, param, dataset)

            # go through all the evaluations of the evaluation config
            for index, evalparam in enumerate(evalconfig):
                filePrefix = outfilename + '_%d' % index
                resultPath = filePrefix + '.result'

                # an existing result file means this evaluation was already done
                if exists(resultPath):
                    log.info('File %s already exists. Skipping evaluation...' %
                             resultPath)
                    continue

                log.info(
                    'Running evaluation %d for: %s with classifier %s and dataset %s'
                    % (index, outfilename, param['classifier'],
                       param['preprocessing']))
                log.info('    PID: %d, parameters: %s' %
                         (os.getpid(), json.dumps(param)))

                foldCount = evalparam['nfold']
                confusion = evaluateNfold(foldCount, dataset, groundTruth,
                                          trainer, **trainerKwargs)

                # the parameters are written first, the result last
                settings = {'model': param, 'evaluation': evalparam}
                with open(filePrefix + '.param', 'w') as paramFile:
                    yaml.dump(settings, paramFile)

                confusion.save(resultPath)

        except Exception:
            log.error(
                'While doing evaluation with param = %s\nevaluation = %s' %
                (param, evalconfig))
            raise
Пример #2
0
def validate(basedir):
    """Check that `basedir` contains a consistent dataset layout.

    The checks run in this order and stop at the first failure:

    1. `basedir` exists and contains `metadata/config.yaml`;
    2. the config has a 'version' entry and a non-empty 'audioFormats' entry;
    3. there are as many entries in `audio/` as described audio formats, and
       every described format has a matching `audio/<format>` entry;
    4. every format defines a 'filelist', and every file named in that
       filelist exists under `audio/<format>/`;
    5. every ID of every groundtruth file listed under 'groundTruth' appears
       in the filelist of every audio format.

    Progress is reported on stdout.

    Args:
        basedir: path of the dataset root directory.

    Raises:
        Exception: with a descriptive message, on the first failed check.
    """

    def load_yaml(path):
        # safe_load is used on purpose: calling yaml.load() without a Loader
        # is rejected by PyYAML >= 6 and can build arbitrary objects on older
        # versions. The context manager closes the file handle.
        with open(path) as stream:
            return yaml.safe_load(stream)

    # let's start with some basic check
    print('Checking basic directory layout...')
    if not exists(basedir):
        raise Exception('The specified base directory does not exist')

    # check required metadata files are there and that they're valid
    configFile = join(basedir, 'metadata', 'config.yaml')
    if not exists(configFile):
        raise Exception(
            'config.yaml could not be found in the metadata/ folder')

    # an empty config file parses to None; treat it as an empty mapping so
    # that the explicit checks below report the problem
    config = load_yaml(configFile) or {}
    if 'version' not in config:
        raise Exception("config.yaml doesn't have a version number")

    # check that the specified audioFormats correspond to the audio/ subfolders
    print('Checking available audio formats...')
    audioFormats = config.get('audioFormats')
    if not audioFormats:
        raise Exception('audioFormats not specified in config.yaml')

    audioFolders = [
        basename(f) for f in glob.glob(join(basedir, 'audio', '*'))
    ]

    if len(audioFolders) != len(audioFormats):
        raise Exception(
            'Some audio folders are not described in the audioFormats section of the config.yaml'
        )

    print('Found formats:', str(audioFolders))

    # check the audio formats are valid, in particular that they have a valid
    # filelist; each parsed filelist is kept so that the groundtruth check
    # below does not have to read and parse it again
    filelists = {}
    for fmt, desc in audioFormats.items():
        print("\nChecking format '%s':" % fmt)
        # TODO: at some point in the future we should also check for valid values in desc
        if not exists(join(basedir, 'audio', fmt)):
            raise Exception(
                "%s is listed as an audio format, but doesn't appear in the audio/ folder"
                % fmt)

        if 'filelist' not in desc:
            raise Exception('Audio format "%s" does not define a filelist' %
                            fmt)

        # an empty filelist file parses to None; treat it as "no files"
        filelist = load_yaml(join(basedir, 'metadata', desc['filelist'])) or {}
        filelists[fmt] = filelist
        print('  filelist OK, lists %d files' % len(filelist))

        for pid, filename in filelist.items():
            fullpath = join(basedir, 'audio', fmt, filename)
            if not exists(fullpath):
                raise Exception(
                    'For format "%s": file "%s" appears in filelist, however there is no corresponding "%s"'
                    % (fmt, filename, fullpath))

    # check that the groundtruth files do actually exist if they are specified
    print('\nChecking groundtruth files...')
    groundTruth = config.get('groundTruth') or {}
    print('Found groundtruth files:', str(list(groundTruth.keys())))
    for name, gtfile in groundTruth.items():
        print("\nChecking groundtruth '%s':" % name)
        gt = GroundTruth('')
        gt.load(join(basedir, 'metadata', gtfile))
        # check that the IDs used in the groundtruth files exist in all the filelists
        for afname in audioFormats:
            flist = filelists[afname]
            for gid in gt:
                if gid not in flist:
                    raise Exception(
                        "ID '%s' is in GroundTruth '%s', but could not be found in filelist for audio format '%s'"
                        % (gid, gtfile, afname))
        print('  gt filelist OK, found classes:',
              str(sorted(set(gt.values()))))
Пример #3
0
def generate_std_metadata(basedir, gtname, options):
    """Generate the standard metadata files for the dataset in `basedir`.

    Written under `<basedir>/metadata/` (created if missing):

    - one `<format>_filelist.yaml` per entry found in `<basedir>/audio/`;
    - groundtruth file(s), depending on `options.gttype`:
        * None: no groundtruth is generated;
        * 'dir': `groundtruth.yaml`, where the class of a point is the first
          '/'-separated component of its ID;
        * 'txt': `groundtruth.yaml`, where the class of a point is the
          stripped content of `metadata/<format>/<id>.txt`;
        * 'mdir': one `groundtruth_<c>.yaml` for every first-level component
          `c` that has a `not_<c>` or `not-<c>` sibling;
        * anything else: a warning is printed, nothing is generated;
    - `config.yaml`, referencing all of the above.

    Groundtruth is derived from the filelist of the last audio format, taken
    in sorted order so that the choice is reproducible.

    Args:
        basedir: path of the dataset root directory.
        gtname: name given to the generated groundtruth ('mdir' appends
            `_<class>` to it).
        options: object whose `gttype` attribute selects the groundtruth mode.
    """
    metadataDir = join(basedir, 'metadata')

    # make sure metadata folder exists (no shell involved, so paths with
    # quotes or other special characters are handled correctly)
    os.makedirs(metadataDir, exist_ok=True)

    # generate a filelist for each audio folder
    audioFormats = {}
    # these two keep the values of the last processed format; the defaults
    # cover the case where the audio/ folder is empty, so that the
    # groundtruth code below sees an empty filelist instead of failing on an
    # undefined name
    flist = {}
    audioFormat = None
    for folder in sorted(glob.glob(join(basedir, 'audio', '*'))):
        audioFormat = basename(folder)

        # every file is accepted and the filename itself is used as ID
        flist = generateMergeFilelist(join(basedir, 'audio', audioFormat),
                                      validFile=lambda x: True,
                                      filename2gid=lambda x: x)

        filelistName = '%s_filelist.yaml' % audioFormat
        # the context manager guarantees the file is flushed and closed
        with open(join(metadataDir, filelistName), 'w') as out:
            yaml.dump(flist, out)

        audioFormats[audioFormat] = {'filelist': filelistName}

    # generate groundtruth, if asked
    groundTruth = {}
    gttype = options.gttype

    if gttype == 'dir':
        # use the last filelist to get the GT, which should be independent of audio format
        # as it relies on points IDs only
        gt = GroundTruth(gtname)
        for pid in flist:
            gt[pid] = pid.split('/')[0]

        gt.save(join(metadataDir, 'groundtruth.yaml'))
        groundTruth[gtname] = 'groundtruth.yaml'

    elif gttype == 'txt':
        gt = GroundTruth(gtname)
        for pid in flist:
            # the class files are looked up in the folder named after the
            # last processed audio format
            gtfile = join(metadataDir, audioFormat, pid) + '.txt'
            with open(gtfile) as classFile:
                gt[pid] = classFile.read().strip()

        gt.save(join(metadataDir, 'groundtruth.yaml'))
        groundTruth[gtname] = 'groundtruth.yaml'

    elif gttype == 'mdir':
        # look for all the directories which can be paired in a XXX / not_XXX fashion
        # and create a groundtruth file for each of those
        topDirs = set(pid.split('/')[0] for pid in flist)
        mdirs = sorted(c for c in topDirs
                       if 'not_' + c in topDirs or 'not-' + c in topDirs)
        print('Found following possible classes', mdirs)

        for c in mdirs:
            # only keep those files which we are interested in for our specific subclass
            wanted = (c, 'not_' + c, 'not-' + c)
            gt = GroundTruth(gtname + '_' + c)
            for pid in flist:
                cls = pid.split('/')[0]
                if cls in wanted:
                    gt[pid] = cls

            gt.save(join(metadataDir, 'groundtruth_%s.yaml' % c))
            groundTruth['%s_%s' % (gtname, c)] = 'groundtruth_%s.yaml' % c

    elif gttype is not None:
        print('WARNING: unknown groundtruth type:', str(gttype))
        print('         not generating any groundtruth files...')

    # write the main config file
    config = {'version': 1.0,
              'audioFormats': audioFormats,
              'groundTruth': groundTruth
              }

    with open(join(metadataDir, 'config.yaml'), 'w') as out:
        yaml.dump(config, out)