def mergeDirectory(dirname, outputFilename, chunkSize, transfoFile, select=None, exclude=None):
    """Merge all the *.sig point files found under dirname into a single dataset.

    Walks dirname recursively, collects every '.sig' file (skipping the
    '.neq.sig' equalization leftovers), writes a temporary yaml filelist
    mapping point id -> full path, and delegates the actual merge to mergeAll().

    Args:
        dirname: root directory to scan for point files.
        outputFilename: path of the merged dataset to write.
        chunkSize: number of points per merge chunk (passed to mergeAll).
        transfoFile: transformation history file (passed to mergeAll).
        select, exclude: optional descriptor patterns forwarded to mergeAll.
    """
    idIsFullPath = False
    ext = '.sig'  # TODO: this should be more flexible

    # Fix: use a named function instead of a lambda assignment (PEP 8 E731),
    # and derive the suffixes from `ext` instead of duplicating the literal.
    def validFile(name):
        return name.endswith(ext) and not name.endswith('.neq' + ext)

    # find the list of all the points that should go into the dataset
    plist = {}
    for root, dirs, files in os.walk(dirname):
        for filename in filter(validFile, files):
            fullpath = os.path.join(root, filename)
            pid = fullpath if idIsFullPath else filename
            # remove extension from the point id
            pid = pid[:-len(ext)]
            plist[pid] = fullpath

    # write a temporary yaml filelist (should delete itself upon closing)
    import tempfile
    yamllist = tempfile.NamedTemporaryFile(mode='w+')
    fastyaml.dump(plist, yamllist)
    yamllist.flush()

    # call 'classic' merge function
    mergeAll(yamllist.name, outputFilename, chunkSize, transfoFile, select, exclude)
def launchMasterSlaves():
    """Spawn two local cyclops slaves plus one cyclopsmaster; return their pids.

    The master's configuration is written to /tmp/cyclops_unittest_config.yaml
    before it is launched. A 1 second pause follows every launch (presumably
    to let each process come up -- TODO confirm).
    """
    config = {
        'port': 8090,
        'slaves': [{'host': 'localhost', 'port': 8091},
                   {'host': 'localhost', 'port': 8092}],
    }
    yaml.dump(config, open('/tmp/cyclops_unittest_config.yaml', 'w'))

    def spawn(cmd):
        # capture the child's output so it does not pollute the test log
        return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).pid

    launched = []
    for slave in config['slaves']:
        launched.append(spawn(['cyclops', '-p', str(slave['port'])]))
        time.sleep(1)

    launched.append(spawn(['cyclopsmaster', '/tmp/cyclops_unittest_config.yaml']))
    time.sleep(1)

    return launched
def launchMasterSlaves():
    """Start two local cyclops slave processes and one cyclopsmaster.

    Writes the master's config to /tmp/cyclops_unittest_config.yaml and
    returns the list of spawned process ids (slaves first, master last).
    """
    config = {
        'port': 8090,
        'slaves': [
            {'host': 'localhost', 'port': 8091},
            {'host': 'localhost', 'port': 8092}
        ]
    }

    yaml.dump(config, open('/tmp/cyclops_unittest_config.yaml', 'w'))

    pids = []
    for slave in config['slaves']:
        # output is piped so the children stay quiet during tests
        pids += [subprocess.Popen([
                     'cyclops', '-p', str(slave['port'])
                 ], stdout = subprocess.PIPE, stderr = subprocess.PIPE).pid]
        # pause after each launch -- presumably to let the slave start up;
        # TODO confirm the reason for the fixed 1 second delay
        time.sleep(1)

    # the master reads the config file written above
    pids += [subprocess.Popen([
                 'cyclopsmaster', '/tmp/cyclops_unittest_config.yaml'
             ], stdout = subprocess.PIPE, stderr = subprocess.PIPE).pid]
    time.sleep(1)

    return pids
def convertJsonToSig(filelist_file, result_filelist_file): fl = yaml.load(open(filelist_file, 'r')) result_fl = fl errors = [] for trackid, json_file in fl.iteritems(): try: data = json.load(open(json_file)) # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts if 'tags' in data['metadata']: del data['metadata']['tags'] if 'sample_rate' in data['metadata']['audio_properties']: del data['metadata']['audio_properties']['sample_rate'] sig_file = os.path.splitext(json_file)[0] + '.sig' yaml.dump(data, open(sig_file, 'w')) result_fl[trackid] = sig_file except: errors += [json_file] yaml.dump(result_fl, open(result_filelist_file, 'w')) print "Failed to convert", len(errors), "files:" for e in errors: print e return len(errors) == 0
def mergeDirectory(dirname, outputFilename, chunkSize, transfoFile, select=None, exclude=None):
    """Collect every *.sig file under dirname and merge them via mergeAll().

    A temporary yaml filelist (point id -> full path) is generated on the fly
    and handed to the regular merge entry point.
    """
    idIsFullPath = False
    ext = '.sig'  # TODO: this should be more flexible

    def isPointFile(name):
        # keep plain '.sig' files, skip '.neq.sig' ones
        return name.endswith('.sig') and not name.endswith('.neq.sig')

    # gather all candidate point files, keyed by their point id
    plist = {}
    for root, dirs, files in os.walk(dirname):
        for filename in files:
            if not isPointFile(filename):
                continue
            fullpath = os.path.join(root, filename)
            pid = fullpath if idIsFullPath else filename
            # the point id is the name without its extension
            pid = pid[:-len(ext)]
            plist[pid] = fullpath

    # temporary yaml filelist; deletes itself upon closing
    import tempfile
    yamllist = tempfile.NamedTemporaryFile(mode='w+')
    fastyaml.dump(plist, yamllist)
    yamllist.flush()

    # delegate to the 'classic' merge function
    mergeAll(yamllist.name, outputFilename, chunkSize, transfoFile, select, exclude)
def convertJsonToSig(filelist_file, result_filelist_file):
    """Convert the json files listed in filelist_file into Gaia *.sig yaml files.

    The updated filelist (trackid -> sig path) is written to
    result_filelist_file. Returns True if every file was converted.
    """
    with open(filelist_file, 'r') as f:
        fl = yaml.load(f)
    result_fl = fl
    errors = []
    # fix: iteritems() does not exist on Python 3 (this block already uses
    # print() everywhere); items() behaves the same here on both versions
    for trackid, json_file in fl.items():
        try:
            with open(json_file) as f:
                data = json.load(f)
            # remove descriptors, that will otherwise break gaia_fusion due to
            # incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']
            if 'lossless' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['lossless']
            sig_file = os.path.splitext(json_file)[0] + '.sig'
            with open(sig_file, 'w') as f:
                yaml.safe_dump(data, f)
            result_fl[trackid] = sig_file
        except Exception:  # fix: bare except also swallowed KeyboardInterrupt/SystemExit
            errors += [json_file]

    with open(result_filelist_file, 'w') as f:
        yaml.dump(result_fl, f)

    print("Failed to convert", len(errors), "files:")
    for e in errors:
        print(e)
    return len(errors) == 0
def save(self, filename):
    """Serialize the confusion matrix (plus fold info) to a yaml file."""
    # the nested mappings are converted to "normal" dicts before saving
    plain_matrix = {actual: dict(row) for actual, row in self.matrix.items()}
    with open(filename, 'w') as f:
        yaml.dump({'matrix': plain_matrix, 'fold': self.folds}, f)
def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Run every n-fold evaluation in evalconfig on one dataset/groundtruth pair.

    Args:
        className: class name forced onto the loaded GroundTruth.
        outfilename: prefix for the per-evaluation '_<i>.param' / '_<i>.result' files.
        param: model parameter dict; reads 'classifier' and 'preprocessing'.
        dsname: path of the merged DataSet to load.
        gtname: path of the GroundTruth file to load.
        evalconfig: list of evaluation dicts; each reads 'nfold'.

    Any exception is logged together with the offending parameters, then re-raised.
    """
    try:
        classifier = param['classifier']
        gt = GroundTruth(classifier)
        gt.load(gtname)
        # force the GroundTruth class name to be the one specified by our project file, not
        # the one in the original groundtruth file
        gt.className = className

        ds = DataSet()
        ds.load(dsname)

        # some points may have failed to be analyzed, remove those from the GroundTruth
        pnames = ds.pointNames()
        for pid in list(gt.keys()):
            if pid not in pnames:
                log.warning(
                    'Removing %s from GroundTruth as it could not be found in the merged dataset'
                    % pid)
                del gt[pid]

        trainerFun, trainingparam, newds = getTrainer(classifier, param, ds)

        # run all the evaluations specified in the evaluation config
        for i, evalparam in enumerate(evalconfig):
            # if we already ran this evaluation, no need to run it again...
            resultFilename = outfilename + '_%d.result' % i
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info(
                'Running evaluation %d for: %s with classifier %s and dataset %s'
                % (i, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], ds, gt, trainerFun, **trainingparam)

            # write evaluation params & result
            with open(outfilename + '_%d.param' % i, 'w') as f:
                yaml.dump({'model': param, 'evaluation': evalparam}, f)
            confusion.save(resultFilename)

    except Exception:
        log.error(
            'While doing evaluation with param = %s\nevaluation = %s'
            % (param, evalconfig))
        raise
def evaluate_dataset(eval_job, dataset_dir, storage_dir):
    """Run one dataset evaluation job end to end.

    Marks the job running, writes the training filelist and groundtruth yaml
    files under dataset_dir/<job id>, trains a model via gaia_wrapper, stores
    the results in the database and marks the job done. DatabaseExceptions
    mark the job failed; the temporary low-level data directory is always
    removed.
    """
    db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_RUNNING)

    eval_location = os.path.join(os.path.abspath(dataset_dir), eval_job["id"])
    utils.path.create_path(eval_location)
    temp_dir = tempfile.mkdtemp()

    try:
        snapshot = db.dataset.get_snapshot(eval_job["snapshot_id"])
        # split the snapshot into train/test sets; filtering options come from the job
        train, test = artistfilter.filter(eval_job["snapshot_id"], eval_job["options"])
        db.dataset_eval.add_sets_to_job(eval_job["id"], train, test)

        logging.info("Generating filelist.yaml and copying low-level data for evaluation...")
        filelist_path = os.path.join(eval_location, "filelist.yaml")
        filelist = dump_lowlevel_data(train.keys(), temp_dir)
        with open(filelist_path, "w") as f:
            yaml.dump(filelist, f)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(eval_location, "groundtruth.yaml")
        with open(groundtruth_path, "w") as f:
            yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f)

        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            project_dir=eval_location,
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
        )

        logging.info("Saving results...")
        save_history_file(storage_dir, results["history_path"], eval_job["id"])
        db.dataset_eval.set_job_result(eval_job["id"], json.dumps({
            "project_path": eval_location,
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": results["history_path"],
        }))
        db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % eval_job["id"])

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % eval_job["id"])
        db.dataset_eval.set_job_status(
            job_id=eval_job["id"],
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)
    finally:
        # Clean up the source files used to generate this model.
        # We can recreate them from the database if we need them
        # at a later stage.
        shutil.rmtree(temp_dir)
def save(self, yamlfile):
    """Write this ground truth as a version-1.0 'singleClass' yaml document."""
    document = {
        'version': 1.0,
        'type': 'singleClass',
        'className': self.className,
        'groundTruth': dict(self),
    }
    with open(yamlfile, 'w') as f:
        yaml.dump(document, f)
def main(dirname, options):
    """Build groundtruth/filelist yamls for a class-per-directory project and train a model.

    `dirname` must contain one subdirectory per class, each holding json and/or
    sig descriptor files. Json files missing a sig counterpart are converted
    before training; all outputs land inside the project directory.
    """
    if os.path.isdir(dirname):
        print("running in dir", dirname)
        project_dir = os.path.abspath(dirname)
        projname = os.path.basename(project_dir)
    else:
        print("Invalid directory: " + dirname)
        sys.exit(2)

    # if config/results exist, need force to rm them
    project_file = os.path.join(project_dir, "%s.project" % projname)
    results_model_file = os.path.join(project_dir, "%s.history" % projname)
    resultsdir = os.path.join(project_dir, "results")
    datasetsdir = os.path.join(project_dir, "datasets")
    if os.path.exists(resultsdir):
        # fix: 'print >> sys.stderr, ...' is Python 2 syntax and a SyntaxError
        # in this otherwise print()-based module; use the file= keyword instead
        print("Results directory already exists. Use -f to delete and re-run",
              file=sys.stderr)
        return

    # every subdirectory of the project dir is treated as a class label
    classes = [d for d in os.listdir(project_dir)
               if os.path.isdir(os.path.join(project_dir, d))]
    print(classes)

    groundtruth_name = os.path.join(project_dir, "groundtruth.yaml")
    json_name = os.path.join(project_dir, "filelist.yaml")
    yaml_name = os.path.join(project_dir, "filelist-yaml.yaml")

    filelist = {}
    groundtruth = template
    missingsig = False
    for c in classes:
        files = get_files_in_dir(os.path.join(project_dir, c), "json")
        yamlfiles = get_files_in_dir(os.path.join(project_dir, c), "sig")
        if len(files) != len(yamlfiles):
            # at least one json file has no converted .sig counterpart yet
            missingsig = True
        print("got", len(files), "files in", c)
        for f in files:
            id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][id] = c
            filelist[id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    yaml.dump(filelist, open(json_name, "w"))
    yaml.dump(groundtruth, open(groundtruth_name, "w"))
    if missingsig:
        print("converting sig")
        json_to_sig.convertJsonToSig(json_name, yaml_name)

    # run
    train_model.trainModel(groundtruth_name, yaml_name, project_file, project_dir, results_model_file)
def main(dirname, options):
    """Build groundtruth/filelist yamls from a class-per-directory project and train a model.

    `dirname` is expected to contain one subdirectory per class, each holding
    json and/or sig descriptor files. Json files missing a sig counterpart are
    converted before training. (Python 2 code: uses print statements.)
    """
    print "running in dir", dirname
    project_dir = os.path.abspath(dirname)
    projname = os.path.basename(dirname)

    # if config/results exist, need force to rm them
    project_file = os.path.join(project_dir, "%s.project" % projname)
    results_model_file = os.path.join(project_dir, "%s.history" % projname)
    resultsdir = os.path.join(project_dir, "results")
    datasetsdir = os.path.join(project_dir, "datasets")
    if os.path.exists(resultsdir):
        print >> sys.stderr, "Results directory already exists. Use -f to delete and re-run"
        return

    # every subdirectory of the project dir is treated as a class label
    classes = [d for d in os.listdir(project_dir) \
               if os.path.isdir(os.path.join(project_dir, d))]
    print classes

    groundtruth_name = os.path.join(project_dir, "groundtruth.yaml")
    json_name = os.path.join(project_dir, "filelist.yaml")
    yaml_name = os.path.join(project_dir, "filelist-yaml.yaml")

    filelist = {}
    # NOTE(review): `template` appears to be a module-level dict mutated in
    # place here, so repeated calls would share state -- confirm intended.
    groundtruth = template
    missingsig = False
    for c in classes:
        files = get_files_in_dir(os.path.join(project_dir, c), "json")
        yamlfiles = get_files_in_dir(os.path.join(project_dir, c), "sig")
        if len(files) != len(yamlfiles):
            # at least one json file has no converted .sig counterpart yet
            missingsig = True
        print "got", len(files), "files in", c
        for f in files:
            id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][id] = c
            filelist[id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    yaml.dump(filelist, open(json_name, "w"))
    yaml.dump(groundtruth, open(groundtruth_name, "w"))
    if missingsig:
        print "converting sig"
        json_to_sig.convertJsonToSig(json_name, yaml_name)

    # run
    train_model.trainModel(groundtruth_name, yaml_name, project_file, project_dir, results_model_file)
def generateProjectFromCollection(): parser = OptionParser( usage='%prog [options] collection_name sigfiles_dir project_file\n\n' + 'this will also generate a groundtruth and a filelist file to be used by the project file.' ) parser.add_option( '-g', '--groundtruth', dest='desiredGroundTruth', help= 'Which type of ground truth to use, in case the collection has more than one' ) options, args = parser.parse_args() try: collection_name = args[0] sigfiles_dir = args[1] project_file = args[2] except: parser.print_help() sys.exit(1) # create collection from a directory collection_name if it exists, use an MTG-DB collection otherwise if os.path.isdir(collection_name): collec = gaia2.mtgdb.Collection(collection_name, groundTruth=options.desiredGroundTruth) else: collec = gaia2.mtgdb.MtgdbCollection( collection_name, groundTruth=options.desiredGroundTruth) # write yaml file of sigfiles to merge for this project filelistFilename = abspath(splitext(project_file)[0] + '.filelist.yaml') sigfileList = sigfileListFromCollection(collec, sigfiles_dir) with open(filelistFilename, 'w') as filelist: yaml.dump(sigfileList, filelist) # write the project file with open(project_file, 'w') as pfile: pfile.write( PROJECT_TEMPLATE % { 'className': collec.groundTruth.className, 'filelist': filelistFilename, 'groundtruth': abspath(collec._groundTruthFile) }) print 'Successfully written', project_file
def run(self, className, outfilename, param, dsname, gtname, evalconfig):
    """Load dataset + ground truth and run each configured n-fold evaluation.

    Results land in '<outfilename>_<i>.result' / '.param'; evaluations whose
    result file already exists are skipped. Exceptions are logged with the
    offending parameters and re-raised.
    """
    try:
        classifierName = param['classifier']
        groundTruth = GroundTruth(classifierName)
        groundTruth.load(gtname)
        # force the GroundTruth class name to be the one specified by our
        # project file, not the one in the original groundtruth file
        groundTruth.className = className

        dataset = DataSet()
        dataset.load(dsname)

        # some points may have failed to be analyzed, remove those from the GroundTruth
        availablePoints = dataset.pointNames()
        for pid in list(groundTruth.keys()):
            if pid in availablePoints:
                continue
            log.warning('Removing %s from GroundTruth as it could not be found in the merged dataset' % pid)
            del groundTruth[pid]

        trainerFun, trainingparam, newds = getTrainer(classifierName, param, dataset)

        # run all the evaluations specified in the evaluation config
        for i, evalparam in enumerate(evalconfig):
            resultFilename = outfilename + '_%d.result' % i
            # if we already ran this evaluation, no need to run it again...
            if exists(resultFilename):
                log.info('File %s already exists. Skipping evaluation...' % resultFilename)
                continue

            log.info('Running evaluation %d for: %s with classifier %s and dataset %s'
                     % (i, outfilename, param['classifier'], param['preprocessing']))
            log.info(' PID: %d, parameters: %s' % (os.getpid(), json.dumps(param)))

            # run evaluation
            confusion = evaluateNfold(evalparam['nfold'], dataset, groundTruth,
                                      trainerFun, **trainingparam)

            # write evaluation params & result
            with open(outfilename + '_%d.param' % i, 'w') as f:
                yaml.dump({'model': param, 'evaluation': evalparam}, f)
            confusion.save(resultFilename)

    except Exception:
        log.error('While doing evaluation with param = %s\nevaluation = %s'
                  % (param, evalconfig))
        raise
def __call__(self, *args, **kwargs):
    """Serialize a yaml-RPC request and POST it to the configured endpoint.

    Positional arguments become the request's 'params'; keyword arguments are
    not supported. Raises RuntimeError when serialization or the HTTP request
    fails. (Python 2 code: httplib/urllib, `except Exception, e`. The visible
    code ends after the request is sent -- response handling is not shown in
    this view.)
    """
    if kwargs:
        raise NotImplementedError('Cannot use keyword arguments with YamlRPC at the moment...')

    if VERBOSE:
        serializeStart = time.time()

    try:
        # 'id' is a fixed placeholder string for the request
        q = yaml.dump({'method': self.methodName, 'params': list(args), 'id': 'gloubi-boulga'})
    except:
        raise RuntimeError('Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s' % list(args))

    if VERBOSE:
        responseTime = time.time() - serializeStart
        print 'serialized request in %f seconds' % responseTime

    # we don't want the '+'-quoting
    params = urllib.urlencode({'q': q}).replace('+', ' ')
    headers = {'Content-type': 'application/x-www-form-urlencoded',
               'Accept': 'text/plain'}

    if VERBOSE:
        startTime = time.time()

    conn = httplib.HTTPConnection(self.endPoint)
    try:
        conn.request('POST', '/', params, headers)
    except Exception, e:
        raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)
def convertJsonToSig():
    """Command-line entry point: convert json descriptor files to Gaia *.sig yamls.

    Reads a yaml filelist (trackid -> json path) given as the first argument,
    writes each converted file next to its source with a '.sig' extension,
    dumps the updated filelist to the second argument, and returns the number
    of files that failed. (Python 2 code: iteritems(), print statements.)
    """
    parser = OptionParser(usage = '%prog [options] filelist_file result_filelist_file\n' +
"""
Converts json files found in filelist_file into *.sig yaml files compatible with Gaia.
The result files are written to the same directory where original files were located.
""")
    options, args = parser.parse_args()

    try:
        filelist_file = args[0]
        result_filelist_file = args[1]
    except:
        # not enough positional arguments
        parser.print_help()
        sys.exit(1)

    fl = yaml.load(open(filelist_file, 'r'))
    result_fl = fl
    errors = []
    for trackid, json_file in fl.iteritems():
        try:
            data = json.load(open(json_file))
            # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts
            if 'tags' in data['metadata']:
                del data['metadata']['tags']
            if 'sample_rate' in data['metadata']['audio_properties']:
                del data['metadata']['audio_properties']['sample_rate']
            sig_file = os.path.splitext(json_file)[0] + '.sig'
            yaml.dump(data, open(sig_file, 'w'))
            result_fl[trackid] = sig_file
        except:
            # best-effort: any failure just records the file and moves on
            errors += [json_file]

    yaml.dump(result_fl, open(result_filelist_file, 'w'))

    print "Failed to convert", len(errors), "files:"
    for e in errors:
        print e
    # NOTE: returns the error *count* (truthy on failure), unlike the library
    # variant of this function elsewhere in the codebase which returns a bool
    return len(errors)
def evaluate_dataset(eval_job):
    """Run a single dataset evaluation job inside a temporary directory.

    Dumps the filelist and groundtruth yamls, trains a model via gaia_wrapper,
    stores the results in the database and updates the job status.
    DatabaseExceptions mark the job failed; the temporary directory is always
    removed.
    """
    db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_RUNNING)

    temp_dir = tempfile.mkdtemp()

    try:
        dataset = db.dataset.get(eval_job["dataset_id"])

        logging.info("Generating filelist.yaml and copying low-level data for evaluation...")
        filelist_path = os.path.join(temp_dir, "filelist.yaml")
        filelist = dump_lowlevel_data(extract_recordings(dataset), os.path.join(temp_dir, "data"))
        with open(filelist_path, "w") as f:
            yaml.dump(filelist, f)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(temp_dir, "groundtruth.yaml")
        with open(groundtruth_path, "w") as f:
            yaml.dump(create_groundtruth(dataset), f)

        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
            project_dir=temp_dir,
        )

        logging.info("Saving results...")
        save_history_file(results["history_path"], eval_job["id"])
        db.dataset_eval.set_job_result(eval_job["id"], json.dumps({
            "parameters": results["parameters"],
            "accuracy": results["accuracy"],
            "confusion_matrix": results["confusion_matrix"],
            "history_path": results["history_path"],
        }))
        db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % eval_job["id"])

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % eval_job["id"])
        db.dataset_eval.set_job_status(
            job_id=eval_job["id"],
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)
    finally:
        shutil.rmtree(temp_dir)  # Cleanup
def convertJsonToSig(): parser = OptionParser( usage='%prog [options] filelist_file result_filelist_file\n' + """ Converts json files found in filelist_file into *.sig yaml files compatible with Gaia. The result files are written to the same directory where original files were located. """) options, args = parser.parse_args() try: filelist_file = args[0] result_filelist_file = args[1] except: parser.print_help() sys.exit(1) fl = yaml.load(open(filelist_file, 'r')) result_fl = fl errors = [] for trackid, json_file in fl.iteritems(): try: data = json.load(open(json_file)) # remove descriptors, that will otherwise break gaia_fusion due to incompatibility of layouts if 'tags' in data['metadata']: del data['metadata']['tags'] if 'sample_rate' in data['metadata']['audio_properties']: del data['metadata']['audio_properties']['sample_rate'] sig_file = os.path.splitext(json_file)[0] + '.sig' yaml.dump(data, open(sig_file, 'w')) result_fl[trackid] = sig_file except: errors += [json_file] yaml.dump(result_fl, open(result_filelist_file, 'w')) print "Failed to convert", len(errors), "files:" for e in errors: print e return len(errors)
def __call__(self, *args, **kwargs):
    """Perform a yaml-RPC call: serialize, POST to the endpoint, parse the reply.

    Positional arguments become the request's 'params'; keyword arguments are
    not supported. Returns the 'result' field of the server response; raises
    RuntimeError on serialization, transport, or server-reported errors.
    """
    if kwargs:
        raise NotImplementedError('Cannot use keyword arguments with YamlRPC at the moment...')

    if VERBOSE:
        serializeStart = time.time()

    try:
        q = yaml.dump({'method': self.methodName,
                       'params': list(args),
                       'id': 'gloubi-boulga'})
    except Exception:  # fix: bare except also caught KeyboardInterrupt/SystemExit
        raise RuntimeError('Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s' % list(args))

    if VERBOSE:
        responseTime = time.time() - serializeStart
        print ('serialized request in %f seconds' % responseTime)

    # we don't want the '+'-quoting
    params = urlencode({'q': q}).replace('+', ' ')
    headers = {'Content-type': 'application/x-www-form-urlencoded',
               'Accept': 'text/plain'}

    if VERBOSE:
        startTime = time.time()

    conn = http_client.HTTPConnection(self.endPoint)
    try:
        conn.request('POST', '/', params, headers)
    except Exception as e:
        raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)

    response = conn.getresponse()

    if VERBOSE:
        responseTime = time.time() - startTime
        print ('received answer in %f seconds' % responseTime)
        #print response.status, response.reason
        startParseTime = time.time()

    # NOTE(review): yaml.load on data received from the network is unsafe with
    # the default Loader (it can construct arbitrary objects); consider
    # yaml.safe_load if the server payload allows it.
    result = yaml.load(response.read())

    if VERBOSE:
        responseTime = time.time() - startParseTime
        print ('parsed answer in %f seconds' % responseTime)
        responseTime = time.time() - serializeStart
        print ('total time: %f seconds' % responseTime)

    if 'error' in result:
        raise RuntimeError(result['error']['message'])

    return result['result']
def lowlevel_data_to_yaml(data):
    """Strip layout-breaking descriptors from a recording's low-level data and
    dump the result as a YAML string.
    """
    # These descriptors would otherwise break gaia_fusion due to
    # incompatibility of layouts (see Gaia implementation for more details).
    metadata = data["metadata"]
    metadata.pop("tags", None)
    audio_properties = metadata["audio_properties"]
    audio_properties.pop("sample_rate", None)
    audio_properties.pop("lossless", None)
    return yaml.dump(data)
def generateProjectFromCollection():
    """Command-line entry point: generate a Gaia project file from a collection.

    Also writes the '<project>.filelist.yaml' that the project file points to.
    (Python 2 code: ends with a print statement.)
    """
    parser = OptionParser(usage = '%prog [options] collection_name sigfiles_dir project_file\n\n' +
                          'this will also generate a groundtruth and a filelist file to be used by the project file.')
    parser.add_option('-g', '--groundtruth', dest = 'desiredGroundTruth',
                      help = 'Which type of ground truth to use, in case the collection has more than one')
    options, args = parser.parse_args()

    try:
        collection_name = args[0]
        sigfiles_dir = args[1]
        project_file = args[2]
    except:
        # not enough positional arguments
        parser.print_help()
        sys.exit(1)

    # create collection from a directory collection_name if it exists, use an MTG-DB collection otherwise
    if os.path.isdir(collection_name):
        collec = gaia2.mtgdb.Collection(collection_name, groundTruth = options.desiredGroundTruth)
    else:
        collec = gaia2.mtgdb.MtgdbCollection(collection_name, groundTruth = options.desiredGroundTruth)

    # write yaml file of sigfiles to merge for this project
    filelistFilename = abspath(splitext(project_file)[0] + '.filelist.yaml')
    sigfileList = sigfileListFromCollection(collec, sigfiles_dir)
    with open(filelistFilename, 'w') as filelist:
        yaml.dump(sigfileList, filelist)

    # write the project file by filling in the module-level template
    with open(project_file, 'w') as pfile:
        pfile.write(PROJECT_TEMPLATE % { 'className': collec.groundTruth.className,
                                         'filelist': filelistFilename,
                                         'groundtruth': abspath(collec._groundTruthFile) })

    print 'Successfully written', project_file
def __call__(self, *args, **kwargs):
    """Serialize a yaml-RPC request and POST it to self.endPoint.

    Positional arguments become the request's 'params'; keyword arguments are
    rejected. Raises RuntimeError when serialization or the HTTP request
    fails. (Python 2 code: httplib/urllib, `except Exception, e`. The visible
    code ends after the request is sent -- response handling is not shown in
    this view.)
    """
    if kwargs:
        raise NotImplementedError(
            'Cannot use keyword arguments with YamlRPC at the moment...')

    if VERBOSE:
        serializeStart = time.time()

    try:
        # 'id' is a fixed placeholder string for the request
        q = yaml.dump({
            'method': self.methodName,
            'params': list(args),
            'id': 'gloubi-boulga'
        })
    except:
        raise RuntimeError(
            'Could not serialize Yaml request, most likely one of the arguments could not be serialized:\n%s'
            % list(args))

    if VERBOSE:
        responseTime = time.time() - serializeStart
        print 'serialized request in %f seconds' % responseTime

    # we don't want the '+'-quoting
    params = urllib.urlencode({'q': q}).replace('+', ' ')
    headers = {
        'Content-type': 'application/x-www-form-urlencoded',
        'Accept': 'text/plain'
    }

    if VERBOSE:
        startTime = time.time()

    conn = httplib.HTTPConnection(self.endPoint)
    try:
        conn.request('POST', '/', params, headers)
    except Exception, e:
        raise RuntimeError('request failed', self.endPoint, self.methodName, args, e)
def save(self, filename):
    """Dump the confusion matrix to a yaml file."""
    # nested mappings are converted to "normal" dicts before saving so the
    # yaml output stays clean
    plain = {actual: dict(row) for actual, row in list(self.matrix.items())}
    with open(filename, 'w') as f:
        yaml.dump(plain, f)
def save(self, filename):
    """Serialize self.matrix to `filename` as yaml.

    The matrix is flattened to plain nested dicts first (the inner values are
    presumably defaultdicts or similar mappings -- TODO confirm against the
    class definition).
    """
    # convert to "normal" dicts before saving
    data = dict((k, dict(v)) for k, v in self.matrix.items())
    with open(filename, 'w') as f:
        yaml.dump(data, f)
    ds_harm_proc.save(ds_harm_filename)  # tail of a function whose definition starts above this chunk


if __name__ == '__main__':
    # load the collections we will work on
    c = loadCollections()

    try:
        os.mkdir(WORK_DIR)
    except OSError:
        pass  # work dir already exists

    # need to do some prep work before to harmonize all datasets layouts. This won't be
    # necessary anymore in the future when all is nicely generated with a single coherent
    # script, but at the moment we have to work with the data we have...
    harmonizeDatasets(c)

    cachedFolds = False  # NOTE(review): assigned but never read below
    foldsFile = '%s/folds.yaml' % WORK_DIR
    if os.path.exists(foldsFile):
        # reuse previously generated folds
        folds = yaml.loadfile(foldsFile)
    else:
        print('Generating folds for all collections...')
        folds = generateFolds(c, NFOLDS)
        yaml.dump(folds, open(foldsFile, 'w'))

    print('Training SVM models for their corresponding folds...')
    trainSVMfolds(c, folds)

    print('Generating the evaluation datasets from the models...')
    generateEvaluationDatasets(c, folds)
    return result  # tail of a function whose definition starts above this chunk


if __name__ == '__main__':
    # usage: script.py results_dir [classifierType]
    try:
        resultsdir = sys.argv[1]
    except:
        print 'Usage: %s results_dir [classifierType]' % sys.argv[0]
        exit(1)

    try:
        classifierType = sys.argv[2]
    except:
        classifierType = None  # optional: no classifier-type filtering

    cr = ClassificationResults()
    print 'Loading all results...'
    cr.readResults(resultsdir)

    print 'Best parameters:'
    # show the 10 best results, optionally restricted to one classifier type
    for r, filename, params in cr.best(10, classifierType):
        print '*'*100
        print 'Correct classification: %2f%%' % r
        print 'Filename:', filename
        model = params['model']
        print 'Classifier:', model['classifier']
        print 'Parameters:'
        del model['classifier']
        # indent the yaml dump; the [:-4] slice drops the indent that
        # .replace() appends after the final newline (indent width
        # reconstructed from the collapsed source -- TODO confirm)
        print '    ' + yaml.dump(model).replace('\n', '\n    ')[:-4]
if __name__ == '__main__': c = loadCollections() try: os.mkdir(WORK_DIR) except OSError: pass # need to do some prep work before to harmonize all datasets layouts. This won't be # necessary anymore in the future when all is nicely generated with a single coherent # script, but at the moment we have to work with the data we have... harmonizeDatasets(c) cachedFolds = False foldsFile = '%s/folds.yaml' % WORK_DIR if os.path.exists(foldsFile): folds = yaml.loadfile(foldsFile) else: print 'Generating folds for all collections...' folds = generateFolds(c, NFOLDS) yaml.dump(folds, open(foldsFile, 'w')) print 'Training SVM models for their corresponding folds...' trainSVMfolds(c, folds) print 'Generating the evaluation datasets from the models...' generateEvaluationDatasets(c, folds)
def evaluate_dataset(eval_job, dataset_dir, storage_dir):
    """Run one dataset evaluation job, honouring user-supplied SVM options.

    Same flow as the basic evaluator: mark running, dump filelist/groundtruth
    yamls under dataset_dir/<job id>, train via gaia_wrapper, store results,
    mark done. Additionally forwards the job's optional c_values /
    gamma_values / preprocessing_values to the trainer. DatabaseExceptions
    mark the job failed; the temporary directory is always removed.
    """
    db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_RUNNING)

    eval_location = os.path.join(os.path.abspath(dataset_dir), eval_job["id"])
    utils.path.create_path(eval_location)
    temp_dir = tempfile.mkdtemp()

    try:
        snapshot = db.dataset.get_snapshot(eval_job["snapshot_id"])
        # split the snapshot into train/test sets; filtering options come from the job
        train, test = artistfilter.filter(eval_job["snapshot_id"], eval_job["options"])
        db.dataset_eval.add_sets_to_job(eval_job["id"], train, test)

        logging.info(
            "Generating filelist.yaml and copying low-level data for evaluation..."
        )
        filelist_path = os.path.join(eval_location, "filelist.yaml")
        filelist = dump_lowlevel_data(train.keys(), temp_dir)
        with open(filelist_path, "w") as f:
            yaml.dump(filelist, f)

        logging.info("Generating groundtruth.yaml...")
        groundtruth_path = os.path.join(eval_location, "groundtruth.yaml")
        with open(groundtruth_path, "w") as f:
            yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f)

        # Passing more user preferences to train the model.
        logging.info("Training model...")
        results = gaia_wrapper.train_model(
            project_dir=eval_location,
            groundtruth_file=groundtruth_path,
            filelist_file=filelist_path,
            c_values=eval_job["options"].get("c_values", []),
            gamma_values=eval_job["options"].get("gamma_values", []),
            preprocessing_values=eval_job["options"].get(
                "preprocessing_values", []),
        )

        logging.info("Saving results...")
        save_history_file(storage_dir, results["history_path"], eval_job["id"])
        db.dataset_eval.set_job_result(
            eval_job["id"],
            json.dumps({
                "project_path": eval_location,
                "parameters": results["parameters"],
                "accuracy": results["accuracy"],
                "confusion_matrix": results["confusion_matrix"],
                "history_path": results["history_path"],
            }))
        db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE)
        logging.info("Evaluation job %s has been completed." % eval_job["id"])

    # TODO(roman): Also need to catch exceptions from Gaia.
    except db.exceptions.DatabaseException as e:
        logging.info("Evaluation job %s has failed!" % eval_job["id"])
        db.dataset_eval.set_job_status(
            job_id=eval_job["id"],
            status=db.dataset_eval.STATUS_FAILED,
            status_msg=str(e),
        )
        logging.info(e)
    finally:
        # Clean up the source files used to generate this model.
        # We can recreate them from the database if we need them
        # at a later stage.
        shutil.rmtree(temp_dir)
def save(self, yamlfile):
    """Write this ground truth to `yamlfile` as a version-1.0 'singleClass' document.

    dict(self) suggests the class itself is a mapping (presumably point id ->
    label -- TODO confirm against the class definition).
    """
    with open(yamlfile, 'w') as f:
        yaml.dump({
            'version': 1.0,
            'type': 'singleClass',
            'className': self.className,
            'groundTruth': dict(self)
        }, f)
    return result  # tail of a function whose definition starts above this chunk


if __name__ == '__main__':
    # usage: script.py results_dir [classifierType]
    try:
        resultsdir = sys.argv[1]
    except:
        print ('Usage: %s results_dir [classifierType]' % sys.argv[0])
        exit(1)

    try:
        classifierType = sys.argv[2]
    except:
        classifierType = None  # optional: no classifier-type filtering

    cr = ClassificationResults()
    print ('Loading all results...')
    cr.readResults(resultsdir)

    print ('Best parameters:')
    # show the 10 best results, optionally restricted to one classifier type
    for r, filename, params in cr.best(10, classifierType):
        print ('*'*100)
        print ('Correct classification: %2f%%' % r)
        print ('Filename:', filename)
        model = params['model']
        print ('Classifier:', model['classifier'])
        print ('Parameters:')
        del model['classifier']
        # indent the yaml dump; the [:-4] slice drops the indent that
        # .replace() appends after the final newline (indent width
        # reconstructed from the collapsed source -- TODO confirm)
        print ('    ' + yaml.dump(model).replace('\n', '\n    ')[:-4])
def main(input_directory, output_directory, project_name, force=False, seed=None,
         cluster_mode=False, force_consistency=False):
    """Prepare groundtruth/filelist yamls from a class-per-directory layout and train a model.

    Args:
        input_directory: directory with one subdirectory per class, each
            holding json and/or sig descriptor files.
        output_directory: where the project, history and results files go.
        project_name: base name for the project/history files and class name.
        force: remove any existing output_directory before starting.
        seed, cluster_mode, force_consistency: forwarded to train_model.train_model.
    """
    print("looking for data in dir", input_directory)
    print("storing results in dir", output_directory)

    project_dir = os.path.abspath(input_directory)
    projname = project_name
    output_dir = os.path.abspath(output_directory)

    # if config/results exist, need force to rm them
    project_file = os.path.join(output_dir, "%s.project" % projname)
    results_model_file = os.path.join(output_dir, "%s.history" % projname)
    resultsdir = os.path.join(output_dir, "results")

    if force:
        shutil.rmtree(output_directory, ignore_errors=True)
    if not os.path.exists(resultsdir):
        os.makedirs(resultsdir)

    # every subdirectory of the input dir is treated as a class label
    classes = [d for d in os.listdir(project_dir)
               if os.path.isdir(os.path.join(project_dir, d))]
    print(classes)

    groundtruth_name = os.path.join(resultsdir, "groundtruth.yaml")
    json_name = os.path.join(resultsdir, "filelist-to-convert.yaml")
    yaml_name = os.path.join(resultsdir, "filelist-yaml.yaml")

    filelist = {}
    groundtruth = template
    for c in classes:
        jsonfiles = get_files_in_dir(os.path.join(project_dir, c), "json")
        yamlfiles = get_files_in_dir(os.path.join(project_dir, c), "sig")
        # fix: str.rstrip('.sig') strips a *character set* ('.', 's', 'i', 'g'),
        # not a suffix -- e.g. 'bass.sig'.rstrip('.sig') == 'ba' -- which made
        # the already-converted check below silently wrong. Use splitext.
        yamlfilesNoExt = [os.path.splitext(f)[0] for f in yamlfiles]
        if len(jsonfiles) > 0:
            filesToConvert = {
                os.path.splitext(os.path.basename(f))[0]: os.path.join(project_dir, c, f)
                for f in jsonfiles
                if os.path.splitext(f)[0] not in yamlfilesNoExt
            }
            print("{} json files have to be converted into yamls. "
                  "{} already exist.".format(len(filesToConvert), len(yamlfiles)))
            yaml.dump(filesToConvert, open(json_name, "w"))
            json_to_sig.convertJsonToSig(json_name, yaml_name)
            # re-scan: the conversion just produced new .sig files
            yamlfiles = get_files_in_dir(os.path.join(project_dir, c), "sig")

        print("got", len(yamlfiles), "files in", c)
        for f in yamlfiles:
            id = os.path.splitext(os.path.basename(f))[0]
            groundtruth["groundTruth"][id] = c
            filelist[id] = os.path.join(project_dir, c, f)

    # check directories for sig and convert
    groundtruth["className"] = projname
    yaml.dump(filelist, open(yaml_name, "w"))
    yaml.dump(groundtruth, open(groundtruth_name, "w"))
    if os.path.exists(json_name):
        os.remove(json_name)

    train_model.train_model(groundtruth_name, yaml_name, project_file, resultsdir,
                            results_model_file, seed=seed, cluster_mode=cluster_mode,
                            force_consistency=force_consistency)