def _collectClassifierArgs(paramSets):
    """Merge parameter dicts into one dict mapping each key to the list of its values."""
    merged = {}
    for paramSet in paramSets:
        for key in paramSet:
            merged.setdefault(key, []).append(paramSet[key])
    return merged


def _buildCurvePointCommand(xPath, yPath, metaPath, outPath, cutoff, classifierName,
                            classifierArgs, cls, verbose, parallel, preDispatch, randomize):
    """Return the shell command line that runs curvePoint.py for one curve point."""
    # Function-scope import so the block works on both Python 3 (shlex) and
    # Python 2 (pipes) without touching the file's import section.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # Paths are shell-quoted so that spaces or metacharacters cannot split or
    # alter the command; paths made of ordinary characters are left unchanged.
    parts = [
        "python curvePoint.py",
        " -X " + quote(xPath),
        " -y " + quote(yPath),
        " -m " + quote(metaPath),
        " -o " + quote(outPath),
        " --cutoff " + str(cutoff),
        " --classifier " + classifierName,
        " --classifierArgs \"" + str(classifierArgs) + "\"",
        " --iteratorCV " + cls["cv"],
        " --numFolds " + str(cls["folds"]),
        " --verbose " + str(verbose),
        " --parallel " + str(parallel),
        " --preDispatch \"" + str(preDispatch) + "\"",
    ]
    if randomize:
        parts.append(" --randomize ")
    parts.append(" --metric " + cls["metric"])
    return "".join(parts)


def process(database, inputMetaPath, resultBaseDir, cutoff=50, verbose=3, parallel=1, preDispatch='2*n_jobs', randomize=False, limit=1, debug=False, dummy=False, rerun=None, hideFinished=False, slurm=False):
    """Submit one curvePoint.py batch job per growing feature subset.

    The metadata loaded from ``inputMetaPath`` supplies the experiment
    (``meta["experiment"]``), the features (``meta["features"]``, each entry
    providing ``"id"`` and ``"rank"``), the best result
    (``meta["results"]["best"]`` with ``"classifier"``, ``"cv"``, ``"folds"``
    and ``"metric"``) and the parameter sets of all results
    (``meta["results"]["all"]``). Features are visited in the iteration order
    of ``meta["features"]``; each one is appended to a cumulative feature set
    and, once that set holds more than one feature, a job is submitted.

    Args:
        database: Passed to ``cache.getExperiment``.
        inputMetaPath: Metadata file; read here and also handed to
            curvePoint.py via ``-m``.
        resultBaseDir: Directory under which ``cache``, ``results`` and
            ``jobs`` subdirectories are created.
        cutoff: The loop stops once more than ``cutoff`` features have been
            visited.
        verbose, parallel, preDispatch, randomize: Forwarded to curvePoint.py
            on its command line.
        limit: Passed to ``batch.waitForJobs`` before each feature
            (presumably the maximum number of queued jobs -- confirm).
        debug, slurm: Passed to ``batch.getConnection``.
        dummy, rerun, hideFinished: Passed to ``batch.submitJob``.

    Raises:
        KeyError: If the best result's classifier has no entry in the
            classifier name map, or an expected metadata key is missing.
    """
    meta = result.getMeta(inputMetaPath)
    connection = batch.getConnection(slurm, debug)

    makeDir(resultBaseDir)
    cacheDir = makeDir(os.path.join(resultBaseDir, "cache"))
    resultDir = makeDir(os.path.join(resultBaseDir, "results"))
    jobDir = makeDir(os.path.join(resultBaseDir, "jobs"))

    # Only the X and y paths are used below; curvePoint.py receives the
    # original inputMetaPath rather than the cached metadata path.
    baseXPath, baseYPath, baseMetaPath = cache.getExperiment(
        experiment=meta["experiment"]["name"],
        experimentOptions=meta["experiment"]["options"],
        database=database,
        writer="writeNumpyText",
        useCached=True,
        cacheDir=cacheDir)

    features = meta["features"]
    cls = meta["results"]["best"]
    classifierArgs = _collectClassifierArgs(
        [x["params"] for x in meta["results"]["all"]])

    # Short and qualified classifier names both resolve to the qualified name.
    classifierNameMap = {
        "LinearSVC": "svm.LinearSVC",
        "svm.LinearSVC": "svm.LinearSVC",
        "ExtraTreesClassifier": "ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier": "ensemble.ExtraTreesClassifier",
        "RLScore": "RLScore",
        "RFEWrapper": "svm.LinearSVC",
    }
    classifierName = classifierNameMap[cls["classifier"]]

    count = 0
    featureSet = []
    submitCount = 0
    sleepTime = 15
    for featureName in features:
        feature = features[featureName]
        batch.waitForJobs(limit, submitCount, connection, sleepTime)
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Processing feature %s" % (featureName,))
        print(feature)
        featureSet.append(feature["id"])
        jobName = "_".join([
            meta["experiment"]["name"],
            meta["template"]["project"],
            classifierName,
            "feature-" + str(feature["rank"]),
        ])
        pointResultPath = os.path.join(resultDir, jobName + ".json")
        print("Feature set %s" % (featureSet,))
        # A set holding only the first feature is not submitted as a job.
        if len(featureSet) > 1:
            command = _buildCurvePointCommand(
                baseXPath, baseYPath, inputMetaPath, pointResultPath, count,
                classifierName, classifierArgs, cls, verbose, parallel,
                preDispatch, randomize)
            if batch.submitJob(command, connection, jobDir, jobName, dummy, rerun, hideFinished):
                submitCount += 1
        # NOTE(review): the counter update and cutoff check are taken to be at
        # loop level (outside the branch above) -- confirm against the intended
        # nesting.
        count += 1
        if count > cutoff:
            break
# Command-line entry point: run Greedy RLS feature selection and write the results.
# Options not declared here (experiment, options, database, writer) are expected
# to come from the parent parser `exampleOptions`.
parser = argparse.ArgumentParser(parents=[exampleOptions], description='Feature selection with Greedy RLS')
parser.add_argument('-x','--features', help='Input file for feature vectors (X)', default=None)
parser.add_argument('-y','--labels', help='Input file for class labels (Y)', default=None)
parser.add_argument('-m','--meta', help='Metadata input file name (optional)', default=None)
parser.add_argument('--noCache', help='Do not use cache', default=False, action="store_true")
# NOTE(review): --cacheDir is parsed but not forwarded to getExperiment below;
# confirm whether that is intended.
parser.add_argument('--cacheDir', help='Cache directory, used if x, y or m are undefined (optional)', default=os.path.join(tempfile.gettempdir(), "CAMDA2014"))
parser.add_argument('-n','--numFolds', help='Number of folds in cross-validation', type=int, default=5)
parser.add_argument('-s','--subsetsize', help='Number of features to be selected', type=int, default=50)
# NOTE(review): default=True combined with store_true means this flag can never
# be false, so the summary file is always written.
parser.add_argument('--useOrigOut', help='', default=True, action="store_true")
parser.add_argument('--outfile', help='Output file for results', type=str, default="selected.txt")
parser.add_argument('-r', '--result', help='Output file for detailed results (optional)', default=None)
options = parser.parse_args()

# Resolve the feature, label and metadata files, honouring any explicitly given paths.
featureFilePath, labelFilePath, metaFilePath = getExperiment(
    experiment=options.experiment,
    experimentOptions=options.options,
    database=options.database,
    writer=options.writer,
    useCached=not options.noCache,
    featureFilePath=options.features,
    labelFilePath=options.labels,
    metaFilePath=options.meta)

model, perfs, selected, best_logrp, best_scount = greedyRLS(
    featureFilePath, labelFilePath, metaFilePath,
    options.numFolds, options.subsetsize, resultPath=options.result)

if options.useOrigOut:
    # Three lines: "<best_logrp> <best_scount>", then the performances, then the
    # selected features; every item on the last two lines is followed by a space.
    # The with-block closes the file even if a write fails.
    with open(options.outfile, 'w') as f:
        f.write(str(best_logrp) + " " + str(best_scount) + "\n")
        f.write("".join(str(x) + " " for x in perfs) + "\n")
        f.write("".join(str(x) + " " for x in selected) + "\n")