Example #1
import os
# The project modules used below (result, batch, cache) and the makeDir helper
# are assumed to be importable from the surrounding codebase.
def process(database, inputMetaPath, resultBaseDir, cutoff=50, verbose=3, parallel=1,
            preDispatch='2*n_jobs', randomize=False, limit=1, debug=False,
            dummy=False, rerun=None, hideFinished=False, slurm=False):
    meta = result.getMeta(inputMetaPath)
    
    connection = batch.getConnection(slurm, debug)
    
    makeDir(resultBaseDir)
    cacheDir = makeDir(os.path.join(resultBaseDir, "cache"))
    resultDir = makeDir(os.path.join(resultBaseDir, "results"))
    jobDir = makeDir(os.path.join(resultBaseDir, "jobs"))

    #cachedMetaPath = os.path.join(cacheDir, "base.json")
    
    baseXPath, baseYPath, baseMetaPath = cache.getExperiment(
         experiment=meta["experiment"]["name"], experimentOptions=meta["experiment"]["options"], 
         database=database, writer="writeNumpyText", useCached=True, cacheDir=cacheDir)

    features = meta["features"]
    count = 0
    featureSet = []
    cls = meta["results"]["best"]
    paramSets = [x["params"] for x in meta["results"]["all"]]
    classifierArgs = {}
    for paramSet in paramSets:
        for key in paramSet:
            if key not in classifierArgs:
                classifierArgs[key] = []
            classifierArgs[key].append(paramSet[key])
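    # Map classifier aliases found in the metadata to the module-qualified names
    # expected by the curvePoint.py command built below.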
    classifierNameMap = {
        "LinearSVC":"svm.LinearSVC",
        "svm.LinearSVC":"svm.LinearSVC",
        "ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "RLScore":"RLScore",
        "RFEWrapper":"svm.LinearSVC"
    }
    classifierName = classifierNameMap[cls["classifier"]]
    #classifier, classifierArgs = learn.getClassifier(classifierName, params)
    submitCount = 0
    sleepTime = 15
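    # Add one feature at a time to the growing feature set and submit a
    # separate curvePoint.py job for each subset of more than one feature.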
    for featureName in features:
        feature = features[featureName]
        batch.waitForJobs(limit, submitCount, connection, sleepTime)
        print("Processing feature", featureName)
        print(feature)
        featureSet.append(feature["id"])
        jobName = "_".join([meta["experiment"]["name"], meta["template"]["project"], classifierName, "feature-" + str(feature["rank"])])
        pointResultPath = os.path.join(resultDir, jobName + ".json")
        print("Feature set", featureSet)
        if len(featureSet) > 1:
#             hiddenResults = curvePoint(baseXPath, baseYPath, baseMetaPath, featureSet, pointResultPath, 
#                        classifier=classifier, classifierArgs=params, getCV=eval(cls["cv"]),
#                        numFolds=cls["folds"], verbose=verbose, parallel=parallel,
#                        preDispatch=preDispatch, randomize=randomize, metric=cls["metric"])[3]
            #results.append(hiddenResults)
            command = "python curvePoint.py"
            command +=  " -X " + baseXPath
            command +=  " -y " + baseYPath
            command +=  " -m " + inputMetaPath
            command +=  " -o " + pointResultPath
            command +=  " --cutoff " + str(count)
            command +=  " --classifier " + classifierName
            command +=  " --classifierArgs \"" + str(classifierArgs) + "\"" 
            command +=  " --iteratorCV " + cls["cv"]
            command +=  " --numFolds " + str(cls["folds"])
            command +=  " --verbose " + str(verbose)
            command +=  " --parallel " + str(parallel)
            command +=  " --preDispatch \"" + str(preDispatch) + "\""
            if randomize: 
                command +=  " --randomize "
            command +=  " --metric " + cls["metric"]
            
            if batch.submitJob(command, connection, jobDir, jobName, dummy, rerun, hideFinished):
                submitCount += 1
        count += 1
        if count > cutoff:
            break
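
For context, a minimal sketch of how this function might be driven; every path and option value below is a hypothetical placeholder rather than part of the original project.

# Hypothetical invocation of process(); all paths and values are placeholders.
if __name__ == "__main__":
    process(database="/path/to/experiment.sqlite",
            inputMetaPath="/path/to/analysis-meta.json",
            resultBaseDir="/path/to/curve-output",
            cutoff=50,    # stop after this many features have been processed
            limit=10,     # passed to batch.waitForJobs, presumably the queued-job limit
            slurm=True,   # passed to batch.getConnection to submit through SLURM
            dummy=False)  # forwarded to batch.submitJob (likely a dry-run switch)
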
Example #2
 # argparse, os, tempfile, exampleOptions, getExperiment and greedyRLS are assumed
 # to be imported/defined by the surrounding project code.
 parser = argparse.ArgumentParser(parents=[exampleOptions], description='Feature selection with Greedy RLS')
 parser.add_argument('-x','--features', help='Input file for feature vectors (X)', default=None)
 parser.add_argument('-y','--labels', help='Input file for class labels (Y)', default=None)
 parser.add_argument('-m','--meta', help='Metadata input file name (optional)', default=None)
 parser.add_argument('--noCache', help='Do not use cache', default=False, action="store_true")
 parser.add_argument('--cacheDir', help='Cache directory, used if x, y or m are undefined (optional)', default=os.path.join(tempfile.gettempdir(), "CAMDA2014"))
 parser.add_argument('-n','--numFolds', help='Number of folds in cross-validation', type=int, default=5)
 parser.add_argument('-s','--subsetsize', help='Number of features to be selected', type=int, default=50)
 parser.add_argument('--useOrigOut', help='Write results to --outfile in the original plain-text format', default=True, action="store_true")
 parser.add_argument('--outfile', help='Output file for results', type=str, default="selected.txt")
 #parser.add_argument('--outaccuracies', help='Output file for accuracies on each CV round', type=str, default="accuracies.txt")
 parser.add_argument('-r', '--result', help='Output file for detailed results (optional)', default=None)
 options = parser.parse_args()
 
 featureFilePath, labelFilePath, metaFilePath = getExperiment(experiment=options.experiment, experimentOptions=options.options, 
                                                              database=options.database, writer=options.writer, 
                                                              useCached=not options.noCache, featureFilePath=options.features, 
                                                              labelFilePath=options.labels, metaFilePath=options.meta)
 
 #X = np.loadtxt(options.features)
 #Y = np.loadtxt(options.labels)
 #f = open('X')
 #X = cPickle.load(f)
 #f.close()
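 # Run greedy RLS feature selection; returns the trained model, per-round
 # performances, the selected feature indices and the best parameter combination.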
 model, perfs, selected, best_logrp, best_scount = greedyRLS(featureFilePath, labelFilePath, metaFilePath, options.numFolds, options.subsetsize, resultPath=options.result)
 if options.useOrigOut:
     with open(options.outfile, 'w') as f:
         f.write(str(best_logrp) + " " + str(best_scount) + "\n")
         f.write("".join(str(x) + " " for x in perfs) + "\n")
         f.write("".join(str(x) + " " for x in selected) + "\n")
     #np.savetxt(options.outfeatures, selected, fmt="%d")
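
For reference, an illustrative reader for the selected.txt file written above; the helper name is made up for this sketch, and the parsing simply mirrors the three write() calls.

# Hypothetical helper (not part of the original code) that reads back --outfile.
def readSelected(path="selected.txt"):
    with open(path) as f:
        lines = f.read().splitlines()
    best_logrp, best_scount = (float(x) for x in lines[0].split())  # best parameter pair
    perfs = [float(x) for x in lines[1].split()]                    # per-round performances
    selected = [int(float(x)) for x in lines[2].split()]            # selected feature indices
    return best_logrp, best_scount, perfs, selected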