Example #1
def convertTree(config, treeName, category):
    """ Wrapper for the functionality of preprocessing.dataset  """
    logging.info("Starting conversion")

    checkNcreateFolder(config.outputFolder)

    datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[
        category].name
    dataset = Dataset(datasetName, config.outputFolder, treeName)

    logging.info("Setting sample selection: %s", config.sampleSelection)
    dataset.sampleSelection = config.sampleSelection
    logging.info("Setting category selection: %s",
                 config.categories[category].selection)
    dataset.selection = config.categories[category].selection

    if config.excludeBranches is not None:
        dataset.ignoreBranches = config.excludeBranches

    logging.info("Setting files")
    dataset.addFiles(config.files)

    logging.info("Setting output branches")
    dataset.setOutputBranches(config.outputVariables)

    logging.debug("Setting indexing branches: %s", config.indexVariables)
    dataset.outputIndex = config.indexVariables

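    # Optionally register the per-sample scale factor under the name "sampleRatio"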
    if config.addRatio:
        dataset.setSF(config.sampleSF, "sampleRatio")

    logging.info("Starting processing dataset")
    dataset.process(config.maxEvents)

    logging.info("Finished processing")
Example #2
def convertTree(inputs, outFolder, name, treeName, indexVars):
    logging.info("Starting conversion")

    checkNcreateFolder(outFolder)

    dataset = Dataset(name, outFolder, treeName)

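    # Each input is a text file listing one ROOT file per line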
    files = []
    for _input in inputs:
        with open(_input, "r") as f:
            data = f.read()
            for line in data.split("\n"):
                if ".root" in line:
                    files.append(line)

    logging.info("Setting files")
    dataset.addFiles(files)
    logging.info("Setting output branches")
    dataset.setOutputBranches("*")

    logging.debug("Setting indexing branches: %s", indexVars)
    dataset.outputIndex = indexVars

    logging.info("Starting processing dataset")
    dataset.process(999999999999999999999)
    logging.info("Finished processing")
Example #3
def createLookup(model, inputFile, outputFolder):
    checkNcreateFolder(outputFolder, onlyFolder=True)
    
    dfPrediction, _, _ = processData(model, inputFile)
    
    inputFileName = inputFile.split("/")[-1]
    inputFileName = inputFileName.split(".")[0]
    
    writeLookupTable(dfPrediction, outputFolder, inputFileName)
Example #4
def process(args, styleConfig):
    inputDFs = getDataframes([
        filename for filename in args.input if not filename.startswith("merge")
    ])
    vars2Process = generateVariableList(inputDFs[0], args.plotVars,
                                        args.excludeVars)
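    # Exactly one input prefixed with "merge" triggers building a merged dataset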
    mergeInputs = [filename for filename in args.input if filename.startswith("merge")]
    if len(mergeInputs) == 1:
        mergedName, mergedDF, weightedDFs = mergeDatasets(args, vars2Process)

        inputDFs = inputDFs + [mergedDF]

        if args.inputNames is not None:
            args.inputNames = args.inputNames + [mergedName]

    if args.inputNames is not None:
        assert len(args.inputNames) == len(inputDFs)

    if args.plotCorr:
        logging.info("Will do correlation plots")
        for iDF, inputDF in enumerate(inputDFs):
            logging.info("Processing file %s", inputDFs)
            outpath = args.output.split("/")
            if len(outpath) > 1:
                # In this case we want to insert a folder for the dataframe
                folders = outpath[0:-1]
                folders.append("DF-" + str(iDF) if args.inputNames is None
                               else args.inputNames[iDF])
                thisOutput = "/".join(folders + [outpath[-1]])
                thisOutput += "_corr2D_"
                checkNcreateFolder(thisOutput)
            else:
                thisOutput = args.output + "_corr2D_" + (
                    str(iDF) if args.inputNames is None else args.inputNames[iDF])

            getCorrealtions(styleConfig, inputDF, thisOutput, vars2Process,
                            args.transform)

    if args.plotDist:
        logging.info("Will plot distributions")
        thisOutput = args.output + "_dist_"
        checkNcreateFolder(thisOutput)
        getDistributions(styleConfig, inputDFs, thisOutput, vars2Process,
                         args.inputNames, args.normalized, args.transform,
                         args.lumi, args.CMSString, args.colors, args.catlabel,
                         args.yScale)
Example #5
def convertTreeMulti(config, treeName, category):
    logging.info("Starting conversion using multi method")
    checkNcreateFolder(config.outputFolder)

    # For multi mode, we generate a dataset per sample. In the loop the output is
    # disabled, and at the end the dataframes of samples [1:] are appended to the
    # first one and saved.

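    # eventsLeft is the remaining global event budget, shared across all samples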
    eventsLeft = config.maxEvents
    dfs = []
    baseDataset = None
    for iSample, sample in enumerate(config.samples):
        logging.info("Processing sample %s", sample)
        if iSample == 0:
            datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[
                category].name
        else:
            datasetName = config.outputPrefix + "_" + config.sampleInfo[
                sample].name + "_" + config.categories[category].name
        dataset = Dataset(datasetName, config.outputFolder, treeName)
        logging.info("Setting sample selection: %s",
                     config.sampleInfo[sample].selection)
        dataset.sampleSelection = config.sampleInfo[sample].selection
        logging.info("Setting category selection: %s",
                     config.categories[category].selection)
        dataset.selection = config.categories[category].selection

        if config.excludeBranches is not None:
            dataset.ignoreBranches = config.excludeBranches
        logging.info("Setting files")
        dataset.addFiles(config.sampleInfo[sample].files)

        logging.info("Setting output branches")

        dataset.setOutputBranches(config.outputVariables)

        logging.debug("Setting indexing branches: %s", config.indexVariables)
        dataset.outputIndex = config.indexVariables

        if config.addRatio:
            dataset.setSF(config.sampleInfo[sample].addSF, "sampleRatio")

        logging.info("Starting processing dataset")
        thisSampleDF = dataset.process(eventsLeft, skipOutput=True)
        eventsLeft -= len(thisSampleDF)
        dfs.append(thisSampleDF)
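        # Keep a copy of the first dataset to write the merged output at the end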
        if iSample == 0:
            baseDataset = copy(dataset)

    baseDataset.makeOutput(pd.concat(dfs))
    logging.info("Finished processing")
Example #6
def trainAutoencoder(config, useDevice, batch=False):
    logging.debug("Output folder")
    checkNcreateFolder(config.output, onlyFolder=True)
    logging.debug("Copying used config to outputfolder")
    shutil.copy2(config.path, config.output + "/usedConfig.cfg")

    logging.info("Initializing samples and data")
    allSample, data = initialize(config)
    logging.debug("Getting activations")
    thisEncoderActivation, thisDecoderActivation = buildActivations(config)
    logging.debug("Encoder: %s", thisEncoderActivation)
    logging.debug("Decoder: %s", thisDecoderActivation)

    logging.info("Initializing autoencoder")
    thisAutoencoder = Autoencoder(identifier=config.net.name,
                                  inputDim=config.net.inputDimention,
                                  encoderDim=config.encoder.dimention,
                                  hiddenLayerDim=[
                                      config.hiddenLayers[i].dimention
                                      for i in range(config.nHiddenLayers)
                                  ],
                                  weightDecay=config.net.useWeightDecay,
                                  robust=config.net.robustAutoencoder,
                                  encoderActivation=thisEncoderActivation,
                                  decoderActivation=thisDecoderActivation,
                                  loss=config.net.loss,
                                  metric=['mae', "msle", "acc"],
                                  batchSize=config.net.batchSize)

    logging.info("Setting optimizer")
    logging.debug("In config: %s", config.net.optimizer)
    thisAutoencoder.optimizer = config.net.optimizer

    logging.info("Building model")
    thisAutoencoder.buildModel()
    logging.info("Compiling model")
    thisAutoencoder.compileModel()

    trainData = data.getTrainData()

    testData = data.getTestData()

    trainWeights = data.trainTrainingWeights
    testWeights = data.testTrainingWeights

    logging.info("Fitting model")
    thisAutoencoder.network.summary()
    if not batch:
        input("Press ret")
    thisAutoencoder.trainModel(trainData,
                               trainWeights,
                               config.output,
                               epochs=config.net.trainEpochs,
                               valSplit=config.net.validationSplit,
                               thisDevice="",
                               earlyStopping=config.net.doEarlyStopping,
                               patience=config.net.StoppingPatience)

    logging.info("Evaluation....")
    predictedData = thisAutoencoder.evalModel(testData, testWeights,
                                              data.trainVariables,
                                              config.output, True, True)
    logging.info("Getting reco error for loss")
    reconstMetric, reconstErrTest = thisAutoencoder.getReconstructionErr(
        testData)
    reconstMetric, reconstErrTrain = thisAutoencoder.getReconstructionErr(
        trainData)
    make1DHistoPlot([reconstErrTest, reconstErrTrain],
                    None,
                    "{0}/{1}_{2}".format(config.output, "TrainingReconst",
                                         reconstMetric),
                    20, (0, 2),
                    "Loss function", ["Test Sample", "Training Sample"],
                    normalized=True)
    logging.info("Saving testData and weights")
    data2Pickle = {
        "variables": config.trainingVariables,
        "testInputData": testData,
        "testWeights": testWeights,
        "testPredictionData": predictedData
    }

    with open("{0}/testDataArrays.pkl".format(config.output),
              "wb") as pickleOut:
        pickle.dump(data2Pickle, pickleOut)

    thisAutoencoder.saveModel(config.output, data.transformations)
Example #7
def main(args):
    checkNcreateFolder(args.output, onlyFolder=True)
    evalData = {}
    for iData, pickledData in enumerate(args.input):
        with open(pickledData, "rb") as f:
            evalData[args.inputID[iData]] = pickle.load(f)

    assert evalData[args.inputID[0]]["variables"] == evalData[
        args.inputID[1]]["variables"]

    variables = evalData[args.inputID[0]]["variables"]
    datasets = evalData[args.inputID[0]]["datasets"]
    if args.skipDataset is not None:
        for ds in args.skipDataset:
            datasets.remove(ds)

    inputData = {}
    predictionData = {}
    for name in args.inputID:
        inputData[name] = {}
        predictionData[name] = {}
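        # Re-slice the flat per-dataset arrays into {variable: {dataset: array}} lookups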
        for iVar, var in enumerate(variables):
            logging.debug("Adding array for output %s, variable %s", name, var)
            dataList = [
                evalData[name]["inputData"][i][:, iVar]
                for i in range(len(evalData[name]["inputData"]))
            ]
            dataListPred = [
                evalData[name]["predictionData"][i][:, iVar]
                for i in range(len(evalData[name]["predictionData"]))
            ]
            inputData[name][var] = {}
            predictionData[name][var] = {}
            for iDataset, dataset in enumerate(datasets):
                inputData[name][var][dataset] = dataList[iDataset]
                logging.debug("Added input data for %s with len %s", dataset,
                              len(inputData[name][var][dataset]))
                predictionData[name][var][dataset] = dataListPred[iDataset]
                logging.debug("Added prediction data for %s with len %s",
                              dataset, len(inputData[name][var][dataset]))

    if args.transform:
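        # Load the saved transformation and apply it to inputs and predictions alike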
        transtransforms = {}
        for iName, name in enumerate(args.inputID):
            with open(args.useTransform, "r") as f:
                transtransforms[name] = json.load(f)
            for iVar, var in enumerate(variables):
                for iDataset, dataset in enumerate(datasets):
                    inputData[name][var][dataset] = transform(
                        "Gauss", inputData[name][var][dataset], [var],
                        transtransforms[name])
                    predictionData[name][var][dataset] = transform(
                        "Gauss", predictionData[name][var][dataset], [var],
                        transtransforms[name])

    logging.info("Starting plotting")
    for var in variables:
        logging.info("Plotting var %s", var)
        for dataset in datasets:
            dataSetCompInput = []
            dataSetCompPred = []
            dataSetCompAll = []
            legendInput = []
            legendPrediction = []
            legendAll = []
            for name in args.inputID:
                dataSetCompInput.append(inputData[name][var][dataset])
                dataSetCompPred.append(predictionData[name][var][dataset])
                dataSetCompAll.append(inputData[name][var][dataset])
                dataSetCompAll.append(predictionData[name][var][dataset])
                thisTextInput = "%s Input data (%s)" % (name, dataset)
                thisTextPrediction = "%s Predicted data (%s)" % (name, dataset)
                legendInput.append(thisTextInput)
                legendPrediction.append(thisTextPrediction)
                legendAll.append(thisTextInput)
                legendAll.append(thisTextPrediction)
            make1DHistoPlot(dataSetCompInput,
                            None,
                            "{0}/comp_input_{1}_{2}".format(
                                args.output, var, dataset),
                            nBins=40,
                            binRange=(-4, 4),
                            varAxisName=var,
                            legendEntries=legendInput,
                            normalized=True)
            make1DHistoPlot(dataSetCompPred,
                            None,
                            "{0}/comp_prediction_{1}_{2}".format(
                                args.output, var, dataset),
                            nBins=40,
                            binRange=(-4, 4),
                            varAxisName=var,
                            legendEntries=legendPrediction,
                            normalized=True)
            make1DHistoPlot(dataSetCompAll,
                            None,
                            "{0}/comp_all_{1}_{2}".format(
                                args.output, var, dataset),
                            nBins=40,
                            binRange=(-4, 4),
                            varAxisName=var,
                            legendEntries=legendAll,
                            normalized=True)
Example #8
def main(config):
    checkNcreateFolder(config.output)
    for dataset in config.runDatasets:
        logging.info("Processing dataset %s", dataset)
        finalDF = processDataset(config, dataset)
        createlookup_dnn.writeLookupTable(finalDF, config.output, dataset)
Example #9
            thisCatLabel = plotSettings["catLabel"]
        else:
            thisCatLabel = None
        if "yScale" in plotSettings.keys():
            thisYScale = plotSettings["yScale"]
        else:
            thisYScale = 1.25

        styleConfig = StyleConfig(plotSettings["style"])

        inputDFs = checkInputData.getDataframes(list(plotSettings["inputs"]))

        checkNcreateFolder(os.path.expanduser(plotSettings["output"]))

        checkInputData.getDistributions(
            styleConfig,
            inputDFs,
            os.path.expanduser(plotSettings["output"]),
            ["MEM"],
            plotSettings["Legend"],
            plotSettings["normalized"],
            False,
            plotSettings["lumi"],
            thisCMSString,
            theseColors,
            thisCatLabel,
            thisYScale,
        )
Example #10
def trainDNN(config, batch=False, addMetrics=["MEM"]):
    logging.debug("Output folder")
    checkNcreateFolder(config.output, onlyFolder=True)
    logging.debug("Copying used config to outputfolder")
    shutil.copy2(config.path, config.output + "/usedConfig.cfg")

    logging.info("Initializing samples and data")
    allSample, data = initialize(config, incGenWeights=config.includeGenWeight)

    logging.info("Initializing DNN")
    thisDNN = DNN(identifier=config.net.name,
                  inputDim=config.net.inputDimention,
                  layerDims=config.net.layerDimentions,
                  weightDecay=config.net.useWeightDecay,
                  weightDecayLambda=config.net.weightDecayLambda,
                  activation=config.net.activation,
                  outputActivation=config.net.outputActivation,
                  loss=config.net.loss,
                  metric=["acc"],
                  batchSize=config.net.batchSize)

    logging.info("Setting optimizer")
    logging.debug("In config: %s", config.net.optimizer)
    thisDNN.optimizer = config.net.optimizer

    logging.info("Building model")
    if config.net.loss == "binary_crossentropy":
        thisDNN.buildModel(nClasses=1,
                           dropoutAll=config.net.dropoutAll,
                           dropoutOutput=config.net.dropoutOutput,
                           dropoutPercent=config.net.dropoutPercent)
    else:
        thisDNN.buildModel(nClasses=len(data.outputClasses),
                           dropoutAll=config.net.dropoutAll,
                           dropoutOutput=config.net.dropoutOutput,
                           dropoutPercent=config.net.dropoutPercent)
    logging.info("Compiling model")
    thisDNN.compileModel()

    thisDNN.network.summary(print_fn=logging.warning)
    if not batch:
        input("Press ret")

    trainData = data.getTrainData()
    trainLabels = data.trainLabels
    trainWeights = data.trainTrainingWeights

    logging.info("Training DNN")
    thisDNN.trainModel(trainData,
                       trainLabels,
                       trainWeights,
                       config.output,
                       epochs=config.net.trainEpochs,
                       valSplit=config.net.validationSplit,
                       earlyStopping=config.net.doEarlyStopping,
                       patience=config.net.StoppingPatience)

    testData = data.getTestData()
    testLabels = data.testLabels
    testWeights = data.testTrainingWeights

    #TODO: Make this configurable
    ROCMetrics = []
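    # A single empty string means no additional ROC metrics were requested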
    if len(addMetrics) == 1 and addMetrics[0] == "":
        addMetrics = []
    for metric in addMetrics:
        ROCMetrics.append(
            (metric, data.getTestData(asMatrix=False)[metric].values))

    logging.info("Model evaluation")
    thisDNN.evalModel(testData,
                      testWeights,
                      testLabels,
                      trainData,
                      trainWeights,
                      trainLabels,
                      config.trainingVariables,
                      config.output,
                      data.outputClasses,
                      plotMetics=True,
                      saveData=True,
                      addROCMetrics=ROCMetrics,
                      forceColors=config.forceColors)

    logging.info("Saving model")
    thisDNN.saveModel(config.output, data.transformations)