Exemplo n.º 1
    def build_2d_hists(self, input_ntuple, out_dir=""):
        """
        Call profiling routines to build the 2d histogram file.

        One output file is written per call. It is named after the first
        ``twodim_cache<N>.root`` (N in 0..99) that does not exist yet in
        ``out_dir``. When ``out_dir`` is empty, the directory that holds
        ``self._requests_pickle`` is used instead.

        Args:
            input_ntuple: passed straight through to ``pro2d``.
            out_dir: directory for the cache file (optional).

        Returns:
            False when ``self._twodim_inputs`` is empty; otherwise None.

        Raises:
            AssertionError: if all 100 candidate file names are taken.
        """
        if not self._twodim_inputs:
            return False
        built_hists = {}
        input_list = []
        if not out_dir:
            out_dir = os.path.split(self._requests_pickle)[0]

        # Take the first cache name that is not on disk yet. Checking each
        # candidate directly (rather than globbing out_dir + "/*") also works
        # when out_dir is empty, i.e. the pickle lives in the current
        # directory; the glob pattern would then have listed "/".
        out_file = ""
        for x in range(100):
            f_name = "twodim_cache{}.root".format(x)
            candidate = os.path.join(out_dir, f_name)
            if not os.path.exists(candidate):
                out_file = candidate
                break
        assert out_file, "could not name output file"

        for id_tuple, inputs in self._twodim_inputs.items():
            # NOTE(review): input_list was passed to pro2d as `plots` but was
            # never filled in the code as found. Each `inputs` entry is a
            # pair of per-axis tuples whose first element is the hist name
            # (see the indexing below), so it is presumably the plot spec
            # pro2d expects -- confirm against pro2d.
            input_list.append(inputs)
            first_hist_name = inputs[1][0]
            second_hist_name = inputs[0][0]
            compound_hist_name = first_hist_name + "_vs_" + second_hist_name
            all_hists = [compound_hist_name + "_" + t for t in _tags]
            built_hists[id_tuple] = (out_file, all_hists)

        # Single-argument print with parentheses prints the same text under
        # Python 2's print statement.
        print("profiling 2d hists")
        reported_out, reported_hists = pro2d(
            input_ntuple, tree="SVTree", out_file=out_file, plots=input_list,
            tags=_tags, show_progress=True)

        with open(self._requests_pickle) as pkl:
            status_dict = cPickle.load(pkl)

        # NOTE(review): built_hists (id_tuple -> (out_file, hist names)) is
        # computed above but nothing visible stores it in status_dict before
        # the pickle is rewritten, and reported_out / reported_hists are
        # unused. The step that records the hists as built appears to be
        # missing here; the layout of status_dict is not visible, so it is
        # not reconstructed -- restore it from the upstream source.

        with open(self._requests_pickle, "w") as pkl:
            cPickle.dump(status_dict, pkl)
# Appears to build a weight file (when a name is given and the file is not on
# disk yet) and then a reduced dataset at rds_path. Raises IOError instead of
# overwriting an existing rds_path.
# NOTE(review): this definition has lost lines. The parameter list is never
# closed, several calls are missing their opening line, and some `if`
# statements have no body. Each gap is flagged where it occurs; restore the
# missing lines from the upstream source before relying on this function.
def make_flat_ntuple(
    weight_file = '', 
    jet_collection = 'BTag_AntiKt4TopoEMJetsReTagged', 
    jet_tagger = 'JetFitterCharm', 
    output_path = None, 
    rds_path = 'reduced_dataset.root', 
    observer_discriminators = _default_observers, 
    do_test = False, 
    skim_function = pyprep.make_flat_ntuple, 
    # NOTE(review): the closing `):` of the signature is missing, and
    # `input_files` / `pt_divisions` are used below without being declared
    # here -- presumably they were parameters too. `output_path` is not used
    # in the visible body.

    # The tagger directory name is '<jet_collection>_<jet_tagger>'.
    double_variables, int_variables = rds.get_allowed_rds_variables(
        input_files = input_files, 
        full_dir_name = '_'.join([jet_collection,jet_tagger]))

    # --- make weights if a name is given 
    if weight_file and not os.path.isfile(weight_file): 
        # build a light ntuple if one doesn't exist
        if os.path.isfile(rds_path): 
            small_rds_path = rds_path 
        # NOTE(review): an `else:` presumably belonged here; as shown, the
        # lines below sit at the wrong indentation for the `if` above.

            print 'making flat ntuple to build weight file'

            # '<name>.root' -> '<name>_small.root' in the same directory.
            rds_dir, rds_name = os.path.split(rds_path)
            small_rds = '.'.join(rds_name.split('.')[:-1]) + '_small.root'
            small_rds_path = os.path.join(rds_dir,small_rds)
            if not os.path.isfile(small_rds_path): 
                    # NOTE(review): the opening line of this call is missing
                    # (presumably `skim_function(`, which is otherwise unused
                    # in the visible body -- confirm).
                    input_files = input_files, 
                    jet_collection = jet_collection, 
                    jet_tagger = jet_tagger, 
                    output_file = small_rds_path)
        # 11 logarithmically spaced values from pt_low to pt_high.
        # NOTE(review): pt_bins is not used in the visible code of this
        # function; the 'JetPt' plot below has fixed binning arguments.
        pt_low, pt_high = (15.0, 250.0)
        log_span = log(pt_high) - log(pt_low)
        log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)]
        pt_bins = [exp(x) for x in log_range]

        from jetnet import cxxprofile
            # NOTE(review): the opening line of this call is missing
            # (presumably a function from cxxprofile -- confirm). It writes
            # to weight_file via out_file.
            in_file = small_rds_path, 
            tree = 'SVTree', 
            plots = [( ('JetPt', 30,15.0,200),
                       ('JetEta',10,-2.5,2.5) )], 
            tags = ['bottom','charm','light'], 
            out_file = weight_file, 
            show_progress = True)

    # --- rds part

    rds_dir, rds_file = os.path.split(rds_path)
    if rds_dir and not os.path.isdir(rds_dir): 
    # NOTE(review): the body of this `if` is missing (presumably it created
    # rds_dir -- confirm).

    # Never overwrite an existing reduced dataset.
    if os.path.isfile(rds_path): 
        raise IOError(
            "{} already exists, refusing to overwrite".format(rds_path) )
            # NOTE(review): the opening line of the call that builds the
            # reduced dataset is missing, and so is its closing parenthesis
            # after the last keyword argument.
            input_files = input_files, 
            weight_file = weight_file, 
            double_variables = double_variables, 
            int_variables = int_variables, 
            observer_discriminators = observer_discriminators, 
            pt_divisions = pt_divisions, 
            jet_collection = jet_collection, 
            jet_tagger = jet_tagger, 
            output_file = rds_path, 
            debug = do_test, 
# Driver for one train-and-test job. From the visible code it:
#   * reads the 'preprocessing', 'training' and 'testing' config sections,
#   * works out the working directory, weight file and reduced dataset paths,
#   * hands them to process.RDSProcess,
#   * copies the profile output (if any) and the config file into a
#     'summary' directory next to the working directory.
# NOTE(review): this definition has lost lines. The signature, a `warn(`
# call and a dict comprehension are never closed, several `if` / `with`
# statements have no body, and several calls are missing their opening line.
# Each gap is flagged where it occurs; restore the missing lines from the
# upstream source before relying on this function.
def train_and_test(input_files, 
                   working_dir = None, 
                   do_test = False, 
                   # NOTE(review): the end of the signature is missing.
                   # `config_file` and `config_file_name` are used below
                   # without being declared here.

    config = SafeConfigParser()
    # NOTE(review): no call that loads a file into `config` is visible
    # before its sections are read below.

    # --- setup preprocessing
    preproc = dict(config.items('preprocessing'))
    jet_collection = preproc['jet_collection']

    # Both options are whitespace-separated lists in the config file.
    pt_divisions = [float(x) for x in preproc['pt_divisions'].split() ]
    observer_discriminators = preproc['observer_discriminators'].split()

    # --- early load of post-training options  
    training_opts = dict(config.items('training'))
    testing_opts = dict(config.items('testing'))
    training_variables = training_opts['variables'].split()

    testing_dataset = None

    # 'testing_dataset' is optional; it falls back to rds_path further down.
    if 'testing_dataset' in testing_opts: 
        testing_dataset = testing_opts['testing_dataset']

    # --- change some things if this is an array job
    jet_tagger = preproc['jet_tagger']
    # An 'ARRAYID' placeholder in the tagger name is replaced by the
    # PBS_ARRAYID environment variable, left-padded with '0' to two
    # characters. The resulting name also becomes the working directory.
    if 'ARRAYID' in jet_tagger: 
        the_array_id = os.environ['PBS_ARRAYID'].rjust(2,'0')
        jet_tagger = jet_tagger.replace('ARRAYID',the_array_id)
        working_dir = jet_tagger
        if testing_dataset: 
            testing_dataset = os.path.join(working_dir,testing_dataset)

    if testing_dataset and not os.path.isfile(testing_dataset): 
        raise IOError('{} not found'.format(testing_dataset))

    flavor_weights = {}
    # Legacy layout: options in a [weights] section are copied into
    # [training] as '<name>_wt'.
    if config.has_section('weights'): 
        warn('moving [weights] contents into [training] section', 
        # NOTE(review): the rest of the warn(...) call is missing.
        flavor_weights = dict( config.items('weights') )
        for wt_name, wt in flavor_weights.items(): 
            config.set('training', wt_name + '_wt', wt)
        with open(config_file_name,'w') as new_cfg: 
        # NOTE(review): the body of this `with` is missing (presumably the
        # updated config was written to new_cfg -- confirm).

    # Weights are read from the '<flavor>_wt' options of [training] and
    # converted to float.
    flavors = ['bottom','charm','light']
    flavor_weights = { 
        f : config.get('training', f + '_wt') for f in flavors
    # NOTE(review): the closing brace of the comprehension above is missing.
    for f in flavor_weights: 
        flavor_weights[f] = float(flavor_weights[f])

    # --- setup the working directory 
    if not working_dir: 
        working_dir = jet_collection
    if not os.path.isdir(working_dir): 
    # NOTE(review): the body of this `if` is missing (presumably it created
    # working_dir -- confirm).

    # --- hold here if someone else is working 
    # NOTE(review): no code follows the comment above. set_hold() is called
    # with value = False further down, so a matching call presumably
    # belonged here -- confirm.

    # --- rds part
    rds_name = 'reduced_dataset.root'
    # get weights file 
    rds_dir = os.path.join(working_dir, 'reduced')
    if not os.path.isdir(rds_dir): 
    # NOTE(review): the body of this `if` is missing (presumably it created
    # rds_dir -- confirm).

    rds_path = os.path.join(rds_dir, rds_name )
    # Without an explicit testing dataset, test on the reduced dataset.
    if not testing_dataset: 
        testing_dataset = rds_path

    weight_file = os.path.join(rds_dir, 'weights.root')
    if not os.path.isfile(weight_file): 
        # build a light ntuple if one doesn't exist
        if os.path.isfile(rds_path): 
            small_rds_path = rds_path 
        # NOTE(review): an `else:` presumably belonged here; as shown, the
        # lines below sit at the wrong indentation for the `if` above.

            print '--- making flat ntuple to build weight file ---'
            small_rds = 'small_rds.root'
            small_rds_path = os.path.join(rds_dir,small_rds)
            if not os.path.isfile(small_rds_path): 
                    # NOTE(review): the opening line of this call is missing.
                    input_files = input_files, 
                    jet_collection = jet_collection, 
                    jet_tagger = jet_tagger, 
                    output_file = small_rds_path)
        # 11 logarithmically spaced values from pt_low to pt_high, used as
        # the 'JetPt' binning argument below.
        pt_low, pt_high = (15.0, 300)
        log_span = log(pt_high) - log(pt_low)
        log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)]
        pt_bins = [exp(x) for x in log_range]

        print '--- making weight file ---'
        from jetnet import cxxprofile
            # NOTE(review): the opening line of this call is missing
            # (presumably a function from cxxprofile -- confirm). It writes
            # to weight_file via out_file.
            in_file = small_rds_path, 
            tree = 'SVTree', 
            plots = [( ('JetPt', pt_bins),
                       ('JetEta',10,-2.5,2.5) )], 
            tags = ['bottom','charm','light'], 
            out_file = weight_file, 
            show_progress = True)

    # The tagger directory name is '<jet_collection>_<jet_tagger>'.
    double_variables, int_variables = rds.get_allowed_rds_variables(
        input_files = input_files, 
        full_dir_name = jet_collection + '_' + jet_tagger)

    # The reduced dataset is only built when it is not on disk already.
    if not os.path.isfile(rds_path): 
        print '--- making flattened dataset for training ---'
        flags = 'hr' if not do_test else 'd'
            # NOTE(review): the opening line of the call that builds the
            # reduced dataset is missing, and so is its closing parenthesis
            # after the last keyword argument.
            input_files = input_files, 
            weight_file = weight_file, 
            double_variables = double_variables, 
            int_variables = int_variables, 
            observer_discriminators = observer_discriminators, 
            pt_divisions = pt_divisions, 
            jet_collection = jet_collection, 
            jet_tagger = jet_tagger, 
            output_file = rds_path, 
            flags = flags, 

    # --- unset other job hold 
    set_hold(working_dir, value = False)

    proc = process.RDSProcess(
        reduced_dataset = rds_path, 
        working_dir = working_dir, 
        training_variables = training_variables, 
        flavor_weights = flavor_weights, 
        testing_dataset = testing_dataset, 
        do_test = do_test, 
        config_file = config_file)
    # NOTE(review): nothing visible starts or waits for `proc` before this
    # non-blocking get -- confirm whether lines are missing here.
    proc_outputs = proc.out_queue.get(block = False)

    # --- make the summary folder 

    # 'summary' is placed next to working_dir: in its parent directory, or
    # in the current directory when working_dir has no parent component.
    working_dir_list = working_dir.split('/')[:-1]
    if not working_dir_list: 
        summary_dir = 'summary'
    # NOTE(review): an `else:` presumably belonged here; as shown, the two
    # lines below also run (and fail on an empty list) in the no-parent case.
        working_dir_parent = os.path.join(*working_dir_list)
        summary_dir = os.path.join(working_dir_parent,'summary')

    if not os.path.isdir(summary_dir): 
    # NOTE(review): the body of this `if` is missing (presumably it created
    # summary_dir -- confirm).

    # Summary files are named after the config file; array jobs get a
    # '_subjob<PBS_ARRAYID>' suffix.
    summary_base_name, cfg_ext = os.path.splitext(config_file)
    if 'PBS_ARRAYID' in os.environ: 
        summary_base_name += '_subjob{}'.format(os.environ['PBS_ARRAYID'])

    if 'profile' in proc_outputs: 
        profile_summary_name = summary_base_name + '_profile.root'
        profile_summary_path = os.path.join(summary_dir,profile_summary_name)
        shutil.copyfile(proc_outputs['profile'], profile_summary_path)

    # Keep a copy of the config that produced these outputs.
    this_config_name = summary_base_name + cfg_ext
    this_config_path = os.path.join(summary_dir, this_config_name)
    shutil.copyfile(config_file, this_config_path)