def build_rds(working_dir, reduced_dir, reduced_dataset, input_files,
              jet_collection, check_file, do_test):
    """
    Build the reduced dataset and its profile file, then touch check_file.

    Creates working_dir and reduced_dir if missing, skims input_files into
    reduced_dataset (skipped if it already exists), builds 'profiled.root'
    alongside it, and finally creates an empty check_file as a done-marker
    for other jobs to poll.

    NOTE(review): relies on a module-level 'observer_discriminators' that is
    not a parameter -- confirm it is defined at module scope.
    """
    if not os.path.isdir(working_dir):
        os.mkdir(working_dir)
    # --- rds part
    if not os.path.isdir(reduced_dir):
        os.mkdir(reduced_dir)
    if not os.path.isfile(reduced_dataset):
        # ask the inputs which variables are actually present before skimming
        double_variables, int_variables = rds.get_allowed_rds_variables(
            input_files = input_files, jet_collection = jet_collection)
        pyprep.prep_ntuple(
            input_files = input_files,
            double_variables = double_variables,
            int_variables = int_variables,
            observer_discriminators = observer_discriminators,
            jet_collection = jet_collection,
            output_file = reduced_dataset,
            debug = do_test)
    profile_file = os.path.join(reduced_dir, 'profiled.root')
    if not os.path.isfile(profile_file):
        profile.make_profile_file(reduced_dataset, profile_file)
    # mark as done -- context manager guarantees the handle is closed
    # even if the interpreter is not CPython (was open(...).close())
    with open(check_file, 'w'):
        pass
def train_and_test(input_files, config_file, working_dir = None, do_test = False, ): config = SafeConfigParser() config.read(config_file) # --- setup preprocessing preproc = dict(config.items('preprocessing')) jet_collection = preproc['jet_collection'] pt_divisions = [float(x) for x in preproc['pt_divisions'].split() ] observer_discriminators = preproc['observer_discriminators'].split() # --- early load of post-training options training_opts = dict(config.items('training')) testing_opts = dict(config.items('testing')) training_variables = training_opts['variables'].split() testing_dataset = None if 'testing_dataset' in testing_opts: testing_dataset = testing_opts['testing_dataset'] # --- change some things if this is an array job jet_tagger = preproc['jet_tagger'] if 'ARRAYID' in jet_tagger: the_array_id = os.environ['PBS_ARRAYID'].rjust(2,'0') jet_tagger = jet_tagger.replace('ARRAYID',the_array_id) working_dir = jet_tagger if testing_dataset: testing_dataset = os.path.join(working_dir,testing_dataset) if testing_dataset and not os.path.isfile(testing_dataset): raise IOError('{} not found'.format(testing_dataset)) flavor_weights = {} if config.has_section('weights'): warn('moving [weights] contents into [training] section', FutureWarning) flavor_weights = dict( config.items('weights') ) for wt_name, wt in flavor_weights.items(): config.set('training', wt_name + '_wt', wt) config.remove_section('weights') with open(config_file_name,'w') as new_cfg: config.write(new_cfg) flavors = ['bottom','charm','light'] flavor_weights = { f : config.get('training', f + '_wt') for f in flavors } for f in flavor_weights: flavor_weights[f] = float(flavor_weights[f]) # --- setup the working directory if not working_dir: working_dir = jet_collection if not os.path.isdir(working_dir): os.mkdir(working_dir) # --- hold here if someone else is working hold_job(working_dir) set_hold(working_dir) # --- rds part rds_name = 'reduced_dataset.root' # get weights file rds_dir = 
os.path.join(working_dir, 'reduced') if not os.path.isdir(rds_dir): os.mkdir(rds_dir) rds_path = os.path.join(rds_dir, rds_name ) if not testing_dataset: testing_dataset = rds_path weight_file = os.path.join(rds_dir, 'weights.root') if not os.path.isfile(weight_file): # build a light ntuple if one doesn't exist if os.path.isfile(rds_path): small_rds_path = rds_path else: print '--- making flat ntuple to build weight file ---' small_rds = 'small_rds.root' small_rds_path = os.path.join(rds_dir,small_rds) if not os.path.isfile(small_rds_path): pyprep.make_flat_ntuple( input_files = input_files, jet_collection = jet_collection, jet_tagger = jet_tagger, output_file = small_rds_path) pt_low, pt_high = (15.0, 300) log_span = log(pt_high) - log(pt_low) log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)] pt_bins = [exp(x) for x in log_range] print '--- making weight file ---' from jetnet import cxxprofile cxxprofile.pro2d( in_file = small_rds_path, tree = 'SVTree', plots = [( ('JetPt', pt_bins), ('JetEta',10,-2.5,2.5) )], tags = ['bottom','charm','light'], out_file = weight_file, show_progress = True) double_variables, int_variables = rds.get_allowed_rds_variables( input_files = input_files, full_dir_name = jet_collection + '_' + jet_tagger) if not os.path.isfile(rds_path): print '--- making flattened dataset for training ---' flags = 'hr' if not do_test else 'd' pyprep.make_flat_ntuple( input_files = input_files, weight_file = weight_file, double_variables = double_variables, int_variables = int_variables, observer_discriminators = observer_discriminators, pt_divisions = pt_divisions, jet_collection = jet_collection, jet_tagger = jet_tagger, output_file = rds_path, flags = flags, ) # --- unset other job hold set_hold(working_dir, value = False) proc = process.RDSProcess( reduced_dataset = rds_path, working_dir = working_dir, training_variables = training_variables, flavor_weights = flavor_weights, testing_dataset = testing_dataset, do_test = do_test, 
config_file = config_file) proc.start() proc.join() proc_outputs = proc.out_queue.get(block = False) # --- make the summary folder working_dir_list = working_dir.split('/')[:-1] if not working_dir_list: summary_dir = 'summary' else: working_dir_parent = os.path.join(*working_dir_list) summary_dir = os.path.join(working_dir_parent,'summary') if not os.path.isdir(summary_dir): os.mkdir(summary_dir) summary_base_name, cfg_ext = os.path.splitext(config_file) if 'PBS_ARRAYID' in os.environ: summary_base_name += '_subjob{}'.format(os.environ['PBS_ARRAYID']) if 'profile' in proc_outputs: profile_summary_name = summary_base_name + '_profile.root' profile_summary_path = os.path.join(summary_dir,profile_summary_name) shutil.copyfile(proc_outputs['profile'], profile_summary_path) this_config_name = summary_base_name + cfg_ext this_config_path = os.path.join(summary_dir, this_config_name) shutil.copyfile(config_file, this_config_path)
def make_flat_ntuple( input_files, pt_divisions, weight_file = '', jet_collection = 'BTag_AntiKt4TopoEMJetsReTagged', jet_tagger = 'JetFitterCharm', output_path = None, rds_path = 'reduced_dataset.root', observer_discriminators = _default_observers, do_test = False, skim_function = pyprep.make_flat_ntuple, ): double_variables, int_variables = rds.get_allowed_rds_variables( input_files = input_files, full_dir_name = '_'.join([jet_collection,jet_tagger])) # --- make weights if a name is given if weight_file and not os.path.isfile(weight_file): # build a light ntuple if one doesn't exist if os.path.isfile(rds_path): small_rds_path = rds_path else: print 'making flat ntuple to build weight file' rds_dir, rds_name = os.path.split(rds_path) small_rds = '.'.join(rds_name.split('.')[:-1]) + '_small.root' small_rds_path = os.path.join(rds_dir,small_rds) if not os.path.isfile(small_rds_path): pyprep.make_flat_ntuple( input_files = input_files, jet_collection = jet_collection, jet_tagger = jet_tagger, output_file = small_rds_path) pt_low, pt_high = (15.0, 250.0) log_span = log(pt_high) - log(pt_low) log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)] pt_bins = [exp(x) for x in log_range] from jetnet import cxxprofile cxxprofile.pro2d( in_file = small_rds_path, tree = 'SVTree', plots = [( ('JetPt', 30,15.0,200), ('JetEta',10,-2.5,2.5) )], tags = ['bottom','charm','light'], out_file = weight_file, show_progress = True) # --- rds part rds_dir, rds_file = os.path.split(rds_path) if rds_dir and not os.path.isdir(rds_dir): os.mkdir(rds_dir) if os.path.isfile(rds_path): raise IOError( "{} already exists, refusing to overwrite".format(rds_path) ) else: skim_function( input_files = input_files, weight_file = weight_file, double_variables = double_variables, int_variables = int_variables, observer_discriminators = observer_discriminators, pt_divisions = pt_divisions, jet_collection = jet_collection, jet_tagger = jet_tagger, output_file = rds_path, debug = do_test, )
def run_full_chain_by_pt(
    input_files,
    working_dir = None,
    output_path = None,
    rds_dir = 'reduced_pt',
    jet_collection = 'AntiKt4TopoEMJets',
    do_test = False,
    training_variables = training_variable_whitelist,
    pt_divisions = default_pt_divisions,
    flavor_weights = None,
    cram = False,
    sequential = False):
    """
    Build per-pt-bin reduced datasets and train one subprocess per bin.

    Returns 0 on completion. With sequential=True each subjob is joined
    before the next starts; cram=True allows more subjobs than cpus.

    BUGFIXES: (1) flavor_weights previously defaulted to a shared mutable
    dict; it now defaults to None. (2) rstrip('.root') / lstrip('reduced_')
    strip *character sets*, not a suffix/prefix -- e.g. a category ending
    in 't' or starting with 'c' was silently mangled; replaced with
    explicit endswith/startswith slicing.
    """
    if flavor_weights is None:
        flavor_weights = {}
    if working_dir is None:
        working_dir = jet_collection
    if not os.path.isdir(working_dir):
        os.mkdir(working_dir)

    # --- rds part
    reduced_dir = os.path.join(working_dir, rds_dir)
    if not os.path.isdir(reduced_dir):
        os.mkdir(reduced_dir)
    reduced_datasets = glob.glob('%s/reduced_*' % reduced_dir)
    if len(reduced_datasets) == 0:
        double_variables, int_variables = rds.get_allowed_rds_variables(
            input_files = input_files,
            jet_collection = jet_collection)
        pyprep.make_ntuples_ptcat(
            input_files = input_files,
            double_variables = double_variables,
            int_variables = int_variables,
            observer_discriminators = observer_discriminators,
            pt_divisions = [float(pt) for pt in pt_divisions],
            jet_collection = jet_collection,
            output_dir = reduced_dir,
            debug = do_test)
        reduced_datasets = glob.glob('%s/reduced_*' % reduced_dir)

    n_processors = multiprocessing.cpu_count()
    # -- allow one less cpu than process,
    #    the low bin doesn't run anyway
    if n_processors < len(reduced_datasets) - 1:
        print ('WARNING: not enough processors for these subjobs '
               'want %i, found %i' % (len(reduced_datasets), n_processors))
        if not cram and not sequential:
            sys.exit('quitting...')

    subprocesses = []
    for ds in reduced_datasets:
        # extract the pt category from 'reduced_<category>.root'
        # (was rstrip/lstrip, which strip character sets, not affixes)
        rds_basename = os.path.basename(ds)
        if rds_basename.endswith('.root'):
            rds_basename = rds_basename[:-len('.root')]
        category = rds_basename
        if category.startswith('reduced_'):
            category = category[len('reduced_'):]
        working_subdir = os.path.join(working_dir, 'pt_' + category)
        if not os.path.isdir(working_subdir):
            os.mkdir(working_subdir)
        proc = process.RDSProcess(
            reduced_dataset = ds,
            working_dir = working_subdir,
            training_variables = training_variables,
            flavor_weights = flavor_weights,
            do_test = do_test)
        proc.start()
        subprocesses.append(proc)
        if sequential:
            proc.join()
    for proc in subprocesses:
        proc.join()
    return 0