def merge_files_by_process(root_files):
    """Group ROOT files by physics process and merge each group into one file.

    For every summation category (QCD_Electron, SingleTop, WPlusJets,
    VPlusJets, DiBoson, Signal) the input files whose process name belongs to
    that category are collected and merged into a single output file.  The
    output path is derived from the first input file seen: the process name is
    replaced by a placeholder token and the path is moved from the module-level
    ``input_folder`` to ``output_folder``.

    @param root_files: iterable of input ROOT file paths
    """
    global input_folder, output_folder
    electron_qcd_samples = [
        'QCD_Pt-20to30_BCtoE',
        'QCD_Pt-30to80_BCtoE',
        'QCD_Pt-80to170_BCtoE',
        'QCD_Pt-20to30_EMEnriched',
        'QCD_Pt-30to80_EMEnriched',
        'QCD_Pt-80to170_EMEnriched',
        'GJets_HT-40To100',
        'GJets_HT-100To200',
        'GJets_HT-200']
    singleTop_samples = [
        'T_tW-channel',
        'T_t-channel',
        'T_s-channel',
        'Tbar_tW-channel',
        'Tbar_t-channel',
        'Tbar_s-channel']
    wplusjets_samples = ['W1Jet', 'W2Jets', 'W3Jets', 'W4Jets']
    # BUG FIX: the original aliased vplusjets_samples to wplusjets_samples and
    # then appended 'DYJetsToLL', which mutated BOTH lists and made the
    # 'WPlusJets' summation wrongly include DYJetsToLL.  Concatenation builds
    # an independent list instead.
    vplusjets_samples = wplusjets_samples + ['DYJetsToLL']
    diboson_samples = ['WWtoAnything', 'WZtoAnything', 'ZZtoAnything']
    signal_samples = ['TTJet', 'SingleTop']

    summations = {
        'QCD_Electron': electron_qcd_samples,
        'SingleTop': singleTop_samples,
        'WPlusJets': wplusjets_samples,
        'VPlusJets': vplusjets_samples,
        'DiBoson': diboson_samples,
        'Signal': signal_samples,
    }

    summation_files = {}
    file_template = ''
    template_token = '<temp>'
    for summation, samples in summations.iteritems():
        summation_files[summation] = []
        for file_in_path in root_files:
            process_name = get_process_from_file(file_in_path)
            # Derive the output-path template once, from the first file seen:
            # swap the process name for the placeholder and relocate the path
            # from input_folder to output_folder.
            if not file_template:
                file_template = file_in_path.replace(process_name, template_token)
                file_template = file_template.replace(input_folder, output_folder)
            if process_name in samples:
                summation_files[summation].append(file_in_path)

    for summation, files in summation_files.iteritems():
        output_file = file_template.replace(template_token, summation)
        merge_ROOT_files(files, output_file)
def merge_files_by_process(root_files):
    """Merge ROOT files belonging to the same physics process.

    Builds one merged output file per summation category (QCD_Electron,
    SingleTop, WPlusJets, VPlusJets, DiBoson, Signal).  The output file name
    is obtained from the first input file by substituting the process name
    with the category name and relocating the path from the module-level
    ``input_folder`` to ``output_folder``.

    @param root_files: iterable of input ROOT file paths
    """
    global input_folder, output_folder
    electron_qcd_samples = [
        'QCD_Pt-20to30_BCtoE',
        'QCD_Pt-30to80_BCtoE',
        'QCD_Pt-80to170_BCtoE',
        'QCD_Pt-20to30_EMEnriched',
        'QCD_Pt-30to80_EMEnriched',
        'QCD_Pt-80to170_EMEnriched',
        'GJets_HT-40To100',
        'GJets_HT-100To200',
        'GJets_HT-200',
    ]
    singleTop_samples = [
        'T_tW-channel',
        'T_t-channel',
        'T_s-channel',
        'Tbar_tW-channel',
        'Tbar_t-channel',
        'Tbar_s-channel',
    ]
    wplusjets_samples = ['W1Jet', 'W2Jets', 'W3Jets', 'W4Jets']
    # BUG FIX: ``vplusjets_samples = wplusjets_samples`` only copied the
    # reference, so the following append('DYJetsToLL') also modified
    # wplusjets_samples and polluted the 'WPlusJets' summation with
    # DYJetsToLL.  Build a separate list via concatenation.
    vplusjets_samples = wplusjets_samples + ['DYJetsToLL']
    diboson_samples = ['WWtoAnything', 'WZtoAnything', 'ZZtoAnything']
    signal_samples = ['TTJet', 'SingleTop']

    summations = {
        'QCD_Electron': electron_qcd_samples,
        'SingleTop': singleTop_samples,
        'WPlusJets': wplusjets_samples,
        'VPlusJets': vplusjets_samples,
        'DiBoson': diboson_samples,
        'Signal': signal_samples,
    }

    summation_files = {}
    file_template = ''
    template_token = '<temp>'
    for summation, samples in summations.iteritems():
        summation_files[summation] = []
        for file_in_path in root_files:
            process_name = get_process_from_file(file_in_path)
            # The output template is derived once from the first file seen:
            # process name -> placeholder token, input folder -> output folder.
            if not file_template:
                file_template = file_in_path.replace(process_name, template_token)
                file_template = file_template.replace(input_folder, output_folder)
            if process_name in samples:
                summation_files[summation].append(file_in_path)

    for summation, files in summation_files.iteritems():
        output_file = file_template.replace(template_token, summation)
        merge_ROOT_files(files, output_file)
# NOTE(review): this chunk is a mid-function fragment — the leading
# 'continue' belongs to an 'if' guard whose header lies outside this view,
# and the chunk ends on a truncated 'if os.path.exists(...)' with no body.
# Indentation below is a best-effort reconstruction from the collapsed
# source line — confirm against the original file.
continue
print "Merging"
# Unfolding samples use dedicated input/output templates; all other
# samples go through the central template.
if 'unfolding' in sample:
    # print 'unfolding in sample'
    output_file = measurement_config.unfolding_output_general_template % sample
    input_files = [measurement_config.unfolding_input_templates[sample] % input_sample for input_sample in input_samples]
else:
    #if any (generator_systematic in sample for generator_systematic in measurement_config.generator_systematics):
    # print 'generator systematic in sample'
    output_file = measurement_config.central_general_template % sample
    input_files = [measurement_config.central_general_template % input_sample for input_sample in input_samples]
print output_file
for input_file in input_files:
    print input_file
# Merging is expensive — only do it when the target file is absent.
if not os.path.exists(output_file):
    merge_ROOT_files(input_files, output_file, compression = 7)
    new_files.append(output_file)
print '='*120
#merge all other histogram files
for category in measurement_config.categories_and_prefixes.keys():
    for sample, input_samples in sample_summations.iteritems():
        # NOTE(review): the '# continue' here appears to be a deliberately
        # disabled whitelist guard; original indentation of the following
        # statements is uncertain — verify against the source file.
        if not sample in ['VJets', 'QCD_Muon', 'SingleTop']:
            # continue
            print "Merging"
            output_file = measurement_config.general_category_templates[category] % sample
            print output_file
            input_files = [measurement_config.general_category_templates[category] % input_sample for input_sample in input_samples]
            for input_file in input_files:
                print input_file
            # Fragment is truncated here: the body of this 'if' continues
            # beyond the visible chunk.
            if not os.path.exists(output_file):
# Make folder make_folder_if_not_exists(path_to_AN_folder + "/" + category) current_working_directory = os.getcwd() #find current working directory output_file_hdfs = config.general_category_templates[category] % sample output_file = output_file_hdfs.replace( "/hdfs/TopQuarkGroup/results/histogramfiles", current_working_directory) input_files = [ config.general_category_templates[category] % input_sample for input_sample in input_samples ] if not os.path.exists(output_file): merge_ROOT_files(input_files, output_file, compression=7, waitToFinish=True) print "merging ", sample else: print 'Not merging ', sample, 'as', output_file, 'already exists' # Now move output file to hdfs # Check if file already exists on hdfs if os.path.exists(output_file_hdfs): print "Output file on hdfs already exists. Removing and replacing with new version." command = 'hadoop fs -rm -skipTrash ' + output_file_hdfs.split('/hdfs')[-1] p = subprocess.Popen(command, shell=True) p.wait() print '\nStarting rsync' output_log_file = output_file.replace(".root", ".log")
# NOTE(review): mid-function fragment; indentation reconstructed from a
# collapsed source line — confirm against the original file.
# A job is a (category, sample, input_samples) triple.
category = job[0]
sample = job[1]
input_samples = job[2]
# print 'Test with :',sample, category, input_samples
# Make folder
make_folder_if_not_exists( path_to_AN_folder + "/" + category)
current_working_directory = os.getcwd() #find current working directory
# Merge into the local working directory first, then ship to hdfs below.
output_file_hdfs = config.general_category_templates[category] % sample
output_file = output_file_hdfs.replace("/hdfs/TopQuarkGroup/results/histogramfiles", current_working_directory)
input_files = [config.general_category_templates[category] % input_sample for input_sample in input_samples]
# Only merge when the local output is not already present.
if not os.path.exists( output_file ):
    merge_ROOT_files( input_files, output_file, compression = 7, waitToFinish=True )
    print "merging ", sample
else :
    print 'Not merging ',sample,'as',output_file,'already exists'
# Now move output file to hdfs
# Check if file already exists on hdfs
if os.path.exists( output_file_hdfs ):
    print "Output file on hdfs already exists. Removing and replacing with new version."
    # Strip the local /hdfs mount prefix: 'hadoop fs' wants the hdfs path.
    command = 'hadoop fs -rm -skipTrash ' + output_file_hdfs.split('/hdfs')[-1]
    p = subprocess.Popen(command, shell=True)
    p.wait()
print '\nStarting rsync'
output_log_file = output_file.replace(".root", ".log")
# Copy the merged file to its hdfs destination, appending progress to the log.
command = 'rsync --verbose --progress --stats --compress --recursive --times --update %s %s >> %s' % (output_file,output_file_hdfs,output_log_file)
current_working_directory = os.getcwd() #find current working directory output_file = config_8TeV.central_general_template % sample output_file = output_file.replace( "/hdfs/TopQuarkGroup/results/histogramfiles", current_working_directory) input_files = [ config_8TeV.central_general_template % input_sample for input_sample in input_samples ] print output_file for input_file in input_files: print input_file if not os.path.exists(output_file): merge_ROOT_files(input_files, output_file, compression=7) print "merging ", sample new_files.append(output_file) print '=' * 120 # if 8 concurrent processes, wait until they are finished before starting the next set to avoid overloading the machine while (int( subprocess.check_output("ps ax | grep 'hadd' | wc -l", shell=True)) - 2) >= 8: time.sleep(30) # sleep for 30 seconds # merge all other histogram files for category in config_8TeV.categories_and_prefixes.keys(): for sample, input_samples in sample_summations.iteritems(): if not sample in ['QCD_Electron', 'QCD_Muon', 'VJets', 'SingleTop']: # continue