import json
import multiprocessing
import os
from collections import defaultdict

from joblib import Parallel, delayed

# `utils` and `logger` are provided by the surrounding root_optimize module


def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    else:
        raise IOError(
            "Output directory already exists: {0:s}".format(args.output_directory)
        )

    # first step is to group by the sample DID
    dids = defaultdict(list)
    for fname in args.files:
        dids[utils.get_did(fname)].append(fname)

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)

    # load up the weights file
    if not os.path.isfile(args.weightsFile):
        raise ValueError(
            "The supplied weights file `{0}` does not exist or I cannot find it.".format(
                args.weightsFile
            )
        )
    else:
        # open() replaces the Python 2-only file() builtin
        with open(args.weightsFile) as weights_file:
            weights = json.load(weights_file)

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))
    results = Parallel(n_jobs=num_cores)(
        delayed(utils.do_cut)(
            did,
            files,
            supercuts,
            weights,
            args.tree_name,
            args.output_directory,
            args.eventWeightBranch,
            args.numpy,
        )
        # items() replaces the Python 2-only iteritems()
        for did, files in dids.items()
    )

    for did, result in zip(dids, results):
        logger.log(25, "DID {0:s}: {1:s}".format(did, "ok" if result[0] else "not ok"))
    logger.log(
        25,
        "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))
        ),
    )
    return True
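# For reference, a minimal sketch of how this first version might be invoked.
# Only the attribute names are taken from do_cuts() itself; the concrete values
# (file names, tree and branch names) are hypothetical.
from argparse import Namespace

example_args = Namespace(
    files=["user.sample.123456._000001.root"],  # hypothetical input ROOT files
    output_directory="cuts_output",
    supercuts="supercuts.json",
    weightsFile="weights.json",
    tree_name="oTree",  # assumed TTree name
    eventWeightBranch="weight",  # assumed event-weight branch name
    numpy=False,
    num_cores=4,
)
do_cuts(example_args)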
import json
import multiprocessing
import os
import tempfile
from collections import defaultdict

import tqdm
from joblib import Parallel, delayed

# `utils` and `logger` are provided by the surrounding root_optimize module


def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    elif args.overwrite:
        import shutil

        shutil.rmtree(args.output_directory)
    else:
        raise IOError(
            "Output directory already exists: {0:s}".format(args.output_directory)
        )

    # first step is to group by the sample DID
    dids = defaultdict(list)
    for fname in args.files:
        dids[utils.get_did(fname)].append(fname)

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)

    # load up the weights file
    if not os.path.isfile(args.weightsFile):
        raise ValueError(
            "The supplied weights file `{0}` does not exist or I cannot find it.".format(
                args.weightsFile
            )
        )
    else:
        # open() replaces the Python 2-only file() builtin
        with open(args.weightsFile) as weights_file:
            weights = json.load(weights_file)

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))

    pids = None
    # if pids is None, do_cut() will disable the progress
    if not args.hide_subtasks:
        from numpy import memmap, uint64

        # shared memory-mapped array so each worker can claim a progress-bar slot
        pids = memmap(
            os.path.join(tempfile.mkdtemp(), "pids"),
            dtype=uint64,
            shape=num_cores,
            mode="w+",
        )

    overall_progress = tqdm.tqdm(
        total=len(dids),
        desc="Num. files",
        position=0,
        leave=True,
        unit="file",
        dynamic_ncols=True,
    )

    # monkey-patch joblib's completion hook (named CallBack in older joblib
    # releases) so the overall progress bar ticks as jobs finish
    class CallBack(object):
        completed = defaultdict(int)

        def __init__(self, index, parallel):
            self.index = index
            self.parallel = parallel

        def __call__(self, index):
            CallBack.completed[self.parallel] += 1
            overall_progress.update()
            overall_progress.refresh()
            if self.parallel._original_iterable:
                self.parallel.dispatch_next()

    import joblib.parallel

    joblib.parallel.CallBack = CallBack

    results = Parallel(n_jobs=num_cores)(
        delayed(utils.do_cut)(
            did,
            files,
            supercuts,
            weights,
            args.tree_name,
            args.output_directory,
            args.eventWeightBranch,
            args.numpy,
            pids,
        )
        for did, files in dids.items()
    )
    overall_progress.close()

    for did, result in zip(dids, results):
        logger.log(25, "DID {0:s}: {1:s}".format(did, "ok" if result[0] else "not ok"))
    logger.log(
        25,
        "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))
        ),
    )
    return True
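# The in-place patch above targets older joblib releases, where the completion
# hook was a class named CallBack and Parallel kept an `_original_iterable`
# attribute. On current joblib the hook is BatchCompletionCallBack; a sketch of
# the same idea as a reversible context manager (assumes a joblib version where
# the callback carries a `batch_size` attribute, as modern releases do):
import contextlib

import joblib
import tqdm


@contextlib.contextmanager
def tqdm_joblib(progress_bar):
    """Temporarily patch joblib so `progress_bar` ticks once per completed batch."""

    class TqdmBatchCompletionCallBack(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            progress_bar.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallBack
    try:
        yield progress_bar
    finally:
        # restore the original hook so later Parallel calls are unaffected,
        # unlike the permanent module-level patch in do_cuts() above
        joblib.parallel.BatchCompletionCallBack = old_callback
        progress_bar.close()


# usage sketch with a toy workload:
# with tqdm_joblib(tqdm.tqdm(total=100, unit="job")):
#     joblib.Parallel(n_jobs=4)(joblib.delayed(pow)(i, 2) for i in range(100))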
import fnmatch
import itertools
import multiprocessing
import os
import re
import tempfile
from collections import defaultdict

import tqdm
import uproot
from joblib import Parallel, delayed

# `utils` and `logger` are provided by the surrounding root_optimize module


def do_cuts(args):
    from root_optimize.timing import secondsToStr

    # before doing anything, let's ensure the directory we make is ok
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    elif args.overwrite:
        import shutil

        shutil.rmtree(args.output_directory)
    else:
        raise IOError(
            "Output directory already exists: {0:s}".format(args.output_directory)
        )

    # translate the fnmatch-style tree patterns into byte-level regexes, since
    # uproot 3 reports object names as bytes
    tree_patterns = [
        re.compile(str.encode(fnmatch.translate(tree_pattern)))
        for tree_pattern in args.tree_patterns
    ]

    # first step is to group by the tree name
    trees = defaultdict(list)
    for fname in args.files:
        with uproot.open(fname) as f:
            # strip the ROOT cycle number (";1") and deduplicate
            tree_names = set(
                sorted(
                    tname.split(b";")[0]
                    for tname in f.allkeys(
                        filterclass=lambda cls: issubclass(
                            cls, uproot.tree.TTreeMethods
                        )
                    )
                )
            )
        logger.log(25, "{0:s} has {1:d} trees".format(fname, len(tree_names)))
        for tree_name in tree_names:
            matched = any(
                tree_pattern.search(tree_name) for tree_pattern in tree_patterns
            )
            if matched:
                trees[tree_name].append(fname)
            logger.log(
                25,
                " - [{1:s}] {0:s}".format(
                    tree_name.decode("utf-8"), "x" if matched else " "
                ),
            )

    # load in the supercuts file
    supercuts = utils.read_supercuts_file(args.supercuts)
    branchesSpecified = utils.supercuts_to_branches(supercuts)
    eventWeightBranchesSpecified = utils.extract_branch_names(args.eventWeightBranch)
    proposedBranches = set(
        map(
            str.encode,
            itertools.chain(branchesSpecified, eventWeightBranchesSpecified),
        )
    )

    # parallelize
    num_cores = min(multiprocessing.cpu_count(), args.num_cores)
    logger.log(25, "Using {0} cores".format(num_cores))

    pids = None
    # if pids is None, do_cut() will disable the progress
    if not args.hide_subtasks:
        from numpy import memmap, uint64

        pids = memmap(
            os.path.join(tempfile.mkdtemp(), "pids"),
            dtype=uint64,
            shape=num_cores,
            mode="w+",
        )

    overall_progress = tqdm.tqdm(
        total=len(trees),
        desc="Num. trees",
        position=0,
        leave=True,
        unit="tree",
        ncols=120,
        miniters=1,
    )

    # monkey-patch joblib's completion hook (its newer name and signature) so
    # the overall progress bar ticks as jobs finish
    class BatchCompletionCallBack(object):
        completed = defaultdict(int)

        def __init__(self, time, index, parallel):
            self.index = index
            self.parallel = parallel

        def __call__(self, index):
            BatchCompletionCallBack.completed[self.parallel] += 1
            overall_progress.update()
            # overall_progress.refresh()
            if self.parallel._original_iterator is not None:
                self.parallel.dispatch_next()

    import joblib.parallel

    joblib.parallel.BatchCompletionCallBack = BatchCompletionCallBack

    with utils.std_out_err_redirect_tqdm():
        results = Parallel(n_jobs=num_cores)(
            delayed(utils.do_cut)(
                tree_name,
                files,
                supercuts,
                proposedBranches,
                args.output_directory,
                args.eventWeightBranch,
                pids,
            )
            for tree_name, files in trees.items()
        )
    overall_progress.close()

    for tree_name, result in zip(trees, results):
        logger.log(
            25,
            "Tree {0:s}: {1:s}".format(
                tree_name.decode("utf-8"), "ok" if result[0] else "not ok"
            ),
        )
    logger.log(
        25,
        "Total CPU elapsed time: {0}".format(
            secondsToStr(sum(result[1] for result in results))
        ),
    )
    return True
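# The tree selection above leans on fnmatch.translate() turning a shell-style
# glob into a regex, encoded to bytes so it can be searched against uproot 3's
# bytes object names. A self-contained illustration of just that mechanic (the
# glob and the tree names here are made up):
import fnmatch
import re

pattern = re.compile(fnmatch.translate("nominal*").encode())

for name in [b"nominal;1", b"nominal_Loose;1", b"systematics;1"]:
    cleaned = name.split(b";")[0]  # strip the ROOT cycle number, as do_cuts() does
    print(cleaned, bool(pattern.search(cleaned)))
# prints: b'nominal' True / b'nominal_Loose' True / b'systematics' False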