class BundleMercurialRepository(Task):

    task_namespace = "law.mercurial"

    exclude_files = CSVParameter(default=(), description="patterns of files to exclude, "
        "default: ()")
    include_files = CSVParameter(default=(), description="patterns of files to force-include, "
        "takes precedence over .hgignore, default: ()")
    custom_checksum = luigi.Parameter(default=NO_STR, description="a custom checksum to use, "
        "default: NO_STR")

    def __init__(self, *args, **kwargs):
        super(BundleMercurialRepository, self).__init__(*args, **kwargs)

        self._checksum = None

    @abstractmethod
    def get_repo_path(self):
        return

    @property
    def checksum(self):
        if self.custom_checksum != NO_STR:
            return self.custom_checksum

        if self._checksum is None:
            checksum_script = rel_path(__file__, "scripts", "repository_checksum.sh")
            cmd = [checksum_script, self.get_repo_path()]

            code, out, _ = interruptable_popen(cmd, stdout=subprocess.PIPE)
            if code != 0:
                raise Exception("repository checksum calculation failed")

            self._checksum = out.strip()

        return self._checksum

    def output(self):
        repo_base = os.path.basename(self.get_repo_path())
        return LocalFileTarget("{}_{}.tgz".format(repo_base, self.checksum))

    @log
    def run(self):
        with self.output().localize("w") as tmp:
            self.bundle(tmp.path)

    def bundle(self, dst_path):
        bundle_script = rel_path(__file__, "scripts", "bundle_repository.sh")
        cmd = [bundle_script, self.get_repo_path(), get_path(dst_path)]
        cmd += [" ".join(self.exclude_files)]
        cmd += [" ".join(self.include_files)]

        code = interruptable_popen(cmd, executable="/bin/bash")[0]
        if code != 0:
            raise Exception("repository bundling failed")
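# Editor's note: a minimal usage sketch, not part of the original source. A concrete
# bundling task only needs to implement get_repo_path(); the environment variable used
# below is a hypothetical placeholder.
class BundleMyAnalysis(BundleMercurialRepository):

    def get_repo_path(self):
        # absolute path of the mercurial repository to bundle
        return os.environ["MY_ANALYSIS_REPO"]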
class GLiteWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = GLiteWorkflowProxy

    glite_workflow_run_decorators = None
    glite_job_manager_defaults = None
    glite_job_file_factory_defaults = None

    glite_ce = CSVParameter(default=(), significant=False, description="target glite computing "
        "element(s), default: ()")

    glite_job_kwargs = []
    glite_job_kwargs_submit = ["glite_ce"]
    glite_job_kwargs_cancel = None
    glite_job_kwargs_cleanup = None
    glite_job_kwargs_query = None

    exclude_params_branch = {"glite_ce"}

    exclude_index = True

    @abstractmethod
    def glite_output_directory(self):
        return None

    @abstractmethod
    def glite_bootstrap_file(self):
        return None

    def glite_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def glite_stageout_file(self):
        return None

    def glite_workflow_requires(self):
        return OrderedDict()

    def glite_output_postfix(self):
        self.get_branch_map()
        if self.branches:
            return "_" + "_".join(str(b) for b in sorted(self.branches))
        else:
            return "_{}To{}".format(self.start_branch, self.end_branch)

    def glite_output_uri(self):
        return self.glite_output_directory().url()

    def glite_delegate_proxy(self, endpoint):
        return delegate_voms_proxy_glite(endpoint, stdout=sys.stdout, stderr=sys.stderr,
            cache=True)

    def glite_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.glite_job_manager_defaults, kwargs)
        return GLiteJobManager(**kwargs)

    def glite_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.glite_job_file_factory_defaults, kwargs)
        return GLiteJobFileFactory(**kwargs)

    def glite_job_config(self, config, job_num, branches):
        return config

    def glite_dump_intermediate_submission_data(self):
        return True

    def glite_post_submit_delay(self):
        return self.poll_interval * 60

    def glite_use_local_scheduler(self):
        return True

    def glite_cmdline_args(self):
        return []
class GLiteWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = GLiteWorkflowProxy

    glite_workflow_run_decorators = None
    glite_job_manager_defaults = None
    glite_job_file_factory_defaults = None

    glite_ce = CSVParameter(
        default=(),
        significant=False,
        description="target glite computing element(s); default: empty",
    )

    glite_job_kwargs = []
    glite_job_kwargs_submit = ["glite_ce"]
    glite_job_kwargs_cancel = None
    glite_job_kwargs_cleanup = None
    glite_job_kwargs_query = None

    exclude_params_branch = {"glite_ce"}

    exclude_index = True

    @abstractmethod
    def glite_output_directory(self):
        return None

    @abstractmethod
    def glite_bootstrap_file(self):
        return None

    def glite_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def glite_stageout_file(self):
        return None

    def glite_workflow_requires(self):
        return DotDict()

    def glite_output_postfix(self):
        return "_" + self.get_branches_repr()

    def glite_output_uri(self):
        return self.glite_output_directory().url()

    def glite_delegate_proxy(self, endpoint):
        return delegate_voms_proxy_glite(endpoint, stdout=sys.stdout, stderr=sys.stderr,
            cache=True)

    def glite_job_manager_cls(self):
        return GLiteJobManager

    def glite_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.glite_job_manager_defaults, kwargs)
        return self.glite_job_manager_cls()(**kwargs)

    def glite_job_file_factory_cls(self):
        return GLiteJobFileFactory

    def glite_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.glite_job_file_factory_defaults, kwargs)
        return self.glite_job_file_factory_cls()(**kwargs)

    def glite_job_config(self, config, job_num, branches):
        return config

    def glite_use_local_scheduler(self):
        return True

    def glite_cmdline_args(self):
        return {}

    def glite_destination_info(self, info):
        return info
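# Editor's note: a minimal usage sketch, not part of the original source. A concrete
# workflow fills in the two abstract hooks; the remote target class, path, and file name
# below are assumptions for illustration.
class MyGLiteWorkflow(GLiteWorkflow):

    def glite_output_directory(self):
        # remote directory that receives job outputs (hypothetical path)
        return WLCGDirectoryTarget("/store/user/someone/outputs")

    def glite_bootstrap_file(self):
        # script shipped with each job to set up the software environment
        return rel_path(__file__, "bootstrap.sh")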
class BaseWorkflow(Task):

    workflow = luigi.Parameter(default=NO_STR, significant=False, description="the type of the "
        "workflow to use")
    acceptance = luigi.FloatParameter(default=1.0, significant=False, description="number of "
        "finished jobs to consider the task successful, relative fraction (<= 1) or absolute "
        "value (> 1), default: 1.0")
    tolerance = luigi.FloatParameter(default=0.0, significant=False, description="number of "
        "failed jobs to still consider the task successful, relative fraction (<= 1) or "
        "absolute value (> 1), default: 0.0")
    pilot = luigi.BoolParameter(significant=False, description="disable requirements of the "
        "workflow to let branch tasks resolve requirements on their own")
    branch = luigi.IntParameter(default=-1, description="the branch number/index to run this "
        "task for, -1 means this task is the workflow, default: -1")
    start_branch = luigi.IntParameter(default=NO_INT, description="the branch to start at, "
        "default: 0")
    end_branch = luigi.IntParameter(default=NO_INT, description="the branch to end at, NO_INT "
        "means end, default: NO_INT")
    branches = CSVParameter(cls=luigi.IntParameter, default=[], significant=False,
        description="branches to use")

    workflow_proxy_cls = BaseWorkflowProxy

    target_collection_cls = None
    outputs_siblings = False
    force_contiguous_branches = False

    workflow_property = None
    cached_workflow_property = None

    exclude_db = True
    exclude_params_branch = {"print_deps", "print_status", "remove_output", "workflow",
        "acceptance", "tolerance", "pilot", "start_branch", "end_branch", "branches"}
    exclude_params_workflow = {"branch"}

    def __init__(self, *args, **kwargs):
        super(BaseWorkflow, self).__init__(*args, **kwargs)

        # determine workflow proxy class to instantiate
        if self.is_workflow():
            classes = self.__class__.mro()
            for cls in classes:
                if not issubclass(cls, BaseWorkflow):
                    continue
                if self.workflow in (NO_STR, cls.workflow_proxy_cls.workflow_type):
                    self.workflow = cls.workflow_proxy_cls.workflow_type
                    self.workflow_proxy = cls.workflow_proxy_cls(task=self)
                    logger.debug("created workflow proxy instance of type '{}'".format(
                        cls.workflow_proxy_cls.workflow_type))
                    break
            else:
                raise ValueError("unknown workflow type {}".format(self.workflow))

            # cached attributes for the workflow
            self._branch_map = None
            self._branch_tasks = None
        else:
            # cached attributes for branches
            self._workflow_task = None

    def _forward_attribute(self, attr):
        return attr in _forward_attributes and self.is_workflow()

    def __getattribute__(self, attr, proxy=True, force=False):
        if proxy and attr != "__class__":
            if force or (attr != "_forward_attribute" and self._forward_attribute(attr)):
                return getattr(self.workflow_proxy, attr)
        return super(BaseWorkflow, self).__getattribute__(attr)

    def cli_args(self, exclude=None, replace=None):
        if exclude is None:
            exclude = set()

        if self.is_branch():
            exclude |= self.exclude_params_branch
        else:
            exclude |= self.exclude_params_workflow

        return super(BaseWorkflow, self).cli_args(exclude=exclude, replace=replace)

    def is_branch(self):
        return self.branch != -1

    def is_workflow(self):
        return not self.is_branch()

    def as_branch(self, branch=0):
        if self.is_branch():
            return self
        else:
            return self.req(self, branch=branch)

    def as_workflow(self):
        if self.is_workflow():
            return self
        else:
            if self._workflow_task is None:
                self._workflow_task = self.req(self, branch=NO_INT)
            return self._workflow_task

    @abstractmethod
    def create_branch_map(self):
        return

    def _reset_branch_boundaries(self, branches=None):
        if self.is_branch():
            raise Exception("calls to _reset_branch_boundaries are forbidden for branch tasks")

        if branches is None:
            branches = list(self._branch_map.keys())

        min_branch = min(branches)
        max_branch = max(branches)

        # reset start_branch
        self.start_branch = max(min_branch, min(max_branch, self.start_branch))

        # reset end_branch
        if self.end_branch < 0:
            self.end_branch = sys.maxsize
        self.end_branch = max(self.start_branch, min(max_branch + 1, self.end_branch))

    def _reduce_branch_map(self):
        if self.is_branch():
            raise Exception("calls to _reduce_branch_map are forbidden for branch tasks")

        # reduce by start/end branch
        for b in list(self._branch_map.keys()):
            if not (self.start_branch <= b < self.end_branch):
                del self._branch_map[b]

        # reduce by branches
        if self.branches:
            for b in list(self._branch_map.keys()):
                if b not in self.branches:
                    del self._branch_map[b]

    def get_branch_map(self, reset_boundaries=True, reduce=True):
        if self.is_branch():
            return self.as_workflow().get_branch_map(reset_boundaries=reset_boundaries,
                reduce=reduce)
        else:
            if self._branch_map is None:
                self._branch_map = self.create_branch_map()

                # some type and sanity checks
                if isinstance(self._branch_map, (list, tuple)):
                    self._branch_map = dict(enumerate(self._branch_map))
                elif self.force_contiguous_branches:
                    n = len(self._branch_map)
                    if set(self._branch_map.keys()) != set(range(n)):
                        raise ValueError("branch map keys must constitute contiguous range "
                            "[0, {})".format(n))
                else:
                    for branch in self._branch_map:
                        if not isinstance(branch, six.integer_types) or branch < 0:
                            raise ValueError("branch map keys must be non-negative integers, "
                                "got '{}' ({})".format(branch, type(branch).__name__))

                # post-process
                if reset_boundaries:
                    self._reset_branch_boundaries()
                if reduce:
                    self._reduce_branch_map()

            return self._branch_map

    @property
    def branch_map(self):
        return self.get_branch_map()

    @property
    def branch_data(self):
        if self.is_workflow():
            raise Exception("calls to branch_data are forbidden for workflow tasks")
        elif self.branch not in self.branch_map:
            raise ValueError("invalid branch '{}', not found in branch map".format(self.branch))

        return self.branch_map[self.branch]

    def get_branch_tasks(self):
        if self.is_branch():
            return self.as_workflow().get_branch_tasks()
        else:
            if self._branch_tasks is None:
                branch_map = self.branch_map
                if branch_map is None:
                    raise AttributeError("workflow task '{}' requires a branch_map".format(self))

                self._branch_tasks = OrderedDict()
                for b in branch_map:
                    self._branch_tasks[b] = self.req(self, branch=b,
                        _exclude=self.exclude_params_branch)

            return self._branch_tasks

    def workflow_requires(self):
        if self.is_branch():
            raise Exception("calls to workflow_requires are forbidden for branch tasks")

        return OrderedDict()

    def workflow_input(self):
        if self.is_branch():
            raise Exception("calls to workflow_input are forbidden for branch tasks")

        return luigi.task.getpaths(self.workflow_proxy.requires())

    def requires_from_branch(self):
        if self.is_branch():
            raise Exception("calls to requires_from_branch are forbidden for branch tasks")

        return self.__class__.requires(self)
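# Editor's note: a usage sketch, not part of the original source. The boundary
# parameters above are typically set on the command line; "MyWorkflow" is a
# hypothetical task name:
#
#   law run MyWorkflow --start-branch 10 --end-branch 20   # keeps branches 10..19
#   law run MyWorkflow --branches 1,4,7                    # keeps exactly these branches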
class BaseWorkflow(Task):
    """
    Base class of all workflows.

    .. py:classattribute:: workflow
       type: luigi.Parameter

       Workflow type that refers to the workflow proxy implementation at instantiation /
       execution time. Empty default value.

    .. py:classattribute:: acceptance
       type: luigi.FloatParameter

       Number of complete tasks to consider the workflow successful. Values larger than one are
       interpreted as absolute numbers, and as fractions otherwise. Defaults to *1.0*.

    .. py:classattribute:: tolerance
       type: luigi.FloatParameter

       Number of failed tasks to still consider the workflow successful. Values larger than one
       are interpreted as absolute numbers, and as fractions otherwise. Defaults to *0.0*.

    .. py:classattribute:: branch
       type: luigi.IntParameter

       The branch number to run this task for. *-1* means that this task is the actual
       *workflow*, rather than a *branch* task. Defaults to *-1*.

    .. py:classattribute:: start_branch
       type: luigi.IntParameter

       First branch to process. Defaults to *0*.

    .. py:classattribute:: end_branch
       type: luigi.IntParameter

       First branch that is *not* processed (pythonic). Defaults to *-1*.

    .. py:classattribute:: branches
       type: law.CSVParameter

       Explicit list of branches to process. Empty default value.

    .. py:classattribute:: workflow_proxy_cls
       type: BaseWorkflowProxy

       Reference to the workflow proxy class associated to this workflow.

    .. py:classattribute:: workflow_complete
       type: None, callable

       Custom completion check that is used by the workflow's proxy when callable.

    .. py:classattribute:: output_collection_cls
       type: TargetCollection

       Configurable target collection class to use, such as
       :py:class:`target.collection.TargetCollection`,
       :py:class:`target.collection.FileCollection` or
       :py:class:`target.collection.SiblingFileCollection`.

    .. py:classattribute:: force_contiguous_branches
       type: bool

       Flag that denotes if this workflow is forced to use contiguous branch numbers, starting
       from 0. If *True*, an exception is raised otherwise.

    .. py:classattribute:: workflow_property
       type: function

       Reference to :py:func:`workflow_property`.

    .. py:classattribute:: cached_workflow_property
       type: function

       Reference to :py:func:`cached_workflow_property`.

    .. py:classattribute:: workflow_run_decorators
       type: sequence, None

       Sequence of decorator functions that will be conveniently used to decorate the workflow
       proxy's run method. This way, there is no need to subclass and reset the
       :py:attr:`workflow_proxy_cls` just to add a decorator. The value is *None* by default.

    .. py:attribute:: workflow_cls
       type: law.task.Register

       Reference to the class of the realized workflow. This is especially helpful in case your
       derived class inherits from multiple workflows.

    .. py:attribute:: workflow_proxy
       type: BaseWorkflowProxy

       Reference to the underlying workflow proxy instance.

    .. py:attribute:: branch_map
       read-only
       type: dict

       Shorthand for :py:meth:`get_branch_map`.

    .. py:attribute:: branch_data
       read-only

       Shorthand for ``self.branch_map[self.branch]``.
    """

    workflow = luigi.Parameter(default=NO_STR, significant=False, description="the type of the "
        "workflow to use")
    acceptance = luigi.FloatParameter(default=1.0, significant=False, description="number of "
        "finished tasks to consider the task successful, relative fraction (<= 1) or absolute "
        "value (> 1), default: 1.0")
    tolerance = luigi.FloatParameter(default=0.0, significant=False, description="number of "
        "failed tasks to still consider the task successful, relative fraction (<= 1) or "
        "absolute value (> 1), default: 0.0")
    pilot = luigi.BoolParameter(significant=False, description="disable requirements of the "
        "workflow to let branch tasks resolve requirements on their own")
    branch = luigi.IntParameter(default=-1, description="the branch number/index to run this "
        "task for, -1 means this task is the workflow, default: -1")
    start_branch = luigi.IntParameter(default=NO_INT, description="the branch to start at, "
        "default: 0")
    end_branch = luigi.IntParameter(default=NO_INT, description="the branch to end at, NO_INT "
        "means end, default: NO_INT")
    branches = CSVParameter(default=[], significant=False, description="branches to use")

    workflow_proxy_cls = BaseWorkflowProxy

    workflow_complete = None
    output_collection_cls = None
    force_contiguous_branches = False

    workflow_property = None
    cached_workflow_property = None
    workflow_run_decorators = None

    exclude_index = True
    exclude_params_branch = {
        "workflow", "acceptance", "tolerance", "pilot", "start_branch", "end_branch", "branches",
    }
    exclude_params_workflow = {"branch"}

    def __init__(self, *args, **kwargs):
        super(BaseWorkflow, self).__init__(*args, **kwargs)

        # determine workflow proxy class to instantiate
        if self.is_workflow():
            classes = self.__class__.mro()
            for cls in classes:
                if not issubclass(cls, BaseWorkflow):
                    continue
                if not cls._defined_workflow_proxy:
                    continue
                if self.workflow in (NO_STR, cls.workflow_proxy_cls.workflow_type):
                    self.workflow = cls.workflow_proxy_cls.workflow_type
                    self.workflow_cls = cls
                    self.workflow_proxy = cls.workflow_proxy_cls(task=self)
                    logger.debug("created workflow proxy instance of type '{}'".format(
                        cls.workflow_proxy_cls.workflow_type))
                    break
            else:
                raise ValueError("unknown workflow type {}".format(self.workflow))

            # cached attributes for the workflow
            self._branch_map = None
            self._branch_tasks = None

        # cached attributes for branches
        self._workflow_task = None

    def __getattribute__(self, attr, proxy=True):
        return get_proxy_attribute(self, attr, proxy=proxy, super_cls=Task)

    def cli_args(self, exclude=None, replace=None):
        if exclude is None:
            exclude = set()

        if self.is_branch():
            exclude |= self.exclude_params_branch
        else:
            exclude |= self.exclude_params_workflow

        return super(BaseWorkflow, self).cli_args(exclude=exclude, replace=replace)

    def is_branch(self):
        """
        Returns whether or not this task refers to a *branch*.
        """
        return self.branch != -1

    def is_workflow(self):
        """
        Returns whether or not this task refers to the *workflow*.
        """
        return not self.is_branch()

    def as_branch(self, branch=0):
        """
        When this task refers to the workflow, a re-instantiated task with a certain *branch*
        and identical parameters is returned. Otherwise, the branch task itself is returned.
        """
        if self.is_branch():
            return self
        else:
            return self.req(self, branch=branch, _exclude=self.exclude_params_branch)

    def as_workflow(self):
        """
        When this task refers to a branch task, a re-instantiated task with ``branch=-1`` and
        identical parameters is returned. Otherwise, the workflow itself is returned.
        """
        if self.is_workflow():
            return self
        else:
            if self._workflow_task is None:
                self._workflow_task = self.req(self, branch=-1,
                    _exclude=self.exclude_params_workflow)
            return self._workflow_task

    @abstractmethod
    def create_branch_map(self):
        """
        Abstract method that must be overwritten by inheriting tasks to define the branch map.
        """
        return

    def _reset_branch_boundaries(self, branches=None):
        if self.is_branch():
            raise Exception("calls to _reset_branch_boundaries are forbidden for branch tasks")

        if branches is None:
            branches = list(self._branch_map.keys())

        min_branch = min(branches)
        max_branch = max(branches)

        # reset start_branch
        self.start_branch = max(min_branch, min(max_branch, self.start_branch))

        # reset end_branch
        if self.end_branch < 0:
            self.end_branch = sys.maxsize
        self.end_branch = max(self.start_branch, min(max_branch + 1, self.end_branch))

    def _reduce_branch_map(self):
        if self.is_branch():
            raise Exception("calls to _reduce_branch_map are forbidden for branch tasks")

        # reduce by start/end branch
        for b in list(self._branch_map.keys()):
            if not (self.start_branch <= b < self.end_branch):
                del self._branch_map[b]

        # reduce by branches
        if self.branches:
            # helper to expand slices, e.g. "1-3" -> 1,2,3 or "4-" -> 4,5,6,...
            def expand(b):
                if "-" in str(b):
                    parts = str(b).strip().split("-")
                    if len(parts) == 2:
                        start = int(parts[0]) if parts[0] else None
                        end = int(parts[1]) if parts[1] else None
                        return start, end
                return int(b)

            # determine branches to remove
            remove_branches = sorted(list(self._branch_map.keys()))
            for b in self.branches:
                b = expand(b)
                if isinstance(b, tuple):
                    start = b[0] if b[0] is not None else min(remove_branches)
                    end = b[1] if b[1] is not None else max(remove_branches)
                    for b in range(start, end + 1):
                        if b in remove_branches:
                            remove_branches.remove(b)
                else:
                    if b in remove_branches:
                        remove_branches.remove(b)

            # actual removal
            for b in remove_branches:
                del self._branch_map[b]

    def get_branch_map(self, reset_boundaries=True, reduce=True):
        """
        Creates and returns the branch map defined in :py:meth:`create_branch_map`. If
        *reset_boundaries* is *True*, the *start_branch* and *end_branch* attributes are
        rearranged to not exceed the actual branch map length. If *reduce* is *True* and an
        explicit list of branch numbers was set, the branch map is filtered accordingly. The
        branch map is cached.
        """
        if self.is_branch():
            return self.as_workflow().get_branch_map(reset_boundaries=reset_boundaries,
                reduce=reduce)
        else:
            if self._branch_map is None:
                self._branch_map = self.create_branch_map()

                # some type and sanity checks
                if isinstance(self._branch_map, (list, tuple)):
                    self._branch_map = dict(enumerate(self._branch_map))
                elif isinstance(self._branch_map, six.integer_types):
                    self._branch_map = dict(enumerate(range(self._branch_map)))
                elif self.force_contiguous_branches:
                    n = len(self._branch_map)
                    if set(self._branch_map.keys()) != set(range(n)):
                        raise ValueError("branch map keys must constitute contiguous range "
                            "[0, {})".format(n))
                else:
                    for branch in self._branch_map:
                        if not isinstance(branch, six.integer_types) or branch < 0:
                            raise ValueError("branch map keys must be non-negative integers, "
                                "got '{}' ({})".format(branch, type(branch).__name__))

                # post-process
                if reset_boundaries:
                    self._reset_branch_boundaries()
                if reduce:
                    self._reduce_branch_map()

            return self._branch_map

    @property
    def branch_map(self):
        return self.get_branch_map()

    @property
    def branch_data(self):
        if self.is_workflow():
            raise Exception("calls to branch_data are forbidden for workflow tasks")
        elif self.branch not in self.branch_map:
            raise ValueError("invalid branch '{}', not found in branch map".format(self.branch))

        return self.branch_map[self.branch]

    def get_branch_tasks(self):
        """
        Returns a dictionary that maps branch numbers to instantiated branch tasks. As this
        might be computationally intensive, the return value is cached.
        """
        if self.is_branch():
            return self.as_workflow().get_branch_tasks()
        else:
            if self._branch_tasks is None:
                branch_map = self.get_branch_map()
                if branch_map is None:
                    raise AttributeError("workflow task '{}' requires a branch_map".format(self))

                self._branch_tasks = OrderedDict()
                for b in branch_map:
                    self._branch_tasks[b] = self.req(self, branch=b,
                        _exclude=self.exclude_params_branch)

            return self._branch_tasks

    def workflow_requires(self):
        """
        Hook to add workflow requirements. This method is expected to return a dictionary. When
        this method is called from a branch task, an exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to workflow_requires are forbidden for branch tasks")

        return OrderedDict()

    def workflow_input(self):
        """
        Returns the output targets of all workflow requirements, comparable to the normal
        ``input()`` method of plain tasks. When this method is called from a branch task, an
        exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to workflow_input are forbidden for branch tasks")

        return luigi.task.getpaths(self.workflow_proxy.requires())

    def requires_from_branch(self):
        """
        Returns the requirements defined in the standard ``requires()`` method, but called in
        the context of the workflow. This method is only recommended in case all required tasks
        that would normally take a branch number are intended to be instantiated with
        ``branch=-1``. When this method is called from a branch task, an exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to requires_from_branch are forbidden for branch tasks")

        return self.__class__.requires(self)
class Task(BaseTask):

    log_file = luigi.Parameter(default=NO_STR, significant=False, description="a custom log "
        "file, default: <task.default_log_file>")
    print_deps = CSVParameter(default=(), significant=False, description="print task "
        "dependencies but do not run any task; this CSV parameter accepts a single integer "
        "value which sets the task recursion depth (0 means non-recursive)")
    print_status = CSVParameter(default=(), significant=False, description="print the task "
        "status but do not run any task; this CSV parameter accepts up to three values: 1. the "
        "task recursion depth (0 means non-recursive), 2. the depth of the status text of "
        "target collections (default: 0), 3. a flag that is passed to the status text creation "
        "(default: '')")
    print_output = CSVParameter(default=(), significant=False, description="print a flat list "
        "of output targets but do not run any task; this CSV parameter accepts a single integer "
        "value which sets the task recursion depth (0 means non-recursive)")
    remove_output = CSVParameter(default=(), significant=False, description="remove task "
        "outputs but do not run any task; this CSV parameter accepts up to three values: 1. the "
        "task recursion depth (0 means non-recursive), 2. one of the modes 'i' (interactive), "
        "'a' (all), 'd' (dry run) (default: 'i'), 3. a flag that decides whether outputs of "
        "external tasks should be removed (default: False)")
    fetch_output = CSVParameter(default=(), significant=False, description="copy all task "
        "outputs into a local directory but do not run any task; this CSV parameter accepts up "
        "to four values: 1. the task recursion depth (0 means non-recursive), 2. one of the "
        "modes 'i' (interactive), 'a' (all), 'd' (dry run) (default: 'i'), 3. the target "
        "directory (default: '.'), 4. a flag that decides whether outputs of external tasks "
        "should be fetched (default: False)")

    interactive_params = [
        "print_deps", "print_status", "print_output", "remove_output", "fetch_output",
    ]

    message_cache_size = 10

    exclude_index = True
    exclude_params_req = set()
    exclude_params_repr = set()

    @classmethod
    def req_params(cls, inst, _exclude=None, _prefer_cli=None, **kwargs):
        _exclude = set() if _exclude is None else set(make_list(_exclude))

        # always exclude interactive parameters
        _exclude |= set(inst.interactive_params)

        return super(Task, cls).req_params(inst, _exclude=_exclude, _prefer_cli=_prefer_cli,
            **kwargs)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def is_root_task(self):
        return root_task() == self

    def publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)
        print(msg)
        sys.stdout.flush()

        self._publish_message(*args)

    def _publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        self.set_status_message("\n".join(self._message_cache))

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self, msg, success_message="done", fail_message="failed", runtime=False):
        self.publish_message(msg)
        success = False
        t0 = time.time()
        try:
            yield
            success = True
        finally:
            msg = success_message if success else fail_message
            if runtime:
                diff = time.time() - t0
                msg = "{} (took {})".format(msg, human_duration(seconds=diff))
            self.publish_message(msg)

    def publish_progress(self, percentage):
        percentage = int(math.floor(percentage))
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage
            self.set_progress_percentage(percentage)

    def create_progress_callback(self, n_total, reach=(0, 100)):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) * (end - start))
            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1)) for i in range(len(n_total))]
            return n_total.__class__(make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def cli_args(self, exclude=None, replace=None):
        exclude = set() if exclude is None else set(make_list(exclude))

        # always exclude interactive parameters
        exclude |= set(self.interactive_params)

        return super(Task, self).cli_args(exclude=exclude, replace=replace)

    def __repr__(self):
        return self.repr(color=False)

    def repr(self, all_params=False, color=None):
        if color is None:
            cfg = Config.instance()
            color = cfg.get_expanded_boolean("task", "colored_repr")

        family = self._repr_family(self.get_task_family(), color=color)

        parts = [
            self._repr_param(*pair, color=color)
            for pair in self._repr_params(all_params=all_params)
        ] + [
            self._repr_flag(flag, color=color)
            for flag in self._repr_flags()
        ]

        return "{}({})".format(family, ", ".join(parts))

    def colored_repr(self, all_params=False):
        # deprecation warning until v0.1
        logger.warning("the use of {0}.colored_repr() is deprecated, please use "
            "{0}.repr(color=True) instead".format(self.__class__.__name__))

        return self.repr(all_params=all_params, color=True)

    def _repr_params(self, all_params=False):
        # build key value pairs of all significant parameters
        params = self.get_params()

        exclude = set()
        if not all_params:
            exclude |= self.exclude_params_repr
            exclude |= self.inst_exclude_params_repr()
            exclude |= set(self.interactive_params)

        pairs = []
        for name, param in params:
            if param.significant and not multi_match(name, exclude):
                value = getattr(self, name)
                pairs.append((name, param.serialize(value)))

        return pairs

    def _repr_flags(self):
        return []

    def inst_exclude_params_repr(self):
        return set()

    @classmethod
    def _repr_family(cls, family, color=False):
        return colored(family, "green") if color else family

    @classmethod
    def _repr_param(cls, name, value, color=False):
        return "{}={}".format(colored(name, color="blue", style="bright") if color else name,
            value)

    @classmethod
    def _repr_flag(cls, name, color=False):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, args):
        return print_task_deps(self, *args)

    def _print_status(self, args):
        return print_task_status(self, *args)

    def _print_output(self, args):
        return print_task_output(self, *args)

    def _remove_output(self, args):
        return remove_task_output(self, *args)

    def _fetch_output(self, args):
        import law.target.remote as ltr

        with patch_object(ltr, "global_retries", 0, lock=True):
            return fetch_task_output(self, *args)

    def localize_input(self, *args, **kwargs):
        return localize_file_targets(self.input(), *args, **kwargs)

    def localize_output(self, *args, **kwargs):
        return localize_file_targets(self.output(), *args, **kwargs)
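# Editor's note: a usage sketch for the publishing helpers above, not part of the
# original source; the workload is a stand-in. publish_step() brackets a step with
# scheduler messages, and create_progress_callback() reports per-item progress.
class ProcessItems(Task):

    def run(self):
        items = list(range(100))  # hypothetical workload
        callback = self.create_progress_callback(len(items))
        with self.publish_step("processing items ...", runtime=True):
            for i, item in enumerate(items):
                # ... per-item work would go here ...
                callback(i)  # publishes floor'ed, deduplicated percentages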
class ARCWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = ARCWorkflowProxy

    arc_workflow_run_decorators = None
    arc_job_manager_defaults = None
    arc_job_file_factory_defaults = None

    arc_ce = CSVParameter(default=(), significant=False, description="target arc computing "
        "element(s), default: ()")

    exclude_params_branch = {"arc_ce"}

    exclude_index = True

    @abstractmethod
    def arc_output_directory(self):
        return None

    @abstractmethod
    def arc_bootstrap_file(self):
        return None

    def arc_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def arc_stageout_file(self):
        return None

    def arc_workflow_requires(self):
        return OrderedDict()

    def arc_output_postfix(self):
        self.get_branch_map()
        if self.branches:
            # branches are integers, so stringify them before joining
            return "_" + "_".join(str(b) for b in self.branches)
        else:
            return "_{}To{}".format(self.start_branch, self.end_branch)

    def arc_output_uri(self):
        return self.arc_output_directory().url()

    def arc_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.arc_job_manager_defaults, kwargs)
        return ARCJobManager(**kwargs)

    def arc_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.arc_job_file_factory_defaults, kwargs)
        return ARCJobFileFactory(**kwargs)

    def arc_job_config(self, config, job_num, branches):
        return config

    def arc_dump_intermediate_submission_data(self):
        return True

    def arc_post_submit_delay(self):
        return self.poll_interval * 60

    def arc_use_local_scheduler(self):
        return True

    def arc_cmdline_args(self):
        return []
class ARCWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = ARCWorkflowProxy

    arc_workflow_run_decorators = None
    arc_job_manager_defaults = None
    arc_job_file_factory_defaults = None

    arc_ce = CSVParameter(default=(), significant=False, description="target arc computing "
        "element(s); default: empty")

    arc_job_kwargs = []
    arc_job_kwargs_submit = ["arc_ce"]
    arc_job_kwargs_cancel = None
    arc_job_kwargs_cleanup = None
    arc_job_kwargs_query = None

    exclude_params_branch = {"arc_ce"}

    exclude_index = True

    @abstractmethod
    def arc_output_directory(self):
        return None

    @abstractmethod
    def arc_bootstrap_file(self):
        return None

    def arc_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def arc_stageout_file(self):
        return None

    def arc_workflow_requires(self):
        return DotDict()

    def arc_output_postfix(self):
        return "_" + self.get_branches_repr()

    def arc_output_uri(self):
        return self.arc_output_directory().url()

    def arc_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.arc_job_manager_defaults, kwargs)
        return ARCJobManager(**kwargs)

    def arc_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.arc_job_file_factory_defaults, kwargs)
        return ARCJobFileFactory(**kwargs)

    def arc_job_config(self, config, job_num, branches):
        return config

    def arc_use_local_scheduler(self):
        return True

    def arc_cmdline_args(self):
        return {}
class Task(BaseTask):

    log_file = luigi.Parameter(default=NO_STR, significant=False, description="a custom log "
        "file, default: <task.default_log_file>")
    print_deps = CSVParameter(default=[], significant=False, description="print task "
        "dependencies, do not run any task, the passed numbers set the recursion depth (0 means "
        "non-recursive)")
    print_status = CSVParameter(default=[], significant=False, description="print the task "
        "status, do not run any task, the passed numbers set the recursion depth (0 means "
        "non-recursive) and optionally the collection depth")
    remove_output = CSVParameter(default=[], significant=False, description="remove all "
        "outputs, do not run any task, the passed number sets the recursion depth (0 means "
        "non-recursive)")

    interactive_params = ["print_deps", "print_status", "remove_output"]

    message_cache_size = 10

    exclude_index = True
    exclude_params_req = set(interactive_params)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)
        print(msg)
        sys.stdout.flush()

        self._publish_message(*args)

    def _publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        self.set_status_message("\n".join(self._message_cache))

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self, msg, success_message="done", fail_message="failed"):
        self.publish_message(msg)
        success = False
        try:
            yield
            success = True
        finally:
            self.publish_message(success_message if success else fail_message)

    def publish_progress(self, percentage, precision=0):
        percentage = round(percentage, precision)
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage
            self.set_progress_percentage(percentage)

    def create_progress_callback(self, n_total, reach=(0, 100)):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) * (end - start))
            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1)) for i in range(len(n_total))]
            return n_total.__class__(make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def colored_repr(self, color=True):
        family = self._repr_family(self.task_family, color=color)

        parts = [
            self._repr_param(*pair, color=color)
            for pair in self._repr_params(color=color)
        ]
        parts += [
            self._repr_flag(flag, color=color)
            for flag in self._repr_flags(color=color)
        ]

        return "{}({})".format(family, ", ".join(parts))

    def _repr_params(self, color=True):
        # build key value pairs of all significant parameters
        params = self.get_params()
        param_values = self.get_param_values(params, [], self.param_kwargs)
        param_objs = dict(params)

        pairs = []
        for param_name, param_value in param_values:
            if param_objs[param_name].significant:
                pairs.append((param_name, param_objs[param_name].serialize(param_value)))

        return pairs

    def _repr_flags(self, color=True):
        return []

    @classmethod
    def _repr_family(cls, family, color=True):
        return colored(family, "green") if color else family

    @classmethod
    def _repr_param(cls, name, value, color=True):
        return "{}={}".format(colored(name, color="blue", style="bright") if color else name,
            value)

    @classmethod
    def _repr_flag(cls, name, color=True):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, *args, **kwargs):
        return print_task_deps(self, *args, **kwargs)

    def _print_status(self, *args, **kwargs):
        return print_task_status(self, *args, **kwargs)

    def _remove_output(self, *args, **kwargs):
        return remove_task_output(self, *args, **kwargs)
class Task(six.with_metaclass(Register, BaseTask)):

    log_file = luigi.Parameter(default=NO_STR, significant=False, description="a custom log "
        "file; default: <task.default_log_file>")
    print_deps = CSVParameter(default=(), significant=False, description="print task "
        "dependencies but do not run any task; this CSV parameter accepts a single integer "
        "value which sets the task recursion depth (0 means non-recursive)")
    print_status = CSVParameter(default=(), significant=False, description="print the task "
        "status but do not run any task; this CSV parameter accepts up to three values: 1. the "
        "task recursion depth (0 means non-recursive), 2. the depth of the status text of "
        "target collections (default: 0), 3. a flag that is passed to the status text creation "
        "(default: '')")
    print_output = CSVParameter(default=(), significant=False, description="print a flat list "
        "of output targets but do not run any task; this CSV parameter accepts up to two "
        "values: 1. the task recursion depth (0 means non-recursive), 2. a boolean flag that "
        "decides whether paths of file targets should contain file system schemes (default: "
        "True)")
    remove_output = CSVParameter(default=(), significant=False, description="remove task "
        "outputs but do not run any task by default; this CSV parameter accepts up to three "
        "values: 1. the task recursion depth (0 means non-recursive), 2. one of the modes 'i' "
        "(interactive), 'a' (all), 'd' (dry run) (default: 'i'), 3. a boolean flag that decides "
        "whether the task is run after outputs were removed (default: False)")
    fetch_output = CSVParameter(default=(), significant=False, description="copy all task "
        "outputs into a local directory but do not run any task; this CSV parameter accepts up "
        "to four values: 1. the task recursion depth (0 means non-recursive), 2. one of the "
        "modes 'i' (interactive), 'a' (all), 'd' (dry run) (default: 'i'), 3. the target "
        "directory (default: '.'), 4. a boolean flag that decides whether external outputs and "
        "outputs of external tasks should be fetched (default: False)")

    interactive_params = [
        "print_deps", "print_status", "print_output", "remove_output", "fetch_output",
    ]

    # cache size for published messages
    message_cache_size = 10

    # force skipping this task when remove_output is set to "all" mode
    skip_output_removal = False

    exclude_index = True
    exclude_params_req = set()
    exclude_params_repr = set()

    @classmethod
    def req_params(cls, inst, _exclude=None, _prefer_cli=None, **kwargs):
        _exclude = set() if _exclude is None else set(make_list(_exclude))

        # always exclude interactive parameters
        _exclude |= set(inst.interactive_params)

        return super(Task, cls).req_params(inst, _exclude=_exclude, _prefer_cli=_prefer_cli,
            **kwargs)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def is_root_task(self):
        return root_task() == self

    def publish_message(self, msg, scheduler=True):
        msg = str(msg)

        sys.stdout.write(msg + "\n")
        sys.stdout.flush()

        if scheduler:
            self._publish_message(msg)

    def _publish_message(self, msg):
        msg = str(msg)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        if callable(getattr(self, "set_status_message", None)):
            self.set_status_message("\n".join(self._message_cache))
        else:
            logger.warning("set_status_message not set, cannot send task message to scheduler")

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self, msg, success_message="done", fail_message="failed", runtime=True,
            scheduler=True):
        self.publish_message(msg, scheduler=scheduler)
        success = False
        t0 = time.time()
        try:
            yield
            success = True
        finally:
            msg = success_message if success else fail_message
            if runtime:
                diff = time.time() - t0
                msg = "{} (took {})".format(msg, human_duration(seconds=diff))
            self.publish_message(msg, scheduler=scheduler)

    def publish_progress(self, percentage, precision=1):
        percentage = int(round_discrete(percentage, precision, "floor"))
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage

            if callable(getattr(self, "set_progress_percentage", None)):
                self.set_progress_percentage(percentage)
            else:
                logger.warning("set_progress_percentage not set, cannot send task progress to "
                    "scheduler")

    def create_progress_callback(self, n_total, reach=(0, 100), precision=1):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) * (end - start), precision)
            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1)) for i in range(len(n_total))]
            return n_total.__class__(make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def cli_args(self, exclude=None, replace=None):
        exclude = set() if exclude is None else set(make_list(exclude))

        # always exclude interactive parameters
        exclude |= set(self.interactive_params)

        return super(Task, self).cli_args(exclude=exclude, replace=replace)

    def __repr__(self):
        color = Config.instance().get_expanded_boolean("task", "colored_repr")
        return self.repr(color=color)

    def __str__(self):
        color = Config.instance().get_expanded_boolean("task", "colored_str")
        return self.repr(color=color)

    def repr(self, all_params=False, color=None, **kwargs):
        if color is None:
            color = Config.instance().get_expanded_boolean("task", "colored_repr")

        family = self._repr_family(self.get_task_family(), color=color, **kwargs)

        parts = [
            self._repr_param(name, value, color=color, **kwargs)
            for name, value in six.iteritems(self._repr_params(all_params=all_params))
        ] + [
            self._repr_flag(flag, color=color, **kwargs)
            for flag in self._repr_flags()
        ]

        return "{}({})".format(family, ", ".join(parts))

    def _repr_params(self, all_params=False):
        # determine parameters to exclude
        exclude = set()
        if not all_params:
            exclude |= self.exclude_params_repr
            exclude |= set(self.interactive_params)

        # build a map "name -> value" for all significant parameters
        params = OrderedDict()
        for name, param in self.get_params():
            if param.significant and not multi_match(name, exclude):
                params[name] = getattr(self, name)

        return params

    def _repr_flags(self):
        return []

    def _repr_family(self, family, color=False, **kwargs):
        return colored(family, "green") if color else family

    def _repr_param(self, name, value, color=False, serialize=True, **kwargs):
        # try to serialize first unless explicitly disabled
        if serialize:
            param = getattr(self.__class__, name, no_value)
            if param != no_value:
                value = param.serialize(value)

        return "{}={}".format(colored(name, color="blue", style="bright") if color else name,
            value)

    def _repr_flag(self, name, color=False, **kwargs):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, args):
        return print_task_deps(self, *args)

    def _print_status(self, args):
        return print_task_status(self, *args)

    def _print_output(self, args):
        return print_task_output(self, *args)

    def _remove_output(self, args):
        return remove_task_output(self, *args)

    def _fetch_output(self, args):
        return fetch_task_output(self, *args)

    @classmethod
    def _law_run_inst(cls, inst, _exclude=None, _replace=None, _global=None, _run_kwargs=None):
        # get the cli arguments
        args = inst.cli_args(exclude=_exclude, replace=_replace)
        args = sum((make_list(tpl) for tpl in args.items()), [])

        # add global parameters when given
        if _global:
            args.extend([str(arg) for arg in make_list(_global)])

        # build the full command
        cmd = [cls.get_task_family()] + args

        # run it
        return law_run(cmd, **(_run_kwargs or {}))

    @classmethod
    def law_run_inst(cls, _exclude=None, _replace=None, _global=None, _run_kwargs=None,
            **kwargs):
        # create a new instance
        inst = cls(**kwargs)

        return cls._law_run_inst(inst, _exclude=_exclude, _replace=_replace, _global=_global,
            _run_kwargs=_run_kwargs)

    def law_run(self, _exclude=None, _replace=None, _global=None, _run_kwargs=None, **kwargs):
        # when kwargs are given, create a new instance
        inst = self.req(self, **kwargs) if kwargs else self

        return self._law_run_inst(inst, _exclude=_exclude, _replace=_replace, _global=_global,
            _run_kwargs=_run_kwargs)

    def localize_input(self, *args, **kwargs):
        return localize_file_targets(self.input(), *args, **kwargs)

    def localize_output(self, *args, **kwargs):
        return localize_file_targets(self.output(), *args, **kwargs)
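# Editor's note: a usage sketch for the programmatic-run helpers above, not part of the
# original source; "MyTask" and its parameter are hypothetical.
#
#   # build an instance from parameters and run it through the law cli entry point,
#   # forwarding extra global arguments
#   MyTask.law_run_inst(my_param="value", _global=["--workers", "2"])
#
#   # or re-run an existing instance with one parameter replaced
#   task.law_run(my_param="other")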
class PlotScaleFactor(PlotTask):

    hist_name = "sf"

    shifts = CSVParameter(default=["*"], description="Systematic shifts to plot. Allows "
        "globbing.")
    fix_normalization = FitScaleFactors.fix_normalization
    norm_to_nominal = luigi.BoolParameter()
    is_c_flavour = luigi.BoolParameter()
    b_taggers = CSVParameter(default=["deepcsv"])
    iterations = CSVParameter(default=[0])
    versions = CSVParameter(default=[None], description="Scale factor versions to compare. "
        "The same version is used for all required tasks.")

    def __init__(self, *args, **kwargs):
        super(PlotScaleFactor, self).__init__(*args, **kwargs)

        # identifiers used in file names
        self.shifts_identifier = "_".join(self.shifts)
        self.file_identifiers = list(self.b_taggers) + [self.shifts_identifier]
        if self.is_c_flavour:
            self.file_identifiers.append("c")
        if self.norm_to_nominal:
            self.file_identifiers.append("normed")

        if self.is_c_flavour:
            all_shifts = MeasureCScaleFactors.shifts
        else:
            all_shifts = MeasureScaleFactors.shifts
        skip_shifts = ["nominal"] + list(jes_total_shifts)
        all_shifts = [shift for shift in all_shifts if shift not in skip_shifts]

        # get matching shifts
        self.shifts = [shift for shift in all_shifts
            if law.util.multi_match(shift, self.shifts)]

        # check if multiple shifts are present and thus have to be combined (envelope)
        self.multiple_shifts = len(self.shifts) > 2

        if not self.is_c_flavour:
            # make sure the nominal histograms are processed first
            self.shifts.insert(0, "nominal")

        if len(self.shifts) != len(set(self.shifts)):
            raise Exception("Duplicate shift in {}".format(self.shifts))

    def requires(self):
        reqs = OrderedDict()
        measure_task = MeasureCScaleFactors if self.is_c_flavour else MeasureScaleFactors

        for config in itertools.product(self.b_taggers, self.iterations, self.versions):
            b_tagger, iteration, version = config
            reqs[config] = OrderedDict()
            for shift in self.shifts:
                reqs[config][shift] = {
                    "fit": FitScaleFactors.req(self, shift=shift, b_tagger=b_tagger,
                        iteration=iteration,
                        version=version if version is not None
                        else self.get_version(FitScaleFactors),
                        _prefer_cli=["version"]),
                    "hist": measure_task.req(self, shift=shift, b_tagger=b_tagger,
                        iteration=iteration,
                        version=version if version is not None
                        else self.get_version(measure_task),
                        _prefer_cli=["version"]),
                }
            if self.fix_normalization:
                reqs[config]["norm"] = MergeScaleFactorWeights.req(self,
                    normalize_cerrs=self.is_c_flavour, b_tagger=b_tagger, iteration=iteration,
                    version=version if version is not None
                    else self.get_version(MergeScaleFactorWeights),
                    _prefer_cli=["version"])

        return reqs

    def output(self):
        filename = "plots_{}.tgz".format("_".join(self.file_identifiers))
        return self.local_target(filename)

    def run(self):
        import ROOT
        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        inp = self.input()
        outp = self.output()

        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        plots = {}

        if self.norm_to_nominal and self.shifts[0] != "nominal":
            raise KeyError("'norm_to_nominal' is set to true, but no nominal values found.")

        for color_idx, (config, config_input) in enumerate(inp.items()):
            b_tagger, iteration, version = config

            config_ids = [b_tagger]
            if len(self.iterations) > 1:
                config_ids.append("iteration {}".format(iteration))
            if len(self.versions) > 1:
                config_ids.append("version {}".format(version))
            config_id = ", ".join(config_ids)

            nominal_hists = {}
            nominal_fit_hists = {}

            # combined errors for multiple shifts
            up_shifted_hists = defaultdict(dict)
            up_shifted_fit_hists = defaultdict(dict)
            down_shifted_hists = defaultdict(dict)
            down_shifted_fit_hists = defaultdict(dict)

            if self.fix_normalization:
                normalization_input = config_input.pop("norm")

            for shift_idx, (shift, inp_target) in enumerate(config_input.items()):
                # get scaling factors for normalization
                if self.fix_normalization:
                    norm_factors = normalization_input.load()[shift]

                with inp_target["fit"]["sf"].load("r") as fit_file, \
                        inp_target["hist"]["scale_factors"].load("r") as hist_file:
                    for category_key in fit_file.GetListOfKeys():
                        category_name = category_key.GetName()
                        if not self.config_inst.has_category(category_name):
                            raise KeyError("Unknown category {}".format(category_name))

                        category = self.config_inst.get_category(category_name)
                        pt_range = category.get_aux("pt")
                        eta_range = category.get_aux("eta")
                        region = category.get_aux("region")

                        # same category name for different b-taggers
                        if len(self.b_taggers) > 1:
                            plot_category = category.name.replace("__" + b_tagger, "")
                        else:
                            plot_category = category.name

                        fit_category_dir = fit_file.Get(category_name)
                        fit_hist = fit_category_dir.Get(self.hist_name)

                        hist_category_dir = hist_file.Get(category_name)
                        hist = hist_category_dir.Get(self.hist_name)

                        # truncate first bin
                        hist = self.rebin_hist(hist, region, b_tagger=b_tagger, truncate=True)

                        # normalize histogram if required
                        # fit histograms are already normalized in FitScaleFactors
                        if self.fix_normalization and not self.is_c_flavour:
                            hist.Scale(norm_factors[category_name])

                        if shift == "nominal":
                            # make sure histograms are not cleaned up when the file is closed
                            nominal_fit_hists[plot_category] = fit_hist.Clone()
                            nominal_fit_hists[plot_category].SetDirectory(0)

                            nominal_hists[plot_category] = hist.Clone()
                            nominal_hists[plot_category].SetDirectory(0)

                        # for c-jets, there is no nominal histogram;
                        # instead, all nominal values are set to 1
                        if shift_idx == 0 and self.is_c_flavour:
                            nominal_fit_hist = fit_hist.Clone()
                            for bin_idx in range(1, nominal_fit_hist.GetNbinsX() + 1):
                                nominal_fit_hist.SetBinContent(bin_idx, 1.0)

                            nominal_fit_hist.SetDirectory(0)
                            nominal_fit_hists[plot_category] = nominal_fit_hist

                        if shift != "nominal" and self.multiple_shifts:
                            # collect all shifted fit histograms to build envelope later
                            sys, direction = shift.rsplit("_", 1)
                            if direction == "up":
                                up_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                up_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                up_shifted_hists[plot_category][sys] = hist.Clone()
                                up_shifted_hists[plot_category][sys].SetDirectory(0)
                            elif direction == "down":
                                down_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                down_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                down_shifted_hists[plot_category][sys] = hist.Clone()
                                down_shifted_hists[plot_category][sys].SetDirectory(0)
                            else:
                                raise ValueError("Unknown direction {}".format(direction))

                        if self.norm_to_nominal:
                            fit_hist.Divide(nominal_fit_hists[plot_category])

                        # get same category key for all b-taggers
                        if plot_category in plots:
                            plot = plots[plot_category]
                        else:
                            plot = ROOTPlot(category.name, category.name)
                            plot.create_pads()
                            plots[plot_category] = plot

                        plot.cd(0, 0)
                        fit_hist.GetXaxis().SetRangeUser(-.1, 1.0)
                        y_min = 0.6 if self.norm_to_nominal else 0.
                        y_max = 1.4 if self.norm_to_nominal else 2.
                        fit_hist.GetYaxis().SetRangeUser(y_min, y_max)

                        if len(self.b_taggers) == 1:
                            title = self.config_inst.get_aux("btaggers")[b_tagger]["label"] \
                                + " discriminator"
                        else:
                            title = "B-Tag Discriminant"
                        fit_hist.GetXaxis().SetTitle(title)
                        fit_hist.GetYaxis().SetTitle("SF")

                        if shift_idx == 0:
                            if not self.multiple_shifts or shift == "nominal":
                                # only draw this fit histogram if it is not part of a shifted
                                # envelope
                                plot.draw({"sf": fit_hist}, line_color=1, add_to_legend=False)
                            line = ROOT.TLine(0., 0., 0., 2.)
                            line.SetLineStyle(9)
                            plot.draw({"line": line}, add_same_option=False, line_color=1,
                                add_to_legend=False)

                            # add category information to plot
                            if not np.isinf(pt_range[1]):
                                text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], pt_range[1], eta_range[0], eta_range[1])
                            else:
                                text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], eta_range[0], eta_range[1])
                            plot.draw_text(text)
                        elif not self.multiple_shifts:
                            plot.draw({shift: fit_hist}, line_color=None)

                        if shift == "nominal" and not self.norm_to_nominal:
                            plot.draw({config_id + ", nominal": hist}, line_color=1,
                                add_to_legend=(len(self.shifts) != 1))

            if self.multiple_shifts:
                for plot_category in plots:
                    plot = plots[plot_category]
                    plot.cd(0, 0)

                    # build shifted histograms
                    fit_hist_down, fit_hist_up = build_hist_envelope(
                        nominal_fit_hists[plot_category],
                        up_shifted_fit_hists[plot_category],
                        down_shifted_fit_hists[plot_category],
                        envelope_as_errors=False)

                    hist_down, hist_up = build_hist_envelope(
                        nominal_hists[plot_category],
                        up_shifted_hists[plot_category],
                        down_shifted_hists[plot_category],
                        envelope_as_errors=False)

                    if self.norm_to_nominal:
                        fit_hist_up.Divide(nominal_fit_hists[plot_category])
                        fit_hist_down.Divide(nominal_fit_hists[plot_category])
                        hist_up.Divide(nominal_hists[plot_category])
                        hist_down.Divide(nominal_hists[plot_category])

                    plot.draw({config_id + ", up": fit_hist_up}, line_color=None)
                    plot.draw({config_id + ", down": fit_hist_down}, line_color=None)
                    plot.draw({config_id + ", up": hist_up}, line_color=2, options=["hist"])
                    plot.draw({config_id + ", down": hist_down}, line_color=4, options=["hist"])

        # save plots
        for plot_category in plots:
            plot = plots[plot_category]
            plot_name = self.get_plot_name(plot_category, self.shifts_identifier,
                self.b_taggers[0], self.iterations[0])
            plot.save(os.path.join(local_tmp.path, plot_name), draw_legend=True,
                lumi=self.config_inst.get_aux("lumi").values()[0] / 1000.)
            del plot

        with outp.localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
class PlotVariable(PlotTask): b_tagger = MergeHistograms.b_tagger iteration = MergeHistograms.iteration final_it = MergeHistograms.final_it category_tag = luigi.Parameter(default="merged") variable = CSVParameter(default=["jet{i_probe_jet}_{b_tag_var}_{region}_{shift}"], description="Variable to plot, or multiple variables that are filled into one histogram. " "{} accesses auxiliary information.") mc_split = luigi.ChoiceParameter(choices=["process", "flavor"], default="process") normalize = luigi.BoolParameter(description="Normalize MC histogram to data histogram") truncate = luigi.BoolParameter(description="Truncate the bin below zero, to be used " "for b-tag variable plots.") rebin = luigi.BoolParameter(description="Rebin variable to 'measurement' binning, only " "for b-tag variable plots. Not usable with category-optimized binning.") x_title = luigi.Parameter(default="", description="Title for the plot x-axis.") logarithmic = luigi.BoolParameter(description="Plot y axis with logarithmic scale.") draw_stacked = luigi.BoolParameter(description="Plot MC processes separated by *mc_split*, " "combined in a stack.") draw_systematics = luigi.BoolParameter(description="Draw envelope of systematic uncertainties.") mc_key = "mc" data_key = "data" def __init__(self, *args, **kwargs): super(PlotVariable, self).__init__(*args, **kwargs) if self.draw_systematics: self.shifts = [shift for shift in MeasureScaleFactors.shifts if not shift in jes_total_shifts] if self.final_it: self.shifts += MeasureCScaleFactors.shifts else: self.shifts = ["nominal"] def requires(self): reqs = {} reqs["hists"] = MergeHistograms.req(self, branch=0, version=self.get_version(MergeHistograms), _prefer_cli=["version"]) if self.normalize: reqs["scale"] = MeasureScaleFactors.req(self, iteration=0, version=self.get_version(MeasureScaleFactors), _prefer_cli=["version"]) return reqs def associate_hist(self, process=None, flavor=None, region=None): # associate hist either to data or monte carlo # returns *add_to_data*, *sign* (1. or -1.) if process.is_data: return True, 1. else: return False, 1. 
def run(self): def add_hist(hist, new_hist, sign=1.): if hist is None: hist = new_hist.Clone() hist.Scale(sign) else: hist.Add(new_hist, sign) return hist import ROOT ROOT.PyConfig.IgnoreCommandLineOptions = True ROOT.gROOT.SetBatch() inp = self.input() outp = self.output() if self.normalize: scales = inp["scale"]["channel_scales"].load() local_tmp = LocalDirectoryTarget(is_tmp=True) local_tmp.touch() categories = [] for category, _, _ in self.config_inst.walk_categories(): if category.has_tag((self.category_tag, self.b_tagger), mode=all): categories.append(category) # create plot objects plot_dict = {} for category in categories: plot = ROOTPlot(category.name, category.name) plot.create_pads(n_pads_y=2, limits_y=[0., 0.3, 1.0], legend_loc="upper") plot_dict[category] = plot with inp["hists"].load("r") as input_file: for category in categories: data_hist = None mc_hists = defaultdict(lambda: defaultdict(lambda: None)) # shift -> key (process/flavor) for leaf_cat, _, children in category.walk_categories(): # we are only interested in leaves if children: continue flavor = leaf_cat.get_aux("flavor", None) channel = leaf_cat.get_aux("channel") region = leaf_cat.get_aux("region", None) category_dir = input_file.GetDirectory(leaf_cat.name) for process_key in category_dir.GetListOfKeys(): process = self.config_inst.get_process(process_key.GetName()) process_dir = category_dir.GetDirectory(process.name) # avoid double counting of inclusive and flavor-dependent histograms if flavor is not None: # Not needed in case region isn't flavor specific if process.is_data and flavor != "inclusive": continue elif process.is_mc and flavor == "inclusive": continue for shift in self.shifts: if process.is_data and shift != "nominal": continue for variable in self.variable: # create variable name from template aux = leaf_cat.aux.copy() aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["variable"] aux["b_tagger"] = self.b_tagger aux["shift"] = shift variable = variable.format(**aux) hist = process_dir.Get(variable) binning_type = "measurement" if self.rebin else None hist = self.rebin_hist(hist, region, binning_type=binning_type, truncate=self.truncate) add_to_data, sign = self.associate_hist(process=process, flavor=flavor, region=region) if add_to_data: if shift != "nominal": raise Exception("Cannot add shifted samples to data.") data_hist = add_hist(data_hist, hist, sign=sign) else: if self.normalize and region is not None: # apply "trigger" sfs as part of the normalization hist.Scale(scales[channel.name][region]) key = process if self.mc_split == "process" else flavor mc_hists[shift][key] = add_hist(mc_hists[shift][key], hist, sign=sign) if self.normalize: # normalize mc yield to data in this category mc_yield = sum(hist.Integral() for hist in mc_hists["nominal"].values()) data_yield = data_hist.Integral() norm_factor = data_yield / mc_yield for shift in self.shifts: for mc_hist in mc_hists[shift].values(): mc_hist.Scale(norm_factor) # get maximum value of hists/ stacks drawn to set axis ranges mc_hist_sum = mc_hists["nominal"].values()[0].Clone() for mc_hist in mc_hists["nominal"].values()[1:]: mc_hist_sum.Add(mc_hist) hist_maximum = max([mc_hist_sum.GetMaximum(), data_hist.GetMaximum()]) # get plot names mc_key = self.mc_key.format(**{"region": category.get_aux("region", None)}) data_key = self.data_key.format(**{"region": category.get_aux("region", None)}) plot = plot_dict[category] # data and mc histograms plot.cd(0, 1) if self.draw_stacked: plot.draw(mc_hists["nominal"], stacked=True, 
stack_maximum=1.5*hist_maximum, y_title="Entries") else: # fix axis range invis_hist = mc_hist_sum.Clone() if mc_hist_sum.GetMaximum() > data_hist.GetMaximum() else data_hist.Clone() invis_hist.Scale(1.5) plot.draw({"invis": invis_hist}, invis=True) plot.draw({mc_key: mc_hist_sum}, line_color=None) plot.draw({data_key: data_hist}) if self.draw_systematics: up_shifted_mc_hists = {} down_shifted_mc_hists = {} for shift in self.shifts: # combine processes/ flavors shifted_mc_hist_sum = mc_hists[shift].values()[0].Clone() for mc_hist in mc_hists[shift].values()[1:]: shifted_mc_hist_sum.Add(mc_hist) if shift.endswith("_down"): down_shifted_mc_hists[shift[:-5]] = shifted_mc_hist_sum.Clone() elif shift.endswith("_up"): up_shifted_mc_hists[shift[:-3]] = shifted_mc_hist_sum.Clone() envelope = build_hist_envelope(mc_hist_sum, up_shifted_mc_hists, down_shifted_mc_hists, envelope_as_errors=True) plot.draw_as_graph(envelope, options="2", hatched=True) # add category information to plot pt_range = category.get_aux("pt", None) eta_range = category.get_aux("eta", None) if pt_range is not None and eta_range is not None: if not np.isinf(pt_range[1]): text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \ (pt_range[0], pt_range[1], eta_range[0], eta_range[1]) else: text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \ (pt_range[0], eta_range[0], eta_range[1]) plot.draw_text(text, size=0.05, xpos=0.505, ypos=0.5, align=11) # ratio of data to mc below the main plot plot.cd(0, 0) # ratio histograms # mc error band ratio_mcerr_hist = mc_hist_sum.Clone() # divide without error propagation self.divide_hists(ratio_mcerr_hist, mc_hist_sum) # ratio ratio_hist = data_hist.Clone() self.divide_hists(ratio_hist, mc_hist_sum) y_axis = ratio_hist.GetYaxis() y_axis.SetRangeUser(0.5, 1.5) y_axis.SetTitle("data/MC") y_axis.SetTitleSize(y_axis.GetTitleSize() * plot.open_pad.scale_factor) y_axis.SetLabelSize(y_axis.GetLabelSize() * plot.open_pad.scale_factor) y_axis.SetNdivisions(505) y_axis.SetTitleOffset(0.65) x_axis = ratio_hist.GetXaxis() if self.x_title: aux = category.aux.copy() aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["label"] x_axis.SetTitle(self.x_title.format(**aux)) x_axis.SetTitleSize(x_axis.GetTitleSize() * plot.open_pad.scale_factor) x_axis.SetLabelSize(x_axis.GetLabelSize() * plot.open_pad.scale_factor) plot.draw({"invis": ratio_hist}, invis=True) plot.draw_as_graph(ratio_mcerr_hist, options="2") plot.draw({"data/mc": ratio_hist}) if self.draw_systematics: # build envelope of ratio to nominal hist for hist in up_shifted_mc_hists.values(): hist.Divide(mc_hist_sum) for hist in down_shifted_mc_hists.values(): hist.Divide(mc_hist_sum) scaled_envelope = build_hist_envelope(ratio_mcerr_hist, up_shifted_mc_hists, down_shifted_mc_hists, envelope_as_errors=True) plot.draw_as_graph(scaled_envelope, options="2", hatched=True) for category, plot in plot_dict.items(): plot_name = self.get_plot_name(category.name, self.variable, self.b_tagger, self.iteration) plot.save(os.path.join(local_tmp.path, plot_name), draw_legend=(False, True), log_y=self.logarithmic, lumi=self.config_inst.get_aux("lumi").values()[0]/1000.) del plot with outp.localize("w") as tmp: with tarfile.open(tmp.path, "w:gz") as tar: for plot_file in os.listdir(local_tmp.path): tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
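# Example invocation of the task above (hypothetical values; parameter names are
# taken from the task definition, assuming law's usual dash-separated CLI form):
#
#   law run PlotVariable --b-tagger deepcsv --iteration 0 \
#       --category-tag merged --mc-split flavor --draw-stacked --normalize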
class WriteHistograms(DatasetTask, AnalysisSandboxTask, GridWorkflow, law.LocalWorkflow): iteration = luigi.IntParameter(default=0, description="iteration of the scale factor " "calculation, starting at zero, default: 0") final_it = luigi.BoolParameter( description="Flag for the final iteration of the scale factor " "calculation.") variable_tags = CSVParameter( default=[], description="Only consider variables with one or more of " "the given tags. Use all if empty.") category_tags = CSVParameter( default=[], description="Only consider categories whose top-level " "category has one or more of the given tags. Use all if empty.") used_shifts = CSVParameter( default=[] ) # needs to be named differently from the wrapper task parameter binning = CSVParameter( default=[], cls=luigi.FloatParameter, description="Overwrite default binning " "of variables. If exactly three values are provided, they are interpreted as a tuple of (n_bins, min, max)." ) b_tagger = luigi.Parameter(default="deepcsv", description="Name of the b-tagger to use.") optimize_binning = luigi.BoolParameter( description="Use optimized discriminant binning.") file_merging = "trees" workflow_run_decorators = [law.decorator.notify] sandbox = "singularity::/cvmfs/singularity.opensciencegrid.org/cmssw/cms:rhel7-m20200612" req_sandbox = "slc7" def __init__(self, *args, **kwargs): super(WriteHistograms, self).__init__(*args, **kwargs) # set shifts if self.dataset_inst.is_data: shifts = {"nominal"} else: jes_sources = self.config_inst.get_aux("jes_sources_{}".format( self.config_inst.get_aux("jes_scheme"))) shifts = {"nominal"} | format_shifts(jes_sources, prefix="jes") if self.iteration > 0: shifts = shifts | format_shifts([ "lf", "hf", "lf_stats1", "lf_stats2", "hf_stats1", "hf_stats2" ]) if self.final_it: # add c shifts shifts = shifts | format_shifts(["c_stats1", "c_stats2"]) if len(self.used_shifts) == 0: self.shifts = shifts elif any([shift not in shifts for shift in self.used_shifts]): raise ValueError("Unknown shift in {}".format(self.used_shifts)) else: self.shifts = self.used_shifts def workflow_requires(self): from analysis.tasks.measurement import BundleScaleFactors reqs = super(WriteHistograms, self).workflow_requires() if not self.cancel_jobs and not self.cleanup_jobs: reqs["meta"] = MergeMetaData.req( self, version=self.get_version(MergeMetaData), _prefer_cli=["version"]) if self.dataset_inst.is_mc: reqs["pu"] = CalculatePileupWeights.req(self) if not self.pilot: reqs["tree"] = MergeTrees.req( self, cascade_tree=-1, version=self.get_version(MergeTrees), _prefer_cli=["version"]) if self.iteration > 0: reqs["sf"] = BundleScaleFactors.req( self, iteration=self.iteration - 1, fix_normalization=self.final_it, include_cshifts=self.final_it, version=self.get_version(BundleScaleFactors), _prefer_cli=["version"]) if self.optimize_binning: from analysis.tasks.util import OptimizeBinning # prevent circular import reqs["binning"] = OptimizeBinning.req( self, version=self.get_version(OptimizeBinning), _prefer_cli=["version"]) return reqs def requires(self): from analysis.tasks.measurement import BundleScaleFactors reqs = { "tree": MergeTrees.req(self, cascade_tree=self.branch, branch=0, version=self.get_version(MergeTrees), _prefer_cli=["version", "workflow"]), "meta": MergeMetaData.req(self, version=self.get_version(MergeMetaData), _prefer_cli=["version"]), } if self.dataset_inst.is_mc: reqs["pu"] = CalculatePileupWeights.req(self) if self.iteration > 0: reqs["sf"] = BundleScaleFactors.req( self, iteration=self.iteration - 1, 
fix_normalization=self.final_it, include_cshifts=self.final_it, version=self.get_version(BundleScaleFactors), _prefer_cli=["version"]) if self.optimize_binning: from analysis.tasks.util import OptimizeBinning # prevent circular import reqs["binning"] = OptimizeBinning.req( self, version=self.get_version(OptimizeBinning), _prefer_cli=["version"]) return reqs def store_parts(self): binning_part = "optimized" if self.optimize_binning else "default" variable_part = "_".join( self.variable_tags) if self.variable_tags else "all" shift_part = "_".join(self.used_shifts) if self.used_shifts else "all" return super(WriteHistograms, self).store_parts() + (self.b_tagger,) + (self.iteration,) \ + (variable_part,) + (shift_part,) + (binning_part,) def output(self): return self.wlcg_target("hists_{}.root".format(self.branch)) def get_jec_identifier(self, shift): if shift.startswith("jes"): return "_" + shift else: return "" def get_pileup_weighter(self, inp): with inp.load() as pu_file: pu_hist = pu_file.Get("pileup_weights") pu_values = [ pu_hist.GetBinContent(i) for i in range(1, pu_hist.GetNbinsX() + 1) ] pu_values = [ value if (value < 1000) else 1. for value in pu_values ] # TODO: Temporary, due to high pu weights in 2018 data def add_branch(extender): extender.add_branch("pu_weight", unpack="pu") def add_value(entry): # some events have inf pileup, skip them weight = 1. pu = entry.pu[0] if np.isfinite(pu): pu_idx = int(pu) - 1 if 0 <= pu_idx < len(pu_values): weight = pu_values[pu_idx] entry.pu_weight[0] = weight return add_branch, add_value def get_scale_factor_weighter(self, inp, shift, nominal_sfs=None): sf_hists = {} input_files = [inp] # c scale factor files have no histograms for hf/lf, so use nominal ones if nominal_sfs is not None: input_files.append(nominal_sfs) for input_file in input_files: with input_file.load() as sfs: shift_dir = sfs.Get(shift) for category in shift_dir.GetListOfKeys(): category_dir = shift_dir.Get(category.GetName()) hist = category_dir.Get("sf") # decouple from open file hist.SetDirectory(0) if category.GetName() not in sf_hists: sf_hists[category.GetName()] = hist else: raise KeyError("Duplicate category {} in scale factor " "weighter.".format(category.GetName())) btag_var = self.config_inst.get_aux("btaggers")[ self.b_tagger]["variable"] identifier = self.get_jec_identifier(shift) def add_branch(extender): unpack_vars = sum([[ "jet{}_pt{}".format(idx, identifier), "jet{}_flavor{}".format( idx, identifier), "jet{}_eta{}".format(idx, identifier), "jet{}_{}{}".format(idx, btag_var, identifier) ] for idx in range(1, 5)], []) extender.add_branch("scale_factor_lf_{}".format(shift), unpack=unpack_vars) extender.add_branch("scale_factor_c_{}".format(shift), unpack=unpack_vars) extender.add_branch("scale_factor_hf_{}".format(shift), unpack=unpack_vars) def add_value(entry): scale_factor_lf = 1. scale_factor_c = 1. scale_factor_hf = 1. 
for jet_idx in range(1, 5): jet_pt = getattr(entry, "jet{}_pt{}".format(jet_idx, identifier))[0] jet_eta = getattr(entry, "jet{}_eta{}".format(jet_idx, identifier))[0] jet_flavor = getattr( entry, "jet{}_flavor{}".format(jet_idx, identifier))[0] jet_btag = getattr( entry, "jet{}_{}{}".format(jet_idx, btag_var, identifier))[0] # stop when number of jets is exceeded if jet_flavor < -999.: break # find category in which the scale factor of the jet was computed to get correct histogram if abs(jet_flavor) == 5: region = "hf" elif abs(jet_flavor) == 4: region = "c" else: region = "lf" # nominal c scale factors are 1 if region == "c" and not shift.startswith("c_stat"): continue category = self.category_getter.get_category( jet_pt, abs(jet_eta), region) # get scale factor sf_hist = sf_hists[category.name] bin_idx = sf_hist.FindBin(jet_btag) scale_factor = sf_hist.GetBinContent(bin_idx) scale_factor = max([0., scale_factor]) if abs(jet_flavor) == 5: scale_factor_hf *= scale_factor elif abs(jet_flavor) == 4: scale_factor_c *= scale_factor else: scale_factor_lf *= scale_factor getattr(entry, "scale_factor_lf_{}".format(shift))[0] = scale_factor_lf getattr(entry, "scale_factor_c_{}".format(shift))[0] = scale_factor_c getattr(entry, "scale_factor_hf_{}".format(shift))[0] = scale_factor_hf return add_branch, add_value @law.decorator.notify def run(self): import ROOT inp = self.input() outp = self.output() outp.parent.touch(0o0770) self.category_getter = CategoryGetter(self.config_inst, self.b_tagger) # get child categories categories = [] for category in self.config_inst.categories: # only consider top-level categories with at least one given tag if specified if len(self.category_tags) > 0 and not category.has_tag( self.category_tags, mode=any): continue # for intermediate iterations, skip merged categories not used for measurement # (to improve performance) if not self.final_it: if category.has_tag("merged") and not category.get_aux( "phase_space") == "measure": continue # recurse through all children of category, add leaf categories for cat, children in walk_categories(category): if not children: # only use categories matching the task config if cat.get_aux("config", None) != self.config_inst.name: continue # only use categories for the chosen b-tag algorithm if cat.has_tag(self.b_tagger): channel = cat.get_aux("channel") categories.append((channel, cat)) categories = list(set(categories)) # get processes if len(self.dataset_inst.processes) != 1: raise NotImplementedError( "only datasets with exactly one linked process can be" " handled, got {}".format(len(self.dataset_inst.processes))) processes = list(self.dataset_inst.processes.values()) # build a progress callback progress = self.create_progress_callback(len(categories)) # open the output file with outp.localize("w") as tmp: with tmp.dump("RECREATE") as output_file: with self.publish_step( "creating root output file directories ..."): process_dirs = {} for _, category in categories: output_file.cd() category_dir = output_file.mkdir(category.name) for process in processes: category_dir.cd() process_dir = category_dir.mkdir(process.name) process_dir.Write() process_dirs[(category.name, process.name)] = process_dir # open the input file and get the tree # as we need to extend the tree with custom weights, we do not cache the file with inp["tree"].load("UPDATE", cache=False) as input_file: tree = input_file.Get("tree") self.publish_message("{} events in tree".format( tree.GetEntries())) # identifier for jec shifted variables for shift in self.shifts: 
                        jec_identifier = self.get_jec_identifier(shift)

                        # pt aliases for jets
                        for obj in ["jet1", "jet2", "jet3", "jet4"]:
                            tree.SetAlias("{0}_pt{1}".format(obj, jec_identifier),
                                "({0}_px{1}**2 + {0}_py{1}**2)**0.5".format(obj, jec_identifier))

                        # b-tagging alias
                        btag_var = self.config_inst.get_aux("btaggers")[self.b_tagger]["variable"]
                        for obj in ["jet1", "jet2", "jet3", "jet4"]:
                            variable = self.config_inst.get_variable("{0}_{1}".format(obj, btag_var))
                            tree.SetAlias(variable.name + jec_identifier,
                                variable.expression.format(**{"jec_identifier": jec_identifier}))

                    # pt aliases for leptons
                    for obj in ["lep1", "lep2"]:
                        tree.SetAlias("{0}_pt".format(obj),
                            "({0}_px**2 + {0}_py**2)**0.5".format(obj))

                    # extend the tree
                    if self.dataset_inst.is_mc:
                        with self.publish_step("extending the input tree with weights ..."):
                            weighters = []

                            # pileup weight
                            weighters.append(self.get_pileup_weighter(inp["pu"]))

                            # weights from previous iterations
                            if self.iteration > 0:
                                # b-tagging scale factors
                                for shift in self.shifts:
                                    nominal_sfs = inp["sf"]["nominal"]["sf"] \
                                        if shift.startswith("c_stat") else None
                                    weighters.append(self.get_scale_factor_weighter(
                                        inp["sf"], shift, nominal_sfs=nominal_sfs))

                            input_file.cd()
                            with TreeExtender(tree) as te:
                                for add_branch, _ in weighters:
                                    add_branch(te)
                                for i, entry in enumerate(te):
                                    if (i % 1000) == 0:
                                        print("event {}".format(i))
                                    for _, add_value in weighters:
                                        add_value(entry)

                    # read in the total number of events
                    sum_weights = inp["meta"].load()["event_weights"]["sum"]

                    # get category-dependent binnings if optimized binning is used,
                    # only for b-tagging discriminants
                    if self.optimize_binning:
                        category_binnings = inp["binning"].load()

                    for i, (channel, category) in enumerate(categories):
                        self.publish_message("writing histograms in category {} ({}/{})".format(
                            category.name, i + 1, len(categories)))

                        # get the region (HF / LF); not all child categories have a region
                        # associated, e.g. the phase-space inclusive ones ("measure", "closure")
                        region = category.get_aux("region", None)

                        # set weights that are common for all shifts
                        base_weights = []
                        if self.dataset_inst.is_mc:
                            base_weights.append("gen_weight")

                            # lumi weight; the dataset is linked to exactly one process (checked above)
                            lumi = self.config_inst.get_aux("lumi")[channel]
                            x_sec = processes[0].get_xsec(self.config_inst.campaign.ecm).nominal
                            lumi_weight = lumi * x_sec / sum_weights
                            base_weights.append(str(lumi_weight))

                            # pu weight
                            base_weights.append("pu_weight")

                        for process in processes:
                            # change into the correct directory
                            process_dirs[(category.name, process.name)].cd()
                            for shift in self.shifts:
                                jec_identifier = self.get_jec_identifier(shift)

                                # weights
                                weights = base_weights[:]
                                if self.dataset_inst.is_mc and self.iteration > 0:
                                    # b-tag scale factor weights
                                    phase_space = category.get_aux("phase_space", None)
                                    # in measurement categories, apply scale factors
                                    # only for the contamination
                                    if phase_space == "measure" and not self.final_it:
                                        weights.append("scale_factor_c_{}".format(shift))
                                        if region == "hf":
                                            weights.append("scale_factor_lf_{}".format(shift))
                                        elif region == "lf":
                                            weights.append("scale_factor_hf_{}".format(shift))
                                        elif region == "cont":
                                            weights.append("scale_factor_lf_{}".format(shift))
                                            weights.append("scale_factor_hf_{}".format(shift))
                                        else:
                                            raise ValueError("Unexpected region {}".format(region))
                                    else:
                                        weights.append("scale_factor_lf_{}".format(shift))
                                        weights.append("scale_factor_c_{}".format(shift))
                                        weights.append("scale_factor_hf_{}".format(shift))

                                # totalWeight alias
                                while len(weights) < 2:
                                    weights.insert(0, "1")
                                tree.SetAlias("totalWeight", join_root_selection(weights, op="*"))

                                # actual projecting
                                for variable in self.config_inst.variables:
                                    # save the variable binning to reset it at the end of the loop
                                    base_variable_binning = variable.binning

                                    if variable.has_tag("skip_all"):
                                        continue
                                    if region and variable.has_tag("skip_{}".format(region)):
                                        continue
                                    # if variable tags are given, require at least one of them
                                    if len(self.variable_tags) > 0 and not variable.has_tag(
                                            self.variable_tags, mode=any):
                                        continue
                                    # do not write one b-tag discriminant in the category of another
                                    if variable.get_aux("b_tagger", self.b_tagger) != self.b_tagger:
                                        continue

                                    # if a binning is specified, overwrite the variable binning
                                    if self.binning:
                                        self.binning = list(self.binning)
                                        # if a tuple of (n_bins, x_min, x_max) is given,
                                        # ensure that n_bins is an integer
                                        if len(self.binning) == 3:
                                            self.binning[0] = int(self.binning[0])
                                        self.binning = tuple(self.binning)
                                        variable.binning = self.binning

                                    # use optimized binning for b-tag discriminants if provided
                                    if self.optimize_binning and variable.get_aux(
                                            "can_optimize_bins", False):
                                        binning_category = category.get_aux("binning_category", category)
                                        # overwrite the binning if a specialized binning is
                                        # defined for this category
                                        variable.binning = category_binnings.get(
                                            binning_category.name, variable.binning)

                                    hist = ROOT.TH1F("{}_{}".format(variable.name, shift),
                                        variable.full_title(root=True),
                                        variable.n_bins, array.array("d", variable.bin_edges))
                                    hist.Sumw2()

                                    # build the full selection string, including the total event weight
                                    selection = [
                                        category.selection,
                                        "jetmet_pass{jec_identifier} == 1",
                                        "{} != -10000".format(variable.expression),
                                    ]
                                    if variable.selection:
                                        selection.append(variable.selection)
                                    selection = join_root_selection(selection).format(
                                        **{"jec_identifier": jec_identifier})
                                    selection = join_root_selection(selection, "totalWeight", op="*")

                                    # project and write the histogram
                                    tree.Project("{}_{}".format(variable.name, shift),
                                        variable.expression.format(
                                            **{"jec_identifier": jec_identifier}),
                                        selection)
                                    hist.Write()

                                    variable.binning = base_variable_binning

                        progress(i)
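# For reference, the projection above leans on join_root_selection to combine cut
# and weight expressions into a single string for TTree::Project. A minimal
# stand-in with the assumed semantics (the real helper is defined elsewhere in
# this repository) could look like:
def _join_root_selection_sketch(*parts, **kwargs):
    # flatten string and list/tuple arguments into one list of expressions
    op = kwargs.get("op", "&&")
    exprs = []
    for part in parts:
        if isinstance(part, (list, tuple)):
            exprs.extend(part)
        else:
            exprs.append(part)
    # wrap each expression in parentheses and join them with the operator,
    # e.g. (["a", "b"], op="*") -> "(a) * (b)"
    return (" " + op + " ").join("({})".format(expr) for expr in exprs)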