Пример #1
0
    def add(self, task: DAGNode) -> None:
        """Add a task to this TaskGroup.

        :meta private:
        """
        from airflow.models.abstractoperator import AbstractOperator

        existing_tg = task.task_group
        if isinstance(task, AbstractOperator
                      ) and existing_tg is not None and existing_tg != self:
            raise TaskAlreadyInTaskGroup(task.node_id, existing_tg.node_id,
                                         self.node_id)

        # Set the TG first, as setting it might change the return value of node_id!
        task.task_group = weakref.proxy(self)
        key = task.node_id

        if key in self.children:
            node_type = "Task" if hasattr(task, 'task_id') else "Task Group"
            raise DuplicateTaskIdFound(
                f"{node_type} id '{key}' has already been added to the DAG")

        if isinstance(task, TaskGroup):
            if self.dag:
                if task.dag is not None and self.dag is not task.dag:
                    raise RuntimeError(
                        "Cannot mix TaskGroups from different DAGs: %s and %s",
                        self.dag, task.dag)
                task.dag = self.dag
            if task.children:
                raise AirflowException("Cannot add a non-empty TaskGroup")

        self.children[key] = task
Пример #2
0
    def __init__(
        self,
        group_id: Optional[str],
        prefix_group_id: bool = True,
        parent_group: Optional["TaskGroup"] = None,
        dag: Optional["DAG"] = None,
        tooltip: str = "",
        ui_color: str = "CornflowerBlue",
        ui_fgcolor: str = "#000",
    ):
        from airflow.models.dag import DagContext

        self.prefix_group_id = prefix_group_id

        if group_id is None:
            # This creates a root TaskGroup.
            if parent_group:
                raise AirflowException("Root TaskGroup cannot have parent_group")
            # used_group_ids is shared across all TaskGroups in the same DAG to keep track
            # of used group_id to avoid duplication.
            self.used_group_ids: Set[Optional[str]] = set()
            self._parent_group = None
        else:
            if not isinstance(group_id, str):
                raise ValueError("group_id must be str")
            if not group_id:
                raise ValueError("group_id must not be empty")

            dag = dag or DagContext.get_current_dag()

            if not parent_group and not dag:
                raise AirflowException("TaskGroup can only be used inside a dag")

            self._parent_group = parent_group or TaskGroupContext.get_current_task_group(dag)
            if not self._parent_group:
                raise AirflowException("TaskGroup must have a parent_group except for the root TaskGroup")
            self.used_group_ids = self._parent_group.used_group_ids

        self._group_id = group_id
        if self.group_id in self.used_group_ids:
            raise DuplicateTaskIdFound(f"group_id '{self.group_id}' has already been added to the DAG")
        self.used_group_ids.add(self.group_id)
        self.used_group_ids.add(self.downstream_join_id)
        self.used_group_ids.add(self.upstream_join_id)
        self.children: Dict[str, Union["BaseOperator", "TaskGroup"]] = {}
        if self._parent_group:
            self._parent_group.add(self)

        self.tooltip = tooltip
        self.ui_color = ui_color
        self.ui_fgcolor = ui_fgcolor

        # Keep track of TaskGroups or tasks that depend on this entire TaskGroup separately
        # so that we can optimize the number of edges when entire TaskGroups depend on each other.
        self.upstream_group_ids: Set[Optional[str]] = set()
        self.downstream_group_ids: Set[Optional[str]] = set()
        self.upstream_task_ids: Set[Optional[str]] = set()
        self.downstream_task_ids: Set[Optional[str]] = set()
Пример #3
0
    def add(self, task: Union["BaseOperator", "TaskGroup"]) -> None:
        """Add a task to this TaskGroup."""
        key = task.group_id if isinstance(task, TaskGroup) else task.task_id

        if key in self.children:
            raise DuplicateTaskIdFound(f"Task id '{key}' has already been added to the DAG")

        if isinstance(task, TaskGroup):
            if task.children:
                raise AirflowException("Cannot add a non-empty TaskGroup")

        self.children[key] = task  # type: ignore
Пример #4
0
 def _check_for_group_id_collisions(self, add_suffix_on_collision: bool):
     if self._group_id is None:
         return
     # if given group_id already used assign suffix by incrementing largest used suffix integer
     # Example : task_group ==> task_group__1 -> task_group__2 -> task_group__3
     if self._group_id in self.used_group_ids:
         if not add_suffix_on_collision:
             raise DuplicateTaskIdFound(f"group_id '{self._group_id}' has already been added to the DAG")
         base = re.split(r'__\d+$', self._group_id)[0]
         suffixes = sorted(
             int(re.split(r'^.+__', used_group_id)[1])
             for used_group_id in self.used_group_ids
             if used_group_id is not None and re.match(rf'^{base}__\d+$', used_group_id)
         )
         if not suffixes:
             self._group_id += '__1'
         else:
             self._group_id = f'{base}__{suffixes[-1] + 1}'
Пример #5
0
    def dag(self, dag):
        """
        Operators can be assigned to one DAG, one time. Repeat assignments to
        that same DAG are ok.
        """
        if not isinstance(dag, DAG):
            raise TypeError('Expected DAG; received {}'.format(
                dag.__class__.__name__))
        elif self.has_dag() and self.dag is not dag:
            raise AirflowException(
                "The DAG assigned to {} can not be changed.".format(self))
        elif self.task_id not in dag.task_dict:
            dag.add_task(self)
        elif self.task_id in dag.task_dict and dag.task_dict[
                self.task_id] != self:
            raise DuplicateTaskIdFound(
                "Task id '{}' has already been added to the DAG".format(
                    self.task_id))

        self._dag = dag  # pylint: disable=attribute-defined-outside-init
Пример #6
0
    def add(self, task: DAGNode) -> None:
        """Add a task to this TaskGroup."""
        key = task.node_id

        if key in self.children:
            node_type = "Task" if hasattr(task, 'task_id') else "Task Group"
            raise DuplicateTaskIdFound(
                f"{node_type} id '{key}' has already been added to the DAG")

        if isinstance(task, TaskGroup):
            if self.dag:
                if task.dag is not None and self.dag is not task.dag:
                    raise RuntimeError(
                        "Cannot mix TaskGroups from different DAGs: %s and %s",
                        self.dag, task.dag)
                task.dag = self.dag
            if task.children:
                raise AirflowException("Cannot add a non-empty TaskGroup")

        self.children[key] = task
        task.task_group = weakref.proxy(self)
Пример #7
0
    def __init__(
        self,
        group_id: Optional[str],
        prefix_group_id: bool = True,
        parent_group: Optional["TaskGroup"] = None,
        dag: Optional["DAG"] = None,
        default_args: Optional[Dict] = None,
        tooltip: str = "",
        ui_color: str = "CornflowerBlue",
        ui_fgcolor: str = "#000",
        add_suffix_on_collision: bool = False,
    ):
        from airflow.models.dag import DagContext

        self.prefix_group_id = prefix_group_id
        self.default_args = copy.deepcopy(default_args or {})

        if group_id is None:
            # This creates a root TaskGroup.
            if parent_group:
                raise AirflowException(
                    "Root TaskGroup cannot have parent_group")
            # used_group_ids is shared across all TaskGroups in the same DAG to keep track
            # of used group_id to avoid duplication.
            self.used_group_ids: Set[Optional[str]] = set()
            self._parent_group = None
        else:
            if not isinstance(group_id, str):
                raise ValueError("group_id must be str")
            if not group_id:
                raise ValueError("group_id must not be empty")

            dag = dag or DagContext.get_current_dag()

            if not parent_group and not dag:
                raise AirflowException(
                    "TaskGroup can only be used inside a dag")

            self._parent_group = parent_group or TaskGroupContext.get_current_task_group(
                dag)
            if not self._parent_group:
                raise AirflowException(
                    "TaskGroup must have a parent_group except for the root TaskGroup"
                )
            self.used_group_ids = self._parent_group.used_group_ids

        self._group_id = group_id
        # if given group_id already used assign suffix by incrementing largest used suffix integer
        # Example : task_group ==> task_group__1 -> task_group__2 -> task_group__3
        if group_id in self.used_group_ids:
            if not add_suffix_on_collision:
                raise DuplicateTaskIdFound(
                    f"group_id '{self.group_id}' has already been added to the DAG"
                )
            base = re.split(r'__\d+$', group_id)[0]
            suffixes = sorted(
                int(re.split(r'^.+__', used_group_id)[1])
                for used_group_id in self.used_group_ids
                if used_group_id is not None
                and re.match(rf'^{base}__\d+$', used_group_id))
            if not suffixes:
                self._group_id += '__1'
            else:
                self._group_id = f'{base}__{suffixes[-1] + 1}'

        self.used_group_ids.add(self.group_id)
        self.used_group_ids.add(self.downstream_join_id)
        self.used_group_ids.add(self.upstream_join_id)
        self.children: Dict[str, Union["BaseOperator", "TaskGroup"]] = {}
        if self._parent_group:
            self._parent_group.add(self)

        self.tooltip = tooltip
        self.ui_color = ui_color
        self.ui_fgcolor = ui_fgcolor

        # Keep track of TaskGroups or tasks that depend on this entire TaskGroup separately
        # so that we can optimize the number of edges when entire TaskGroups depend on each other.
        self.upstream_group_ids: Set[Optional[str]] = set()
        self.downstream_group_ids: Set[Optional[str]] = set()
        self.upstream_task_ids: Set[Optional[str]] = set()
        self.downstream_task_ids: Set[Optional[str]] = set()