Example #1
from typing import Dict, Optional

from airflow.models import BaseOperator
from airflow.providers.google.cloud.hooks.dataflow import DataflowHook
from airflow.utils.decorators import apply_defaults


class DataflowStartFlexTemplateOperator(BaseOperator):
    """
    Starts flex templates with the Dataflow pipeline.

    :param body: The request body. See:
        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#request-body
    :param location: The location of the Dataflow job (for example europe-west1)
    :type location: str
    :param project_id: The ID of the GCP project that owns the job.
        If set to ``None`` or missing, the default project_id from the GCP connection is used.
    :type project_id: Optional[str]
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
        Platform.
    :type gcp_conn_id: str
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :type delegate_to: str
    """

    template_fields = ["body", 'location', 'project_id', 'gcp_conn_id']

    @apply_defaults
    def __init__(
        self,
        body: Dict,
        location: str,
        project_id: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.body = body
        self.location = location
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.job_id = None
        self.hook: Optional[DataflowHook] = None

    def execute(self, context):
        self.hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
        )

        def set_current_job_id(job_id):
            self.job_id = job_id

        job = self.hook.start_flex_template(
            body=self.body,
            location=self.location,
            project_id=self.project_id,
            on_new_job_id_callback=set_current_job_id,
        )

        return job

    def on_kill(self) -> None:
        self.log.info("On kill.")
        if self.job_id:
            self.hook.cancel_job(job_id=self.job_id,
                                 project_id=self.project_id)
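
A minimal usage sketch (not part of the source) showing how the operator above might be wired into a
DAG. The DAG id, project, GCS template path, and pipeline parameters are hypothetical placeholders;
the "launchParameter" structure follows the flexTemplates.launch request body linked in the docstring.

from datetime import datetime

from airflow import DAG

with DAG(
    dag_id="example_dataflow_flex_template",  # hypothetical DAG id
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
) as dag:
    start_flex_template = DataflowStartFlexTemplateOperator(
        task_id="start_flex_template",
        location="europe-west1",
        project_id="my-project",  # hypothetical project
        body={
            "launchParameter": {
                "jobName": "example-flex-job",
                # Hypothetical GCS path to the flex template spec file.
                "containerSpecGcsPath": "gs://my-bucket/templates/streaming_beam.json",
                # Hypothetical pipeline parameters for that template.
                "parameters": {"input_subscription": "projects/my-project/subscriptions/my-sub"},
            }
        },
    )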
Example #2
from typing import TYPE_CHECKING, Dict, Optional, Sequence

from airflow.models import BaseOperator
from airflow.providers.google.cloud.hooks.dataflow import DataflowHook
from airflow.providers.google.cloud.links.dataflow import DataflowJobLink

if TYPE_CHECKING:
    from airflow.utils.context import Context


class DataflowStartFlexTemplateOperator(BaseOperator):
    """
    Starts flex templates with the Dataflow pipeline.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DataflowStartFlexTemplateOperator`

    :param body: The request body. See:
        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#request-body
    :param location: The location of the Dataflow job (for example europe-west1)
    :param project_id: The ID of the GCP project that owns the job.
        If set to ``None`` or missing, the default project_id from the GCP connection is used.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
        Platform.
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :param drain_pipeline: Optional, set to True if you want to stop the streaming job by draining it
        instead of canceling it when the task instance is killed. See:
        https://cloud.google.com/dataflow/docs/guides/stopping-a-pipeline
    :param cancel_timeout: How long (in seconds) the operator should wait for the pipeline to be
        successfully cancelled when the task is killed.
    :param wait_until_finished: (Optional)
        If True, wait for the end of pipeline execution before exiting.
        If False, only submits job.
        If None, default behavior.

        The default behavior depends on the type of pipeline:

        * for the streaming pipeline, wait for jobs to start,
        * for the batch pipeline, wait for the jobs to complete.

        .. warning::

            For the operator to work properly, you must not call the ``PipelineResult.wait_until_finish``
            method in your pipeline code, i.e. you must use asynchronous execution. Otherwise, your
            pipeline will always wait until finished. For more information, look at:
            `Asynchronous execution
            <https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#python_10>`__

        The process of starting the Dataflow job in Airflow consists of two steps:

        * running a subprocess and reading the stdout/stderr logs for the job id,
        * looping while waiting for the job with the ID from the previous step to finish.
          This loop checks the status of the job.

        Step two starts just after step one has finished, so if you call wait_until_finish in your
        pipeline code, step two will not start until the process stops. When the process stops,
        step two will run, but it will only execute one iteration as the job will already be in a
        terminal state.

        If your pipeline code does not call the wait_until_finish method but you pass
        wait_until_finished=True to the operator, the second loop will wait for the job's terminal state.

        If your pipeline code does not call the wait_until_finish method and you pass
        wait_until_finished=False to the operator, the second loop will check once whether the job is in a
        terminal state and then exit the loop.
    """

    template_fields: Sequence[str] = ("body", "location", "project_id", "gcp_conn_id")
    operator_extra_links = (DataflowJobLink(),)

    def __init__(
        self,
        body: Dict,
        location: str,
        project_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        delegate_to: Optional[str] = None,
        drain_pipeline: bool = False,
        cancel_timeout: Optional[int] = 10 * 60,
        wait_until_finished: Optional[bool] = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.body = body
        self.location = location
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.drain_pipeline = drain_pipeline
        self.cancel_timeout = cancel_timeout
        self.wait_until_finished = wait_until_finished
        self.job = None
        self.hook: Optional[DataflowHook] = None

    def execute(self, context: 'Context'):
        self.hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            drain_pipeline=self.drain_pipeline,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )

        def set_current_job(current_job):
            self.job = current_job
            DataflowJobLink.persist(self, context, self.project_id,
                                    self.location, self.job.get("id"))

        job = self.hook.start_flex_template(
            body=self.body,
            location=self.location,
            project_id=self.project_id,
            on_new_job_callback=set_current_job,
        )

        return job

    def on_kill(self) -> None:
        self.log.info("On kill.")
        if self.job:
            self.hook.cancel_job(
                job_id=self.job.get("id"),
                project_id=self.job.get("projectId"),
                location=self.job.get("location"),
            )
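
A sketch (assumption, not from the source) of launching a streaming flex template asynchronously with
this version of the operator, placed inside a DAG context as in the earlier sketch (DAG boilerplate
omitted here). With wait_until_finished=False the task only submits the job, and drain_pipeline=True
stops the streaming job by draining rather than canceling it when the task instance is killed; the
body contents are hypothetical placeholders.

start_streaming_flex_template = DataflowStartFlexTemplateOperator(
    task_id="start_streaming_flex_template",
    location="europe-west1",
    project_id="my-project",  # hypothetical project
    body={
        "launchParameter": {
            "jobName": "example-streaming-job",
            # Hypothetical GCS path to the flex template spec file.
            "containerSpecGcsPath": "gs://my-bucket/templates/streaming_beam.json",
            # Hypothetical pipeline parameters.
            "parameters": {"input_subscription": "projects/my-project/subscriptions/my-sub"},
        }
    },
    drain_pipeline=True,        # on_kill() drains instead of canceling the streaming job
    cancel_timeout=5 * 60,      # wait up to 5 minutes for a successful stop
    wait_until_finished=False,  # only submit the job, do not block until it finishes
)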
Example #3
from typing import Dict, Optional

from airflow.models import BaseOperator
from airflow.providers.google.cloud.hooks.dataflow import DataflowHook
from airflow.utils.decorators import apply_defaults


class DataflowStartFlexTemplateOperator(BaseOperator):
    """
    Starts flex templates with the Dataflow pipeline.

    :param body: The request body. See:
        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#request-body
    :param location: The location of the Dataflow job (for example europe-west1)
    :type location: str
    :param project_id: The ID of the GCP project that owns the job.
        If set to ``None`` or missing, the default project_id from the GCP connection is used.
    :type project_id: Optional[str]
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
        Platform.
    :type gcp_conn_id: str
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :type delegate_to: str
    :param drain_pipeline: Optional, set to True if you want to stop the streaming job by draining it
        instead of canceling it when the task instance is killed. See:
        https://cloud.google.com/dataflow/docs/guides/stopping-a-pipeline
    :type drain_pipeline: bool
    :param cancel_timeout: How long (in seconds) the operator should wait for the pipeline to be
        successfully cancelled when the task is killed.
    :type cancel_timeout: Optional[int]
    """

    template_fields = ["body", "location", "project_id", "gcp_conn_id"]

    @apply_defaults
    def __init__(
        self,
        body: Dict,
        location: str,
        project_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        delegate_to: Optional[str] = None,
        drain_pipeline: bool = False,
        cancel_timeout: Optional[int] = 10 * 60,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.body = body
        self.location = location
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.drain_pipeline = drain_pipeline
        self.cancel_timeout = cancel_timeout
        self.job_id = None
        self.hook: Optional[DataflowHook] = None

    def execute(self, context):
        self.hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            drain_pipeline=self.drain_pipeline,
            cancel_timeout=self.cancel_timeout,
        )

        def set_current_job_id(job_id):
            self.job_id = job_id

        job = self.hook.start_flex_template(
            body=self.body,
            location=self.location,
            project_id=self.project_id,
            on_new_job_id_callback=set_current_job_id,
        )

        return job

    def on_kill(self) -> None:
        self.log.info("On kill.")
        if self.job_id:
            self.hook.cancel_job(job_id=self.job_id,
                                 project_id=self.project_id)
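
Because execute() returns the launched job, Airflow pushes that return value to XCom by default
(do_xcom_push=True on BaseOperator), so a downstream task can inspect the job metadata. A minimal
sketch, assuming Airflow 2.x and that the operator above was added to a DAG under the task_id
"start_flex_template"; the task ids and the callable below are illustrative.

from airflow.operators.python import PythonOperator


def _report_job(ti):
    # Pull the dict returned by DataflowStartFlexTemplateOperator.execute(),
    # pushed to XCom under the default "return_value" key.
    job = ti.xcom_pull(task_ids="start_flex_template")
    print("Launched Dataflow job:", job)


report_job = PythonOperator(task_id="report_job", python_callable=_report_job)
# Inside the DAG: start_flex_template >> report_job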