示例#1
0
    def execute(self, context):
        """Run the Python Beam pipeline and return the recorded Dataflow job id.

        Options start from a copy of ``default_pipeline_options``. For the Dataflow
        runner they are passed through ``self._set_dataflow``; ``pipeline_options`` is
        then merged on top and every key is converted with ``convert_camel_to_snake``.
        A ``gs://`` ``py_file`` is fetched with ``GCSHook.provide_file`` and
        ``self.py_file`` is rebound to the name of the local copy.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"dataflow_job_id": self.dataflow_job_id}``.
        """
        self.beam_hook = BeamHook(runner=self.runner)
        options = self.default_pipeline_options.copy()
        line_callback: Optional[Callable] = None
        runs_on_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
        job_name: Optional[str] = None

        # A plain dict configuration is promoted to a DataflowConfiguration object.
        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if runs_on_dataflow:
            job_name, options, line_callback = self._set_dataflow(
                pipeline_options=options, job_name_variable_key="job_name"
            )

        # Merged last, so these values win over anything set above.
        options.update(self.pipeline_options)

        # Convert argument names from lowerCamelCase to snake case.
        snake_case_options = {convert_camel_to_snake(name): value for name, value in options.items()}

        def launch():
            # Arguments are read at call time, i.e. after py_file may have been rebound.
            self.beam_hook.start_python_pipeline(
                variables=snake_case_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=line_callback,
            )

        with ExitStack() as stack:
            if self.py_file.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(downloader.provide_file(object_url=self.py_file))
                self.py_file = local_copy.name

            if not runs_on_dataflow:
                launch()
            else:
                with self.dataflow_hook.provide_authorized_gcloud():
                    launch()

                self.dataflow_hook.wait_for_done(
                    job_name=job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=False,
                )

        return {"dataflow_job_id": self.dataflow_job_id}
示例#2
0
    def execute(self, context: 'Context') -> List[str]:
        """Copy objects from the Google Cloud Storage bucket to the S3 destination.

        Objects are listed with ``bucket``, ``prefix`` and ``delimiter``. When
        ``replace`` is false, keys already found under ``dest_s3_key`` are removed
        from the work list first. Each remaining object is fetched to a temporary
        file and uploaded to ``dest_s3_key + <object name>``.

        :param context: task context supplied by the caller; not read here.
        :return: the object names that were selected for upload.
        """
        gcs_hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.google_impersonation_chain,
        )

        self.log.info(
            'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
            self.bucket,
            self.delimiter,
            self.prefix,
        )

        files = gcs_hook.list(bucket_name=self.bucket, prefix=self.prefix, delimiter=self.delimiter)

        s3_hook = S3Hook(
            aws_conn_id=self.dest_aws_conn_id,
            verify=self.dest_verify,
            extra_args=self.dest_s3_extra_args,
        )

        if not self.replace:
            # Without replace, only objects that are in GCS but missing from S3 are copied.
            # Listing is limited to the destination bucket/prefix so parent keys are ignored.
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            keys_in_s3 = s3_hook.list_keys(bucket_name, prefix=prefix)
            if keys_in_s3 is None:
                # Nothing listed: treat as empty so the set difference below still works.
                keys_in_s3 = []
            # Drop the destination prefix so S3 keys are comparable with GCS object names.
            already_copied = {key.replace(prefix, '', 1) for key in keys_in_s3}
            files = list(set(files) - already_copied)

        if not files:
            self.log.info("In sync, no files needed to be uploaded to S3")
            return files

        for object_name in files:
            with gcs_hook.provide_file(object_name=object_name, bucket_name=self.bucket) as local_tmp_file:
                dest_key = self.dest_s3_key + object_name
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_file(
                    filename=local_tmp_file.name,
                    key=dest_key,
                    replace=self.replace,
                    acl_policy=self.s3_acl_policy,
                )

        self.log.info("All done, uploaded %d files to S3", len(files))
        return files
示例#3
0
    def execute(self, context):
        """Start the Python Dataflow job through ``DataflowHook.start_python_dataflow``.

        A ``gs://`` ``py_file`` is fetched with ``GCSHook.provide_file`` first and
        ``self.py_file`` is rebound to the local copy's name. ``options`` are merged
        over a copy of ``dataflow_default_options`` and the keys are rewritten from
        lowerCamelCase to snake case. The job id reported by the hook's callback is
        stored on ``self.job_id``.

        :param context: task context supplied by the caller; not read here.
        """

        def to_snake_case(name):
            # Each upper-case ASCII letter becomes "_" followed by its lower-case form.
            return re.sub(r'[A-Z]', lambda match: '_' + match.group(0).lower(), name)

        def remember_job_id(job_id):
            self.job_id = job_id

        with ExitStack() as stack:
            if self.py_file.lower().startswith('gs://'):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(  # pylint: disable=no-member
                    downloader.provide_file(object_url=self.py_file)
                )
                self.py_file = local_copy.name

            self.hook = DataflowHook(
                gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, poll_sleep=self.poll_sleep
            )

            merged_options = self.dataflow_default_options.copy()
            merged_options.update(self.options)
            snake_case_options = {to_snake_case(name): value for name, value in merged_options.items()}

            self.hook.start_python_dataflow(
                job_name=self.job_name,
                variables=snake_case_options,
                dataflow=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                on_new_job_id_callback=remember_job_id,
                project_id=self.project_id,
                location=self.location,
            )
示例#4
0
文件: beam.py 项目: dskoda1/airflow
    def execute(self, context: 'Context'):
        """Execute the Apache Beam Go pipeline.

        Pipeline options come from ``self._init_pipeline_options``. A ``gs://``
        ``go_file`` is downloaded into a fresh temporary directory, ``self.go_file``
        is rebound to the local copy and ``should_init_go_module`` is switched on.

        When running on Dataflow (and a Dataflow hook is set) the pipeline is
        launched inside ``provide_authorized_gcloud()``, the job link is persisted
        and, if both a job name and a location are known, the method waits for the
        job to finish.

        :param context: task context, forwarded to ``DataflowJobLink.persist``.
        :return: ``{"dataflow_job_id": ...}`` on the Dataflow path, otherwise ``None``.
        :raises AirflowException: if ``self.beam_hook`` is not set.
        """
        (
            is_dataflow,
            dataflow_job_name,
            snake_case_pipeline_options,
            process_line_callback,
        ) = self._init_pipeline_options(format_pipeline_options=True,
                                        job_name_variable_key="job_name")

        if not self.beam_hook:
            raise AirflowException("Beam hook is not defined.")

        with ExitStack() as exit_stack:
            if self.go_file.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)

                # The directory must outlive the pipeline run, so it is registered on
                # the ExitStack instead of a nested ``with`` (which removed it, and the
                # downloaded file inside it, before start_go_pipeline was called).
                # ExitStack unwinds in reverse order: the file is released first,
                # then the directory.
                tmp_dir = exit_stack.enter_context(
                    tempfile.TemporaryDirectory(prefix="apache-beam-go"))
                tmp_gcs_file = exit_stack.enter_context(
                    gcs_hook.provide_file(object_url=self.go_file,
                                          dir=tmp_dir))
                self.go_file = tmp_gcs_file.name
                self.should_init_go_module = True

            if is_dataflow and self.dataflow_hook:
                with self.dataflow_hook.provide_authorized_gcloud():
                    self.beam_hook.start_go_pipeline(
                        variables=snake_case_pipeline_options,
                        go_file=self.go_file,
                        process_line_callback=process_line_callback,
                        should_init_module=self.should_init_go_module,
                    )

                DataflowJobLink.persist(
                    self,
                    context,
                    self.dataflow_config.project_id,
                    self.dataflow_config.location,
                    self.dataflow_job_id,
                )
                # Waiting is skipped when the job name or the location is unknown.
                if dataflow_job_name and self.dataflow_config.location:
                    self.dataflow_hook.wait_for_done(
                        job_name=dataflow_job_name,
                        location=self.dataflow_config.location,
                        job_id=self.dataflow_job_id,
                        multiple_jobs=False,
                        project_id=self.dataflow_config.project_id,
                    )
                return {"dataflow_job_id": self.dataflow_job_id}
            else:
                self.beam_hook.start_go_pipeline(
                    variables=snake_case_pipeline_options,
                    go_file=self.go_file,
                    process_line_callback=process_line_callback,
                    should_init_module=self.should_init_go_module,
                )
示例#5
0
文件: beam.py 项目: dskoda1/airflow
    def execute(self, context: 'Context'):
        """Run the Python Beam pipeline, waiting on Dataflow when that runner is used.

        Options come from ``self._init_pipeline_options``. A ``gs://`` ``py_file`` is
        fetched with ``GCSHook.provide_file`` and ``self.py_file`` is rebound to the
        local copy's name.

        :param context: task context, forwarded to ``DataflowJobLink.persist``.
        :return: ``{"dataflow_job_id": ...}`` on the Dataflow path, otherwise ``None``.
        :raises AirflowException: if ``self.beam_hook`` is not set.
        """
        (
            on_dataflow,
            job_name,
            options,
            line_callback,
        ) = self._init_pipeline_options(format_pipeline_options=True, job_name_variable_key="job_name")

        if not self.beam_hook:
            raise AirflowException("Beam hook is not defined.")

        def launch():
            # Arguments are read at call time, i.e. after py_file may have been rebound.
            self.beam_hook.start_python_pipeline(
                variables=options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=line_callback,
            )

        with ExitStack() as stack:
            if self.py_file.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(downloader.provide_file(object_url=self.py_file))
                self.py_file = local_copy.name

            if on_dataflow and self.dataflow_hook:
                with self.dataflow_hook.provide_authorized_gcloud():
                    launch()

                config = self.dataflow_config
                DataflowJobLink.persist(
                    self,
                    context,
                    config.project_id,
                    config.location,
                    self.dataflow_job_id,
                )
                # Waiting is skipped when the job name or the location is unknown.
                if job_name and config.location:
                    self.dataflow_hook.wait_for_done(
                        job_name=job_name,
                        location=config.location,
                        job_id=self.dataflow_job_id,
                        multiple_jobs=False,
                        project_id=config.project_id,
                    )
                return {"dataflow_job_id": self.dataflow_job_id}

            # Any other runner: launch only, nothing to wait for and nothing returned.
            launch()
示例#6
0
文件: dataflow.py 项目: mhenc/airflow
    def execute(self, context: 'Context'):
        """Launch the Python job on the Dataflow runner and wait for it.

        Builds the job name with ``build_dataflow_job_name``, seeds the options with
        ``job_name``, ``project`` and ``region``, merges ``options`` on top and converts
        the keys from lowerCamelCase to snake case. A ``gs://`` ``py_file`` is fetched
        with ``GCSHook.provide_file`` and ``self.py_file`` is rebound to the local copy.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"job_id": self.job_id}``.
        """
        self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
        self.dataflow_hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            poll_sleep=self.poll_sleep,
            impersonation_chain=None,
            drain_pipeline=self.drain_pipeline,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )

        launch_job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)

        options = self.dataflow_default_options.copy()
        options["job_name"] = launch_job_name
        options["project"] = self.project_id or self.dataflow_hook.project_id
        options["region"] = self.location
        # Merged last, so user-supplied options win over the values seeded above.
        options.update(self.options)

        def to_snake_case(name):
            # Each upper-case ASCII letter becomes "_" followed by its lower-case form.
            return re.sub(r"[A-Z]", lambda match: "_" + match.group(0).lower(), name)

        snake_case_options = {to_snake_case(name): value for name, value in options.items()}

        def remember_job_id(job_id):
            self.job_id = job_id

        line_callback = process_line_and_extract_dataflow_job_id_callback(
            on_new_job_id_callback=remember_job_id
        )

        with ExitStack() as stack:
            if self.py_file.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(downloader.provide_file(object_url=self.py_file))
                self.py_file = local_copy.name

            with self.dataflow_hook.provide_authorized_gcloud():
                self.beam_hook.start_python_pipeline(
                    variables=snake_case_options,
                    py_file=self.py_file,
                    py_options=self.py_options,
                    py_interpreter=self.py_interpreter,
                    py_requirements=self.py_requirements,
                    py_system_site_packages=self.py_system_site_packages,
                    process_line_callback=line_callback,
                )

            self.dataflow_hook.wait_for_done(
                job_name=launch_job_name,
                location=self.location,
                job_id=self.job_id,
                multiple_jobs=False,
            )

        return {"job_id": self.job_id}
示例#7
0
    def deserialize_value(result) -> Any:
        """Deserialize an XCom value, fetching it from GCS when it is a reference.

        The value is first decoded with ``BaseXCom.deserialize_value``. If the decoded
        value is a string starting with ``GCSXComBackend.PREFIX``, the remainder is
        used as an object name in ``GCSXComBackend.BUCKET_NAME`` and the unpickled
        content of that object is returned instead.

        :param result: the stored XCom record, passed to ``BaseXCom.deserialize_value``.
        :return: the decoded value, or the unpickled GCS object it refers to.
        """
        result = BaseXCom.deserialize_value(result)

        prefix = GCSXComBackend.PREFIX
        if isinstance(result, str) and result.startswith(prefix):
            # Strip only the leading marker. ``str.replace`` would also remove any
            # later occurrence of the marker inside the object name.
            object_name = result[len(prefix):]
            hook = GCSHook()

            with hook.provide_file(bucket_name=GCSXComBackend.BUCKET_NAME,
                                   object_name=object_name) as f:
                f.flush()
                # NOTE(review): pickle.load can execute arbitrary code. This is only
                # safe if the bucket content is fully trusted - confirm.
                result = pickle.load(f)

        return result
示例#8
0
    def execute(self, context):
        """Start the Java Dataflow job unless a job with the same name is running.

        ``options`` are merged over a shallow copy of ``dataflow_default_options``.
        Unless ``check_if_running`` is ``IgnoreJob`` the hook is asked whether the job
        is running; with ``WaitForRun`` the question is repeated until the answer is
        no. If a job is still reported as running, nothing is started. A ``gs://``
        jar is fetched with ``GCSHook.provide_file`` and ``self.jar`` is rebound to
        the local copy's name. The job id reported by the hook is stored on
        ``self.job_id``.

        :param context: task context supplied by the caller; not read here.
        """
        self.hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            poll_sleep=self.poll_sleep,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )
        merged_options = copy.copy(self.dataflow_default_options)
        merged_options.update(self.options)

        def job_is_running():
            return self.hook.is_job_dataflow_running(  # type: ignore[attr-defined]
                name=self.job_name,
                variables=merged_options,
                project_id=self.project_id,
                location=self.location,
            )

        def remember_job_id(job_id):
            self.job_id = job_id

        already_running = False
        if self.check_if_running != CheckJobRunning.IgnoreJob:
            already_running = job_is_running()
            # WaitForRun keeps asking until the job is no longer reported as running.
            while already_running and self.check_if_running == CheckJobRunning.WaitForRun:
                already_running = job_is_running()

        if already_running:
            return

        with ExitStack() as stack:
            if self.jar.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(  # pylint: disable=no-member
                    downloader.provide_file(object_url=self.jar)
                )
                self.jar = local_copy.name

            self.hook.start_java_dataflow(  # type: ignore[attr-defined]
                job_name=self.job_name,
                variables=merged_options,
                jar=self.jar,
                job_class=self.job_class,
                append_job_name=True,
                multiple_jobs=self.multiple_jobs,
                on_new_job_id_callback=remember_job_id,
                project_id=self.project_id,
                location=self.location,
            )
示例#9
0
    def execute(self, context: 'Context'):
        """Launch the Java pipeline on Dataflow unless the job is already running.

        The options are a deep copy of ``dataflow_default_options`` seeded with
        ``jobName``, ``project`` and ``region``, merged with ``options`` and given an
        ``airflow-version`` label. The running check uses ``self.job_name``; the
        name built by ``build_dataflow_job_name`` is written to ``jobName`` only
        right before launch. A ``gs://`` jar is fetched with
        ``GCSHook.provide_file`` and ``self.jar`` is rebound to the local copy.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"job_id": self.job_id}``.
        """
        self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
        self.dataflow_hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            poll_sleep=self.poll_sleep,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )
        launch_job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)

        options = copy.deepcopy(self.dataflow_default_options)
        options["jobName"] = self.job_name
        options["project"] = self.project_id or self.dataflow_hook.project_id
        options["region"] = self.location
        options.update(self.options)
        labels = options.setdefault("labels", {})
        labels.update({"airflow-version": "v" + version.replace(".", "-").replace("+", "-")})
        # NOTE(review): repeats the merge a few lines above; looks redundant -
        # confirm before removing.
        options.update(self.options)

        def remember_job_id(job_id):
            self.job_id = job_id

        def job_is_running():
            return self.dataflow_hook.is_job_dataflow_running(
                name=self.job_name,
                variables=options,
            )

        line_callback = process_line_and_extract_dataflow_job_id_callback(
            on_new_job_id_callback=remember_job_id)

        with ExitStack() as stack:
            if self.jar.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(downloader.provide_file(object_url=self.jar))
                self.jar = local_copy.name

            already_running = False
            if self.check_if_running != CheckJobRunning.IgnoreJob:
                already_running = job_is_running()
                # WaitForRun keeps asking until the job is no longer reported as running.
                while already_running and self.check_if_running == CheckJobRunning.WaitForRun:
                    already_running = job_is_running()

            if not already_running:
                # Switch from the name used for the check to the name built for this launch.
                options["jobName"] = launch_job_name
                with self.dataflow_hook.provide_authorized_gcloud():
                    self.beam_hook.start_java_pipeline(
                        variables=options,
                        jar=self.jar,
                        job_class=self.job_class,
                        process_line_callback=line_callback,
                    )
                self.dataflow_hook.wait_for_done(
                    job_name=launch_job_name,
                    location=self.location,
                    job_id=self.job_id,
                    multiple_jobs=self.multiple_jobs,
                )

        return {"job_id": self.job_id}
示例#10
0
    def execute(self, context: 'Context'):
        """Run the Java Beam pipeline, with Dataflow-specific checks when applicable.

        Options start from a copy of ``default_pipeline_options``, go through
        ``self._set_dataflow`` for the Dataflow runner and are then merged with
        ``pipeline_options``. A ``gs://`` jar is fetched with ``GCSHook.provide_file``
        and ``self.jar`` is rebound to the local copy's name. On Dataflow, the launch
        is skipped while a job with the configured name is reported as running.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"dataflow_job_id": self.dataflow_job_id}``.
        """
        self.beam_hook = BeamHook(runner=self.runner)
        options = self.default_pipeline_options.copy()
        line_callback: Optional[Callable] = None
        runs_on_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
        launch_job_name: Optional[str] = None

        if runs_on_dataflow:
            launch_job_name, options, line_callback = self._set_dataflow(
                pipeline_options=options, job_name_variable_key=None)

        # Merged last, so these values win over anything set above.
        options.update(self.pipeline_options)

        def launch():
            self.beam_hook.start_java_pipeline(
                variables=options,
                jar=self.jar,
                job_class=self.job_class,
                process_line_callback=line_callback,
            )

        def job_is_running():
            # project_id is deliberately not passed: per the original note, the hook
            # method is wrapped by @_fallback_to_project_id_from_variables, which takes
            # it from ``variables`` and raises if it is supplied in both places.
            return self.dataflow_hook.is_job_dataflow_running(
                name=self.dataflow_config.job_name,
                variables=options,
            )

        with ExitStack() as stack:
            if self.jar.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(downloader.provide_file(object_url=self.jar))
                self.jar = local_copy.name

            if not (runs_on_dataflow and self.dataflow_hook):
                launch()
            else:
                already_running = False
                if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                    already_running = job_is_running()
                    # WaitForRun keeps asking until the job is no longer reported as running.
                    while already_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                        already_running = job_is_running()

                if not already_running:
                    options["jobName"] = launch_job_name
                    with self.dataflow_hook.provide_authorized_gcloud():
                        launch()
                    # Waiting is skipped when the job name or the location is unknown.
                    if launch_job_name and self.dataflow_config.location:
                        # Any falsy setting is normalised to False.
                        multiple_jobs = self.dataflow_config.multiple_jobs or False
                        self.dataflow_hook.wait_for_done(
                            job_name=launch_job_name,
                            location=self.dataflow_config.location,
                            job_id=self.dataflow_job_id,
                            multiple_jobs=multiple_jobs,
                            project_id=self.dataflow_config.project_id,
                        )

        return {"dataflow_job_id": self.dataflow_job_id}
示例#11
0
    def execute(self, context):
        """Run the Java Beam pipeline, configuring Dataflow inline when it is the runner.

        For the Dataflow runner this builds ``self.dataflow_hook``, resolves the
        project id, stores the built job name on ``self._dataflow_job_name`` and seeds
        the options with ``jobName``, ``project``, ``region`` and an ``airflow-version``
        label. ``pipeline_options`` is merged last. A ``gs://`` jar is fetched with
        ``GCSHook.provide_file`` and ``self.jar`` is rebound to the local copy's name.
        On Dataflow, the launch is skipped while the configured job is reported as
        running; otherwise the method launches and waits for completion.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"dataflow_job_id": self.dataflow_job_id}``.
        """
        self.beam_hook = BeamHook(runner=self.runner)
        options = self.default_pipeline_options.copy()
        line_callback: Optional[Callable] = None
        runs_on_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()

        # A plain dict configuration is promoted to a DataflowConfiguration object.
        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if runs_on_dataflow:
            self.dataflow_hook = DataflowHook(
                gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id,
                delegate_to=self.dataflow_config.delegate_to or self.delegate_to,
                poll_sleep=self.dataflow_config.poll_sleep,
                impersonation_chain=self.dataflow_config.impersonation_chain,
                drain_pipeline=self.dataflow_config.drain_pipeline,
                cancel_timeout=self.dataflow_config.cancel_timeout,
                wait_until_finished=self.dataflow_config.wait_until_finished,
            )
            # Fall back to the hook's project when the configuration names none.
            self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id

            self._dataflow_job_name = DataflowHook.build_dataflow_job_name(
                self.dataflow_config.job_name, self.dataflow_config.append_job_name)
            # The configured name is used for the running check; it is swapped for
            # the built name right before launch.
            options["jobName"] = self.dataflow_config.job_name
            options["project"] = self.dataflow_config.project_id
            options["region"] = self.dataflow_config.location
            labels = options.setdefault("labels", {})
            labels.update({"airflow-version": "v" + version.replace(".", "-").replace("+", "-")})

            def remember_job_id(job_id):
                self.dataflow_job_id = job_id

            line_callback = process_line_and_extract_dataflow_job_id_callback(
                on_new_job_id_callback=remember_job_id)

        # Merged last, so these values win over anything set above.
        options.update(self.pipeline_options)

        def launch():
            self.beam_hook.start_java_pipeline(
                variables=options,
                jar=self.jar,
                job_class=self.job_class,
                process_line_callback=line_callback,
            )

        def job_is_running():
            # project_id is deliberately not passed: per the original note, the hook
            # method is wrapped by @_fallback_to_project_id_from_variables, which takes
            # it from ``variables`` and raises if it is supplied in both places.
            return self.dataflow_hook.is_job_dataflow_running(  # pylint: disable=no-value-for-parameter
                name=self.dataflow_config.job_name,
                variables=options,
            )

        with ExitStack() as stack:
            if self.jar.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(  # pylint: disable=no-member
                    downloader.provide_file(object_url=self.jar))
                self.jar = local_copy.name

            if not runs_on_dataflow:
                launch()
            else:
                already_running = False
                if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                    already_running = job_is_running()
                    # WaitForRun keeps asking until the job is no longer reported as running.
                    while already_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                        already_running = job_is_running()

                if not already_running:
                    options["jobName"] = self._dataflow_job_name
                    launch()
                    self.dataflow_hook.wait_for_done(
                        job_name=self._dataflow_job_name,
                        location=self.dataflow_config.location,
                        job_id=self.dataflow_job_id,
                        multiple_jobs=self.dataflow_config.multiple_jobs,
                        project_id=self.dataflow_config.project_id,
                    )

        return {"dataflow_job_id": self.dataflow_job_id}
示例#12
0
    def execute(self, context):
        """Run the Python Beam pipeline, configuring Dataflow inline when it is the runner.

        For the Dataflow runner this builds ``self.dataflow_hook``, resolves the
        project id and seeds the options with ``job_name``, ``project``, ``region`` and
        an ``airflow-version`` label. ``pipeline_options`` is merged last and all keys
        are converted with ``convert_camel_to_snake``. A ``gs://`` ``py_file`` is fetched
        with ``GCSHook.provide_file`` and ``self.py_file`` is rebound to the local copy.
        On Dataflow the method waits for the job after launching it.

        :param context: task context supplied by the caller; not read here.
        :return: ``{"dataflow_job_id": self.dataflow_job_id}``.
        """
        self.beam_hook = BeamHook(runner=self.runner)
        options = self.default_pipeline_options.copy()
        line_callback: Optional[Callable] = None
        runs_on_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()

        # A plain dict configuration is promoted to a DataflowConfiguration object.
        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if runs_on_dataflow:
            self.dataflow_hook = DataflowHook(
                gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id,
                delegate_to=self.dataflow_config.delegate_to or self.delegate_to,
                poll_sleep=self.dataflow_config.poll_sleep,
                impersonation_chain=self.dataflow_config.impersonation_chain,
                drain_pipeline=self.dataflow_config.drain_pipeline,
                cancel_timeout=self.dataflow_config.cancel_timeout,
                wait_until_finished=self.dataflow_config.wait_until_finished,
            )
            # Fall back to the hook's project when the configuration names none.
            self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id

            # Only bound on the Dataflow path; it is read again only on that path below.
            launch_job_name = DataflowHook.build_dataflow_job_name(
                self.dataflow_config.job_name, self.dataflow_config.append_job_name)
            options["job_name"] = launch_job_name
            options["project"] = self.dataflow_config.project_id
            options["region"] = self.dataflow_config.location
            labels = options.setdefault("labels", {})
            labels.update({"airflow-version": "v" + version.replace(".", "-").replace("+", "-")})

            def remember_job_id(job_id):
                self.dataflow_job_id = job_id

            line_callback = process_line_and_extract_dataflow_job_id_callback(
                on_new_job_id_callback=remember_job_id)

        # Merged last, so these values win over anything set above.
        options.update(self.pipeline_options)

        # Convert argument names from lowerCamelCase to snake case.
        snake_case_options = {convert_camel_to_snake(name): value for name, value in options.items()}

        with ExitStack() as stack:
            if self.py_file.lower().startswith("gs://"):
                downloader = GCSHook(self.gcp_conn_id, self.delegate_to)
                local_copy = stack.enter_context(  # pylint: disable=no-member
                    downloader.provide_file(object_url=self.py_file))
                self.py_file = local_copy.name

            self.beam_hook.start_python_pipeline(
                variables=snake_case_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=line_callback,
            )

            if runs_on_dataflow:
                self.dataflow_hook.wait_for_done(  # pylint: disable=no-value-for-parameter
                    job_name=launch_job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=False,
                )

        return {"dataflow_job_id": self.dataflow_job_id}
示例#13
0
文件: beam.py 项目: leahecole/airflow
    def execute(self, context: 'Context'):
        """Execute the Apache Beam Java pipeline.

        The pipeline options, the Dataflow job name and the output-line callback
        are obtained from ``_init_pipeline_options``. A jar located on GCS
        (``gs://`` prefix) is first staged as a local temporary file. On the
        Dataflow path the method can check for an already-running job of the
        same name (governed by ``dataflow_config.check_if_running``), starts the
        pipeline, persists the Dataflow job link and waits for the job; on any
        other runner it simply starts the Java pipeline.

        :param context: Airflow task context; passed to ``DataflowJobLink.persist``.
        :return: on the Dataflow path, a dict ``{"dataflow_job_id": <id>}``.
        :raises AirflowException: if ``self.beam_hook`` is not set.
        """
        (
            is_dataflow,
            dataflow_job_name,
            pipeline_options,
            process_line_callback,
        ) = self._init_pipeline_options()

        if not self.beam_hook:
            raise AirflowException("Beam hook is not defined.")

        with ExitStack() as exit_stack:
            if self.jar.lower().startswith("gs://"):
                # Stage the GCS object locally; the temporary file stays available until
                # the ExitStack unwinds, i.e. for the rest of this ``with`` block.
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(
                    gcs_hook.provide_file(object_url=self.jar))
                self.jar = tmp_gcs_file.name

            if is_dataflow and self.dataflow_hook:
                is_running = False
                # Unless configured to ignore existing jobs, ask Dataflow whether a job
                # with the configured name is already running.
                if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                    is_running = (
                        # project_id is deliberately not passed here. Per the original note on
                        # this call, is_job_dataflow_running is wrapped by the
                        # @_fallback_to_project_id_from_variables decorator, which takes
                        # project_id from ``variables`` and raises if it is supplied both there
                        # and as a parameter (it is already present in ``variables``).
                        # TODO confirm against DataflowHook.
                        self.dataflow_hook.is_job_dataflow_running(
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        ))
                    while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                        # WaitForRun: keep polling until the existing job is no longer running.
                        # project_id is omitted for the same reason as in the call above.
                        # NOTE(review): no sleep/backoff is visible in this loop; presumably
                        # the hook call itself takes long enough to throttle it - confirm.

                        is_running = self.dataflow_hook.is_job_dataflow_running(
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        )
                # Launch only if no job with this name is (still) running.
                if not is_running:
                    # Inject the generated job name into the options handed to the Java pipeline.
                    pipeline_options["jobName"] = dataflow_job_name
                    with self.dataflow_hook.provide_authorized_gcloud():
                        self.beam_hook.start_java_pipeline(
                            variables=pipeline_options,
                            jar=self.jar,
                            job_class=self.job_class,
                            process_line_callback=process_line_callback,
                        )
                    # Persisting the job link and waiting both need a job name and a location.
                    if dataflow_job_name and self.dataflow_config.location:
                        multiple_jobs = self.dataflow_config.multiple_jobs or False
                        DataflowJobLink.persist(
                            self,
                            context,
                            self.dataflow_config.project_id,
                            self.dataflow_config.location,
                            self.dataflow_job_id,
                        )
                        self.dataflow_hook.wait_for_done(
                            job_name=dataflow_job_name,
                            location=self.dataflow_config.location,
                            job_id=self.dataflow_job_id,
                            multiple_jobs=multiple_jobs,
                            project_id=self.dataflow_config.project_id,
                        )
                # Also reached when the launch was skipped because a job was already running.
                return {"dataflow_job_id": self.dataflow_job_id}
            else:
                # Non-Dataflow runner (or no Dataflow hook): just start the pipeline.
                self.beam_hook.start_java_pipeline(
                    variables=pipeline_options,
                    jar=self.jar,
                    job_class=self.job_class,
                    process_line_callback=process_line_callback,
                )