def execute(self, context):
    """Execute the Apache Beam Pipeline."""
    self.beam_hook = BeamHook(runner=self.runner)
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
    dataflow_job_name: Optional[str] = None
    if isinstance(self.dataflow_config, dict):
        self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

    if is_dataflow:
        dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
            pipeline_options=pipeline_options, job_name_variable_key="job_name"
        )

    pipeline_options.update(self.pipeline_options)

    # Convert argument names from lowerCamelCase to snake case.
    formatted_pipeline_options = {
        convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
    }

    with ExitStack() as exit_stack:
        if self.py_file.lower().startswith("gs://"):
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(gcs_hook.provide_file(object_url=self.py_file))
            self.py_file = tmp_gcs_file.name

        if is_dataflow:
            with self.dataflow_hook.provide_authorized_gcloud():
                self.beam_hook.start_python_pipeline(
                    variables=formatted_pipeline_options,
                    py_file=self.py_file,
                    py_options=self.py_options,
                    py_interpreter=self.py_interpreter,
                    py_requirements=self.py_requirements,
                    py_system_site_packages=self.py_system_site_packages,
                    process_line_callback=process_line_callback,
                )

            self.dataflow_hook.wait_for_done(
                job_name=dataflow_job_name,
                location=self.dataflow_config.location,
                job_id=self.dataflow_job_id,
                multiple_jobs=False,
            )
        else:
            self.beam_hook.start_python_pipeline(
                variables=formatted_pipeline_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=process_line_callback,
            )

    return {"dataflow_job_id": self.dataflow_job_id}
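# A minimal usage sketch, assuming the execute() above belongs to the Beam
# provider's BeamRunPythonPipelineOperator and that dataflow_config accepts the
# Google provider's DataflowConfiguration; the import paths, bucket paths, and
# option values below are illustrative assumptions, not taken from this file.
from airflow import DAG
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.google.cloud.operators.dataflow import DataflowConfiguration
from airflow.utils.dates import days_ago

with DAG("example_beam_python_on_dataflow", start_date=days_ago(1), schedule_interval=None) as dag:
    run_pipeline = BeamRunPythonPipelineOperator(
        task_id="run_wordcount",
        py_file="gs://my-bucket/pipelines/wordcount.py",  # hypothetical GCS path
        runner="DataflowRunner",
        # lowerCamelCase keys are converted to snake_case by execute() above.
        pipeline_options={"tempLocation": "gs://my-bucket/tmp/"},
        dataflow_config=DataflowConfiguration(job_name="wordcount", location="us-central1"),
    )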
def heartbeat(self):
    """
    Heartbeats update the job's entry in the database with a timestamp
    for the latest_heartbeat and allow the job to be killed externally.
    This makes it possible, at the system level, to monitor what is
    actually active. For instance, an old heartbeat for SchedulerJob
    would mean something is wrong.

    This also allows any job to be killed externally, regardless of who
    is running it or on which machine it is running.

    Note that if your heartbeat is set to 60 seconds and you call this
    method after 10 seconds of processing since the last heartbeat, it
    will sleep 50 seconds to complete the 60 seconds and keep a steady
    heart rate. If you go over 60 seconds before calling it, it won't
    sleep at all.
    """
    previous_heartbeat = self.latest_heartbeat

    try:
        with create_session() as session:
            # This will cause it to load from the db
            session.merge(self)
            previous_heartbeat = self.latest_heartbeat

        if self.state == State.SHUTDOWN:
            self.kill()

        is_unit_test = conf.getboolean('core', 'unit_test_mode')
        if not is_unit_test:
            # Figure out how long to sleep for
            sleep_for = 0
            if self.latest_heartbeat:
                seconds_remaining = (
                    self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds()
                )
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

        # Update last heartbeat time
        with create_session() as session:
            # Make the session aware of this object
            session.merge(self)
            self.latest_heartbeat = timezone.utcnow()
            session.commit()
            # At this point, the DB has updated.
            previous_heartbeat = self.latest_heartbeat

            self.heartbeat_callback(session=session)
            self.log.debug('[heartbeat]')
    except OperationalError:
        Stats.incr(convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
        self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
        # We didn't manage to heartbeat, so make sure that the timestamp isn't updated
        self.latest_heartbeat = previous_heartbeat
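# A standalone sketch of the sleep arithmetic described in the docstring above:
# with a 60-second heartrate and 10 seconds elapsed since the last heartbeat,
# the job sleeps the remaining 50 seconds; once more than 60 seconds have
# elapsed, it does not sleep at all. The values are illustrative only.
heartrate = 60
seconds_since_last_heartbeat = 10
sleep_for = max(0, heartrate - seconds_since_last_heartbeat)
assert sleep_for == 50

seconds_since_last_heartbeat = 75
sleep_for = max(0, heartrate - seconds_since_last_heartbeat)
assert sleep_for == 0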
def _init_pipeline_options(
    self,
    format_pipeline_options: bool = False,
    job_name_variable_key: Optional[str] = None,
) -> Tuple[bool, Optional[str], dict, Optional[Callable[[str], None]]]:
    self.beam_hook = BeamHook(runner=self.runner)
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable[[str], None]] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
    dataflow_job_name: Optional[str] = None
    if is_dataflow:
        dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
            pipeline_options=pipeline_options,
            job_name_variable_key=job_name_variable_key,
        )
        self.log.info(pipeline_options)

    pipeline_options.update(self.pipeline_options)

    if format_pipeline_options:
        snake_case_pipeline_options = {
            convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
        }
        return is_dataflow, dataflow_job_name, snake_case_pipeline_options, process_line_callback

    return is_dataflow, dataflow_job_name, pipeline_options, process_line_callback
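# A standalone sketch of the key conversion performed above when
# format_pipeline_options=True: lowerCamelCase pipeline option names are
# rewritten to snake_case. The import path is assumed from the tests below,
# which exercise convert_camel_to_snake via airflow.utils.helpers; the option
# names and values are illustrative only.
from airflow.utils import helpers

pipeline_options = {"tempLocation": "gs://bucket/tmp", "numWorkers": 2}
snake_case_pipeline_options = {
    helpers.convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
}
assert snake_case_pipeline_options == {"temp_location": "gs://bucket/tmp", "num_workers": 2}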
def heartbeat(self):
    """
    Heartbeats update the job's entry in the database with a timestamp
    for the latest_heartbeat and allow the job to be killed externally.
    This makes it possible, at the system level, to monitor what is
    actually active. For instance, an old heartbeat for SchedulerJob
    would mean something is wrong.

    This also allows any job to be killed externally, regardless of who
    is running it or on which machine it is running.

    Note that if your heartbeat is set to 60 seconds and you call this
    method after 10 seconds of processing since the last heartbeat, it
    will sleep 50 seconds to complete the 60 seconds and keep a steady
    heart rate. If you go over 60 seconds before calling it, it won't
    sleep at all.
    """
    try:
        with create_session() as session:
            job = session.query(BaseJob).filter_by(id=self.id).one()
            make_transient(job)
            session.commit()

        if job.state == State.SHUTDOWN:
            self.kill()

        is_unit_test = conf.getboolean('core', 'unit_test_mode')
        if not is_unit_test:
            # Figure out how long to sleep for
            sleep_for = 0
            if job.latest_heartbeat:
                seconds_remaining = (
                    self.heartrate - (timezone.utcnow() - job.latest_heartbeat).total_seconds()
                )
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

        # Update last heartbeat time
        with create_session() as session:
            job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
            job.latest_heartbeat = timezone.utcnow()
            session.merge(job)
            session.commit()

            self.heartbeat_callback(session=session)
            self.log.debug('[heartbeat]')
    except OperationalError:
        Stats.incr(convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
        self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
def test_convert_camel_to_snake(self):
    self.assertEqual(helpers.convert_camel_to_snake('LocalTaskJob'), 'local_task_job')
    self.assertEqual(helpers.convert_camel_to_snake('somethingVeryRandom'), 'something_very_random')
def execute(self, context):
    """Execute the Apache Beam Pipeline."""
    self.beam_hook = BeamHook(runner=self.runner)
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()

    if isinstance(self.dataflow_config, dict):
        self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

    if is_dataflow:
        self.dataflow_hook = DataflowHook(
            gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id,
            delegate_to=self.dataflow_config.delegate_to or self.delegate_to,
            poll_sleep=self.dataflow_config.poll_sleep,
            impersonation_chain=self.dataflow_config.impersonation_chain,
            drain_pipeline=self.dataflow_config.drain_pipeline,
            cancel_timeout=self.dataflow_config.cancel_timeout,
            wait_until_finished=self.dataflow_config.wait_until_finished,
        )
        self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id

        dataflow_job_name = DataflowHook.build_dataflow_job_name(
            self.dataflow_config.job_name, self.dataflow_config.append_job_name
        )
        pipeline_options["job_name"] = dataflow_job_name
        pipeline_options["project"] = self.dataflow_config.project_id
        pipeline_options["region"] = self.dataflow_config.location
        pipeline_options.setdefault("labels", {}).update(
            {"airflow-version": "v" + version.replace(".", "-").replace("+", "-")}
        )

        def set_current_dataflow_job_id(job_id):
            self.dataflow_job_id = job_id

        process_line_callback = process_line_and_extract_dataflow_job_id_callback(
            on_new_job_id_callback=set_current_dataflow_job_id
        )

    pipeline_options.update(self.pipeline_options)

    # Convert argument names from lowerCamelCase to snake case.
    formatted_pipeline_options = {
        convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
    }

    with ExitStack() as exit_stack:
        if self.py_file.lower().startswith("gs://"):
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(  # pylint: disable=no-member
                gcs_hook.provide_file(object_url=self.py_file)
            )
            self.py_file = tmp_gcs_file.name

        self.beam_hook.start_python_pipeline(
            variables=formatted_pipeline_options,
            py_file=self.py_file,
            py_options=self.py_options,
            py_interpreter=self.py_interpreter,
            py_requirements=self.py_requirements,
            py_system_site_packages=self.py_system_site_packages,
            process_line_callback=process_line_callback,
        )

        if is_dataflow:
            self.dataflow_hook.wait_for_done(  # pylint: disable=no-value-for-parameter
                job_name=dataflow_job_name,
                location=self.dataflow_config.location,
                job_id=self.dataflow_job_id,
                multiple_jobs=False,
            )

    return {"dataflow_job_id": self.dataflow_job_id}
def test_convert_camel_to_snake(self):
    assert helpers.convert_camel_to_snake('LocalTaskJob') == 'local_task_job'
    assert helpers.convert_camel_to_snake('somethingVeryRandom') == 'something_very_random'
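# A minimal regex-based sketch that satisfies both tests above; it illustrates
# the expected behavior and is not necessarily the exact implementation shipped
# in airflow.utils.helpers.
import re


def convert_camel_to_snake(camel_str: str) -> str:
    """Convert CamelCase or lowerCamelCase to snake_case."""
    # Insert an underscore before every run of capitals except at the start.
    return re.sub(r"(?!^)([A-Z]+)", r"_\1", camel_str).lower()


assert convert_camel_to_snake("LocalTaskJob") == "local_task_job"
assert convert_camel_to_snake("somethingVeryRandom") == "something_very_random"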