def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info('Tmp dir root location: \n %s', gettempdir())

    # Prepare env for child process.
    if self.env is None:
        self.env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info('Exporting the following env vars:\n' +
                  '\n'.join(["{}={}".format(k, v)
                             for k, v in airflow_context_vars.items()]))
    self.env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as tmp_file:
            tmp_file.write(bytes(self.bash_command, 'utf_8'))
            tmp_file.flush()
            script_location = os.path.abspath(tmp_file.name)
            self.log.info('Temporary script location: %s', script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info('Running command: %s', self.bash_command)
            sub_process = Popen(
                ['bash', tmp_file.name],
                stdout=PIPE,
                stderr=STDOUT,
                cwd=tmp_dir,
                env=self.env,
                preexec_fn=pre_exec)

            self.sub_process = sub_process

            self.log.info('Output:')
            line = ''
            for raw_line in iter(sub_process.stdout.readline, b''):
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)

            sub_process.wait()

            self.log.info('Command exited with return code %s', sub_process.returncode)

            if sub_process.returncode:
                raise AirflowException('Bash command failed')

    if self.xcom_push_flag:
        return line
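# The operator above copies context_to_airflow_vars(..., in_env_var_format=True)
# into the child-process environment, so the bash command itself can read the
# AIRFLOW_CTX_* variables. A minimal, hypothetical usage sketch follows; the
# DAG/task ids are illustrative and the BashOperator import path varies across
# Airflow versions, so this is not taken from the source above.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

with DAG(dag_id='demo_ctx_vars', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    # The child shell sees the exported context variables at run time.
    print_ctx = BashOperator(
        task_id='print_ctx',
        bash_command='echo "running $AIRFLOW_CTX_DAG_ID.$AIRFLOW_CTX_TASK_ID '
                     'at $AIRFLOW_CTX_EXECUTION_DATE"',
    )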
def execute(self, context):
    with NamedTemporaryFile() as tmp_file:
        self.log.info("Fetching file from Hive")
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        hive.to_csv(hql=self.hql, csv_filepath=tmp_file.name,
                    hive_conf=context_to_airflow_vars(context))
        self.log.info("Pushing to samba")
        samba = SambaHook(samba_conn_id=self.samba_conn_id)
        samba.push_from_local(self.destination_filepath, tmp_file.name)
def execute(self, context):
    self.log.info('Executing: %s', self.hql)
    self.hook = self.get_hook()

    # set the mapred_job_name if it's not set with dag, task, execution time info
    if not self.mapred_job_name:
        ti = context['ti']
        self.hook.mapred_job_name = 'Airflow HiveOperator task for {}.{}.{}.{}'\
            .format(ti.hostname.split('.')[0], ti.dag_id, ti.task_id,
                    ti.execution_date.isoformat())

    if self.hiveconf_jinja_translate:
        self.hiveconfs = context_to_airflow_vars(context)
    else:
        self.hiveconfs.update(context_to_airflow_vars(context))

    self.log.info('Passing HiveConf: %s', self.hiveconfs)
    self.hook.run_cli(hql=self.hql, schema=self.schema, hive_conf=self.hiveconfs)
def test_context_to_airflow_vars_all_context(self):
    self.assertDictEqual(
        operator_helpers.context_to_airflow_vars(self.context),
        {
            'airflow.ctx.dag.dag_id': self.dag_id,
            'airflow.ctx.dag_run.execution_date': self.execution_date,
            'airflow.ctx.task.task_id': self.task_id,
            'airflow.ctx.task_instance.execution_date': self.execution_date,
        }
    )
def test_context_to_airflow_vars_all_context(self):
    self.assertDictEqual(
        operator_helpers.context_to_airflow_vars(self.context),
        {
            'airflow.ctx.dag_id': self.dag_id,
            'airflow.ctx.execution_date': self.execution_date,
            'airflow.ctx.task_id': self.task_id,
            'airflow.ctx.dag_run_id': self.dag_run_id,
        }
    )
    self.assertDictEqual(
        operator_helpers.context_to_airflow_vars(self.context, in_env_var_format=True),
        {
            'AIRFLOW_CTX_DAG_ID': self.dag_id,
            'AIRFLOW_CTX_EXECUTION_DATE': self.execution_date,
            'AIRFLOW_CTX_TASK_ID': self.task_id,
            'AIRFLOW_CTX_DAG_RUN_ID': self.dag_run_id,
        }
    )
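# Taken together, the empty-context tests and the assertions above pin down the
# contract of context_to_airflow_vars: an empty context yields {}, the default
# format uses dotted 'airflow.ctx.*' keys, and in_env_var_format=True upper-cases
# them into 'AIRFLOW_CTX_*'. Below is a minimal sketch of a function satisfying
# the newer key layout asserted here; it is NOT the shipped
# airflow.utils.operator_helpers implementation, and the attribute lookups on
# 'task_instance' and 'dag_run' are assumptions about what the context carries.
def context_to_airflow_vars_sketch(context, in_env_var_format=False):
    params = {}
    ti = context.get('task_instance')
    if ti is not None:
        if getattr(ti, 'dag_id', None):
            params['airflow.ctx.dag_id'] = ti.dag_id
        if getattr(ti, 'task_id', None):
            params['airflow.ctx.task_id'] = ti.task_id
        if getattr(ti, 'execution_date', None):
            params['airflow.ctx.execution_date'] = str(ti.execution_date)
    dag_run = context.get('dag_run')
    if dag_run is not None and getattr(dag_run, 'run_id', None):
        params['airflow.ctx.dag_run_id'] = dag_run.run_id
    if in_env_var_format:
        # 'airflow.ctx.dag_id' -> 'AIRFLOW_CTX_DAG_ID'
        params = {k.upper().replace('.', '_'): v for k, v in params.items()}
    return params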
def test_execute_with_hive_conf(self, mock_hive_hook, mock_mysql_hook):
    context = {}
    self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

    HiveToMySqlTransfer(**self.kwargs).execute(context=context)

    hive_conf = context_to_airflow_vars(context)
    hive_conf.update(self.kwargs['hive_conf'])
    mock_hive_hook.return_value.get_records.assert_called_once_with(
        self.kwargs['sql'],
        hive_conf=hive_conf
    )
def test_execute(self, mock_tmp_file, mock_hive_hook, mock_samba_hook):
    type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
    mock_tmp_file.return_value.__enter__ = Mock(return_value=mock_tmp_file)
    context = {}

    Hive2SambaOperator(**self.kwargs).execute(context)

    mock_hive_hook.assert_called_once_with(
        hiveserver2_conn_id=self.kwargs['hiveserver2_conn_id'])
    mock_hive_hook.return_value.to_csv.assert_called_once_with(
        hql=self.kwargs['hql'],
        csv_filepath=mock_tmp_file.name,
        hive_conf=context_to_airflow_vars(context))
    mock_samba_hook.assert_called_once_with(samba_conn_id=self.kwargs['samba_conn_id'])
    mock_samba_hook.return_value.push_from_local.assert_called_once_with(
        self.kwargs['destination_filepath'], mock_tmp_file.name)
def execute(self, context):
    # Export context to make it available for callables to use.
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info("Exporting the following env vars:\n" +
                  '\n'.join(["{}={}".format(k, v)
                             for k, v in airflow_context_vars.items()]))
    os.environ.update(airflow_context_vars)

    if self.provide_context:
        context.update(self.op_kwargs)
        context['templates_dict'] = self.templates_dict
        self.op_kwargs = context

    return_value = self.execute_callable()
    self.log.info("Done. Returned value was: %s", return_value)
    return return_value
def get_env(self, context):
    """Builds the set of environment variables to be exposed for the bash command"""
    system_env = os.environ.copy()
    env = self.env
    if env is None:
        env = system_env
    else:
        if self.append_env:
            system_env.update(env)
            env = system_env

    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.debug(
        'Exporting the following env vars:\n%s',
        '\n'.join(f"{k}={v}" for k, v in airflow_context_vars.items()),
    )
    env.update(airflow_context_vars)
    return env
def execute(self, context: Dict):
    # Export context to make it available for callables to use.
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info(
        "Exporting the following env vars:\n%s",
        '\n'.join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]))
    os.environ.update(airflow_context_vars)

    context.update(self.op_kwargs)
    context['templates_dict'] = self.templates_dict

    self.op_kwargs = PythonOperator.determine_op_kwargs(
        self.python_callable, context, len(self.op_args))

    return_value = self.execute_callable()
    self.log.info("Done. Returned value was: %s", return_value)
    return return_value
def execute(self, context):
    if self.env is None:
        self.env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.env.update(airflow_context_vars)

    self.lineage_data = self.filename

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:

        def pre_exec():
            # Restore default signal disposition and invoke setsid
            for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig):
                    signal.signal(getattr(signal, sig), signal.SIG_DFL)
            os.setsid()

        self.log.info("Running Golang program: %s", self.filename)

        sp = Popen(['go', 'run', self.filename],
                   stdout=PIPE,
                   stderr=STDOUT,
                   cwd=tmp_dir,
                   env=self.env,
                   preexec_fn=pre_exec)

        self.sp = sp

        self.log.info("Output:")
        line = ''
        for line in iter(sp.stdout.readline, b''):
            line = line.decode(self.output_encoding).rstrip()
            self.log.info(line)
        sp.wait()
        self.log.info("Command exited with return code %s", sp.returncode)

        if sp.returncode:
            raise AirflowException("Golang program failed")

    if self.xcom_push_flag:
        return line
def execute(self, context):
    # Export context to make it available for callables to use.
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info(
        "Exporting the following env vars:\n%s",
        "\n".join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]),
    )
    os.environ.update(airflow_context_vars)

    if self.provide_context:
        context.update(self.op_kwargs)
        context["templates_dict"] = self.templates_dict
        self.op_kwargs = context

    return_value = self.execute_callable()
    self.log.info("Done. Returned value was: %s", return_value)
    return return_value
def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file, mock_mysql_hook):
    type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
    context = {}
    self.kwargs.update(dict(bulk_load=True))

    HiveToMySqlTransferOperator(**self.kwargs).execute(context=context)

    mock_tmp_file.assert_called_once_with()
    mock_hive_hook.return_value.to_csv.assert_called_once_with(
        self.kwargs['sql'],
        mock_tmp_file.return_value.name,
        delimiter='\t',
        lineterminator='\n',
        output_header=False,
        hive_conf=context_to_airflow_vars(context))
    mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
        table=self.kwargs['mysql_table'],
        tmp_file=mock_tmp_file.return_value.name)
    mock_tmp_file.return_value.close.assert_called_once_with()
def test_execute_with_hive_conf(self, mock_mysql_hook):
    context = {}
    mock_hive_hook = MockHiveServer2Hook()
    mock_hive_hook.get_records = MagicMock(return_value='test_hive_results')

    self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

    with patch(
        'airflow.providers.apache.hive.transfers.hive_to_mysql.HiveServer2Hook',
        return_value=mock_hive_hook,
    ):
        HiveToMySqlOperator(**self.kwargs).execute(context=context)

    hive_conf = context_to_airflow_vars(context)
    hive_conf.update(self.kwargs['hive_conf'])

    mock_hive_hook.get_records.assert_called_once_with(
        self.kwargs['sql'], hive_conf=hive_conf)
def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file, mock_mysql_hook):
    type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
    context = {}
    self.kwargs.update(dict(bulk_load=True))

    HiveToMySqlTransfer(**self.kwargs).execute(context=context)

    mock_tmp_file.assert_called_once_with()
    mock_hive_hook.return_value.to_csv.assert_called_once_with(
        self.kwargs['sql'],
        mock_tmp_file.return_value.name,
        delimiter='\t',
        lineterminator='\n',
        output_header=False,
        hive_conf=context_to_airflow_vars(context)
    )
    mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
        table=self.kwargs['mysql_table'],
        tmp_file=mock_tmp_file.return_value.name
    )
    mock_tmp_file.return_value.close.assert_called_once_with()
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

    self.log.info("Extracting data from Hive: %s", self.sql)
    hive_conf = context_to_airflow_vars(context)
    if self.hive_conf:
        hive_conf.update(self.hive_conf)
    if self.bulk_load:
        tmp_file = NamedTemporaryFile()
        hive.to_csv(
            self.sql,
            tmp_file.name,
            delimiter='\t',
            lineterminator='\n',
            output_header=False,
            hive_conf=hive_conf,
        )
    else:
        hive_results = hive.get_records(self.sql, hive_conf=hive_conf)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    if self.mysql_preoperator:
        self.log.info("Running MySQL preoperator")
        mysql.run(self.mysql_preoperator)

    self.log.info("Inserting rows into MySQL")
    if self.bulk_load:
        mysql.bulk_load(table=self.mysql_table, tmp_file=tmp_file.name)
        tmp_file.close()
    else:
        mysql.insert_rows(table=self.mysql_table, rows=hive_results)

    if self.mysql_postoperator:
        self.log.info("Running MySQL postoperator")
        mysql.run(self.mysql_postoperator)

    self.log.info("Done.")
def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file_context, mock_mysql_hook):
    mock_tmp_file = MagicMock()
    mock_tmp_file.name = 'tmp_file'
    mock_tmp_file_context.return_value.__enter__.return_value = mock_tmp_file
    context = {}
    self.kwargs.update(dict(bulk_load=True))

    HiveToMySqlOperator(**self.kwargs).execute(context=context)

    mock_tmp_file_context.assert_called_once_with()
    mock_hive_hook.return_value.to_csv.assert_called_once_with(
        self.kwargs['sql'],
        'tmp_file',
        delimiter='\t',
        lineterminator='\n',
        output_header=False,
        hive_conf=context_to_airflow_vars(context),
    )
    mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
        table=self.kwargs['mysql_table'], tmp_file='tmp_file')
    mock_tmp_file_context.return_value.__exit__.assert_called_once_with(None, None, None)
def execute(self, context):
    """
    Execute the R command or script in a temporary directory
    """
    # Export additional environment variables
    os.environ.update(self.env)

    # Export context as environment variables
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info(
        'Exporting the following env vars:\n%s',
        '\n'.join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]))
    os.environ.update(airflow_context_vars)

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(self.r_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info("Temporary script location: %s", script_location)
            self.log.info("Running command(s):\n%s", self.r_command)

            try:
                res = robjects.r.source(fname, echo=False)
            except RRuntimeError as e:
                self.log.error("Received R error: %s", e)
                res = None

    # This will be a pickled rpy2.robjects.vectors.ListVector
    return res
def test_context_to_airflow_vars_empty_context(self):
    self.assertDictEqual(operator_helpers.context_to_airflow_vars({}), {})
def test_context_to_airflow_vars_empty_context(self):
    assert operator_helpers.context_to_airflow_vars({}) == {}
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info('Tmp dir root location: \n %s', gettempdir())

    # Prepare env for child process.
    env = self.env
    if env is None:
        env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info(
        'Exporting the following env vars:\n%s',
        '\n'.join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]))
    env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as tmp_file:
            tmp_file.write(bytes(self.bash_command, 'utf_8'))
            tmp_file.flush()
            script_location = os.path.abspath(tmp_file.name)
            self.log.info('Temporary script location: %s', script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info('Running command: %s', self.bash_command)
            sub_process = Popen(['bash', tmp_file.name],
                                stdout=PIPE,
                                stderr=STDOUT,
                                cwd=tmp_dir,
                                env=env,
                                preexec_fn=pre_exec)

            self.sub_process = sub_process

            self.log.info('Output:')
            line = ''
            for raw_line in iter(sub_process.stdout.readline, b''):
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)

            sub_process.wait()

            self.log.info('Command exited with return code %s', sub_process.returncode)

            if sub_process.returncode:
                raise AirflowException('Bash command failed')

    return line
def execute(self, context):
    _log.info('Executing: ' + self.hql)
    self.hook = self.get_hook()
    self.hook.run_cli(hql=self.hql, schema=self.schema,
                      hive_conf=context_to_airflow_vars(context))
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info("Tmp dir root location: \n %s", gettempdir())

    # Prepare env for child process.
    env = self.env
    if env is None:
        env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.debug(
        "Exporting the following env vars:\n%s",
        "\n".join(["{}={}".format(k, v) for k, v in airflow_context_vars.items()]),
    )
    env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix="airflowtmp") as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(self.bash_command, "utf_8"))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info("Temporary script location: %s", script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info("Running command: %s", self.bash_command)
            self.sub_process = Popen(["bash", fname],
                                     stdout=PIPE,
                                     stderr=STDOUT,
                                     cwd=tmp_dir,
                                     env=env,
                                     preexec_fn=pre_exec)

            self.log.info("Output:")
            line = ""
            for line in iter(self.sub_process.stdout.readline, b""):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            self.sub_process.wait()
            self.log.info("Command exited with return code %s",
                          self.sub_process.returncode)

            if self.sub_process.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    try:
        self.log.info("Tmp dir root location: \n %s", gettempdir())

        # Prepare env for child process.
        if self.env is None:
            self.env = os.environ.copy()
        airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        self.log.info("Exporting the following env vars:\n" +
                      '\n'.join(["{}={}".format(k, v)
                                 for k, v in airflow_context_vars.items()]))
        self.env.update(airflow_context_vars)

        self.lineage_data = self.bash_command

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
                f.write(bytes(self.bash_command, 'utf_8'))
                f.flush()
                fname = f.name
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info("Running command: %s", self.bash_command)
                sp = Popen(['bash', fname],
                           stdout=PIPE,
                           stderr=STDOUT,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=pre_exec)

                self.sp = sp

                self.log.info("Output:")
                line = ''
                for line in iter(sp.stdout.readline, b''):
                    line = line.decode(self.output_encoding).rstrip()
                    self.log.info(line)
                sp.wait()
                self.log.info("Command exited with return code %s", sp.returncode)

                if sp.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
    except Exception as e:
        self.log.error("execute raised an exception: %s", e)
        # Record the failure in a per-flag marker file, then re-raise the
        # original exception so the task is still marked as failed.
        with open(str(path_flag + self.flag + ".txt"), mode='a', encoding='utf-8') as f:
            f.write(self.flag + " " + str(self.task_name) + " run failed!\n")
            f.write("error is " + str(e) + "\n")
        raise