def execute(self, context):
    self.log.info('Starting docker container from image %s', self.image)

    tls_config = self.__get_tls_config()

    if self.docker_conn_id:
        self.cli = self.get_hook().get_conn()
    else:
        self.cli = APIClient(base_url=self.docker_url,
                             version=self.api_version,
                             tls=tls_config)

    if ':' not in self.image:
        image = self.image + ':latest'
    else:
        image = self.image

    if self.force_pull or len(self.cli.images(name=image)) == 0:
        self.log.info('Pulling docker image %s', image)
        for l in self.cli.pull(image, stream=True):
            output = json.loads(l.decode('utf-8'))
            self.log.info("%s", output['status'])

    cpu_shares = int(round(self.cpus * 1024))

    with TemporaryDirectory(prefix='airflowtmp') as host_tmp_dir:
        self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
        self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

        self.container = self.cli.create_container(
            command=self.get_command(),
            cpu_shares=cpu_shares,
            environment=self.environment,
            host_config=self.cli.create_host_config(
                binds=self.volumes,
                network_mode=self.network_mode,
                shm_size=self.shm_size,
                dns=self.dns,
                dns_search=self.dns_search),
            image=image,
            mem_limit=self.mem_limit,
            user=self.user,
            working_dir=self.working_dir)
        self.cli.start(self.container['Id'])

        line = ''
        for line in self.cli.logs(container=self.container['Id'], stream=True):
            line = line.strip()
            if hasattr(line, 'decode'):
                line = line.decode('utf-8')
            self.log.info(line)

        exit_code = self.cli.wait(self.container['Id'])
        if exit_code != 0:
            raise AirflowException('docker container failed')

        if self.xcom_push_flag:
            return self.cli.logs(container=self.container['Id']) \
                if self.xcom_all else str(line)
def upload_test_file(self, uri: str, file_name: str):
    with TemporaryDirectory(prefix="airflow-gcp") as tmp_dir:
        # 1. Create required files
        quickstart_path = os.path.join(tmp_dir, file_name)
        with open(quickstart_path, "w") as file:
            file.writelines(
                [
                    "#!/usr/bin/python\n",
                    "import pyspark\n",
                    "sc = pyspark.SparkContext()\n",
                    "rdd = sc.parallelize(['Hello,', 'world!'])\n",
                    "words = sorted(rdd.collect())\n",
                    "print(words)\n",
                ]
            )
            file.flush()
        # use an octal literal so the file really gets r-xr-xr-x permissions
        os.chmod(quickstart_path, 0o555)

        self.execute_cmd(
            [
                "gsutil",
                "cp",
                "{file}".format(file=quickstart_path),
                "{uri}".format(uri=uri),
            ]
        )
def execute_callable(self):
    with TemporaryDirectory(prefix='venv') as tmp_dir:
        if self.templates_dict:
            self.op_kwargs['templates_dict'] = self.templates_dict

        # generate filenames
        input_filename = os.path.join(tmp_dir, 'script.in')
        output_filename = os.path.join(tmp_dir, 'script.out')
        string_args_filename = os.path.join(tmp_dir, 'string_args.txt')
        script_filename = os.path.join(tmp_dir, 'script.py')

        # set up virtualenv
        self._execute_in_subprocess(self._generate_virtualenv_cmd(tmp_dir))
        cmd = self._generate_pip_install_cmd(tmp_dir)
        if cmd:
            self._execute_in_subprocess(cmd)

        self._write_args(input_filename)
        self._write_script(script_filename)
        self._write_string_args(string_args_filename)

        # execute command in virtualenv
        self._execute_in_subprocess(
            self._generate_python_cmd(tmp_dir,
                                      script_filename,
                                      input_filename,
                                      output_filename,
                                      string_args_filename))
        return self._read_result(output_filename)
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info('Tmp dir root location: \n %s', gettempdir())

    # Prepare env for child process.
    if self.env is None:
        self.env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info('Exporting the following env vars:\n' +
                  '\n'.join(["{}={}".format(k, v)
                             for k, v in airflow_context_vars.items()]))
    self.env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as tmp_file:
            tmp_file.write(bytes(self.bash_command, 'utf_8'))
            tmp_file.flush()
            script_location = os.path.abspath(tmp_file.name)
            self.log.info('Temporary script location: %s', script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info('Running command: %s', self.bash_command)
            sub_process = Popen(
                ['bash', tmp_file.name],
                stdout=PIPE,
                stderr=STDOUT,
                cwd=tmp_dir,
                env=self.env,
                preexec_fn=pre_exec)

            self.sub_process = sub_process

            self.log.info('Output:')
            line = ''
            for raw_line in iter(sub_process.stdout.readline, b''):
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)

            sub_process.wait()

            self.log.info('Command exited with return code %s',
                          sub_process.returncode)

            if sub_process.returncode:
                raise AirflowException('Bash command failed')

    if self.xcom_push_flag:
        return line
def create_task_instance_by_dag_code(dag_code, dag_name, task_name, execution_date=None):
    with TemporaryDirectory(prefix='dcmp_dag_') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir) as f:
            f.write(dag_code.encode('UTF-8'))
            f.flush()
            ti = create_task_instance(
                dag_name, task_name,
                execution_date=execution_date,
                dag_folder=os.path.join(tmp_dir, f.name),
                include_examples=False)
    return ti
def create_dagbag_by_dag_code(dag_code):
    with TemporaryDirectory(prefix='dcmp_dag_') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir) as f:
            f.write(dag_code.encode('UTF-8'))
            f.flush()
            dagbag = DagBag(dag_folder=os.path.join(tmp_dir, f.name),
                            include_examples=False)
    return dagbag
def execute(self, context):
    # the model name is the basename of the S3 key
    model_name = os.path.split(self.s3_key)[1]
    self.log.info('Model name: %s', model_name)

    s3_hook = S3Hook(self.aws_conn_id)
    firebase_hook = FirebaseHook(self.firebase_conn_id)

    with TemporaryDirectory(prefix='airflow_firebaseop_') as tmp_dir:
        with NamedTemporaryFile(model_name, dir=tmp_dir, delete=False) as tmp:
            self.log.info('Download s3://%s/%s', self.s3_bucket, self.s3_key)
            s3_obj = s3_hook.get_key(self.s3_key, self.s3_bucket)
            s3_obj.download_fileobj(tmp)
            model_filepath = tmp.name
            self.log.info('Model file: %s', model_filepath)

            self.log.info('Create/Update model')
            model_info = firebase_hook.put_model(self, model_name,
                                                 model_filepath,
                                                 self.model_tags)
            self.log.info('Model info: %s', model_info)

    return model_info
def execute(self, context):
    # get data, save temporarily
    dwhook = get_dwhook(self.dwh_engine)(self.dwh_conn_id)
    sql = dwhook._QUERY_TABLE.format(
        **{
            "database_name": self.database,
            "schema_name": self.schema,
            "table_name": self.table,
        }
    )
    self.log.info("Getting data with SQL:\n\n{0}".format(sql))
    data = dwhook.execute_and_return_result(sql, return_dict=True)
    del dwhook

    with TemporaryDirectory(prefix="senddataasmail") as tmp_dir:
        self.files = [tmp_dir + os.sep + self.filename + ".csv"]
        self.log.info(
            "temporarily writing csv file to {0}".format(self.files[0])
        )
        with open(self.files[0], mode="w") as csv_file:
            csvwriter = csv.DictWriter(
                csv_file,
                fieldnames=data[0].keys(),
                delimiter=",",
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )
            csvwriter.writeheader()
            for _ in range(len(data)):
                datum = data.pop(0)
                csvwriter.writerow(datum)

        super().execute(context)
def execute(self, context):
    logging.info('Starting docker container from image ' + self.image)

    tls_config = None
    if self.tls_ca_cert and self.tls_client_cert and self.tls_client_key:
        tls_config = tls.TLSConfig(
            ca_cert=self.tls_ca_cert,
            client_cert=(self.tls_client_cert, self.tls_client_key),
            verify=True,
            ssl_version=self.tls_ssl_version,
            assert_hostname=self.tls_hostname)
        self.docker_url = self.docker_url.replace('tcp://', 'https://')

    self.cli = DockerAPIClient(base_url=self.docker_url,
                               version=self.api_version,
                               tls=tls_config)

    if ':' not in self.image:
        image = self.image + ':latest'
    else:
        image = self.image

    if self.force_pull or len(self.cli.images(name=image)) == 0:
        logging.info('Pulling docker image ' + image)
        for l in self.cli.pull(image, stream=True):
            output = json.loads(l.decode('utf-8'))
            logging.info("{}".format(output['status']))

    cpu_shares = int(round(self.cpus * 1024))

    with TemporaryDirectory(prefix='airflowtmp') as host_tmp_dir:
        self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
        self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

        self.container = self.cli.create_container(
            command=self.get_command(),
            cpu_shares=cpu_shares,
            environment=self.environment,
            host_config=self.cli.create_host_config(
                binds=self.volumes,
                network_mode=self.network_mode),
            image=image,
            mem_limit=self.mem_limit,
            user=self.user)
        self.cli.start(self.container['Id'])

        line = ''
        for line in self.cli.logs(container=self.container['Id'], stream=True):
            line = line.strip()
            if hasattr(line, 'decode'):
                line = line.decode('utf-8')
            logging.info(line)

        exit_code = self.cli.wait(self.container['Id'])
        if exit_code != 0:
            raise AirflowException('docker container failed')

        if self.xcom_push_flag:
            return self.cli.logs(container=self.container['Id']) \
                if self.xcom_all else str(line)
def poke(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    bash_command = self.bash_command
    self.log.info("Tmp dir root location: \n %s", gettempdir())
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(bash_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = tmp_dir + "/" + fname
            self.log.info("Temporary script location: %s", script_location)
            self.log.info("Running command: %s", bash_command)
            sp = Popen(
                ['bash', fname],
                stdout=PIPE, stderr=STDOUT,
                close_fds=True, cwd=tmp_dir,
                env=self.env, preexec_fn=os.setsid)

            self.sp = sp

            self.log.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).strip()
                self.log.info(line)
            sp.wait()
            self.log.info("Command exited with return code %s", sp.returncode)

            return not sp.returncode
def execute(self, context):
    self.log.info('Starting docker container from image %s', self.image)

    tls_config = self.__get_tls_config()

    if self.docker_conn_id:
        self.cli = self.get_hook().get_conn()
    else:
        self.cli = APIClient(base_url=self.docker_url,
                             version=self.api_version,
                             tls=tls_config)

    if self.force_pull or len(self.cli.images(name=self.image)) == 0:
        self.log.info('Pulling docker image %s', self.image)
        for l in self.cli.pull(self.image, stream=True):
            output = json.loads(l.decode('utf-8').strip())
            if 'status' in output:
                self.log.info("%s", output['status'])

    with TemporaryDirectory(prefix='airflowtmp', dir=self.host_tmp_dir) as host_tmp_dir:
        self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
        self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

        self.container = self.cli.create_container(
            command=self.get_command(),
            environment=self.environment,
            host_config=self.cli.create_host_config(
                auto_remove=self.auto_remove,
                binds=self.volumes,
                network_mode=self.network_mode,
                shm_size=self.shm_size,
                dns=self.dns,
                dns_search=self.dns_search,
                cpu_shares=int(round(self.cpus * 1024)),
                mem_limit=self.mem_limit),
            image=self.image,
            user=self.user,
            working_dir=self.working_dir)
        self.cli.start(self.container['Id'])

        line = ''
        for line in self.cli.attach(container=self.container['Id'],
                                    stdout=True,
                                    stderr=True,
                                    stream=True):
            line = line.strip()
            if hasattr(line, 'decode'):
                line = line.decode('utf-8')
            self.log.info(line)

        result = self.cli.wait(self.container['Id'])
        if result['StatusCode'] != 0:
            raise AirflowException('docker container failed: ' + repr(result))

        # duplicated conditional logic because of expensive operation
        if self.do_xcom_push:
            return self.cli.logs(container=self.container['Id']) \
                if self.xcom_all else line.encode('utf-8')
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    bash_command = self.bash_command
    logging.info("tmp dir root location: \n" + gettempdir())
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(bash_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = tmp_dir + "/" + fname
            logging.info("Temporary script "
                         "location :{0}".format(script_location))
            logging.info("Running command: " + bash_command)

            input_file = None
            if self.input_file:
                input_file = fopen(self.input_file)

            out = None
            if self.output_file:
                out = fopen(self.output_file, mode='w')

            ON_POSIX = 'posix' in sys.builtin_module_names
            sp = Popen(
                ['bash', fname],
                stdin=PIPE if input_file else None,
                stdout=PIPE if out else None,
                stderr=PIPE,
                cwd=tmp_dir, env=self.env,
                preexec_fn=os.setsid,
                bufsize=1,
                close_fds=ON_POSIX)

            self.sp = sp

            if input_file:
                pipe_stream(input_file, sp.stdin)

            if out:
                pipe_stream(sp.stdout, out)

            for line in iter(sp.stderr.readline, b''):
                logging.info(line)

            sp.wait()

            if input_file:
                input_file.read_key.close(fast=True)

            logging.info("Command exited with "
                         "return code {0}".format(sp.returncode))

            if sp.returncode:
                raise AirflowException("Bash command failed")
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info(f"Tmp dir root location: \n {gettempdir()}")

    # Prepare env for child process.
    env = self.env
    if env is None:
        env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    acv_log = "\n".join([f"{k}={v}" for k, v in airflow_context_vars.items()])
    self.log.debug("Exporting the following env vars:\n"
                   f"{acv_log}")
    env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix="airflowtmp") as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(self.bash_command, "utf_8"))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info(f"Temporary script location: {script_location}")

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info(f"Running command: {self.bash_command}")
            self.sub_process = Popen(
                ["bash", fname],
                stdout=PIPE,
                stderr=STDOUT,
                cwd=tmp_dir,
                env=env,
                preexec_fn=pre_exec)

            self.log.info("Output:")
            line = ""
            for line in iter(self.sub_process.stdout.readline, b""):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)

            self.sub_process.wait()

            self.log.info(
                f"Command exited with return code {self.sub_process.returncode}"
            )

            if self.sub_process.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info("Tmp dir root location: \n %s", gettempdir())

    airflow_home_value = conf.get('core', AIRFLOW_HOME_VAR)
    pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')

    bash_command = ('export {}={}; '.format(AIRFLOW_HOME_VAR, airflow_home_value) +
                    'export {}={}; '.format(PYTHONPATH_VAR, pythonpath_value) +
                    self.bash_command)

    self.lineage_data = bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(bash_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info(
                "Temporary script location: %s",
                script_location
            )

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info("Running command: %s", bash_command)
            sp = Popen(
                ['bash', fname],
                stdout=PIPE, stderr=STDOUT,
                cwd=tmp_dir, env=self.env,
                preexec_fn=pre_exec)

            self.sp = sp

            self.log.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            sp.wait()
            self.log.info(
                "Command exited with return code %s",
                sp.returncode
            )

            if sp.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
def execute(self, context):
    embulk_command = context['ti'].xcom_pull(task_ids=self.input_task_id,
                                             key='query_embulk')
    # self.bash_command = embulk_command
    self.log.info("Tmp dir root location: \n %s", gettempdir())

    # Prepare env for child process.
    if self.env is None:
        self.env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info("Exporting the following env vars:\n" +
                  '\n'.join(["{}={}".format(k, v)
                             for k, v in airflow_context_vars.items()]))
    self.env.update(airflow_context_vars)

    self.lineage_data = embulk_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(embulk_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info("Temporary script location: %s", script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info("Running command: %s", embulk_command)
            sp = Popen(
                ['bash', fname],
                stdout=PIPE, stderr=STDOUT,
                cwd=tmp_dir, env=self.env,
                preexec_fn=pre_exec)

            self.sp = sp

            self.log.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            sp.wait()
            self.log.info("Command exited with return code %s", sp.returncode)

            if sp.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    self.log.info('Tmp dir root location: \n %s', gettempdir())

    # Prepare env for child process.
    env = self.env
    if env is None:
        env = os.environ.copy()
    airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
    self.log.info(
        'Exporting the following env vars:\n%s',
        '\n'.join(["{}={}".format(k, v)
                   for k, v in airflow_context_vars.items()]))
    env.update(airflow_context_vars)

    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:

        def pre_exec():
            # Restore default signal disposition and invoke setsid
            for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig):
                    signal.signal(getattr(signal, sig), signal.SIG_DFL)
            os.setsid()

        self.log.info('Running command: %s', self.bash_command)
        sub_process = Popen(
            ['bash', "-c", self.bash_command],
            stdout=PIPE,
            stderr=STDOUT,
            cwd=tmp_dir,
            env=env,
            preexec_fn=pre_exec)

        self.sub_process = sub_process

        self.log.info('Output:')
        line = ''
        for raw_line in iter(sub_process.stdout.readline, b''):
            line = raw_line.decode(self.output_encoding).rstrip()
            self.log.info("%s", line)

        sub_process.wait()

        self.log.info('Command exited with return code %s',
                      sub_process.returncode)

        if sub_process.returncode != 0:
            raise AirflowException(
                'Bash command failed. The command returned a non-zero exit code.'
            )

    return line
def execute(self, context):
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        master = _SparkMaster(None, tmp_dir, self._env)
        logging.info("Running command: " + self._cmd)
        result = master.run_and_get_json(self._cmd)
        cluster_id = result["ClusterId"]
        master.cluster_id = cluster_id
        logging.info("Cluster `%s` has been created. "
                     "Waiting for execution to finish." % cluster_id)
        master.wait_for_finish()
def clone_repo(self, clone_to: str, env: Optional[Dict[str, str]] = None) -> None:
    """Clone the repository into a specific location.

    :param clone_to: Directory to clone to.
    :param env: Dictionary of environment variables to use.
        Defaults to os.environ.copy(). Requires a $HOME environment variable
        to work with SSH.
    """
    env = env or os.environ.copy()
    cmd = []
    cmd.append("eval `ssh-agent`")
    with TemporaryDirectory(prefix="__ewah_git_") as tmp_dir:
        # temporarily save SSH key, if applicable, in this folder
        with NamedTemporaryFile(dir=tmp_dir) as ssh_key_file:
            if self.conn.private_key:
                ssh_key_filepath = os.path.abspath(ssh_key_file.name)
                ssh_key_file.write(self.conn.private_key.encode())
                ssh_key_file.seek(0)
                if self.conn.password:
                    passfile_path = os.path.abspath(
                        tmp_dir + os.path.sep + "ssh_key_pw"
                    )
                    with open(passfile_path, "w+") as passfile:
                        passfile.write(
                            '#!/bin/bash\necho "{0}"'.format(self.conn.password)
                        )
                        passfile.seek(0)
                    cmd.append("chmod 777 {0}".format(passfile_path))
                    cmd.append(
                        'DISPLAY=":0.0" SSH_ASKPASS="******" ssh-add {1}'.format(
                            passfile_path, ssh_key_filepath
                        )
                    )
                else:
                    cmd.append("ssh-add {0}".format(ssh_key_filepath))

            # Add actual clone commands
            git_link = self.conn.git_link
            cmd.append("mkdir -p $HOME/.ssh")
            # Must add the host to the known hosts or it will ask for confirmation
            ssh_domain = re.search("@(.*):", git_link).group(1)
            cmd.append(
                "ssh-keyscan -H {0} >> $HOME/.ssh/known_hosts".format(ssh_domain)
            )
            if self.conn.branch:
                cmd.append(
                    "git clone -b {0} {1} {2}".format(
                        self.conn.branch, git_link, clone_to
                    )
                )
            else:
                cmd.append("git clone {0} {1}".format(git_link, clone_to))

            # Execute commands!
            assert run_cmd(cmd, env, self.log.info) == 0
def execute(self, context):
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        master = _SparkMaster(self.cluster_id, tmp_dir, self._env)
        cmd = [Template(t).render(dag_id=self.dag_id,
                                  steps=self.steps,
                                  cluster_id=self.cluster_id)
               for t in self.template]
        logging.info("Checking status of the cluster %s" % self.cluster_id)
        master.verify_cluster_is_ready()
        logging.info("Running command: " + " ".join(cmd))
        master.run_and_get_json(cmd)
        logging.info("Job is running. Waiting for execution to finish.")
        master.wait_for_finish()
def execute(self, context):
    with TemporaryDirectory(prefix='dockervariables') as tmp_var_dir:
        for key in self.variables:
            value = Variable.get(key)
            with open(os.path.join(tmp_var_dir, key), 'w') as value_file:
                value_file.write(value)
        self.volumes.append('{0}:{1}'.format(tmp_var_dir, self.mount_point))
        return super().execute(context)
def mongoclient(self):
    if not hasattr(self, "_mc"):
        if self.conn.ssh_conn_id:
            if self.conn.conn_style == "uri":
                raise Exception("Cannot have SSH tunnel with uri connection type!")
            if not hasattr(self, "_ssh_hook"):
                self._ssh_hook = EWAHBaseHook.get_hook_from_conn_id(
                    conn_id=self.conn.ssh_conn_id)
                self.local_bind_address = self._ssh_hook.start_tunnel(
                    self.conn.host, self.conn.port)
        else:
            self.local_bind_address = (self.conn.host, self.conn.port)

        conn_kwargs = {"tz_aware": True}

        if self.conn.conn_style == "uri":
            conn_kwargs["host"] = self.conn.uri
        else:
            conn_kwargs["host"] = self.local_bind_address[0]
            conn_kwargs["port"] = self.local_bind_address[1]
            if self.conn.username:
                conn_kwargs["username"] = self.conn.username
            if self.conn.password:
                conn_kwargs["password"] = self.conn.password

        with TemporaryDirectory() as tmp_dir:
            if self.conn.tls:
                conn_kwargs["tls"] = True
            with NamedTemporaryFile(dir=tmp_dir) as ssl_cert:
                with NamedTemporaryFile(dir=tmp_dir) as ssl_private:
                    if self.conn.ssl_cert:
                        ssl_cert.write(self.conn.ssl_cert.encode())
                        ssl_cert.seek(0)
                        conn_kwargs["ssl_certfile"] = os.path.abspath(ssl_cert.name)
                    if self.conn.ssl_private:
                        ssl_private.write(self.conn.ssl_private.encode())
                        ssl_private.seek(0)
                        conn_kwargs["ssl_keyfile"] = os.path.abspath(ssl_private.name)
                    if self.conn.ssl_password:
                        conn_kwargs["tlsCertificateKeyFilePassword"] = self.conn.ssl_password
                    if self.conn.tls_insecure:
                        conn_kwargs["tlsInsecure"] = True
                    if self.conn.auth_source:
                        conn_kwargs["authSource"] = self.conn.auth_source
                    if self.conn.auth_mechanism:
                        conn_kwargs["authMechanism"] = self.conn.auth_mechanism
                    self._mc = MongoClient(**conn_kwargs)

    return self._mc
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards
    """
    bash_command = self.bash_command
    self.log.info("Tmp dir root location: \n %s", gettempdir())
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(bash_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = tmp_dir + "/" + fname
            self.log.info("Temporary script location: %s", script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info("Running command: %s", bash_command)
            sp = Popen(
                ['bash', fname],
                stdout=PIPE, stderr=STDOUT,
                cwd=tmp_dir, env=self.env,
                preexec_fn=pre_exec)

            self.sp = sp

            self.log.info("Output:")
            output = ''
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).strip()
                output = output + line + "\n"
                self.log.info(line)
            sp.wait()
            self.log.info("Command exited with return code %s", sp.returncode)

            if sp.returncode:
                raise AirflowException("Bash command failed")

    if self.process_output:
        return self.process_output(output)

    if self.xcom_push_flag:
        return line
def execute(self, context):
    mysql_infields = ','.join('`{}`'.format(infield) for infield in self.mysql_infields)
    self.log.info('MySQL fields: %s', mysql_infields)

    s3_hook = S3Hook(self.aws_conn_id)
    mysql_hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    self.log.info('Listing files in s3://%s/%s', self.s3_bucket, self.s3_prefix)
    s3_infiles = s3_hook.list_keys(self.s3_bucket,
                                   prefix=self.s3_prefix,
                                   delimiter=self.s3_delimiter)
    if not s3_infiles:
        raise RuntimeError('no file to process')

    with TemporaryDirectory(prefix='airflow_mysqlloadop_') as tmp_dir:
        with NamedTemporaryFile('ab', dir=tmp_dir, delete=False) as tmp:
            for s3_infile in s3_infiles:
                self.log.info('Download s3://%s/%s', self.s3_bucket, s3_infile)
                s3_obj = s3_hook.get_key(s3_infile, self.s3_bucket)
                if s3_obj.content_type == 'application/x-directory':
                    self.log.info('Skip directory: s3://%s/%s',
                                  self.s3_bucket, s3_infile)
                    continue
                s3_obj.download_fileobj(tmp)
            mysql_infile = tmp.name
        self.log.info('MySQL infile: %s', mysql_infile)

        mysql_sql_fmt = '''
            LOAD DATA LOCAL INFILE '{file}'
            INTO TABLE `{database}`.`{table}`
            FIELDS
                TERMINATED BY '{seps[0]}'
                ENCLOSED BY '{seps[1]}'
            LINES TERMINATED BY '{seps[2]}'
            ({fields})
            ;
        '''
        mysql_sql = mysql_sql_fmt.format(file=mysql_infile,
                                         database=self.mysql_database,
                                         table=self.mysql_table,
                                         seps=self.mysql_inseps,
                                         fields=mysql_infields)
        self.log.info('Execute SQL')
        mysql_hook.run(mysql_sql)
def _run_image(self):
    """
    Run a Docker container with the provided image
    """
    self.log.info('Starting docker container from image %s', self.image)

    with TemporaryDirectory(prefix='airflowtmp', dir=self.host_tmp_dir) as host_tmp_dir:
        self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

        self.container = self.cli.create_container(
            command=self.get_command(),
            name=self.container_name,
            environment=self.environment,
            host_config=self.cli.create_host_config(
                auto_remove=self.auto_remove,
                binds=self.volumes,
                network_mode=self.network_mode,
                shm_size=self.shm_size,
                dns=self.dns,
                dns_search=self.dns_search,
                cpu_shares=int(round(self.cpus * 1024)),
                mem_limit=self.mem_limit),
            image=self.image,
            user=self.user,
            working_dir=self.working_dir,
            tty=self.tty,
        )
        self.cli.start(self.container['Id'])

        line = ''
        for line in self.cli.attach(container=self.container['Id'],
                                    stdout=True,
                                    stderr=True,
                                    stream=True):
            line = line.strip()
            if hasattr(line, 'decode'):
                line = line.decode('utf-8')
            self.log.info(line)

        result = self.cli.wait(self.container['Id'])
        if result['StatusCode'] != 0:
            raise AirflowException('docker container failed: ' + repr(result))

        # duplicated conditional logic because of expensive operation
        if self.do_xcom_push:
            return self.cli.logs(container=self.container['Id']) \
                if self.xcom_all else line.encode('utf-8')
        else:
            return None
def create_repository_and_bucket(self):
    """Create a bucket and a repository with sample application."""
    with TemporaryDirectory(prefix="airflow-gcp") as tmp_dir:
        # 1. Create required files
        quickstart_path = os.path.join(tmp_dir, "quickstart.sh")
        with open(quickstart_path, "w") as file:
            file.write("#!/bin/sh\n")
            file.write('echo "Hello, world! The time is $(date)."\n')
            file.flush()
        # use an octal literal so the script really gets r-xr-xr-x permissions
        os.chmod(quickstart_path, 0o555)

        with open(os.path.join(tmp_dir, "Dockerfile"), "w") as file:
            file.write("FROM alpine\n")
            file.write("COPY quickstart.sh /\n")
            file.write('CMD ["/quickstart.sh"]\n')
            file.flush()

        # 2. Prepare bucket
        self.execute_cmd(["gsutil", "mb", "gs://{}".format(GCP_BUCKET_NAME)])
        self.execute_cmd([
            "bash", "-c",
            "tar -zcvf - -C {} . | gsutil cp -r - {}".format(tmp_dir, GCP_ARCHIVE_URL)
        ])

        # 3. Prepare repo
        self.execute_cmd(["gcloud", "source", "repos", "create", GCP_REPOSITORY_NAME])
        self.execute_cmd(["git", "init"], cwd=tmp_dir)
        self.execute_cmd(["git", "config", "user.email", "*****@*****.**"],
                         cwd=tmp_dir)
        self.execute_cmd(["git", "config", "user.name", "system-test"])
        self.execute_cmd(
            [
                "git", "config",
                "credential.https://source.developers.google.com.helper",
                "gcloud.sh",
            ],
            cwd=tmp_dir,
        )
        self.execute_cmd(["git", "add", "."], cwd=tmp_dir)
        self.execute_cmd(["git", "commit", "-m", "Initial commit"], cwd=tmp_dir)

        repo_url = "https://source.developers.google.com/p/{}/r/{}".format(
            GCP_PROJECT_ID, GCP_REPOSITORY_NAME)
        self.execute_cmd(["git", "remote", "add", "origin", repo_url], cwd=tmp_dir)
        self.execute_cmd(["git", "push", "origin", "master"], cwd=tmp_dir)
def test_write_temp_file(self):
    task_id = "some_test_id"
    sql = "some_sql"
    sql_params = {':p_data': "2018-01-01"}
    oracle_conn_id = "oracle_conn_id"
    filename = "some_filename"
    azure_data_lake_conn_id = 'azure_data_lake_conn_id'
    azure_data_lake_path = 'azure_data_lake_path'
    delimiter = '|'
    encoding = 'utf-8'
    cursor_description = [
        ('id', "<class 'cx_Oracle.NUMBER'>", 39, None, 38, 0, 0),
        ('description', "<class 'cx_Oracle.STRING'>", 60, 240, None, None, 1)
    ]
    cursor_rows = [[1, 'description 1'], [2, 'description 2']]

    mock_cursor = MagicMock()
    mock_cursor.description = cursor_description
    mock_cursor.__iter__.return_value = cursor_rows

    op = OracleToAzureDataLakeTransfer(
        task_id=task_id,
        filename=filename,
        oracle_conn_id=oracle_conn_id,
        sql=sql,
        sql_params=sql_params,
        azure_data_lake_conn_id=azure_data_lake_conn_id,
        azure_data_lake_path=azure_data_lake_path,
        delimiter=delimiter,
        encoding=encoding)

    with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp:
        op._write_temp_file(mock_cursor, os.path.join(temp, filename))

        assert os.path.exists(os.path.join(temp, filename)) == 1

        with open(os.path.join(temp, filename), 'rb') as csvfile:
            temp_file = csv.reader(csvfile, delimiter=delimiter, encoding=encoding)

            rownum = 0
            for row in temp_file:
                if rownum == 0:
                    self.assertEqual(row[0], 'id')
                    self.assertEqual(row[1], 'description')
                else:
                    self.assertEqual(row[0], str(cursor_rows[rownum - 1][0]))
                    self.assertEqual(row[1], cursor_rows[rownum - 1][1])
                rownum = rownum + 1
def execute(self, context):
    """
    if the bash command contains only spaces, then send a skip exception
    """
    LOG.info("Running command: %s" % (self.bash_command, ))
    if self.bash_command.strip() == '':
        raise AirflowSkipException('empty bash command script')

    # LOG.info("Tmp dir root location: \n %s", gettempdir())
    self.lineage_data = self.bash_command

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            f.write(bytes(self.bash_command, 'utf_8'))
            f.flush()
            fname = f.name
            script_location = os.path.abspath(fname)
            LOG.info("Temporary script location: %s", script_location)

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            # LOG.info("Running command: %s", self.bash_command)
            sp = Popen(
                ['bash', fname],
                stdout=PIPE, stderr=STDOUT,
                cwd=tmp_dir, env=self.env,
                preexec_fn=pre_exec)

            self.sp = sp

            LOG.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).rstrip()
                LOG.info(line)
            sp.wait()
            LOG.info("Command exited with return code %s", sp.returncode)

            if sp.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
def run_cli(self, pig, pig_opts=None, verbose=True):
    """
    Run a pig script using the pig cli

    >>> ph = PigCliHook()
    >>> result = ph.run_cli("ls /;", pig_opts="-x mapreduce")
    >>> ("hdfs://" in result)
    True
    """
    with TemporaryDirectory(prefix='airflow_pigop_') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir) as f:
            f.write(pig.encode('utf-8'))
            f.flush()
            fname = f.name
            pig_bin = 'pig'
            cmd_extra = []

            pig_cmd = [pig_bin]

            if self.pig_properties:
                pig_properties_list = self.pig_properties.split()
                pig_cmd.extend(pig_properties_list)
            if pig_opts:
                pig_opts_list = pig_opts.split()
                pig_cmd.extend(pig_opts_list)

            pig_cmd.extend(['-f', fname] + cmd_extra)

            if verbose:
                self.log.info("%s", " ".join(pig_cmd))
            sp = subprocess.Popen(
                pig_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                cwd=tmp_dir,
                close_fds=True)
            self.sp = sp
            stdout = ''
            for line in iter(sp.stdout.readline, b''):
                stdout += line.decode('utf-8')
                if verbose:
                    self.log.info(line.strip())
            sp.wait()

            if sp.returncode:
                raise AirflowException(stdout)

            return stdout
def test_should_detect_changes_in_directory(self):
    with TemporaryDirectory(prefix="tmp") as tempdir, \
            mock.patch("airflow.bin.cli.settings.PLUGINS_FOLDER", tempdir):
        self._prepare_test_file("{}/file1.txt".format(tempdir), 100)
        self._prepare_test_file(
            "{}/nested/nested/nested/nested/file2.txt".format(tempdir), 200)
        self._prepare_test_file("{}/file3.txt".format(tempdir), 300)

        monitor = cli.GunicornMonitor(
            gunicorn_master_pid=1,
            num_workers_expected=4,
            master_timeout=60,
            worker_refresh_interval=60,
            worker_refresh_batch_size=2,
            reload_on_plugin_change=True,
        )

        # When the files have not changed, the result should be constant
        state_a = monitor._generate_plugin_state()
        state_b = monitor._generate_plugin_state()

        self.assertEqual(state_a, state_b)
        self.assertEqual(3, len(state_a))

        # Should detect new file
        self._prepare_test_file("{}/file4.txt".format(tempdir), 400)

        state_c = monitor._generate_plugin_state()

        self.assertNotEqual(state_b, state_c)
        self.assertEqual(4, len(state_c))

        # Should detect changes in files
        self._prepare_test_file("{}/file4.txt".format(tempdir), 450)

        state_d = monitor._generate_plugin_state()

        self.assertNotEqual(state_c, state_d)
        self.assertEqual(4, len(state_d))

        # Should support large files
        self._prepare_test_file("{}/file4.txt".format(tempdir), 4000000)

        state_d = monitor._generate_plugin_state()

        self.assertNotEqual(state_c, state_d)
        self.assertEqual(4, len(state_d))
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards.
    """
    logging.info("tmp dir root location: \n" + gettempdir())
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        # Ensure the sudo user has perms to their current working directory for
        # making tempfiles. This is not really a security flaw because the only
        # thing in that dir is the temp script, owned by the airflow user, and
        # any temp files made by the sudo user, and all of those will be created
        # with the owning user's umask. If a process needs finer control over
        # the tempfiles it creates, that process can chmod them as they are
        # created. (Python 3 octal literal.)
        os.chmod(tmp_dir, 0o777)
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            if self.user == getpass.getuser():  # don't try to sudo as yourself
                f.write(bytes(self.bash_command, 'utf_8'))
            else:
                sudo_cmd = "sudo -u {} sh -c '{}'".format(self.user, self.bash_command)
                f.write(bytes(sudo_cmd, 'utf_8'))
            f.flush()

            logging.info('Temporary script location: {0}'.format(f.name))
            logging.info('Running command: {}'.format(self.bash_command))
            self.sp = Popen(
                ['bash', f.name],
                stdout=PIPE, stderr=STDOUT,
                cwd=tmp_dir, env=self.env)

            logging.info('Output:')
            line = ''
            for line in iter(self.sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).strip()
                logging.info(line)
            self.sp.wait()
            logging.info("Command exited with return code {0}".format(
                self.sp.returncode))

            if self.sp.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line