def execute(self, context):
        """
        Run ``self.image`` as a Docker container and stream its logs to the task log.

        The image is pulled when ``self.force_pull`` is set or when it is not
        available locally. A temporary host directory is bind-mounted into the
        container at ``self.tmp_dir`` and removed once the container finished.

        :param context: task context (not used by this method).
        :return: when ``self.xcom_push_flag`` is set, the full container logs
            (``self.xcom_all``) or the last log line; otherwise ``None``.
        :raises AirflowException: when the container exits with a non-zero code.
        """
        self.log.info('Starting docker container from image %s', self.image)

        tls_config = self.__get_tls_config()

        if self.docker_conn_id:
            self.cli = self.get_hook().get_conn()
        else:
            self.cli = APIClient(base_url=self.docker_url,
                                 version=self.api_version,
                                 tls=tls_config)

        # An image name without an explicit tag is treated as ``:latest``.
        image = self.image if ':' in self.image else self.image + ':latest'

        if self.force_pull or not self.cli.images(name=image):
            self.log.info('Pulling docker image %s', image)
            for chunk in self.cli.pull(image, stream=True):
                output = json.loads(chunk.decode('utf-8'))
                # Not every message of the pull stream is guaranteed to carry
                # a 'status' key; skip those instead of raising KeyError.
                if 'status' in output:
                    self.log.info("%s", output['status'])

        cpu_shares = int(round(self.cpus * 1024))

        with TemporaryDirectory(prefix='airflowtmp') as host_tmp_dir:
            self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
            # Build a fresh list so the caller-supplied ``self.volumes`` does
            # not accumulate binds to temp dirs that no longer exist.
            binds = list(self.volumes)
            binds.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

            self.container = self.cli.create_container(
                command=self.get_command(),
                cpu_shares=cpu_shares,
                environment=self.environment,
                host_config=self.cli.create_host_config(
                    binds=binds,
                    network_mode=self.network_mode,
                    shm_size=self.shm_size,
                    dns=self.dns,
                    dns_search=self.dns_search),
                image=image,
                mem_limit=self.mem_limit,
                user=self.user,
                working_dir=self.working_dir)
            self.cli.start(self.container['Id'])

            line = ''
            for line in self.cli.logs(container=self.container['Id'],
                                      stream=True):
                line = line.strip()
                if hasattr(line, 'decode'):
                    line = line.decode('utf-8')
                self.log.info(line)

            result = self.cli.wait(self.container['Id'])
            # docker-py < 3.0 returns the exit code as an int, newer releases
            # return a dict with the code under 'StatusCode'; accept both.
            exit_code = result['StatusCode'] if isinstance(result, dict) else result
            if exit_code != 0:
                raise AirflowException('docker container failed')

            if self.xcom_push_flag:
                return self.cli.logs(container=self.container['Id']) \
                    if self.xcom_all else str(line)
    def upload_test_file(self, uri: str, file_name: str):
        """
        Write a small PySpark script to a temporary directory and copy it to
        ``uri`` with ``gsutil cp``.

        :param uri: destination handed to ``gsutil cp``.
        :param file_name: name of the script file inside the temporary directory.
        """
        script_lines = [
            "#!/usr/bin/python\n",
            "import pyspark\n",
            "sc = pyspark.SparkContext()\n",
            "rdd = sc.parallelize(['Hello,', 'world!'])\n",
            "words = sorted(rdd.collect())\n",
            "print(words)\n",
        ]
        with TemporaryDirectory(prefix="airflow-gcp") as tmp_dir:
            # 1. Create required files
            quickstart_path = os.path.join(tmp_dir, file_name)
            with open(quickstart_path, "w") as file:
                file.writelines(script_lines)

            # File modes are octal: 0o555 is r-xr-xr-x. The decimal literal 555
            # equals 0o1053, which is not a sensible permission set.
            os.chmod(quickstart_path, 0o555)

            # 2. Upload the script
            self.execute_cmd(["gsutil", "cp", quickstart_path, uri])
Пример #3
0
    def execute_callable(self):
        """
        Run the callable inside a freshly created virtualenv.

        A temporary directory holds the virtualenv together with the
        serialized arguments, the generated script and its output file; the
        deserialized content of the output file is returned.
        """
        with TemporaryDirectory(prefix='venv') as venv_dir:
            if self.templates_dict:
                self.op_kwargs['templates_dict'] = self.templates_dict

            # All exchange files live next to the virtualenv.
            args_path, result_path, string_args_path, script_path = (
                os.path.join(venv_dir, name)
                for name in ('script.in', 'script.out',
                             'string_args.txt', 'script.py'))

            # Create the virtualenv, then install requirements if there are any.
            venv_cmd = self._generate_virtualenv_cmd(venv_dir)
            self._execute_in_subprocess(venv_cmd)
            pip_cmd = self._generate_pip_install_cmd(venv_dir)
            if pip_cmd:
                self._execute_in_subprocess(pip_cmd)

            self._write_args(args_path)
            self._write_script(script_path)
            self._write_string_args(string_args_path)

            # Run the generated script with the virtualenv's interpreter.
            python_cmd = self._generate_python_cmd(
                venv_dir, script_path, args_path, result_path,
                string_args_path)
            self._execute_in_subprocess(python_cmd)
            return self._read_result(result_path)
Пример #4
0
    def execute(self, context):
        """
        Write the bash command to a script inside a temporary directory, run
        it with ``bash`` and forward every output line to the task log.

        The temporary directory is removed afterwards. The last output line
        is returned when ``self.xcom_push_flag`` is set.
        """
        self.log.info('Tmp dir root location: \n %s', gettempdir())

        # Environment of the child process: default to a copy of our own.
        if self.env is None:
            self.env = os.environ.copy()

        context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        exported = '\n'.join(
            "{}={}".format(name, value) for name, value in context_vars.items())
        self.log.info('Exporting the following env vars:\n' + exported)
        self.env.update(context_vars)

        self.lineage_data = self.bash_command

        def restore_signals_and_setsid():
            # Runs in the child before exec: reset signal dispositions to
            # their defaults and start a new session.
            for sig_name in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig_name):
                    signal.signal(getattr(signal, sig_name), signal.SIG_DFL)
            os.setsid()

        with TemporaryDirectory(prefix='airflowtmp') as work_dir, \
                NamedTemporaryFile(dir=work_dir,
                                   prefix=self.task_id) as script_file:
            script_file.write(bytes(self.bash_command, 'utf_8'))
            script_file.flush()
            self.log.info('Temporary script location: %s',
                          os.path.abspath(script_file.name))

            self.log.info('Running command: %s', self.bash_command)
            self.sub_process = Popen(['bash', script_file.name],
                                     stdout=PIPE,
                                     stderr=STDOUT,
                                     cwd=work_dir,
                                     env=self.env,
                                     preexec_fn=restore_signals_and_setsid)
            process = self.sub_process

            self.log.info('Output:')
            line = ''
            while True:
                raw_line = process.stdout.readline()
                if raw_line == b'':
                    break
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)

            process.wait()

            self.log.info('Command exited with return code %s',
                          process.returncode)

            if process.returncode:
                raise AirflowException('Bash command failed')

        if self.xcom_push_flag:
            return line
Пример #5
0
def create_task_instance_by_dag_code(dag_code, dag_name, task_name, execution_date=None):
    """
    Write ``dag_code`` to a temporary file and build a task instance from it.

    The file only exists while ``create_task_instance`` runs; it and its
    directory are removed before this function returns.
    """
    with TemporaryDirectory(prefix='dcmp_dag_') as tmp_dir, \
            NamedTemporaryFile(dir=tmp_dir) as dag_file:
        dag_file.write(dag_code.encode('UTF-8'))
        dag_file.flush()
        dag_path = os.path.join(tmp_dir, dag_file.name)
        return create_task_instance(
            dag_name,
            task_name,
            execution_date=execution_date,
            dag_folder=dag_path,
            include_examples=False,
        )
Пример #6
0
def create_dagbag_by_dag_code(dag_code):
    """
    Write ``dag_code`` to a temporary file and load a ``DagBag`` from it.

    The file and its directory are removed before this function returns.
    """
    with TemporaryDirectory(prefix='dcmp_dag_') as tmp_dir, \
            NamedTemporaryFile(dir=tmp_dir) as dag_file:
        dag_file.write(dag_code.encode('UTF-8'))
        dag_file.flush()
        dag_path = os.path.join(tmp_dir, dag_file.name)
        return DagBag(dag_folder=dag_path, include_examples=False)
Пример #7
0
    def execute(self, context):
        """
        Download the model stored at ``s3://<s3_bucket>/<s3_key>`` into a
        temporary directory and create/update it through the Firebase hook.

        :param context: task context (not used by this method).
        :return: whatever ``FirebaseHook.put_model`` returns.
        """
        # The model is named after the last path component of the S3 key.
        model_name = os.path.split(self.s3_key)[1]
        self.log.info('Model name: %s', model_name)

        s3_hook = S3Hook(self.aws_conn_id)
        firebase_hook = FirebaseHook(self.firebase_conn_id)

        with TemporaryDirectory(prefix='airflow_firebaseop_') as tmp_dir:
            # The temp dir is already unique, so the file can carry the model
            # name directly. (Passing the name positionally to
            # NamedTemporaryFile would be taken as the file *mode*.)
            model_filepath = os.path.join(tmp_dir, model_name)

            self.log.info('Download s3://%s/%s', self.s3_bucket,
                          self.s3_key)
            s3_obj = s3_hook.get_key(self.s3_key, self.s3_bucket)
            with open(model_filepath, 'wb') as model_file:
                s3_obj.download_fileobj(model_file)
            self.log.info('Model file: %s', model_filepath)

            self.log.info('Create/Update model')
            # NOTE(review): ``self`` is passed explicitly to a bound hook
            # method - confirm that put_model really expects the operator as
            # its first argument.
            model_info = firebase_hook.put_model(self, model_name,
                                                 model_filepath,
                                                 self.model_tags)

        self.log.info('Model info: %s', model_info)
        return model_info
Пример #8
0
    def execute(self, context):
        """
        Query a table, dump the rows into a temporary CSV file and let the
        parent operator's ``execute`` handle it via ``self.files``.

        :param context: task context, forwarded to the parent ``execute``.
        :raises IndexError: when the query returns no rows, because the CSV
            header is derived from the first row.
        """
        # get data, save temporarily
        dwhook = get_dwhook(self.dwh_engine)(self.dwh_conn_id)
        sql = dwhook._QUERY_TABLE.format(
            database_name=self.database,
            schema_name=self.schema,
            table_name=self.table,
        )
        self.log.info("Getting data with SQL:\n\n{0}".format(sql))
        data = dwhook.execute_and_return_result(sql, return_dict=True)
        del dwhook

        with TemporaryDirectory(prefix="senddataasmail") as tmp_dir:
            self.files = [tmp_dir + os.sep + self.filename + ".csv"]
            self.log.info(
                "temporarily writing csv file to {0}".format(
                    self.files[0],
                )
            )
            fieldnames = list(data[0].keys())
            # The csv module requires newline="" so that it controls line
            # endings itself (avoids blank rows on Windows and broken
            # embedded newlines).
            with open(self.files[0], mode="w", newline="") as csv_file:
                csvwriter = csv.DictWriter(
                    csv_file,
                    fieldnames=fieldnames,
                    delimiter=",",
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL,
                )
                csvwriter.writeheader()
                # Rows are dropped from the list as they are written. Popping
                # from the end of the reversed list keeps the original row
                # order while avoiding the O(n) cost of list.pop(0).
                data.reverse()
                while data:
                    csvwriter.writerow(data.pop())
            super().execute(context)
Пример #9
0
    def execute(self, context):
        """
        Run ``self.image`` as a Docker container and stream its logs to the log.

        A TLS configuration is built when CA certificate, client certificate
        and client key are all set. The image is pulled when
        ``self.force_pull`` is set or it is not available locally. A temporary
        host directory is bind-mounted into the container at ``self.tmp_dir``.

        :param context: task context (not used by this method).
        :return: when ``self.xcom_push_flag`` is set, the full container logs
            (``self.xcom_all``) or the last log line; otherwise ``None``.
        :raises AirflowException: when the container exits with a non-zero code.
        """
        logging.info('Starting docker container from image ' + self.image)

        tls_config = None
        if self.tls_ca_cert and self.tls_client_cert and self.tls_client_key:
            tls_config = tls.TLSConfig(ca_cert=self.tls_ca_cert,
                                       client_cert=(self.tls_client_cert,
                                                    self.tls_client_key),
                                       verify=True,
                                       ssl_version=self.tls_ssl_version,
                                       assert_hostname=self.tls_hostname)
            self.docker_url = self.docker_url.replace('tcp://', 'https://')

        self.cli = DockerAPIClient(base_url=self.docker_url,
                                   version=self.api_version,
                                   tls=tls_config)

        # An image name without an explicit tag is treated as ``:latest``.
        image = self.image if ':' in self.image else self.image + ':latest'

        if self.force_pull or not self.cli.images(name=image):
            logging.info('Pulling docker image ' + image)
            for chunk in self.cli.pull(image, stream=True):
                output = json.loads(chunk.decode('utf-8'))
                # Not every message of the pull stream is guaranteed to carry
                # a 'status' key; skip those instead of raising KeyError.
                if 'status' in output:
                    logging.info("{}".format(output['status']))

        cpu_shares = int(round(self.cpus * 1024))

        with TemporaryDirectory(prefix='airflowtmp') as host_tmp_dir:
            self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
            # Build a fresh list so the caller-supplied ``self.volumes`` does
            # not accumulate binds to temp dirs that no longer exist.
            binds = list(self.volumes)
            binds.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

            self.container = self.cli.create_container(
                command=self.get_command(),
                cpu_shares=cpu_shares,
                environment=self.environment,
                host_config=self.cli.create_host_config(
                    binds=binds, network_mode=self.network_mode),
                image=image,
                mem_limit=self.mem_limit,
                user=self.user)
            self.cli.start(self.container['Id'])

            line = ''
            for line in self.cli.logs(container=self.container['Id'],
                                      stream=True):
                line = line.strip()
                if hasattr(line, 'decode'):
                    line = line.decode('utf-8')
                logging.info(line)

            result = self.cli.wait(self.container['Id'])
            # docker-py < 3.0 returns the exit code as an int, newer releases
            # return a dict with the code under 'StatusCode'; accept both.
            exit_code = result['StatusCode'] if isinstance(result, dict) else result
            if exit_code != 0:
                raise AirflowException('docker container failed')

            if self.xcom_push_flag:
                return self.cli.logs(container=self.container['Id']
                                     ) if self.xcom_all else str(line)
Пример #10
0
    def poke(self, context):
        """
        Run the bash command from a script in a temporary directory, which is
        cleaned up afterwards, and report whether the command succeeded.

        :param context: task context (not used by this method).
        :return: ``True`` when the command exits with return code 0,
            ``False`` otherwise.
        """
        bash_command = self.bash_command
        self.log.info("Tmp dir root location: \n %s", gettempdir())
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
                f.write(bytes(bash_command, 'utf_8'))
                f.flush()
                fname = f.name
                # ``f.name`` already includes the directory, so prefixing it
                # with ``tmp_dir`` again would log a doubled, wrong path.
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)
                self.log.info("Running command: %s", bash_command)
                sp = Popen(['bash', fname],
                           stdout=PIPE,
                           stderr=STDOUT,
                           close_fds=True,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=os.setsid)

                # Stored on the instance so other methods can reach the process.
                self.sp = sp

                self.log.info("Output:")
                for raw_line in iter(sp.stdout.readline, b''):
                    self.log.info(
                        raw_line.decode(self.output_encoding).strip())
                sp.wait()
                self.log.info("Command exited with return code %s",
                              sp.returncode)

                return not sp.returncode
Пример #11
0
    def execute(self, context):
        """
        Run ``self.image`` as a Docker container and stream its output to the
        task log.

        The image is pulled when ``self.force_pull`` is set or when it is not
        available locally. A temporary host directory (created below
        ``self.host_tmp_dir``) is bind-mounted into the container at
        ``self.tmp_dir`` and removed once the container finished.

        :param context: task context (not used by this method).
        :return: when ``self.do_xcom_push`` is set, the full container logs
            (``self.xcom_all``) or the last output line encoded as UTF-8;
            otherwise ``None``.
        :raises AirflowException: when the container's status code is non-zero.
        """
        self.log.info('Starting docker container from image %s', self.image)

        tls_config = self.__get_tls_config()

        if self.docker_conn_id:
            self.cli = self.get_hook().get_conn()
        else:
            self.cli = APIClient(base_url=self.docker_url,
                                 version=self.api_version,
                                 tls=tls_config)

        if self.force_pull or not self.cli.images(name=self.image):
            self.log.info('Pulling docker image %s', self.image)
            for chunk in self.cli.pull(self.image, stream=True):
                output = json.loads(chunk.decode('utf-8').strip())
                if 'status' in output:
                    self.log.info("%s", output['status'])

        with TemporaryDirectory(prefix='airflowtmp',
                                dir=self.host_tmp_dir) as host_tmp_dir:
            self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
            # Build a fresh list so the caller-supplied ``self.volumes`` does
            # not accumulate binds to temp dirs that no longer exist.
            binds = list(self.volumes)
            binds.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

            self.container = self.cli.create_container(
                command=self.get_command(),
                environment=self.environment,
                host_config=self.cli.create_host_config(
                    auto_remove=self.auto_remove,
                    binds=binds,
                    network_mode=self.network_mode,
                    shm_size=self.shm_size,
                    dns=self.dns,
                    dns_search=self.dns_search,
                    cpu_shares=int(round(self.cpus * 1024)),
                    mem_limit=self.mem_limit),
                image=self.image,
                user=self.user,
                working_dir=self.working_dir)
            self.cli.start(self.container['Id'])

            line = ''
            for line in self.cli.attach(container=self.container['Id'],
                                        stdout=True,
                                        stderr=True,
                                        stream=True):
                line = line.strip()
                if hasattr(line, 'decode'):
                    line = line.decode('utf-8')
                self.log.info(line)

            result = self.cli.wait(self.container['Id'])
            if result['StatusCode'] != 0:
                raise AirflowException('docker container failed: ' +
                                       repr(result))

            # duplicated conditional logic because of expensive operation
            if self.do_xcom_push:
                return self.cli.logs(container=self.container['Id']) \
                    if self.xcom_all else line.encode('utf-8')
Пример #12
0
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards.

        The command is written to a temporary script and run with ``bash``.
        When ``self.input_file`` / ``self.output_file`` are set, they are
        opened with ``fopen`` and connected to the process' stdin / stdout
        through ``pipe_stream``. stderr is read line by line and logged.

        :param context: task context (not used by this method).
        :raises AirflowException: when the command exits with a non-zero
            return code.
        """
        bash_command = self.bash_command
        logging.info("tmp dir root location: \n" + gettempdir())
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                # Persist the command so that bash can read it from disk.
                f.write(bytes(bash_command, 'utf_8'))
                f.flush()
                fname = f.name
                # NOTE(review): f.name normally already contains the
                # directory, so this concatenation probably logs a doubled
                # path - confirm. The value is only used for logging.
                script_location = tmp_dir + "/" + fname
                logging.info("Temporary script "
                             "location :{0}".format(script_location))
                logging.info("Running command: " + bash_command)

                # Optional stdin source; fopen is defined elsewhere.
                input_file = None
                if self.input_file:
                    input_file = fopen(self.input_file)

                # Optional stdout sink, opened for writing.
                # NOTE(review): ``out`` is not closed in this method - confirm
                # that fopen/pipe_stream take care of it.
                out = None
                if self.output_file:
                    out = fopen(self.output_file, mode='w')

                ON_POSIX = 'posix' in sys.builtin_module_names

                # stdin/stdout are only piped when a file was configured for
                # them; stderr is always piped so it can be logged below.
                sp = Popen(['bash', fname],
                           stdin=PIPE if input_file else None,
                           stdout=PIPE if out else None,
                           stderr=PIPE,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=os.setsid,
                           bufsize=1,
                           close_fds=ON_POSIX)

                # Stored on the instance so other methods can reach the process.
                self.sp = sp

                # pipe_stream is defined elsewhere; presumably it copies data
                # from its first argument to its second - TODO confirm,
                # including whether it blocks until the source is exhausted.
                if input_file:
                    pipe_stream(input_file, sp.stdin)

                if out:
                    pipe_stream(sp.stdout, out)

                # Log stderr until EOF; lines are logged as the raw bytes
                # returned by readline, without decoding.
                for line in iter(sp.stderr.readline, b''):
                    logging.info(line)

                sp.wait()

                # NOTE(review): assumes the object returned by fopen exposes
                # ``read_key`` with a ``close(fast=...)`` method - confirm.
                if input_file:
                    input_file.read_key.close(fast=True)

                logging.info("Command exited with "
                             "return code {0}".format(sp.returncode))

                if sp.returncode:
                    raise AirflowException("Bash command failed")
Пример #13
0
    def execute(self, context):
        """
        Write the bash command to a script inside a temporary directory, run
        it with ``bash`` and forward every output line to the task log.

        The temporary directory is removed afterwards. The last output line
        is returned when ``self.xcom_push_flag`` is set.
        """
        self.log.info(f"Tmp dir root location: \n {gettempdir()}")

        # Environment of the child process: default to a copy of our own.
        env = os.environ.copy() if self.env is None else self.env
        context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        exported = "\n".join(
            f"{name}={value}" for name, value in context_vars.items())
        self.log.debug(f"Exporting the following env vars:\n{exported}")
        env.update(context_vars)

        self.lineage_data = self.bash_command

        def restore_signals_and_setsid():
            # Runs in the child before exec: reset signal dispositions to
            # their defaults and start a new session.
            for sig_name in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):
                if hasattr(signal, sig_name):
                    signal.signal(getattr(signal, sig_name), signal.SIG_DFL)
            os.setsid()

        with TemporaryDirectory(prefix="airflowtmp") as work_dir, \
                NamedTemporaryFile(dir=work_dir,
                                   prefix=self.task_id) as script_file:
            script_file.write(bytes(self.bash_command, "utf_8"))
            script_file.flush()
            script_name = script_file.name
            script_location = os.path.abspath(script_name)
            self.log.info(f"Temporary script location: {script_location}")

            self.log.info(f"Running command: {self.bash_command}")
            self.sub_process = Popen(["bash", script_name],
                                     stdout=PIPE,
                                     stderr=STDOUT,
                                     cwd=work_dir,
                                     env=env,
                                     preexec_fn=restore_signals_and_setsid)

            self.log.info("Output:")
            line = ""
            for raw_line in iter(self.sub_process.stdout.readline, b""):
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            self.sub_process.wait()
            self.log.info(
                f"Command exited with return code {self.sub_process.returncode}"
            )

            if self.sub_process.returncode:
                raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
Пример #14
0
    def execute(self, context):
        """
        Prefix the bash command with exports of the Airflow home and the
        Python path, write it to a script inside a temporary directory, run
        it with ``bash`` and forward every output line to the task log.

        The temporary directory is removed afterwards. The last output line
        is returned when ``self.xcom_push_flag`` is set.
        """
        self.log.info("Tmp dir root location: \n %s", gettempdir())

        airflow_home_value = conf.get('core', AIRFLOW_HOME_VAR)
        pythonpath_value = os.environ.get(PYTHONPATH_VAR, '')

        # The exports make both values visible to the user's command.
        bash_command = ''.join([
            'export {}={}; '.format(AIRFLOW_HOME_VAR, airflow_home_value),
            'export {}={}; '.format(PYTHONPATH_VAR, pythonpath_value),
            self.bash_command,
        ])
        self.lineage_data = bash_command

        def restore_signals_and_setsid():
            # Runs in the child before exec: reset signal dispositions to
            # their defaults and start a new session.
            for sig_name in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig_name):
                    signal.signal(getattr(signal, sig_name), signal.SIG_DFL)
            os.setsid()

        with TemporaryDirectory(prefix='airflowtmp') as work_dir, \
                NamedTemporaryFile(dir=work_dir,
                                   prefix=self.task_id) as script_file:
            script_file.write(bytes(bash_command, 'utf_8'))
            script_file.flush()
            script_name = script_file.name
            self.log.info("Temporary script location: %s",
                          os.path.abspath(script_name))

            self.log.info("Running command: %s", bash_command)
            self.sp = Popen(['bash', script_name],
                            stdout=PIPE,
                            stderr=STDOUT,
                            cwd=work_dir,
                            env=self.env,
                            preexec_fn=restore_signals_and_setsid)
            process = self.sp

            self.log.info("Output:")
            line = ''
            for raw_line in iter(process.stdout.readline, b''):
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            process.wait()
            self.log.info("Command exited with return code %s",
                          process.returncode)

            if process.returncode:
                raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
    def execute(self, context):
        """
        Run the command that task ``self.input_task_id`` pushed to XCom under
        the key ``query_embulk``.

        The command is written to a script inside a temporary directory,
        executed with ``bash`` and its output is forwarded to the task log.
        The temporary directory is removed afterwards.

        :param context: task context; ``context['ti']`` is used to pull the
            command from XCom.
        :return: the last output line when ``self.xcom_push_flag`` is set,
            otherwise ``None``.
        :raises AirflowException: when no command was found in XCom or the
            command exits with a non-zero return code.
        """
        embulk_command = context['ti'].xcom_pull(task_ids=self.input_task_id,
                                                 key='query_embulk')
        if embulk_command is None:
            # Fail with a clear message instead of the TypeError that
            # bytes(None, ...) would raise further down.
            raise AirflowException(
                "No 'query_embulk' XCom value found for task "
                "'{}'".format(self.input_task_id))

        self.log.info("Tmp dir root location: \n %s", gettempdir())

        # Prepare env for child process.
        if self.env is None:
            self.env = os.environ.copy()
        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.info("Exporting the following env vars:\n" + '\n'.join(
            ["{}={}".format(k, v) for k, v in airflow_context_vars.items()]))
        self.env.update(airflow_context_vars)

        self.lineage_data = embulk_command

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                f.write(bytes(embulk_command, 'utf_8'))
                f.flush()
                fname = f.name
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info("Running command: %s", embulk_command)
                sp = Popen(['bash', fname],
                           stdout=PIPE,
                           stderr=STDOUT,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=pre_exec)

                # Stored on the instance so other methods can reach the process.
                self.sp = sp

                self.log.info("Output:")
                line = ''
                for raw_line in iter(sp.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).rstrip()
                    self.log.info(line)
                sp.wait()
                self.log.info("Command exited with return code %s",
                              sp.returncode)

                if sp.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
Пример #16
0
    def execute(self, context):
        """
        Run the bash command via ``bash -c`` with a temporary directory as
        working directory and forward every output line to the task log.

        The temporary directory is removed afterwards. Returns the last
        output line (an empty string when there was no output).
        """
        self.log.info('Tmp dir root location: \n %s', gettempdir())

        # Environment of the child process: default to a copy of our own.
        env = os.environ.copy() if self.env is None else self.env

        context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        exported = '\n'.join(
            "{}={}".format(name, value) for name, value in context_vars.items())
        self.log.info('Exporting the following env vars:\n%s', exported)
        env.update(context_vars)

        self.lineage_data = self.bash_command

        def restore_signals_and_setsid():
            # Runs in the child before exec: reset signal dispositions to
            # their defaults and start a new session.
            for sig_name in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig_name):
                    signal.signal(getattr(signal, sig_name), signal.SIG_DFL)
            os.setsid()

        with TemporaryDirectory(prefix='airflowtmp') as work_dir:
            self.log.info('Running command: %s', self.bash_command)
            self.sub_process = Popen(['bash', "-c", self.bash_command],
                                     stdout=PIPE,
                                     stderr=STDOUT,
                                     cwd=work_dir,
                                     env=env,
                                     preexec_fn=restore_signals_and_setsid)
            process = self.sub_process

            self.log.info('Output:')
            line = ''
            while True:
                raw_line = process.stdout.readline()
                if raw_line == b'':
                    break
                line = raw_line.decode(self.output_encoding).rstrip()
                self.log.info("%s", line)

            process.wait()

            self.log.info('Command exited with return code %s',
                          process.returncode)

            if process.returncode != 0:
                raise AirflowException(
                    'Bash command failed. The command returned a non-zero exit code.'
                )

        return line
Пример #17
0
 def execute(self, context):
     """
     Run ``self._cmd`` through a ``_SparkMaster``, remember the cluster id
     from the JSON response and block until the master reports it finished.
     """
     with TemporaryDirectory(prefix='airflowtmp') as work_dir:
         # No cluster id is known yet, hence ``None``.
         spark_master = _SparkMaster(None, work_dir, self._env)
         logging.info("Running command: " + self.cmd)
         response = spark_master.run_and_get_json(self._cmd)
         new_cluster_id = response["ClusterId"]
         spark_master.cluster_id = new_cluster_id
         created_message = (
             "Cluster `%s` has been created. Waiting for execution to finish."
             % new_cluster_id)
         logging.info(created_message)
         spark_master.wait_for_finish()
Пример #18
0
    def clone_repo(self, clone_to: str, env: Optional[Dict[str, str]] = None) -> None:
        """Clone the repository into a specific location.

        Builds a list of shell commands (start an ssh-agent, optionally add
        the connection's private key, register the git host in known_hosts,
        ``git clone``) and hands it to ``run_cmd``.

        :param clone_to: Directory to clone to.
        :param env: Dictionary of environment variables to use. Defaults to
            os.environ.copy(). Requires a $HOME environment variable to work with SSH.
        :raises ValueError: if no host can be extracted from the git link.
        :raises AssertionError: if the commands exit with a non-zero status.
        """
        import shlex  # local import: only needed to quote shell arguments here

        env = env or os.environ.copy()

        git_link = self.conn.git_link
        # The host is the part between "@" and ":" of an SSH-style link.
        host_match = re.search("@(.*):", git_link)
        if host_match is None:
            raise ValueError(
                "Cannot extract the SSH host from git link {0!r}".format(git_link)
            )
        ssh_domain = host_match.group(1)

        cmd = ["eval `ssh-agent`"]
        with TemporaryDirectory(prefix="__ewah_git_") as tmp_dir:
            # temporarily save SSH key, if applicable, in this folder
            with NamedTemporaryFile(dir=tmp_dir) as ssh_key_file:
                if self.conn.private_key:
                    ssh_key_filepath = os.path.abspath(ssh_key_file.name)
                    ssh_key_file.write(self.conn.private_key.encode())
                    # Make sure the key is on disk before ssh-add reads it.
                    ssh_key_file.flush()
                    if self.conn.password:
                        passfile_path = os.path.abspath(
                            os.path.join(tmp_dir, "ssh_key_pw")
                        )
                        with open(passfile_path, "w") as passfile:
                            # Quote the password so that quotes, "$" or
                            # backticks in it cannot break the script.
                            passfile.write(
                                "#!/bin/bash\nprintf '%s\\n' {0}\n".format(
                                    shlex.quote(self.conn.password)
                                )
                            )
                        # Only the owner needs to execute the askpass helper.
                        cmd.append(
                            "chmod 700 {0}".format(shlex.quote(passfile_path))
                        )
                        # SSH_ASKPASS must point at the helper script that
                        # prints the key's passphrase.
                        cmd.append(
                            'DISPLAY=":0.0" SSH_ASKPASS={0} ssh-add {1}'.format(
                                shlex.quote(passfile_path),
                                shlex.quote(ssh_key_filepath),
                            )
                        )
                    else:
                        cmd.append(
                            "ssh-add {0}".format(shlex.quote(ssh_key_filepath))
                        )

                # Add actual clone commands
                cmd.append("mkdir -p $HOME/.ssh")

                # Must add the host to the known hosts or it will ask for confirmation
                cmd.append(
                    "ssh-keyscan -H {0} >> $HOME/.ssh/known_hosts".format(
                        shlex.quote(ssh_domain)
                    )
                )
                if self.conn.branch:
                    cmd.append(
                        "git clone -b {0} {1} {2}".format(
                            shlex.quote(self.conn.branch),
                            shlex.quote(git_link),
                            shlex.quote(clone_to),
                        )
                    )
                else:
                    cmd.append(
                        "git clone {0} {1}".format(
                            shlex.quote(git_link), shlex.quote(clone_to)
                        )
                    )

                # Execute commands! Done outside of an ``assert`` statement so
                # the commands still run under ``python -O``; the exception
                # type is kept for backwards compatibility.
                exit_code = run_cmd(cmd, env, self.log.info)
                if exit_code != 0:
                    raise AssertionError(
                        "Cloning the repository failed with exit code "
                        "{0}".format(exit_code)
                    )
Пример #19
0
 def execute(self, context):
     """
     Render the command templates, check that the cluster is ready, run the
     rendered command through a ``_SparkMaster`` and block until it finished.
     """
     with TemporaryDirectory(prefix='airflowtmp') as work_dir:
         spark_master = _SparkMaster(self.cluster_id, work_dir, self._env)

         # Each template element becomes one element of the command.
         rendered_cmd = []
         for template_source in self.template:
             rendered_cmd.append(Template(template_source).render(
                 dag_id=self.dag_id,
                 steps=self.steps,
                 cluster_id=self.cluster_id))

         logging.info("Checking status of the cluster %s" % self.cluster_id)
         spark_master.verify_cluster_is_ready()

         command_line = " ".join(rendered_cmd)
         logging.info("Running command: " + command_line)
         spark_master.run_and_get_json(rendered_cmd)

         logging.info("Job is running. Waiting for execution to finish.")
         spark_master.wait_for_finish()
Пример #20
0
 def execute(self, context):
     """Expose the configured variables as files, then run the parent task.

     For every key in ``self.variables`` the value returned by
     ``Variable.get`` is written to a file named after the key inside a
     temporary directory. That directory is appended to ``self.volumes`` as
     ``<tmp dir>:<self.mount_point>`` before delegating to the parent
     ``execute``; the directory is removed once the parent call returns.

     :param context: task context, forwarded unchanged to the parent class
     :return: whatever the parent ``execute`` returns
     """
     with TemporaryDirectory(prefix='dockervariables') as var_dir:
         for var_name in self.variables:
             content = Variable.get(var_name)
             target_path = os.path.join(var_dir, var_name)
             with open(target_path, 'w') as handle:
                 handle.write(content)

         bind_spec = '{0}:{1}'.format(var_dir, self.mount_point)
         self.volumes.append(bind_spec)
         return super().execute(context)
Пример #21
0
    def mongoclient(self):
        """Return the MongoClient for this hook, building and caching it once.

        On first use the connection arguments are assembled from
        ``self.conn``: either the connection URI or the (possibly
        SSH-tunnelled) host/port plus optional credentials, TLS flags and
        auth settings. Certificate and private key material, when present,
        is written to temporary files that only exist while the client is
        being constructed. The client is stored on ``self._mc`` and reused
        on later calls.

        :return: the cached ``MongoClient`` instance
        :raises Exception: if an SSH connection id is configured together
            with the "uri" connection style
        """
        # Fast path: the client was already created by an earlier call.
        if hasattr(self, "_mc"):
            return self._mc

        if not self.conn.ssh_conn_id:
            self.local_bind_address = (self.conn.host, self.conn.port)
        else:
            if self.conn.conn_style == "uri":
                raise Exception(
                    "Cannot have SSH tunnel with uri connection type!")
            # Open the tunnel only once, even if client creation is retried.
            if not hasattr(self, "_ssh_hook"):
                self._ssh_hook = EWAHBaseHook.get_hook_from_conn_id(
                    conn_id=self.conn.ssh_conn_id)
                self.local_bind_address = self._ssh_hook.start_tunnel(
                    self.conn.host, self.conn.port)

        client_args = {"tz_aware": True}
        if self.conn.conn_style == "uri":
            client_args["host"] = self.conn.uri
        else:
            client_args["host"] = self.local_bind_address[0]
            client_args["port"] = self.local_bind_address[1]
            if self.conn.username:
                client_args["username"] = self.conn.username
            if self.conn.password:
                client_args["password"] = self.conn.password

        with TemporaryDirectory() as scratch_dir:
            if self.conn.tls:
                client_args["tls"] = True
            # Both files are created unconditionally and removed on exit;
            # the client is constructed while they still exist.
            with NamedTemporaryFile(dir=scratch_dir) as cert_file, \
                    NamedTemporaryFile(dir=scratch_dir) as key_file:
                if self.conn.ssl_cert:
                    cert_file.write(self.conn.ssl_cert.encode())
                    cert_file.seek(0)
                    client_args["ssl_certfile"] = os.path.abspath(
                        cert_file.name)
                if self.conn.ssl_private:
                    key_file.write(self.conn.ssl_private.encode())
                    key_file.seek(0)
                    client_args["ssl_keyfile"] = os.path.abspath(
                        key_file.name)
                if self.conn.ssl_password:
                    client_args["tlsCertificateKeyFilePassword"] = \
                        self.conn.ssl_password
                if self.conn.tls_insecure:
                    client_args["tlsInsecure"] = True
                if self.conn.auth_source:
                    client_args["authSource"] = self.conn.auth_source
                if self.conn.auth_mechanism:
                    client_args["authMechanism"] = self.conn.auth_mechanism
                self._mc = MongoClient(**client_args)

        return self._mc
Пример #22
0
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards.

        The command text is written to a temporary script file that is run
        with ``bash``. stdout and stderr are merged, decoded with
        ``self.output_encoding`` and logged line by line. The running process
        is stored on ``self.sp``.

        :param context: task context; not used by this method
        :return: ``self.process_output(output)`` when ``process_output`` is
            set (``output`` is every stripped line followed by a newline);
            otherwise the last output line when ``xcom_push_flag`` is truthy;
            otherwise ``None``
        :raises AirflowException: if the command exits with a non-zero code
        """
        bash_command = self.bash_command
        self.log.info("Tmp dir root location: \n %s", gettempdir())
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                f.write(bytes(bash_command, 'utf_8'))
                f.flush()
                fname = f.name
                # f.name already includes the directory, so prepending
                # tmp_dir again would log a path that does not exist.
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ('SIGPIPE', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info("Running command: %s", bash_command)
                sp = Popen(['bash', fname],
                           stdout=PIPE,
                           stderr=STDOUT,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=pre_exec)

                self.sp = sp

                self.log.info("Output:")
                output_lines = []
                line = ''
                for raw_line in iter(sp.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).strip()
                    output_lines.append(line)
                    self.log.info(line)
                sp.wait()
                self.log.info("Command exited with return code %s",
                              sp.returncode)

                if sp.returncode:
                    raise AirflowException("Bash command failed")

        if self.process_output:
            # Join once instead of growing a string inside the read loop.
            output = ''.join(text + "\n" for text in output_lines)
            return self.process_output(output)

        if self.xcom_push_flag:
            return line
    def execute(self, context):
        """Concatenate S3 objects into one local file and load it into MySQL.

        All keys listed under ``self.s3_prefix`` in ``self.s3_bucket`` are
        downloaded, in listing order, into a single temporary file (keys
        whose content type is ``application/x-directory`` are skipped). The
        file is then loaded into ``self.mysql_database``.``self.mysql_table``
        with a ``LOAD DATA LOCAL INFILE`` statement that uses
        ``self.mysql_inseps`` as field/enclosure/line separators.

        :param context: task context; not used by this method
        :raises RuntimeError: if the S3 listing returns nothing
        """
        quoted_fields = ['`{}`'.format(name) for name in self.mysql_infields]
        mysql_infields = ','.join(quoted_fields)
        self.log.info('MySQL fields: %s', mysql_infields)

        s3 = S3Hook(self.aws_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        self.log.info('Listing files in s3://%s/%s', self.s3_bucket,
                      self.s3_prefix)
        keys = s3.list_keys(self.s3_bucket,
                            prefix=self.s3_prefix,
                            delimiter=self.s3_delimiter)
        if not keys:
            raise RuntimeError('no file to process')

        with TemporaryDirectory(prefix='airflow_mysqlloadop_') as work_dir:
            # delete=False keeps the file after it is closed; it is removed
            # together with the surrounding temporary directory.
            with NamedTemporaryFile('ab', dir=work_dir, delete=False) as sink:
                local_path = sink.name
                for key in keys:
                    self.log.info('Download s3://%s/%s', self.s3_bucket, key)

                    remote = s3.get_key(key, self.s3_bucket)
                    if remote.content_type != 'application/x-directory':
                        remote.download_fileobj(sink)
                    else:
                        self.log.info('Skip directory: s3://%s/%s',
                                      self.s3_bucket, key)

            self.log.info('MySQL infile: %s', local_path)

            load_template = '''
                LOAD DATA LOCAL INFILE '{file}'
                INTO TABLE `{database}`.`{table}`
                FIELDS TERMINATED BY '{seps[0]}'
                ENCLOSED BY '{seps[1]}'
                LINES TERMINATED BY '{seps[2]}'
                ({fields})
                ;
            '''
            statement = load_template.format(
                file=local_path,
                database=self.mysql_database,
                table=self.mysql_table,
                seps=self.mysql_inseps,
                fields=mysql_infields,
            )

            self.log.info('Execute SQL')
            mysql.run(statement)
Пример #24
0
    def _run_image(self):
        """
        Create, start and follow a Docker container for ``self.image``.

        A temporary directory created under ``self.host_tmp_dir`` is bound to
        ``self.tmp_dir`` inside the container (the bind is appended to
        ``self.volumes``). The attached stdout/stderr stream is logged line
        by line and the container's exit status is checked afterwards.

        :return: ``None`` unless ``do_xcom_push`` is set; then either the
            result of ``cli.logs`` (when ``xcom_all`` is set) or the last
            streamed line encoded as UTF-8
        :raises AirflowException: if the container exits with a non-zero
            status code
        """
        self.log.info('Starting docker container from image %s', self.image)

        with TemporaryDirectory(prefix='airflowtmp',
                                dir=self.host_tmp_dir) as host_tmp_dir:
            self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

            # The command is resolved first, then the host configuration,
            # mirroring the argument evaluation order of a single call.
            command = self.get_command()
            host_config = self.cli.create_host_config(
                auto_remove=self.auto_remove,
                binds=self.volumes,
                network_mode=self.network_mode,
                shm_size=self.shm_size,
                dns=self.dns,
                dns_search=self.dns_search,
                cpu_shares=int(round(self.cpus * 1024)),
                mem_limit=self.mem_limit,
            )
            self.container = self.cli.create_container(
                command=command,
                name=self.container_name,
                environment=self.environment,
                host_config=host_config,
                image=self.image,
                user=self.user,
                working_dir=self.working_dir,
                tty=self.tty,
            )
            container_id = self.container['Id']
            self.cli.start(container_id)

            last_line = ''
            stream = self.cli.attach(container=container_id,
                                     stdout=True,
                                     stderr=True,
                                     stream=True)
            for chunk in stream:
                last_line = chunk.strip()
                if hasattr(last_line, 'decode'):
                    last_line = last_line.decode('utf-8')
                self.log.info(last_line)

            exit_info = self.cli.wait(container_id)
            if exit_info['StatusCode'] != 0:
                raise AirflowException('docker container failed: ' +
                                       repr(exit_info))

            if not self.do_xcom_push:
                return None
            # Fetching the full log is the expensive path, so it is only
            # requested when every line must be pushed.
            if self.xcom_all:
                return self.cli.logs(container=container_id)
            return last_line.encode('utf-8')
Пример #25
0
    def create_repository_and_bucket(self):
        """Create a bucket and a repository with sample application.

        A ``quickstart.sh`` script and a ``Dockerfile`` are written to a
        temporary directory. The ``GCP_BUCKET_NAME`` bucket is created and
        the directory is uploaded to ``GCP_ARCHIVE_URL`` as a gzipped tar
        stream. The ``GCP_REPOSITORY_NAME`` source repository is then created
        and the same directory is committed and pushed to its ``master``
        branch. Every external command goes through ``self.execute_cmd``.
        """

        with TemporaryDirectory(prefix="airflow-gcp") as tmp_dir:
            # 1. Create required files
            quickstart_path = os.path.join(tmp_dir, "quickstart.sh")
            with open(quickstart_path, "w") as file:
                file.write("#!/bin/sh\n")
                file.write('echo "Hello, world! The time is $(date)."\n')

            # The mode must be an octal literal: 0o555 is r-xr-xr-x, while
            # decimal 555 is 0o1053 and leaves the owner without read or
            # execute permission.
            os.chmod(quickstart_path, 0o555)

            with open(os.path.join(tmp_dir, "Dockerfile"), "w") as file:
                file.write("FROM alpine\n")
                file.write("COPY quickstart.sh /\n")
                file.write('CMD ["/quickstart.sh"]\n')

            # 2. Prepare bucket
            self.execute_cmd(
                ["gsutil", "mb", "gs://{}".format(GCP_BUCKET_NAME)])
            self.execute_cmd([
                "bash", "-c", "tar -zcvf - -C {} . | gsutil cp -r - {}".format(
                    tmp_dir, GCP_ARCHIVE_URL)
            ])

            # 3. Prepare repo
            self.execute_cmd(
                ["gcloud", "source", "repos", "create", GCP_REPOSITORY_NAME])
            self.execute_cmd(["git", "init"], cwd=tmp_dir)
            self.execute_cmd(
                ["git", "config", "user.email", "*****@*****.**"],
                cwd=tmp_dir)
            # Must run inside the new repository like the other git calls;
            # without cwd it targets whatever directory the process is in.
            self.execute_cmd(["git", "config", "user.name", "system-test"],
                             cwd=tmp_dir)
            self.execute_cmd(
                [
                    "git", "config",
                    "credential.https://source.developers.google.com.helper",
                    "gcloud.sh"
                ],
                cwd=tmp_dir,
            )
            self.execute_cmd(["git", "add", "."], cwd=tmp_dir)
            self.execute_cmd(["git", "commit", "-m", "Initial commit"],
                             cwd=tmp_dir)
            repo_url = "https://source.developers.google.com/p/{}/r/{}".format(
                GCP_PROJECT_ID, GCP_REPOSITORY_NAME)
            self.execute_cmd(["git", "remote", "add", "origin", repo_url],
                             cwd=tmp_dir)
            self.execute_cmd(["git", "push", "origin", "master"], cwd=tmp_dir)
    def test_write_temp_file(self):
        """Check that ``_write_temp_file`` writes a header row and data rows.

        A mocked cursor with two columns and two rows is written to a file
        in a temporary directory; the file is read back with the configured
        delimiter and encoding and compared cell by cell.
        """
        filename = "some_filename"
        delimiter = '|'
        encoding = 'utf-8'

        cursor_description = [
            ('id', "<class 'cx_Oracle.NUMBER'>", 39, None, 38, 0, 0),
            ('description', "<class 'cx_Oracle.STRING'>", 60, 240, None,
             None, 1),
        ]
        cursor_rows = [[1, 'description 1'], [2, 'description 2']]

        mock_cursor = MagicMock()
        mock_cursor.description = cursor_description
        mock_cursor.__iter__.return_value = cursor_rows

        op = OracleToAzureDataLakeTransfer(
            task_id="some_test_id",
            filename=filename,
            oracle_conn_id="oracle_conn_id",
            sql="some_sql",
            sql_params={':p_data': "2018-01-01"},
            azure_data_lake_conn_id='azure_data_lake_conn_id',
            azure_data_lake_path='azure_data_lake_path',
            delimiter=delimiter,
            encoding=encoding)

        with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp:
            target = os.path.join(temp, filename)
            op._write_temp_file(mock_cursor, target)

            assert os.path.exists(target) == 1

            with open(target, 'rb') as csvfile:
                reader = csv.reader(csvfile,
                                    delimiter=delimiter,
                                    encoding=encoding)

                for rownum, row in enumerate(reader):
                    if rownum == 0:
                        # First row holds the column names.
                        expected_first = 'id'
                        expected_second = 'description'
                    else:
                        source_row = cursor_rows[rownum - 1]
                        expected_first = str(source_row[0])
                        expected_second = source_row[1]
                    self.assertEqual(row[0], expected_first)
                    self.assertEqual(row[1], expected_second)
Пример #27
0
    def execute(self, context):
        """Run ``self.bash_command`` from a script in a temporary directory.

        A command consisting only of whitespace raises a skip exception. The
        command text is recorded on ``self.lineage_data``, written to a
        temporary script and run with ``bash``; merged stdout/stderr is
        logged line by line and the process is stored on ``self.sp``.

        :param context: task context; not used by this method
        :return: the last output line when ``xcom_push_flag`` is truthy,
            otherwise ``None``
        :raises AirflowSkipException: if the command is empty after stripping
        :raises AirflowException: if the command exits with a non-zero code
        """
        LOG.info("Running command: %s" % (self.bash_command, ))
        if not self.bash_command.strip():
            raise AirflowSkipException('empty bash command script')

        self.lineage_data = self.bash_command

        def pre_exec():
            # Restore default signal disposition and invoke setsid
            for sig_name in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                if hasattr(signal, sig_name):
                    signal.signal(getattr(signal, sig_name), signal.SIG_DFL)
            os.setsid()

        with TemporaryDirectory(prefix='airflowtmp') as work_dir:
            with NamedTemporaryFile(dir=work_dir,
                                    prefix=self.task_id) as script:
                script.write(bytes(self.bash_command, 'utf_8'))
                script.flush()
                script_path = script.name
                LOG.info("Temporary script location: %s",
                         os.path.abspath(script_path))

                proc = Popen(['bash', script_path],
                             stdout=PIPE,
                             stderr=STDOUT,
                             cwd=work_dir,
                             env=self.env,
                             preexec_fn=pre_exec)
                self.sp = proc

                LOG.info("Output:")
                line = ''
                for raw_line in iter(proc.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).rstrip()
                    LOG.info(line)
                proc.wait()
                LOG.info("Command exited with return code %s", proc.returncode)

                if proc.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
Пример #28
0
    def run_cli(self, pig, pig_opts=None, verbose=True):
        """
        Run a pig script using the pig cli

        The script text is written to a temporary file and passed to ``pig``
        with ``-f``; ``self.pig_properties`` and ``pig_opts`` are split on
        whitespace and placed before it. The process is stored on
        ``self.sp``.

        :param pig: text of the pig script to run
        :param pig_opts: extra command line options as one whitespace
            separated string
        :param verbose: when true, log the command line and every output line
        :return: the combined stdout/stderr of the process, decoded as UTF-8
        :raises AirflowException: carrying the captured output, if the
            process exits with a non-zero code

        >>> ph = PigCliHook()
        >>> result = ph.run_cli("ls /;", pig_opts="-x mapreduce")
        >>> ("hdfs://" in result)
        True
        """

        with TemporaryDirectory(prefix='airflow_pigop_') as work_dir:
            with NamedTemporaryFile(dir=work_dir) as script:
                script.write(pig.encode('utf-8'))
                script.flush()

                pig_cmd = ['pig']
                if self.pig_properties:
                    pig_cmd += self.pig_properties.split()
                if pig_opts:
                    pig_cmd += pig_opts.split()
                pig_cmd += ['-f', script.name]

                if verbose:
                    self.log.info("%s", " ".join(pig_cmd))

                proc = subprocess.Popen(pig_cmd,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT,
                                        cwd=work_dir,
                                        close_fds=True)
                self.sp = proc

                decoded_chunks = []
                for raw_line in iter(proc.stdout.readline, b''):
                    decoded_chunks.append(raw_line.decode('utf-8'))
                    if verbose:
                        self.log.info(raw_line.strip())
                proc.wait()

                stdout = ''.join(decoded_chunks)
                if proc.returncode:
                    raise AirflowException(stdout)

                return stdout
    def test_should_detect_changes_in_directory(self):
        """Check that the plugin state reacts to new and modified files.

        The plugins folder setting is patched to a temporary directory that
        is filled through ``self._prepare_test_file``; the state produced by
        ``GunicornMonitor._generate_plugin_state`` is compared before and
        after each change.
        """
        with TemporaryDirectory(prefix="tmp") as tempdir:
            with mock.patch("airflow.bin.cli.settings.PLUGINS_FOLDER",
                            tempdir):
                # Second argument presumably controls the file size/content;
                # verify against _prepare_test_file.
                self._prepare_test_file("{}/file1.txt".format(tempdir), 100)
                self._prepare_test_file(
                    "{}/nested/nested/nested/nested/file2.txt".format(
                        tempdir), 200)
                self._prepare_test_file("{}/file3.txt".format(tempdir), 300)

                monitor_settings = {
                    "gunicorn_master_pid": 1,
                    "num_workers_expected": 4,
                    "master_timeout": 60,
                    "worker_refresh_interval": 60,
                    "worker_refresh_batch_size": 2,
                    "reload_on_plugin_change": True,
                }
                monitor = cli.GunicornMonitor(**monitor_settings)

                # When the files have not changed, the result should be
                # constant
                first_state = monitor._generate_plugin_state()
                repeated_state = monitor._generate_plugin_state()

                self.assertEqual(first_state, repeated_state)
                self.assertEqual(3, len(first_state))

                fourth_file = "{}/file4.txt".format(tempdir)

                # Should detect new file
                self._prepare_test_file(fourth_file, 400)
                added_state = monitor._generate_plugin_state()

                self.assertNotEqual(repeated_state, added_state)
                self.assertEqual(4, len(added_state))

                # Should detect changes in files
                self._prepare_test_file(fourth_file, 450)
                changed_state = monitor._generate_plugin_state()

                self.assertNotEqual(added_state, changed_state)
                self.assertEqual(4, len(changed_state))

                # Should support large files
                self._prepare_test_file(fourth_file, 4000000)
                large_state = monitor._generate_plugin_state()

                self.assertNotEqual(added_state, large_state)
                self.assertEqual(4, len(large_state))
Пример #30
0
    def execute(self, context):
        """
        Execute the bash command in a temporary directory which will be cleaned afterwards.

        The command is written to a temporary script that is run with
        ``bash``. When ``self.user`` differs from the current user, the
        command is wrapped in ``sudo -u <user> sh -c <command>``. Merged
        stdout/stderr is logged line by line and the process is stored on
        ``self.sp``.

        :param context: task context; not used by this method
        :return: the last output line when ``xcom_push_flag`` is truthy,
            otherwise ``None``
        :raises AirflowException: if the command exits with a non-zero code
        """
        import shlex  # local import: only this method needs shell quoting

        logging.info("tmp dir root location: \n" + gettempdir())
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            # Ensure the sudo user has perms to their current working directory for making tempfiles
            # This is not really a security flaw because the only thing in that dir is the
            # temp script, owned by the airflow user and any temp files made by the sudo user
            # and all of those will be created with the owning user's umask
            # If a process needs finer control over the tempfiles it creates, that process can chmod
            # them as they are created.
            # 0o777 is the Python 3 octal spelling; the bare 0777 form is a SyntaxError.
            os.chmod(tmp_dir, 0o777)
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                if self.user == getpass.getuser():
                    # don't try to sudo as yourself
                    script_text = self.bash_command
                else:
                    # shlex.quote keeps commands that contain single quotes
                    # intact; wrapping them in '...' by hand would end the
                    # quoted string early.
                    script_text = "sudo -u {} sh -c {}".format(
                        self.user, shlex.quote(self.bash_command))
                f.write(bytes(script_text, 'utf_8'))
                f.flush()

                logging.info('Temporary script location: {0}'.format(f.name))
                logging.info('Running command: {}'.format(self.bash_command))
                self.sp = Popen(['bash', f.name],
                                stdout=PIPE,
                                stderr=STDOUT,
                                cwd=tmp_dir,
                                env=self.env)

                logging.info('Output:')
                line = ''
                for raw_line in iter(self.sp.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).strip()
                    logging.info(line)
                self.sp.wait()
                logging.info("Command exited with return code {0}".format(
                    self.sp.returncode))

                if self.sp.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line