def execute(self, context):
    """Transfer one file over SFTP in the configured direction.

    An ``SSHHook`` is built from ``ssh_conn_id`` when no hook was supplied,
    and ``remote_host`` (if set) overrides the host on the hook. A GET
    operation downloads ``remote_filepath`` to ``local_filepath``; any
    other operation uploads ``local_filepath`` to ``remote_filepath``.

    :param context: task context passed by the caller (unused here).
    :return: always ``None``.
    :raises AirflowException: when neither a hook nor a connection id is
        available, or when any step of the transfer fails.
    """
    file_msg = None
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                "can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        # The client is used as a context manager so the SSH connection is
        # closed even when the transfer raises.
        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            if self.operation.lower() == SFTPOperation.GET:
                file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                    self.local_filepath)
                self.log.debug("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                file_msg = "from {0} to {1}".format(self.local_filepath,
                                                    self.remote_filepath)
                self.log.debug("Starting to transfer file %s", file_msg)
                sftp_client.put(self.local_filepath,
                                self.remote_filepath,
                                confirm=self.confirm)

    except Exception as e:
        raise AirflowException(
            "Error while transferring {0}, error: {1}".format(
                file_msg, str(e)))

    return None
def setUp(self):
    """Load the test configuration and build the fixtures used by the tests.

    Stores on the instance: the SSH hook (host key checking disabled), the
    S3 hook, an SSH client with an open SFTP client, a DAG scheduled
    ``@once``, and the bucket / SFTP path / S3 key constants.
    """
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook
    from airflow.hooks.S3_hook import S3Hook

    self.hook = SSHHook(ssh_conn_id='ssh_default')
    self.s3_hook = S3Hook('aws_default')
    self.hook.no_host_key_check = True

    default_args = dict(
        owner='airflow',
        start_date=DEFAULT_DATE,
        provide_context=True,
    )
    self.dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once',
                   default_args=default_args)
    self.dag.schedule_interval = '@once'

    self.ssh_client = self.hook.get_conn()
    self.sftp_client = self.ssh_client.open_sftp()

    self.s3_bucket = BUCKET
    self.sftp_path = SFTP_PATH
    self.s3_key = S3_KEY
def execute(self, context):
    """Delete the files in ``sftp_folder_path`` whose names match ``sftp_filename``.

    The folder is listed once, names are matched with ``fnmatch`` and each
    match is removed. A file that has vanished by the time it is removed is
    logged and skipped so the remaining matches are still deleted; any
    other ``IOError`` is re-raised.

    :param context: task context passed by the caller (unused here).
    :raises IOError: for any removal error other than "file does not exist".
    """
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()

        # Get list of files in sftp_path
        self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`")
        path_content = sftp_client.listdir(self.sftp_folder_path)
        files = [
            file for file in path_content
            if fnmatch.fnmatch(file, self.sftp_filename)
        ]

        if not files:
            self.log.info(
                f"No files found in folder that matches `{self.sftp_filename}` parameter."
            )

        for file in files:
            sftp_object = os.path.join(self.sftp_folder_path, file)
            try:
                sftp_client.remove(path=sftp_object)
            except IOError as ex:
                # IOError raised by client does not consistently use the same
                # number of arguments when raised. When a file does not exist
                # the first argument is the error code `2`. If a folder is
                # passed then only a text error is used. If a permissions
                # error occurs then the first argument is error code 13.
                #
                # We only want to handle when a file does not exist, all other
                # exceptions should be reraised to fail the Airflow task.
                if ex.args and ex.args[0] == 2:
                    self.log.info(f"File does not exist `{sftp_object}`")
                    continue
                raise
            self.log.info(f"Deleted file `{sftp_object}`")
def execute(self, context):
    """Copy the file at ``sftp_path`` on the SFTP server to ``s3_key`` in ``s3_bucket``.

    The file is downloaded into a temporary file and then uploaded with
    ``replace=True``. Failures are logged and re-raised so the task fails
    with the original error instead of terminating the interpreter.

    :param context: task context passed by the caller (unused here).
    :raises AirflowException: when a hook cannot be created, the SFTP
        session cannot be opened, or the transfer fails.
    """
    try:
        s3_hook = S3Hook(self.s3_conn_id)
        logging.info("Connected to S3 hook")
    except AirflowException:
        logging.error("Error in Connecting to S3 Hook")
        raise

    ssh_client = None
    try:
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        logging.info("Connected to SSH Hook")
        ssh_client = ssh_hook.get_conn()
        sftp_client = ssh_client.open_sftp()
        logging.info("Connecting to SFTP")
    except AirflowException:
        logging.error("Error in Connecting to SFTP")
        # Do not leave a half-opened connection behind.
        if ssh_client is not None:
            ssh_client.close()
        raise

    try:
        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)
            s3_hook.load_file(filename=f.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket,
                              replace=True)
        logging.info("SUCCEEDED")
    except AirflowException as e:
        # The error text needs a placeholder; the original passed it as a
        # stray logging argument, which itself fails to format.
        logging.error("Transfer to S3 FAILED: %s", e)
        raise
    finally:
        ssh_client.close()
def runDAP(**kwargs):
    """
    Connects to App Server via SSH and executes script, capturing and
    reporting output.

    The exit status, stdout and stderr of the remote command are always
    printed. An exception is raised when the command wrote anything to
    stderr.

    :param kwargs: accepted for the caller's convenience; not used here.
    :raises Exception: with the stderr text when the command produced any.
    """
    sshSource = SSHHook(ssh_conn_id='DAP_App_Server')
    # Same command text as before, with every backslash escaped explicitly.
    command = (
        'E:\\Airflow_Test\\DAP\\DAPConsoleProcessor.exe '
        '-config "E:\\Airflow_Test\\DAP\\Configuration\\DAPOrderCancellation.xml" '
        '-jobname "DAPOrderCancellation.xml"'
    )

    # Pre-bind everything the ``finally`` block reports, so a failure while
    # connecting or executing is not masked by an unbound-name error.
    sshConn = None
    exitStatus = None
    stdOutput = None
    errorMessage = None
    try:
        sshConn = sshSource.get_conn()
        stdIn, stdOut, stdErr = sshConn.exec_command(command=command)

        # Drain the streams before waiting for the exit status; waiting
        # first can block when the remote process produces a lot of output.
        stdOutput = stdOut.read().decode('ascii')
        errorMessage = stdErr.read().decode('ascii')
        exitStatus = stdOut.channel.recv_exit_status()

        if exitStatus == 0:
            print('DAP Started Successfully.')
        if errorMessage:
            logging.error('DAP Processor Failure. See Exception')
            raise Exception(errorMessage)
    finally:
        print('Exit Status: {}'.format(exitStatus))
        print('StdOut: {}'.format(stdOutput))
        print('StdErr: {}'.format(errorMessage))
        if sshConn:
            sshConn.close()
def setUp(self):
    """Build the SSH/S3 hooks, an open SFTP client and a ``@once`` DAG.

    Everything is stored on the test instance together with the bucket,
    SFTP path and S3 key constants.
    """
    from airflow.contrib.hooks.ssh_hook import SSHHook
    from airflow.hooks.S3_hook import S3Hook

    ssh = SSHHook(ssh_conn_id='ssh_default')
    s3 = S3Hook('aws_default')
    ssh.no_host_key_check = True

    once_dag = DAG(
        TEST_DAG_ID + 'test_schedule_dag_once',
        default_args={
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True,
        },
    )
    once_dag.schedule_interval = '@once'

    self.hook, self.s3_hook = ssh, s3
    self.ssh_client = ssh.get_conn()
    self.sftp_client = self.ssh_client.open_sftp()
    self.dag = once_dag
    self.s3_bucket, self.sftp_path, self.s3_key = BUCKET, SFTP_PATH, S3_KEY
def execute(self, context):
    """Write ``file_contents`` to ``remote_filepath`` over SFTP.

    A hook is created from ``ssh_conn_id`` unless a valid ``SSHHook`` was
    already supplied. Intermediate remote directories are created when
    ``create_intermediate_dirs`` is set, and ``file_mode`` (if not None) is
    applied to the uploaded file.

    :param context: task context passed by the caller (unused here).
    :return: ``remote_filepath``.
    :raises AirflowException: wrapping any error raised along the way.
    """
    try:
        if self.ssh_conn_id:
            hook_usable = bool(self.ssh_hook) and isinstance(self.ssh_hook, SSHHook)
            if not hook_usable:
                self.log.info("ssh_hook is not provided or invalid. "
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
            else:
                self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")

        if not self.ssh_hook:
            raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

        with self.ssh_hook.get_conn() as client:
            sftp = client.open_sftp()

            target_dir = os.path.dirname(self.remote_filepath)
            if self.create_intermediate_dirs:
                _make_intermediate_dirs(sftp_client=sftp,
                                        remote_directory=target_dir)

            self.log.info("Starting to transfer file to %s", self.remote_filepath)
            # putfo expects a file-like object, so wrap the text in memory.
            sftp.putfo(StringIO(self.file_contents), self.remote_filepath)

            if self.file_mode is not None:
                sftp.chmod(self.remote_filepath, self.file_mode)
    except Exception as e:
        message = "Error while uploading to {0}, error: {1}".format(
            self.remote_filepath, str(e))
        raise AirflowException(message)

    return self.remote_filepath
def setUp(self):
    """Create the SSH hook, a ``@once`` DAG and the file paths used by the tests."""
    from airflow.contrib.hooks.ssh_hook import SSHHook

    self.hook = SSHHook(ssh_conn_id='ssh_default')
    self.hook.no_host_key_check = True

    self.dag = DAG(
        TEST_DAG_ID + 'test_schedule_dag_once',
        default_args={'owner': 'airflow', 'start_date': DEFAULT_DATE},
    )
    self.dag.schedule_interval = '@once'

    # Directories and bare file names.
    self.test_dir = "/tmp"
    self.test_local_dir = "/tmp/tmp2"
    self.test_remote_dir = "/tmp/tmp1"
    self.test_local_filename = 'test_local_file'
    self.test_remote_filename = 'test_remote_file'

    # Paths directly inside the shared test directory.
    self.test_local_filepath = '/'.join(
        [self.test_dir, self.test_local_filename])
    self.test_remote_filepath = '/'.join(
        [self.test_dir, self.test_remote_filename])

    # Paths that go through an intermediate directory.
    self.test_local_filepath_int_dir = '/'.join(
        [self.test_local_dir, self.test_local_filename])
    self.test_remote_filepath_int_dir = '/'.join(
        [self.test_remote_dir, self.test_remote_filename])
def setUp(self):
    """Load the test configuration, then create the hook, DAG and test paths."""
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook

    ssh = SSHHook(ssh_conn_id='ssh_default')
    ssh.no_host_key_check = True

    default_args = dict(
        owner='airflow',
        start_date=DEFAULT_DATE,
        provide_context=True,
    )
    once_dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once',
                   default_args=default_args)
    once_dag.schedule_interval = '@once'

    self.hook = ssh
    self.dag = once_dag

    self.test_dir = "/tmp"
    self.test_local_dir = "/tmp/tmp2"
    self.test_remote_dir = "/tmp/tmp1"
    self.test_local_filename = 'test_local_file'
    self.test_remote_filename = 'test_remote_file'

    path_template = '{0}/{1}'
    self.test_local_filepath = path_template.format(
        self.test_dir, self.test_local_filename)
    # Local file path that goes through an intermediate directory.
    self.test_local_filepath_int_dir = path_template.format(
        self.test_local_dir, self.test_local_filename)
    self.test_remote_filepath = path_template.format(
        self.test_dir, self.test_remote_filename)
    # Remote file path that goes through an intermediate directory.
    self.test_remote_filepath_int_dir = path_template.format(
        self.test_remote_dir, self.test_remote_filename)
class SSHHookTest(unittest.TestCase):
    """Tests for ``SSHHook`` using the ``ssh_default`` connection."""

    def setUp(self):
        """Load the test configuration and create a hook without host key checks."""
        configuration.load_test_config()
        from airflow.contrib.hooks.ssh_hook import SSHHook
        hook = SSHHook(ssh_conn_id='ssh_default', keepalive_interval=10)
        hook.no_host_key_check = True
        self.hook = hook

    def test_ssh_connection(self):
        """``get_conn`` returns something other than None."""
        self.assertIsNotNone(self.hook.get_conn())

    def test_tunnel(self):
        """Data sent by a local listener is received through the tunnel."""
        print("Setting up remote listener")
        import subprocess
        import socket

        listener_cmd = ["python", "-c", HELLO_SERVER_CMD]
        self.server_handle = subprocess.Popen(listener_cmd,
                                              stdout=subprocess.PIPE)
        print("Setting up tunnel")
        with self.hook.create_tunnel(2135, 2134):
            print("Tunnel up")
            # The listener prints a 5-byte marker before the test connects.
            self.assertEqual(self.server_handle.stdout.read(5), b"ready")

            print("Connecting to server via tunnel")
            conn = socket.socket()
            conn.connect(("localhost", 2135))
            print("Receiving...")
            self.assertEqual(conn.recv(5), b"hello")
            print("Closing connection")
            conn.close()

            print("Waiting for listener...")
            self.server_handle.communicate()
            self.assertEqual(self.server_handle.returncode, 0)
            print("Closing tunnel")
def execute(self, context):
    """Download ``remote_filepath`` over SFTP and upload it to Google Cloud Storage.

    The file is staged in a temporary file and uploaded to
    ``destination_gcs_path`` in ``destination_gcs_bucket``.

    :param context: task context passed by the caller (unused here).
    :return: always ``None``.
    :raises AirflowException: when no hook or connection id is available,
        or when any step of the transfer fails.
    """
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                'can not operate without ssh_hook or ssh_conn_id')

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            with NamedTemporaryFile('wb') as temp_file:
                sftp_client.get(self.remote_filepath, temp_file.name)
                gcs_hook.upload(self.destination_gcs_bucket,
                                self.destination_gcs_path,
                                temp_file.name)

    except Exception as error_object:
        # The placeholder must be {0}: only one argument is supplied, and
        # {1} raised IndexError here, hiding the real failure.
        raise AirflowException(
            'Error while transferring. Error details: {0}'.format(
                str(error_object)))

    return None
def execute(self, context):
    """Copy files from the ADLS folder to the SFTP folder.

    Source files are the ADLS files whose base name matches
    ``source_object``. With ``reload_all`` every source file is copied;
    otherwise only those whose name is not already in ``sftp_folder_path``.
    Each file is staged in a temporary directory before being uploaded.

    :param context: task context passed by the caller (unused here).
    :raises IOError: when ``sftp_folder_path`` cannot be listed.
    """
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )

    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()

        # Get list of files in ADLS folder
        source_files = [
            os.path.split(file)[1]  # get only the file portion of the path
            for file in self._get_adls_files()
            if fnmatch.fnmatch(os.path.split(file)[1], self.source_object)
        ]
        self.log.info(f"Source Files: `{source_files}`")

        # Get list of files in sftp_path
        try:
            self.log.info(
                f"Getting list of files in sftp_path: `{self.sftp_folder_path}`"
            )
            sftp_files = sftp_client.listdir(self.sftp_folder_path)
        except IOError:
            self.log.error(
                f"The folder `{self.sftp_folder_path}` does not exist on the sftp server."
            )
            raise

        # Determine the files to be processed. If all files are to be
        # reloaded then process all files in the ADLS folder that match the
        # `source object`. Otherwise only process files whose name does not
        # currently exist in the sftp folder.
        if self.reload_all:
            files_to_process = source_files
            self.log.info(f"Files to process: `{files_to_process}`")
        else:
            self.log.info(f"Existing files in sftp folder: `{sftp_files}`")
            files_to_process = set(source_files) - set(sftp_files)
            self.log.info(f"Files to process: `{files_to_process}`")

        # create temporary folder and process files
        with tempfile.TemporaryDirectory() as temp_folder:
            for file in files_to_process:
                temp_path = os.path.join(temp_folder, file)
                adls_object = os.path.join(self.adls_folder_path, file)
                sftp_object = os.path.join(self.sftp_folder_path, file)
                self.log.info(f"Processing file: `{adls_object}`")

                self._adls_hook.download_file(
                    local_path=temp_path, remote_path=adls_object, overwrite=True
                )
                sftp_client.put(localpath=temp_path, remotepath=sftp_object)
                os.remove(temp_path)

    # Close ADLS Connection
    self._adls_hook.connection.close()
def execute(self, context):
    """Transfer one file over SFTP, optionally creating missing directories.

    A hook is created from ``ssh_conn_id`` unless a valid ``SSHHook`` was
    supplied, and ``remote_host`` (if set) overrides the host on the hook.
    A GET operation downloads ``remote_filepath`` to ``local_filepath``;
    any other operation uploads in the opposite direction. With
    ``create_intermediate_dirs`` the destination directory is created first.

    :param context: task context passed by the caller (unused here).
    :return: always ``None``.
    :raises AirflowException: wrapping any error raised along the way.
    """
    file_msg = None
    try:
        if self.ssh_conn_id:
            if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                self.log.info(
                    "ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info("ssh_hook is not provided or invalid. " +
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                "Cannot operate without ssh_hook or ssh_conn_id.")

        if self.remote_host is not None:
            self.log.info(
                "remote_host is provided explicitly. " +
                "It will replace the remote_host which was defined " +
                "in ssh_hook or predefined in connection of ssh_conn_id.")
            self.ssh_hook.remote_host = self.remote_host

        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            if self.operation.lower() == SFTPOperation.GET:
                local_folder = os.path.dirname(self.local_filepath)
                # A bare file name has an empty directory part, and
                # os.makedirs('') fails; there is nothing to create then.
                if self.create_intermediate_dirs and local_folder:
                    # Create Intermediate Directories if it doesn't exist
                    try:
                        os.makedirs(local_folder)
                    except OSError:
                        if not os.path.isdir(local_folder):
                            raise
                file_msg = "from {0} to {1}".format(
                    self.remote_filepath, self.local_filepath)
                self.log.debug("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                remote_folder = os.path.dirname(self.remote_filepath)
                if self.create_intermediate_dirs:
                    _make_intermediate_dirs(
                        sftp_client=sftp_client,
                        remote_directory=remote_folder,
                    )
                file_msg = "from {0} to {1}".format(
                    self.local_filepath, self.remote_filepath)
                self.log.debug("Starting to transfer file %s", file_msg)
                sftp_client.put(self.local_filepath,
                                self.remote_filepath,
                                confirm=self.confirm)

    except Exception as e:
        raise AirflowException(
            "Error while transferring {0}, error: {1}".format(
                file_msg, str(e)))

    return None
def check_for_file_py(**kwargs):
    """Print and log the name of every entry in an SFTP directory.

    :param kwargs: reads ``path`` (directory to list) and ``sftp_conn_id``
        (connection id for the SSH hook); both default to ``None``.
    """
    path = kwargs.get('path')
    sftp_conn_id = kwargs.get('sftp_conn_id')

    ssh_hook = SSHHook(ssh_conn_id=sftp_conn_id)
    # The listing is fetched inside the context manager so the SSH
    # connection is closed as soon as it is no longer needed.
    with ssh_hook.get_conn() as ssh_client:
        ftp_files = ssh_client.open_sftp().listdir(path)

    for filename in ftp_files:
        print(filename)
        logging.info('Filename: %s', filename)
def execute(self, context):
    """Download every file in ``sftp_path`` and upload them to S3 in parallel.

    Each file is downloaded into its own temporary file, with up to five
    attempts and a jittered exponential back-off between them; a fresh SSH
    connection is opened before each retry. The temporary files are then
    uploaded to ``dest_bucket`` under ``dest_path`` using a pool of
    ``workers`` processes.

    :param context: task context passed by the caller (unused here).
    :return: ``False`` when the directory is empty, otherwise the list of
        remote file paths that were processed.
    :raises Exception: the last download error once all attempts failed.
    """
    self.log.info("Going to start Bulk sftp to s3 operator")
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    sftp_hook.no_host_key_check = True
    list_dir = sftp_hook.list_directory(self.sftp_path)
    if len(list_dir) < 1:
        self.log.info("Got no files to process. Skipping")
        return False
    self.log.info(f"Got {len(list_dir)} files to move")

    max_attempts = 5
    temp_files = []
    file_path_list = []
    open_handles = []  # every temp file created, closed in ``finally``

    ssh_client = SSHHook(ssh_conn_id=self.sftp_conn_id).get_conn()
    try:
        sftp_client = ssh_client.open_sftp()
        s3_hook = S3Hook(self.aws_conn_id)

        for file_name in list_dir:
            file_path = os.path.join(self.sftp_path, file_name)
            file_path_list.append(file_path)
            s3_key = str(os.path.join(self.dest_path, file_name))
            handle = NamedTemporaryFile("w")
            open_handles.append(handle)
            file_metadata = {"ftp": handle, "s3_key": s3_key}

            for i in range(max_attempts):
                try:
                    self.log.info(f"Downloading {file_path}")
                    sftp_client.get(file_path, file_metadata["ftp"].name)
                    file_metadata["ftp"].flush()
                    temp_files.append(file_metadata)
                    break
                except Exception:
                    self.log.info(
                        f"Got no response from server, waiting for next try number {(i + 1)}"
                    )
                    if i < max_attempts - 1:
                        time.sleep(2 ** i + random.random())
                        # Drop the old connection before reconnecting so
                        # retries do not pile up open SSH sessions.
                        ssh_client.close()
                        ssh_client = SSHHook(
                            ssh_conn_id=self.sftp_conn_id).get_conn()
                        sftp_client = ssh_client.open_sftp()
                    else:
                        raise

        self.log.info(f"Uploading to S3 with {self.workers} workers")
        with Pool(self.workers) as pool:
            pool.starmap(
                s3_hook.load_file,
                [
                    (x["ftp"].name, x["s3_key"], self.dest_bucket, True, False)
                    for x in temp_files
                ],
            )
    finally:
        # Temp files must outlive the upload, so they are closed only here.
        for handle in open_handles:
            handle.close()
        ssh_client.close()

    self.log.info("Finished executing Bulk sftp to s3 operator")
    return file_path_list
def execute(self, context):
    """Copy one S3 object to ``sftp_path`` on the SFTP server.

    ``s3_key`` is first passed through ``get_s3_key``. The object is
    downloaded into a temporary file and then uploaded over SFTP.

    :param context: task context passed by the caller (unused here).
    """
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)
    s3_client = s3_hook.get_conn()

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        with NamedTemporaryFile("w") as f:
            s3_client.download_file(self.s3_bucket, self.s3_key, f.name)
            sftp_client.put(f.name, self.sftp_path)
def execute(self, context):
    """Run ``command`` on the remote host over SSH and log its output.

    A pseudo-terminal is requested when the command starts with ``sudo``.
    Each line of stdout is logged as it arrives.

    :param context: task context passed by the caller (unused here).
    :return: when the command exits with status 0 and ``do_xcom_push`` is
        set, the collected stdout (raw bytes if XCom pickling is enabled,
        otherwise a base64 string); ``True`` when it exits with status 0
        and ``do_xcom_push`` is not set.
    :raises AirflowException: when no hook or command is configured, the
        command exits with a non-zero status, or any other error occurs.
    """
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                "can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        # The context manager closes the SSH connection on every exit path.
        with self.ssh_hook.get_conn() as ssh_client:
            if not self.command:
                raise AirflowException(
                    "no command specified so nothing to execute here.")

            # Auto apply tty when its required in case of sudo
            get_pty = self.command.startswith('sudo')

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(
                command=self.command,
                get_pty=get_pty,
                timeout=self.timeout)
            stdin.close()

            # Collect chunks and join once rather than growing a bytes
            # object line by line.
            chunks = []
            for line in stdout:
                chunks.append(line.encode('utf-8'))
                self.log.info(line.strip('\n'))
            output = b''.join(chunks)

            exit_status = stdout.channel.recv_exit_status()
            # Compare by value; ``is 0`` only works by accident of small
            # integer caching.
            if exit_status == 0:
                # only returning on output if do_xcom_push is set
                # otherwise its not suppose to be disclosed
                if self.do_xcom_push:
                    enable_pickling = configuration.getboolean(
                        'core', 'enable_xcom_pickling')
                    if enable_pickling:
                        return output
                    else:
                        return b64encode(output).decode('utf-8')
            else:
                error_msg = stderr.read()
                raise AirflowException(
                    "error running cmd: {0}, error: {1}".format(
                        self.command, error_msg))

    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
def get_crawler_report() -> str:
    """Get crawler report.

    Runs ``cat /bigcrawler-scrapy/summary.txt`` inside the container whose
    name matches ``bigscrapy_projects_airflow`` on the ``ssh_big_airflow``
    host, prints the captured stdout and returns it.
    """
    ssh = SSHHook(ssh_conn_id='ssh_big_airflow')

    # The output is read inside the context manager; the SSH connection is
    # closed as soon as the block exits.
    with ssh.get_conn() as client:
        _stdin, stdout, _stderr = client.exec_command("""
            docker exec `docker ps --filter name=bigscrapy_projects_airflow -q` \
            sh -c 'cat /bigcrawler-scrapy/summary.txt'
        """)
        message = "".join(stdout.readlines())

    print(f'crawler_report: {message}')
    return message
def setUp(self):
    """Create an SSH hook without host key checks and a ``@once`` DAG."""
    from airflow.contrib.hooks.ssh_hook import SSHHook

    self.hook = SSHHook(ssh_conn_id='ssh_default')
    self.hook.no_host_key_check = True

    default_args = dict(owner='airflow', start_date=DEFAULT_DATE)
    self.dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once',
                   default_args=default_args)
    self.dag.schedule_interval = '@once'
def execute(self, context):
    """Copy matching files from the SFTP folder to the ADLS folder.

    Files in ``sftp_folder_path`` whose names match ``sftp_filename`` are
    candidates. With ``reload_all`` every candidate is copied (overwriting
    in ADLS); otherwise only those whose name is not already present in
    ADLS. Entries that cannot be downloaded are logged and skipped.

    :param context: task context passed by the caller (unused here).
    """
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )

    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()

        # Get list of files in sftp_path
        self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`")
        path_content = sftp_client.listdir(self.sftp_folder_path)
        files = [
            file for file in path_content
            if fnmatch.fnmatch(file, self.sftp_filename)
        ]

        # Get files that already exist in the ADLS folder
        existing_files = self._get_adls_files()

        # Determine the files to be processed. If all files are to be
        # reloaded then process all files in the sftp file list. Otherwise
        # only process files whose name does not currently exist in the
        # ADLS folder.
        if self.reload_all:
            files_to_process = files
        else:
            existing_set = {os.path.split(filename)[1] for filename in existing_files}
            files_to_process = set(files) - existing_set
            # Logged inside this branch because ``existing_set`` is only
            # defined here.
            self.log.info(f"Existing files in ADLS: `{existing_set}`")

        self.log.info(f"Files to process: `{files_to_process}`")

        # create temporary folder and process files
        with tempfile.TemporaryDirectory() as temp_folder:
            for file in files_to_process:
                temp_path = os.path.join(temp_folder, file)
                adls_object = os.path.join(self.adls_folder_path, file)
                sftp_object = os.path.join(self.sftp_folder_path, file)
                self.log.info(f"Processing: `{sftp_object}`")

                # Only the download is guarded: an IOError here is treated
                # as "this entry is a directory". Upload failures must
                # propagate rather than be reported as a skipped directory.
                try:
                    sftp_client.get(sftp_object, temp_path)
                except IOError:
                    self.log.info(f"Skipping directory `{sftp_object}`.")
                    continue

                self._adls_hook.upload_file(
                    local_path=temp_path,
                    remote_path=adls_object,
                    overwrite=self.reload_all,
                )
                os.remove(temp_path)

    # Close ADLS Connection
    self._adls_hook.connection.close()
def test_conn_with_extra_parameters(self):
    """Extras stored on the connection end up as attributes of the hook."""
    from airflow.contrib.hooks.ssh_hook import SSHHook

    connection = models.Connection(
        conn_id='ssh_with_extra',
        host='localhost',
        conn_type='ssh',
        extra='{"compress" : true, "no_host_key_check" : "true"}',
    )
    db.merge_conn(connection)

    hook = SSHHook(ssh_conn_id='ssh_with_extra', keepalive_interval=10)
    hook.get_conn()

    self.assertEqual(hook.compress, True)
    self.assertEqual(hook.no_host_key_check, True)
def execute(self, context):
    """Copy the file at ``sftp_path`` to ``s3_key`` in ``s3_bucket``.

    ``s3_key`` is first passed through ``get_s3_key``. The file is staged
    in a temporary file and uploaded with ``replace=True``.

    :param context: task context passed by the caller (unused here).
    """
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)
            s3_hook.load_file(filename=f.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket,
                              replace=True)
def setUp(self):
    """Load the test configuration and create a default hook and ``@once`` DAG."""
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook

    self.hook = SSHHook()
    self.hook.no_host_key_check = True

    default_args = dict(
        owner='airflow',
        start_date=DEFAULT_DATE,
        provide_context=True,
    )
    self.dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once',
                   default_args=default_args)
    self.dag.schedule_interval = '@once'
def execute(self, context):
    """Download ``gcs_dest`` from ``gcs_bucket`` and upload it to ``sftp_dest_path``.

    The object is staged in a temporary file and then uploaded over SFTP
    with ``confirm=True``.

    :param context: task context passed by the caller (unused here).
    """
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    gcs_hook = GoogleCloudStorageHook(self.google_cloud_storage_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        with NamedTemporaryFile("w") as f:
            filename = f.name
            gcs_hook.download(bucket=self.gcs_bucket,
                              object=self.gcs_dest,
                              filename=filename)
            file_msg = "from {0} to {1}".format(filename, self.sftp_dest_path)
            self.log.info("Starting to transfer file %s", file_msg)
            sftp_client.put(filename, self.sftp_dest_path, confirm=True)
def setUp(self):
    """Prepare a default SSH hook (no host key check) and a ``@once`` DAG."""
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook

    ssh = SSHHook()
    ssh.no_host_key_check = True

    once_dag = DAG(
        TEST_DAG_ID + 'test_schedule_dag_once',
        default_args={
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True,
        },
    )
    once_dag.schedule_interval = '@once'

    self.hook, self.dag = ssh, once_dag
def execute(self, context):
    """Copy every S3 object under ``s3_path`` into ``sftp_path`` on the SFTP server.

    Each object is staged in its own temporary file and uploaded under the
    last ``/``-separated component of its key.

    :param context: task context passed by the caller (unused here).
    """
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)
    # NOTE(review): list_keys appears to return None rather than an empty
    # list when nothing matches in some hook versions - confirm; ``or []``
    # handles both.
    s3_files = s3_hook.list_keys(bucket_name=self.s3_bucket,
                                 prefix=self.s3_path) or []
    s3_client = s3_hook.get_conn()

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        for key in s3_files:
            file_name = key.split("/")[-1]
            with NamedTemporaryFile("w") as f:
                s3_client.download_file(self.s3_bucket, key, f.name)
                sftp_client.put(f.name, os.path.join(self.sftp_path, file_name))
def test_ssh_connection_without_password(self, ssh_mock):
    """A key-file hook calls ``connect`` once with the key file and no password."""
    hook = SSHHook(
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
        key_file='fake.file',
    )
    expected_connect_kwargs = dict(
        hostname='remote_host',
        username='******',
        key_filename='fake.file',
        timeout=10,
        compress=True,
        port='port',
        sock=None,
    )

    with hook.get_conn():
        connect = ssh_mock.return_value.connect
        connect.assert_called_once_with(**expected_connect_kwargs)
def get_sub_ssh_cmds_dag(parent_dag, task_id, args):
    """Build a sub-DAG with one SSH task per ``.hql`` script found in S3.

    The scripts are listed under the configured ``s3_hive_script_location``
    prefix of ``s3_bucket``. Each key ending in ``hql`` becomes an
    ``SSHExecuteOperator`` that runs ``hive -f`` on it, placed between the
    ``ssh_start`` and ``ssh_end`` dummy tasks.

    :param parent_dag: DAG that provides the id prefix and schedule interval.
    :param task_id: suffix used to form the sub-DAG id.
    :param args: default args for the sub-DAG; must contain ``start_date``.
    :return: the sub-DAG. When no scripts are listed it contains only the
        two unconnected dummy tasks.
    """
    ssh_dag = DAG(
        '%s.%s' % (parent_dag.dag_id, task_id),
        default_args=args,
        start_date=args['start_date'],
        schedule_interval=parent_dag.schedule_interval,
    )
    start = DummyOperator(task_id='ssh_start', dag=ssh_dag)
    end = DummyOperator(task_id='ssh_end', dag=ssh_dag)

    # generate the task to submit dynamically depending on the number of
    # hive script that needs to be run
    bucket = wk_conf.get('s3_bucket')
    response = s3_client.list_objects_v2(
        Bucket=bucket,
        Prefix=wk_conf.get('s3_hive_script_location'))
    # 'Contents' is absent from the response when nothing matches the
    # prefix; treat that as an empty listing instead of iterating None.
    hive_scripts = [c.get('Key') for c in response.get('Contents') or []]

    if hive_scripts:
        ssh_emr_hook = SSHHook(conn_id='ssh_emr_default')
        ssh_tasks = [
            SSHExecuteOperator(
                task_id=str(key.replace(':', '_').replace('/', '_')),
                ssh_hook=ssh_emr_hook,
                bash_command='hive -f "s3://' + bucket + '/' + str(key) + '"',
                dag=ssh_dag)
            for key in hive_scripts
            if key.endswith('hql')
        ]
        start.set_downstream(ssh_tasks)
        end.set_upstream(ssh_tasks)

    # if no hive scripts generated, short circuit step in the beginning of
    # main dag
    return ssh_dag
def execute(self, context):
    """Transfer one file over SFTP in the configured direction.

    A GET operation downloads ``remote_filepath`` to ``local_filepath``;
    any other operation uploads ``local_filepath`` to ``remote_filepath``.

    :param context: task context passed by the caller (unused here).
    :return: always ``None``.
    :raises AirflowException: when neither a hook nor a connection id is
        available, or when any step of the transfer fails.
    """
    file_msg = None
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        # The context manager closes the SSH connection even when the
        # transfer raises.
        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            if self.operation.lower() == SFTPOperation.GET:
                file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                    self.local_filepath)
                # Lazy %-style arguments: nothing is formatted unless debug
                # logging is enabled.
                logging.debug("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                file_msg = "from {0} to {1}".format(self.local_filepath,
                                                    self.remote_filepath)
                logging.debug("Starting to transfer file %s", file_msg)
                sftp_client.put(self.local_filepath, self.remote_filepath)

    except Exception as e:
        raise AirflowException("Error while transferring {0}, error: {1}"
                               .format(file_msg, str(e)))

    return None
def execute(self, context):
    """Upload the file at ``sftp_path`` to ``s3_key`` in ``s3_bucket``.

    ``s3_key`` is first passed through ``get_s3_key``. The file is
    downloaded into a temporary file and uploaded with ``replace=True``.

    :param context: task context passed by the caller (unused here).
    """
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    # The context manager closes the SSH connection on every exit path.
    with ssh_hook.get_conn() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)
            s3_hook.load_file(
                filename=f.name,
                key=self.s3_key,
                bucket_name=self.s3_bucket,
                replace=True,
            )
def execute(self, context):
    """Run the SSH command on the HDInsight cluster's SSH endpoint.

    SSH credentials are read from the ``SSH_USER_NAME`` and
    ``SSH_PASSWORD`` extras of the Azure connection. The endpoint named
    ``SSH`` in the cluster's connectivity endpoints supplies host and port
    for an ``SSHHook``, and execution is then delegated to the parent class.

    :param context: task context, forwarded to the parent ``execute``.
    :raises AirflowException: when the SSH endpoint of the HDI cluster
        cannot be found
    """
    azure_hook = AzureHDInsightHook(azure_conn_id=self.azure_conn_id)
    azure_conn_opts = azure_hook.get_connection(
        self.azure_conn_id).extra_dejson
    ssh_username = azure_conn_opts['SSH_USER_NAME']
    ssh_password = azure_conn_opts['SSH_PASSWORD']

    state = azure_hook.get_cluster_state(self.cluster_name)

    # Pre-bind both names so a cluster without an SSH endpoint reaches the
    # explicit check below instead of failing with an unbound local.
    ssh_endpoint = None
    ssh_port = None
    for endpoint in state.connectivity_endpoints:
        if endpoint.name == 'SSH':
            ssh_endpoint = endpoint.location
            ssh_port = endpoint.port

    if not ssh_endpoint:
        raise AirflowException(
            "Could not find SSH endpoint for cluster {}".format(
                self.cluster_name))

    self.ssh_hook = SSHHook(remote_host=ssh_endpoint,
                            port=ssh_port,
                            username=ssh_username,
                            password=ssh_password)
    self.log.info("Running SSH command on cluster (%s): %s",
                  self.cluster_name, self.command)
    super(AzureHDInsightSshOperator, self).execute(context)
def test_tunnel_without_password(self, ssh_mock):
    """A key-file hook builds the tunnel with the private key and no password."""
    hook = SSHHook(
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
        key_file='fake.file',
    )

    with hook.get_tunnel(1234):
        expected_kwargs = dict(
            ssh_port='port',
            ssh_username='******',
            ssh_pkey='fake.file',
            ssh_proxy=None,
            local_bind_address=('localhost', ),
            remote_bind_address=('localhost', 1234),
            host_pkey_directories=[],
            logger=hook.log,
        )
        ssh_mock.assert_called_once_with('remote_host', **expected_kwargs)
def test_ssh_connection_without_password(self, ssh_mock):
    """``connect`` is called exactly once, with the key file and no password."""
    hook_kwargs = dict(
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
        key_file='fake.file',
    )
    hook = SSHHook(**hook_kwargs)

    with hook.get_conn():
        mocked_client = ssh_mock.return_value
        mocked_client.connect.assert_called_once_with(
            hostname='remote_host',
            username='******',
            key_filename='fake.file',
            timeout=10,
            compress=True,
            port='port',
            sock=None,
        )
def test_tunnel(self):
    """Bytes sent by a local listener on port 2134 arrive through the tunnel on 2135."""
    hook = SSHHook(ssh_conn_id='ssh_default')

    import subprocess
    import socket

    listener = subprocess.Popen(["python", "-c", HELLO_SERVER_CMD],
                                stdout=subprocess.PIPE)
    with hook.create_tunnel(2135, 2134):
        # The listener prints a 5-byte marker before the test connects.
        self.assertEqual(listener.stdout.read(5), b"ready")

        conn = socket.socket()
        conn.connect(("localhost", 2135))
        self.assertEqual(conn.recv(5), b"hello")
        conn.close()

        listener.communicate()
        self.assertEqual(listener.returncode, 0)
def test_ssh_connection_with_private_key_extra(self, ssh_mock):
    """``connect`` receives ``TEST_PKEY`` as ``pkey`` for this connection."""
    hook = SSHHook(ssh_conn_id=self.CONN_SSH_WITH_PRIVATE_KEY_EXTRA,
                   remote_host='remote_host',
                   port='port',
                   username='******',
                   timeout=10)
    expected_connect_kwargs = dict(
        hostname='remote_host',
        username='******',
        pkey=TEST_PKEY,
        timeout=10,
        compress=True,
        port='port',
        sock=None,
    )

    with hook.get_conn():
        ssh_mock.return_value.connect.assert_called_once_with(
            **expected_connect_kwargs)
def test_conn_with_extra_parameters(self):
    """Extras stored on the connection are exposed as hook attributes."""
    extra_json = '{"compress" : true, "no_host_key_check" : "true"}'
    connection = models.Connection(conn_id='ssh_with_extra',
                                   host='localhost',
                                   conn_type='ssh',
                                   extra=extra_json)
    db.merge_conn(connection)

    hook = SSHHook(ssh_conn_id='ssh_with_extra')

    self.assertEqual(hook.compress, True)
    self.assertEqual(hook.no_host_key_check, True)
def execute(self, context):
    """Transfer one file over SFTP and return the local file path.

    A hook is created from ``ssh_conn_id`` unless a valid ``SSHHook`` was
    supplied, and ``remote_host`` (if set) overrides the host on the hook.
    A GET operation downloads ``remote_filepath`` to ``local_filepath``;
    any other operation uploads in the opposite direction. With
    ``create_intermediate_dirs`` the destination directory is created first.

    :param context: task context passed by the caller (unused here).
    :return: ``local_filepath``.
    :raises AirflowException: wrapping any error raised along the way.
    """
    file_msg = None
    try:
        if self.ssh_conn_id:
            if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info("ssh_hook is not provided or invalid. " +
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

        if self.remote_host is not None:
            self.log.info("remote_host is provided explicitly. " +
                          "It will replace the remote_host which was defined " +
                          "in ssh_hook or predefined in connection of ssh_conn_id.")
            self.ssh_hook.remote_host = self.remote_host

        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            if self.operation.lower() == SFTPOperation.GET:
                local_folder = os.path.dirname(self.local_filepath)
                # A bare file name has an empty directory part, and
                # os.makedirs('') fails; there is nothing to create then.
                if self.create_intermediate_dirs and local_folder:
                    # Create Intermediate Directories if it doesn't exist
                    try:
                        os.makedirs(local_folder)
                    except OSError:
                        if not os.path.isdir(local_folder):
                            raise
                file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                    self.local_filepath)
                self.log.info("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                remote_folder = os.path.dirname(self.remote_filepath)
                if self.create_intermediate_dirs:
                    _make_intermediate_dirs(
                        sftp_client=sftp_client,
                        remote_directory=remote_folder,
                    )
                file_msg = "from {0} to {1}".format(self.local_filepath,
                                                    self.remote_filepath)
                self.log.info("Starting to transfer file %s", file_msg)
                sftp_client.put(self.local_filepath,
                                self.remote_filepath,
                                confirm=self.confirm)

    except Exception as e:
        raise AirflowException("Error while transferring {0}, error: {1}"
                               .format(file_msg, str(e)))

    return self.local_filepath
class SSHHookTest(unittest.TestCase):
    """Tests for the default ``SSHHook``: remote commands and tunnelling."""

    def setUp(self):
        """Switch the configuration to test mode and create a default hook."""
        configuration.test_mode()
        from airflow.contrib.hooks.ssh_hook import SSHHook
        hook = SSHHook()
        hook.no_host_key_check = True
        self.hook = hook

    def test_remote_cmd(self):
        """``check_output`` returns what the remote ``echo`` wrote."""
        result = self.hook.check_output(["echo", "-n", "airflow"])
        self.assertEqual(result, b"airflow")

    def test_tunnel(self):
        """Bytes sent by a listener started through the hook arrive via the tunnel."""
        print("Setting up remote listener")
        import subprocess
        import socket

        listener_cmd = ["python", "-c", '"{0}"'.format(HELLO_SERVER_CMD)]
        self.handle = self.hook.Popen(listener_cmd, stdout=subprocess.PIPE)

        print("Setting up tunnel")
        with self.hook.tunnel(2135, 2134):
            print("Tunnel up")
            # The listener prints a 5-byte marker before the test connects.
            self.assertEqual(self.handle.stdout.read(5), b"ready")

            print("Connecting to server via tunnel")
            conn = socket.socket()
            conn.connect(("localhost", 2135))
            print("Receiving...")
            self.assertEqual(conn.recv(5), b"hello")
            print("Closing connection")
            conn.close()

            print("Waiting for listener...")
            self.handle.communicate()
            self.assertEqual(self.handle.returncode, 0)
            print("Closing tunnel")
def execute(self, context):
    """Run ``command`` on the remote host over SSH.

    A pseudo-terminal is requested when the command starts with ``sudo``.

    :param context: task context passed by the caller (unused here).
    :return: when the command exits with status 0 and ``do_xcom_push`` is
        set, the command's stdout (raw bytes if XCom pickling is enabled,
        otherwise a base64 string); ``True`` when it exits with status 0
        and ``do_xcom_push`` is not set.
    :raises AirflowException: when no hook or command is configured, the
        command exits with a non-zero status, or any other error occurs.
    """
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        # The context manager closes the SSH connection on every exit
        # path; stdout/stderr are read before the block is left.
        with self.ssh_hook.get_conn() as ssh_client:
            if not self.command:
                raise AirflowException("no command specified so nothing to execute here.")

            # Auto apply tty when its required in case of sudo
            get_pty = self.command.startswith('sudo')

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                            get_pty=get_pty,
                                                            timeout=self.timeout
                                                            )
            exit_status = stdout.channel.recv_exit_status()
            # Compare by value; ``is 0`` only works by accident of small
            # integer caching.
            if exit_status == 0:
                # only returning on output if do_xcom_push is set
                # otherwise its not suppose to be disclosed
                if self.do_xcom_push:
                    enable_pickling = configuration.getboolean('core',
                                                               'enable_xcom_pickling')
                    if enable_pickling:
                        return stdout.read()
                    else:
                        return b64encode(stdout.read()).decode('utf-8')
            else:
                error_msg = stderr.read()
                raise AirflowException("error running cmd: {0}, error: {1}"
                                       .format(self.command, error_msg))

    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
def execute(self, context):
    """Copy one file over SFTP in the direction given by ``self.operation``.

    A supplied ``SSHHook`` takes precedence over ``self.ssh_conn_id``; the
    connection id is only used to build a hook when none (or an invalid
    one) was given. ``self.remote_host``, when set, replaces the hook's host.

    :param context: task context supplied by the caller; not used here.
    :return: ``None``
    :raises AirflowException: wrapping any error raised while preparing the
        hook or transferring the file.
    """
    file_msg = None
    try:
        if self.ssh_conn_id:
            hook_is_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
            if hook_is_usable:
                self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info(
                    "ssh_hook is not provided or invalid. "
                    "Trying ssh_conn_id to create SSHHook."
                )
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

        if self.remote_host is not None:
            self.log.info(
                "remote_host is provided explicitly. "
                "It will replace the remote_host which was defined "
                "in ssh_hook or predefined in connection of ssh_conn_id."
            )
            self.ssh_hook.remote_host = self.remote_host

        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            downloading = self.operation.lower() == SFTPOperation.GET
            if downloading:
                file_msg = "from {0} to {1}".format(
                    self.remote_filepath, self.local_filepath
                )
                self.log.debug("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                file_msg = "from {0} to {1}".format(
                    self.local_filepath, self.remote_filepath
                )
                self.log.debug("Starting to transfer file %s", file_msg)
                sftp_client.put(
                    self.local_filepath,
                    self.remote_filepath,
                    confirm=self.confirm
                )

    except Exception as e:
        failure = "Error while transferring {0}, error: {1}".format(file_msg, str(e))
        raise AirflowException(failure)

    return None
class SFTPOperator(BaseOperator):
    """
    Operator that moves a single file between the local machine and a remote
    host over SFTP, using an ``SSHHook`` to open the SFTP transport channel.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed,
        defaults to True
    :type confirm: bool
    """
    template_fields = ('local_filepath', 'remote_filepath', 'remote_host')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 *args,
                 **kwargs):
        """Store the transfer settings and reject unknown operations.

        :raises TypeError: if ``operation`` is neither get nor put
            (compared case-insensitively).
        """
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm

        supported = (SFTPOperation.GET, SFTPOperation.PUT)
        if self.operation.lower() not in supported:
            raise TypeError(
                "unsupported operation value {0}, expected {1} or {2}".format(
                    self.operation, SFTPOperation.GET, SFTPOperation.PUT
                )
            )

    def execute(self, context):
        """Perform the configured get/put transfer.

        :param context: task context supplied by the caller; not used here.
        :return: ``None``
        :raises AirflowException: wrapping any error raised while preparing
            the hook or transferring the file.
        """
        file_msg = None
        try:
            if self.ssh_conn_id:
                hook_is_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
                if hook_is_usable:
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info(
                        "ssh_hook is not provided or invalid. "
                        "Trying ssh_conn_id to create SSHHook."
                    )
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "Cannot operate without ssh_hook or ssh_conn_id."
                )

            if self.remote_host is not None:
                self.log.info(
                    "remote_host is provided explicitly. "
                    "It will replace the remote_host which was defined "
                    "in ssh_hook or predefined in connection of ssh_conn_id."
                )
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                downloading = self.operation.lower() == SFTPOperation.GET
                if downloading:
                    file_msg = "from {0} to {1}".format(
                        self.remote_filepath, self.local_filepath
                    )
                    self.log.debug("Starting to transfer %s", file_msg)
                    sftp_client.get(self.remote_filepath, self.local_filepath)
                else:
                    file_msg = "from {0} to {1}".format(
                        self.local_filepath, self.remote_filepath
                    )
                    self.log.debug("Starting to transfer file %s", file_msg)
                    sftp_client.put(
                        self.local_filepath,
                        self.remote_filepath,
                        confirm=self.confirm
                    )

        except Exception as e:
            failure = "Error while transferring {0}, error: {1}".format(
                file_msg, str(e)
            )
            raise AirflowException(failure)

        return None
def execute(self, context):
    """Run ``self.command`` over SSH, streaming its output to the task log.

    A supplied ``SSHHook`` takes precedence over ``self.ssh_conn_id``;
    ``self.remote_host`` (if set) replaces the hook's host. stdout chunks
    are logged at INFO and stderr chunks at WARNING as they arrive.

    :param context: task context supplied by the caller; not used here.
    :return: the aggregated stdout when the command exits with status 0 --
        raw bytes if XCom pickling is enabled, otherwise base64 text.
    :raises AirflowException: on any failure, including a missing hook, an
        empty command, or a non-zero exit status.
    """
    try:
        if self.ssh_conn_id:
            if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info("ssh_hook is not provided or invalid. " +
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                        timeout=self.timeout)

        if not self.ssh_hook:
            raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

        if self.remote_host is not None:
            self.log.info("remote_host is provided explicitly. " +
                          "It will replace the remote_host which was defined " +
                          "in ssh_hook or predefined in connection of ssh_conn_id.")
            self.ssh_hook.remote_host = self.remote_host

        if not self.command:
            raise AirflowException("SSH command not specified. Aborting.")

        with self.ssh_hook.get_conn() as ssh_client:
            # Auto apply tty when its required in case of sudo
            get_pty = self.command.startswith('sudo')

            self.log.info("Running command: %s", self.command)

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                            get_pty=get_pty,
                                                            timeout=self.timeout
                                                            )
            # get channels
            channel = stdout.channel

            # closing stdin
            stdin.close()
            channel.shutdown_write()

            agg_stdout = b''
            agg_stderr = b''

            # capture any initial output in case channel is closed already
            stdout_buffer_length = len(stdout.channel.in_buffer)

            if stdout_buffer_length > 0:
                agg_stdout += stdout.channel.recv(stdout_buffer_length)

            # read from both stdout and stderr
            while (not channel.closed or
                   channel.recv_ready() or
                   channel.recv_stderr_ready()):
                readq, _, _ = select([channel], [], [], self.timeout)
                for c in readq:
                    # A chunk may end in the middle of a multi-byte UTF-8
                    # sequence, so decode leniently for logging only; the
                    # aggregated bytes stay untouched.
                    if c.recv_ready():
                        line = stdout.channel.recv(len(c.in_buffer))
                        agg_stdout += line
                        self.log.info(line.decode('utf-8', 'replace').strip('\n'))
                    if c.recv_stderr_ready():
                        line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                        agg_stderr += line
                        self.log.warning(line.decode('utf-8', 'replace').strip('\n'))
                if (stdout.channel.exit_status_ready()
                        and not stderr.channel.recv_stderr_ready()
                        and not stdout.channel.recv_ready()):
                    stdout.channel.shutdown_read()
                    stdout.channel.close()
                    break

            stdout.close()
            stderr.close()

            exit_status = stdout.channel.recv_exit_status()
            if exit_status == 0:
                enable_pickling = configuration.conf.getboolean(
                    'core', 'enable_xcom_pickling'
                )
                if enable_pickling:
                    return agg_stdout
                else:
                    return b64encode(agg_stdout).decode('utf-8')

            else:
                # Lenient decode so undecodable stderr does not mask the
                # real command failure with a UnicodeDecodeError.
                error_msg = agg_stderr.decode('utf-8', 'replace')
                raise AirflowException("error running cmd: {0}, error: {1}"
                                       .format(self.command, error_msg))

    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections
    :type ssh_conn_id: str
    :param remote_host: remote host to connect
    :type remote_host: str
    :param command: command to execute on remote host
    :type command: str
    :param timeout: timeout for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        """Store the connection and command settings on the operator."""
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """Run ``self.command`` on the remote host over SSH.

        :param context: task context supplied by the caller; not used here.
        :return: the command's stdout when it exits with status 0 and
            ``do_xcom_push`` is set; ``True`` when it exits with status 0
            and ``do_xcom_push`` is not set.
        :raises AirflowException: on any failure, including a missing hook,
            an empty command, or a non-zero exit status.
        """
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            ssh_client = self.ssh_hook.get_conn()

            if not self.command:
                raise AirflowException("no command specified so nothing to execute here.")

            # Auto apply tty when its required in case of sudo
            get_pty = self.command.startswith('sudo')

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                            get_pty=get_pty,
                                                            timeout=self.timeout
                                                            )
            exit_status = stdout.channel.recv_exit_status()
            # Compare by value: ``is 0`` only works through CPython's
            # small-int cache and triggers a SyntaxWarning on Python 3.8+.
            if exit_status == 0:
                # only returning on output if do_xcom_push is set
                # otherwise its not suppose to be disclosed
                if self.do_xcom_push:
                    return stdout.read()
            else:
                error_msg = stderr.read()
                raise AirflowException("error running cmd: {0}, error: {1}"
                                       .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Look up the transport of a connection obtained from the hook.

        NOTE(review): the transport is fetched and then discarded, so no
        forwarding is set up here -- looks like a stub; confirm intent.
        """
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
def execute(self, context):
    """Run ``self.command`` over SSH, streaming its output to the task log.

    An ``SSHHook`` is built from ``self.ssh_conn_id`` when no hook was
    supplied, and ``self.remote_host`` (if set) overrides the hook's host.
    stdout chunks are logged at INFO and stderr chunks at WARNING.

    :param context: task context supplied by the caller; not used here.
    :return: the aggregated stdout when the command exits with status 0 and
        ``self.do_xcom_push`` is set (raw bytes if XCom pickling is
        enabled, otherwise base64 text); ``True`` when it exits with
        status 0 and ``self.do_xcom_push`` is not set.
    :raises AirflowException: on any failure, including a missing hook, an
        empty command, or a non-zero exit status.
    """
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        ssh_client = self.ssh_hook.get_conn()

        if not self.command:
            raise AirflowException("no command specified so nothing to execute here.")

        # Auto apply tty when its required in case of sudo
        get_pty = self.command.startswith('sudo')

        # set timeout taken as params
        stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                        get_pty=get_pty,
                                                        timeout=self.timeout
                                                        )
        # get channels
        channel = stdout.channel

        # closing stdin
        stdin.close()
        channel.shutdown_write()

        agg_stdout = b''
        agg_stderr = b''

        # capture any initial output in case channel is closed already
        stdout_buffer_length = len(stdout.channel.in_buffer)

        if stdout_buffer_length > 0:
            agg_stdout += stdout.channel.recv(stdout_buffer_length)

        # read from both stdout and stderr
        while (not channel.closed or
               channel.recv_ready() or
               channel.recv_stderr_ready()):
            readq, _, _ = select([channel], [], [], self.timeout)
            for c in readq:
                # A chunk may end in the middle of a multi-byte UTF-8
                # sequence, so decode leniently for logging only; the
                # aggregated bytes stay untouched.
                if c.recv_ready():
                    line = stdout.channel.recv(len(c.in_buffer))
                    agg_stdout += line
                    self.log.info(line.decode('utf-8', 'replace').strip('\n'))
                if c.recv_stderr_ready():
                    line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                    agg_stderr += line
                    self.log.warning(line.decode('utf-8', 'replace').strip('\n'))
            if (stdout.channel.exit_status_ready()
                    and not stderr.channel.recv_stderr_ready()
                    and not stdout.channel.recv_ready()):
                stdout.channel.shutdown_read()
                stdout.channel.close()
                break

        stdout.close()
        stderr.close()

        exit_status = stdout.channel.recv_exit_status()
        # Compare by value: ``is 0`` only works through CPython's small-int
        # cache and triggers a SyntaxWarning on Python 3.8+.
        if exit_status == 0:
            # returning output if do_xcom_push is set
            if self.do_xcom_push:
                enable_pickling = configuration.getboolean('core',
                                                           'enable_xcom_pickling')
                if enable_pickling:
                    return agg_stdout
                else:
                    return b64encode(agg_stdout).decode('utf-8')

        else:
            # Lenient decode so undecodable stderr does not mask the real
            # command failure with a UnicodeDecodeError.
            error_msg = agg_stderr.decode('utf-8', 'replace')
            raise AirflowException("error running cmd: {0}, error: {1}"
                                   .format(self.command, error_msg))

    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
class SFTPOperator(BaseOperator):
    """
    SFTPOperator for transferring files from remote host to local or vice a versa.
    This operator uses ssh_hook to open sftp transport channel that serve as basis
    for file transfer.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections
    :type ssh_conn_id: str
    :param remote_host: remote host to connect
    :type remote_host: str
    :param local_filepath: local file path to get or put
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    """
    template_fields = ('local_filepath', 'remote_filepath')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 *args,
                 **kwargs):
        """Store the transfer settings and reject unknown operations.

        :raises TypeError: if ``operation`` is neither get nor put
            (compared case-insensitively).
        """
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        if not (self.operation.lower() == SFTPOperation.GET or
                self.operation.lower() == SFTPOperation.PUT):
            raise TypeError("unsupported operation value {0}, expected {1} or {2}"
                            .format(self.operation, SFTPOperation.GET, SFTPOperation.PUT))

    def execute(self, context):
        """Perform the configured get/put transfer.

        :param context: task context supplied by the caller; not used here.
        :return: ``None``
        :raises AirflowException: wrapping any error raised while preparing
            the hook or transferring the file.
        """
        file_msg = None
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            ssh_client = self.ssh_hook.get_conn()
            sftp_client = ssh_client.open_sftp()
            if self.operation.lower() == SFTPOperation.GET:
                file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                    self.local_filepath)
                # Pass the argument lazily so the message is only built
                # when DEBUG logging is actually enabled.
                logging.debug("Starting to transfer %s", file_msg)
                sftp_client.get(self.remote_filepath, self.local_filepath)
            else:
                file_msg = "from {0} to {1}".format(self.local_filepath,
                                                    self.remote_filepath)
                logging.debug("Starting to transfer file %s", file_msg)
                sftp_client.put(self.local_filepath, self.remote_filepath)

        except Exception as e:
            raise AirflowException("Error while transferring {0}, error: {1}"
                                   .format(file_msg, str(e)))

        return None
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections
    :type ssh_conn_id: str
    :param remote_host: remote host to connect
    :type remote_host: str
    :param command: command to execute on remote host
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        """Store the connection and command settings on the operator."""
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """Run ``self.command`` over SSH, streaming its output to the task log.

        stdout chunks are logged at INFO and stderr chunks at WARNING.

        :param context: task context supplied by the caller; not used here.
        :return: the aggregated stdout when the command exits with status 0
            and ``do_xcom_push`` is set (raw bytes if XCom pickling is
            enabled, otherwise base64 text); ``True`` when it exits with
            status 0 and ``do_xcom_push`` is not set.
        :raises AirflowException: on any failure, including a missing hook,
            an empty command, or a non-zero exit status.
        """
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            ssh_client = self.ssh_hook.get_conn()

            if not self.command:
                raise AirflowException("no command specified so nothing to execute here.")

            # Auto apply tty when its required in case of sudo
            get_pty = self.command.startswith('sudo')

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                            get_pty=get_pty,
                                                            timeout=self.timeout
                                                            )
            # get channels
            channel = stdout.channel

            # closing stdin
            stdin.close()
            channel.shutdown_write()

            agg_stdout = b''
            agg_stderr = b''

            # capture any initial output in case channel is closed already
            stdout_buffer_length = len(stdout.channel.in_buffer)

            if stdout_buffer_length > 0:
                agg_stdout += stdout.channel.recv(stdout_buffer_length)

            # read from both stdout and stderr
            while (not channel.closed or
                   channel.recv_ready() or
                   channel.recv_stderr_ready()):
                readq, _, _ = select([channel], [], [], self.timeout)
                for c in readq:
                    # A chunk may end in the middle of a multi-byte UTF-8
                    # sequence, so decode leniently for logging only; the
                    # aggregated bytes stay untouched.
                    if c.recv_ready():
                        line = stdout.channel.recv(len(c.in_buffer))
                        agg_stdout += line
                        self.log.info(line.decode('utf-8', 'replace').strip('\n'))
                    if c.recv_stderr_ready():
                        line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                        agg_stderr += line
                        self.log.warning(
                            line.decode('utf-8', 'replace').strip('\n'))
                if (stdout.channel.exit_status_ready()
                        and not stderr.channel.recv_stderr_ready()
                        and not stdout.channel.recv_ready()):
                    stdout.channel.shutdown_read()
                    stdout.channel.close()
                    break

            stdout.close()
            stderr.close()

            exit_status = stdout.channel.recv_exit_status()
            # Compare by value: ``is 0`` only works through CPython's
            # small-int cache and triggers a SyntaxWarning on Python 3.8+.
            if exit_status == 0:
                # returning output if do_xcom_push is set
                if self.do_xcom_push:
                    enable_pickling = configuration.getboolean('core',
                                                               'enable_xcom_pickling')
                    if enable_pickling:
                        return agg_stdout
                    else:
                        return b64encode(agg_stdout).decode('utf-8')

            else:
                # Lenient decode so undecodable stderr does not mask the
                # real command failure with a UnicodeDecodeError.
                error_msg = agg_stderr.decode('utf-8', 'replace')
                raise AirflowException("error running cmd: {0}, error: {1}"
                                       .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Look up the transport of a connection obtained from the hook.

        NOTE(review): the transport is fetched and then discarded, so no
        forwarding is set up here -- looks like a stub; confirm intent.
        """
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
def setUp(self):
    """Enable test-mode configuration and build a hook without host-key checks."""
    configuration.test_mode()
    from airflow.contrib.hooks.ssh_hook import SSHHook

    ssh_hook = SSHHook()
    ssh_hook.no_host_key_check = True
    self.hook = ssh_hook
def setUp(self):
    """Load the test configuration and build the ``ssh_default`` hook.

    The hook is created with a keepalive interval of 10 and host-key
    checking is switched off.
    """
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook

    ssh_hook = SSHHook(ssh_conn_id='ssh_default', keepalive_interval=10)
    ssh_hook.no_host_key_check = True
    self.hook = ssh_hook
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param command: command to execute on remote host. (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    """

    template_fields = ('command', 'remote_host')
    template_ext = ('.sh',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 *args,
                 **kwargs):
        """Store the connection and command settings on the operator."""
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout

    def execute(self, context):
        """Run ``self.command`` over SSH, streaming its output to the task log.

        stdout chunks are logged at INFO and stderr chunks at WARNING.

        :param context: task context supplied by the caller; not used here.
        :return: the aggregated stdout when the command exits with status 0
            -- raw bytes if XCom pickling is enabled, otherwise base64 text.
        :raises AirflowException: on any failure, including a missing hook,
            an empty command, or a non-zero exit status.
        """
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            if not self.command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                # Auto apply tty when its required in case of sudo
                get_pty = self.command.startswith('sudo')

                self.log.info("Running command: %s", self.command)

                # set timeout taken as params
                stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                                get_pty=get_pty,
                                                                timeout=self.timeout
                                                                )
                # get channels
                channel = stdout.channel

                # closing stdin
                stdin.close()
                channel.shutdown_write()

                agg_stdout = b''
                agg_stderr = b''

                # capture any initial output in case channel is closed already
                stdout_buffer_length = len(stdout.channel.in_buffer)

                if stdout_buffer_length > 0:
                    agg_stdout += stdout.channel.recv(stdout_buffer_length)

                # read from both stdout and stderr
                while (not channel.closed or
                       channel.recv_ready() or
                       channel.recv_stderr_ready()):
                    readq, _, _ = select([channel], [], [], self.timeout)
                    for c in readq:
                        # A chunk may end in the middle of a multi-byte
                        # UTF-8 sequence, so decode leniently for logging
                        # only; the aggregated bytes stay untouched.
                        if c.recv_ready():
                            line = stdout.channel.recv(len(c.in_buffer))
                            agg_stdout += line
                            self.log.info(
                                line.decode('utf-8', 'replace').strip('\n'))
                        if c.recv_stderr_ready():
                            line = stderr.channel.recv_stderr(
                                len(c.in_stderr_buffer))
                            agg_stderr += line
                            self.log.warning(
                                line.decode('utf-8', 'replace').strip('\n'))
                    if (stdout.channel.exit_status_ready()
                            and not stderr.channel.recv_stderr_ready()
                            and not stdout.channel.recv_ready()):
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()
                if exit_status == 0:
                    enable_pickling = configuration.conf.getboolean(
                        'core', 'enable_xcom_pickling'
                    )
                    if enable_pickling:
                        return agg_stdout
                    else:
                        return b64encode(agg_stdout).decode('utf-8')

                else:
                    # Lenient decode so undecodable stderr does not mask
                    # the real command failure with a UnicodeDecodeError.
                    error_msg = agg_stderr.decode('utf-8', 'replace')
                    raise AirflowException("error running cmd: {0}, error: {1}"
                                           .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Look up the transport of a connection obtained from the hook.

        NOTE(review): the transport is fetched and then discarded, so no
        forwarding is set up here -- looks like a stub; confirm intent.
        """
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
class SFTPOperator(BaseOperator):
    """
    SFTPOperator for transferring files from remote host to local or vice a versa.
    This operator uses ssh_hook to open sftp transport channel that serve as basis
    for file transfer.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed, defaults to True
    :type confirm: bool
    :param create_intermediate_dirs: create missing intermediate directories when
        copying from remote to local and vice-versa. Default is False.

        Example: The following task would copy ``file.txt`` to the remote host
        at ``/tmp/tmp1/tmp2/`` while creating ``tmp``,``tmp1`` and ``tmp2`` if they
        don't exist. If the parameter is not passed it would error as the directory
        does not exist. ::

            put_file = SFTPOperator(
                task_id="test_sftp",
                ssh_conn_id="ssh_default",
                local_filepath="/tmp/file.txt",
                remote_filepath="/tmp/tmp1/tmp2/file.txt",
                operation="put",
                create_intermediate_dirs=True,
                dag=dag
            )

    :type create_intermediate_dirs: bool
    """
    template_fields = ('local_filepath', 'remote_filepath', 'remote_host')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 create_intermediate_dirs=False,
                 *args,
                 **kwargs):
        """Store the transfer settings and reject unknown operations.

        :raises TypeError: if ``operation`` is neither get nor put
            (compared case-insensitively).
        """
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm
        self.create_intermediate_dirs = create_intermediate_dirs
        if not (self.operation.lower() == SFTPOperation.GET or
                self.operation.lower() == SFTPOperation.PUT):
            raise TypeError("unsupported operation value {0}, expected {1} or {2}"
                            .format(self.operation, SFTPOperation.GET, SFTPOperation.PUT))

    def execute(self, context):
        """Perform the configured get/put transfer.

        When ``create_intermediate_dirs`` is set, missing parent directories
        of the destination are created first (locally for get, remotely
        for put).

        :param context: task context supplied by the caller; not used here.
        :return: ``self.local_filepath``
        :raises AirflowException: wrapping any error raised while preparing
            the hook, creating directories, or transferring the file.
        """
        file_msg = None
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                if self.operation.lower() == SFTPOperation.GET:
                    local_folder = os.path.dirname(self.local_filepath)
                    # An empty dirname means the file goes into the current
                    # directory; os.makedirs('') would raise, so there is
                    # nothing to create in that case.
                    if self.create_intermediate_dirs and local_folder:
                        # Create Intermediate Directories if it doesn't exist
                        try:
                            os.makedirs(local_folder)
                        except OSError:
                            # Already existing is fine; anything else is not.
                            if not os.path.isdir(local_folder):
                                raise
                    file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                        self.local_filepath)
                    self.log.info("Starting to transfer %s", file_msg)
                    sftp_client.get(self.remote_filepath, self.local_filepath)
                else:
                    remote_folder = os.path.dirname(self.remote_filepath)
                    if self.create_intermediate_dirs:
                        _make_intermediate_dirs(
                            sftp_client=sftp_client,
                            remote_directory=remote_folder,
                        )
                    file_msg = "from {0} to {1}".format(self.local_filepath,
                                                        self.remote_filepath)
                    self.log.info("Starting to transfer file %s", file_msg)
                    sftp_client.put(self.local_filepath,
                                    self.remote_filepath,
                                    confirm=self.confirm)

        except Exception as e:
            raise AirflowException("Error while transferring {0}, error: {1}"
                                   .format(file_msg, str(e)))

        return self.local_filepath
def test_ssh_connection(self):
    """Open the ``ssh_default`` connection, run ``ls`` and expect readable stdout."""
    ssh_hook = SSHHook(ssh_conn_id='ssh_default')
    with ssh_hook.get_conn() as client:
        _stdin, out_stream, _stderr = client.exec_command('ls')
        listing = out_stream.read()
        self.assertIsNotNone(listing)