def poke(self, context):
    self.log.info(
        'Poking for blob: %s\nin wasb://%s', self.blob_name, self.container_name
    )
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    return hook.check_for_blob(self.container_name, self.blob_name,
                               **self.check_options)

def test_load_string(self, mock_service):
    mock_instance = mock_service.return_value
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.load_string('big string', 'container', 'blob', max_connections=1)
    mock_instance.create_blob_from_text.assert_called_once_with(
        'container', 'blob', 'big string', max_connections=1
    )

def test_read_file(self, mock_service):
    mock_instance = mock_service.return_value
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.read_file('container', 'blob', max_connections=1)
    mock_instance.get_blob_to_text.assert_called_once_with(
        'container', 'blob', max_connections=1
    )

def execute(self, context): """Upload a file to Azure Blob Storage.""" hook = WasbHook(wasb_conn_id=self.wasb_conn_id) self.log.info( 'Uploading {self.file_path} to wasb://{self.container_name} as {self.blob_name}'.format(**locals()) ) hook.load_file(self.file_path, self.container_name, self.blob_name, **self.load_options)
def test_delete_single_blob(self, mock_service):
    mock_instance = mock_service.return_value
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.delete_file('container', 'blob', is_prefix=False)
    mock_instance.delete_blob.assert_called_once_with(
        'container', 'blob', delete_snapshots='include'
    )

def execute(self, context):
    self.log.info('Deleting blob: %s\nin wasb://%s',
                  self.blob_name, self.container_name)
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    hook.delete_file(self.container_name, self.blob_name, self.is_prefix,
                     self.ignore_if_missing, **self.check_options)

def test_load_file(self, mock_service):
    mock_instance = mock_service.return_value
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.load_file('path', 'container', 'blob', max_connections=1)
    mock_instance.create_blob_from_path.assert_called_once_with(
        'container', 'blob', 'path', max_connections=1
    )

def execute(self, context):
    self.log.info('Deleting blob: {self.blob_name}\n'
                  'in wasb://{self.container_name}'.format(**locals()))
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    hook.delete_file(self.container_name, self.blob_name, self.is_prefix,
                     self.ignore_if_missing, **self.check_options)

def list_some_blobs(*args, **kwargs):
    blob_storage = WasbHook(wasb_conn_id='cgm-azure-storage')
    blob_connection = blob_storage.get_conn()  # a BlockBlobService object from azure-sdk
    blobs = blob_connection.list_blobs(container_name='preprocessed',
                                       prefix='omdena_datasets/sample_dataset',
                                       num_results=1)
    print(blobs)

def poke(self, context):
    self.log.info(
        'Poking for blob: {self.blob_name}\n'
        'in wasb://{self.container_name}'.format(**locals())
    )
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    return hook.check_for_blob(self.container_name, self.blob_name,
                               **self.check_options)

def test_check_for_blob(self, mock_service):
    mock_instance = mock_service.return_value
    mock_instance.exists.return_value = True
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertTrue(hook.check_for_blob('container', 'blob', timeout=3))
    mock_instance.exists.assert_called_once_with(
        'container', 'blob', timeout=3
    )

def execute(self, context): """Upload a file to Azure Blob Storage.""" hook = WasbHook(wasb_conn_id=self.wasb_conn_id) self.log.info('Uploading %s to wasb://%s ' 'as %s'.format(self.file_path, self.container_name, self.blob_name)) hook.load_file(self.file_path, self.container_name, self.blob_name, **self.load_options)
def poke(self, context):
    logging.info(
        'Poking for prefix: {self.prefix}\n'
        'in wasb://{self.container_name}'.format(**locals())
    )
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    return hook.check_for_prefix(self.container_name, self.prefix,
                                 **self.check_options)

def execute(self, context): """Upload a file to Azure Blob Storage.""" hook = WasbHook(wasb_conn_id=self.wasb_conn_id) self.log.info( 'Uploading %s to wasb://%s ' 'as %s'.format(self.file_path, self.container_name, self.blob_name) ) hook.load_file(self.file_path, self.container_name, self.blob_name, **self.load_options)
def test_check_for_prefix(self, mock_service):
    mock_instance = mock_service.return_value
    mock_instance.list_blobs.return_value = iter(['blob_1'])
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertTrue(hook.check_for_prefix('container', 'prefix', timeout=3))
    mock_instance.list_blobs.assert_called_once_with(
        'container', 'prefix', num_results=1, timeout=3
    )

def execute(self, context):
    self.log.info(
        'Deleting blob: %s\nin wasb://%s', self.blob_name, self.container_name
    )
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    hook.delete_file(self.container_name, self.blob_name, self.is_prefix,
                     self.ignore_if_missing, **self.check_options)

def test_check_for_prefix(self, mock_service):
    mock_instance = mock_service.return_value
    mock_instance.list_blobs.return_value = iter(['blob_1'])
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertTrue(hook.check_for_prefix('container', 'prefix', timeout=3))
    mock_instance.list_blobs.assert_called_once_with(
        'container', 'prefix', timeout=3
    )

def test_delete_multiple_nonexisting_blobs_fails(self, mock_service):
    mock_instance = mock_service.return_value
    mock_instance.list_blobs.return_value = iter([])
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    with self.assertRaises(Exception) as context:
        hook.delete_file(
            'container', 'nonexisting_blob_prefix', is_prefix=True,
            ignore_if_missing=False
        )
    self.assertIsInstance(context.exception, AirflowException)

def execute(self, context):
    self.log.info(
        'Deleting blob: {self.blob_name}\n'
        'in wasb://{self.container_name}'.format(**locals())
    )
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    hook.delete_file(self.container_name, self.blob_name, self.is_prefix,
                     self.ignore_if_missing, **self.check_options)

def test_delete_multiple_blobs(self, mock_service):
    mock_instance = mock_service.return_value
    Blob = namedtuple('Blob', ['name'])
    mock_instance.list_blobs.return_value = iter(
        [Blob('blob_prefix/blob1'), Blob('blob_prefix/blob2')]
    )
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.delete_file('container', 'blob_prefix', is_prefix=True)
    mock_instance.delete_blob.assert_any_call(
        'container', 'blob_prefix/blob1', delete_snapshots='include'
    )
    mock_instance.delete_blob.assert_any_call(
        'container', 'blob_prefix/blob2', delete_snapshots='include'
    )

def _build_hook(self):
    remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
    try:
        return WasbHook(remote_conn_id)
    except AzureHttpError:
        self.log.error(
            'Could not create a WasbHook with connection id "%s". '
            'Please make sure that airflow[azure] is installed and '
            'the Wasb connection exists.', remote_conn_id)

def hook(self):
    remote_conn_id = conf.get('logging', 'REMOTE_LOG_CONN_ID')
    try:
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(remote_conn_id)
    except AzureHttpError:
        self.log.error(
            'Could not create a WasbHook with connection id "%s". '
            'Please make sure that airflow[azure] is installed and '
            'the Wasb connection exists.', remote_conn_id)

def get_hook(self):
    try:
        if self.conn_type == 'mysql':
            from airflow.hooks.mysql_hook import MySqlHook
            return MySqlHook(mysql_conn_id=self.conn_id)
        elif self.conn_type == 'google_cloud_platform':
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook
            return BigQueryHook(bigquery_conn_id=self.conn_id)
        elif self.conn_type == 'postgres':
            from airflow.hooks.postgres_hook import PostgresHook
            return PostgresHook(postgres_conn_id=self.conn_id)
        elif self.conn_type == 'hive_cli':
            from airflow.hooks.hive_hooks import HiveCliHook
            return HiveCliHook(hive_cli_conn_id=self.conn_id)
        elif self.conn_type == 'presto':
            from airflow.hooks.presto_hook import PrestoHook
            return PrestoHook(presto_conn_id=self.conn_id)
        elif self.conn_type == 'hiveserver2':
            from airflow.hooks.hive_hooks import HiveServer2Hook
            return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
        elif self.conn_type == 'sqlite':
            from airflow.hooks.sqlite_hook import SqliteHook
            return SqliteHook(sqlite_conn_id=self.conn_id)
        elif self.conn_type == 'jdbc':
            from airflow.hooks.jdbc_hook import JdbcHook
            return JdbcHook(jdbc_conn_id=self.conn_id)
        elif self.conn_type == 'mssql':
            from airflow.hooks.mssql_hook import MsSqlHook
            return MsSqlHook(mssql_conn_id=self.conn_id)
        elif self.conn_type == 'oracle':
            from airflow.hooks.oracle_hook import OracleHook
            return OracleHook(oracle_conn_id=self.conn_id)
        elif self.conn_type == 'vertica':
            from airflow.contrib.hooks.vertica_hook import VerticaHook
            return VerticaHook(vertica_conn_id=self.conn_id)
        elif self.conn_type == 'cloudant':
            from airflow.contrib.hooks.cloudant_hook import CloudantHook
            return CloudantHook(cloudant_conn_id=self.conn_id)
        elif self.conn_type == 'jira':
            from airflow.contrib.hooks.jira_hook import JiraHook
            return JiraHook(jira_conn_id=self.conn_id)
        elif self.conn_type == 'redis':
            from airflow.contrib.hooks.redis_hook import RedisHook
            return RedisHook(redis_conn_id=self.conn_id)
        elif self.conn_type == 'wasb':
            from airflow.contrib.hooks.wasb_hook import WasbHook
            return WasbHook(wasb_conn_id=self.conn_id)
        elif self.conn_type == 'docker':
            from airflow.hooks.docker_hook import DockerHook
            return DockerHook(docker_conn_id=self.conn_id)
    except:
        pass

def execute(self, context):
    source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)
    # Assumption 1: there is sufficient disk space to download the blob in question
    # Assumption 2: The file is a correctly formatted csv file
    with NamedTemporaryFile(mode='a+', delete=True) as f:
        source_hook.get_file(file_path=f.name,
                             container_name=self.src_blob_container,
                             blob_name=self.src_blob)
        f.flush()
        self.log.info("Saving file to %s", f.name)
        csv_reader = reader(f)
        list_of_tuples = list(map(tuple, csv_reader))
        self.log.info(list_of_tuples)
        self.log.info(f"Inserting into {self.dest_table}")
        hook = MsSqlHook(mssql_conn_id=self.azure_sql_conn_id, schema=self.database)
        hook.insert_rows(self.dest_table, list_of_tuples)
        self.log.info(f"Data inserted into {self.database}.{self.dest_table}")

def execute(self, context):
    source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)
    # Assumption: there is sufficient disk space to download the blob in question
    with NamedTemporaryFile(mode='wb', delete=True) as f:
        source_hook.get_file(file_path=f.name,
                             container_name=self.src_blob_container,
                             blob_name=self.src_blob)
        f.flush()
        self.log.info("Saving file to %s", f.name)
        if self.adls_gen == 1:
            self.log.info("Uploading to ADLS Gen 1")
            adls_hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id)
            adls_hook.upload_file(local_path=f.name, remote_path=f.name)
        else:
            self.log.info("Uploading to ADLS Gen 2")
            adls_hook = WasbHook(wasb_conn_id=self.azure_data_lake_conn_id)
            adls_hook.load_file(f.name,
                                container_name=self.dest_adls_container,
                                blob_name=self.dest_adls)
    self.log.info("All done, uploaded files to Azure Data Lake Store")

def poke(self, context):
    self.log.info('Poking for wildcard prefix: %s in wasb://%s',
                  self.wildcard_prefix, self.container_name)
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    prefix = re.split(r'[*]', self.wildcard_prefix, 1)[0]
    klist = hook.connection.list_blobs(self.container_name, prefix,
                                       num_results=1, **self.check_options)
    if klist:
        blob_matches = [
            k for k in klist
            if fnmatch.fnmatch(k.name, self.wildcard_prefix)
        ]
        if blob_matches:
            return True
    return False

def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))

def test_check_for_prefix_empty(self, mock_service):
    mock_instance = mock_service.return_value
    mock_instance.list_blobs.return_value = iter([])
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertFalse(hook.check_for_prefix('container', 'prefix'))

def test_check_for_blob_empty(self, mock_service):
    mock_service.return_value.exists.return_value = False
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertFalse(hook.check_for_blob('container', 'blob'))

    # 'wait_for_downstream': False,
    # 'dag': dag,
    'sla': timedelta(minutes=1),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

wasb_connection_id = 'wasb_file_upload'
input_container = '111'
output_container = '222'
processing_file_prefix = ''

blob_service = WasbHook(wasb_conn_id=wasb_connection_id)

dag = DAG(
    dag_id='azure_blob_reader',
    default_args=default_args,
    description='A dag to pull new images from blob and process them',
    schedule_interval=timedelta(days=1),
)

new_files = WasbPrefixSensor(
    task_id='new_files_sensor',
    container_name=input_container,
    prefix=processing_file_prefix,
    wasb_conn_id=wasb_connection_id,
    dag=dag,
)

def test_sas_token(self):
    from azure.storage.blob import BlockBlobService
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    self.assertEqual(hook.conn_id, 'wasb_test_sas_token')
    self.assertIsInstance(hook.connection, BlockBlobService)

def test_load_file(self, mock_service):
    mock_instance = mock_service.return_value
    hook = WasbHook(wasb_conn_id='wasb_test_sas_token')
    hook.load_file('path', 'container', 'blob', max_connections=1)
    mock_instance.create_blob_from_path.assert_called_once_with(
        'container', 'blob', 'path', max_connections=1)

def get_azure_blob_files():
    '''Downloads file from Azure blob storage'''
    azure = WasbHook(wasb_conn_id='azure_blob')
    azure.get_file(data_file, container_name='covid-data',
                   blob_name='or/20201208.csv')

def poke(self, context):
    self.log.info('Poking for blob: {self.blob_name}\n'
                  'in wasb://{self.container_name}'.format(**locals()))
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    return hook.check_for_blob(self.container_name, self.blob_name,
                               **self.check_options)

def poke(self, context):
    self.log.info('Poking for prefix: %s in wasb://%s',
                  self.prefix, self.container_name)
    hook = WasbHook(wasb_conn_id=self.wasb_conn_id)
    return hook.check_for_prefix(self.container_name, self.prefix,
                                 **self.check_options)