class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param jdbc_conn_id: reference to a predefined database
    :type jdbc_conn_id: string
    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            jdbc_conn_id='jdbc_default', autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.parameters = parameters
        self.sql = sql
        self.jdbc_conn_id = jdbc_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
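# A minimal usage sketch, not taken from the snippet above: it assumes an Airflow
# Connection named 'jdbc_default' exists with the JDBC URL, driver class and driver
# jar path configured, and the DAG id, task id and SQL below are illustrative only.
from datetime import datetime

from airflow import DAG
from airflow.operators.jdbc_operator import JdbcOperator

with DAG(dag_id='jdbc_operator_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    cleanup = JdbcOperator(
        task_id='cleanup_staging',
        jdbc_conn_id='jdbc_default',
        sql='DELETE FROM staging_events WHERE load_date < CURRENT_DATE',
        autocommit=True,
    )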
def _query_jdbc(self):
    """
    Queries jdbc and returns a cursor to the results.
    """
    jdbc = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
    conn = jdbc.get_conn()
    cursor = conn.cursor()
    self.log.info('Querying SQL: %s', self.sql)
    cursor.execute(self.sql)
    return cursor
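# Sketch of how the cursor returned by _query_jdbc might be consumed, e.g. to dump the
# result set to CSV; the _write_results name and self.output_path attribute are
# hypothetical and not part of the snippet above.
import csv

def _write_results(self):
    cursor = self._query_jdbc()
    with open(self.output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        # cursor.description holds one (name, type, ...) tuple per selected column
        writer.writerow(column[0] for column in cursor.description)
        writer.writerows(cursor.fetchall())
    cursor.close()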
class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param jdbc_conn_id: reference to a predefined database
    :type jdbc_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self, sql,
                 jdbc_conn_id='jdbc_default', autocommit=False,
                 parameters=None,
                 *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.parameters = parameters
        self.sql = sql
        self.jdbc_conn_id = jdbc_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param jdbc_conn_id: reference to a predefined database
    :type jdbc_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            jdbc_conn_id='jdbc_default', autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.parameters = parameters
        self.sql = sql
        self.jdbc_conn_id = jdbc_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        logging.info('Executing: %s', self.sql)
        self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param jdbc_conn_id: reference to a predefined database
    :type jdbc_conn_id: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: mapping or iterable
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self, sql,
                 jdbc_conn_id='jdbc_default', autocommit=False,
                 parameters=None,
                 *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.parameters = parameters
        self.sql = sql
        self.jdbc_conn_id = jdbc_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
def get_hook(self):
    try:
        if self.conn_type == 'mysql':
            from airflow.hooks.mysql_hook import MySqlHook
            return MySqlHook(mysql_conn_id=self.conn_id)
        elif self.conn_type == 'google_cloud_platform':
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook
            return BigQueryHook(bigquery_conn_id=self.conn_id)
        elif self.conn_type == 'postgres':
            from airflow.hooks.postgres_hook import PostgresHook
            return PostgresHook(postgres_conn_id=self.conn_id)
        elif self.conn_type == 'hive_cli':
            from airflow.hooks.hive_hooks import HiveCliHook
            return HiveCliHook(hive_cli_conn_id=self.conn_id)
        elif self.conn_type == 'presto':
            from airflow.hooks.presto_hook import PrestoHook
            return PrestoHook(presto_conn_id=self.conn_id)
        elif self.conn_type == 'hiveserver2':
            from airflow.hooks.hive_hooks import HiveServer2Hook
            return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
        elif self.conn_type == 'sqlite':
            from airflow.hooks.sqlite_hook import SqliteHook
            return SqliteHook(sqlite_conn_id=self.conn_id)
        elif self.conn_type == 'jdbc':
            from airflow.hooks.jdbc_hook import JdbcHook
            return JdbcHook(jdbc_conn_id=self.conn_id)
        elif self.conn_type == 'mssql':
            from airflow.hooks.mssql_hook import MsSqlHook
            return MsSqlHook(mssql_conn_id=self.conn_id)
        elif self.conn_type == 'oracle':
            from airflow.hooks.oracle_hook import OracleHook
            return OracleHook(oracle_conn_id=self.conn_id)
        elif self.conn_type == 'vertica':
            from airflow.contrib.hooks.vertica_hook import VerticaHook
            return VerticaHook(vertica_conn_id=self.conn_id)
        elif self.conn_type == 'cloudant':
            from airflow.contrib.hooks.cloudant_hook import CloudantHook
            return CloudantHook(cloudant_conn_id=self.conn_id)
        elif self.conn_type == 'jira':
            from airflow.contrib.hooks.jira_hook import JiraHook
            return JiraHook(jira_conn_id=self.conn_id)
        elif self.conn_type == 'redis':
            from airflow.contrib.hooks.redis_hook import RedisHook
            return RedisHook(redis_conn_id=self.conn_id)
        elif self.conn_type == 'wasb':
            from airflow.contrib.hooks.wasb_hook import WasbHook
            return WasbHook(wasb_conn_id=self.conn_id)
        elif self.conn_type == 'docker':
            from airflow.hooks.docker_hook import DockerHook
            return DockerHook(docker_conn_id=self.conn_id)
    except Exception:
        # Import or constructor failures (and unknown conn types) fall through,
        # so the method returns None.
        pass
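# Hedged example of how get_hook is typically reached: fetch the Connection by conn_id
# and let it return the matching hook. The conn_id and SQL are illustrative; with the
# implementation above, an unknown conn_type (or a failed import) yields None.
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('jdbc_default')   # Connection row with conn_type='jdbc'
hook = conn.get_hook()                           # returns a JdbcHook here
rows = hook.get_records('SELECT 1')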
def execute(self, context):
    self.hook = JdbcHook(jdbc_conn_id=self.snowflake_conn_id)
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)

    # Copy so the operator's (possibly shared) pre_sql list is not mutated in place.
    sql = list(self.pre_sql)
    if self.drop_and_create:
        sql += self._build_pre_sql()

    s3_bucket, s3_key = self.s3.parse_s3_url(self.data_s3_key)
    if s3_bucket != S3_BUCKET:
        raise ValueError(
            'For Snowflake loads the S3 bucket must be {}. Got: {}'.format(
                S3_BUCKET, s3_bucket))

    copy_sql = """
        COPY INTO {table}
        FROM @airflow.{stage}/{s3_key};
    """.format(
        table=self.table,
        stage=self.stage,
        s3_key=s3_key,
    )
    sql.append(copy_sql)

    self.hook.run(['BEGIN;'] + sql + ['COMMIT;'])
class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param jdbc_conn_id: reference to a predefined database
    :type jdbc_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self, sql,
                 jdbc_conn_id='jdbc_default', autocommit=False,
                 parameters=None,
                 *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.parameters = parameters
        self.sql = sql
        self.jdbc_conn_id = jdbc_conn_id
        self.autocommit = autocommit

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class JdbcOperator(BaseOperator):
    """
    Executes SQL code in a database using a JDBC driver.

    Requires jaydebeapi.

    :param jdbc_url: driver specific connection url with string variables,
        e.g. for exasol jdbc:exa:{0}:{1};schema={2}
        Template vars are defined like this:
        {0} = hostname, {1} = port, {2} = dbschema, {3} = extra
    :type jdbc_url: string
    :param jdbc_driver_name: classname of the specific jdbc driver,
        for exasol com.exasol.jdbc.EXADriver
    :type jdbc_driver_name: string
    :param jdbc_driver_loc: absolute path to jdbc driver location,
        for example /var/exasol/exajdbc.jar
    :type jdbc_driver_loc: string
    :param conn_id: reference to a predefined database
    :type conn_id: string
    :param sql: the sql code to be executed
    :type sql: string or string pointing to a template file.
        File must have a '.sql' extension.
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql, jdbc_url, jdbc_driver_name, jdbc_driver_loc,
            conn_id='jdbc_default', autocommit=False,
            *args, **kwargs):
        super(JdbcOperator, self).__init__(*args, **kwargs)
        self.jdbc_url = jdbc_url
        self.jdbc_driver_name = jdbc_driver_name
        self.jdbc_driver_loc = jdbc_driver_loc
        self.sql = sql
        self.conn_id = conn_id
        self.autocommit = autocommit

    def execute(self, context):
        logging.info('Executing: %s', self.sql)
        self.hook = JdbcHook(conn_id=self.conn_id,
                             jdbc_driver_loc=self.jdbc_driver_loc,
                             jdbc_driver_name=self.jdbc_driver_name,
                             jdbc_url=self.jdbc_url)
        for row in self.hook.get_records(self.sql, self.autocommit):
            logging.info('Result: %s', ','.join(map(str, row)))
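# Illustrative instantiation of this older constructor signature; the Exasol values mirror
# the docstring examples above, while the task id and dag object are placeholders.
run_query = JdbcOperator(
    task_id='exasol_row_count',
    sql='SELECT COUNT(*) FROM my_schema.my_table',
    jdbc_url='jdbc:exa:{0}:{1};schema={2}',
    jdbc_driver_name='com.exasol.jdbc.EXADriver',
    jdbc_driver_loc='/var/exasol/exajdbc.jar',
    conn_id='jdbc_default',
    dag=dag,
)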
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
    self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class FBS3ToSnowflakeOperator(BaseOperator):
    template_fields = (
        'table',
        'data_s3_key',
        'pre_sql',
        'schema_s3_key',
    )

    @apply_defaults
    @require_keyword_args(['task_id', 'table', 'data_s3_key', 'stage', 'dag'])
    def __init__(self,
                 snowflake_conn_id=SNOWFLAKE_CONN_ID,
                 pre_sql=[],
                 s3_conn_id='s3_default',
                 drop_and_create=False,
                 schema_s3_key=None,
                 forced_string_columns=[],
                 *args, **kwargs):
        self.snowflake_conn_id = snowflake_conn_id
        self.table = kwargs['table']
        self.data_s3_key = kwargs['data_s3_key']

        if isinstance(pre_sql, str):
            pre_sql = [pre_sql]
        elif not isinstance(pre_sql, list):
            raise TypeError('pre_sql must be str or list!')
        self.pre_sql = pre_sql

        self.s3_conn_id = s3_conn_id
        self.stage = kwargs['stage']
        self.drop_and_create = drop_and_create
        self.schema_s3_key = schema_s3_key
        self.forced_string_columns = forced_string_columns

        del kwargs['table']
        del kwargs['data_s3_key']
        del kwargs['stage']
        super(FBS3ToSnowflakeOperator, self).__init__(*args, **kwargs)

    def _build_pre_sql(self):
        # A helper that only needs to exist inside `_build_pre_sql`
        def determine_schema():
            logging.info('Reading from s3: %s', self.schema_s3_key)
            schema_key = self.s3.get_key(self.schema_s3_key)
            if schema_key is None:
                raise AirflowException(
                    's3 key {} was not found. Did you forget to run a dependency?'
                    .format(self.schema_s3_key))

            # Schema must be stored as a JSONified array
            schema_array = json.loads(schema_key.get_contents_as_string())
            schema_strings = []
            for column in schema_array:
                column_name = column[0]
                if column_name in COLUMNS_TO_QUOTE:
                    column[0] = '"{}"'.format(column_name)

                # We're assuming well-formed type information
                type_and_len = column[1].lower().split('(')
                use_precise_type = (
                    type_and_len[0] in POSTGRES_TO_SNOWFLAKE_DATA_TYPES and
                    column_name not in self.forced_string_columns)
                if use_precise_type:
                    new_type = POSTGRES_TO_SNOWFLAKE_DATA_TYPES[type_and_len[0]]
                    if new_type != FLOATESQUE_TYPE:
                        column[1] = new_type
                    # For numeric and decimal, if no argument is provided then postgres
                    # says "numeric values of any precision and scale can be stored".
                    # The only way to emulate this behavior is to use a float (which is
                    # what matillion + redshift also does).
                    elif new_type == FLOATESQUE_TYPE and len(type_and_len) == 1:
                        column[1] = FLOAT_TYPE
                else:
                    # Replace any non-supported data types with the string type, aka VARCHAR
                    column[1] = STRING_TYPE

                schema_strings.append(' '.join(column))

            # Extra spaces added to make it look good in the logs
            return ',\n    '.join(schema_strings)

        pre_sql = [
            'DROP TABLE IF EXISTS {table};'.format(table=self.table),
            """
            CREATE TABLE IF NOT EXISTS {table} (
                {schema}
            );
            """.format(table=self.table, schema=determine_schema())
        ]
        return pre_sql

    def execute(self, context):
        self.hook = JdbcHook(jdbc_conn_id=self.snowflake_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        # Copy so the operator's (possibly shared) pre_sql list is not mutated in place.
        sql = list(self.pre_sql)
        if self.drop_and_create:
            sql += self._build_pre_sql()

        s3_bucket, s3_key = self.s3.parse_s3_url(self.data_s3_key)
        if s3_bucket != S3_BUCKET:
            raise ValueError(
                'For Snowflake loads the S3 bucket must be {}. Got: {}'.format(
                    S3_BUCKET, s3_bucket))

        copy_sql = """
            COPY INTO {table}
            FROM @airflow.{stage}/{s3_key};
        """.format(
            table=self.table,
            stage=self.stage,
            s3_key=s3_key,
        )
        sql.append(copy_sql)

        self.hook.run(['BEGIN;'] + sql + ['COMMIT;'])
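# Hypothetical instantiation of FBS3ToSnowflakeOperator: the table, stage and S3 keys are
# placeholders, and only the keyword arguments required by require_keyword_args above
# (task_id, table, data_s3_key, stage, dag) plus a few optional ones are shown.
load_events = FBS3ToSnowflakeOperator(
    task_id='load_events',
    table='analytics.events',
    data_s3_key='s3://' + S3_BUCKET + '/exports/events/{{ ds }}/data.csv',
    stage='events_stage',
    drop_and_create=True,
    schema_s3_key='s3://' + S3_BUCKET + '/exports/events/{{ ds }}/schema.json',
    dag=dag,
)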
def execute(self, context):
    logging.info('Executing: %s', self.sql)
    self.hook = JdbcHook(conn_id=self.conn_id,
                         jdbc_driver_loc=self.jdbc_driver_loc,
                         jdbc_driver_name=self.jdbc_driver_name,
                         jdbc_url=self.jdbc_url)
    for row in self.hook.get_records(self.sql, self.autocommit):
        logging.info('Result: %s', ','.join(map(str, row)))
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
def test_jdbc_conn_connection(self, jdbc_mock):
    jdbc_hook = JdbcHook()
    jdbc_conn = jdbc_hook.get_conn()
    self.assertTrue(jdbc_mock.called)
    self.assertIsInstance(jdbc_conn, Mock)
    self.assertEqual(jdbc_conn.name, jdbc_mock.return_value.name)
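# Hedged sketch of the scaffolding the test methods here assume: jdbc_mock is injected by
# patching jaydebeapi.connect where JdbcHook imports it, so get_conn() returns a Mock
# instead of opening a real JDBC connection. The patch target and the need for a
# 'jdbc_default' Connection in the metadata DB are assumptions about the test setup.
from unittest.mock import patch

from airflow.hooks.jdbc_hook import JdbcHook

@patch('airflow.hooks.jdbc_hook.jaydebeapi.connect')
def test_get_conn_uses_jaydebeapi(jdbc_mock):
    hook = JdbcHook(jdbc_conn_id='jdbc_default')
    conn = hook.get_conn()                 # hits the patched connect, not a real database
    assert jdbc_mock.called
    assert conn is jdbc_mock.return_value  # the hook returns whatever jaydebeapi.connect gave it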
def execute(self, context):
    logging.info('Executing: %s', self.sql)
    self.hook = JdbcHook(jdbc_conn_id=self.jdbc_conn_id)
    self.hook.run(self.sql, self.autocommit)
def _execute(self, sql):
    logging.info('Executing: %s', sql)
    hook = JdbcHook(jdbc_conn_id=self.conn_id)
    hook.run(sql, self.autocommit)
def test_jdbc_conn_get_autocommit(self, _):
    jdbc_hook = JdbcHook()
    jdbc_conn = jdbc_hook.get_conn()
    jdbc_hook.get_autocommit(jdbc_conn)
    jdbc_conn.jconn.getAutoCommit.assert_called_once_with()
def _select(self, sql):
    logging.info('Querying: %s', sql)
    hook = JdbcHook(jdbc_conn_id=self.conn_id)
    return hook.get_records(sql)