class MySqlOperator(BaseOperator): """ Executes sql code in a specific MySQL database .. seealso:: For more information on how to use this operator, take a look at the guide: :ref:`howto/operator:MySqlOperator` :param sql: the sql code to be executed. Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' (templated) :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`. :param parameters: (optional) the parameters to render the SQL query with. Template reference are recognized by str ending in '.json' (templated) :param autocommit: if True, each command is automatically committed. (default value: False) :param database: name of database which overwrite defined one in connection """ template_fields: Sequence[str] = ('sql', 'parameters') # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { 'sql': 'mysql' if 'mysql' in wwwutils.get_attr_renderer() else 'sql', 'parameters': 'json', } template_ext: Sequence[str] = ('.sql', '.json') ui_color = '#ededed' def __init__( self, *, sql: Union[str, List[str]], mysql_conn_id: str = 'mysql_default', parameters: Optional[Union[Mapping, Iterable]] = None, autocommit: bool = False, database: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.mysql_conn_id = mysql_conn_id self.sql = sql self.autocommit = autocommit self.parameters = parameters self.database = database def prepare_template(self) -> None: """Parse template file for attribute parameters.""" if isinstance(self.parameters, str): self.parameters = ast.literal_eval(self.parameters) def execute(self, context: 'Context') -> None: self.log.info('Executing: %s', self.sql) hook = MySqlHook(mysql_conn_id=self.mysql_conn_id, schema=self.database) hook.run(self.sql, autocommit=self.autocommit, parameters=self.parameters)
class PrestoToMySqlOperator(BaseOperator): """ Moves data from Presto to MySQL, note that for now the data is loaded into memory before being pushed to MySQL, so this operator should be used for smallish amount of data. :param sql: SQL query to execute against Presto. (templated) :param mysql_table: target MySQL table, use dot notation to target a specific database. (templated) :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`. :param presto_conn_id: source presto connection :param mysql_preoperator: sql statement to run against mysql prior to import, typically use to truncate of delete in place of the data coming in, allowing the task to be idempotent (running the task twice won't double load data). (templated) """ template_fields: Sequence[str] = ('sql', 'mysql_table', 'mysql_preoperator') template_ext: Sequence[str] = ('.sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { "sql": "sql", "mysql_preoperator": "mysql" if "mysql" in wwwutils.get_attr_renderer() else "sql", } ui_color = '#a0e08c' def __init__( self, *, sql: str, mysql_table: str, presto_conn_id: str = 'presto_default', mysql_conn_id: str = 'mysql_default', mysql_preoperator: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.sql = sql self.mysql_table = mysql_table self.mysql_conn_id = mysql_conn_id self.mysql_preoperator = mysql_preoperator self.presto_conn_id = presto_conn_id def execute(self, context: 'Context') -> None: presto = PrestoHook(presto_conn_id=self.presto_conn_id) self.log.info("Extracting data from Presto: %s", self.sql) results = presto.get_records(self.sql) mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id) if self.mysql_preoperator: self.log.info("Running MySQL preoperator") self.log.info(self.mysql_preoperator) mysql.run(self.mysql_preoperator) self.log.info("Inserting rows into MySQL") mysql.insert_rows(table=self.mysql_table, rows=results)
class RedshiftSQLOperator(BaseOperator): """ Executes SQL Statements against an Amazon Redshift cluster .. seealso:: For more information on how to use this operator, take a look at the guide: :ref:`howto/operator:RedshiftSQLOperator` :param sql: the SQL code to be executed as a single string, or a list of str (sql statements), or a reference to a template file. Template references are recognized by str ending in '.sql' :param redshift_conn_id: reference to :ref:`Amazon Redshift connection id<howto/connection:redshift>` :param parameters: (optional) the parameters to render the SQL query with. :param autocommit: if True, each command is automatically committed. (default value: False) """ template_fields: Sequence[str] = ('sql', ) template_ext: Sequence[str] = ('.sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { "sql": "postgresql" if "postgresql" in wwwutils.get_attr_renderer() else "sql" } def __init__( self, *, sql: Union[str, Iterable[str]], redshift_conn_id: str = 'redshift_default', parameters: Optional[Union[Iterable, Mapping]] = None, autocommit: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) self.redshift_conn_id = redshift_conn_id self.sql = sql self.autocommit = autocommit self.parameters = parameters def get_hook(self) -> RedshiftSQLHook: """Create and return RedshiftSQLHook. :return RedshiftSQLHook: A RedshiftSQLHook instance. """ return RedshiftSQLHook(redshift_conn_id=self.redshift_conn_id) def execute(self, context: 'Context') -> None: """Execute a statement against Amazon Redshift""" self.log.info("Executing statement: %s", self.sql) hook = self.get_hook() hook.run(self.sql, autocommit=self.autocommit, parameters=self.parameters)
class PostgresOperator(BaseOperator): """ Executes sql code in a specific Postgres database :param sql: the SQL code to be executed as a single string, or a list of str (sql statements), or a reference to a template file. Template references are recognized by str ending in '.sql' :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>` reference to a specific postgres database. :param autocommit: if True, each command is automatically committed. (default value: False) :param parameters: (optional) the parameters to render the SQL query with. :param database: name of database which overwrite defined one in connection """ template_fields: Sequence[str] = ('sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { 'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql' } template_ext: Sequence[str] = ('.sql', ) ui_color = '#ededed' def __init__( self, *, sql: Union[str, List[str]], postgres_conn_id: str = 'postgres_default', autocommit: bool = False, parameters: Optional[Union[Mapping, Iterable]] = None, database: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.sql = sql self.postgres_conn_id = postgres_conn_id self.autocommit = autocommit self.parameters = parameters self.database = database self.hook: Optional[PostgresHook] = None def execute(self, context: 'Context'): self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database) self.hook.run(self.sql, self.autocommit, parameters=self.parameters) for output in self.hook.conn.notices: self.log.info(output)
def setUp(self): self.attr_renderer = utils.get_attr_renderer()
class MsSqlOperator(BaseOperator): """ Executes sql code in a specific Microsoft SQL database .. seealso:: For more information on how to use this operator, take a look at the guide: :ref:`howto/operator:MsSqlOperator` This operator may use one of two hooks, depending on the ``conn_type`` of the connection. If conn_type is ``'odbc'``, then :py:class:`~airflow.providers.odbc.hooks.odbc.OdbcHook` is used. Otherwise, :py:class:`~airflow.providers.microsoft.mssql.hooks.mssql.MsSqlHook` is used. :param sql: the sql code to be executed (templated) :param mssql_conn_id: reference to a specific mssql database :param parameters: (optional) the parameters to render the SQL query with. :param autocommit: if True, each command is automatically committed. (default value: False) :param database: name of database which overwrite defined one in connection """ template_fields: Sequence[str] = ('sql', ) template_ext: Sequence[str] = ('.sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { 'sql': 'tsql' if 'tsql' in wwwutils.get_attr_renderer() else 'sql' } ui_color = '#ededed' def __init__( self, *, sql: str, mssql_conn_id: str = 'mssql_default', parameters: Optional[Union[Mapping, Iterable]] = None, autocommit: bool = False, database: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.mssql_conn_id = mssql_conn_id self.sql = sql self.parameters = parameters self.autocommit = autocommit self.database = database self._hook: Optional[Union[MsSqlHook, 'DbApiHook']] = None def get_hook(self) -> Optional[Union[MsSqlHook, 'DbApiHook']]: """ Will retrieve hook as determined by :meth:`~.Connection.get_hook` if one is defined, and :class:`~.MsSqlHook` otherwise. For example, if the connection ``conn_type`` is ``'odbc'``, :class:`~.OdbcHook` will be used. """ if not self._hook: conn = MsSqlHook.get_connection(conn_id=self.mssql_conn_id) try: self._hook = conn.get_hook() self._hook.schema = self.database # type: ignore[union-attr] except AirflowException: self._hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id, schema=self.database) return self._hook def execute(self, context: 'Context') -> None: self.log.info('Executing: %s', self.sql) hook = self.get_hook() hook.run( # type: ignore[union-attr] sql=self.sql, autocommit=self.autocommit, parameters=self.parameters)
class MsSqlToHiveOperator(BaseOperator): """ Moves data from Microsoft SQL Server to Hive. The operator runs your query against Microsoft SQL Server, stores the file locally before loading it into a Hive table. If the ``create`` or ``recreate`` arguments are set to ``True``, a ``CREATE TABLE`` and ``DROP TABLE`` statements are generated. Hive data types are inferred from the cursor's metadata. Note that the table generated in Hive uses ``STORED AS textfile`` which isn't the most efficient serialization format. If a large amount of data is loaded and/or if the table gets queried considerably, you may want to use this operator only to stage the data into a temporary table before loading it into its final destination using a ``HiveOperator``. :param sql: SQL query to execute against the Microsoft SQL Server database. (templated) :param hive_table: target Hive table, use dot notation to target a specific database. (templated) :param create: whether to create the table if it doesn't exist :param recreate: whether to drop and recreate the table at every execution :param partition: target partition as a dict of partition columns and values. (templated) :param delimiter: field delimiter in the file :param mssql_conn_id: source Microsoft SQL Server connection :param hive_cli_conn_id: Reference to the :ref:`Hive CLI connection id <howto/connection:hive_cli>`. :param tblproperties: TBLPROPERTIES of the hive table being created """ template_fields: Sequence[str] = ('sql', 'partition', 'hive_table') template_ext: Sequence[str] = ('.sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { 'sql': 'tsql' if 'tsql' in wwwutils.get_attr_renderer() else 'sql' } ui_color = '#a0e08c' def __init__( self, *, sql: str, hive_table: str, create: bool = True, recreate: bool = False, partition: Optional[Dict] = None, delimiter: str = chr(1), mssql_conn_id: str = 'mssql_default', hive_cli_conn_id: str = 'hive_cli_default', tblproperties: Optional[Dict] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.sql = sql self.hive_table = hive_table self.partition = partition self.create = create self.recreate = recreate self.delimiter = delimiter self.mssql_conn_id = mssql_conn_id self.hive_cli_conn_id = hive_cli_conn_id self.partition = partition or {} self.tblproperties = tblproperties @classmethod def type_map(cls, mssql_type: int) -> str: """Maps MsSQL type to Hive type.""" map_dict = { pymssql.BINARY.value: 'INT', pymssql.DECIMAL.value: 'FLOAT', pymssql.NUMBER.value: 'INT', } return map_dict.get(mssql_type, 'STRING') def execute(self, context: "Context"): mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id) self.log.info( "Dumping Microsoft SQL Server query results to local file") with mssql.get_conn() as conn: with conn.cursor() as cursor: cursor.execute(self.sql) with NamedTemporaryFile("w") as tmp_file: csv_writer = csv.writer(tmp_file, delimiter=self.delimiter, encoding='utf-8') field_dict = OrderedDict() col_count = 0 for field in cursor.description: col_count += 1 col_position = f"Column{col_count}" field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1]) csv_writer.writerows(cursor) tmp_file.flush() hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id) self.log.info("Loading file into Hive") hive.load_file( tmp_file.name, self.hive_table, field_dict=field_dict, create=self.create, partition=self.partition, delimiter=self.delimiter, recreate=self.recreate, tblproperties=self.tblproperties, )
# under the License. """This module contains an operator to move data from Hive to MySQL.""" from tempfile import NamedTemporaryFile from typing import TYPE_CHECKING, Dict, Optional, Sequence from airflow.models import BaseOperator from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook from airflow.providers.mysql.hooks.mysql import MySqlHook from airflow.utils.operator_helpers import context_to_airflow_vars from airflow.www import utils as wwwutils if TYPE_CHECKING: from airflow.utils.context import Context # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. MYSQL_RENDERER = 'mysql' if 'mysql' in wwwutils.get_attr_renderer() else 'sql' class HiveToMySqlOperator(BaseOperator): """ Moves data from Hive to MySQL, note that for now the data is loaded into memory before being pushed to MySQL, so this operator should be used for smallish amount of data. :param sql: SQL query to execute against Hive server. (templated) :param mysql_table: target MySQL table, use dot notation to target a specific database. (templated) :param mysql_conn_id: source mysql connection :param metastore_conn_id: Reference to the :ref:`metastore thrift service connection id <howto/connection:hive_metastore>`. :param mysql_preoperator: sql statement to run against mysql prior to
class PostgresOperator(BaseOperator): """ Executes sql code in a specific Postgres database :param sql: the SQL code to be executed as a single string, or a list of str (sql statements), or a reference to a template file. Template references are recognized by str ending in '.sql' :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>` reference to a specific postgres database. :param autocommit: if True, each command is automatically committed. (default value: False) :param parameters: (optional) the parameters to render the SQL query with. :param database: name of database which overwrite defined one in connection """ template_fields: Sequence[str] = ('sql', ) # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement. template_fields_renderers = { 'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql' } template_ext: Sequence[str] = ('.sql', ) ui_color = '#ededed' def __init__( self, *, sql: Union[str, Iterable[str]], postgres_conn_id: str = 'postgres_default', autocommit: bool = False, parameters: Optional[Union[Iterable, Mapping]] = None, database: Optional[str] = None, runtime_parameters: Optional[Mapping] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.sql = sql self.postgres_conn_id = postgres_conn_id self.autocommit = autocommit self.parameters = parameters self.database = database self.runtime_parameters = runtime_parameters self.hook: Optional[PostgresHook] = None def execute(self, context: 'Context'): self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database) if self.runtime_parameters: final_sql = [] sql_param = {} for param in self.runtime_parameters: set_param_sql = f"SET {{}} TO %({param})s;" dynamic_sql = SQL(set_param_sql).format(Identifier(f"{param}")) final_sql.append(dynamic_sql) for param, val in self.runtime_parameters.items(): sql_param.update({f"{param}": f"{val}"}) if self.parameters: sql_param.update(self.parameters) if isinstance(self.sql, str): final_sql.append(SQL(self.sql)) else: final_sql.extend(list(map(SQL, self.sql))) self.hook.run(final_sql, self.autocommit, parameters=sql_param) else: self.hook.run(self.sql, self.autocommit, parameters=self.parameters) for output in self.hook.conn.notices: self.log.info(output)