def extract_data_source(ti):
    conn = None  # ensure the name exists for the finally block even if get_conn() fails
    try:
        conn = OdbcHook().get_conn()
        # Pull the watermark pushed by the 'catchup_date' task; a scalar task_ids
        # value returns the value itself rather than a one-item list.
        most_recent_date = ti.xcom_pull(key='most_recent_date', task_ids='catchup_date')
        print('most_recent_date:', most_recent_date)
        if not most_recent_date:
            print('run without xcom value')
            sql = 'SELECT TOP 10 * FROM [dbo].[MV_Amarok_DailyProdWellData]'
            df = pd.read_sql(sql, conn)
        else:
            print('run with xcom value')
            sql = 'SELECT TOP 10 * FROM [dbo].[MV_Amarok_DailyProdWellData] WHERE Date > ?'
            df = pd.read_sql(sql, conn, params=[most_recent_date])
        print(sql)
        df.to_json('/home/mdkumarmca/data/daily_production_data.json', orient='records')
    except pyodbc.Error as err:
        print('Failed to read data from table', err)
    finally:
        if conn:
            conn.close()
            print('ODBC connection is closed')
def extract_most_recent_date(ti):
    conn = None  # ensure the name exists for the finally block even if get_conn() fails
    try:
        conn = OdbcHook().get_conn()
        cursor = conn.cursor()
        most_recent_date = ti.xcom_pull(key='most_recent_date', task_ids='catchup_date')
        print(most_recent_date)
        if not most_recent_date:
            sql = ('SELECT MAX(minDate.Date) AS Date '
                   'FROM (SELECT TOP 10 * FROM [dbo].[MV_Amarok_DailyProdWellData] ORDER BY Date) AS minDate')
            cursor.execute(sql)
        else:
            sql = ('SELECT MAX(minDate.Date) AS Date '
                   'FROM (SELECT TOP 10 * FROM [dbo].[MV_Amarok_DailyProdWellData] WHERE Date > ? ORDER BY Date) AS minDate')
            cursor.execute(sql, most_recent_date)
        print(sql)
        row = cursor.fetchone()
        if row is not None:
            # Push the new watermark, truncated to millisecond precision.
            ti.xcom_push(key='most_recent_date',
                         value=row.Date.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
            print(row.Date)
        cursor.close()
    except pyodbc.Error as err:
        print('Failed to read data from table', err)
    finally:
        if conn:
            conn.close()
            print('ODBC connection is closed')
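# The two callables above expect to run as Airflow tasks so that the task
# instance (`ti`) is injected. A minimal wiring sketch, assuming Airflow 2.x;
# the DAG id and schedule are assumptions, while the 'catchup_date' task id
# matches the one referenced by xcom_pull above.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    dag_id='daily_production_export',  # assumed name
    start_date=datetime(2023, 1, 1),
    schedule_interval='@daily',
    catchup=False,
) as dag:
    catchup_date = PythonOperator(
        task_id='catchup_date',  # the task id pulled by both callables
        python_callable=extract_most_recent_date,
    )
    extract_data = PythonOperator(
        task_id='extract_data_source',
        python_callable=extract_data_source,
    )
    catchup_date >> extract_data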
def get_data(self):
    # Use the hook's own connection id attribute (odbc_conn_id); a generic
    # conn_id kwarg would silently fall back to the default connection.
    odbc_hook = OdbcHook(odbc_conn_id=self.odbc_conn_id)
    with closing(odbc_hook.get_conn()) as conn:
        with closing(conn.cursor()) as cur:
            cur.execute(self.odbc_sql)
            rows = cur.fetchall()
    # Keep only the first column of each row.
    rows = [row[0] for row in rows]
    return rows
def del_duplicados(**kwargs):
    try:
        # Call the stored procedure that removes duplicates from the 'Unificado'
        # table; the parameter value is a single-quoted T-SQL string literal.
        query = "EXEC [dbo].[Remover_Duplicados] @NOMBRE_TABLA = 'Unificado'"
        odbc_hook = OdbcHook(odbc_conn_id='con_ej_pi_mssql',
                             database='Testing_ETL',
                             driver='{ODBC Driver 17 for SQL Server}')
        odbc_hook.run(query)
    except Exception as e:
        logging.error(f'Query execution failed: {query} - {e}')
def load_to_db(query, i):
    try:
        odbc_hook = OdbcHook(odbc_conn_id='con_ej_pi_mssql',
                             database='Testing_ETL',
                             driver='{ODBC Driver 17 for SQL Server}')
        odbc_hook.run(query)
        # Count the successfully executed statement.
        return i + 1
    except Exception as e:
        logging.error(f'Query execution failed: {query} - {e}')
        return i
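# A possible caller for load_to_db, counting how many statements succeeded;
# the statements and table below are invented for illustration only.
import logging

statements = [
    "INSERT INTO [dbo].[Unificado] (id, valor) VALUES (1, 'a')",
    "INSERT INTO [dbo].[Unificado] (id, valor) VALUES (2, 'b')",
]

loaded = 0
for stmt in statements:
    loaded = load_to_db(stmt, loaded)  # returns i + 1 on success, i on failure
logging.info('Loaded %d of %d statements', loaded, len(statements))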
def get_hook(self=None, hook_params=None, conn_params=None):
    hook_params = hook_params or {}
    conn_params = conn_params or {}
    connection = Connection(
        **{
            **dict(login='******', password='******', host='host', schema='schema', port=1234),
            **conn_params,
        }
    )
    hook = OdbcHook(**hook_params)
    # Replace the connection lookup with a mock so no metadata database is needed.
    hook.get_connection = mock.Mock()
    hook.get_connection.return_value = connection
    return hook
def sample_select():
    odbc_hook = OdbcHook()
    # The hook above uses the 'odbc_default' connection,
    # which you must set up in Admin -> Connections:
    # http://airflow.apache.org/docs/apache-airflow-providers-odbc/stable/connections/odbc.html#configuring-the-connection
    cnxn = odbc_hook.get_conn()
    cursor = cnxn.cursor()
    cursor.execute("SELECT @@SERVERNAME, @@VERSION;")
    row = cursor.fetchone()
    while row:
        print("Server Name: " + row[0])
        print("Server Version: " + row[1])
        row = cursor.fetchone()
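# One way to supply the 'odbc_default' connection outside the UI is Airflow's
# AIRFLOW_CONN_<CONN_ID> environment variable. The host, credentials and
# database below are placeholders; depending on the ODBC provider version the
# driver may have to be passed to OdbcHook(driver=...) instead of (or in
# addition to) the connection extras.
import os

os.environ['AIRFLOW_CONN_ODBC_DEFAULT'] = (
    'odbc://my_user:my_password@my_host:1433/my_db'
    '?driver=ODBC+Driver+17+for+SQL+Server'
)

sample_select()  # prints the server name and version of the configured instance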
def _rank_movies(odbc_conn_id, wasb_conn_id, ratings_container, rankings_container, **context):
    year = context["execution_date"].year
    month = context["execution_date"].month

    # Determine storage account name, needed for query source URL.
    blob_account_name = WasbHook.get_connection(wasb_conn_id).login

    query = RANK_QUERY.format(
        year=year,
        month=month,
        blob_account_name=blob_account_name,
        blob_container=ratings_container,
    )
    logging.info(f"Executing query: {query}")

    odbc_hook = OdbcHook(odbc_conn_id, driver="ODBC Driver 17 for SQL Server")
    with odbc_hook.get_conn() as conn:
        with conn.cursor() as cursor:
            cursor.execute(query)

            rows = cursor.fetchall()
            colnames = [field[0] for field in cursor.description]

    ranking = pd.DataFrame.from_records(rows, columns=colnames)
    logging.info(f"Retrieved {ranking.shape[0]} rows")

    # Write ranking to temp file.
    logging.info(f"Writing results to {rankings_container}/{year}/{month:02d}.csv")
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = path.join(tmp_dir, "ranking.csv")
        ranking.to_csv(tmp_path, index=False)

        # Upload file to Azure Blob.
        wasb_hook = WasbHook(wasb_conn_id)
        wasb_hook.load_file(
            tmp_path,
            container_name=rankings_container,
            blob_name=f"{year}/{month:02d}.csv",
        )
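# A hypothetical task definition for _rank_movies; the conn ids, container
# names and the surrounding `dag` object are assumptions, not taken from the
# original pipeline.
from airflow.operators.python import PythonOperator

rank_movies = PythonOperator(
    task_id='rank_movies',
    python_callable=_rank_movies,
    op_kwargs={
        'odbc_conn_id': 'my_odbc',
        'wasb_conn_id': 'my_wasb',
        'ratings_container': 'ratings',
        'rankings_container': 'rankings',
    },
    dag=dag,
)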
def execute(self, context):
    mongo_hook = MongoHook(conn_id=self.mongo_conn_id)
    self.mongo_db = mongo_hook.connection.schema

    log.info('odbc_conn_id: %s', self.odbc_conn_id)
    log.info('postgres_conn_id: %s', self.postgres_conn_id)
    log.info('mongo_conn_id: %s', self.mongo_conn_id)
    log.info('postgres_sql: %s', self.postgres_sql)
    # log.info('prev_exec_date: %s', self.prev_exec_date)
    log.info('mongo_db: %s', self.mongo_db)
    log.info('mongo_collection: %s', self.mongo_collection)

    odbc_hook = OdbcHook(self.odbc_conn_id)
    well_data = self.get_data()
    log.info('postgres well data: %s', well_data)

    most_recent_date = self.get_most_recent_date(mongo_hook)
    print(most_recent_date)
    if most_recent_date:
        print('store most recent date inside airflow variable')
        Variable.set("most_recent_date", most_recent_date)

    with closing(odbc_hook.get_conn()) as conn:
        for index, well in well_data.iterrows():
            print(well['well_name'], well['is_newly_added'])
            if well is not None and well['is_newly_added']:
                # Newly added wells get a full load; existing wells are loaded incrementally.
                sql = 'SELECT * FROM [dbo].[MV_Amarok_DailyProdWellDemoData] WHERE Name = ?'
                df = pd.read_sql(sql, conn, params=[well['well_name']])
            else:
                sql = 'SELECT * FROM [dbo].[MV_Amarok_DailyProdWellDemoData] WHERE Name = ? AND Date > ?'
                df = pd.read_sql(sql, conn, params=[well['well_name'], most_recent_date])
            if not df.empty:
                data_dict = df.to_dict("records")
                self.insert_records(mongo_hook, data_dict)
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.providers.mysql.hooks.mysql import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.providers.postgres.hooks.postgres import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.providers.apache.pig.hooks.pig import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.providers.apache.hive.hooks.hive import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.providers.presto.hooks.presto import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.providers.sqlite.hooks.sqlite import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.providers.jdbc.hooks.jdbc import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'odbc':
        from airflow.providers.odbc.hooks.odbc import OdbcHook
        return OdbcHook(odbc_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.providers.oracle.hooks.oracle import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.providers.vertica.hooks.vertica import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.providers.cloudant.hooks.cloudant import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.providers.jira.hooks.jira import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.providers.redis.hooks.redis import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.providers.docker.hooks.docker import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.providers.microsoft.azure.hooks.azure_data_lake import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.providers.microsoft.azure.hooks.azure_cosmos import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.providers.mongo.hooks.mongo import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSQLDatabaseHook
        return CloudSQLDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.providers.grpc.hooks.grpc import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
def check_datasource_connection():
    # Open and immediately close a cursor to verify that the 'odbc_default'
    # connection is reachable; any pyodbc error propagates and fails the task.
    conn = OdbcHook().get_conn()
    cursor = conn.cursor()
    cursor.close()
    conn.close()
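# A slightly stricter variant of the check above (a sketch, not the original
# code): execute a trivial query so that a bad login or missing driver fails
# the task immediately instead of only when real extraction runs.
from contextlib import closing

from airflow.providers.odbc.hooks.odbc import OdbcHook


def check_datasource_roundtrip():
    with closing(OdbcHook().get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute('SELECT 1')
            assert cursor.fetchone()[0] == 1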