def execute_from_file(connection_name, query):
    try:
        logging.info('Executing: ' + str(query))
        hook = MsSqlHook(mssql_conn_id=connection_name)
        hook.run(str(query))
    except Exception as e:
        raise AirflowException(e)
def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
    })

    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)

    # Write csv
    logging.info('Writing ' + str(df.shape[0]))
    general.pos_write_csv(
        df, cond_file, date_format=conf['date_format_ymd'])
    return "Successfully wrote prod file"
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    self.logger.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        self.logger.info("Loading file into Hive")
        hive.load_file(f.name,
                       self.hive_table,
                       field_dict=field_dict,
                       create=self.create,
                       partition=self.partition,
                       delimiter=self.delimiter,
                       recreate=self.recreate,
                       tblproperties=self.tblproperties)
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id,
                     schema=self.database)
    hook.run(self.sql, autocommit=self.autocommit, parameters=self.parameters)
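A minimal sketch of how an execute() like the one above is normally driven from a DAG rather than called directly. The connection id ('mssql_default'), database name ('master'), and the classic airflow.operators.mssql_operator import path are assumptions, not taken from the snippet:

from datetime import datetime

from airflow import DAG
from airflow.operators.mssql_operator import MsSqlOperator

with DAG(dag_id='mssql_operator_sketch',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None,
         catchup=False) as dag:
    run_query = MsSqlOperator(
        task_id='run_query',
        mssql_conn_id='mssql_default',  # assumed connection id
        database='master',              # assumed database
        sql='SELECT 1;',                # placeholder statement
        autocommit=True,
    )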
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    logging.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def _query_mssql(self):
    """
    Queries MSSQL and returns a cursor of results.

    :return: mssql cursor
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    return cursor
def query(self):
    """
    Queries MSSQL and returns a cursor of results.

    :return: mssql cursor
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    return cursor
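A hedged sketch of how the cursor returned by a helper like query() above is typically consumed, here streamed into a CSV file with the standard csv module; the method name and file path are illustrative assumptions:

import csv

def _cursor_to_csv(self, path):
    # Hypothetical consumer: write the cursor produced by self.query() to a CSV file.
    cursor = self.query()
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([col[0] for col in cursor.description])  # header from cursor metadata
        writer.writerows(cursor)  # DB-API cursors are iterable row by row
    cursor.close()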
def get_streets_paving_data():
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = pv_conn.get_pandas_df(pv_query)
    results = df.shape[0]

    general.pos_write_csv(
        df, temp_file)

    return f"Successfully wrote temp file with {results} records"
def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)

    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')

    general.pos_write_csv(df,
                          temp_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Retrieved special events to temp file."
def _query_mssql(self):
    """
    Query the mssql instance using the mssql hook and return a dataframe.
    Using a dataframe makes working with JSON objects much easier.

    :return df: the dataframe that relates to the sql query.
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    logging.info("Connected to mssql db {0}".format(conn))
    # TODO: switch this to mssql.get_pandas_df()
    for df in pd.read_sql(self.sql, conn, chunksize=25000):
        yield df
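A hedged sketch of how a chunked generator like _query_mssql above is usually consumed, appending each 25,000-row chunk to a CSV so the full result never sits in memory; the method name and output path are assumptions:

def _write_chunks_to_csv(self, path):
    # Hypothetical consumer: stream chunks from _query_mssql() straight to disk.
    first = True
    for chunk in self._query_mssql():
        chunk.to_csv(path, mode='w' if first else 'a', header=first, index=False)
        first = False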
def execute(self, context):
    mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    df = mssql_hook.get_pandas_df(self.select_sql)

    # Convert column data types to pandas nullable Int64
    if self.int_columns:
        for col in self.int_columns:
            df[col] = df[col].astype("Int64")

    # Create folder if it does not exist
    if not os.path.exists(self.target_file_dir):
        os.mkdir(self.target_file_dir)

    file_path = os.path.join(self.target_file_dir, self.file_name)
    df.to_csv(file_path, index=False)
def execute(self, context): self.log.info("Querying data from source: {0}".format( self.mssql_source_conn_id)) src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id) df = src_mssql_hook.get_pandas_df(sql=self.source_sql, parameters=self.source_sql_params) rows_total = df.shape[0] self.log.info("Writing data to {0}.".format(self.destination_filepath)) df.to_csv(self.destination_filepath, sep=self.sep, index=False) self.log.info("Total inserted to file: {0} rows".format(rows_total)) return rows_total
def poke(self, context):
    hook = MsSqlHook.get_connection(self.conn_id).get_hook()

    record = hook.get_first(sql=self.sql, parameters=self.params)
    if not record:
        return False
    return True if record[0] == 1 else False
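For comparison, a hedged sketch of expressing this kind of check with the stock SqlSensor (whose success criterion differs slightly from the poke above); the connection id, SQL, and poke interval are assumptions:

from datetime import datetime

from airflow import DAG
from airflow.sensors.sql_sensor import SqlSensor

with DAG(dag_id='mssql_sensor_sketch',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None,
         catchup=False) as dag:
    wait_for_flag = SqlSensor(
        task_id='wait_for_flag',
        conn_id='mssql_default',  # assumed connection id
        sql='SELECT COUNT(*) FROM dbo.load_flags;',  # assumed query; succeeds once the first cell is non-zero/non-empty
        poke_interval=300,
    )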
def __init__(self,
             source_conn_id,
             s3_conn_id,
             s3_bucket,
             s3_key,
             query_string,
             query_merge_field=None,
             *args, **kwargs):
    super().__init__(*args, **kwargs)

    # Define our connections and create hooks to our DB and our storage
    self.source_conn_id = source_conn_id
    self.s3_conn_id = s3_conn_id
    self.source_hook = MsSqlHook(self.source_conn_id)
    self.s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)

    # Get our DB query + merge fields
    self.query_string = query_string
    self.query_merge_field = query_merge_field

    # Get our S3 bucket and key
    self.s3_bucket = s3_bucket
    self.s3_key = s3_key

    # Get the session and endpoint url
    self.session = self.s3_hook.get_session()
    self.endpoint_url = self.s3_hook.get_endpoint_url()
def update_finger_data():
    import pandas as pd
    import os
    import requests
    import datetime
    from pprint import pprint
    from os.path import join, dirname
    from airflow.hooks.mssql_hook import MsSqlHook

    try:
        yest = datetime.datetime.now() - datetime.timedelta(days=1)
        dt_string = yest.date().strftime("%Y-%m-%d")

        hook = MsSqlHook(mssql_conn_id="odoo_finger")
        conn = hook.get_conn()  # what is the output of conn ?
        df = pd.read_sql(
            "SELECT max(bsevtdt) as checkout, min(bsevtdt) as checkin, user_id "
            "FROM TA.dbo.punchlog "
            "WHERE CONVERT(date, createdAt) = CONVERT(date, GETDATE() - 1) "
            "GROUP BY user_id;",
            conn)  # catch read_sql connection errors

        attendances = []
        for line in range(0, len(df)):
            attendances.append({
                'check_in': df['checkin'][line].isoformat(),
                'check_out': df['checkout'][line].isoformat(),
                'emp_code': df['user_id'][line],
                'index': line
            })

        DOMAIN = "http://10.0.1.49/b/v1"
        ADD_ATT = DOMAIN + "/attendance/add"
        json_data = {
            'attendances': attendances,
            'tz': 'Asia/Kuwait',
            'name': dt_string,
            'db': 'Boutiquaat_Test',
            'login': '******',
            'pswd': 'admin',
        }
        print(json_data, "PPPPPPPPPPPPPP")
        response = requests.post(ADD_ATT, json=json_data)
        print('__________ Response : ')
        pprint(response.json())
    except Exception as e:
        raise AirflowException(e)
def execute(self, context):
    logging.info('MS SQL Query Operator Starting')

    hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    if isinstance(self.s3_conn_id, str):
        self.execute_results_to_s3(hook=hook)
    else:
        self.execute_no_results(hook=hook)

    logging.info('MS SQL Query Operator Complete')
def workflow(src_conn_id, src_schema, dt, target_conn_id, target_table):
    # TODO Split code into functions
    etl_conn = PostgresHook(postgres_conn_id='etl').get_conn()

    with Session(etl_conn, f'{dag.dag_id}__{src_schema}') as session:
        # Load data
        source_conn = MsSqlHook(mssql_conn_id=src_conn_id, schema=src_schema).get_conn()

        query = f"""
            SELECT id, start_time, end_time, type, data
            FROM dbo.Orders
            WHERE CONVERT(DATE, start_time) = '{dt}'
            """
        df = pd.read_sql_query(query, source_conn)

        # Skip if no rows
        if df.empty:
            raise AirflowSkipException('No rows to load')

        # Add service fields
        df['etl_source'] = src_schema
        df['etl_id'] = session.id
        df['hash_id'] = hash_pandas_object(df[['etl_source', 'id']])

        # Export data to CSV buffer
        buffer = StringIO()
        df.to_csv(buffer,
                  index=False,
                  sep='|',
                  na_rep='NUL',
                  quoting=csv.QUOTE_MINIMAL,
                  header=False,
                  float_format='%.8f',
                  doublequote=False,
                  escapechar='\\')
        buffer.seek(0)

        # Push CSV
        target_conn = VerticaHook(vertica_conn_id=target_conn_id).get_conn()

        # Build a comma-separated column list (a plain list repr would break the COPY statement)
        copy_stmt = f"""
            COPY {target_table}({', '.join(df.columns)})
            FROM STDIN
            DELIMITER '|'
            ENCLOSED '"'
            ABORT ON ERROR
            NULL 'NUL'
            """

        cursor = target_conn.cursor()
        cursor.copy(copy_stmt, buffer)

        session.loaded_rows = cursor.rowcount
        session.successful = True
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""
    logging.info("Get fire department data from CAD archive")
    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")
    df = fd_conn.get_pandas_df(fd_query)

    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file
def pg_to_mssql(**kwargs):
    pg_hook = PostgresHook('etl_postgres')
    mssql_hook = MsSqlHook('mssql_wtproperty')

    # The fields we want to work with
    target_fields = [
        'master_account_num', 'master_account_name', 'sub_account_num',
        'sub_account_name', 'short_name', 'account_status', 'group_num',
        'item_num', 'original_balance', 'fed_withholding_tax_this_period',
        'ytd_fed_withholding_tax', 'int_paid_this_period', 'ytd_int_paid',
        'int_split_this_period', 'escrow_balance'
    ]

    # pull records from postgres
    recs = pg_hook.get_records(
        f"select {', '.join(target_fields)} from escrow.escrow")

    # insert records to mssql
    mssql_hook.insert_rows('property_data_escrowbalance', recs, target_fields)
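A hedged sketch of attaching a callable like pg_to_mssql above to a DAG with PythonOperator; the dag id, schedule, and provide_context usage (Airflow 1.x style) are assumptions:

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id='pg_to_mssql_sketch',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:
    copy_escrow = PythonOperator(
        task_id='pg_to_mssql',
        python_callable=pg_to_mssql,  # the function defined above
        provide_context=True,         # passes context as **kwargs (Airflow 1.x)
    )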
def get_table_cols_name(conn_id: str, schema: str, table: str):
    """
    Get the list of column names for a table.
    """
    conn_values = BaseHook.get_connection(conn_id)
    if conn_values.conn_type == 'mssql':
        db_hook = MsSqlHook(mssql_conn_id=conn_id)
    elif conn_values.conn_type == 'postgres':
        db_hook = PostgresHook(postgres_conn_id=conn_id)
    else:
        raise Exception('Conn_type not implemented.')

    with db_hook.get_conn() as db_conn:
        with db_conn.cursor() as db_cur:
            db_cur.execute(f'SELECT * FROM {schema}.{table} WHERE 1=2')
            column_names = [tup[0] for tup in db_cur.description]

    return column_names
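A small usage sketch for the helper above; the connection id, schema, and table name are assumptions:

cols = get_table_cols_name(conn_id='mssql_default', schema='dbo', table='orders')
print(cols)  # e.g. ['id', 'start_time', 'end_time', ...]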
def _table_column_max_string(db_hook: MsSqlHook, table: str, column: str):
    """
    Compute the maximum value of the column (column) in the table (table).
    If the column is 'dataalteracao', the returned string is formatted.
    """
    sql = f"SELECT MAX({column}) FROM {table};"
    max_value = db_hook.get_first(sql)[0]

    # TODO: determine whether the column is a date from the DB type
    if column == 'dataalteracao':
        return max_value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    else:
        return str(max_value)
def get_sidewalk_gis(**kwargs):
    """Get sidewalk geodatabase from shared drive."""
    sde_query = general.file_to_string('./sql/sidewalk_sde.sql', __file__)
    sde_conn = MsSqlHook(mssql_conn_id='atlas_sde')
    #sde_server = conf['sde_server']
    #sde_user = conf['sde_user']
    #sde_pw = conf['sde_pw']
    #sde_conn = pymssql.connect(sde_server, sde_user, sde_pw, 'sdw')
    #query = "SELECT *, [Shape].STAsText() as geom FROM SDW.IAMSD.SIDEWALK"
    df = sde_conn.get_pandas_df(sde_query)

    df.columns = [x.lower() for x in df.columns]

    df = df.drop('shape', 1)

    df = df.rename(columns={
        'sapid': 'seg_id',
        'cdcode': 'council',
        'cpcode': 'comm_plan',
        'legacy_id': 'geojoin_id',
        'iamfloc': 'fun_loc_id',
        'loc_descr': 'loc_desc'
    })

    df = df.fillna('')

    rows = df.shape[0]
    logging.info('processed {} rows'.format(rows))

    geospatial.df2shp(df=df,
                      folder=prod_dir,
                      layername=layername,
                      dtypes=dtypes,
                      gtype=gtype,
                      epsg=2230)
    return "Converted table to shapefile"
def __init__(self,
             source_conn_id,
             target_conn_id,
             target_schema,
             target_table,
             query_string,
             query_merge_field=None,
             *args, **kwargs):
    super().__init__(*args, **kwargs)

    # Create connections to our source and target db
    self.source_db = MsSqlHook(source_conn_id)
    self.target_db = PostgresHook(target_conn_id)

    # If there are additional parameters in the query, then take care of them
    self.query = query_string

    # Get our target db schema and table
    self.target_schema = target_schema
    self.target_table = target_table
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])

        save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())

        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_hook(self):
    try:
        if self.conn_type == 'mysql':
            from airflow.hooks.mysql_hook import MySqlHook
            return MySqlHook(mysql_conn_id=self.conn_id)
        elif self.conn_type == 'google_cloud_platform':
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook
            return BigQueryHook(bigquery_conn_id=self.conn_id)
        elif self.conn_type == 'postgres':
            from airflow.hooks.postgres_hook import PostgresHook
            return PostgresHook(postgres_conn_id=self.conn_id)
        elif self.conn_type == 'hive_cli':
            from airflow.hooks.hive_hooks import HiveCliHook
            return HiveCliHook(hive_cli_conn_id=self.conn_id)
        elif self.conn_type == 'presto':
            from airflow.hooks.presto_hook import PrestoHook
            return PrestoHook(presto_conn_id=self.conn_id)
        elif self.conn_type == 'hiveserver2':
            from airflow.hooks.hive_hooks import HiveServer2Hook
            return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
        elif self.conn_type == 'sqlite':
            from airflow.hooks.sqlite_hook import SqliteHook
            return SqliteHook(sqlite_conn_id=self.conn_id)
        elif self.conn_type == 'jdbc':
            from airflow.hooks.jdbc_hook import JdbcHook
            return JdbcHook(jdbc_conn_id=self.conn_id)
        elif self.conn_type == 'mssql':
            from airflow.hooks.mssql_hook import MsSqlHook
            return MsSqlHook(mssql_conn_id=self.conn_id)
        elif self.conn_type == 'oracle':
            from airflow.hooks.oracle_hook import OracleHook
            return OracleHook(oracle_conn_id=self.conn_id)
        elif self.conn_type == 'vertica':
            from airflow.contrib.hooks.vertica_hook import VerticaHook
            return VerticaHook(vertica_conn_id=self.conn_id)
        elif self.conn_type == 'cloudant':
            from airflow.contrib.hooks.cloudant_hook import CloudantHook
            return CloudantHook(cloudant_conn_id=self.conn_id)
        elif self.conn_type == 'jira':
            from airflow.contrib.hooks.jira_hook import JiraHook
            return JiraHook(jira_conn_id=self.conn_id)
        elif self.conn_type == 'redis':
            from airflow.contrib.hooks.redis_hook import RedisHook
            return RedisHook(redis_conn_id=self.conn_id)
        elif self.conn_type == 'wasb':
            from airflow.contrib.hooks.wasb_hook import WasbHook
            return WasbHook(wasb_conn_id=self.conn_id)
        elif self.conn_type == 'docker':
            from airflow.hooks.docker_hook import DockerHook
            return DockerHook(docker_conn_id=self.conn_id)
    except:
        pass
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""
    logging.info("Get fire department data from Data Base")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")
    df = fd_conn.get_pandas_df(fd_query)

    df = df.rename(columns={
        'city': 'address_city',
        'response_month': 'month_response',
        'response_year': 'year_response'
    })

    logging.info("Writing {} rows to prod".format(df.shape[0]))

    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def get_onbase_test():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_test_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def execute(self, context):
    source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)

    # Assumption 1: there is sufficient disk space to download the blob in question
    # Assumption 2: the file is a correctly formatted csv file
    with NamedTemporaryFile(mode='a+', delete=True) as f:
        source_hook.get_file(file_path=f.name,
                             container_name=self.src_blob_container,
                             blob_name=self.src_blob)
        f.flush()
        self.log.info("Saving file to %s", f.name)

        csv_reader = reader(f)
        list_of_tuples = list(map(tuple, csv_reader))
        self.log.info(list_of_tuples)

        self.log.info(f"Inserting into {self.dest_table}")
        hook = MsSqlHook(mssql_conn_id=self.azure_sql_conn_id,
                         schema=self.database)
        hook.insert_rows(self.dest_table, list_of_tuples)
        self.log.info(f"Data inserted into {self.database}.{self.dest_table}")
def execute(self, context): self.log.info("Querying data from source: {0}".format( self.mssql_source_conn_id)) src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id) df = src_mssql_hook.get_pandas_df(sql=self.source_sql, parameters=self.source_sql_params) rows_total = df.shape[0] self.log.info("Writing data to {0}.".format(self.destination_filepath)) writer = pd.ExcelWriter(self.destination_filepath, engine=self.excel_engine) if self.write_mode == 'append_tab' and path.isfile( self.destination_filepath): writer.book = load_workbook(self.destination_filepath) pandas.io.formats.excel.header_style = None if self.sheet_name is not None: df.to_excel(writer, sheet_name=self.sheet_name, index=False) else: df.to_excel(writer, index=False) writer.save() self.log.info("Total inserted to file: {0} rows".format(rows_total))
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])

        if mode == 'schedule_24':
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        else:
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())

        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = (dn.table_name('schedule_daily')
                  + dn.table_name('schedule_hourly_15')
                  + dn.table_name('schedule_hourly_30'))
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = conf['prod_data_dir'] + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)
        except Exception as e:
            logging.info(f'Could not read {name} because {e}')
    return "Successfully retrieved Documentum tables"
def _build_filter_condition(dest_hook: MsSqlHook, table: str, date_column: str, key_column: str):
    if date_column:
        sql = f"SELECT MAX({date_column}) FROM {table}"
    else:
        sql = f"SELECT MAX({key_column}) FROM {table}"

    max_value = dest_hook.get_first(sql)[0]
    if date_column:
        max_value = max_value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        where_condition = f"{date_column} > '{max_value}'"
    else:
        max_value = str(max_value)
        where_condition = f"{key_column} > '{max_value}'"

    return max_value, where_condition
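A hedged sketch of using _build_filter_condition above for an incremental pull; the connection ids, table, and column names are assumptions:

dest_hook = MsSqlHook(mssql_conn_id='mssql_dest')      # assumed destination connection
source_hook = MsSqlHook(mssql_conn_id='mssql_source')  # assumed source connection

max_value, where_condition = _build_filter_condition(
    dest_hook, table='dbo.orders', date_column='updated_at', key_column=None)

df = source_hook.get_pandas_df(f"SELECT * FROM dbo.orders WHERE {where_condition}")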
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
def execute(self, context):
    _log.info('Executing: ' + str(self.sql))
    hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    hook.run(self.sql, autocommit=self.autocommit, parameters=self.parameters)
def execute(self, context):
    logging.info('Executing: ' + str(self.sql))
    hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    hook.run(self.sql, parameters=self.parameters)
def get_streets_paving_data(mode='sdif', **kwargs):
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    moratorium_string = "Post Construction"
    phone_UTLY = "858-627-3200"
    phone_OTHER = "619-527-7500"
    TSW_PM = "*****@*****.**"
    UTLY_PM = "*****@*****.**"
    ACT_OVERLAY_CONCRETE_PM = "*****@*****.**"
    ACT_SLURRY_SERIES_PM = "*****@*****.**"

    # Different String for imcat mode.
    if mode == 'imcat':
        moratorium_string = "Post-Construction"

    df = pv_conn.get_pandas_df(pv_query)

    for i in ['seg_id', 'rd_seg_id', 'wo_id', 'wo_name', 'wo_status',
              'wo_proj_type', 'job_activity', 'seg_func_class']:
        df[i] = df[i].astype(str)

    df['job_completed_cbox'] = df['job_completed_cbox'].astype(bool)

    # Backfill - set all fields to mora
    df.loc[df.wo_status.str.contains(
        'post construction|moratorium|post-construction',
        regex=True,
        case=False), "wo_status"] = moratorium_string

    # Remove Records w/o A Completed Date ONLY in the UTLY and TSW work order
    # IMCAT ONLY
    if mode == 'imcat':
        df = df.query('not '
                      + '((wo_id == "UTLY" & job_end_dt.isnull()) '
                      + 'or (wo_id == "TSW" & job_end_dt.isnull()))')

        # Remove empty activities (IMCAT ONLY)
        df = df.query('not '
                      + '(job_activity.isnull() '
                      + '| job_activity == "" '
                      + '| job_activity == "None")')

    # Remove Data Entry
    # Remove mill / pave
    # Remove Structure Widening
    # Remove Patching
    if mode == 'imcat':
        remove_search = 'data entry|mill|structure wid|patching'
    else:
        remove_search = 'data entry|structure wid|patching'

    df = df[~(df.job_activity.str.contains(
        remove_search, regex=True, case=False))]

    # Search Strings
    concrete_search = "panel rep|pcc - reconstruc"
    slurry_search = "surface treatment|scrub seal|cape seal|central mix"
    overlay_search = "resurfacing|overlay|ac overlay|mill|ac - overlay|ac - ac overlay|ac - reconstruct|ac inlay"

    # Set Proj Type to NA
    df['wo_proj_type'] = None

    # Concrete
    df.loc[df.job_activity.str.contains(
        concrete_search, regex=True, case=False), 'wo_proj_type'] = 'Concrete'

    # Slurry
    df.loc[df.job_activity.str.contains(
        slurry_search, regex=True, case=False), 'wo_proj_type'] = 'Slurry'

    # Overlay
    df.loc[df.job_activity.str.contains(
        overlay_search, regex=True, case=False), 'wo_proj_type'] = 'Overlay'

    # Remove All Records over 5 Years Old;
    #pv <- pv[(as.Date(pv$job_end_dt) > (today() - years(5))) | is.na(pv$job_end_dt),]

    # Create ref dates
    #today = kwargs['execution_date']
    today = general.today()
    five_yrs_ago = today.replace(year=(today.year - 5))
    three_yrs_ago = today.replace(year=(today.year - 3))

    # Remove records
    df = df[(df.job_end_dt > five_yrs_ago) | (df.job_end_dt.isnull())]

    # Remove Slurry Records > 3 Years Old
    # IMCAT ONLY
    if mode == 'imcat':
        mask = ~((df.wo_proj_type == 'Slurry') & (df.job_end_dt < three_yrs_ago))
        df = df[mask]

    # Create a feature for completed jobs
    df['final_job_completion_state'] = False
    #pv[(!is.na(pv$job_end_dt) & pv$job_completed_cbox == 1), "final_job_completion_state"] <- 1
    df.loc[df.job_end_dt.notnull(), "final_job_completion_state"] = True

    # Set all completed jobs to Moratorium status
    df.loc[df.final_job_completion_state == True, "wo_status"] = moratorium_string

    # Set Dates in The future for TSW work orders as Construction.
    mask = (df.wo_id == 'TSW') & \
           (df.job_end_dt.notnull()) & \
           (df.job_end_dt > today)
    df.loc[mask, "wo_status"] = "Construction"

    # Set Phone # For UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm_phone'] = phone_UTLY
    # Set Phone # for Everything else
    df.loc[df.wo_id != 'UTLY', 'wo_pm_phone'] = phone_OTHER

    # Set PM for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm'] = UTLY_PM
    # Set PM for TSW
    df.loc[df.wo_id == 'TSW', 'wo_pm'] = TSW_PM

    # Set PM for Overlay / Concrete
    #mask = (df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete') & (df.wo_pm.isnull())
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete'))
    df.loc[mask, 'wo_pm'] = ACT_OVERLAY_CONCRETE_PM

    # Set PM for Slurry / Series
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Slurry') | (df.wo_proj_type == 'Series Circuit'))
    df.loc[mask, 'wo_pm'] = ACT_SLURRY_SERIES_PM

    # Spot Unknown
    mask = (df.job_activity.isnull()) | (df.job_activity == None) | (df.job_activity == 'None') | (df.job_activity == '') \
        | (df.wo_proj_type.isnull()) | (df.wo_proj_type == None) | (df.wo_proj_type == 'None') | (df.wo_proj_type == '') \
        | (df.wo_status.isnull()) | (df.wo_status == None) | (df.wo_status == 'None') | (df.wo_status == '')

    spot_unknown = df[mask]
    logging.info('Found {} records with no activity, type or status'.format(
        spot_unknown.shape[0]))

    # Remove unknown
    df = df[~mask]

    # Sort by job end date time
    df = df.sort_values(by='job_end_dt', na_position='last', ascending=False)

    # Remove duplicates, although it doesn't make sense
    # This is wrong.
    df = df.drop_duplicates('seg_id', keep='first')

    # Rename columns we're keeping
    df = df.rename(columns={
        'pve_id': 'PVE_ID',
        'seg_id': 'SEG_ID',
        'wo_id': 'PROJECTID',
        'wo_name': 'TITLE',
        'wo_pm': 'PM',
        'wo_pm_phone': 'PM_PHONE',
        'wo_design_start_dt': 'START',
        'wo_design_end_dt': 'END',
        'wo_resident_engineer': 'RESIDENT_ENGINEER',
        'job_end_dt': 'MORATORIUM',
        'wo_status': 'STATUS',
        'wo_proj_type': 'TYPE',
        'seg_length_ft': 'LENGTH',
        'seg_width_ft': 'WIDTH',
    })

    # Regex for the keeps:
    df = df.filter(regex="^[A-Z0-9]")

    # Remove START and END for projects in moratorium:
    df.loc[df.STATUS == moratorium_string, ['START', 'END']] = None

    # For IMCAT uppercase status
    if mode == 'imcat':
        df['STATUS'] = df['STATUS'].str.upper()

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows in mode ' + mode)
    general.pos_write_csv(
        df, prod_file[mode], date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file at " + prod_file[mode]