def query_pandas(self, query, reuse_odbc_connection=False, args=None):
    """
    Reads a database source into a pandas DataFrame.

    Depending on the configuration of the connector, either ODBC, JDBC via Spark,
    or jaydebeapi JDBC will be used.

    :param reuse_odbc_connection: if set, the connection will be reused
        (currently works only for ODBC sources)
    """
    args = args if args is not None else []
    logger.debug(f"Executing query on connector {str(self)}: {query} with args: {str(args)}")
    if self.supports_odbc:
        logger.debug(" using ODBC")
        conn = self.get_odbc_connection(reuse_odbc_connection)

        def pd_read_sql(conn):
            return pandas.read_sql(query, conn, coerce_float=False, parse_dates=None, params=args)

        try:
            return pd_read_sql(conn)
        except Exception as ex:
            if self.is_connection_broken_error(ex):
                logger.info("Connection is broken, reconnecting.")
                conn = self.get_odbc_connection(False)
                return pd_read_sql(conn)
            else:
                raise
    else:
        if len(args) > 0:
            raise Exception("Query args not supported for spark or jaydebeapi.")
        if self.jdbc_access_method == "spark":
            logger.debug("\nWarning: conversion from spark df to pandas needed.")
            return self.query_spark(query).toPandas()
        else:
            logger.debug("\nUsing jaydebeapi jdbc access method")
            return self.read_jdbc_to_pd_df(query, self.jdbc_driver, self.get_jdbc_conn_string(),
                                           [self.username, self.get_password()])
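
# Illustrative sketch (not part of the connector): how query_pandas might be called with a
# parameterized query over ODBC. The `connector` instance and the BOOKINGS table are assumptions
# for the example; only the query_pandas signature above is taken as given.
def _example_query_pandas(connector):
    # Positional "?" placeholders are bound from `args` on the ODBC path;
    # for the Spark/jaydebeapi paths, args must be empty.
    df = connector.query_pandas(
        "SELECT * FROM BOOKINGS WHERE BOOKING_DATE >= ?",
        reuse_odbc_connection=True,
        args=["2023-01-01"],
    )
    return df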
def query_multiple(self, queries, reuse_odbc_connection=False, args=None, batch_size=1, continue_on_error=False):
    """
    Runs query_pandas on batches of queries, joining each batch with UNION ALL.

    If a query does not start with "SELECT" then the query will be transformed
    to "SELECT * FROM {query}".
    """
    args = args if args is not None else []
    ret = []
    errors = []
    i = 0
    for query in batch(queries, batch_size):
        sql = " UNION ALL ".join(query)
        try:
            df = None
            df = self.query_pandas(sql, reuse_odbc_connection, args=args)
            i = i + 1
            logger.info("Batches completion progress:" + str(i) + "/" + str(len(queries) / batch_size)
                        + " batch size:" + str(batch_size)
                        + " " + str(i * batch_size) + "/" + str(len(queries)))
        except Exception as e:
            logger.debug("ERROR during execution of query:" + str(("ERROR", query, e)))
            if continue_on_error:
                if batch_size > 1:
                    # Retry the failed batch one query at a time to isolate the failing statements.
                    df, error = self.query_multiple(query, reuse_odbc_connection=reuse_odbc_connection,
                                                    args=args, batch_size=1, continue_on_error=True)
                    errors.extend(error)
                else:
                    return (None, [("ERROR", query, e)])
            else:
                raise
        if df is not None:
            ret.append(df)
    if continue_on_error:
        return (pandas.concat(ret), errors)
    else:
        return pandas.concat(ret)
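
# Illustrative sketch (assumption: `connector` is a configured instance of this class and the
# example table names exist). Shows how query_multiple batches several SELECTs into UNION ALL
# statements and, with continue_on_error=True, returns a (DataFrame, errors) tuple instead of raising.
def _example_query_multiple(connector):
    queries = [
        "SELECT 'A' AS src, COUNT(*) AS cnt FROM TABLE_A",
        "SELECT 'B' AS src, COUNT(*) AS cnt FROM TABLE_B",
        "SELECT 'C' AS src, COUNT(*) AS cnt FROM TABLE_C",
    ]
    df, errors = connector.query_multiple(queries, batch_size=2, continue_on_error=True)
    # Each error entry has the shape ("ERROR", query, exception), as produced above.
    for _, failed_query, exc in errors:
        logger.info("Failed query: " + failed_query + " error: " + str(exc))
    return df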
def execute_statement(self, statement, add_col_names=False, reuse_odbc_connection=False, ignore_error=False,
                      args=None, commit=False):
    if self.supports_odbc:
        args = args if args is not None else []
        conn = self.get_odbc_connection(reuse_odbc_connection)
        cursor = conn.cursor()
        try:
            logger.debug(f"Executing statement:{statement}")
            cursor.execute(statement, *args)
        except DatabaseError as ex:
            logger.info(f"DB Exception caught:{ex}.\nReconnecting.")
            conn = self.get_odbc_connection(reuse_odbc_connection=False)
            cursor = conn.cursor()
            cursor.execute(statement, *args)
        except Exception as ex:
            if ignore_error:
                logger.debug(f"Ignored exception when running {statement}:" + str(ex))
            else:
                raise
        res = []
        rowcount = cursor.rowcount
        try:
            # Collect the first result set, if the statement returned one.
            recs = cursor.fetchall()
            if add_col_names:
                fields = tuple(map(lambda x: x[0], cursor.description))
                recs.insert(0, fields)
            res.append(recs)
        except pyodbc.ProgrammingError:
            pass
        while cursor.nextset():
            # Collect any additional result sets (the first one was fetched above);
            # sets that return no rows are skipped.
            try:
                recs = cursor.fetchall()
                if add_col_names:
                    fields = tuple(map(lambda x: x[0], cursor.description))
                    recs.insert(0, fields)
                res.append(recs)
            except pyodbc.ProgrammingError:
                continue
        if commit:
            conn.execute("COMMIT;")
        return (rowcount, res)
    else:
        if self.jdbc_access_method == "spark":
            print("\nWarning: conversion from spark df to pandas needed.")
            return self.query_spark(statement).toPandas()
        else:
            print("\nUsing jaydebeapi jdbc access method")
            return self.read_jdbc_to_pd_df(statement, self.jdbc_driver, self.get_jdbc_conn_string(),
                                           [self.username, self.get_password()])
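
# Illustrative sketch (assumption: `connector` is a configured ODBC-backed instance and the example
# statements are valid on the target database). Shows the (rowcount, result_sets) return shape of
# execute_statement and how add_col_names prepends a header tuple to each result set.
def _example_execute_statement(connector):
    rowcount, result_sets = connector.execute_statement(
        "SELECT name FROM sys.tables; SELECT COUNT(*) FROM sys.columns;",
        add_col_names=True,
    )
    for recs in result_sets:
        header, rows = recs[0], recs[1:]
        logger.info("Columns: " + str(header) + " rows fetched: " + str(len(rows)))
    return rowcount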
def copy_data(self, destination_db_connector, query, destination_tb_name, batch_size=10000,
              truncate_destination_table=False, fast_executemany=True):
    if self.supports_odbc:
        import time
        src_conn = self.get_odbc_connection()
        dest_conn = destination_db_connector.get_odbc_connection()
        src_curs = src_conn.cursor()
        tgt_curs = dest_conn.cursor()
        tgt_curs.fast_executemany = fast_executemany

        src_curs.execute(f"select count(*) cnt from ({query}) x")
        src_cnt = int(src_curs.fetchone()[0])
        src_curs.commit()

        if truncate_destination_table:
            destination_db_connector.execute_statement(f"TRUNCATE TABLE {destination_tb_name}; COMMIT;")

        start = time.time()
        tot = 0
        logger.info(destination_tb_name + ": Copying " + str(src_cnt) + " records.")

        # Open a fresh cursor for streaming the source rows.
        src_curs = self.get_odbc_connection().cursor()
        src_curs.execute(query)
        columns = [column[0] for column in src_curs.description]
        COLUMNS = ",".join(columns)
        PARAMS = ",".join(["?" for c in columns])

        time_batch_start = time.time()
        src_data = src_curs.fetchmany(batch_size)
        print(len(src_data))
        while len(src_data) > 0:
            print(destination_tb_name + ": Inserting " + str(len(src_data)) + ".", end="")
            tgt_curs.executemany(f'INSERT INTO {destination_tb_name} ({COLUMNS}) VALUES ({PARAMS})', src_data)
            tgt_curs.commit()
            time_batch_end = time.time()
            tot = tot + len(src_data)
            logger.info(" DONE " + str(round(tot / src_cnt * 100)) + "%")
            # Guard against division by zero for very fast copies (time_passed is rounded to whole seconds).
            time_passed = max(round(time.time() - start), 1)
            total_time = round(time_passed * (1.0 / (tot / src_cnt)))
            time_left = round(total_time - time_passed)
            logger.info(destination_tb_name + ": Time passed: " + str(time_passed)
                        + " sec. Total time est: " + str(total_time)
                        + " sec. Time left: " + str(time_left) + " sec (" + str(round(time_left / 60 / 60, 2)) + " hours)"
                        + " Speed from beginning: " + str(round(tot / time_passed, 2)) + " rec/sec"
                        + " Speed last batch:" + str(round(len(src_data) / (time_batch_end - time_batch_start), 2)) + " rec/sec")
            src_data = src_curs.fetchmany(batch_size)
            time_batch_start = time.time()
        logger.info(destination_tb_name + ": Copying time: " + str(time.time() - start))
    else:
        raise Exception("ODBC support required")
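
# Illustrative sketch (assumption: `src` and `dst` are two configured ODBC-backed instances of this
# class and STG.BOOKINGS_COPY already exists with a compatible column layout). Shows how copy_data
# streams the result of a source query into a destination table in batches.
def _example_copy_data(src, dst):
    src.copy_data(
        destination_db_connector=dst,
        query="SELECT * FROM DWH.BOOKINGS",
        destination_tb_name="STG.BOOKINGS_COPY",
        batch_size=50000,
        truncate_destination_table=True,
    )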
def read_jdbc_to_pd_df(self, sql, jclassname, con, driver_args, jars=None, libs=None):
    '''
    Reads JDBC-compliant data sources and returns a pandas DataFrame.

    Uses jaydebeapi.connect and its doc strings :-)
    https://pypi.python.org/pypi/JayDeBeApi/

    :param sql: select statement
    :param jclassname: Fully qualified Java class name of the JDBC driver,
        e.g. org.postgresql.Driver or com.ibm.db2.jcc.DB2Driver
    :param con: Database URL passed to the Java DriverManager.getConnection method.
        See http://docs.oracle.com/javase/6/docs/api/java/sql/DriverManager.html for more details
    :param driver_args: Argument or sequence of arguments to be passed to
        the Java DriverManager.getConnection method, typically user name and password
    :param jars: Jar filename or sequence of filenames for the JDBC driver
    :param libs: Dll/so filenames or sequence of dlls/sos used as shared library by the JDBC driver
    :return: Pandas DataFrame
    '''
    conn = jaydebeapi.connect(jclassname, con, driver_args, jars, libs)
    curs = conn.cursor()
    try:
        logger.info("Executing:" + sql)
        curs.execute(sql)
        columns = [desc[0] for desc in curs.description]  # column headers
        # Convert the list of tuples from fetchall() to a DataFrame.
        data = curs.fetchall()
        logger.info("Fetching DONE.")
        return pandas.DataFrame(data, columns=columns)
    finally:
        curs.close()
        conn.close()
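
# Illustrative sketch (assumption: a PostgreSQL JDBC driver jar at the given path and a reachable
# database; the URL, credentials and jar location are placeholders). Shows how the low-level
# read_jdbc_to_pd_df helper is parameterized when called directly rather than via query_pandas.
def _example_read_jdbc(connector):
    return connector.read_jdbc_to_pd_df(
        sql="SELECT 1 AS one",
        jclassname="org.postgresql.Driver",
        con="jdbc:postgresql://dbhost:5432/dwh",
        driver_args=["db_user", "db_password"],
        jars="/opt/jdbc/postgresql.jar",
    )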
def save_dataframes_to_excel(dfs, sheet_names, output_file, show_in_excel=False):
    """Saves a list of DataFrames to one Excel file, one sheet per DataFrame,
    with auto-sized columns formatted as tables."""
    logger.info("Saving dataframes to excel: " + output_file)
    writer = ExcelWriter(output_file, engine='xlsxwriter')
    workbook = writer.book

    def get_col_widths(df):
        # First find the maximum length of the index column,
        idx_max = max([len(str(s)) for s in df.index.values] + [len(str(df.index.name))])
        # then, for each column left to right, take the max of the column name length and its values' lengths.
        return [idx_max] + [max([len(str(s)) for s in df[col].values] + [len(col)]) for col in df.columns]

    for n, df in enumerate(dfs):
        if not isinstance(df, pd.DataFrame):
            continue
        df.to_excel(writer, sheet_names[n])
        worksheet = writer.sheets[sheet_names[n]]
        for i, width in enumerate(get_col_widths(df)):
            worksheet.set_column(i, i, min(width, 100))
        if len(df.index) > 0:
            worksheet.add_table(0, 0, len(df.index), len(df.columns),
                                {'columns': [{'header': 'Idx'}] + [{'header': c} for c in list(df)]})
    writer.close()
    if show_in_excel:
        os.system(f"start {output_file}")
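
# Illustrative sketch (assumption: pandas is importable as pd, as used by the function above, and the
# output path is writable). Shows saving two small DataFrames into one workbook, one sheet each.
def _example_save_dataframes_to_excel():
    df_sales = pd.DataFrame({"region": ["north", "south"], "amount": [100, 250]})
    df_errors = pd.DataFrame({"query": [], "error": []})
    save_dataframes_to_excel(
        dfs=[df_sales, df_errors],
        sheet_names=["sales", "errors"],
        output_file="report.xlsx",
        show_in_excel=False,
    )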
def get_persistence_values_hist(db_con, workflow_name):
    query = f"""
    SELECT
        osu.SUBJ_NAME SUBJECT_NAME,
        osu.SUBJ_ID SUBJECT_ID,
        otw.TASK_ID WORKFLOW_ID,
        otw.TASK_NAME WORKFLOW_NAME,
        otw.VERSION_NUMBER WORKFLOW_VERSION,
        oti.INSTANCE_ID,
        oti.INSTANCE_NAME,
        ots.TASK_ID SESSION_ID,
        ots.TASK_NAME SESSION_NAME,
        ots.VERSION_NUMBER SESSION_VERSION,
        om.MAPPING_ID,
        om.MAPPING_NAME,
        om.VERSION_NUMBER MAPPING_VERSION,
        omp.PV_NAME,
        omp.PV_ID,
        omp.PV_DEFAULT,
        ompv.PV_VALUE,
        cast(ompv.LAST_SAVED as datetime) PV_LAST_SAVED,
        osu.SUBJ_NAME + ';' + otw.TASK_NAME + ';' + oti.INSTANCE_NAME + ';' + omp.PV_NAME + ';'
            + isnull(convert(varchar(2000), ompv.PV_VALUE), '') as PV_FIL_VAERDI
    FROM dbo.OPB_TASK_INST AS oti
    INNER JOIN dbo.OPB_TASK AS ots
        ON ots.TASK_ID = oti.TASK_ID AND oti.TASK_TYPE = ots.TASK_TYPE AND ots.IS_VISIBLE = 1
    INNER JOIN dbo.OPB_SESSION AS os
        ON ots.VERSION_NUMBER = os.VERSION_NUMBER AND ots.TASK_ID = os.SESSION_ID
    INNER JOIN dbo.OPB_MAPPING AS om
        ON os.MAPPING_ID = om.MAPPING_ID AND om.IS_VISIBLE = 1
    INNER JOIN dbo.OPB_TASK AS otw
        ON otw.TASK_ID = oti.WORKFLOW_ID AND otw.IS_VISIBLE = 1 AND oti.VERSION_NUMBER = otw.VERSION_NUMBER
    INNER JOIN dbo.OPB_SUBJECT AS osu
        ON osu.SUBJ_ID = otw.SUBJECT_ID
    INNER JOIN dbo.OPB_MAP_PARMVAR AS omp
        ON omp.MAPPING_ID = om.MAPPING_ID AND omp.SUBJECT_ID = om.SUBJECT_ID AND omp.VERSION_NUMBER = om.VERSION_NUMBER
    LEFT OUTER JOIN dbo.OPB_MAP_PERSISVAL_hist AS ompv
        ON ompv.SUBJECT_ID = omp.SUBJECT_ID AND ompv.MAPPING_ID = omp.MAPPING_ID
        AND ompv.PV_ID = omp.PV_ID AND oti.INSTANCE_ID = ompv.SESSION_INST_ID
    WHERE (omp.PV_FLAG % 2 = 1)
        and otw.TASK_NAME like '{workflow_name}'
    order by PV_LAST_SAVED"""
    logger.info(query)
    return db_con.query_pandas(query)
def get_persistence_values(db_con, workflow_name):
    query = rf"""
    SELECT
        __OPB_Subject.SUBJ_NAME AS Folder,
        __OPB_TASK_Workflow.TASK_NAME AS Workflow,
        __OPB_TASK_INST.INSTANCE_NAME AS Session,
        __OPB_MAP_PARMVAR.PV_NAME AS PersistentVariable,
        convert(varchar(100), __OPB_MAP_PERSISVAL.PV_VALUE) AS Value,
        isnull(__OPB_MAP_PERSISVAL.RUNINST_NAME, '<NULL>') as RUNINST_NAME,
        case fdt.DTYPE_GROUP_CODE
            when 'N' then RIGHT(SPACE(100) + convert(varchar(100), __OPB_MAP_PERSISVAL.PV_VALUE), 100)
            when 'D' then convert(varchar(100), convert(datetime, convert(varchar(23), __OPB_MAP_PERSISVAL.PV_VALUE), 101), 121)
            else convert(varchar(100), __OPB_MAP_PERSISVAL.PV_VALUE)
        end Sort_Value,
        __OPB_Subject.SUBJ_NAME + ';' + __OPB_TASK_Workflow.TASK_NAME + ';' + __OPB_TASK_INST.INSTANCE_NAME + ';'
            + __OPB_MAP_PARMVAR.PV_NAME + ';' + isnull(convert(varchar(2000), __OPB_MAP_PERSISVAL.PV_VALUE), '')
            + ';' + isnull(__OPB_MAP_PERSISVAL.RUNINST_NAME, 'NULL') as PV_FILE
    FROM dbo.OPB_TASK AS __OPB_TASK_Workflow
    INNER JOIN dbo.OPB_TASK_INST AS __OPB_TASK_INST
        ON __OPB_TASK_Workflow.TASK_ID = __OPB_TASK_INST.WORKFLOW_ID
        AND __OPB_TASK_Workflow.VERSION_NUMBER = __OPB_TASK_INST.VERSION_NUMBER
        AND __OPB_TASK_Workflow.IS_VISIBLE = 1
        AND __OPB_TASK_Workflow.TASK_TYPE = 71
        AND __OPB_TASK_INST.TASK_TYPE = 68
    INNER JOIN dbo.OPB_TASK AS __OPB_TASK_Session
        ON __OPB_TASK_INST.TASK_ID = __OPB_TASK_Session.TASK_ID
        AND __OPB_TASK_INST.TASK_TYPE = __OPB_TASK_Session.TASK_TYPE
        AND __OPB_TASK_Session.IS_VISIBLE = 1
        AND __OPB_TASK_Session.TASK_TYPE = 68
    INNER JOIN dbo.OPB_SESSION AS __OPB_SESSION
        ON __OPB_TASK_Session.VERSION_NUMBER = __OPB_SESSION.VERSION_NUMBER
        AND __OPB_TASK_Session.TASK_ID = __OPB_SESSION.SESSION_ID
    INNER JOIN dbo.OPB_MAPPING AS __OPB_MAPPING
        ON __OPB_SESSION.MAPPING_ID = __OPB_MAPPING.MAPPING_ID
        AND __OPB_MAPPING.IS_VISIBLE = 1
    INNER JOIN dbo.OPB_SUBJECT AS __OPB_SUBJECT
        ON __OPB_SUBJECT.SUBJ_ID = __OPB_TASK_Workflow.SUBJECT_ID
    INNER JOIN dbo.OPB_MAP_PARMVAR AS __OPB_MAP_PARMVAR
        ON __OPB_MAP_PARMVAR.MAPPING_ID = __OPB_MAPPING.MAPPING_ID
        AND __OPB_MAP_PARMVAR.SUBJECT_ID = __OPB_MAPPING.SUBJECT_ID
        AND __OPB_MAP_PARMVAR.VERSION_NUMBER = __OPB_MAPPING.VERSION_NUMBER
        AND __OPB_MAP_PARMVAR.PV_FLAG <> 2
    left outer JOIN (SELECT * FROM dbo.OPB_MAP_PERSISVAL) AS __OPB_MAP_PERSISVAL
        ON __OPB_MAP_PERSISVAL.SUBJECT_ID = __OPB_MAP_PARMVAR.SUBJECT_ID
        AND __OPB_MAP_PERSISVAL.MAPPING_ID = __OPB_MAP_PARMVAR.MAPPING_ID
        AND __OPB_MAP_PERSISVAL.PV_ID = __OPB_MAP_PARMVAR.PV_ID
        AND __OPB_MAP_PERSISVAL.SESSION_INST_ID = __OPB_TASK_INST.INSTANCE_ID
    inner join dbo.REP_FLD_DATATYPE fdt
        on fdt.DTYPE_NUM = __OPB_MAP_PARMVAR.PV_DATATYPE
    WHERE (1 = 1)
        and __OPB_TASK_Workflow.TASK_NAME like '%\_[0-9]%' escape '\'
        and coalesce(__OPB_MAP_PERSISVAL.RUNINST_NAME, 'NULL') = 'NULL'
        and __OPB_Subject.SUBJ_NAME not like 'z_%'
        --and __OPB_Subject.SUBJ_NAME = 'RAP'
        --and __OPB_Subject.SUBJ_NAME = 'EDW_NZ'
        --and __OPB_TASK_INST.INSTANCE_NAME = 's_m_TEDW2500POSTERING'
        and __OPB_TASK_Workflow.TASK_NAME like '{workflow_name}'
        --and __OPB_TASK_INST.INSTANCE_NAME like 's_m_TEJD0020%'
        --and __OPB_MAP_PARMVAR.PV_NAME = '$$LAST_KORREKTION_I'
    order by 2
    """
    logger.info(query)
    return db_con.query_pandas(query)
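
# Illustrative sketch (assumption: `repo_con` is a connector pointing at the Informatica repository
# database used by the two functions above, and the workflow name pattern is a placeholder).
# Shows how persistence values can be pulled and dumped to Excel with save_dataframes_to_excel.
def _example_export_persistence_values(repo_con):
    df = get_persistence_values(repo_con, "wf_%")
    save_dataframes_to_excel([df], ["persistence_values"], "persistence_values.xlsx")
    return df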