Example #1
    def query_pandas(self, query, reuse_odbc_connection=False, args=[]):
        """
            Reads database source to pandas dataframe.
            Depending on the configuration of the connector either odbc, jdbc via spark or jaydebeapi jdbc will be used

            :param reuse_odbc_connection: if set connection will be reused (currently works only for ODBC sources)
        """
        logger.debug(f"Executing query on connector {str(self)}:" + query +" with args:"+str(args))
        if self.supports_odbc:
             logger.debug(" using ODBC")
             conn = self.get_odbc_connection(reuse_odbc_connection)
             def pd_read_sql(conn):
                 return pandas.read_sql(query, conn, coerce_float=False, parse_dates=None, params=args)
             try:
                return pd_read_sql(conn)
             except Exception as ex:
                if self.is_connection_broken_error(ex):
                    logger.info("Connection is broken, reconnecting.")
                    conn = self.get_odbc_connection(False)
                    return pd_read_sql(conn)
                else:
                    raise ex
        else:
            if len(args) > 0:
                raise Exception("Query args are not supported for spark or jaydebeapi.")
            if self.jdbc_access_method=="spark":
                logger.debug("\nWarning: conversion from spark df to pandas needed.")
                return self.query_spark(query).toPandas()
            else:
                logger.debug("\nUsing jaydebeapi jdbc access method")
                return self.read_jdbc_to_pd_df(query, self.jdbc_driver, self.get_jdbc_conn_string(),[self.username, self.get_password()])
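A minimal usage sketch (assuming a configured connector instance, here called `connector`; the table name and date filter are made up). Positional query args are only honoured on the ODBC path, where pandas.read_sql accepts `?`-style placeholders:

# Hypothetical usage; `connector` and the queried table are placeholders.
df = connector.query_pandas(
    "SELECT * FROM dbo.ORDERS WHERE ORDER_DATE >= ?",
    reuse_odbc_connection=True,
    args=["2023-01-01"],
)
print(df.shape)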
Example #2
    def query_multiple(self, queries, reuse_odbc_connection=False, args=[], batch_size=1, continue_on_error=False):
        """
            Runs query_pandas. If query does not start with "SELECT" then query will be transformed to "SELECT * FROM {query}"
        """
        ret = []
        errors = []
        i = 0
        for query in batch(queries, batch_size):
            sql = " UNION ALL ".join(query)
            try:
                df = None
                df = self.query_pandas(sql, reuse_odbc_connection, args=args)
                i = i + 1
                logger.info(f"Batches completion progress: {i}/{len(queries)/batch_size}  batch size: {batch_size} {i*batch_size}/{len(queries)}")
            except Exception as e:
                logger.debug("ERROR during execution of query:" + str(("ERROR", query, e)))
                if continue_on_error:
                    if batch_size > 1:
                        df, error = self.query_multiple(query, reuse_odbc_connection=reuse_odbc_connection, args=args, batch_size=1, continue_on_error=True)
                        errors.extend(error)
                    else:
                        return (None, [("ERROR", query, e)])
                else:
                    raise
            if df is not None:
                ret.append(df)
        if continue_on_error:
            return (pandas.concat(ret), errors)
        else:
            return pandas.concat(ret)
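query_multiple relies on a batch() helper that is not shown here; below is one plausible sketch of such a helper plus a hypothetical call. Because the queries in a batch are glued together with UNION ALL, they must all return the same column list:

# One plausible implementation of the batch() helper assumed above: yield fixed-size chunks of a list.
def batch(iterable, n=1):
    for start in range(0, len(iterable), n):
        yield iterable[start:start + n]

# Hypothetical usage; `connector` and the queries are placeholders.
queries = [f"SELECT REGION_ID, SUM(AMOUNT) AMOUNT FROM dbo.ORDERS WHERE REGION_ID = {i} GROUP BY REGION_ID" for i in range(10)]
df, errors = connector.query_multiple(queries, batch_size=5, continue_on_error=True)
print(len(df), "rows,", len(errors), "errors")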
Example #3
    def execute_statement(self, statement, add_col_names=False, reuse_odbc_connection=False, ignore_error=False, args=[], commit=False):

        if self.supports_odbc:
            conn = self.get_odbc_connection(reuse_odbc_connection)
            cursor = conn.cursor()
            try:
                logger.debug(f"Executing statement:{statement}")
                cursor.execute(statement, *args)
            except DatabaseError as ex:
                logger.info(f"DB Exception caught:{ex}.\nReconnecting.")
                conn = self.get_odbc_connection(reuse_odbc_connection=False)
                cursor = conn.cursor()  # re-create the cursor on the new connection before retrying
                cursor.execute(statement, *args)
            except Exception as ex:
                if ignore_error:
                    logger.debug(f"Ignored exception when running {statement}:" + str(ex))
                else:
                    raise


            res = []
            rowcount = cursor.rowcount
            try:
                recs = cursor.fetchall()
                if add_col_names:
                    fields = tuple(map(lambda x: x[0], cursor.description))
                    recs.insert(0, fields)
                res.append(recs)
            except pyodbc.ProgrammingError:
                pass
            while cursor.nextset():  # advance past the first resultset, which was already fetched above
                try:
                    recs = cursor.fetchall()
                    if add_col_names:
                        fields = tuple(map(lambda x: x[0], cursor.description))
                        recs.insert(0, fields)
                    res.append(recs)
                except pyodbc.ProgrammingError:
                    continue
            if commit:
                conn.execute("COMMIT;")
            return (rowcount, res)
        else:
            if self.jdbc_access_method == "spark":
                logger.debug("\nWarning: conversion from spark df to pandas needed.")
                return self.query_spark(statement).toPandas()
            else:
                logger.debug("\nUsing jaydebeapi jdbc access method")
                return self.read_jdbc_to_pd_df(statement, self.jdbc_driver, self.get_jdbc_conn_string(), [self.username, self.get_password()])
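A short usage sketch for the ODBC path (table, column and parameter values are placeholders). The return value is a tuple of (rowcount, resultsets), where each resultset is a list of rows, optionally preceded by a tuple of column names:

# Hypothetical usage; `connector` is assumed to be an already-configured instance.
rowcount, resultsets = connector.execute_statement(
    "UPDATE dbo.ORDERS SET STATUS = 'CLOSED' WHERE ORDER_ID = ?",
    args=[42],
    commit=True,
)
print(f"{rowcount} row(s) affected, {len(resultsets)} resultset(s) returned")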
Example #4
    def copy_data(self, destination_db_connector, query, destination_tb_name,  batch_size=10000,
                  truncate_destination_table=False, fast_executemany=True):

        if self.supports_odbc:
            import time
            src_conn = self.get_odbc_connection()
            dest_conn = destination_db_connector.get_odbc_connection()

            src_curs = src_conn.cursor()
            tgt_curs = dest_conn.cursor()

            tgt_curs.fast_executemany = fast_executemany

            src_curs.execute(f"select  count(*) cnt from ({query}) x")
            src_cnt=int(src_curs.fetchone()[0])
            src_curs.commit()

            if truncate_destination_table:
                destination_db_connector.execute_statement(f"TRUNCATE TABLE {destination_tb_name}; COMMIT;")

            start = time.time()
            tot = 0
            logger.info(f"{destination_tb_name}: Copying {src_cnt} records.")

            src_curs = self.get_odbc_connection().cursor()
            src_curs.execute(query)
            columns = [column[0] for column in src_curs.description]

            COLUMNS = ",".join(columns)
            PARAMS = ",".join(["?" for c in columns])

            time_batch_start = time.time()
            src_data = src_curs.fetchmany(batch_size)
            logger.debug(f"First batch fetched: {len(src_data)} rows.")

            while len(src_data) > 0:
                logger.info(f"{destination_tb_name}: Inserting {len(src_data)} rows.")

                tgt_curs.executemany(f'INSERT INTO {destination_tb_name} ({COLUMNS}) VALUES ({PARAMS})', src_data)
                tgt_curs.commit()
                time_batch_end = time.time()
                tot = tot + len(src_data)
                logger.info(" DONE " + str(round(tot / src_cnt * 100)) + "%")
                time_passed = max(round(time.time() - start), 1)  # avoid division by zero on very fast first batches
                total_time = round(time_passed * (1.0 / (tot / src_cnt)))
                time_left = round(total_time - time_passed)
                logger.info(f"{destination_tb_name}: Time passed: {time_passed} sec."
                            f" Total time est: {total_time} sec."
                            f" Time left: {time_left} sec ({round(time_left / 60 / 60, 2)} hours)"
                            f" Speed from beginning: {round(tot / time_passed, 2)} rec/sec"
                            f" Speed last batch: {round(len(src_data) / (time_batch_end - time_batch_start), 2)} rec/sec")

                time_batch_start = time.time()
                src_data = src_curs.fetchmany(batch_size)
            logger.info(destination_tb_name+": Copying time: "+ str((time.time()) - start))
        else:
            raise Exception("ODBC support required")
Example #5
    def read_jdbc_to_pd_df(self, sql, jclassname, con, driver_args, jars=None, libs=None):
        '''
        Reads jdbc compliant data sources and returns a Pandas DataFrame

        uses jaydebeapi.connect and doc strings :-)
        https://pypi.python.org/pypi/JayDeBeApi/

        :param sql: select statement
        :param jclassname: Fully qualified Java class name of the JDBC driver,
            e.g. org.postgresql.Driver or com.ibm.db2.jcc.DB2Driver
        :param con: Database connection URL passed to the Java
           DriverManager.getConnection method, see
           http://docs.oracle.com/javase/6/docs/api/java/sql/DriverManager.html
           for more details
        :param driver_args: Argument or sequence of arguments passed to
           DriverManager.getConnection, typically a [username, password] sequence
        :param jars: Jar filename or sequence of filenames for the JDBC driver
        :param libs: Dll/so filenames or sequence of dlls/sos used as
           shared library by the JDBC driver
        :return: Pandas DataFrame
        '''

        conn = jaydebeapi.connect(jclassname, con, driver_args, jars, libs)

        curs = None
        try:
            curs = conn.cursor()
            logger.info("Executing:" + sql)
            curs.execute(sql)
            columns = [desc[0] for desc in curs.description]  # getting column headers
            # convert the list of tuples from fetchall() to a df
            data = curs.fetchall()
            logger.info("Fetching DONE.")
            return pandas.DataFrame(data, columns=columns)
        finally:
            if curs is not None:
                curs.close()
            conn.close()
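A sketch of a direct call, assuming a PostgreSQL JDBC driver jar is available locally; driver class, URL, jar path and credentials are placeholders:

# Hypothetical call; URL, credentials and jar path are placeholders.
df = connector.read_jdbc_to_pd_df(
    sql="SELECT * FROM public.orders",
    jclassname="org.postgresql.Driver",
    con="jdbc:postgresql://dbhost:5432/mydb",
    driver_args=["db_user", "db_password"],
    jars="/opt/drivers/postgresql.jar",
)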
Example #6
def save_dataframes_to_excel(dfs,
                             sheet_names,
                             output_file,
                             show_in_excel=False):
    logger.info("Saving dataframes to excel: " + output_file)
    writer = ExcelWriter(output_file, engine='xlsxwriter')
    workbook = writer.book

    def get_col_widths(df):
        # First we find the maximum length of the index column
        idx_max = max([len(str(s))
                       for s in df.index.values] + [len(str(df.index.name))])
        # Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
        return [idx_max] + [
            max([len(str(s)) for s in df[col].values] + [len(col)])
            for col in df.columns
        ]

    for n, df in enumerate(dfs):
        if not isinstance(df, pd.DataFrame):
            continue
        df.to_excel(writer, sheet_name=sheet_names[n])
        worksheet = writer.sheets[sheet_names[n]]
        for i, width in enumerate(get_col_widths(df)):
            worksheet.set_column(i, i, min(width, 100))
        if len(df.index) > 0:
            worksheet.add_table(0, 0, len(df.index), len(df.columns), {
                'columns': [{
                    'header': 'Idx'
                }] + [{
                    'header': c
                } for c in list(df)]
            })

    writer.close()  # close() writes the workbook; the separate writer.save() call was removed in newer pandas versions
    if show_in_excel:
        os.system(f"start {output_file}")  # open the file with its default application (Windows 'start' command)
Example #7
def get_persistence_values_hist(db_con, workflow_name):
    query = f"""
SELECT
    osu.SUBJ_NAME SUBJECT_NAME, osu.SUBJ_ID SUBJECT_ID, 
    otw.TASK_ID WORKFLOW_ID, otw.TASK_NAME WORKFLOW_NAME, otw.VERSION_NUMBER WORKFLOW_VERSION,
    oti.INSTANCE_ID, oti.INSTANCE_NAME,
    ots.TASK_ID SESSION_ID, ots.TASK_NAME SESSION_NAME, ots.VERSION_NUMBER SESSION_VERSION,
    om.MAPPING_ID, om.MAPPING_NAME, om.VERSION_NUMBER MAPPING_VERSION,
    omp.PV_NAME, omp.PV_ID, omp.PV_DEFAULT,
    ompv.PV_VALUE, cast(ompv.LAST_SAVED as datetime) PV_LAST_SAVED,
    osu.SUBJ_NAME + ';' + otw.TASK_NAME + ';' + oti.INSTANCE_NAME + ';' + omp.PV_NAME + ';' + isnull(convert(varchar(2000), ompv.PV_VALUE), '') as PV_FIL_VAERDI
FROM         dbo.OPB_TASK_INST AS oti INNER JOIN
                      dbo.OPB_TASK AS ots ON ots.TASK_ID = oti.TASK_ID AND oti.TASK_TYPE = ots.TASK_TYPE AND ots.IS_VISIBLE = 1 INNER JOIN
                      dbo.OPB_SESSION AS os ON ots.VERSION_NUMBER = os.VERSION_NUMBER AND ots.TASK_ID = os.SESSION_ID INNER JOIN
                      dbo.OPB_MAPPING AS om ON os.MAPPING_ID = om.MAPPING_ID AND om.IS_VISIBLE = 1 INNER JOIN
                      dbo.OPB_TASK AS otw ON otw.TASK_ID = oti.WORKFLOW_ID AND otw.IS_VISIBLE = 1 AND oti.VERSION_NUMBER = otw.VERSION_NUMBER INNER JOIN
                      dbo.OPB_SUBJECT AS osu ON osu.SUBJ_ID = otw.SUBJECT_ID INNER JOIN
                      dbo.OPB_MAP_PARMVAR AS omp ON omp.MAPPING_ID = om.MAPPING_ID AND omp.SUBJECT_ID = om.SUBJECT_ID AND 
                      omp.VERSION_NUMBER = om.VERSION_NUMBER LEFT OUTER JOIN
                      dbo.OPB_MAP_PERSISVAL_hist AS ompv ON ompv.SUBJECT_ID = omp.SUBJECT_ID AND ompv.MAPPING_ID = omp.MAPPING_ID AND 
                      ompv.PV_ID = omp.PV_ID AND oti.INSTANCE_ID = ompv.SESSION_INST_ID
WHERE     (omp.PV_FLAG % 2 = 1)  and otw.TASK_NAME like '{workflow_name}' order by PV_LAST_SAVED"""
    logger.info(query)
    return db_con.query_pandas(query)
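The workflow_name argument is interpolated into a LIKE clause, so SQL wildcards are allowed (and, since it is inserted into the query string directly, the value should come from trusted input). A hypothetical call, assuming db_con is a connector pointing at the Informatica repository database:

# Hypothetical usage; the workflow name pattern is a placeholder.
hist_df = get_persistence_values_hist(db_con, "wf_LOAD_ORDERS%")
print(hist_df[["WORKFLOW_NAME", "PV_NAME", "PV_VALUE", "PV_LAST_SAVED"]].head())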
Example #8
def get_persistence_values(db_con, workflow_name):
    query = rf"""
SELECT __OPB_Subject.SUBJ_NAME AS Folder,
           __OPB_TASK_Workflow.TASK_NAME AS Workflow,
           __OPB_TASK_INST.INSTANCE_NAME AS Session,
           __OPB_MAP_PARMVAR.PV_NAME AS PersistentVariable,
           convert(varchar(100),__OPB_MAP_PERSISVAL.PV_VALUE) AS Value,
            isnull(__OPB_MAP_PERSISVAL.RUNINST_NAME, '<NULL>') as RUNINST_NAME,
case fdt.DTYPE_GROUP_CODE 
                         when 'N' then RIGHT( SPACE(100) + convert(varchar(100),__OPB_MAP_PERSISVAL.PV_VALUE), 100)
                         when 'D' then convert(varchar(100), convert(datetime, convert(varchar(23),__OPB_MAP_PERSISVAL.PV_VALUE), 101), 121)
                         else convert(varchar(100),__OPB_MAP_PERSISVAL.PV_VALUE)
            end Sort_Value,
            __OPB_Subject.SUBJ_NAME + ';' + __OPB_TASK_Workflow.TASK_NAME + ';' + __OPB_TASK_INST.INSTANCE_NAME + ';' + __OPB_MAP_PARMVAR.PV_NAME + ';' + isnull(convert(varchar(2000), __OPB_MAP_PERSISVAL.PV_VALUE), '')+';'+isnull(__OPB_MAP_PERSISVAL.RUNINST_NAME, 'NULL') as PV_FILE 
FROM
           dbo.OPB_TASK AS __OPB_TASK_Workflow 
INNER JOIN
            dbo.OPB_TASK_INST AS __OPB_TASK_INST 
ON __OPB_TASK_Workflow.TASK_ID = __OPB_TASK_INST.WORKFLOW_ID AND 
            __OPB_TASK_Workflow.VERSION_NUMBER = __OPB_TASK_INST.VERSION_NUMBER AND
            __OPB_TASK_Workflow.IS_VISIBLE = 1 AND 
            __OPB_TASK_Workflow.TASK_TYPE = 71 AND
            __OPB_TASK_INST.TASK_TYPE = 68
INNER JOIN
            dbo.OPB_TASK AS __OPB_TASK_Session 
ON __OPB_TASK_INST.TASK_ID = __OPB_TASK_Session.TASK_ID AND 
            __OPB_TASK_INST.TASK_TYPE = __OPB_TASK_Session.TASK_TYPE AND
            __OPB_TASK_Session.IS_VISIBLE = 1 AND
            __OPB_TASK_Session.TASK_TYPE = 68
INNER JOIN
            dbo.OPB_SESSION AS __OPB_SESSION 
ON __OPB_TASK_Session.VERSION_NUMBER = __OPB_SESSION.VERSION_NUMBER AND 
            __OPB_TASK_Session.TASK_ID = __OPB_SESSION.SESSION_ID
INNER JOIN
            dbo.OPB_MAPPING AS __OPB_MAPPING 
ON __OPB_SESSION.MAPPING_ID = __OPB_MAPPING.MAPPING_ID AND
            __OPB_MAPPING.IS_VISIBLE = 1
INNER JOIN
            dbo.OPB_SUBJECT AS __OPB_SUBJECT 
ON __OPB_SUBJECT.SUBJ_ID = __OPB_TASK_Workflow.SUBJECT_ID 

INNER JOIN
            dbo.OPB_MAP_PARMVAR AS __OPB_MAP_PARMVAR 
ON __OPB_MAP_PARMVAR.MAPPING_ID = __OPB_MAPPING.MAPPING_ID AND 
            __OPB_MAP_PARMVAR.SUBJECT_ID = __OPB_MAPPING.SUBJECT_ID AND 
            __OPB_MAP_PARMVAR.VERSION_NUMBER = __OPB_MAPPING.VERSION_NUMBER AND
            __OPB_MAP_PARMVAR.PV_FLAG <> 2
left outer JOIN 

(SELECT * FROM dbo.OPB_MAP_PERSISVAL) AS __OPB_MAP_PERSISVAL
ON __OPB_MAP_PERSISVAL.SUBJECT_ID = __OPB_MAP_PARMVAR.SUBJECT_ID AND 
            __OPB_MAP_PERSISVAL.MAPPING_ID = __OPB_MAP_PARMVAR.MAPPING_ID AND 
            __OPB_MAP_PERSISVAL.PV_ID = __OPB_MAP_PARMVAR.PV_ID AND 
            __OPB_MAP_PERSISVAL.SESSION_INST_ID = __OPB_TASK_INST.INSTANCE_ID
inner join dbo.REP_FLD_DATATYPE fdt on fdt.DTYPE_NUM = __OPB_MAP_PARMVAR.PV_DATATYPE

WHERE (1 = 1)
     and __OPB_TASK_Workflow.TASK_NAME  like '%\_[0-9]%' escape '\'                    
     and   coalesce(__OPB_MAP_PERSISVAL.RUNINST_NAME,'NULL') = 'NULL'
      and __OPB_Subject.SUBJ_NAME not like 'z_%'
--and __OPB_Subject.SUBJ_NAME = 'RAP'
--and __OPB_Subject.SUBJ_NAME = 'EDW_NZ'
--and __OPB_TASK_INST.INSTANCE_NAME = 's_m_TEDW2500POSTERING'
and  __OPB_TASK_Workflow.TASK_NAME like  '{workflow_name}'
--and __OPB_TASK_INST.INSTANCE_NAME like 's_m_TEJD0020%'
--and __OPB_MAP_PARMVAR.PV_NAME = '$$LAST_KORREKTION_I'
order by 2
"""
    logger.info(query)
    return db_con.query_pandas(query)
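Usage mirrors the historical variant above; a hedged example, again with a LIKE pattern as the workflow filter:

# Hypothetical usage: fetch current persistent-variable values for matching workflows.
pv_df = get_persistence_values(db_con, "wf_LOAD_%")
print(pv_df[["Folder", "Workflow", "Session", "PersistentVariable", "Value"]].head())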