def process_all():
    myconn = ConnectionHelper().get_named_connection("current")
    cursor = myconn.cursor()
    cursor.execute("select etl_file_id from etl_file order by etl_file_id")
    for row in cursor:
        etl_file_id = row[0]
        logging.info("about to process %s", etl_file_id)
        process(etl_file_id)
def test_sqlite(self):
    ch = ConnectionHelper(ConnectionHelperTest.yaml_file_path)
    connection = ch.get_named_connection("sqlite3")
    self.assertEqual(ConnectionHelper.get_dialect(connection),
                     dialects.DIALECT_SQLITE)
    cursor = connection.cursor()
    self.assertEqual(ConnectionHelper.get_dialect(cursor),
                     dialects.DIALECT_SQLITE)
def test_memory_sqlite(self):
    ch = ConnectionHelper(ConnectionHelperTest.yaml_file_path)
    connection = ch.get_named_connection("sqlite3")
    connection.cursor().execute("select 'x'")
    connection = ch.get_named_connection("sqlite3_mem")
    conn_info = ch.connections["sqlite3_mem"]
    dialect, dburl = ch.get_components(conn_info["url"])
    self.assertEqual(dialect, "sqlite")
    self.assertEqual(dburl, ":memory:")
    connection.cursor().execute("select 'x'")
def test_sqlite_returning(self):
    logger = logging.getLogger(__name__ + ":test_sqlite_returning")
    drop_sql = "drop table if exists a"
    create_sql = "create table a (b serial primary key, c numeric)"
    insert_sql = "insert into a (c) values (%(c)s)"
    returning_text = "returning b"
    ch = ConnectionHelper(ConnectionHelperTest.yaml_file_path)
    connection = ch.get_named_connection("sqlite3_mem")
    cursor = CursorHelper(connection.cursor())
    cursor.execute(drop_sql)
    cursor.execute(create_sql)
    new_id = cursor.execute(insert_sql, {"c": 3}, returning=returning_text)
    self.assertEqual(1, new_id)
def test_all(self):
    connection = ConnectionHelper().get_named_connection("test")
    up = UtProcess(connection)
    binds = {
        "schema_nm": "test",
        "process_nm": "UtProcess_test",
        "thread_nm": None,
        "process_run_nbr": 1,  # TODO
        "status_msg": "testing",
        "status_id": None,
        "status_ts": None,  # e.g. datetime.datetime(2017, 4, 6, 0, 0, 0)
        "ignore_flg": "N"
    }
    up.insert_process(binds)
    connection.rollback()
def test_yaml_reader(self):
    connections = ConnectionHelper.get_connections_yaml(
        ConnectionHelperTest.yaml_file_path)
    self.assertEqual(len(connections), 2)
    self.assertIn("sqlite3", connections)
    self.assertIn("sqlite3_mem", connections)
    sql = connections["sqlite3_mem"]["sql"][0]
    self.assertEqual(sql, "PRAGMA foreign_keys = ON")
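# For reference, a connections file that would satisfy test_yaml_reader might
# look like the sketch below. Only the two entry names, the "url"/"sql" keys,
# the "sqlite::memory:" url, and the PRAGMA statement are taken from the tests
# above; the sqlite3 file path is an assumed placeholder.
#
# sqlite3:
#   url: "sqlite:/tmp/connection_helper_test.dbf"   # assumed path
# sqlite3_mem:
#   url: "sqlite::memory:"
#   sql:                                            # statements run on connect
#     - "PRAGMA foreign_keys = ON"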
def get_postgres_connection(cls):
    # TODO externalize - use test schema to initialize
    db_url = "postgres:host='localhost' dbname='sales_reporting' user='******' password='******'"  # TODO externalize
    cls.connection = ConnectionHelper.get_connection(db_url)
    cursor = cls.connection.cursor()
    schema = "integration_test"
    cursor.execute("create schema %s" % schema)
    cursor.execute("set schema '%s'" % schema)
    return cls.connection
def test_dataset_from_sql(self):
    sql = "select * from etl_sale where etl_file_id = %(ETL_FILE_ID)s"
    binds = {"ETL_FILE_ID": 201723}
    connection = ConnectionHelper().get_named_connection("current")
    # signature mirrors pandas.read_sql(sql, con, index_col=None,
    # coerce_float=True, params=None, parse_dates=None, columns=None,
    # chunksize=None)
    now = datetime.datetime.now()
    sale_dataframe = Dataset.read_sql(sql, connection, params=binds)
    logger.debug("test_dataset_from_sql elapsed %s",
                 datetime.datetime.now() - now)
    logger.debug("rowcount %s", len(sale_dataframe.rows))
def test_fetch_or_insert(self):
    connection = ConnectionHelper().get_named_connection("test")
    cursors = Cursors(connection)
    binds = {
        "rule_name": "TEST_COND",
        "table_name": "etl_sale",
        "msg": "I don't know",
        "sql_text": "select id from etl_sale where etl_file_id = %(ETL_FILE_ID)s",
        "narrative": "huh",
        "severity": 3,
        "format_str": "id %s is %s",
        # "CORRECTIVE_ACTION": "Fix it"
    }
    condition_id = UtConditionPersistence.fetch_or_insert(cursors, binds)
    connection.commit()
    self.assertIsNotNone(condition_id)
def main():
    """
    SqlRunner.py [-h] --connection_name CONNECTION_NAME --infile_name INFILE_NAME
                 [--continue_on_error] [--print_sql] [--commit] [--interactive]

    optional arguments:
      -h, --help            show this help message and exit
      --connection_name CONNECTION_NAME
                            name of database connection
      --infile_name INFILE_NAME
                            name of sql script file
      --continue_on_error   continue running statements after an error
      --print_sql           print each executed SQL statement
      --commit              commit at end
      --interactive         start an interactive session

    :return: None
    """
    logging.basicConfig(level=logging.INFO)

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--connection_name", required=True,
                        help="name of database connection")
    PARSER.add_argument("--infile_name", required=True,
                        help="name of sql script file")
    PARSER.add_argument("--continue_on_error", action="store_true",
                        help="continue running statements after an error")
    PARSER.add_argument("--print_sql", action="store_true",
                        help="print each executed SQL statement")
    PARSER.add_argument("--commit", action="store_true",
                        help="commit at end")
    PARSER.add_argument("--interactive", action="store_true",
                        help="start an interactive session")
    ARGS = PARSER.parse_args()

    CONNECTION = ConnectionHelper(None).get_named_connection(ARGS.connection_name)
    RUNNER = SqlRunner(infile_name=ARGS.infile_name,
                       conn=CONNECTION,
                       continue_on_error=ARGS.continue_on_error,
                       print_sql=ARGS.print_sql,
                       commit=ARGS.commit)  # honor the --commit flag
    if ARGS.interactive:
        RUNNER.interactive()
    else:
        RUNNER.process()
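# A sample invocation of the runner above, assuming the module is executed
# directly; the connection and script file names below are hypothetical:
#
#   python SqlRunner.py --connection_name test --infile_name create_tables.sql \
#       --print_sql --commit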
def sales_pivot_from_sql():
    from pdsutil.DbUtil import ConnectionHelper
    sql = "select * from etl_sale where etl_file_id = %(ETL_FILE_ID)s"
    parms = {"ETL_FILE_ID": 201723}
    connection = ConnectionHelper().get_named_connection("current")
    df = pandas.read_sql(sql, connection, params=parms)
    print(df)
    result = pandas.pivot_table(df,
                                columns='ship_dt',
                                values='cases_shipped',
                                index=['ship_to_cust_id', 'product_descr'])
    print("pivot %s\n" % result)
def main():
    # parser = argparse.ArgumentParser(description='load a file')
    # parser.add_argument('--etl_file_id', dest='etl_file_id', required=True)
    # parser.add_argument('--rerun', action='store_true')
    # parser.set_defaults(rerun=False)
    # args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    myconn = ConnectionHelper(None).get_named_connection("test")
    CdsDataloadConditions().process(myconn, {"ETL_FILE_ID": 20})  # TODO add args
def main():
    logging.basicConfig(level=logging.INFO)
    # parser = argparse.ArgumentParser(description='load a file')
    # parser.add_argument('--inputfile', dest='inputfile')
    # parser.add_argument("--connection", required=True)
    # parser.add_argument("--distributor")
    # parser.set_defaults(test=False)
    # args = parser.parse_args()
    #
    # mainconn = ConnectionHelper().get_named_connection(args.connection)
    # CdsDataloader().process(args.inputfile, mainconn, args.distributor, False)

    # import sqlite3
    mainconn = ConnectionHelper().get_named_connection("it")
    # mainconn = sqlite3.Connection("/tmp/scratch.dbf")
    CdsDataloader().process("/tmp/customers.cds", mainconn, "EXOTICTX", False)
def setUp(self):
    # db_url = os.getenv(self.POSTGRES_TEST_URL)
    if self.dialect == dialects.DIALECT_POSTGRES:
        db_url = "postgres:host='localhost' dbname='sales_reporting_db' user='******' password='******'"  # TODO externalize
        test_schema = self.TEST_SCHEMA
    elif self.dialect == dialects.DIALECT_SQLITE:
        db_url = "sqlite::memory:"
        test_schema = None
    else:
        raise Exception("unsupported dialect %s" % self.dialect)

    if not db_url:
        message = "Skipping test as %s environment variable not set" % self.POSTGRES_TEST_URL
        logging.warning(message)
        self.skipTest(message)

    self.connection = ConnectionHelper.get_connection(db_url)
    SchemaInitter(self.connection, test_schema).process()
    ut_condition_init(self.connection).process()
    sr_data_init(self.connection).process()
def sales_pivot_from_sql_by_month():
    full_begin = time.time()
    # Given a dict of dataframes, for example:
    #   dfs = {'gadgets': df_gadgets, 'widgets': df_widgets}
    #
    # writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    # for sheetname, df in dfs.items():  # loop through `dict` of dataframes
    #     df.to_excel(writer, sheet_name=sheetname)  # send df to writer
    #     worksheet = writer.sheets[sheetname]  # pull worksheet object
    #     for idx, col in enumerate(df):  # loop through all columns
    #         series = df[col]
    #         max_len = max((
    #             series.astype(str).map(len).max(),  # len of largest item
    #             len(str(series.name))  # len of column name/header
    #         )) + 1  # adding a little extra space
    #         worksheet.set_column(idx, idx, max_len)  # set column width
    # writer.save()
    from pdsutil.DbUtil import ConnectionHelper

    sql = "select * from etl_cust_product_month_mv where sum_cases_shipped > 0"
    parms = {"ETL_FILE_ID": 201723}
    connection = ConnectionHelper().get_named_connection("current")

    before_query = time.time()
    df = pandas.read_sql(sql, connection, params=parms)
    after_query = time.time()

    pivot_df = pandas.pivot_table(df,
                                  columns='ship_month',
                                  values='sum_cases_shipped',
                                  index=['ship_to_cust_id', 'product_descr'])
    print("pivot_df index %s" % pivot_df.index.name)
    after_pivot = time.time()

    # row_count = 0
    # for row in pivot_df.iterrows():
    #     print("row len %s data %s" % (len(row), str(row)))
    #     row_count += 1
    #     if row_count > 9:
    #         break

    # TODO set index_columns; see
    # http://stackoverflow.com/questions/17241004/pandas-how-to-get-the-data-frame-index-as-an-array

    before_to_excel = time.time()
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pandas.ExcelWriter('/tmp/all_sales_pivot.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    sheet_name = "All Sales"
    pivot_df.to_excel(writer, sheet_name=sheet_name)
    worksheet = writer.sheets[sheet_name]
    # set_column_widths(worksheet, pivot_df)

    to_csv_begin = time.time()
    import io
    output = io.StringIO()
    pivot_df.to_csv(output)
    to_csv_end = time.time()

    to_record_begin = time.time()
    records = pivot_df.to_records()
    to_record_end = time.time()

    to_excel_internal_start = time.time()
    import xlsxwriter
    output = open("/tmp/crosstab.xlsx", "wb")
    workbook = xlsxwriter.Workbook(output)
    to_excel(workbook, "wank", records)
    workbook.close()
    to_excel_internal_end = time.time()

    writer.save()
    after_excel = time.time()
    full_end = time.time()

    log_time("query", before_query, after_query)
    log_time("pivot", after_query, after_pivot)
    log_time("to_excel", before_to_excel, after_excel)
    log_time("to_csv", to_csv_begin, to_csv_end)
    log_time("to_records", to_record_begin, to_record_end)
    log_time("internal to_excel", to_excel_internal_start, to_excel_internal_end)
    log_time("full time", full_begin, full_end)
from pdsutil.DbUtil import ConnectionHelper

connection = ConnectionHelper().get_named_connection("it")
cursor = connection.cursor()
cursor.execute(
    "select distinct ship_to_cust_id from etl_sale order by ship_to_cust_id")
sale_ship_to_ids = cursor.fetchall()
print(sale_ship_to_ids)

cursor.execute(
    "select distinct ship_to_cust_id from etl_customer order by ship_to_cust_id")
cust_ship_to_ids = cursor.fetchall()
print(cust_ship_to_ids)

# Pair the two ordered id lists positionally and rewrite each etl_customer
# ship_to_cust_id to the corresponding id seen in etl_sale.
for sale_id_t, cust_id_t in zip(sale_ship_to_ids, cust_ship_to_ids):
    binds = {"to_id": sale_id_t[0], "cust_id": cust_id_t[0]}
    print("cust_id %s to_id %s" % (cust_id_t[0], sale_id_t[0]))
    cursor.execute(
        "update etl_customer set ship_to_cust_id = %(to_id)s "
        "where ship_to_cust_id = %(cust_id)s",
        binds)
connection.commit()
        :param etl_file_id: the etl_file.etl_file_id to extract
        :param file_name: output file name
        :param by_line_number: boolean - much slower, but records are in original order
        :return: None
        """
        if by_line_number:
            self.process_by_line_number(etl_file_id, file_name)
        else:
            self.process_by_table(etl_file_id, file_name)

    def unload_all(self):
        cursor = CursorHelper(self.connection.cursor())
        sql = "select etl_file_id from etl_file"
        rows = cursor.execute(sql)
        for row in rows:
            etl_file_id = row[0]
            # self.process(etl_file_id, "../pdssr_testdata/%s.cds" % etl_file_id, False)
            self.process(etl_file_id, "/tmp/python/%s.cds" % etl_file_id, False)


if __name__ == "__main__":
    myconn = ConnectionHelper().get_named_connection("it")
    unloader = CdsUnload(myconn)
    # unloader.process(30, "/tmp/201502.cds", False)  # TODO
    unloader.unload_all()
def process(etl_file_id):
    myconn = ConnectionHelper().get_named_connection("current")
    Post(myconn).process(etl_file_id)
from pdsutil.DbUtil import ConnectionHelper, CursorHelper
import sys
import csv

# valid csv quoting styles, for reference
quoting_types = [csv.QUOTE_NONNUMERIC, csv.QUOTE_ALL,
                 csv.QUOTE_MINIMAL, csv.QUOTE_NONE]


# fragment of a Dataset-style class method; `self` supplies column_names and rows
def to_csv(self, file, emit_headers: bool = True, dialect: str = "excel",
           delimiter: str = ",", quotechar: str = '"',
           quoting: int = csv.QUOTE_NONNUMERIC):
    writer = csv.writer(file, dialect=dialect, delimiter=delimiter,
                        quotechar=quotechar, quoting=quoting)
    if emit_headers:
        writer.writerow(self.column_names)
    for row in self.rows:
        writer.writerow(row)


if __name__ == "__main__":
    connection_name = sys.argv[1]  # argv[0] is the script name
    connection = ConnectionHelper().get_named_connection(connection_name)
    cursor = CursorHelper(connection.cursor())
    rows = cursor.execute(sql, binds)  # TODO: sql and binds are not defined in this fragment
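# Example use of the to_csv fragment above, assuming it is bound to a
# Dataset-like object exposing column_names and rows (the file name is
# hypothetical):
#
#   with open("/tmp/report.csv", "w", newline="") as f:
#       dataset.to_csv(f, quoting=csv.QUOTE_ALL)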
import sys
import datetime
import logging

from pdsutil.DbUtil import ConnectionHelper, CursorHelper
from pdsutil.Dataset import Dataset

logging.basicConfig(level=logging.INFO)

sql = "select * from etl_sale where etl_file_id = %(ETL_FILE_ID)s"
binds = {"ETL_FILE_ID": 201723}

connection = ConnectionHelper().get_named_connection("current")
cursor = CursorHelper(connection.cursor())
sales = Dataset.from_sql(connection, sql, binds)
# sales.to_csv(sys.stdout)

# to_csv
out_file = open("/tmp/sales.csv", "w")
sales.to_csv(out_file)

# to_sqlite
sales.set_column_meta("curr_cd", str, 3)
sales.set_column_meta("org_customer_id", str, 10)
db = sales.to_sqlite("etl_sale", verbose=False)
cursor = CursorHelper(db.cursor())
rows = cursor.execute("select count(*) from etl_sale")
for row in rows:
    print(row)
import csv
import datetime
import sys

import yaml

from pdsutil.DbUtil import ConnectionHelper, CursorHelper
# SqlStatements is assumed to be importable from the package; its module
# path is not shown in the source.


class isql:
    def __init__(self):
        self.statements = SqlStatements.from_statement_list([])
        print(self.statements)
        self.statement = None
        self.connection = None
        self.cursor = None
        self.binds = {}

    def dump_statements(self):
        print("about to dump")
        print(yaml.dump(self.statements))

    def connect(self, name: str):
        self.connection = ConnectionHelper().get_named_connection(name)
        self.cursor = CursorHelper(self.connection.cursor())

    def load(self, filename: str):
        self.statements = SqlStatements.from_yaml(filename).statements
        self.list_statements()

    def list_statements(self, verbose=False):
        for i, v in enumerate(self.statements):
            print("%s %s" % (i, v))

    def bind_date(self, name, year, month, day):
        self.binds[name] = datetime.datetime(year, month, day)

    def use(self, number: int):
        for i, k in enumerate(self.statements):
            if i == number:
                self.statement = self.statements[k]

    def use_statement(self, name: str):
        self.statement = self.statements[name]

    def run(self, index=None):
        if index is None:
            print("stmt: %s" % self.statement)
        else:
            for i, k in enumerate(self.statements):
                if i == index:
                    self.statement = self.statements[k]
                    print("k: %s sql: %s" % (k, self.statement["sql"]))
        rows = self.cursor.execute(self.statement["sql"], self.binds)
        for row in rows:
            print(row)

    def bind(self, name, value):
        self.binds[name] = value

    def execute(self, statement_name: str = None):
        if self.connection is None:
            print("use connect() before execute()")
            return None
        if statement_name is not None:
            self.statement = self.statements[statement_name]
        elif self.statement is None:
            print("pass a statement name or call use_statement() before execute()")
            return None
        return self.cursor.execute(self.statement["sql"], self.binds)

    def to_csv(self, headers=True, outfile=sys.stdout):
        rows = self.execute()
        column_names = [d[0] for d in self.cursor.description]
        print(column_names)
        writer = csv.writer(outfile, dialect="excel", delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        if headers:
            writer.writerow(column_names)
        for row in rows:
            writer.writerow(row)

    @staticmethod
    def help():
        print("connect(name)")
        print("list_statements()")
        print("execute(statement_name)")
        print("bind(name, value)")
        print("bind_date(name, year, month, day)")
        print("load('filename')")
        print("use_statement(statement_name)")
        print("use(number)")
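# A minimal interactive session sketch using the class above. The connection
# name, statements file, statement name, and bind value are all hypothetical:
#
#   session = isql()
#   session.connect("test")              # named connection from the YAML config
#   session.load("statements.yaml")      # loads and lists the statements
#   session.bind("ETL_FILE_ID", 201723)
#   session.execute("sales_by_file")     # hypothetical statement name
#   session.to_csv()                     # re-runs the current statement to stdout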
def get_test_postgres_connection():
    # TODO externalize
    db_url = "postgres:host='localhost' dbname='sales_reporting_db' user='******' password='******'"
    return ConnectionHelper.get_connection(db_url)