def load_orders(self):
    print(f'Loading orders ({self.warehouse_id})')
    self.order_lines = []
    self.c_ids = list(range(1, NUM_ORDERS + 1))
    self.random.shuffle(self.c_ids)
    with DBConn(self.dsn) as conn:
        # Generate orders so that a higher order number implies a later
        # transaction.
        it = StringIteratorIO((self.generate_order(d_id, o_id)
                               for o_id in range(1, NUM_ORDERS + 1)
                               for d_id in range(1, DIST_PER_WARE + 1)))
        conn.cursor.copy_from(it, 'orders', null='None', size=COPY_SIZE)

    with DBConn(self.dsn) as conn:
        conn.cursor.execute(f'''
            INSERT INTO new_orders(no_o_id, no_d_id, no_w_id)
            SELECT o_id, o_d_id, o_w_id FROM orders
            WHERE o_id >= {FIRST_UNPROCESSED_O_ID}
              AND o_w_id = {self.warehouse_id}''')

    print(f'Loading order_line ({self.warehouse_id})')
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_order_lines(order_line)
                               for order_line in self.order_lines))
        conn.cursor.copy_from(it, 'order_line', null='None', size=COPY_SIZE)
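# `cursor.copy_from` consumes a file-like object, and `StringIteratorIO`
# adapts the row generators used throughout these loaders to that interface.
# The project defines it elsewhere; the following is only a minimal sketch of
# such an adapter, assuming it follows the common "iterator as readable file"
# recipe.
import io


class StringIteratorIOSketch(io.TextIOBase):
    """Hypothetical read-only file-like wrapper around a string iterator."""

    def __init__(self, iterator):
        self._iterator = iterator
        self._buffer = ''

    def readable(self):
        return True

    def read(self, size=-1):
        # Pull rows from the iterator until the buffer can serve `size`
        # characters (or everything, when size is negative).
        while size < 0 or len(self._buffer) < size:
            try:
                self._buffer += next(self._iterator)
            except StopIteration:
                break
        if size < 0:
            data, self._buffer = self._buffer, ''
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data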
def load_supplier(self):
    print('Loading suppliers')
    assert self.warehouse_id == 0
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_supplier(su_id)
                               for su_id in range(NUM_SUPPLIERS)))
        conn.cursor.copy_from(it, 'supplier', null='None', size=COPY_SIZE)
def load_item(self):
    print('Loading items')
    assert self.warehouse_id == 0
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_item(i_id)
                               for i_id in range(1, MAX_ITEMS + 1)))
        conn.cursor.copy_from(it, 'item', null='None', size=COPY_SIZE)
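# Note: `copy_from` uses tab-separated text format by default and these
# loaders pass null='None', so each generate_* helper is expected to yield
# rows shaped like 'col1\tcol2\t...\n' with SQL NULLs spelled as the literal
# string 'None'. A hypothetical generator matching that contract (the real
# helpers live elsewhere in this class):
def example_generate_row(i_id, i_name, i_price=None):
    # None is rendered as the string 'None', which copy_from(null='None')
    # converts back into SQL NULL.
    return '\t'.join(str(col) for col in (i_id, i_name, i_price)) + '\n'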
def load_customer(self):
    print(f'Loading customer ({self.warehouse_id})')
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_customer(d_id, c_id)
                               for d_id in range(1, DIST_PER_WARE + 1)
                               for c_id in range(1, CUST_PER_DIST + 1)))
        conn.cursor.copy_from(it, 'customer', null='None', size=COPY_SIZE)
def _query_num_warehouses(self):
    with DBConn(self.args.dsn) as conn:
        try:
            conn.cursor.execute(
                'SELECT count(distinct(w_id)) FROM warehouse')
            return conn.cursor.fetchone()[0]
        except ProgrammingError:
            self._sql_error('Could not query number of warehouses.')
def _query_range_delivery_date(self):
    with DBConn(self.args.dsn) as conn:
        try:
            conn.cursor.execute(
                'SELECT min(ol_delivery_d), max(ol_delivery_d) '
                'FROM order_line')
            return conn.cursor.fetchone()
        except ProgrammingError:
            self._sql_error('Could not query the delivery date range.')
def load_nation(self):
    print('Loading nation')
    assert self.warehouse_id == 0
    with DBConn(self.dsn) as conn:
        for i in range(NUM_NATIONS):
            nation_key, name, region_key = NATIONS[i]
            self.insert_data('nation', [[
                nation_key, name, region_key,
                self.olap_text.random_length_text(31, 114)
            ]])
def load_region(self):
    print('Loading regions')
    assert self.warehouse_id == 0
    with DBConn(self.dsn) as conn:
        for i in range(NUM_REGIONS):
            region_key, name = REGIONS[i]
            self.insert_data('region', [[
                region_key, name,
                self.olap_text.random_length_text(31, 115)
            ]])
def _prepare_stats_db(self):
    dsn_url = urlparse(self.args.stats_dsn)
    dbname = dsn_url.path[1:]

    # Create the stats database if it does not exist yet.
    with DBConn(f'{dsn_url.scheme}://{dsn_url.netloc}/postgres') as conn:
        try:
            conn.cursor.execute(
                f"CREATE DATABASE {dbname} TEMPLATE template0 "
                f"ENCODING 'UTF-8'")
        except DuplicateDatabase:
            pass

    # Apply the stats schema, ignoring tables that already exist.
    with DBConn(self.args.stats_dsn) as conn:
        stats_schema_path = os.path.join('benchmarks', 'htap',
                                         'stats_schema.sql')
        with open(stats_schema_path, 'r') as schema:
            schema_sql = schema.read()
        try:
            conn.cursor.execute(schema_sql)
        except DuplicateTable:
            pass
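# DuplicateDatabase and DuplicateTable are presumably the psycopg2 error
# classes (psycopg2.errors.DuplicateDatabase and
# psycopg2.errors.DuplicateTable); swallowing them is what makes
# _prepare_stats_db idempotent across repeated benchmark runs.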
def load_history(self):
    print(f'Loading history ({self.warehouse_id})')
    copy_columns = ('h_c_id', 'h_c_d_id', 'h_c_w_id', 'h_d_id', 'h_w_id',
                    'h_date', 'h_amount', 'h_data')
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_history(d_id, c_id)
                               for d_id in range(1, DIST_PER_WARE + 1)
                               for c_id in range(1, CUST_PER_DIST + 1)))
        conn.cursor.copy_from(it, 'history', null='None',
                              columns=copy_columns, size=COPY_SIZE)
def oltp_worker(self, worker_id):
    # Do NOT introduce timeouts for the OLTP queries! That would unbalance
    # the workload and eventually stall the whole benchmark.
    with DBConn(self.args.dsn) as conn:
        oltp_worker = TransactionalWorker(worker_id, self.num_warehouses,
                                          self.latest_timestamp, conn,
                                          self.args.dry_run)
        next_reporting_time = time.time() + 0.1
        while True:
            self.oltp_sleep()
            oltp_worker.next_transaction()
            if next_reporting_time <= time.time():
                # It's cheaper to send stats in chunks, so accumulate about
                # 0.1s of samples before pushing them onto the queue.
                self.stats_queue.put(('oltp', oltp_worker.stats()))
                next_reporting_time += 0.1
def analyze_worker(self):
    tables = [
        'customer', 'district', 'history', 'item', 'nation', 'new_orders',
        'order_line', 'orders', 'region', 'stock', 'supplier', 'warehouse'
    ]
    os.makedirs('results', exist_ok=True)
    with open('results/analyze.csv', 'w+') as csv:
        with DBConn(self.args.dsn) as conn:
            while True:
                # ANALYZE every table and log each runtime, then sleep for
                # ten minutes before the next round.
                for table in tables:
                    start = time.time()
                    conn.cursor.execute(f'ANALYZE {table}')
                    runtime = time.time() - start
                    csv.write(f'{datetime.now()}, {table}, {runtime:.2f}\n')
                    csv.flush()
                time.sleep(600)
def update(self):
    # Open the files and connections lazily to avoid serialization problems
    # when forking.
    os.makedirs('results', exist_ok=True)
    if not self.csv_oltp:
        self.csv_oltp = open('results/oltp.csv', 'w')
    if not self.csv_olap:
        self.csv_olap = open('results/olap.csv', 'w')
    if not self.csv_olap_stream:
        self.csv_olap_stream = open('results/olap_stream.csv', 'w')
    if not self.csv_dbstats:
        self.csv_dbstats = open('results/dbstats.csv', 'w')
    if not self.conn:
        self.conn = DBConn(self.dsn, use_dict_cursor=True)
        self._update_cached_stats()

    self.updates += 1
    if self.updates % 10 == 0:
        self._update_cached_stats()
    if self.csv_interval and self.updates % self.csv_interval == 0:
        self._write_oltp_stats()
        self._write_dbstats()
def load_stock(self):
    print(f'Loading stock ({self.warehouse_id})')
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_stock(s_id)
                               for s_id in range(1, STOCKS + 1)))
        conn.cursor.copy_from(it, 'stock', null='None', size=COPY_SIZE)
def run(self):
    begin = datetime.now()
    elapsed = timedelta()
    burnin_duration = (timedelta() if self.args.dont_wait_until_enough_data
                       else None)

    if self.args.stats_dsn is not None:
        print(f"Statistics will be collected in '{self.args.stats_dsn}'.")
        self._prepare_stats_db()
        stats_conn_holder = DBConn(self.args.stats_dsn)
    else:
        print('Database statistics collection is disabled.')
        stats_conn_holder = nullcontext()

    def worker_init():
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    # +1 for the analyze worker.
    num_total_workers = self.args.oltp_workers + self.args.olap_workers + 1
    with stats_conn_holder as stats_conn:
        with Pool(num_total_workers, worker_init) as pool:
            oltp_workers = pool.map_async(self.oltp_worker,
                                          range(self.args.oltp_workers))
            olap_workers = pool.map_async(self.olap_worker,
                                          range(self.args.olap_workers))
            analyze_worker = pool.apply_async(self.analyze_worker)

            try:
                update_interval = timedelta(seconds=min(
                    self.args.monitoring_interval, self.args.csv_interval))
                display_interval = timedelta(
                    seconds=self.args.monitoring_interval)
                next_display = datetime.now() + display_interval
                next_update = datetime.now() + update_interval
                while True:
                    # The workers are never supposed to stop, so check for
                    # errors via ready() and, if set, propagate them by
                    # calling get().
                    if self.args.oltp_workers > 0 and oltp_workers.ready():
                        oltp_workers.get()
                    if self.args.olap_workers > 0 and olap_workers.ready():
                        olap_workers.get()
                    if analyze_worker.ready():
                        analyze_worker.get()

                    while datetime.now() < next_update:
                        self.stats.process_queue(self.stats_queue)
                        time.sleep(0.1)

                    time_now = datetime.now()
                    available_data = datetime.fromtimestamp(
                        self.latest_timestamp.value
                    ) - self.range_delivery_date[0]
                    if burnin_duration is None and available_data >= WANTED_RANGE:
                        burnin_duration = time_now - begin

                    elapsed = time_now - begin
                    if elapsed.total_seconds() >= self.args.duration:
                        break

                    self.stats.update()
                    next_update = time_now + update_interval

                    if 'print' in self.args.output and next_display <= time_now:
                        next_display += display_interval
                        self.monitor.update_display(
                            elapsed, burnin_duration, time_now, stats_conn,
                            datetime.fromtimestamp(
                                self.latest_timestamp.value))
            except KeyboardInterrupt:
                pass
            finally:
                if burnin_duration is None:
                    burnin_duration = elapsed
                self.monitor.display_summary(elapsed, burnin_duration)
                self.stats.write_summary(self.args.csv_file, elapsed)
def load_district(self):
    print(f'Loading district ({self.warehouse_id})')
    with DBConn(self.dsn) as conn:
        it = StringIteratorIO((self.generate_district(d_id)
                               for d_id in range(1, DIST_PER_WARE + 1)))
        conn.cursor.copy_from(it, 'district', null='None', size=COPY_SIZE)
def insert_data(self, table, data):
    with DBConn(self.dsn) as conn:
        sql = f'INSERT INTO {table} VALUES %s'
        execute_values(conn.cursor, sql, data)
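# All of the methods above assume `DBConn` is a small context manager around
# a psycopg2 connection that exposes its cursor as `conn.cursor`. The real
# class lives elsewhere in the project; this is only a sketch of the
# interface these methods rely on, assuming psycopg2 underneath.
import psycopg2
from psycopg2.extras import RealDictCursor


class DBConnSketch:
    """Hypothetical stand-in illustrating the DBConn interface used above."""

    def __init__(self, dsn, use_dict_cursor=False):
        self.dsn = dsn
        self.cursor_factory = RealDictCursor if use_dict_cursor else None

    def __enter__(self):
        self.conn = psycopg2.connect(self.dsn)
        self.cursor = self.conn.cursor(cursor_factory=self.cursor_factory)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Commit on success, roll back on error, then clean up.
        if exc_type is None:
            self.conn.commit()
        else:
            self.conn.rollback()
        self.cursor.close()
        self.conn.close()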