def run_middle_out(self):
    """Subset the database 'middle-out'.

    Phases, in order:
      1. Randomly sample each configured target table at its target percentage.
      2. Greedily pull as many downstream rows as the sampled strata support.
      3. Walk upstream (reversed topological order) pulling supporting parent rows.

    Reads from ``self.__source_conn`` and writes into ``self.__destination_conn``;
    ``self.temp_schema`` is used as scratch space by the greedy phase.
    """
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    order = get_topological_order_by_tables(relationships, self.__all_tables)
    order = list(order)

    database_helper.run_query(
        f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
        self.__destination_conn)

    # randomly sample the targets, per their target percentage
    targets = compute_targets(config_reader.get_target_table(), order)
    print('Beginning subsetting with these direct targets: ' + str(targets))
    start_time = time.time()
    for t in targets:
        columns_query = self.__columns_to_copy(t, relationships)
        q = f'SELECT {columns_query} FROM "{schema_name(t)}"."{table_name(t)}" WHERE random() < {targets[t]/100}'
        database_helper.copy_rows(self.__source_conn,
                                  self.__destination_conn, q, table_name(t),
                                  schema_name(t))
    print(f'Direct target tables completed in {time.time()-start_time}s')

    # greedily grab as many downstream rows as the target strata can support
    downstream_tables = compute_downstream_tables(
        config_reader.get_target_table(), order)
    print('Beginning greedy downstream subsetting with these tables: ' +
          str(downstream_tables))
    start_time = time.time()
    processed_tables = set(targets.keys())
    for t in downstream_tables:
        self.__subset_greedily(t, processed_tables, relationships)
        processed_tables.add(t)
    print(f'Greedy subsettings completed in {time.time()-start_time}s')

    # use subset_via_parents to get all supporting rows according to existing needs
    upstream_tables = list(
        reversed(
            compute_upstream_tables(config_reader.get_target_table(), order)))
    print('Beginning upstream subsetting with these tables: ' +
          str(upstream_tables))
    start_time = time.time()
    for t in upstream_tables:
        self.subset_via_parents(t, relationships)
    print(f'Upstream subsetting completed in {time.time()-start_time}s')

    # BUGFIX: the scratch schema was never cleaned up here, unlike
    # run_downward and the disconnected-components variant of this method,
    # leaking temp tables into the destination database on every run.
    database_helper.run_query(
        f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE',
        self.__destination_conn)
def run_downward(self, scalePercent):
    """Subset the database top-down.

    Samples the root tables of the reversed topological order at
    ``scalePercent`` percent, copies passthrough tables verbatim, then fills
    in every deeper level from its already-subsetted parents. The scratch
    schema is dropped on completion.
    """
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    order = list(
        reversed(get_topological_order_by_tables(relationships,
                                                 self.__all_tables)))

    database_helper.run_query(
        f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
        self.__destination_conn)

    # nothing to do on an empty schema
    if not order:
        return

    passthrough_tables = self.__get_passthrough_tables(order)
    sampled_tables = self.__get_sampled_tables(order, passthrough_tables)
    if not sampled_tables:
        return

    # sample the root tables at the requested percentage
    for tbl in sampled_tables:
        cols = self.__columns_to_copy(tbl, relationships)
        sample_query = (
            f'SELECT {cols} FROM "{schema_name(tbl)}"."{table_name(tbl)}" '
            f'WHERE random() < {scalePercent / 100}')
        database_helper.copy_rows(self.__source_conn,
                                  self.__destination_conn, sample_query,
                                  table_name(tbl), schema_name(tbl))

    #copy passthrough tables directly to new database
    for tbl in passthrough_tables:
        database_helper.copy_rows(
            self.__source_conn, self.__destination_conn,
            f'SELECT * FROM "{schema_name(tbl)}"."{table_name(tbl)}"',
            table_name(tbl), schema_name(tbl))

    # every level after the roots is populated from its parents
    for level in order[1:]:
        for tbl in level:
            if tbl in passthrough_tables:
                continue
            self.subset_via_parents(tbl, relationships)

    database_helper.run_query(
        f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE',
        self.__destination_conn)
def run_downward(self):
    """Subset the database top-down.

    Samples the root tables of the reversed topological order at
    ``self.scalePercent`` percent, copies passthrough tables verbatim, then
    fills in each deeper level from its already-subsetted parents. The
    scratch schema is dropped on completion.
    """
    relationships = database_helper.get_fk_relationships(self.__source_conn)
    order = TopoOrderer().get_topological_order_by_tables(relationships)
    order = list(reversed(order))

    database_helper.run_query(
        'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
        self.__destination_conn)

    # nothing to do on an empty schema
    if not order:
        return

    passthrough_tables = self.__get_passthrough_tables(order)
    sampled_tables = self.__get_sampled_tables(order, passthrough_tables)
    if not sampled_tables:
        return

    # sample the root tables at the configured percentage
    sample_threshold = self.scalePercent / 100
    for tbl in sampled_tables:
        cols = self.__columns_to_copy(tbl, relationships)
        sample_query = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
            cols, self.schema, tbl, sample_threshold)
        database_helper.copy_rows(self.__source_conn,
                                  self.__destination_conn, sample_query, tbl,
                                  self.schema)

    #copy passthrough tables directly to new database
    for tbl in passthrough_tables:
        database_helper.copy_rows(
            self.__source_conn, self.__destination_conn,
            'SELECT * FROM "{}"."{}"'.format(self.schema, tbl), tbl,
            self.schema)

    # every level after the roots is populated from its parents
    for level in order[1:]:
        for tbl in level:
            if tbl in passthrough_tables:
                continue
            self.subset_via_parents(tbl, relationships)

    database_helper.run_query(
        'DROP SCHEMA IF EXISTS {} CASCADE'.format(self.temp_schema),
        self.__destination_conn)
def __subset_greedily(self, target, processed_tables, relationships):
    """Copy into the destination the rows of ``target`` that are referenced
    by any already-processed child table.

    Works by staging a full copy of ``target`` in ``self.temp_schema``,
    filtering it with IN-subqueries against each processed child's FK
    columns, and inserting the survivors into the real destination table.
    The scratch table is always dropped and the connection always closed.
    """
    destination_conn = self.__destination_dbc.get_db_connection()
    temp_target_name = 'subset_temp_' + table_name(target)
    try:
        # stage the whole source table in the scratch schema
        columns_query = self.__columns_to_copy(target, relationships)
        database_helper.run_query(
            'CREATE TABLE "{}"."{}" AS SELECT * FROM "{}"."{}" LIMIT 0'.
            format(self.temp_schema, temp_target_name, schema_name(target),
                   table_name(target)), destination_conn)
        query = 'SELECT {} FROM "{}"."{}"'.format(columns_query,
                                                  schema_name(target),
                                                  table_name(target))
        database_helper.copy_rows(self.__source_conn, destination_conn, query,
                                  temp_target_name, self.temp_schema)

        # keep only rows referenced by already-processed child tables
        relevant_key_constraints = [
            r for r in relationships
            if r['child_table_name'] in processed_tables
            and r['parent_table_name'] == target
        ]
        clauses = [
            '"{}"."{}" IN (SELECT "{}" FROM "{}"."{}")'.format(
                temp_target_name, kc['fk_column_name'], kc['pk_column_name'],
                schema_name(kc['child_table_name']),
                table_name(kc['child_table_name']))
            for kc in relevant_key_constraints
        ]
        # BUGFIX: the original built 'WHERE TRUE AND {joined}', which produced
        # the invalid SQL 'WHERE TRUE AND ' (trailing AND) whenever no
        # relevant constraints existed. Joining TRUE together with the clauses
        # yields identical SQL when clauses exist and valid SQL when none do.
        where_clause = ' AND '.join(['TRUE'] + clauses)
        query = 'SELECT * FROM "{}"."{}" WHERE {}'.format(
            self.temp_schema, temp_target_name, where_clause)
        database_helper.run_query(
            'INSERT INTO "{}"."{}" {}'.format(schema_name(target),
                                              table_name(target), query),
            destination_conn)
        destination_conn.commit()
    finally:
        # always delete the temporary table and release the connection
        database_helper.run_query(
            'DROP TABLE IF EXISTS "{}"."{}"'.format(self.temp_schema,
                                                    temp_target_name),
            destination_conn)
        destination_conn.close()
def run_middle_out(self):
    """Subset the database 'middle-out', handling disconnected components.

    Phases, in order:
      1. Randomly sample each configured target table at its target percentage.
      2. Greedily pull as many downstream rows as the sampled strata support.
      3. Walk upstream (reversed topological order) pulling supporting parent rows.
      4. Pass through, in full, every table disconnected from the main component.
      5. Drop the scratch schema.
    """
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    disconnected_tables = compute_disconnected_tables(
        config_reader.get_target_table(), self.__all_tables, relationships)
    connected_tables = [
        tbl for tbl in self.__all_tables if tbl not in disconnected_tables
    ]
    order = list(
        get_topological_order_by_tables(relationships, connected_tables))

    database_helper.run_query(
        f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
        self.__destination_conn)

    # randomly sample the targets, per their target percentage
    targets = compute_targets(config_reader.get_target_table(), order)
    print('Beginning subsetting with these direct targets: ' + str(targets))
    phase_start = time.time()
    for tbl in targets:
        cols = self.__columns_to_copy(tbl, relationships)
        sample_query = (
            f'SELECT {cols} FROM "{schema_name(tbl)}"."{table_name(tbl)}" '
            f'WHERE random() < {targets[tbl] / 100}')
        database_helper.copy_rows(self.__source_conn,
                                  self.__destination_conn, sample_query,
                                  table_name(tbl), schema_name(tbl))
    print(f'Direct target tables completed in {time.time() - phase_start}s')

    # greedily grab as many downstream rows as the target strata can support
    downstream_tables = compute_downstream_tables(
        config_reader.get_target_table(), order)
    print('Beginning greedy downstream subsetting with these tables: ' +
          str(downstream_tables))
    phase_start = time.time()
    processed_tables = set(targets.keys())
    for tbl in downstream_tables:
        self.__subset_greedily(tbl, processed_tables, relationships)
        processed_tables.add(tbl)
    print(f'Greedy subsettings completed in {time.time() - phase_start}s')

    # use subset_via_parents to get all supporting rows according to existing needs
    upstream_tables = compute_upstream_tables(config_reader.get_target_table(),
                                              order)
    upstream_tables = list(reversed(upstream_tables))
    print('Beginning upstream subsetting with these tables: ' +
          str(upstream_tables))
    phase_start = time.time()
    for tbl in upstream_tables:
        self.subset_via_parents(tbl, relationships)
    print(f'Upstream subsetting completed in {time.time() - phase_start}s')

    # get all the data for tables in disconnected components (i.e. pass those tables through)
    print(
        "Beginning pass-through of tables disconnected from the main component: "
        + str(disconnected_tables))
    phase_start = time.time()
    for tbl in disconnected_tables:
        passthrough_query = (
            f'SELECT * FROM "{schema_name(tbl)}"."{table_name(tbl)}"')
        database_helper.copy_rows(self.__source_conn,
                                  self.__destination_conn, passthrough_query,
                                  table_name(tbl), schema_name(tbl))
    print(f'Disconnected tables completed in {time.time() - phase_start}s')

    # clean out the temp schema
    database_helper.run_query(
        f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE;',
        self.__destination_conn)