def run_middle_out(self):
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    order = get_topological_order_by_tables(relationships, self.__all_tables)
    order = list(order)

    database_helper.run_query(
        f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
        self.__destination_conn)

    # randomly sample the targets, per their target percentage
    targets = compute_targets(config_reader.get_target_table(), order)
    print('Beginning subsetting with these direct targets: ' + str(targets))
    start_time = time.time()
    for t in targets:
        columns_query = self.__columns_to_copy(t, relationships)
        q = f'SELECT {columns_query} FROM "{schema_name(t)}"."{table_name(t)}" WHERE random() < {targets[t]/100}'
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, table_name(t), schema_name(t))
    print(f'Direct target tables completed in {time.time()-start_time}s')

    # greedily grab as many downstream rows as the target strata can support
    downstream_tables = compute_downstream_tables(
        config_reader.get_target_table(), order)
    print('Beginning greedy downstream subsetting with these tables: ' +
          str(downstream_tables))
    start_time = time.time()
    processed_tables = set(targets.keys())
    for t in downstream_tables:
        self.__subset_greedily(t, processed_tables, relationships)
        processed_tables.add(t)
    print(f'Greedy subsetting completed in {time.time()-start_time}s')

    # use subset_via_parents to get all supporting rows according to existing needs
    upstream_tables = list(
        reversed(
            compute_upstream_tables(config_reader.get_target_table(), order)))
    print('Beginning upstream subsetting with these tables: ' +
          str(upstream_tables))
    start_time = time.time()
    for t in upstream_tables:
        self.subset_via_parents(t, relationships)
    print(f'Upstream subsetting completed in {time.time()-start_time}s')
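# The strata shape that run_middle_out and run_downward index into (order[c]
# is a list of tables at depth c) suggests a layered topological sort. Below
# is a minimal sketch, not the real get_topological_order_by_tables, assuming
# each relationship is a (parent_table, child_table) pair; it yields
# parents-first strata, whereas the real helper's direction is its own
# convention (run_downward reverses whatever it returns).
from collections import defaultdict

def sketch_topological_order(relationships, tables):
    children = defaultdict(set)
    in_degree = {t: 0 for t in tables}
    for parent, child in relationships:
        if parent == child or child in children[parent]:
            continue  # ignore self-references and duplicate edges
        children[parent].add(child)
        in_degree[child] += 1
    order = []
    frontier = [t for t in tables if in_degree[t] == 0]
    while frontier:
        order.append(frontier)
        next_frontier = []
        for t in frontier:
            for c in children[t]:
                in_degree[c] -= 1
                if in_degree[c] == 0:
                    next_frontier.append(c)
        frontier = next_frontier
    return order  # e.g. [[root tables], [their children], ...]; tables on FK cycles never drain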
def run_downward(self, scalePercent):
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    order = get_topological_order_by_tables(relationships, self.__all_tables)
    order = list(reversed(order))

    database_helper.run_query(
        'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
        self.__destination_conn)

    if len(order) == 0:
        return

    passthrough_tables = self.__get_passthrough_tables(order)
    sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

    if len(sampled_tables) == 0:
        return

    for t in sampled_tables:
        columns_query = self.__columns_to_copy(t, relationships)
        q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
            columns_query, schema_name(t), table_name(t), scalePercent / 100)
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, table_name(t), schema_name(t))

    for t in passthrough_tables:
        # copy passthrough tables directly to the new database
        q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, table_name(t), schema_name(t))

    for c in range(1, len(order)):
        for t in order[c]:
            if t in passthrough_tables:
                continue
            self.subset_via_parents(t, relationships)

    database_helper.run_query(
        'DROP SCHEMA IF EXISTS {} CASCADE'.format(self.temp_schema),
        self.__destination_conn)
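# Hypothetical sketch of the selection logic behind the two private helpers
# used above; the names below and the idea that the passthrough list comes
# from configuration are assumptions, not taken from the real code. Sampling
# applies only to the root stratum (order[0] after the reversal above); every
# deeper table is filled in via subset_via_parents.
def sketch_get_passthrough_tables(order, configured_passthrough):
    # keep only configured names that actually exist in the schema
    known_tables = {t for stratum in order for t in stratum}
    return [t for t in configured_passthrough if t in known_tables]

def sketch_get_sampled_tables(order, passthrough_tables):
    return [t for t in order[0] if t not in passthrough_tables]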
def run_downward(self):
    relationships = database_helper.get_fk_relationships(self.__source_conn)
    order = TopoOrderer().get_topological_order_by_tables(relationships)
    order = list(reversed(order))

    database_helper.run_query(
        f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
        self.__destination_conn)

    if len(order) == 0:
        return

    passthrough_tables = self.__get_passthrough_tables(order)
    sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

    if len(sampled_tables) == 0:
        return

    for t in sampled_tables:
        columns_query = self.__columns_to_copy(t, relationships)
        q = f'SELECT {columns_query} FROM "{self.schema}"."{t}" WHERE random() < {self.scalePercent/100}'
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, t, self.schema)

    for t in passthrough_tables:
        # copy passthrough tables directly to the new database
        q = f'SELECT * FROM "{self.schema}"."{t}"'
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, t, self.schema)

    for c in range(1, len(order)):
        for t in order[c]:
            if t in passthrough_tables:
                continue
            self.subset_via_parents(t, relationships)

    database_helper.run_query(
        f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE',
        self.__destination_conn)
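# Illustrative sketch of the filtering idea inside subset_via_parents: a
# child table keeps only rows whose foreign keys point at parent rows that
# were already copied. The relationship fields (fk_table, fk_column,
# parent_table, parent_column) and the notion that copied parent keys are
# queryable from the temp schema are assumptions for illustration; the real
# method's bookkeeping may differ.
def sketch_parent_filter(t, relationships, temp_schema):
    predicates = [
        '"{}" IN (SELECT "{}" FROM "{}"."{}")'.format(
            r['fk_column'], r['parent_column'], temp_schema,
            r['parent_table'])
        for r in relationships if r['fk_table'] == t
    ]
    # a table with no already-copied parents is unconstrained
    return ' AND '.join(predicates) if predicates else 'TRUE'

# e.g. q = f'SELECT * FROM "{schema}"."{t}" WHERE {sketch_parent_filter(t, relationships, temp_schema)}'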
def run_middle_out(self):
    relationships = database_helper.get_fk_relationships(
        self.__all_tables, self.__source_conn)
    disconnected_tables = compute_disconnected_tables(
        config_reader.get_target_table(), self.__all_tables, relationships)
    connected_tables = [
        table for table in self.__all_tables
        if table not in disconnected_tables
    ]
    order = get_topological_order_by_tables(relationships, connected_tables)
    order = list(order)

    database_helper.run_query(
        'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
        self.__destination_conn)

    # randomly sample the targets, per their target percentage
    targets = compute_targets(config_reader.get_target_table(), order)
    print('Beginning subsetting with these direct targets: ' + str(targets))
    start_time = time.time()
    for t in targets:
        columns_query = self.__columns_to_copy(t, relationships)
        q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
            columns_query, schema_name(t), table_name(t), targets[t] / 100)
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, table_name(t), schema_name(t))
    print('Direct target tables completed in {}s'.format(time.time() -
                                                         start_time))

    # greedily grab as many downstream rows as the target strata can support
    downstream_tables = compute_downstream_tables(
        config_reader.get_target_table(), order)
    print('Beginning greedy downstream subsetting with these tables: ' +
          str(downstream_tables))
    start_time = time.time()
    processed_tables = set(targets.keys())
    for t in downstream_tables:
        self.__subset_greedily(t, processed_tables, relationships)
        processed_tables.add(t)
    print('Greedy subsetting completed in {}s'.format(time.time() -
                                                      start_time))

    # use subset_via_parents to get all supporting rows according to existing needs
    upstream_tables = list(
        reversed(
            compute_upstream_tables(config_reader.get_target_table(), order)))
    print('Beginning upstream subsetting with these tables: ' +
          str(upstream_tables))
    start_time = time.time()
    for t in upstream_tables:
        self.subset_via_parents(t, relationships)
    print('Upstream subsetting completed in {}s'.format(time.time() -
                                                        start_time))

    # get all the data for tables in disconnected components (i.e. pass those tables through)
    print('Beginning pass-through of tables disconnected from the main '
          'component: ' + str(disconnected_tables))
    start_time = time.time()
    for t in disconnected_tables:
        q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
        database_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                  q, table_name(t), schema_name(t))
    print('Disconnected tables completed in {}s'.format(time.time() -
                                                        start_time))

    # clean out the temp schema
    database_helper.run_query(
        'DROP SCHEMA IF EXISTS {} CASCADE;'.format(self.temp_schema),
        self.__destination_conn)
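# A minimal sketch of what compute_disconnected_tables could do, assuming
# (parent_table, child_table) relationship pairs: treat foreign keys as
# undirected edges and report every table unreachable from the target. The
# real helper may differ (e.g. handling several targets at once).
from collections import defaultdict, deque

def sketch_disconnected_tables(target_table, all_tables, relationships):
    neighbors = defaultdict(set)
    for parent, child in relationships:
        neighbors[parent].add(child)
        neighbors[child].add(parent)
    seen = {target_table}
    frontier = deque([target_table])
    while frontier:
        current = frontier.popleft()
        for n in neighbors[current]:
            if n not in seen:
                seen.add(n)
                frontier.append(n)
    # everything outside the target's connected component gets passed through
    return [t for t in all_tables if t not in seen]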