Example #1
    def run_middle_out(self):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        order = get_topological_order_by_tables(relationships,
                                                self.__all_tables)
        order = list(order)

        database_helper.run_query(
            f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
            self.__destination_conn)

        # randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        start_time = time.time()
        for t in targets:
            columns_query = self.__columns_to_copy(t, relationships)
            q = f'SELECT {columns_query} FROM "{schema_name(t)}"."{table_name(t)}" WHERE random() < {targets[t]/100}'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print(f'Direct target tables completed in {time.time()-start_time}s')

        # greedily grab as many downstream rows as the target strata can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        processed_tables = set(targets.keys())
        for t in downstream_tables:
            self.__subset_greedily(t, processed_tables, relationships)
            processed_tables.add(t)
        print(f'Greedy subsetting completed in {time.time()-start_time}s')

        # use subset_via_parents to get all supporting rows according to existing needs
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for t in upstream_tables:
            self.subset_via_parents(t, relationships)
        print(f'Upstream subsetting completed in {time.time()-start_time}s')
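Every example leans on get_topological_order_by_tables to arrange tables so that parents are handled in a known order relative to their children, and Example #2 below indexes into order level by level, which suggests the helper yields strata of tables rather than a flat list. Here is a minimal sketch of such a level-grouped topological sort (Kahn's algorithm), assuming each relationship is a (parent_table, child_table) pair; the real relationship format may differ.

from collections import defaultdict

def get_topological_order_by_tables(relationships, tables):
    # Sketch: group tables into strata so that every table lands in a later
    # stratum than all of its parents. Assumes each relationship is a
    # (parent_table, child_table) pair; self-referencing FKs are skipped.
    tables = set(tables)
    children = defaultdict(set)
    in_degree = {t: 0 for t in tables}
    for parent, child in relationships:
        if parent == child or parent not in tables or child not in tables:
            continue
        if child not in children[parent]:
            children[parent].add(child)
            in_degree[child] += 1
    level = [t for t in tables if in_degree[t] == 0]
    while level:
        yield level
        next_level = []
        for parent in level:
            for child in children[parent]:
                in_degree[child] -= 1
                if in_degree[child] == 0:
                    next_level.append(child)
        level = next_level

The caller's order = list(order) then materializes the strata into a list of lists, which is exactly what Example #2's order[c] loop walks.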
Example #2
    def run_downward(self, scalePercent):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        order = get_topological_order_by_tables(relationships,
                                                self.__all_tables)
        order = list(reversed(order))

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        if len(order) == 0:
            return

        passthrough_tables = self.__get_passthrough_tables(order)
        sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

        if len(sampled_tables) == 0:
            return

        for t in sampled_tables:
            columns_query = self.__columns_to_copy(t, relationships)
            q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                columns_query, schema_name(t), table_name(t),
                scalePercent / 100)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))

        # copy passthrough tables directly to the new database
        for t in passthrough_tables:
            q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))

        for c in range(1, len(order)):
            for t in order[c]:
                if t in passthrough_tables:
                    continue
                self.subset_via_parents(t, relationships)

        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE'.format(self.temp_schema),
            self.__destination_conn)
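Neither helper used above is shown, but the structure of run_downward constrains them: passthrough tables are copied wholesale rather than sampled, and after the order is reversed only stratum 0 is sampled while every later stratum is filled in through subset_via_parents. A plausible sketch of both methods, shown standalone, assuming a hypothetical config_reader.get_passthrough_tables() that returns the configured table names (both helper names here are illustrative, not confirmed by the source):

def __get_passthrough_tables(self, order):
    # Hypothetical: passthrough tables are named in the config; keep only
    # the ones that actually appear somewhere in the topological order.
    configured = set(config_reader.get_passthrough_tables())
    return [t for level in order for t in level if t in configured]

def __get_sampled_tables(self, order, passthrough_tables):
    # Hypothetical: in a downward run only the root stratum is sampled;
    # everything beneath it is derived later via subset_via_parents.
    return [t for t in order[0] if t not in passthrough_tables]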
Example #3
    def run_middle_out(self):
        passthrough_tables = self.__get_passthrough_tables()
        relationships = self.__db_helper.get_unredacted_fk_relationships(
            self.__all_tables, self.__source_conn)
        disconnected_tables = compute_disconnected_tables(
            config_reader.get_initial_target_tables(), passthrough_tables,
            self.__all_tables, relationships)
        connected_tables = [
            table for table in self.__all_tables
            if table not in disconnected_tables
        ]
        order = get_topological_order_by_tables(relationships,
                                                connected_tables)
        order = list(order)

        # start by subsetting the direct targets
        print('Beginning subsetting with these direct targets: ' +
              str(config_reader.get_initial_target_tables()))
        start_time = time.time()
        processed_tables = set()
        for idx, target in enumerate(config_reader.get_initial_targets()):
            print_progress(target, idx + 1,
                           len(config_reader.get_initial_targets()))
            self.__subset_direct(target, relationships)
            processed_tables.add(target['table'])
        print('Direct target tables completed in {}s'.format(time.time() -
                                                             start_time))

        # greedily grab rows with foreign keys to rows in the target strata
        upstream_tables = compute_upstream_tables(
            config_reader.get_initial_target_tables(), order)
        print('Beginning greedy upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for idx, t in enumerate(upstream_tables):
            print_progress(t, idx + 1, len(upstream_tables))
            data_added = self.__subset_upstream(t, processed_tables,
                                                relationships)
            if data_added:
                processed_tables.add(t)
        print('Greedy subsetting completed in {}s'.format(time.time() -
                                                          start_time))

        # process pass-through tables; this must happen before subset_downstream so all required downstream rows can be found
        print('Beginning pass-through tables: ' + str(passthrough_tables))
        start_time = time.time()
        for idx, t in enumerate(passthrough_tables):
            print_progress(t, idx + 1, len(passthrough_tables))
            q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
            self.__db_helper.copy_rows(
                self.__source_conn, self.__destination_conn, q,
                mysql_db_name_hack(t, self.__destination_conn))
        print('Pass-through completed in {}s'.format(time.time() - start_time))

        # use subset_downstream to get all supporting rows according to existing needs
        downstream_tables = compute_downstream_tables(passthrough_tables,
                                                      disconnected_tables,
                                                      order)
        print('Beginning downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        for idx, t in enumerate(downstream_tables):
            print_progress(t, idx + 1, len(downstream_tables))
            self.subset_downstream(t, relationships)
        print('Downstream subsetting completed in {}s'.format(time.time() -
                                                              start_time))

        if config_reader.keep_disconnected_tables():
            # get all the data for tables in disconnected components (i.e. pass those tables through)
            print('Beginning disconnected tables: ' + str(disconnected_tables))
            start_time = time.time()
            for idx, t in enumerate(disconnected_tables):
                print_progress(t, idx + 1, len(disconnected_tables))
                q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
                self.__db_helper.copy_rows(
                    self.__source_conn, self.__destination_conn, q,
                    mysql_db_name_hack(t, self.__destination_conn))
            print('Disconnected tables completed in {}s'.format(time.time() -
                                                                start_time))
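compute_disconnected_tables is what separates the connected component around the targets from everything else. A sketch matching Example #3's four-argument call, under the assumption that relationships are (parent_table, child_table) pairs: walk the FK graph from the target tables ignoring edge direction, and report every table the walk never reaches. Excluding passthrough tables from the result is also an assumption, based on Example #3 handling them separately.

from collections import defaultdict

def compute_disconnected_tables(targets, passthrough_tables, all_tables,
                                relationships):
    # Sketch: a table is disconnected when no chain of FKs, followed in
    # either direction, links it to any target table.
    neighbors = defaultdict(set)
    for parent, child in relationships:
        neighbors[parent].add(child)
        neighbors[child].add(parent)
    seen = set(targets)
    stack = list(targets)
    while stack:
        table = stack.pop()
        for neighbor in neighbors[table]:
            if neighbor not in seen:
                seen.add(neighbor)
                stack.append(neighbor)
    return [t for t in all_tables
            if t not in seen and t not in passthrough_tables]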
Example #4
    def run_middle_out(self):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        disconnected_tables = compute_disconnected_tables(
            config_reader.get_target_table(), self.__all_tables, relationships)
        connected_tables = [
            table for table in self.__all_tables
            if table not in disconnected_tables
        ]
        order = get_topological_order_by_tables(relationships,
                                                connected_tables)
        order = list(order)

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        # randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        start_time = time.time()
        for t in targets:
            columns_query = self.__columns_to_copy(t, relationships)
            q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                columns_query, schema_name(t), table_name(t), targets[t] / 100)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print('Direct target tables completed in {}s'.format(time.time() -
                                                             start_time))

        # greedily grab as many downstream rows as the target strata can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        processed_tables = set(targets.keys())
        for t in downstream_tables:
            self.__subset_greedily(t, processed_tables, relationships)
            processed_tables.add(t)
        print('Greedy subsetting completed in {}s'.format(time.time() -
                                                          start_time))

        # use subset_via_parents to get all supporting rows according to existing needs
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for t in upstream_tables:
            self.subset_via_parents(t, relationships)
        print('Upstream subsetting completed in {}s'.format(time.time() -
                                                            start_time))

        # get all the data for tables in disconnected components (i.e. pass those tables through)
        print(
            "Beginning pass-through of tables disconnected from the main component: "
            + str(disconnected_tables))
        start_time = time.time()
        for t in disconnected_tables:
            q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print('Disconnected tables completed in {}s'.format(time.time() -
                                                            start_time))

        # clean out the temp schema
        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE;'.format(self.temp_schema),
            self.__destination_conn)
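In Examples #1 and #4 the direct sampling is driven by compute_targets, after which each row of a target table survives independently with probability targets[t] / 100 thanks to the random() predicate (so a target of 5 keeps roughly 5% of the rows, not an exact count). A sketch of the helper, assuming the target configuration maps table names to retention percentages; the actual config shape may differ.

def compute_targets(target_config, order):
    # Sketch: target_config is assumed to map table name -> percentage of
    # rows to keep, e.g. {'public.users': 5}. Restricting to tables present
    # in the topological order means targets.keys() can safely seed
    # processed_tables for the greedy downstream pass.
    tables_in_order = {t for level in order for t in level}
    return {table: percent for table, percent in target_config.items()
            if table in tables_in_order}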