Example #1
    def run_middle_out(self):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        order = get_topological_order_by_tables(relationships,
                                                self.__all_tables)
        order = list(order)

        database_helper.run_query(
            f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
            self.__destination_conn)

        # randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        start_time = time.time()
        for t in targets:
            columns_query = self.__columns_to_copy(t, relationships)
            q = f'SELECT {columns_query} FROM "{schema_name(t)}"."{table_name(t)}" WHERE random() < {targets[t]/100}'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print(f'Direct target tables completed in {time.time()-start_time}s')

        # greedily grab as many downstream rows as the target strata can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        processed_tables = set(targets.keys())
        for t in downstream_tables:
            self.__subset_greedily(t, processed_tables, relationships)
            processed_tables.add(t)
        print(f'Greedy subsettings completed in {time.time()-start_time}s')

        # use subset_via_parents to get all supporting rows according to existing needs
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for t in upstream_tables:
            self.subset_via_parents(t, relationships)
        print(f'Upstream subsetting completed in {time.time()-start_time}s')
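The direct-target step above is plain Bernoulli sampling in Postgres: random() < p keeps each row independently with probability p, so the copied subset is roughly targets[t] percent of the table. A minimal standalone sketch of that predicate (the connection string, schema, and table names are made up for illustration; database_helper.copy_rows is the project's own wrapper and is not reproduced here):

import psycopg2

def sample_rows(dsn, schema, table, percent):
    # Each row passes independently with probability percent/100, which
    # mirrors the WHERE random() < targets[t]/100 predicate used above.
    query = 'SELECT * FROM "{}"."{}" WHERE random() < %s'.format(schema, table)
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(query, (percent / 100,))
        return cur.fetchall()

# Usage (hypothetical connection string and table):
# rows = sample_rows('dbname=app', 'public', 'users', 5)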
Example #2
    def run_downward(self, scalePercent):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        order = get_topological_order_by_tables(relationships,
                                                self.__all_tables)
        order = list(reversed(order))

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        if len(order) == 0:
            return

        passthrough_tables = self.__get_passthrough_tables(order)
        sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

        if len(sampled_tables) == 0:
            return

        for t in sampled_tables:
            columns_query = self.__columns_to_copy(t, relationships)
            q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                columns_query, schema_name(t), table_name(t),
                scalePercent / 100)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))

        for t in passthrough_tables:
            # copy passthrough tables directly to the new database
            q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))

        # process the remaining strata of the topological order, filling
        # each table from its parents' already-copied rows
        for c in range(1, len(order)):
            for t in order[c]:
                if t in passthrough_tables:
                    continue
                self.subset_via_parents(t, relationships)

        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE'.format(self.temp_schema),
            self.__destination_conn)
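Both run_downward variants index order by stratum (order[c]), which implies get_topological_order_by_tables returns the FK graph layered into strata rather than a flat list. A minimal Kahn-style sketch of such a layering, assuming the relationship dicts carry parent_table_name/child_table_name keys as in the filters above (an illustration, not the project's actual implementation; which end counts as the top after the reversal is internal to the project's helpers):

from collections import defaultdict

def layered_topological_order(tables, relationships):
    # relationships: dicts with 'parent_table_name' and 'child_table_name'
    children = defaultdict(set)
    indegree = {t: 0 for t in tables}
    for r in relationships:
        parent, child = r['parent_table_name'], r['child_table_name']
        if parent == child or child in children[parent]:
            continue  # skip self-references and duplicate edges
        children[parent].add(child)
        indegree[child] += 1
    stratum = [t for t in tables if indegree[t] == 0]
    order = []
    while stratum:
        order.append(stratum)
        next_stratum = []
        for t in stratum:
            for c in children[t]:
                indegree[c] -= 1
                if indegree[c] == 0:
                    next_stratum.append(c)
        stratum = next_stratum
    # order[0] holds the roots; each stratum depends only on earlier ones
    return order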
Example #3
    def run_downward(self):
        relationships = database_helper.get_fk_relationships(
            self.__source_conn)
        order = TopoOrderer().get_topological_order_by_tables(relationships)
        order = list(reversed(order))

        database_helper.run_query(
            f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
            self.__destination_conn)

        if len(order) == 0:
            return

        passthrough_tables = self.__get_passthrough_tables(order)
        sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

        if len(sampled_tables) == 0:
            return

        for t in sampled_tables:
            columns_query = self.__columns_to_copy(t, relationships)
            q = f'SELECT {columns_query} FROM "{self.schema}"."{t}" WHERE random() < {self.scalePercent/100}'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q, t,
                                      self.schema)

        for t in passthrough_tables:
            # copy passthrough tables directly to the new database
            q = f'SELECT * FROM "{self.schema}"."{t}"'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q, t,
                                      self.schema)

        # process the remaining strata of the topological order, filling
        # each table from its parents' already-copied rows
        for c in range(1, len(order)):
            for t in order[c]:
                if t in passthrough_tables:
                    continue
                self.subset_via_parents(t, relationships)

        database_helper.run_query(
            f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE',
            self.__destination_conn)
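Examples #2 and #3 split the first stratum into tables to sample and tables to copy verbatim before descending through the rest of the order. A plausible shape for those two private helpers, assuming the pass-through list comes from configuration (both function names and the configured_passthrough argument are hypothetical stand-ins for the class's private methods):

def get_passthrough_tables(order, configured_passthrough):
    # Tables the user wants copied 1:1, restricted to tables that
    # actually appear somewhere in the topological order.
    present = {t for stratum in order for t in stratum}
    return [t for t in configured_passthrough if t in present]

def get_sampled_tables(order, passthrough_tables):
    # Only the first stratum is sampled unconditionally; every other
    # table is filled in later through subset_via_parents.
    return [t for t in order[0] if t not in passthrough_tables]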
Example #4
    def __subset_greedily(self, target, processed_tables, relationships):

        destination_conn = self.__destination_dbc.get_db_connection()
        temp_target_name = 'subset_temp_' + table_name(target)

        try:
            # copy the whole table
            columns_query = self.__columns_to_copy(target, relationships)
            database_helper.run_query(
                'CREATE TABLE "{}"."{}" AS SELECT * FROM "{}"."{}" LIMIT 0'
                .format(self.temp_schema, temp_target_name,
                        schema_name(target), table_name(target)),
                destination_conn)
            query = 'SELECT {} FROM "{}"."{}"'.format(columns_query,
                                                      schema_name(target),
                                                      table_name(target))
            database_helper.copy_rows(self.__source_conn, destination_conn,
                                      query, temp_target_name,
                                      self.temp_schema)

            # filter it down in the target database: keep only rows whose
            # keys appear in the already-subsetted child tables
            relevant_key_constraints = [
                r for r in relationships
                if r['child_table_name'] in processed_tables
                and r['parent_table_name'] == target
            ]
            clauses = [
                '"{}"."{}" IN (SELECT "{}" FROM "{}"."{}")'.format(
                    temp_target_name, kc['fk_column_name'],
                    kc['pk_column_name'],
                    schema_name(kc['child_table_name']),
                    table_name(kc['child_table_name']))
                for kc in relevant_key_constraints
            ]
            # guard against an empty clause list, which would otherwise
            # render invalid SQL ('... WHERE TRUE AND ')
            query = 'SELECT * FROM "{}"."{}" WHERE {}'.format(
                self.temp_schema, temp_target_name,
                ' AND '.join(clauses) if clauses else 'TRUE')
            database_helper.run_query(
                'INSERT INTO "{}"."{}" {}'.format(schema_name(target),
                                                  table_name(target), query),
                destination_conn)
            destination_conn.commit()

        finally:
            # delete temporary table
            database_helper.run_query(
                'DROP TABLE IF EXISTS "{}"."{}"'.format(
                    self.temp_schema, temp_target_name), destination_conn)
            destination_conn.close()
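To make the string-building in __subset_greedily concrete, here is the filter query it would render for a hypothetical public.events target with one already-processed child table public.sessions (all table and column names below are invented; only the query shape matches the code above):

temp_target_name = 'subset_temp_events'
clause = '"{}"."{}" IN (SELECT "{}" FROM "{}"."{}")'.format(
    temp_target_name, 'event_id', 'id', 'public', 'sessions')
query = 'SELECT * FROM "{}"."{}" WHERE {}'.format(
    'subset_temp', temp_target_name, clause)
# query is now:
#   SELECT * FROM "subset_temp"."subset_temp_events"
#   WHERE "subset_temp_events"."event_id" IN
#         (SELECT "id" FROM "public"."sessions")

The INSERT then moves the surviving rows from the temp table into the real destination table, and the finally block guarantees the temp table is dropped even if the filter fails.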
Example #5
    def run_middle_out(self):
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        disconnected_tables = compute_disconnected_tables(
            config_reader.get_target_table(), self.__all_tables, relationships)
        connected_tables = [
            table for table in self.__all_tables
            if table not in disconnected_tables
        ]
        order = get_topological_order_by_tables(relationships,
                                                connected_tables)
        order = list(order)

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        # randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        start_time = time.time()
        for t in targets:
            columns_query = self.__columns_to_copy(t, relationships)
            q = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                columns_query, schema_name(t), table_name(t), targets[t] / 100)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print('Direct target tables completed in {}s'.format(time.time() -
                                                             start_time))

        # greedily grab as many downstream rows as the target strata can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        processed_tables = set(targets.keys())
        for t in downstream_tables:
            self.__subset_greedily(t, processed_tables, relationships)
            processed_tables.add(t)
        print('Greedy subsettings completed in {}s'.format(time.time() -
                                                           start_time))

        # use subset_via_parents to get all supporting rows according to existing needs
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for t in upstream_tables:
            self.subset_via_parents(t, relationships)
        print('Upstream subsetting completed in {}s'.format(time.time() -
                                                            start_time))

        # get all the data for tables in disconnected components (i.e. pass those tables through)
        print(
            "Beginning pass-through of tables disconnected from the main component: "
            + str(disconnected_tables))
        start_time = time.time()
        for t in disconnected_tables:
            q = 'SELECT * FROM "{}"."{}"'.format(schema_name(t), table_name(t))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, q,
                                      table_name(t), schema_name(t))
        print('Disconnected tables completed in {}s'.format(time.time() -
                                                            start_time))

        # clean out the temp schema
        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE;'.format(self.temp_schema),
            self.__destination_conn)
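Example #5 extends #1 by copying through tables that are disconnected from the target's foreign-key graph. compute_disconnected_tables itself is not shown; a plausible version is an undirected reachability check from the target table (a sketch under that assumption, not the project's actual helper):

from collections import defaultdict, deque

def compute_disconnected_tables(target_table, all_tables, relationships):
    # Build an undirected adjacency over FK edges; any table unreachable
    # from the target belongs to a disconnected component.
    adjacent = defaultdict(set)
    for r in relationships:
        adjacent[r['parent_table_name']].add(r['child_table_name'])
        adjacent[r['child_table_name']].add(r['parent_table_name'])
    seen = {target_table}
    queue = deque([target_table])
    while queue:
        for neighbor in adjacent[queue.popleft()]:
            if neighbor not in seen:
                seen.add(neighbor)
                queue.append(neighbor)
    return [t for t in all_tables if t not in seen]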