예제 #1
0
파일: subset.py 프로젝트: shurik/condenser
    def run_middle_out(self):
        """Subset the database "middle-out".

        Three phases, each timed and announced on stdout:
          1. randomly sample each direct target table at its configured
             percentage,
          2. greedily pull as many downstream (child) rows as the sampled
             strata can support,
          3. walk upstream (parents, in reverse topological order) fetching
             every supporting row the copied data requires.
        """
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        order = list(
            get_topological_order_by_tables(relationships, self.__all_tables))

        # ensure the scratch schema exists on the destination
        database_helper.run_query(
            f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
            self.__destination_conn)

        # phase 1: randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        phase_start = time.time()
        for table, percent in targets.items():
            column_list = self.__columns_to_copy(table, relationships)
            query = f'SELECT {column_list} FROM "{schema_name(table)}"."{table_name(table)}" WHERE random() < {percent/100}'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query,
                                      table_name(table), schema_name(table))
        print(f'Direct target tables completed in {time.time()-phase_start}s')

        # phase 2: greedily grab as many downstream rows as the target strata
        # can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        phase_start = time.time()
        processed_tables = set(targets)
        for table in downstream_tables:
            self.__subset_greedily(table, processed_tables, relationships)
            processed_tables.add(table)
        print(f'Greedy subsettings completed in {time.time()-phase_start}s')

        # phase 3: use subset_via_parents to get all supporting rows according
        # to existing needs (reverse topological order: parents first)
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        phase_start = time.time()
        for table in upstream_tables:
            self.subset_via_parents(table, relationships)
        print(f'Upstream subsetting completed in {time.time()-phase_start}s')
예제 #2
0
    def run_downward(self, scalePercent):
        """Subset top-down at a uniform sampling rate.

        Samples each root-tier table at ``scalePercent`` percent, copies
        passthrough tables wholesale, then walks the remaining topological
        tiers pulling child rows via their already-copied parents. The temp
        schema is created up front and dropped (CASCADE) at the end.

        :param scalePercent: percentage (0-100) of rows to keep in sampled
            tables.
        """
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        # reversed topological order: roots (no outgoing FKs) come first
        order = list(
            reversed(
                get_topological_order_by_tables(relationships,
                                                self.__all_tables)))

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        if not order:
            return

        passthrough_tables = self.__get_passthrough_tables(order)
        sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

        if not sampled_tables:
            return

        fraction = scalePercent / 100
        for table in sampled_tables:
            column_list = self.__columns_to_copy(table, relationships)
            query = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                column_list, schema_name(table), table_name(table), fraction)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query,
                                      table_name(table), schema_name(table))

        # copy passthrough tables directly to new database, unsampled
        for table in passthrough_tables:
            query = 'SELECT * FROM "{}"."{}"'.format(schema_name(table),
                                                     table_name(table))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query,
                                      table_name(table), schema_name(table))

        # every tier below the roots: fetch rows reachable from copied parents
        for tier in order[1:]:
            for table in tier:
                if table in passthrough_tables:
                    continue
                self.subset_via_parents(table, relationships)

        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE'.format(self.temp_schema),
            self.__destination_conn)
예제 #3
0
    def run_downward(self):
        """Subset top-down using the instance-configured scale percentage.

        Samples each root-tier table at ``self.scalePercent`` percent of rows,
        copies passthrough tables in full, then walks the remaining
        topological tiers pulling child rows via their already-copied parents.
        All tables live in ``self.schema``; a temp schema is created first and
        dropped (CASCADE) when done.
        """
        relationships = database_helper.get_fk_relationships(
            self.__source_conn)
        # reversed topological order: roots (no outgoing FKs) come first
        order = list(
            reversed(TopoOrderer().get_topological_order_by_tables(
                relationships)))

        database_helper.run_query(
            f'CREATE SCHEMA IF NOT EXISTS {self.temp_schema}',
            self.__destination_conn)

        if not order:
            return

        passthrough_tables = self.__get_passthrough_tables(order)
        sampled_tables = self.__get_sampled_tables(order, passthrough_tables)

        if not sampled_tables:
            return

        for table in sampled_tables:
            column_list = self.__columns_to_copy(table, relationships)
            query = f'SELECT {column_list} FROM "{self.schema}"."{table}" WHERE random() < {self.scalePercent/100}'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query, table,
                                      self.schema)

        # copy passthrough tables directly to new database, unsampled
        for table in passthrough_tables:
            query = f'SELECT * FROM "{self.schema}"."{table}"'
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query, table,
                                      self.schema)

        # every tier below the roots: fetch rows reachable from copied parents
        for tier in order[1:]:
            for table in tier:
                if table in passthrough_tables:
                    continue
                self.subset_via_parents(table, relationships)

        database_helper.run_query(
            f'DROP SCHEMA IF EXISTS {self.temp_schema} CASCADE',
            self.__destination_conn)
예제 #4
0
    def run_middle_out(self):
        """Subset the database "middle-out", then pass disconnected tables
        through untouched.

        Phases, each timed and announced on stdout:
          1. randomly sample each direct target table at its configured
             percentage,
          2. greedily pull as many downstream (child) rows as the sampled
             strata can support,
          3. walk upstream (parents, reverse topological order) fetching
             every supporting row the copied data requires,
          4. copy tables disconnected from the targets' FK component in full,
          5. drop the temp schema (CASCADE).
        """
        relationships = database_helper.get_fk_relationships(
            self.__all_tables, self.__source_conn)
        # split the FK graph: tables unreachable from the targets are copied
        # whole, everything else participates in the subset
        disconnected_tables = compute_disconnected_tables(
            config_reader.get_target_table(), self.__all_tables, relationships)
        connected_tables = [
            table for table in self.__all_tables
            if table not in disconnected_tables
        ]
        order = list(
            get_topological_order_by_tables(relationships, connected_tables))

        database_helper.run_query(
            'CREATE SCHEMA IF NOT EXISTS {}'.format(self.temp_schema),
            self.__destination_conn)

        # phase 1: randomly sample the targets, per their target percentage
        targets = compute_targets(config_reader.get_target_table(), order)
        print('Beginning subsetting with these direct targets: ' +
              str(targets))
        phase_start = time.time()
        for table, percent in targets.items():
            column_list = self.__columns_to_copy(table, relationships)
            query = 'SELECT {} FROM "{}"."{}" WHERE random() < {}'.format(
                column_list, schema_name(table), table_name(table),
                percent / 100)
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query,
                                      table_name(table), schema_name(table))
        elapsed = time.time() - phase_start
        print('Direct target tables completed in {}s'.format(elapsed))

        # phase 2: greedily grab as many downstream rows as the target strata
        # can support
        downstream_tables = compute_downstream_tables(
            config_reader.get_target_table(), order)
        print('Beginning greedy downstream subsetting with these tables: ' +
              str(downstream_tables))
        phase_start = time.time()
        processed_tables = set(targets)
        for table in downstream_tables:
            self.__subset_greedily(table, processed_tables, relationships)
            processed_tables.add(table)
        elapsed = time.time() - phase_start
        print('Greedy subsettings completed in {}s'.format(elapsed))

        # phase 3: use subset_via_parents to get all supporting rows according
        # to existing needs (reverse topological order: parents first)
        upstream_tables = list(
            reversed(
                compute_upstream_tables(config_reader.get_target_table(),
                                        order)))
        print('Beginning upstream subsetting with these tables: ' +
              str(upstream_tables))
        phase_start = time.time()
        for table in upstream_tables:
            self.subset_via_parents(table, relationships)
        elapsed = time.time() - phase_start
        print('Upstream subsetting completed in {}s'.format(elapsed))

        # phase 4: get all the data for tables in disconnected components
        # (i.e. pass those tables through)
        print(
            "Beginning pass-through of tables disconnected from the main component: "
            + str(disconnected_tables))
        phase_start = time.time()
        for table in disconnected_tables:
            query = 'SELECT * FROM "{}"."{}"'.format(schema_name(table),
                                                     table_name(table))
            database_helper.copy_rows(self.__source_conn,
                                      self.__destination_conn, query,
                                      table_name(table), schema_name(table))
        elapsed = time.time() - phase_start
        print('Disconnected tables completed in {}s'.format(elapsed))

        # phase 5: clean out the temp schema
        database_helper.run_query(
            'DROP SCHEMA IF EXISTS {} CASCADE;'.format(self.temp_schema),
            self.__destination_conn)