Example #1
0
    def __init__(self, table: "Table", quals: Optional[Quals],
                 columns: Sequence[str]) -> None:
        """
        Plan a query against a table: resolve the table's objects, filter them
        down using the qualifiers, estimate the result size and pre-group
        fragments for querying. Each planning stage is timed via the tracer.

        :param table: Table being queried
        :param quals: Qualifiers in CNF form (or None for no filtering)
        :param columns: Columns requested by the query
        """
        self.table = table
        self.quals = quals
        self.columns = columns
        # Records the duration of every planning stage below.
        self.tracer = Tracer()

        self.object_manager = table.repository.objects

        self.required_objects = table.objects
        self.tracer.log("resolve_objects")
        # Discard fragments that can't contain rows matching the qualifiers.
        self.filtered_objects = self.object_manager.filter_fragments(
            self.required_objects, table, quals)
        # Estimate the number of rows in the filtered objects
        # (insertions minus deletions recorded in each fragment's metadata).
        self.estimated_rows = sum([
            o.rows_inserted - o.rows_deleted
            for o in self.object_manager.get_object_meta(
                self.filtered_objects).values()
        ])
        self.tracer.log("filter_objects")

        # Prepare a list of objects to query

        # Special fast case: single-chunk groups can all be batched together
        # and queried directly without having to copy them to a staging table.
        # We also grab all fragments from multiple-fragment groups and batch them together
        # for future application.
        #
        # Technically, we could do multiple batches of application for these groups
        # (apply first batch to the staging table, extract the result, clean the table,
        # apply next batch etc): in the middle of it we could also talk back to the object
        # manager and release the objects that we don't need so that they can be garbage
        # collected. The tradeoff is that we perform more calls to apply_fragments (hence
        # more roundtrips).
        self.non_singletons, self.singletons = self._extract_singleton_fragments(
        )

        logging.info(
            "Fragment grouping: %d singletons, %d non-singletons",
            len(self.singletons),
            len(self.non_singletons),
        )
        self.tracer.log("group_fragments")

        # Convert the qualifiers into an SQL clause + parameter values,
        # using the table schema to determine each column's PG type.
        self.sql_quals, self.sql_qual_vals = quals_to_sql(
            quals,
            column_types={c.name: c.pg_type
                          for c in self.table.table_schema})

        # Singletons can be queried directly via per-object table names;
        # non-singletons will need to be applied to a staging table later.
        if self.singletons:
            self.singleton_queries = _generate_table_names(
                self.object_manager.object_engine, SPLITGRAPH_META_SCHEMA,
                self.singletons)
        else:
            self.singleton_queries = []
        self.tracer.log("generate_singleton_queries")
Example #2
0
    def get_query_plan(self,
                       quals: Optional[Quals],
                       columns: Sequence[str],
                       use_cache: bool = True) -> QueryPlan:
        """
        Start planning a query (preliminary steps before object downloading,
        like qualifier filtering).

        :param quals: Qualifiers in CNF form
        :param columns: List of columns
        :param use_cache: If True, will fetch the plan from the cache for the same qualifiers and columns.
        :return: QueryPlan
        """
        key = _get_plan_cache_key(quals, columns)

        if not use_cache or key not in self._query_plans:
            # Cache miss (or caching disabled): build a fresh plan and store it.
            plan = QueryPlan(self, quals, columns)
            self._query_plans[key] = plan
            return plan

        cached = self._query_plans[key]
        # The cached plan's tracer still holds the resolve/filter timestamps
        # from when it was first built, which may be far in the past -- swap
        # in a new tracer and replay all the planning stages as instant events.
        cached.tracer = Tracer()
        for stage in ("resolve_objects", "filter_objects", "group_fragments",
                      "generate_singleton_queries"):
            cached.tracer.log(stage)
        return cached
Example #3
0
def test_tracer():
    # Freeze time by patching the module's datetime, advancing it manually
    # before each logged event.
    with patch("splitgraph.core.common.datetime") as mock_datetime:
        mock_datetime.now.return_value = dt(2019, 1, 1)
        tracer = Tracer()

        mock_datetime.now.return_value = dt(2019, 1, 1, 0, 0, 1)
        tracer.log("event_1")

        mock_datetime.now.return_value = dt(2019, 1, 1, 0, 0, 30)
        tracer.log("event_2")

    assert tracer.get_total_time() == 30
    assert tracer.get_durations() == [("event_1", 1.0), ("event_2", 29.0)]
    expected = "event_1: 1.000\nevent_2: 29.000\nTotal: 30.000"
    assert str(tracer) == expected
Example #4
0
class QueryPlan:
    """
    Represents the initial query plan (fragments to query) for given columns and
    qualifiers.
    """
    def __init__(self, table: "Table", quals: Optional[Quals],
                 columns: Sequence[str]) -> None:
        """
        Plan a query against a table: resolve the table's objects, filter them
        down using the qualifiers, estimate the result size and pre-group
        fragments for querying. Each planning stage is timed via the tracer.

        :param table: Table being queried
        :param quals: Qualifiers in CNF form (or None for no filtering)
        :param columns: Columns requested by the query
        """
        self.table = table
        self.quals = quals
        self.columns = columns
        # Records the duration of every planning stage below.
        self.tracer = Tracer()

        self.object_manager = table.repository.objects

        self.required_objects = table.objects
        self.tracer.log("resolve_objects")
        # Discard fragments that can't contain rows matching the qualifiers.
        self.filtered_objects = self.object_manager.filter_fragments(
            self.required_objects, table, quals)
        # Estimate the number of rows in the filtered objects (insertions
        # minus deletions recorded in each fragment's metadata). Use a
        # generator argument to sum() rather than building a throwaway list.
        self.estimated_rows = sum(
            o.rows_inserted - o.rows_deleted
            for o in self.object_manager.get_object_meta(
                self.filtered_objects).values())
        self.tracer.log("filter_objects")

        # Prepare a list of objects to query

        # Special fast case: single-chunk groups can all be batched together
        # and queried directly without having to copy them to a staging table.
        # We also grab all fragments from multiple-fragment groups and batch them together
        # for future application.
        #
        # Technically, we could do multiple batches of application for these groups
        # (apply first batch to the staging table, extract the result, clean the table,
        # apply next batch etc): in the middle of it we could also talk back to the object
        # manager and release the objects that we don't need so that they can be garbage
        # collected. The tradeoff is that we perform more calls to apply_fragments (hence
        # more roundtrips).
        self.non_singletons, self.singletons = self._extract_singleton_fragments(
        )

        logging.info(
            "Fragment grouping: %d singletons, %d non-singletons",
            len(self.singletons),
            len(self.non_singletons),
        )
        self.tracer.log("group_fragments")

        # Convert the qualifiers into an SQL clause + parameter values,
        # using the table schema to determine each column's PG type.
        self.sql_quals, self.sql_qual_vals = quals_to_sql(
            quals,
            column_types={c.name: c.pg_type
                          for c in self.table.table_schema})

        # Singletons can be queried directly via per-object table names;
        # non-singletons will need to be applied to a staging table later.
        if self.singletons:
            self.singleton_queries = _generate_table_names(
                self.object_manager.object_engine, SPLITGRAPH_META_SCHEMA,
                self.singletons)
        else:
            self.singleton_queries = []
        self.tracer.log("generate_singleton_queries")

    def _extract_singleton_fragments(self) -> Tuple[List[str], List[str]]:
        """
        Split the filtered fragments into "singletons" (fragments whose PK
        ranges don't overlap any other fragment, so they can be queried
        directly) and the rest (overlapping fragments that must be applied
        together in order).

        :return: Tuple of (non-singleton object IDs, singleton object IDs).
        """
        # Get fragment boundaries (min-max PKs of every fragment).
        # NOTE(review): schema entries look like positional tuples where t[1]
        # is the column name, t[2] its type and t[3] the is-PK flag -- confirm
        # against the TableColumn definition.
        table_pk = [(t[1], t[2]) for t in self.table.table_schema if t[3]]
        if not table_pk:
            # No declared PK: treat the whole row as the key.
            table_pk = [(t[1], t[2]) for t in self.table.table_schema]
        object_pks = self.object_manager.get_min_max_pks(
            self.filtered_objects, table_pk)
        # Group fragments into non-overlapping groups: those can be applied independently of each other.
        object_groups = get_chunk_groups([
            (object_id, min_max[0], min_max[1])
            for object_id, min_max in zip(self.filtered_objects, object_pks)
        ])
        singletons: List[str] = []
        non_singletons: List[str] = []
        for group in object_groups:
            if len(group) == 1:
                # A group of one: this fragment overlaps nothing else.
                singletons.append(group[0][0])
            else:
                non_singletons.extend(object_id for object_id, _, _ in group)
        return non_singletons, singletons