Пример #1
0
def test_index_normalize_during_query():
    """Check that ``query`` matches values given in a different Python type.

    The index is declared with an int64 dtype and integer keys; the string
    ``"2"`` and the float ``1.0`` are expected to hit the same entries as the
    integers ``2`` and ``1``.
    """
    partitions_by_value = {1: ["a", "b", "c"], 2: ["d"]}
    index = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct=partitions_by_value
    )

    # (value passed to query, partitions expected back), checked in order.
    expectations = [
        (1, ["a", "b", "c"]),
        (2, ["d"]),
        ("2", ["d"]),
        (1.0, ["a", "b", "c"]),
    ]
    for value, expected in expectations:
        assert index.query(value) == expected
Пример #2
0
class Index(AsvBenchmarkConfig):
    """ASV benchmarks for ``ExplicitSecondaryIndex``.

    Each benchmark runs once per combination of ``params``: the number of
    distinct indexed values, the number of partitions listed per value, and a
    ``(python type, arrow type)`` pair describing the key type of the index.
    """

    params = (
        [10**1, 10**3],  # values
        [10**1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        """Build the in-memory index, store it, and prepare loaded/unloaded copies.

        Parameters
        ----------
        number_values
            Number of distinct keys placed in the index.
        number_partitions
            Number of partition labels attached to every key.
        dtype
            Tuple ``(py_type, arrow_type)``: ``py_type`` converts the integer
            counter into the key, ``arrow_type`` is passed to the index as
            its dtype.
        """
        py_type, arrow_type = dtype
        # Every key maps to its own list of partition labels "0" .. "n-1".
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name,
            index_dct=index_dct,
            dtype=arrow_type,
        )

        # The store is addressed through an hfs:// URL pointing at a fresh
        # temporary directory, which teardown() removes again.
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        # An index that only knows its storage key; used to time load().
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        """Delete the temporary directory created in ``setup``."""
        shutil.rmtree(self.tmp_dir)

    # NOTE: in the ``time_*`` methods the third parameter is named
    # ``arrow_type`` but receives the full ``(py_type, arrow_type)`` tuple from
    # ``params``. The names are kept unchanged for interface stability.

    def time_load_index(self, number_values, number_partitions, arrow_type):
        """Time loading the stored index from the store."""
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, arrow_type):
        """Time a query for a key from the middle of the indexed range."""
        # Floor division keeps the query value an int that is one of the
        # counters used to build the keys in setup(). True division would pass
        # a float (e.g. 5.0); for the str-typed index that would presumably be
        # normalized to "5.0" and miss every key -- NOTE(review): confirm
        # against the index's value normalization.
        self.ktk_index.query(number_values // 2)

    def time_as_series(self, number_values, number_partitions, arrow_type):
        """Time ``as_flat_series`` with its default arguments."""
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(
        self, number_values, number_partitions, arrow_type
    ):
        """Time ``as_flat_series`` with ``partitions_as_index=True``."""
        self.ktk_index.as_flat_series(partitions_as_index=True)