class IndexBase(AsvBenchmarkConfig):
    """Common fixture for benchmarks operating on an ``ExplicitSecondaryIndex``.

    ``setup`` creates an in-memory index, writes it to a store rooted in a
    fresh temporary directory and keeps three handles around: the original
    index, an index object built only from the storage key, and the result
    of calling ``load`` on the latter. ``teardown`` deletes the directory.
    """

    def setup(self, number_values, number_partitions, dtype):
        """Create the index objects and the temporary store.

        Parameters
        ----------
        number_values
            Number of distinct keys placed in the index.
        number_partitions
            Passed to ``generate_partition_values``; half of it (floor
            division) is the number of partition labels drawn per key.
        dtype
            Pair ``(python_type, arrow_type)``; the first element converts
            the integer counter into an index key, the second is handed to
            ``ExplicitSecondaryIndex`` as its ``dtype``.
        """
        key_type, pa_type = dtype
        self.partition_values = generate_partition_values(number_partitions)

        # Each key gets its own random draw from the partition values.
        draw_size = number_partitions // 2
        index_dct = {}
        for raw_value in range(number_values):
            key = key_type(raw_value)
            index_dct[key] = list(
                np.random.choice(self.partition_values, draw_size)
            )

        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name,
            index_dct=index_dct,
            dtype=pa_type,
        )

        # The store lives in a throw-away directory removed by ``teardown``.
        self.tmp_dir = tempfile.mkdtemp()
        store_url = "hfs://{}".format(self.tmp_dir)
        self.store = storefact.get_store_from_url(store_url)

        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        # Index object that only knows its storage key, plus the object
        # returned by loading it from the store.
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name,
            index_storage_key=self.storage_key,
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        """Delete the temporary directory created in ``setup``."""
        shutil.rmtree(self.tmp_dir)
class Index(AsvBenchmarkConfig):
    """Benchmarks for loading, querying and flattening a secondary index.

    The benchmark is parametrized over the number of index values, the
    number of partitions per value and the key type (see ``params`` and
    ``param_names``).

    NOTE(review): the ``time_*`` methods name their third parameter
    ``arrow_type`` although ``param_names`` calls it ``dtype``; presumably
    the benchmark runner passes parameters positionally, so it receives the
    ``(python_type, arrow_type)`` pair. The name is kept for compatibility
    and the value is not used by the timed methods.
    """

    params = (
        [10 * 1, 10**3],  # values
        [10 * 1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        """Build the index, store it in a temporary store and reload it.

        Parameters
        ----------
        number_values
            Number of distinct keys in the index.
        number_partitions
            Number of partition labels (``"0"`` .. ``"n-1"``) attached to
            every key.
        dtype
            Pair ``(python_type, arrow_type)``; the first element converts
            the integer counter into an index key, the second is passed to
            ``ExplicitSecondaryIndex`` as its ``dtype``.
        """
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(number_values)
        }

        # Key looked up by ``time_query_value``. It is built with floor
        # division and converted with ``py_type`` so that it has the same
        # type as the index keys and is actually contained in the index
        # (true division would yield a float, which never equals a ``str``
        # key). Computing it here keeps the conversion out of the timing.
        self.query_value = py_type(number_values // 2)

        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name,
            index_dct=index_dct,
            dtype=arrow_type,
        )

        # The store lives in a throw-away directory removed by ``teardown``.
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url(
            "hfs://{}".format(self.tmp_dir)
        )
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        # Index object that only knows its storage key, plus the object
        # returned by loading it from the store.
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name,
            index_storage_key=self.storage_key,
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        """Delete the temporary directory created in ``setup``."""
        shutil.rmtree(self.tmp_dir)

    def time_load_index(self, number_values, number_partitions, arrow_type):
        """Time ``load`` on the index that was built from its storage key."""
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, arrow_type):
        """Time ``query`` for the middle key prepared in ``setup``."""
        self.ktk_index.query(self.query_value)

    def time_as_series(self, number_values, number_partitions, arrow_type):
        """Time ``as_flat_series`` with default arguments."""
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(
        self, number_values, number_partitions, arrow_type
    ):
        """Time ``as_flat_series`` with ``partitions_as_index=True``."""
        self.ktk_index.as_flat_series(partitions_as_index=True)