import pickle

import pandas as pd
import pyarrow as pa
import simplejson

from kartothek.core.common_metadata import make_meta, store_schema_metadata
from kartothek.core.dataset import DatasetMetadata
from kartothek.core.index import ExplicitSecondaryIndex


def test_index_store_roundtrip_implicit_key(store, col):
    index1 = ExplicitSecondaryIndex(
        column=col,
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        dtype=pa.int64(),
    )
    key1 = index1.store(store, "dataset_uuid")
    index1.index_storage_key = key1

    index2 = ExplicitSecondaryIndex(column=col, index_storage_key=key1).load(store)
    assert index1 == index2

    key2 = index2.store(store, "dataset_uuid")
    index3 = ExplicitSecondaryIndex(column=col, index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3

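# The tests in this module rely on `store` and `col` pytest fixtures defined
# elsewhere (e.g. in a conftest.py). A minimal sketch, assuming a temporary
# filesystem store via storefact, as the benchmarks below use; the real
# fixture definitions may differ.
import pytest
import storefact


@pytest.fixture
def store(tmpdir):
    return storefact.get_store_from_url("hfs://{}".format(tmpdir))


@pytest.fixture
def col():
    return "col"
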
class IndexBase(AsvBenchmarkConfig):
    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        self.partition_values = generate_partition_values(number_partitions)
        index_dct = {
            py_type(val): list(
                np.random.choice(self.partition_values, number_partitions // 2)
            )
            for val in range(number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name, index_dct=index_dct, dtype=arrow_type
        )
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)

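# The benchmark class above additionally assumes these module-level imports,
# an `AsvBenchmarkConfig` base class from the benchmark suite's shared config,
# and a `generate_partition_values` helper, none of which are shown in this
# section. The helper below is a minimal sketch inferred from its usage
# (np.random.choice needs a sequence of partition labels to sample from); the
# real implementation may differ.
import shutil
import tempfile

import numpy as np
import storefact


def generate_partition_values(number_partitions):
    # One synthetic partition label per partition: "part_0", "part_1", ...
    return ["part_{}".format(i) for i in range(number_partitions)]
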
def test_index_store_roundtrip_explicit_key(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

    key2 = index2.store(store, "dataset_uuid")
    index3 = ExplicitSecondaryIndex(column="col", index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3

def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data": "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put(
        "uuid.by-dataset-metadata.json", simplejson.dumps(meta_dct).encode("utf-8")
    )
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    # The explicit `index_storage_key` takes precedence over the uuid argument.
    index2.store(store, "dataset_uuid")

    # The stored index file is not referenced in the dataset metadata, so a
    # plain load must not pick up `location_id`.
    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(
        store=store, uuid="uuid", load_all_indices=True
    )
    assert "location_id" in dmd.indices

def test_serialization_no_indices(store):
    index = ExplicitSecondaryIndex(column="col", index_dct={1: ["part_1"]})
    storage_key = index.store(store=store, dataset_uuid="uuid")

    # Create index without `index_dct`
    index = ExplicitSecondaryIndex(column="col", index_storage_key=storage_key)
    index2 = pickle.loads(pickle.dumps(index))

    assert index == index2

def test_index_store_roundtrip_ts(store, dtype, timestamps):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct=dict(zip(timestamps, [["part_1", "part_2"], ["part_3"]])),
        index_storage_key=storage_key,
        dtype=dtype,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

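# `dtype` and `timestamps` above come from pytest fixtures (or a parametrize
# decorator) not shown in this section. A minimal sketch matching the test
# body, which expects two timestamp keys, one per partition list; the exact
# resolutions and values are assumptions.
@pytest.fixture(params=["us", "ns"])
def dtype(request):
    return pa.timestamp(request.param)


@pytest.fixture
def timestamps():
    return [pd.Timestamp("2018-01-01"), pd.Timestamp("2018-01-02")]
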
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col", index_dct={}, dtype=dtype, index_storage_key=storage_key
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3

def test_index_large(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={i: ["part_1"] for i in range(100_000)},
        index_storage_key=storage_key,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3

def test_pickle_without_load(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col", index_dct={1: ["part_1"]}, index_storage_key=storage_key
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1)
    assert index2 != index1

    index3 = pickle.loads(pickle.dumps(index2))
    assert index3 == index2

    index4 = index3.load(store)
    assert index4 == index1
    assert index4 != index2

def test_index_store_roundtrip_ts_hardcoded(store):
    # Same roundtrip as the parametrized variant above, but with fixed
    # timestamp values. Renamed so it does not shadow that test.
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            pd.Timestamp("2017-01-01"): ["part_1", "part_2"],
            pd.Timestamp("2017-01-02"): ["part_3"],
        },
        index_storage_key=storage_key,
        dtype=pa.timestamp("ns"),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

class Index(AsvBenchmarkConfig):
    params = (
        [10**1, 10**3],  # number_values
        [10**1, 10**3],  # number_partitions
        [(int, pa.int64()), (str, pa.string())],  # dtype
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name, index_dct=index_dct, dtype=arrow_type
        )
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)

    def time_load_index(self, number_values, number_partitions, dtype):
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, dtype):
        self.ktk_index.query(number_values // 2)

    def time_as_series(self, number_values, number_partitions, dtype):
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(self, number_values, number_partitions, dtype):
        self.ktk_index.as_flat_series(partitions_as_index=True)

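# Quick illustration of the index operations benchmarked above, on a tiny
# hand-built index. The values and partition labels are illustrative only, and
# the return shapes in the comments are a sketch of the observed behavior.
_idx = ExplicitSecondaryIndex(
    column="column",
    index_dct={1: ["part_0"], 2: ["part_0", "part_1"]},
    dtype=pa.int64(),
)
_idx.query(2)  # partition labels for value 2: ["part_0", "part_1"]
_idx.as_flat_series()  # pd.Series with one row per (value, partition label) pair
_idx.as_flat_series(partitions_as_index=True)  # partition labels as the series index
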
def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # Backwards compatibility: plain dicts are wrapped into an
        # ExplicitSecondaryIndex using the legacy external index storage key.
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key,
            )
        elif isinstance(index, PartitionIndex):
            # Partition indices are implicit in the partition labels and file
            # paths; they are not persisted as separate index files.
            continue
        output_filenames[column] = index.store(store=store, dataset_uuid=dataset_uuid)
    return output_filenames
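

# Usage sketch for `persist_indices`: both the legacy plain-dict form and
# ExplicitSecondaryIndex objects are accepted. The store URL and dataset uuid
# below are illustrative only.
if __name__ == "__main__":
    import tempfile

    example_store = storefact.get_store_from_url(
        "hfs://{}".format(tempfile.mkdtemp())
    )
    output = persist_indices(
        store=example_store,
        dataset_uuid="example_uuid",
        indices={
            "product_id": {1: ["part_1"], 2: ["part_1", "part_2"]},  # legacy dict
            "location_id": ExplicitSecondaryIndex(
                column="location_id", index_dct={1: ["part_1"]}, dtype=pa.int64()
            ),
        },
    )
    # `output` maps each column to the storage key of its persisted index file.
    print(output)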