def test_index_as_flat_series_partitions_as_index():
    """Flat-series export with the partition labels as the series index.

    This test expects ``partitions_as_index=True`` to yield a series named
    ``"col"`` whose index is named ``"partition"``. The plain form has one
    row per (partition, value) pair; with ``compact=True`` there is one row
    per partition whose cell is the list of that partition's values.
    """
    value_to_partitions = {1: ["part_1", "part_2"], 2: ["part_1"]}
    secondary_index = ExplicitSecondaryIndex(
        column="col", index_dct=value_to_partitions, dtype=pa.int64()
    )

    # Plain form: "part_1" is listed under both values, so it appears twice.
    expected_flat = pd.Series(
        [1, 1, 2],
        index=pd.Index(["part_1", "part_2", "part_1"], name="partition"),
        name="col",
    )
    assert_series_equal(
        secondary_index.as_flat_series(partitions_as_index=True),
        expected_flat,
    )

    # Compact form: each partition appears once with all of its values.
    expected_compact = pd.Series(
        [[1, 2], [1]],
        index=pd.Index(["part_1", "part_2"], name="partition"),
        name="col",
    )
    assert_series_equal(
        secondary_index.as_flat_series(compact=True, partitions_as_index=True),
        expected_compact,
    )
def test_index_as_flat_series_highly_degenerated_asym():
    """Flat-series export when values map to differing numbers of partitions.

    Values 1 and 3 map to all four partitions, value 0 maps to a single one
    and value 2 maps to two, one of which ("part_5") is listed under no other
    value. This test expects one row per (value, partition) pair in both
    orientations.

    Per the original author's note, the point is that generating the series
    must not be bound by column counts or NaNs in "the matrix" -- presumably
    an intermediate structure inside ``as_flat_series``; not visible here.
    """
    dim = 4
    value_to_partitions = {
        value: ["part_{}".format(i) for i in range(dim)]
        for value in range(dim)
    }
    value_to_partitions[0] = ["part_1"]
    value_to_partitions[2] = ["part_2", "part_5"]
    secondary_index = ExplicitSecondaryIndex(
        column="col", index_dct=value_to_partitions, dtype=pa.int64()
    )

    # One (col value, partition label) pair per expected output row.
    expected_rows = [
        (0, "part_1"),
        (1, "part_0"),
        (1, "part_1"),
        (1, "part_2"),
        (1, "part_3"),
        (2, "part_2"),
        (2, "part_5"),
        (3, "part_0"),
        (3, "part_1"),
        (3, "part_2"),
        (3, "part_3"),
    ]
    col_values = [value for value, _ in expected_rows]
    partition_labels = [label for _, label in expected_rows]

    # Default orientation: values as index, partition labels as data.
    # NOTE(review): ``dtype=int`` is set explicitly on this index --
    # presumably to pin an integer index dtype; confirm it is still required.
    expected_by_value = pd.Series(
        partition_labels,
        index=pd.Index(col_values, name="col", dtype=int),
        name="partition",
    )
    assert_series_equal(secondary_index.as_flat_series(), expected_by_value)

    # Inverted orientation: partition labels as index, values as data.
    expected_by_partition = pd.Series(
        col_values,
        index=pd.Index(partition_labels, name="partition"),
        name="col",
    )
    assert_series_equal(
        secondary_index.as_flat_series(partitions_as_index=True),
        expected_by_partition,
    )
def test_index_as_flat_series_date(dtype, date_as_object):
    """Flat-series export of a ``date32`` index, compared after sorting.

    ``date_as_object`` is forwarded to ``as_flat_series``; ``dtype`` is the
    dtype given to the expected ``"col"`` index.

    NOTE(review): no parametrization or fixture for ``dtype`` and
    ``date_as_object`` is visible next to this function. Presumably they come
    from a ``pytest.mark.parametrize`` decorator or from fixtures defined
    elsewhere -- confirm the test is collected with matching value pairs.
    """
    first_date = datetime.date(2017, 1, 2)
    second_date = datetime.date(2018, 2, 3)
    secondary_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={first_date: ["part_1", "part_2"], second_date: ["part_1"]},
        dtype=pa.date32(),
    )

    # Sorted by index before comparing, so the rows are ordered by date.
    actual = secondary_index.as_flat_series(
        date_as_object=date_as_object
    ).sort_index()

    expected_index = pd.Index(
        [first_date, first_date, second_date], dtype=dtype, name="col"
    )
    expected = pd.Series(
        ["part_1", "part_2", "part_1"], index=expected_index, name="partition"
    )
    assert_series_equal(actual, expected)
def test_index_as_flat_series_single_value():
    """Flat-series export of an index that holds exactly one value.

    This test expects the plain form to repeat the value once per partition,
    and the ``compact=True`` form to produce a single row whose cell is the
    list of partitions.
    """
    secondary_index = ExplicitSecondaryIndex(
        column="col", index_dct={1: ["part_1", "part_2"]}, dtype=pa.int64()
    )

    # Plain form: one row per partition, all under the same index value.
    expected_flat = pd.Series(
        ["part_1", "part_2"],
        index=pd.Index([1, 1], name="col"),
        name="partition",
    )
    assert_series_equal(secondary_index.as_flat_series(), expected_flat)

    # Compact form: a single row holding the whole partition list.
    expected_compact = pd.Series(
        [["part_1", "part_2"]],
        index=pd.Index([1], name="col"),
        name="partition",
    )
    assert_series_equal(
        secondary_index.as_flat_series(compact=True), expected_compact
    )
class Index(AsvBenchmarkConfig):
    """ASV benchmarks for ``ExplicitSecondaryIndex``.

    Every benchmark runs once per combination of ``params``:

    * ``number_values`` -- how many distinct keys the index holds,
    * ``number_partitions`` -- how many partition labels each key maps to,
    * ``dtype`` -- a ``(python_type, arrow_type)`` pair; the Python type is
      used to build the keys, the Arrow type is passed to the index.

    ``setup`` builds an in-memory index, stores it in a store opened from an
    ``hfs://`` URL pointing at a fresh temporary directory, and creates a
    second index object from the returned storage key only. ``teardown``
    removes the temporary directory.
    """

    params = (
        [10**1, 10**3],  # values
        [10**1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        """Build, store and prepare the indices used by the benchmarks.

        Parameters
        ----------
        number_values
            Number of distinct keys in the index.
        number_partitions
            Number of partition labels listed under every key.
        dtype
            ``(python_type, arrow_type)`` pair taken from ``params``.
        """
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(number_values)
        }
        # Key looked up by ``time_query_value``: the middle of the value
        # range, converted exactly like the keys above so that it is one of
        # the stored keys for every dtype. Computed here so the conversion
        # stays out of the timed call.
        self.query_value = py_type(number_values // 2)

        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name, index_dct=index_dct, dtype=arrow_type
        )

        # NOTE(review): if any statement below raises, this directory is only
        # removed if the benchmark runner still calls ``teardown`` after a
        # failed ``setup`` -- confirm, or guard the remaining steps.
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        # Index object that knows only its storage key, plus the result of
        # loading it. The loaded one is not used by the benchmarks below.
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        """Remove the temporary directory created by ``setup``."""
        shutil.rmtree(self.tmp_dir)

    # The benchmark methods take the same three parameters as ``setup``.
    # Their third parameter is named ``arrow_type``, but ``params`` supplies
    # the full ``(python_type, arrow_type)`` pair in that position. The name
    # is kept so the signatures stay unchanged.

    def time_load_index(self, number_values, number_partitions, arrow_type):
        """Time ``load`` on the index built from a storage key only."""
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, arrow_type):
        """Time ``query`` for one stored key of the in-memory index."""
        self.ktk_index.query(self.query_value)

    def time_as_series(self, number_values, number_partitions, arrow_type):
        """Time ``as_flat_series`` with its default arguments."""
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(
        self, number_values, number_partitions, arrow_type
    ):
        """Time ``as_flat_series`` with ``partitions_as_index=True``."""
        self.ktk_index.as_flat_series(partitions_as_index=True)
def test_index_as_flat_series_highly_degenerated_sym():
    """Flat-series export when every value maps to the same partitions.

    Each of the ``dim`` values maps to the same ``dim`` partition labels, so
    this test expects ``dim * dim`` rows: the labels cycling once per value,
    indexed by each value repeated ``dim`` times.
    """
    dim = 4
    partition_labels = ["part_{}".format(i) for i in range(dim)]
    secondary_index = ExplicitSecondaryIndex(
        column="col",
        # Copy per key so that no two values share one list object.
        index_dct={value: list(partition_labels) for value in range(dim)},
        dtype=pa.int64(),
    )

    expected = pd.Series(
        partition_labels * dim,
        # 0, 0, 0, 0, 1, 1, 1, 1, ...: each value once per partition label.
        index=pd.Index(np.repeat(np.arange(dim), dim), name="col"),
        name="partition",
    )
    assert_series_equal(secondary_index.as_flat_series(), expected)