예제 #1
0
def test_index_as_flat_series_partitions_as_index():
    """
    Flattening with ``partitions_as_index=True`` must put the partition
    labels on the index (named "partition") and the indexed values into the
    data (named "col"), both in the plain and in the ``compact=True`` form.
    """
    secondary_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 2: ["part_1"]},
        dtype=pa.int64(),
    )

    # Plain form: one row per (value, partition) pair.
    flat_expected = pd.Series(
        [1, 1, 2],
        index=pd.Index(["part_1", "part_2", "part_1"], name="partition"),
        name="col",
    )
    flat_actual = secondary_index.as_flat_series(partitions_as_index=True)
    assert_series_equal(flat_actual, flat_expected)

    # Compact form: one row per partition, holding the list of its values.
    compact_labels = pd.Index(["part_1", "part_2"], name="partition")
    compact_expected = pd.Series([[1, 2], [1]], index=compact_labels, name="col")
    compact_actual = secondary_index.as_flat_series(
        compact=True, partitions_as_index=True
    )
    assert_series_equal(compact_actual, compact_expected)
예제 #2
0
def test_index_as_flat_series_highly_degenerated_asym():
    """
    Flatten an index whose values map to partition lists of different
    lengths, ensuring that the generation of the series is not bound by
    column counts or NaNs in the matrix.
    """
    dim = 4
    partitions_by_value = {
        value: ["part_{}".format(n) for n in range(dim)] for value in range(dim)
    }
    # Make the mapping ragged: value 0 keeps a single partition, value 2 keeps
    # two, one of which ("part_5") appears under no other value.
    partitions_by_value.update({0: ["part_1"], 2: ["part_2", "part_5"]})
    secondary_index = ExplicitSecondaryIndex(
        column="col", index_dct=partitions_by_value, dtype=pa.int64()
    )

    # Expected rows as (indexed value, partition label) pairs, in output order.
    expected_rows = [
        (0, "part_1"),
        (1, "part_0"),
        (1, "part_1"),
        (1, "part_2"),
        (1, "part_3"),
        (2, "part_2"),
        (2, "part_5"),
        (3, "part_0"),
        (3, "part_1"),
        (3, "part_2"),
        (3, "part_3"),
    ]
    values = [value for value, _ in expected_rows]
    labels = [label for _, label in expected_rows]

    # Default orientation: values on the index, partitions as data.
    expected_by_value = pd.Series(
        labels, index=pd.Index(values, name="col", dtype=int), name="partition"
    )
    assert_series_equal(secondary_index.as_flat_series(), expected_by_value)

    # Inverted orientation: partitions on the index, values as data.
    expected_by_partition = pd.Series(
        values, index=pd.Index(labels, name="partition"), name="col"
    )
    assert_series_equal(
        secondary_index.as_flat_series(partitions_as_index=True),
        expected_by_partition,
    )
예제 #3
0
def test_index_as_flat_series_date(dtype, date_as_object):
    """
    Flatten a ``date32`` index and compare it with a series whose index is
    built from the same dates using the given ``dtype``.

    ``dtype`` and ``date_as_object`` are supplied from outside this block
    (presumably by a pytest parametrization that is not visible here).
    """
    first_day = datetime.date(2017, 1, 2)
    second_day = datetime.date(2018, 2, 3)

    secondary_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={first_day: ["part_1", "part_2"], second_day: ["part_1"]},
        dtype=pa.date32(),
    )

    # Sort by index so the comparison uses a deterministic row order.
    actual = secondary_index.as_flat_series(
        date_as_object=date_as_object
    ).sort_index()

    expected_index = pd.Index(
        [first_day, first_day, second_day], dtype=dtype, name="col"
    )
    expected = pd.Series(
        ["part_1", "part_2", "part_1"], index=expected_index, name="partition"
    )
    assert_series_equal(actual, expected)
예제 #4
0
def test_index_as_flat_series_single_value():
    """
    Flatten an index that holds a single value spread over two partitions,
    in both the plain and the ``compact=True`` form.
    """
    secondary_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"]},
        dtype=pa.int64(),
    )

    # Plain form: the value is repeated once per partition.
    flat_expected = pd.Series(
        ["part_1", "part_2"],
        index=pd.Index([1, 1], name="col"),
        name="partition",
    )
    assert_series_equal(secondary_index.as_flat_series(), flat_expected)

    # Compact form: a single row whose cell is the list of partitions.
    compact_expected = pd.Series(
        [["part_1", "part_2"]],
        index=pd.Index([1], name="col"),
        name="partition",
    )
    assert_series_equal(
        secondary_index.as_flat_series(compact=True), compact_expected
    )
예제 #5
0
class Index(AsvBenchmarkConfig):
    """
    Benchmarks for ``ExplicitSecondaryIndex``: loading from a store, querying
    a single value and flattening into a series.

    Every benchmark runs for each combination of ``params``; the three entries
    correspond, in order, to ``param_names``.
    """

    params = (
        [10**1, 10**3],  # values
        [10**1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        """
        Build an in-memory index, persist it to a temporary store and prepare
        unloaded/loaded handles for the timed methods.

        Parameters
        ----------
        number_values
            Number of distinct keys in the index.
        number_partitions
            Number of partition labels listed under every key.
        dtype
            Tuple ``(py_type, arrow_type)``: the Python constructor used to
            build the keys and the matching pyarrow type of the index.
        """
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(0, number_values)
        }
        # Key from the middle of the range, converted to the key type up
        # front. Floor division keeps it integral (true division would yield a
        # float such as 5.0, whose type matches neither the int nor the str
        # keys), so the timed lookup uses a key that exists in the index.
        self.query_value = py_type(number_values // 2)
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name, index_dct=index_dct, dtype=arrow_type
        )
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        # Handle that only knows the storage key; used to time ``load``.
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        """Remove the temporary directory created in ``setup``."""
        shutil.rmtree(self.tmp_dir)

    # NOTE: the timing methods name their third parameter ``arrow_type``; it
    # is the third entry of ``params``, i.e. the same ``(py_type, arrow_type)``
    # tuple that ``setup`` calls ``dtype``. The name is kept so the signatures
    # stay unchanged.

    def time_load_index(self, number_values, number_partitions, arrow_type):
        """Time loading the index from the store via its storage key."""
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, arrow_type):
        """Time a lookup of the pre-computed mid-range key."""
        self.ktk_index.query(self.query_value)

    def time_as_series(self, number_values, number_partitions, arrow_type):
        """Time flattening the index into a series."""
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(
        self, number_values, number_partitions, arrow_type
    ):
        """Time flattening the index with partitions as the series index."""
        self.ktk_index.as_flat_series(partitions_as_index=True)
예제 #6
0
def test_index_as_flat_series_highly_degenerated_sym():
    """
    Flatten an index in which each of the ``dim`` values maps to the same
    ``dim`` partitions; the result must list every value once per partition.
    """
    dim = 4
    partitions_by_value = {
        value: ["part_{}".format(n) for n in range(dim)] for value in range(dim)
    }
    secondary_index = ExplicitSecondaryIndex(
        column="col", index_dct=partitions_by_value, dtype=pa.int64()
    )

    # Each value repeated ``dim`` times: 0, 0, 0, 0, 1, 1, ...
    repeated_values = np.array([value for value in range(dim) for _ in range(dim)])
    # The full partition list cycled once per value.
    cycled_labels = [
        "part_{}".format(n) for _ in range(dim) for n in range(dim)
    ]
    expected = pd.Series(
        cycled_labels,
        index=pd.Index(repeated_values, name="col"),
        name="partition",
    )

    assert_series_equal(secondary_index.as_flat_series(), expected)