Example #1
    def setup(self, column, filter_size, array_size):
        # asv skips a parameter combination when setup raises NotImplementedError
        if column == "null":
            raise NotImplementedError()
        # Shuffle the test frame and keep a single column as a flat array
        ser = (
            get_dataframe_not_nested(array_size)
            .sample(frac=1.0)
            .reset_index(drop=True)[column]
        )
        self.arr = ser.values
        # Draw a random subset of distinct values to filter against
        self.value = ser.sample(n=filter_size).unique()
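A setup like this is typically paired with a timed method that benchmarks membership filtering of the column array against the drawn values. A minimal sketch, assuming a time_isin method name and plain numpy.isin (neither appears in the original example):

import numpy as np

def time_isin(self, column, filter_size, array_size):
    # Hypothetical timed method: filter self.arr by membership in self.value
    np.isin(self.arr, self.value)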
Example #2
def dataset_to_copy(store):
    # SRC_DS_UUID is a module-level constant defined elsewhere in the test module
    from kartothek.io.eager import store_dataframes_as_dataset
    from kartothek.serialization.testing import get_dataframe_not_nested

    df = get_dataframe_not_nested(10)
    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=SRC_DS_UUID,
        store=store,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
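A minimal sketch of how a helper like this might be driven, assuming an in-memory store from storefact (the store URL and the SRC_DS_UUID value are illustrative, not from the original):

from storefact import get_store_from_url

SRC_DS_UUID = "source_dataset"  # hypothetical value for the module-level constant
store = get_store_from_url("memory://")
dataset_to_copy(store)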
Example #3
def test_store_input_types(store_input_types, bound_store_dataframes):
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    assert bound_store_dataframes(
        [df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
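In the kartothek test suite, bound_store_dataframes appears to be a fixture that binds the write function of one backend (for example the eager or iterator implementation), while store_input_types parametrizes the accepted store arguments (store instance, factory, or URL), so this single test body is exercised against every combination.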
Example #4
    def setup(self, predicate):
        # Predicates are given in disjunctive normal form: the outer list is
        # OR-ed, each inner list is an AND-ed group of (column, op, value) tuples.
        if predicate == "conjunctions":
            self.predicate = [
                [
                    ("int16", ">", 123),
                    ("int32", "<", 321),
                    ("bool", "==", True),
                    ("bool", "==", True),
                ]
            ]
        elif predicate == "disjunctions":
            self.predicate = [
                [("int16", ">", 123)],
                [("int32", "<", 321)],
                [("int32", "<", 321)],
                [("int32", "<", 321)],
            ]
        self.df = get_dataframe_not_nested(10**5)
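A benchmark built on this setup would presumably apply the prepared predicate to the frame. A minimal sketch, assuming the filter_df_from_predicates helper exported by kartothek.serialization and a time_filter method name (neither call is shown in the original):

from kartothek.serialization import filter_df_from_predicates

def time_filter(self, predicate):
    # Hypothetical timed method: evaluate the DNF predicate against self.df
    filter_df_from_predicates(self.df, self.predicate)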
Example #5
def test_store_input_types(store_input_types, bound_load_dataframes):
    from kartothek.io.eager import store_dataframes_as_dataset
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )

    # Use predicates to trigger partition pruning with indices
    predicates = [[
        (df.columns[0], "==", df.loc[0, df.columns[0]]),
        (df.columns[1], "==", df.loc[0, df.columns[1]]),
    ]]

    result = bound_load_dataframes(
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        predicates=predicates,
        dates_as_object=True,
    )

    if isinstance(result, list):
        result = result[0]

    if isinstance(result, MetaPartition):
        result = result.data

    if isinstance(result, dict):
        result = result[SINGLE_TABLE]

    pdt.assert_frame_equal(result, df.head(1), check_dtype=False)
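The chain of isinstance checks normalizes the different return shapes of the bound reader, a list of DataFrames, a list of MetaPartition objects, or a dict keyed by table name, so the final assertion works regardless of which backend bound_load_dataframes wraps.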
Example #6
    def setup(self, num_rows, chunk_size):
        self.df = get_dataframe_not_nested(num_rows)
        self.serialiser = ParquetSerializer(chunk_size=chunk_size)
        self.store = get_store_from_url("memory://")
        # Persist the frame once so the timed runs can repeatedly read it back
        self.key = self.serialiser.store(self.store, "key_prefix", self.df)
        self.predicates = [[("int16", "==", 123)]]
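The timed part of such a benchmark would presumably read the key back with predicate pushdown. A minimal sketch, assuming a time_restore method name (restore_dataframe is the serializer's read API; the method name is not from the original):

def time_restore(self, num_rows, chunk_size):
    # Hypothetical timed method: read the stored Parquet back, pushing the
    # predicate down so non-matching row groups can be skipped
    self.serialiser.restore_dataframe(
        self.store, self.key, predicates=self.predicates
    )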
Example #7
    def setup(self, column):
        # asv skips this parameter combination when setup raises NotImplementedError
        if column == "null":
            raise NotImplementedError()
        self.arr = (
            get_dataframe_not_nested(10**5)
            .sample(frac=1.0)
            .reset_index(drop=True)[column]
            .values
        )
        self.value = self.arr[12345]
Example #8
def dataframe_not_nested():
    return get_dataframe_not_nested(10)
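Judging from the columns referenced across these examples, the frame returned by get_dataframe_not_nested carries one column per primitive dtype, including at least bool, int16, int32, datetime64, date, and a column named null, which is why the Hive example below renames the date and null columns (both are reserved words in Hive QL).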
    print(f"Hive query: {hive_query}")
    cursor.execute(hive_query)

    # Get column names from query substring
    selected_columns = [
        l.strip().split(" ")[0]
        for l in selected_columns_and_dtypes.splitlines()
    ]
    # Read hive table into pandas
    hive_df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
    hive_df.columns = selected_columns
    # Pyarrow stores the timestamp as microseconds since the epoch; scale to
    # nanoseconds and convert back to datetime64
    hive_df["datetime64"] = pd.to_datetime(hive_df.loc[:, "datetime64"] * 1000,
                                           unit="ns")
    # Hive returns the date column as a string; parse it back to datetime.date
    hive_df["date_"] = pd.to_datetime(
        hive_df.loc[:, "date_"], format="%Y-%m-%d").apply(lambda x: x.date())

    # Ignore dtype for numeric comparisons (e.g. int32 with int64)
    pdt.assert_frame_equal(df[selected_columns], hive_df, check_dtype=False)
    print(f"Test completed for the following data types: {selected_columns}")


# Create dataset on local filesystem
store_factory = partial(storefact.get_store_from_url,
                        f"hfs://{VOLUME_LOCATION}")

df = get_dataframe_not_nested(100)
# Rename because `date` and `null` are reserved in Hive QL
df = df.rename(columns={"date": "date_", "null": "null_"})
assert_hive_compat(df, store_factory, uuid="test")