def setup(self, column, filter_size, array_size):
    if column == "null":
        raise NotImplementedError()
    # Shuffle the column so the filter values are spread across the array.
    ser = (
        get_dataframe_not_nested(array_size)
        .sample(frac=1.0)
        .reset_index(drop=True)[column]
    )
    self.arr = ser.values
    # Draw `filter_size` rows and deduplicate to get the filter values.
    self.value = ser.sample(n=filter_size).unique()
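# A hedged sketch of the timing method this setup presumably pairs with (the
# benchmark class itself is not shown here). filter_array_like is assumed to
# come from kartothek.serialization._generic; the method name is hypothetical.
def time_filter_array_like(self, column, filter_size, array_size):
    from kartothek.serialization._generic import filter_array_like

    # Apply an "in" filter of the sampled unique values against the array.
    filter_array_like(self.arr, "in", self.value)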
def dataset_to_copy(store):
    df = get_dataframe_not_nested(10)
    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=SRC_DS_UUID,
        store=store,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
def test_store_input_types(store_input_types, bound_store_dataframes):
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    assert bound_store_dataframes(
        [df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
def setup(self, predicate):
    if predicate == "conjunctions":
        self.predicate = [
            [
                ("int16", ">", 123),
                ("int32", "<", 321),
                ("bool", "==", True),
                ("bool", "==", True),
            ]
        ]
    elif predicate == "disjunctions":
        self.predicate = [
            [("int16", ">", 123)],
            [("int32", "<", 321)],
            [("int32", "<", 321)],
            [("int32", "<", 321)],
        ]
    self.df = get_dataframe_not_nested(10**5)
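# A hedged sketch of the corresponding timing method (not shown above),
# assuming filter_df_from_predicates from kartothek.serialization._generic;
# the method name is hypothetical.
def time_filter_df_from_predicates(self, predicate):
    from kartothek.serialization._generic import filter_df_from_predicates

    # Evaluate the conjunction/disjunction built in setup() against the frame.
    filter_df_from_predicates(self.df, self.predicate)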
def test_store_input_types(store_input_types, bound_load_dataframes):
    from kartothek.io.eager import store_dataframes_as_dataset
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )

    # Use predicates to trigger partition pruning with indices
    predicates = [
        [
            (df.columns[0], "==", df.loc[0, df.columns[0]]),
            (df.columns[1], "==", df.loc[0, df.columns[1]]),
        ]
    ]

    result = bound_load_dataframes(
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        predicates=predicates,
        dates_as_object=True,
    )

    # Normalize the backend-specific return type to a single DataFrame.
    if isinstance(result, list):
        result = result[0]
    if isinstance(result, MetaPartition):
        result = result.data
    if isinstance(result, dict):
        result = result[SINGLE_TABLE]

    pdt.assert_frame_equal(result, df.head(1), check_dtype=False)
def setup(self, num_rows, chunk_size):
    self.df = get_dataframe_not_nested(num_rows)
    self.serialiser = ParquetSerializer(chunk_size=chunk_size)
    self.store = get_store_from_url("memory://")
    # Persist the frame once so the benchmarks can read it back.
    self.key = self.serialiser.store(self.store, "key_prefix", self.df)
    self.predicates = [[("int16", "==", 123)]]
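# A hedged sketch of a read benchmark this setup could feed, exercising
# predicate pushdown through ParquetSerializer.restore_dataframe. The method
# name is hypothetical; the restore_dataframe call is kartothek's public API.
def time_restore_with_predicates(self, num_rows, chunk_size):
    self.serialiser.restore_dataframe(
        self.store, self.key, predicates=self.predicates
    )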
def setup(self, column):
    if column == "null":
        raise NotImplementedError()
    # Shuffle the column, then pick a single scalar to filter against.
    self.arr = (
        get_dataframe_not_nested(10**5)
        .sample(frac=1.0)
        .reset_index(drop=True)[column]
        .values
    )
    self.value = self.arr[12345]
def dataframe_not_nested():
    return get_dataframe_not_nested(10)
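# Assuming the helper above is registered as a pytest fixture in its conftest
# (an assumption; the decorator is not visible in this excerpt), a test
# consumes it by parameter name. The test below is a hypothetical illustration.
def test_dataframe_not_nested_shape(dataframe_not_nested):
    # get_dataframe_not_nested(10) builds a 10-row sample frame.
    assert len(dataframe_not_nested) == 10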
# (tail of assert_hive_compat; hive_query, cursor, conn, and
#  selected_columns_and_dtypes are defined earlier in the function)
    print(f"Hive query: {hive_query}")
    cursor.execute(hive_query)

    # Get the column names from the SELECT-clause substring.
    selected_columns = [
        l.strip().split(" ")[0] for l in selected_columns_and_dtypes.splitlines()
    ]

    # Read the Hive table back into pandas.
    hive_df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
    hive_df.columns = selected_columns

    # Pyarrow stores timestamps as microseconds since epoch; scale to
    # nanoseconds so pandas parses them back into datetimes.
    hive_df["datetime64"] = pd.to_datetime(
        hive_df.loc[:, "datetime64"] * 1000, unit="ns"
    )
    # Hive returns dates as strings; parse them back into date objects.
    hive_df["date_"] = pd.to_datetime(
        hive_df.loc[:, "date_"], format="%Y-%m-%d"
    ).apply(lambda x: x.date())

    # Ignore dtypes for numeric comparisons (e.g. int32 vs. int64).
    pdt.assert_frame_equal(df[selected_columns], hive_df, check_dtype=False)
    print(f"Test completed for the following data types: {selected_columns}")


# Create dataset on local filesystem
store_factory = partial(storefact.get_store_from_url, f"hfs://{VOLUME_LOCATION}")

df = get_dataframe_not_nested(100)
# Rename because `date` and `null` are reserved words in Hive QL.
df = df.rename(columns={"date": "date_", "null": "null_"})

assert_hive_compat(df, store_factory, uuid="test")
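# A hedged sketch of what the omitted head of assert_hive_compat presumably
# does: store the frame as a kartothek dataset, then point an external Hive
# table at the resulting Parquet files. Everything below (the helper name,
# the connection handling, the CREATE TABLE outline) is an assumption for
# illustration, not the script's actual code.
def assert_hive_compat_head_sketch(df, store_factory, uuid):
    from kartothek.io.eager import store_dataframes_as_dataset

    # Write the dataset so its Parquet files land under VOLUME_LOCATION.
    dataset = store_dataframes_as_dataset(
        dfs=[df], dataset_uuid=uuid, store=store_factory
    )
    # A Hive connection would then be opened (e.g. via pyhive) and a query
    # along these lines issued before the comparison shown above:
    #   CREATE EXTERNAL TABLE <TABLE_NAME> (<columns and dtypes>)
    #   STORED AS PARQUET LOCATION '<VOLUME_LOCATION>/<uuid>/...'
    return dataset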