def _partition_data(self, partition_on):
    """Split ``self.data`` into one dataframe per unique combination of
    values in the ``partition_on`` columns.

    Returns a dict mapping the new partition label (existing quoted index
    path, then ``col=value`` segments, then the base label) to the
    sub-dataframe with the partition columns dropped.

    Raises
    ------
    ValueError
        If a partition column is missing from the dataframe, if no data
        columns would remain after dropping the partition columns, or if
        rows were lost during the groupby (e.g. null partition values).
    """
    existing_indices, base_label = decode_key(
        "uuid/table/{}".format(self.label)
    )[2:]
    dct = dict()
    df = self.data

    # Check that data sizes do not change. This might happen if the
    # groupby below drops data, e.g. nulls
    size_after = 0
    size_before = len(df)

    # Implementation from pyarrow
    # See https://github.com/apache/arrow/blob/b33dfd9c6bd800308bb1619b237dbf24dea159be/python/pyarrow/parquet.py#L1030  # noqa: E501

    # column sanity checks
    data_cols = set(df.columns).difference(partition_on)
    missing_po_cols = set(partition_on).difference(df.columns)
    if missing_po_cols:
        raise ValueError(
            "Partition column(s) missing: {}".format(
                ", ".join(sorted(missing_po_cols))
            )
        )
    if len(data_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    # To be aligned with open source tooling we drop the index columns and
    # recreate them upon reading as it is done by fastparquet and pyarrow
    partition_keys = [df[col] for col in partition_on]

    data_df = df.drop(partition_on, axis="columns")
    for value, group in data_df.groupby(by=partition_keys, sort=False):
        partitioning_info = []

        # A single partition column yields a scalar group key; normalize
        # it to a list so it zips with ``partition_on`` below.
        if pd.api.types.is_scalar(value):
            value = [value]

        if existing_indices:
            partitioning_info.extend(quote_indices(existing_indices))
        partitioning_info.extend(quote_indices(zip(partition_on, value)))
        partitioning_info.append(base_label)
        new_label = "/".join(partitioning_info)

        # Fixed: the original pre-seeded ``dct[new_label] = {}`` behind a
        # membership check and then unconditionally overwrote it with
        # ``group`` — dead code, removed.
        dct[new_label] = group
        size_after += len(group)

    if size_before != size_after:
        raise ValueError(
            f"Original dataframe size ({size_before} rows) does not "
            f"match new dataframe size ({size_after} rows). "
            f"Hint: you may see this if you are trying to use `partition_on` on a column with null values."
        )
    return dct
def create_partition_key(
    dataset_uuid: str,
    table: str,
    index_values: List[Tuple[str, str]],
    filename: str = "data",
):
    """
    Create partition key for a kartothek partition

    Parameters
    ----------
    dataset_uuid
    table
    index_values
    filename

    Example:
        create_partition_key('my-uuid', 'testtable', [('index1', 'value1'), ('index2', 'value2')])

        returns 'my-uuid/testtable/index1=value1/index2=value2/data'
    """
    # Assemble uuid, table, the quoted index segments, and the filename
    # into a single slash-separated storage key.
    components = [dataset_uuid, table]
    components.extend(quote_indices(index_values))
    components.append(filename)
    return "/".join(components)
def test_index_quote_roundtrip():
    # Pairs of (input index tuple, expected tuple after quote/unquote):
    # keys are stringified and byte values come back as str.
    cases = [
        ((1, b"Muenchen"), ("1", "Muenchen")),
        (("location", b"Muenchen"), ("location", "Muenchen")),
        (("location", "München"), ("location", "München")),
        (("product", "å\\ øß"), ("product", "å\\ øß")),
    ]
    inputs = [original for original, _ in cases]
    expected = [roundtripped for _, roundtripped in cases]
    assert unquote_indices(quote_indices(inputs)) == expected
def _get_partition_label(indices, filename, metadata_version):
    """Build a partition label from the quoted indices and the filename
    stripped of its parquet suffix.

    NOTE: ``metadata_version`` is accepted but not used in this body.
    """
    stem = filename.replace(PARQUET_FILE_SUFFIX, "")
    segments = quote_indices(indices) + [stem]
    return "/".join(segments)