def from_buffer(buf, format="json", explicit_partitions=True):
    """
    Parse serialized dataset metadata from a buffer.

    Parameters
    ----------
    buf
        Serialized metadata.
    format
        ``"json"`` decodes ``buf`` with ``load_json``; any other value
        decodes it with ``msgpack.unpackb``.
    explicit_partitions
        Forwarded unchanged to ``DatasetMetadata.from_dict``.

    Returns
    -------
    The object returned by ``DatasetMetadata.from_dict``.
    """
    # Only the selected decoder is looked up, mirroring the if/else form.
    decode = load_json if format == "json" else msgpack.unpackb
    return DatasetMetadata.from_dict(
        decode(buf), explicit_partitions=explicit_partitions
    )
def from_buffer(buf: str, format: str = "json", explicit_partitions: bool = True):
    """
    Parse serialized dataset metadata from a buffer.

    Parameters
    ----------
    buf
        Serialized metadata.
    format
        ``"json"`` decodes ``buf`` with ``load_json``; any other value
        decodes it with ``unpackb``.
    explicit_partitions
        Forwarded unchanged to ``DatasetMetadata.from_dict``.

    Returns
    -------
    The object returned by ``DatasetMetadata.from_dict``.
    """
    if format != "json":
        # Everything that is not JSON is treated as msgpack.
        parsed = unpackb(buf)
    else:
        parsed = load_json(buf)
    return DatasetMetadata.from_dict(parsed, explicit_partitions=explicit_partitions)
def load_from_store(
    uuid: str,
    store: StoreInput,
    load_schema: bool = True,
    load_all_indices: bool = False,
) -> "DatasetMetadata":
    """
    Load a dataset from a storage.

    The JSON metadata key is tried first; if the store raises ``KeyError``
    for it, the msgpack metadata key is tried instead.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .get method for file/object loading.
    load_schema
        Load table schema.
    load_all_indices
        Load all registered indices into memory.

    Returns
    -------
    dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
        Parsed metadata.

    Raises
    ------
    KeyError
        Neither the JSON nor the msgpack metadata key could be read.
    """
    json_key = naming.metadata_key_from_uuid(uuid)
    store = ensure_store(store)
    try:
        metadata = load_json(store.get(json_key))
    except KeyError:
        # The msgpack key is only derived once the JSON lookup has failed.
        msgpack_key = naming.metadata_key_from_uuid(uuid, format="msgpack")
        try:
            metadata = unpackb(store.get(msgpack_key))
        except KeyError:
            raise KeyError(
                "Dataset does not exist. Tried {} and {}".format(json_key, msgpack_key)
            )

    dataset = DatasetMetadata.load_from_dict(metadata, store, load_schema=load_schema)
    if not load_all_indices:
        return dataset
    return dataset.load_all_indices(store)
def load_from_buffer(buf, store, format="json"):
    """
    Load a dataset from a (string) buffer.

    Parameters
    ----------
    buf: Union[str, bytes]
        Input to be parsed.
    store: simplekv.KeyValueStore
        Object that implements the .get method for file/object loading.
    format: str
        Serialization format of ``buf``; either ``"json"`` or ``"msgpack"``.

    Returns
    -------
    dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
        Parsed metadata.

    Raises
    ------
    ValueError
        ``format`` is neither ``"json"`` nor ``"msgpack"``.
    """
    if format == "json":
        metadata = load_json(buf)
    elif format == "msgpack":
        metadata = msgpack.unpackb(buf)
    else:
        # Without this branch ``metadata`` would be unbound below and the
        # caller would get an UnboundLocalError instead of a usable message.
        raise ValueError(
            "Unknown metadata format {!r}. Expected 'json' or 'msgpack'.".format(
                format
            )
        )
    return DatasetMetadata.load_from_dict(metadata, store)
def load_from_buffer(buf, store: StoreInput, format: str = "json") -> "DatasetMetadata":
    """
    Load a dataset from a (string) buffer.

    Parameters
    ----------
    buf:
        Input to be parsed.
    store:
        Object that implements the .get method for file/object loading.
    format:
        Serialization format of ``buf``; either ``"json"`` or ``"msgpack"``.

    Returns
    -------
    DatasetMetadata:
        Parsed metadata.

    Raises
    ------
    ValueError
        ``format`` is neither ``"json"`` nor ``"msgpack"``.
    """
    if format == "json":
        metadata = load_json(buf)
    elif format == "msgpack":
        metadata = unpackb(buf)
    else:
        # Without this branch ``metadata`` would be unbound below and the
        # caller would get an UnboundLocalError instead of a usable message.
        raise ValueError(
            "Unknown metadata format {!r}. Expected 'json' or 'msgpack'.".format(
                format
            )
        )
    return DatasetMetadata.load_from_dict(metadata, store)
def validate_shared_columns(schemas, ignore_pandas=False):
    """
    Check that columns shared between several schemas are compatible.

    For every column name, the first schema that contains it provides the
    reference entry ``(field, column_name)``. Each later occurrence of that
    column must compare equal to the reference, so e.g. an ``int8`` and an
    ``int64`` column of the same name are NOT compatible. A comparison is
    skipped whenever either the reference field or the current field has the
    Arrow ``null`` type.

    Columns that only occur in a subset of the schemas are only compared
    within that subset. The order of columns inside a schema is irrelevant.

    If pandas metadata is in use (see ``ignore_pandas``), the columns of a
    schema are taken from the ``"columns"`` list of its ``b"pandas"``
    metadata, skipping entries whose ``"name"`` is ``None``. Otherwise all
    names of the Arrow schema are used.

    Parameters
    ----------
    schemas: List[Schema]
        Schema information from multiple sources, e.g. multiple tables. List
        may be empty.
    ignore_pandas: bool
        Ignore the schema information given by Pandas and always use the
        Arrow schema.

    Raises
    ------
    ValueError
        Incompatible columns were found, or pandas metadata is in use but a
        schema does not carry any.
    """
    use_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    first_seen = {}

    for schema in schemas:
        if use_pandas:
            raw_meta = schema.metadata
            if raw_meta is None or b"pandas" not in raw_meta:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level."
                )
            pandas_meta = load_json(raw_meta[b"pandas"].decode("utf8"))
            # Entries without a name are skipped.
            column_names = [
                entry["field_name"]
                for entry in pandas_meta["columns"]
                if entry.get("name") is not None
            ]
        else:
            column_names = schema.names

        for col in column_names:
            field = schema[schema.get_field_index(col)]
            candidate = (field, col)

            if col not in first_seen:
                first_seen[col] = candidate
                continue

            reference = first_seen[col]
            # A null-typed field carries no type information to compare.
            if pa.types.is_null(reference[0].type) or pa.types.is_null(field.type):
                continue
            if reference != candidate:
                raise ValueError(
                    'Found incompatible entries for column "{}"\n{}\n{}'.format(
                        col, reference, candidate
                    )
                )
def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Iterate over a list of `pyarrow.Schema` objects and prepare them for comparison by picking a
    reference and determining all null columns.

    .. note::

        If pandas metadata exists, the version stored in the metadata is overwritten with the
        currently installed version since we expect to stay backwards compatible

    Parameters
    ----------
    schemas
        Iterable of schemas. Items that are not already a ``SchemaWrapper`` are wrapped with the
        origin ``"__unknown__"``.
    ignore_pandas
        If true, the pandas metadata is never inspected or rewritten, even if present.

    Returns
    -------
    reference: Schema
        A reference schema which is picked from the input list, or ``None`` if ``schemas`` is
        empty. While iterating, the set of null columns of the reference is kept a subset of the
        null columns of every schema placed in ``list_of_schemas``. If neither the reference nor
        the current schema has a null-column set that is a subset of the other, the one with
        fewer null columns becomes the reference and its null fields that are non-null in the
        other schema are replaced via ``_swap_fields_by_name``; no exception is raised for this
        case.
    list_of_schemas: List[Tuple[Schema, Set[str]]]
        A list holding pairs of (Schema, null_columns) where the null_columns are all columns
        which are null and must be removed before comparing the schemas

    Raises
    ------
    ValueError
        Pandas metadata is in use but one of the schemas does not carry any.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    schemas_to_evaluate = []
    reference = None
    null_cols_in_reference = set()

    for schema in schemas:
        if not isinstance(schema, SchemaWrapper):
            schema = SchemaWrapper(schema, "__unknown__")

        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            # we don't care about the pandas version, since we assume it's safe
            # to read datasets that were written by older or newer versions.
            pandas_metadata["pandas_version"] = "{}".format(pd.__version__)

            # Work on a copy so the metadata mapping of the input schema is not modified.
            metadata_clean = deepcopy(metadata)
            metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata)
            current = SchemaWrapper(pa.schema(schema, metadata_clean), schema.origin)
        else:
            current = schema

        # If a field is null we cannot compare it and must therefore reject it
        null_columns = {
            field.name for field in current if field.type == pa.null()
        }

        # Determine a valid reference schema. A valid reference schema is considered to be the schema
        # of all input schemas with the least empty columns.
        # The reference schema ought to be a schema whose empty columns are a true subset for all sets
        # of empty columns. This ensures that the actual reference schema is the schema with the most
        # information possible. A schema which doesn't fulfil this requirement would weaken the
        # comparison and would allow for false positives

        # Trivial case: the first schema becomes the reference and is not added to the result list.
        if reference is None:
            reference = current
            null_cols_in_reference = null_columns
        # The reference has enough information to validate against current schema.
        # Append it to the list of schemas to be verified
        elif null_cols_in_reference.issubset(null_columns):
            schemas_to_evaluate.append((current, null_columns))
        # current schema includes all information of reference and more.
        # Add reference to schemas_to_evaluate and update reference
        elif null_columns.issubset(null_cols_in_reference):
            schemas_to_evaluate.append((reference, null_cols_in_reference))
            reference = current
            null_cols_in_reference = null_columns
        # If there is no clear subset available elect the schema with the least null columns as `reference`.
        # Iterate over the null columns of `reference` and replace it with a non-null field of the `current`
        # schema which recovers the loop invariant (null columns of `reference` is subset of `current`)
        else:
            if len(null_columns) < len(null_cols_in_reference):
                # Swap roles: the former reference is now treated as `current`.
                reference, current = current, reference
                null_cols_in_reference, null_columns = (
                    null_columns,
                    null_cols_in_reference,
                )

            # The set difference is a new set, so removing from `null_cols_in_reference`
            # inside the loop does not disturb the iteration.
            for col in null_cols_in_reference - null_columns:
                # Enrich the information in the reference by grabbing the missing fields
                # from the current iteration. This assumes that we only check for global validity and
                # isn't relevant where the reference comes from.
                reference = _swap_fields_by_name(reference, current, col)
                null_cols_in_reference.remove(col)
            schemas_to_evaluate.append((current, null_columns))

    # A missing reference is only acceptable when there is nothing to evaluate.
    assert (reference is not None) or (not schemas_to_evaluate)

    return reference, schemas_to_evaluate
def _pandas_meta_from_schema(schema):
    """
    Return the pandas metadata attached to ``schema``.

    When ``ARROW_LARGER_EQ_0130`` is truthy the ``pandas_metadata`` attribute
    of the schema is returned; otherwise the ``b"pandas"`` entry of the schema
    metadata is decoded as UTF-8 and parsed with ``load_json``.
    """
    if not ARROW_LARGER_EQ_0130:
        # Fallback path: parse the raw metadata entry ourselves.
        return load_json(schema.metadata[b"pandas"].decode("utf8"))
    return schema.pandas_metadata