Example #1
def from_buffer(buf, format="json", explicit_partitions=True):
    if format == "json":
        metadata = load_json(buf)
    else:
        metadata = msgpack.unpackb(buf)
    return DatasetMetadata.from_dict(
        metadata, explicit_partitions=explicit_partitions)


def from_buffer(buf: str, format: str = "json", explicit_partitions: bool = True):
    if format == "json":
        metadata = load_json(buf)
    else:
        metadata = unpackb(buf)
    return DatasetMetadata.from_dict(
        metadata, explicit_partitions=explicit_partitions
    )
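Both variants dispatch on the format argument between JSON and msgpack decoding. A minimal, self-contained sketch of the same round trip, using the standard json module and the third-party msgpack package (assumed to be installed); the dispatcher below is illustrative, not kartothek's implementation:

import json

import msgpack


def load_metadata(buf, format="json"):
    # Mirror the dispatch in from_buffer: JSON for text buffers, msgpack for binary ones.
    if format == "json":
        return json.loads(buf)
    elif format == "msgpack":
        # With msgpack >= 1.0, string keys round-trip as str by default.
        return msgpack.unpackb(buf)
    raise ValueError("Unknown format: {!r}".format(format))


metadata = {"dataset_uuid": "example", "partitions": {}}
assert load_metadata(json.dumps(metadata)) == metadata
assert load_metadata(msgpack.packb(metadata), format="msgpack") == metadata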
Example #3
    def load_from_store(
        uuid: str,
        store: StoreInput,
        load_schema: bool = True,
        load_all_indices: bool = False,
    ) -> "DatasetMetadata":
        """
        Load a dataset from a store.

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .get method for file/object loading.
        load_schema
            Load the table schema.
        load_all_indices
            Load all registered indices into memory.

        Returns
        -------
        dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
            Parsed metadata.
        """
        key1 = naming.metadata_key_from_uuid(uuid)
        store = ensure_store(store)
        try:
            value = store.get(key1)
            metadata = load_json(value)
        except KeyError:
            key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
            try:
                value = store.get(key2)
                metadata = unpackb(value)
            except KeyError:
                raise KeyError(
                    "Dataset does not exist. Tried {} and {}".format(
                        key1, key2))

        ds = DatasetMetadata.load_from_dict(metadata,
                                            store,
                                            load_schema=load_schema)
        if load_all_indices:
            ds = ds.load_all_indices(store)
        return ds
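load_from_store resolves the metadata location by trying the JSON key first and falling back to the msgpack key only when that lookup raises KeyError. A minimal sketch of that lookup strategy, using a plain dict in place of a simplekv store and illustrative key names (the real keys come from naming.metadata_key_from_uuid):

def fetch_metadata_blob(store, uuid):
    # Key layout is illustrative; kartothek derives it via naming.metadata_key_from_uuid.
    key_json = "{}.by-dataset-metadata.json".format(uuid)
    key_msgpack = "{}.by-dataset-metadata.msgpack".format(uuid)
    try:
        return store[key_json], "json"
    except KeyError:
        try:
            return store[key_msgpack], "msgpack"
        except KeyError:
            raise KeyError(
                "Dataset does not exist. Tried {} and {}".format(key_json, key_msgpack)
            )


store = {"my_dataset.by-dataset-metadata.msgpack": b"\x80"}  # fake msgpack payload (empty map)
blob, fmt = fetch_metadata_blob(store, "my_dataset")
assert fmt == "msgpack"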
Example #4
    def load_from_buffer(buf, store, format="json"):
        """
        Load a dataset from a (string) buffer.

        Parameters
        ----------
        buf: Union[str, bytes]
            Input to be parsed.
        store: simplekv.KeyValueStore
            Object that implements the .get method for file/object loading.
        format: str
            Format of the buffer; either ``"json"`` or ``"msgpack"``.

        Returns
        -------
        dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
            Parsed metadata.
        """
        if format == "json":
            metadata = load_json(buf)
        elif format == "msgpack":
            metadata = msgpack.unpackb(buf)
        return DatasetMetadata.load_from_dict(metadata, store)
Example #5
    def load_from_buffer(buf,
                         store: StoreInput,
                         format: str = "json") -> "DatasetMetadata":
        """
        Load a dataset from a (string) buffer.

        Parameters
        ----------
        buf:
            Input to be parsed.
        store:
            Object that implements the .get method for file/object loading.
        format:
            Format of the buffer; either ``"json"`` or ``"msgpack"``.

        Returns
        -------
        DatasetMetadata:
            Parsed metadata.
        """
        if format == "json":
            metadata = load_json(buf)
        elif format == "msgpack":
            metadata = unpackb(buf)
        return DatasetMetadata.load_from_dict(metadata, store)
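The typed variant takes a StoreInput, and load_from_store above first normalizes it via ensure_store. A minimal sketch of such a normalizer, under the assumption that a store input may be either a concrete simplekv.KeyValueStore or a zero-argument factory returning one (kartothek's actual StoreInput and ensure_store may accept more, e.g. store URLs):

from typing import Callable, Union

from simplekv import KeyValueStore
from simplekv.memory import DictStore

# Assumed shape of the store input; not copied from kartothek.
StoreInput = Union[KeyValueStore, Callable[[], KeyValueStore]]


def ensure_store(store: StoreInput) -> KeyValueStore:
    # Pass concrete stores through unchanged, call factories.
    if isinstance(store, KeyValueStore):
        return store
    return store()


assert isinstance(ensure_store(DictStore()), KeyValueStore)
assert isinstance(ensure_store(DictStore), KeyValueStore)  # the class itself acts as a factory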
Example #6
def validate_shared_columns(schemas, ignore_pandas=False):
    """
    Validate that columns that are shared amongst schemas are compatible.

    Only DataFrame columns are taken into account, other fields (like index data) are ignored. The following data must
    be an exact match:

    - metadata (as stored in the ``"columns"`` list of the ``b'pandas'`` schema metadata)
    - pyarrow type (that means that e.g. ``int8`` and ``int64`` are NOT compatible)

    Columns that are only present in a subset of the provided schemas must only be compatible for that subset, i.e.
    non-existing columns are ignored. The order of the columns in the provided schemas is irrelevant.

    Type normalization should be handled by :meth:`make_meta`.

    If none of the schemas contain any pandas metadata, the Arrow schemas are checked
    directly for compatibility; the metadata information is then not checked (as it is
    non-existent).

    Parameters
    ----------
    schemas: List[Schema]
        Schema information from multiple sources, e.g. multiple tables. List may be empty.
    ignore_pandas: bool
        Ignore the schema information given by Pandas and always use the Arrow schema.

    Raises
    ------
    ValueError
        Incompatible columns were found.
    """
    seen = {}
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas

    for schema in schemas:
        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            columns = []
            for cmd in pandas_metadata["columns"]:
                name = cmd.get("name")
                if name is None:
                    continue
                columns.append(cmd["field_name"])
        else:
            columns = schema.names

        for col in columns:
            field_idx = schema.get_field_index(col)
            field = schema[field_idx]
            obj = (field, col)
            if col in seen:
                ref = seen[col]
                if pa.types.is_null(ref[0].type) or pa.types.is_null(
                        field.type):
                    continue
                if ref != obj:
                    raise ValueError(
                        'Found incompatible entries for column "{}"\n{}\n{}'.
                        format(col, ref, obj))
            else:
                seen[col] = obj
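A hedged usage sketch for the function above: two plain pyarrow schemas that disagree on the type of a shared column should raise the documented ValueError. The import path is an assumption about kartothek's module layout:

import pyarrow as pa

from kartothek.core.common_metadata import validate_shared_columns  # assumed location

schema_a = pa.schema([("x", pa.int64()), ("y", pa.string())])
schema_b = pa.schema([("x", pa.int8())])  # same name, different type -> incompatible

try:
    validate_shared_columns([schema_a, schema_b], ignore_pandas=True)
except ValueError as exc:
    print(exc)  # Found incompatible entries for column "x" ...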
Example #7
def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Iterate over a list of `pyarrow.Schema` objects and prepare them for comparison by picking a reference
    and determining all null columns.

    .. note::

        If pandas metadata exists, the version stored in the metadata is overwritten with the
        currently installed pandas version, since we expect to stay backwards compatible.

    Returns
    -------
    reference: Schema
        A reference schema picked from the input list. The reference schema is guaranteed to be
        the schema with the fewest null columns of all input schemas, and its set of null columns
        is guaranteed to be a true subset of the null columns of every input schema. If no such
        schema can be found, an exception is raised.
    list_of_schemas: List[Tuple[Schema, List]]
        A list of (Schema, null_columns) pairs, where null_columns are the columns that are null
        and must be removed before comparing the schemas.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    schemas_to_evaluate = []
    reference = None
    null_cols_in_reference = set()

    for schema in schemas:
        if not isinstance(schema, SchemaWrapper):
            schema = SchemaWrapper(schema, "__unknown__")

        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            # we don't care about the pandas version, since we assume it's safe
            # to read datasets that were written by older or newer versions.
            pandas_metadata["pandas_version"] = "{}".format(pd.__version__)

            metadata_clean = deepcopy(metadata)
            metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata)
            current = SchemaWrapper(pa.schema(schema, metadata_clean),
                                    schema.origin)
        else:
            current = schema

        # If a field is null we cannot compare it and must therefore reject it
        null_columns = {
            field.name
            for field in current if field.type == pa.null()
        }

        # Determine a valid reference schema. A valid reference schema is the input schema
        # with the fewest empty columns.
        # The reference schema ought to be a schema whose set of empty columns is a subset of
        # every other schema's empty columns. This ensures that the reference schema carries
        # the most information possible. A schema which doesn't fulfil this requirement would
        # weaken the comparison and would allow for false positives.

        # Trivial case
        if reference is None:
            reference = current
            null_cols_in_reference = null_columns
        # The reference has enough information to validate against current schema.
        # Append it to the list of schemas to be verified
        elif null_cols_in_reference.issubset(null_columns):
            schemas_to_evaluate.append((current, null_columns))
        # current schema includes all information of reference and more.
        # Add reference to schemas_to_evaluate and update reference
        elif null_columns.issubset(null_cols_in_reference):
            schemas_to_evaluate.append((reference, null_cols_in_reference))
            reference = current
            null_cols_in_reference = null_columns
        # If there is no clear subset available elect the schema with the least null columns as `reference`.
        # Iterate over the null columns of `reference` and replace it with a non-null field of the `current`
        # schema which recovers the loop invariant (null columns of `reference` is subset of `current`)
        else:
            if len(null_columns) < len(null_cols_in_reference):
                reference, current = current, reference
                null_cols_in_reference, null_columns = (
                    null_columns,
                    null_cols_in_reference,
                )

            for col in null_cols_in_reference - null_columns:
                # Enrich the information in the reference by grabbing the missing fields
                # from the current iteration. This assumes that we only check for global validity
                # and that it is irrelevant where the reference comes from.
                reference = _swap_fields_by_name(reference, current, col)
                null_cols_in_reference.remove(col)
            schemas_to_evaluate.append((current, null_columns))

    assert (reference is not None) or (not schemas_to_evaluate)

    return reference, schemas_to_evaluate
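The reference election above only ever reasons about sets of null-column names, so the invariant is easier to see on bare sets. The sketch below illustrates that bookkeeping only (in the real code the reference schema is additionally enriched via _swap_fields_by_name); it is not kartothek's implementation:

def elect_reference(null_col_sets):
    # Each "schema" is reduced to the set of its null-column names.
    reference = None          # index of the current reference schema
    null_in_ref = set()       # null columns of the (possibly enriched) reference
    to_evaluate = []          # (index, null_columns) pairs to validate against the reference

    for idx, null_cols in enumerate(null_col_sets):
        null_cols = set(null_cols)
        if reference is None:                   # trivial case: first schema becomes the reference
            reference, null_in_ref = idx, null_cols
        elif null_in_ref <= null_cols:          # reference already carries more information
            to_evaluate.append((idx, null_cols))
        elif null_cols <= null_in_ref:          # current schema is strictly more informative
            to_evaluate.append((reference, null_in_ref))
            reference, null_in_ref = idx, null_cols
        else:                                   # no subset relation: keep the smaller set as reference
            if len(null_cols) < len(null_in_ref):
                reference, idx = idx, reference
                null_in_ref, null_cols = null_cols, null_in_ref
            null_in_ref &= null_cols            # "borrow" the missing fields from the other schema
            to_evaluate.append((idx, null_cols))

    return reference, null_in_ref, to_evaluate


ref_idx, null_in_ref, pairs = elect_reference([{"a", "b"}, {"b"}, {"a", "c"}])
# Loop invariant: the reference's null columns are a subset of every evaluated schema's null columns.
assert all(null_in_ref <= nulls for _, nulls in pairs)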
Example #8
def _pandas_meta_from_schema(schema):
    if ARROW_LARGER_EQ_0130:
        pandas_metadata = schema.pandas_metadata
    else:
        pandas_metadata = load_json(schema.metadata[b"pandas"].decode("utf8"))
    return pandas_metadata
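For context, ARROW_LARGER_EQ_0130 presumably guards the pyarrow 0.13 API change: newer pyarrow exposes the decoded pandas metadata directly as Schema.pandas_metadata, which is what the fallback branch reconstructs by hand. A small sketch (requires pandas and a recent pyarrow):

import json

import pandas as pd
import pyarrow as pa

table = pa.Table.from_pandas(pd.DataFrame({"x": [1, 2, 3]}))

# Both paths yield the dict stored under the b"pandas" key of the schema metadata.
via_property = table.schema.pandas_metadata
via_raw_key = json.loads(table.schema.metadata[b"pandas"].decode("utf8"))
assert via_property == via_raw_key
print(via_property["columns"][0]["name"])  # "x"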