def validate_schema_compatible(self, store: StoreInput, dataset_uuid: str) -> "MetaPartition":
    """
    Validates that the currently held DataFrames match the schema of the existing dataset.

    Parameters
    ----------
    store
        If it is a function, the result of calling it must be a KeyValueStore.
    dataset_uuid
        The dataset UUID the partition will be assigned to.
    """
    # Load the reference meta of the existing dataset. Using the built-in
    # `load_all_table_meta` would not be helpful here as it would be a no-op,
    # since we have already loaded the meta from the input DataFrame.
    store = ensure_store(store)
    reference_meta = read_schema_metadata(
        dataset_uuid=dataset_uuid, store=store, table=self.table_name
    )
    try:
        validate_compatible([self.schema, reference_meta])
    except ValueError as e:
        raise ValueError(
            f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
        )

    return self

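# A minimal usage sketch (not part of the library) of the check performed by
# `validate_schema_compatible` above: build a schema for a candidate DataFrame
# and validate it against a reference schema, failing fast on mismatch. The
# DataFrames and origins are made up for illustration, and the import path
# assumes kartothek; adjust it for your package layout.
import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible

reference_meta = make_meta(pd.DataFrame({"x": [1]}), origin="existing-dataset")
candidate_meta = make_meta(pd.DataFrame({"x": [2]}), origin="new-partition")

# Raises ValueError if the two schemas cannot be unified.
validate_compatible([candidate_meta, reference_meta])
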
def test_validate_compatible_other_pandas(df_all_types, remove_metadata, ignore_pandas):
    def _with_pandas(version):
        schema = make_meta(df_all_types, origin=version)
        metadata = schema.metadata
        pandas_metadata = simplejson.loads(metadata[b"pandas"].decode("utf8"))
        pandas_metadata["pandas_version"] = version
        metadata[b"pandas"] = simplejson.dumps(pandas_metadata).encode("utf8")
        schema = SchemaWrapper(pa.schema(schema, metadata), version)
        if remove_metadata:
            return schema.remove_metadata()
        else:
            return schema

    schema1 = make_meta(df_all_types, origin="all")
    schema2 = _with_pandas("0.19.0")
    schema3 = _with_pandas("0.99.0")

    if remove_metadata and not ignore_pandas:
        # This should fail as long as we have the metadata attached
        with pytest.raises(ValueError):
            validate_compatible(
                [schema1, schema2, schema3], ignore_pandas=ignore_pandas
            )
        schema1 = schema1.remove_metadata()

    validate_compatible([schema1, schema2, schema3], ignore_pandas=ignore_pandas)

def test_validate_empty_dataframe_corrupt_raises(
    df_all_types,
    df_all_types_schema,
    df_all_types_empty_schema,
    corrupt_column,
    corrupt_value,
    corrupt_dtype,
):
    # In case there is something wrong with the schema, raise!

    # First, an integer column carries a float or an object.
    df_corrupt = df_all_types.copy()
    # for value, dtype in [(-1.1, np.float64), ('a', np.object)]:
    df_corrupt[corrupt_column] = pd.Series([corrupt_value], dtype=corrupt_dtype)
    df_corrupt_meta = make_meta(df_corrupt, origin="1")

    # Raise when comparing the proper to the corrupt schema
    for schemas in permutations([df_all_types_schema, df_corrupt_meta]):
        with pytest.raises(ValueError):
            validate_compatible(schemas)

    # Also raise if there is a schema originating from an empty DF to make
    # sure the emptiness doesn't cancel the validation
    for schemas in permutations(
        [df_all_types_schema, df_corrupt_meta, df_all_types_empty_schema]
    ):
        with pytest.raises(ValueError):
            validate_compatible(schemas)

def test_validate_compatible_different(df_all_types):
    df2 = df_all_types.loc[:, df_all_types.columns[:2]].copy()
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df2, origin="2")
    with pytest.raises(ValueError) as exc:
        validate_compatible([schema1, schema2])
    assert str(exc.value).startswith("Schema violation")

def test_validate_empty_dataframe(
    df_all_types, df_all_types_schema, df_all_types_empty_schema
):
    # Do not raise in case one of the schemas is of an empty dataframe.
    # Test all permutations to ensure the implementation is not sensitive to
    # whether the first schema is empty or non-empty.
    for schemas in permutations([df_all_types_schema, df_all_types_empty_schema]):
        validate_compatible(schemas)
    validate_compatible([df_all_types_empty_schema, df_all_types_empty_schema])

def test_validate_different_cats_different_type():
    input_df = pd.DataFrame(
        {"categories": pd.Series(["a", "b", "c", "a"], dtype="category")}
    )
    input_df_2 = pd.DataFrame(
        {"categories": pd.Series([b"f", b"e", b"e", b"f"], dtype="category")}
    )
    meta = make_meta(input_df, origin="1")
    meta_2 = make_meta(input_df_2, origin="2")
    with pytest.raises(ValueError):
        validate_compatible([meta, meta_2])

def test_validate_different_cats_same_type():
    input_df = pd.DataFrame(
        {"categories": pd.Series(["a", "b", "c", "a"], dtype="category")}
    )
    input_df_2 = pd.DataFrame(
        {"categories": pd.Series(["f", "e", "e", "f"], dtype="category")}
    )
    input_df_3 = pd.DataFrame({"categories": pd.Series(["f", "e", "e", "f"])})

    meta = make_meta(input_df, origin="1")
    meta_2 = make_meta(input_df_2, origin="2")
    meta_3 = make_meta(input_df_3, origin="3")
    validate_compatible([meta, meta_2, meta_3])

def test_validate_compatible_same(df_all_types):
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df_all_types, origin="2")
    schema3 = make_meta(df_all_types, origin="3")
    validate_compatible([])
    validate_compatible([schema1])
    validate_compatible([schema1, schema2])
    validate_compatible([schema1, schema2, schema3])

def persist_common_metadata(
    schemas: Iterable[SchemaWrapper],
    update_dataset: Optional[DatasetFactory],
    store: KeyValueStore,
    dataset_uuid: str,
    table_name: str,
):
    if not schemas:
        return None
    schemas_set = set(schemas)
    del schemas

    if update_dataset:
        schemas_set.add(
            read_schema_metadata(
                dataset_uuid=dataset_uuid, store=store, table=table_name
            )
        )

    schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin))

    try:
        result = validate_compatible(schemas_sorted)
    except ValueError as e:
        raise ValueError(
            "Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}".format(
                dataset_uuid=dataset_uuid, e=e
            )
        )

    if result:
        store_schema_metadata(
            schema=result, dataset_uuid=dataset_uuid, store=store, table=table_name
        )
    return result

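# Hedged sketch of the core of persist_common_metadata above, without a real
# store: collect the per-partition schemas into a set, sort them by origin so
# error messages are reproducible, and validate them down to a single
# reference schema. Names and data are illustrative; the import path assumes
# kartothek.
import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible

schemas_set = {
    make_meta(pd.DataFrame({"x": [1]}), origin="part-0"),
    make_meta(pd.DataFrame({"x": [2]}), origin="part-1"),
}
schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin))

# Raises ValueError if any pair of schemas conflicts; otherwise returns the
# reference schema that would be persisted.
reference = validate_compatible(schemas_sorted)
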
def test_compat_old_rw_path(df_all_types, store):
    # Strip the DF down, since some column types weren't supported by the old
    # write path anyway.
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)

def add_metapartition(
    self,
    metapartition: "MetaPartition",
    schema_validation: bool = True,
):
    """
    Adds a metapartition to the internal list structure to enable batch processing.

    Parameters
    ----------
    metapartition
        The MetaPartition to be added.
    schema_validation
        If True (default), ensure that the schemas of both `MetaPartition` objects are compatible.
    """
    if self.is_sentinel:
        return metapartition

    existing_label = [mp_["label"] for mp_ in self.metapartitions]

    if any(
        [mp_["label"] in existing_label for mp_ in metapartition.metapartitions]
    ):
        raise RuntimeError(
            "Duplicate labels for nested metapartitions are not allowed!"
        )
    schema = metapartition.schema
    if schema_validation and schema:
        # This ensures that only schema-compatible metapartitions can be nested.
        # The schema returned by validate_compatible is the reference schema
        # with the most information, i.e. the fewest null columns.
        schema = validate_compatible([self.schema, metapartition.schema])

    new_object = MetaPartition(
        label="NestedMetaPartition",
        metadata_version=metapartition.metadata_version,
        schema=schema,
        partition_keys=metapartition.partition_keys or None,
        logical_conjunction=metapartition.logical_conjunction or None,
        table_name=metapartition.table_name,
    )

    # Add metapartition information to the new object
    new_metapartitions = self.metapartitions.copy()
    new_metapartitions.extend(metapartition.metapartitions.copy())
    new_object.metapartitions = new_metapartitions

    return new_object

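# Hedged usage sketch for add_metapartition: nest two single-partition
# MetaPartitions into one object for batch processing. The constructor
# keywords mirror the call inside the method above, but the exact signature
# may differ between versions; data, labels, and origins are made up.
import pandas as pd

from kartothek.core.common_metadata import make_meta
from kartothek.io_components.metapartition import MetaPartition

df0 = pd.DataFrame({"x": [1]})
df1 = pd.DataFrame({"x": [2]})

mp0 = MetaPartition(
    label="part-0", data=df0, schema=make_meta(df0, origin="part-0"), metadata_version=4
)
mp1 = MetaPartition(
    label="part-1", data=df1, schema=make_meta(df1, origin="part-1"), metadata_version=4
)

# Schema validation runs during nesting; the result holds both partitions.
nested = mp0.add_metapartition(mp1)
assert len(nested.metapartitions) == 2
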
def persist_common_metadata(partition_list, update_dataset, store, dataset_uuid):
    # hash the schemas for quick equality check with possible false negatives
    # (e.g. other pandas version or null schemas)
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(tm_dct.keys()):
            raise ValueError(
                (
                    "Input partitions for update have different tables than dataset:\n"
                    "Input partition tables: {}\n"
                    "Tables of existing dataset: {}"
                ).format(set(tm_dct.keys()), update_dataset.tables)
            )
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(
                    dataset_uuid=dataset_uuid, store=store, table=table
                )
            )

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}".format(
                    table=table, dataset_uuid=dataset_uuid, e=e
                )
            )

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(
            schema=schema, dataset_uuid=dataset_uuid, store=store, table=table
        )
    return result

def test_validate_schema_non_overlapping_nulls(df_all_types_schema):
    """
    Test that two schemas with non-overlapping null columns are valid
    """
    first_ix = np.random.randint(len(df_all_types_schema))
    second_ix = first_ix
    while second_ix == first_ix:
        second_ix = np.random.randint(len(df_all_types_schema))

    first_null = pa.field(name=df_all_types_schema.names[first_ix], type=pa.null())
    first_schema = df_all_types_schema.set(first_ix, first_null)

    second_null = pa.field(name=df_all_types_schema.names[second_ix], type=pa.null())
    second_schema = df_all_types_schema.set(second_ix, second_null)

    for schemas in permutations([first_schema, second_schema]):
        reference_schema = validate_compatible(schemas)

        # The reference schema should be the original schema
        # with the columns reconstructed
        assert df_all_types_schema == reference_schema

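# Hedged illustration of the property tested above: an all-null column is
# treated as compatible with a typed column, and validate_compatible returns
# the reference schema carrying the type information. DataFrames are made up;
# the import path assumes kartothek.
import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible

typed = make_meta(pd.DataFrame({"x": [1, 2]}), origin="typed")
nulls = make_meta(pd.DataFrame({"x": [None, None]}), origin="nulls")

# The returned reference schema keeps the integer type for "x".
reference = validate_compatible([nulls, typed])
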
def concat_metapartitions(metapartitions, label_merger=None):
    LOGGER.debug("Concatenating metapartitions")

    new_metadata_version = -1
    data = []
    schema = []
    for mp in metapartitions:
        new_metadata_version = max(new_metadata_version, mp.metadata_version)
        data.append(mp.data)
        schema.append(mp.schema)
        # Don't care about the partition_keys. If we try to merge
        # MetaPartitions without alignment the schemas won't match.
        partition_keys = mp.partition_keys

    categoricals = [
        col
        for col, dtype in data[0].items()
        if pd.api.types.is_categorical_dtype(dtype)
    ]

    if categoricals:
        data = align_categories(data, categoricals)
    new_df = pd.concat(data)

    new_schema = validate_compatible(schema)

    new_label = MetaPartition._merge_labels(metapartitions, label_merger)

    new_mp = MetaPartition(
        label=new_label,
        data=new_df,
        metadata_version=new_metadata_version,
        schema=new_schema,
        partition_keys=partition_keys,
    )

    return new_mp

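# Hedged sketch of the concat pattern above using plain DataFrames instead of
# MetaPartitions: validate that all per-part schemas are compatible, keep the
# resulting reference schema, and concatenate the data. Names and data are
# illustrative; the import path assumes kartothek.
import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible

parts = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [3]})]
schemas = [make_meta(part, origin=str(i)) for i, part in enumerate(parts)]

common_schema = validate_compatible(schemas)  # raises on incompatible parts
combined = pd.concat(parts)
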
def test_schema_dataframe_rountrip(index, df_all_types):
    df = pd.DataFrame(df_all_types, index=index)

    schema = make_meta(df, origin="1")
    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    validate_compatible([schema, make_meta(actual_df, origin="2")])

def peakmem_validate_compatible(self, num_schemas, has_na):
    validate_compatible(self.schemas)

def time_validate_compatible(self, num_schemas, has_na):
    validate_compatible(self.schemas)