def test_validate_shared_columns_fail(df_all_types, remove_metadata):
    """Schemas that disagree on a shared column's dtype must be rejected."""
    df_changed = df_all_types.copy()
    # Flip one column's dtype so the two schemas conflict on "uint16".
    df_changed["uint16"] = df_changed["uint16"].astype(float)
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df_changed, origin="2")
    if remove_metadata:
        schema1 = schema1.remove_metadata()
        schema2 = schema2.remove_metadata()
    with pytest.raises(ValueError) as exc:
        validate_shared_columns([schema1, schema2])
    assert str(exc.value).startswith(
        'Found incompatible entries for column "uint16"'
    )
def test_validate_shared_columns_no_share(df_all_types):
    """Schemas with pairwise disjoint column sets never conflict."""
    cols = df_all_types.columns
    # Three schemas over disjoint two-column slices: [0:2], [2:4], [4:6].
    schemas = [
        make_meta(df_all_types.loc[:, cols[lo:lo + 2]], origin=str(idx + 1))
        for idx, lo in enumerate(range(0, 6, 2))
    ]
    # Every prefix of the schema list (including the empty one) validates.
    for end in range(len(schemas) + 1):
        validate_shared_columns(schemas[:end])
def persist_common_metadata(partition_list, update_dataset, store, dataset_uuid):
    """Validate and persist one common schema per table across partitions.

    Collects the distinct schemas observed per table in ``partition_list``
    (plus, when updating, the schema already stored for the dataset),
    verifies they are mutually compatible, then stores the resulting common
    schema for every table.

    Parameters
    ----------
    partition_list:
        Metapartitions whose ``table_meta`` schemas are merged.
    update_dataset:
        Existing dataset metadata when updating; falsy when creating.
    store:
        KV store used to read and write schema metadata.
    dataset_uuid:
        UUID of the dataset the schemas belong to.

    Returns
    -------
    dict
        Mapping of table name to its validated common schema.

    Raises
    ------
    ValueError
        If the input partitions carry different tables than the existing
        dataset, or if the schemas of a table are incompatible.
    """
    # Hash the schemas for a quick equality check with possible false
    # negatives (e.g. other pandas version or null schemas).
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        # Any non-empty input must cover exactly the tables of the dataset.
        if tm_dct and set(update_dataset.tables) != set(tm_dct):
            raise ValueError(
                (
                    "Input partitions for update have different tables than dataset:\n"
                    "Input partition tables: {}\n"
                    "Tables of existing dataset: {}"
                ).format(set(tm_dct.keys()), update_dataset.tables)
            )
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(
                    dataset_uuid=dataset_uuid, store=store, table=table
                )
            )

    result = {}

    # Sort tables and schemas to have reproducible error messages.
    for table in sorted(tm_dct):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            # Chain the original error so the full context is preserved.
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}".format(
                    table=table, dataset_uuid=dataset_uuid, e=e
                )
            ) from e

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(
            schema=schema, dataset_uuid=dataset_uuid, store=store, table=table
        )
    return result
def check_datasets(
    datasets: Dict[str, DatasetMetadata], cube: Cube
) -> Dict[str, DatasetMetadata]:
    """
    Apply sanity checks to persisted Kartothek datasets.

    The following checks will be applied:

    - seed dataset present
    - metadata version correct
    - only the cube-specific table is present
    - partition keys are correct
    - no overlapping payload columns exists
    - datatypes are consistent
    - dimension columns are present everywhere
    - required index structures are present (more are allowed)
      - ``PartitionIndex`` for every partition key
      - for seed dataset, ``ExplicitSecondaryIndex`` for every dimension column
      - for all datasets, ``ExplicitSecondaryIndex`` for every index column

    Parameters
    ----------
    datasets
        Datasets.
    cube
        Cube specification.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        Same as input, but w/ partition indices loaded.

    Raises
    ------
    ValueError
        If sanity check failed.
    """
    if cube.seed_dataset not in datasets:
        raise ValueError('Seed data ("{}") is missing.'.format(cube.seed_dataset))

    _check_datasets(
        datasets=datasets,
        f=lambda ds: ds.metadata_version,
        expected=KTK_CUBE_METADATA_VERSION,
        what="metadata version",
    )

    # Load partition indices so the subsequent checks can inspect them.
    datasets = {name: ds.load_partition_indices() for name, ds in datasets.items()}

    _check_datasets(
        datasets=datasets,
        f=lambda ds: set(ds.table_meta.keys()),
        expected={SINGLE_TABLE},
        what="table",
    )
    _check_overlap(datasets, cube)

    # Check column types across all datasets.
    validate_shared_columns([ds.table_meta[SINGLE_TABLE] for ds in datasets.values()])

    _check_partition_columns(datasets, cube)
    _check_dimension_columns(datasets, cube)
    _check_indices(datasets, cube)
    return datasets
def test_validate_shared_columns_same(df_all_types):
    """Identical schemas validate; mixing stripped and unstripped pandas
    metadata fails unless the metadata is ignored or stripped uniformly."""
    schema_a = make_meta(df_all_types, origin="1")
    schema_b = make_meta(df_all_types, origin="2")
    schema_c = make_meta(df_all_types, origin="3").remove_metadata()

    # Any number of identical schemas (with metadata) is fine.
    validate_shared_columns([])
    validate_shared_columns([schema_a])
    validate_shared_columns([schema_a, schema_b])

    # A metadata-stripped schema conflicts with the unstripped ones ...
    with pytest.raises(ValueError):
        validate_shared_columns([schema_a, schema_b, schema_c])
    # ... unless pandas metadata is explicitly ignored ...
    validate_shared_columns([schema_a, schema_b, schema_c], ignore_pandas=True)
    # ... or all schemas are stripped alike.
    validate_shared_columns(
        [schema_a.remove_metadata(), schema_b.remove_metadata(), schema_c]
    )
def time_validate_shared_columns(self, num_schemas):
    # Timing benchmark: measures the runtime of validate_shared_columns over
    # the pre-built self.schemas. `num_schemas` is unused in the body —
    # presumably it parametrizes a setup() that builds self.schemas
    # (ASV-style benchmark naming); confirm against the enclosing class.
    validate_shared_columns(self.schemas)
def peakmem_validate_shared_columns(self, num_schemas):
    # Peak-memory benchmark: tracks peak memory use of validate_shared_columns
    # over the pre-built self.schemas. `num_schemas` is unused in the body —
    # presumably it parametrizes a setup() that builds self.schemas
    # (ASV-style benchmark naming); confirm against the enclosing class.
    validate_shared_columns(self.schemas)