def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = _instantiate_store(store_factory)

    if ds_factory:
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset
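
# A minimal usage sketch (the helper and names such as `metapartitions` and
# `factory` are assumptions for illustration, not part of the module API): a
# writer would typically call update_dataset_from_partitions once the
# metapartitions have been persisted to the store.
def _example_update_call(store_factory, factory, metapartitions):
    return update_dataset_from_partitions(
        metapartitions,
        store_factory=store_factory,
        dataset_uuid="example_dataset",
        ds_factory=factory,
        delete_scope=[],  # nothing is removed in this sketch
        metadata={"source": "example"},
        metadata_merger=None,  # downstream falls back to combine_metadata
    )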
def raise_if_dataset_exists(dataset_uuid, store):
    try:
        store_instance = _instantiate_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                raise RuntimeError(
                    "Dataset `{}` already exists and overwrite is not permitted!".format(
                        dataset_uuid
                    )
                )
    except KeyError:
        pass
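
# A minimal sketch (assumed helper, not part of the module API): write paths
# usually call raise_if_dataset_exists up front so an existing dataset is never
# overwritten unless the caller explicitly opted in.
def _example_guarded_create(store, dataset_uuid, overwrite=False):
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)
    # ... continue with store_dataset_from_partitions(...) once the check passed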
def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # backwards compat
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key,
            )
        elif isinstance(index, PartitionIndex):
            continue
        output_filenames[column] = index.store(
            store=store, dataset_uuid=dataset_uuid
        )
    return output_filenames
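
# A minimal sketch (assumed index data, not part of the module API): plain dicts
# take the backwards-compat branch above and are wrapped into an
# ExplicitSecondaryIndex before being written, while PartitionIndex instances
# are skipped because they can be rebuilt from the partition labels.
def _example_persist_indices(store, dataset_uuid):
    indices = {
        # legacy dict form: index value -> list of partition labels
        "country": {"DE": ["part_1"], "US": ["part_2"]},
        "product": ExplicitSecondaryIndex(
            column="product", index_dct={"A": ["part_1", "part_2"]}
        ),
    }
    return persist_indices(store=store, dataset_uuid=dataset_uuid, indices=indices)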
def align_datasets(left_dataset_uuid, right_dataset_uuid, store, match_how="exact"):
    """
    Determine dataset partition alignment

    Parameters
    ----------
    left_dataset_uuid : basestring
    right_dataset_uuid : basestring
    store : KeyValueStore or callable
    match_how : basestring or callable, {exact, prefix, all, callable}

    Yields
    ------
    list
    """
    store = _instantiate_store(store)
    left_dataset = DatasetMetadata.load_from_store(uuid=left_dataset_uuid, store=store)
    right_dataset = DatasetMetadata.load_from_store(
        uuid=right_dataset_uuid, store=store
    )

    metadata_version = left_dataset.metadata_version

    # Loop over the dataset with fewer partitions, treating its keys as
    # partition label prefixes
    if (
        callable(match_how)
        or match_how == "left"
        or (
            match_how == "prefix"
            and len(list(left_dataset.partitions.keys())[0])
            < len(list(right_dataset.partitions.keys())[0])
        )
    ):
        first_dataset = left_dataset
        second_dataset = right_dataset
    else:
        first_dataset = right_dataset
        second_dataset = left_dataset
    # The del statements are here to reduce confusion below
    del left_dataset
    del right_dataset

    # For every partition in the 'small' dataset, at least one partition match
    # needs to be found in the larger dataset.
    available_partitions = list(second_dataset.partitions.items())
    partition_stack = available_partitions[:]

    # TODO: write a test which protects against the following scenario!!
    # Sort the partition labels by length of the labels, starting with the
    # labels which are the longest. This way we prevent label matching for
    # similar partitions, e.g. cluster_100 and cluster_1. This, of course,
    # works only as long as the internal loop removes elements which were
    # matched already (here improperly called stack)
    for l_1 in sorted(first_dataset.partitions, key=len, reverse=True):
        p_1 = first_dataset.partitions[l_1]
        res = [
            MetaPartition.from_partition(
                partition=p_1, metadata_version=metadata_version
            )
        ]
        for parts in available_partitions:
            l_2, p_2 = parts
            if callable(match_how) and not match_how(l_1, l_2):
                continue
            if match_how == "exact" and l_1 != l_2:
                continue
            elif match_how == "prefix" and not l_2.startswith(l_1):
                LOGGER.debug("rejecting (%s, %s)", l_1, l_2)
                continue

            LOGGER.debug(
                "Found alignment between partitions (%s, %s) and (%s, %s)",
                first_dataset.uuid,
                p_1.label,
                second_dataset.uuid,
                p_2.label,
            )
            res.append(
                MetaPartition.from_partition(
                    partition=p_2, metadata_version=metadata_version
                )
            )

            # In exact or prefix matching schemes, it is expected to only
            # find one partition alignment. In this case reduce the size of
            # the inner loop
            if match_how in ["exact", "prefix"]:
                partition_stack.remove((l_2, p_2))
        # Need to copy, otherwise remove will alter the loop iterator
        available_partitions = partition_stack[:]
        if len(res) == 1:
            raise RuntimeError(
                "No matching partition for {} in dataset {} "
                "found".format(p_1, first_dataset)
            )
        yield res
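
# A minimal sketch (assumed dataset UUIDs, not part of the module API): with
# match_how="prefix" every label of the dataset driving the outer loop is
# matched against label prefixes in the other dataset. Each yielded item is a
# list of MetaPartitions whose first entry belongs to the driving dataset.
def _example_align(store, left_uuid, right_uuid):
    for aligned in align_datasets(
        left_dataset_uuid=left_uuid,
        right_dataset_uuid=right_uuid,
        store=store,
        match_how="prefix",
    ):
        driving_mp, *matched_mps = aligned
        LOGGER.debug(
            "%s matched %d partner partition(s)", driving_mp.label, len(matched_mps)
        )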
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = _instantiate_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid
    )

    # We can only check for non-unique partition labels here, and if they occur we
    # fail hard. The resulting dataset may be corrupted, or files may be left in the
    # store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )

    dataset = dataset_builder.to_dataset()
    return dataset
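
# A minimal sketch (assumed metapartition list, not part of the module API):
# storing a brand-new dataset takes the `update_dataset is None` branch above,
# so the metadata version and partition keys are derived from the first
# MetaPartition in the list.
def _example_store_new_dataset(store, metapartitions):
    return store_dataset_from_partitions(
        partition_list=metapartitions,
        store=store,
        dataset_uuid="example_dataset",
        dataset_metadata={"creation_job": "example"},
        metadata_storage_format="json",  # "msgpack" is also supported
    )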