def configuration_file(tmp_path: Path) -> Path:
    """
    Creates a two-version data set under *tmp_path*: the data files themselves,
    a metadata.yaml listing each version with its verified hash, and a minimal
    config.yaml pointing at the directory.

    :param tmp_path: directory in which all files are created
    :return: path to the generated config.yaml
    """
    # write both data files and record their hashes keyed by version number
    hashes = {}
    for version in (1, 2):
        data_file = tmp_path / f"version{version}.txt"
        data_file.write_text(f"contents{version}")
        hashes[version] = FileAPI.calculate_hash(data_file)

    metadata_file = tmp_path / "metadata.yaml"
    metadata_file.write_text("""
- data_product: test
  version: 1.0.0
  filename: version1.txt
  verified_hash: {hash1}
- data_product: test
  version: 2.0.0
  filename: version2.txt
  verified_hash: {hash2}
""".format(hash1=hashes[1], hash2=hashes[2]))

    config_path = tmp_path / "config.yaml"
    config_path.write_text("""
data_directory: .
run_id: test_run
access_log: access.yaml
fail_on_hash_mismatch: True
""")
    return config_path
def _verify_hash(filename: Path, access_calculated_hash: str) -> None:
    """
    Verifies the hash of the file matches the calculated hash from the access log

    :param filename: file to verify the hash of
    :param access_calculated_hash: hash read from the access log for this filename
    :raises ValueError: if the recorded hash does not match the recalculated one
    """
    calculated_hash = FileAPI.calculate_hash(filename)
    if access_calculated_hash != calculated_hash:
        # name the offending file in the message (previously a literal
        # "(unknown)" placeholder) so the mismatch can actually be traced
        raise ValueError(
            f"access log contains hash {access_calculated_hash} but calculated hash of {filename} is {calculated_hash}"
        )
def upload_to_storage(
    remote_uri: str,
    storage_options: Dict[str, Any],
    data_directory: Path,
    filename: Path,
    upload_path: Optional[Union[str, Path]] = None,
    path_prefix: Optional[str] = None,
) -> str:
    """
    Uploads a file to the remote uri

    :param remote_uri: URI to the root of the storage
    :param storage_options: (key, value) pairs that are passed to the remote storage, e.g. credentials
    :param data_directory: root of the data directory read from the access log
    :param filename: file to upload
    :param upload_path: optional override to the upload path of the file
    :param path_prefix: Optional prefix onto the remote path, e.g. namespace
    :return: path of the file on the remote storage
    """
    parsed_uri = urllib.parse.urlsplit(remote_uri)
    protocol = parsed_uri.scheme

    # default the upload path to the file's location relative to the data
    # directory, then prepend the namespace prefix (if any)
    prefix = Path(path_prefix) if path_prefix else Path()
    if not upload_path:
        upload_path = filename.absolute().relative_to(data_directory.absolute())
    target = (prefix / upload_path).as_posix()

    fs, path = get_remote_filesystem_and_path(protocol, remote_uri, target, **storage_options)
    if protocol in {"file", "ssh", "sftp"}:
        fs.makedirs(Path(path).parent.as_posix(), exist_ok=True)

    # embed the content hash in the remote filename so different contents
    # never overwrite each other
    sha1 = FileAPI.calculate_hash(filename)
    stem, extension = os.path.splitext(path)
    path = f"{stem}_{sha1}{extension}"

    logger.info(f"Uploading {filename.as_posix()} to {path} on {remote_uri}")
    fs.put(filename.as_posix(), path)

    if path.startswith(remote_uri):
        # some remote filesystems expect the root uri in the path, others don't, but the registry path doesn't
        return path[len(remote_uri):]
    if path.startswith(parsed_uri.path):
        # some remote_uri's include part of what the fs considers the path, so strip it off
        return path[len(parsed_uri.path):]
    return path
def _upload_file_to_storage(
    posts: List[YamlDict],
    filename: Union[str, Path],
    remote_uri: str,
    storage_options: Dict[str, str],
    storage_root: Union[str, YamlDict],
    namespace: Optional[str] = None,
) -> YamlDict:
    """
    for a given filename, uploads it to the remote uri and returns a reference to the object that will be posted

    :param posts: List of posts to the data registry, will be modified
    :param filename: path to the file to upload
    :param remote_uri: URI to the root of the storage for uploading
    :param storage_options: (key, value) pairs that are passed to the remote storage, e.g. credentials
    :param storage_root: existing reference to the storage_root that this was uploaded to
    :param namespace: namespace of the file being uploaded, if provided will be prefixed onto the upload path
    :return: object reference to the uploaded file
    """
    local_path = Path(filename)
    # upload relative to the file's own parent directory
    remote_path = upload_to_storage(
        remote_uri, storage_options, local_path.parent, local_path, path_prefix=namespace
    )
    # register the storage_location first, then an object pointing at it
    location = _create_target_data_dict(
        DataRegistryTarget.storage_location,
        {
            DataRegistryField.path: remote_path,
            DataRegistryField.hash: FileAPI.calculate_hash(local_path),
            DataRegistryField.storage_root: storage_root,
        },
    )
    posts.append(location)
    obj = _create_target_data_dict(
        DataRegistryTarget.object, {DataRegistryField.storage_location: location}
    )
    posts.append(obj)
    return obj
def upload_data_product_cli(
    data_product_path,
    namespace,
    storage_root_name,
    storage_location_path,
    accessibility,
    data_product_name,
    data_product_description,
    data_product_version,
    component,
    data_registry,
    token,
    remote_uri,
    remote_option,
    remote_uri_override,
):
    """
    Uploads a data product file to remote storage and posts the data product
    (with its components) to the data registry.

    :param data_product_path: path to the local file to upload
    :param namespace: namespace the data product is registered under
    :param storage_root_name: name for the storage root; defaults to the netloc of the registered URI
    :param storage_location_path: optional override of the remote upload path
    :param accessibility: accessibility value written into the registry template
    :param data_product_name: name to register the data product as
    :param data_product_description: description of the data product
    :param data_product_version: explicit semver; if omitted the latest registered version is minor-bumped, or 0.1.0 is used
    :param component: iterable of (name, description) component pairs; if empty one component is created from the product name/description
    :param data_registry: registry URL, defaults to DEFAULT_DATA_REGISTRY_URL
    :param token: data registry access token
    :param remote_uri: URI to upload to
    :param remote_option: (key, value) pairs of extra options for the remote storage
    :param remote_uri_override: URI to register instead of remote_uri, defaults to remote_uri
    """
    configure_cli_logging()
    template_file = Path(__file__).parent / Path("templates/data_product.yaml")
    with open(template_file, "r") as f:
        template = f.read()
    data_registry = data_registry or DEFAULT_DATA_REGISTRY_URL
    remote_uri_override = remote_uri_override or remote_uri
    remote_uri = remote_uri.strip()
    remote_uri_override = remote_uri_override.strip()
    storage_root_name = storage_root_name or urllib.parse.urlparse(
        remote_uri_override).netloc
    storage_root = remote_uri_override
    remote_options = get_remote_options()
    arg_remote_options = dict(remote_option) if remote_option else {}
    remote_options.update(arg_remote_options)
    data_product_path = Path(data_product_path)
    storage_location_hash = FileAPI.calculate_hash(data_product_path)
    # FIX: pass the merged remote_options dict (configured defaults updated
    # with CLI-supplied pairs) rather than the raw remote_option tuple, which
    # was previously passed and left remote_options unused; upload_to_storage
    # expands storage_options with **, which requires a mapping.
    path = upload_to_storage(remote_uri,
                             remote_options,
                             data_product_path.parent,
                             data_product_path,
                             upload_path=storage_location_path,
                             path_prefix=namespace)
    namespace_ref = get_reference({DataRegistryField.name: namespace},
                                  DataRegistryTarget.namespace, data_registry,
                                  token)
    if namespace_ref:
        query = {
            DataRegistryField.name: data_product_name,
            DataRegistryField.namespace: namespace_ref
        }
        if data_product_version:
            query["version"] = data_product_version
        data_products = get_data(query, DataRegistryTarget.data_product,
                                 data_registry, token, False)
        if data_products:
            # bump the minor version of the latest registered release
            latest = next(iter(sort_by_semver(data_products)))
            data_product_version = str(
                semver.VersionInfo.parse(
                    latest[DataRegistryField.version]).bump_minor())
        elif not data_product_version:
            data_product_version = "0.1.0"
    populated_yaml = template.format(
        namespace=namespace,
        storage_root_name=storage_root_name,
        storage_root=storage_root,
        accessibility=accessibility,
        storage_location_path=path,
        storage_location_hash=storage_location_hash,
        data_product_name=data_product_name,
        data_product_description=data_product_description,
        data_product_version=data_product_version,
        component_name="COMPONENT_NAME",
        component_description="COMPONENT_DESCRIPTION",
    )
    config = yaml.safe_load(populated_yaml)
    # the last post in the template is the component stub; reuse it as a
    # template for each real component
    component_template = config["post"].pop(-1)
    if component:
        for component_name, component_description in component:
            c = component_template["data"].copy()
            c["name"] = component_name
            c["description"] = component_description
            config["post"].append({
                "data": c,
                "target": DataRegistryTarget.object_component
            })
    else:
        # no explicit components: register a single component mirroring the
        # data product itself
        c = component_template["data"].copy()
        c["name"] = data_product_name
        c["description"] = data_product_description
        config["post"].append({
            "data": c,
            "target": DataRegistryTarget.object_component
        })
    upload_from_config(config, data_registry, token)