def _get_run_files(self, run_uuid, resource_type):
    """Locate the folder holding a run's files for one resource type.

    :param run_uuid: ID of the run whose files are requested.
    :param resource_type: One of ``"metric"``, ``"param"``, or ``"tag"``.
    :return: Tuple of (directory containing the resource files, list of
        file names relative to that directory, in POSIX form).
    :raises MlflowException: If the run's metadata is unreadable or
        ``resource_type`` is not recognized.
    """
    _validate_run_id(run_uuid)
    run_info = self._get_run_info(run_uuid)
    if run_info is None:
        raise MlflowException(
            "Run '%s' metadata is in invalid state." % run_uuid,
            databricks_pb2.INVALID_STATE)
    # Dispatch table instead of an if/elif chain. Raising MlflowException
    # (rather than a bare Exception) keeps error handling consistent with
    # the INVALID_STATE branch above and remains backward compatible:
    # MlflowException is still an Exception for any broad handler.
    subfolder_names = {
        "metric": FileStore.METRICS_FOLDER_NAME,
        "param": FileStore.PARAMS_FOLDER_NAME,
        "tag": FileStore.TAGS_FOLDER_NAME,
    }
    if resource_type not in subfolder_names:
        raise MlflowException(
            "Looking for unknown resource under run.",
            databricks_pb2.INVALID_PARAMETER_VALUE)
    subfolder_name = subfolder_names[resource_type]
    _, run_dir = self._find_run_root(run_uuid)
    # run_dir exists since run validity has been confirmed above.
    source_dirs = find(run_dir, subfolder_name, full_path=True)
    if len(source_dirs) == 0:
        return run_dir, []
    file_names = []
    for root, _, files in os.walk(source_dirs[0]):
        for name in files:
            abspath = os.path.join(root, name)
            file_names.append(os.path.relpath(abspath, source_dirs[0]))
    if sys.platform == "win32":
        # Turn metric relative path into metric name.
        # Metrics can have '/' in the name. On Windows, '/' is interpreted
        # as a separator, so when the path is read back it uses '\'.
        # Translate the path back into POSIX form.
        from mlflow.utils.file_utils import relative_path_to_artifact_path
        file_names = [relative_path_to_artifact_path(x) for x in file_names]
    return source_dirs[0], file_names
def log_artifacts(self, local_dir, artifact_path=None):
    """
    Parallelized implementation of `log_artifacts` for Databricks.

    Walks ``local_dir``, submits one upload per file to the thread pool,
    then joins every future and reports all failures together.

    :param local_dir: Local directory whose contents are uploaded.
    :param artifact_path: Optional artifact-root-relative destination
        directory; defaults to the artifact root itself.
    :raises MlflowException: After all uploads complete, if any of them
        failed.
    """
    artifact_path = artifact_path or ""
    inflight_uploads = {}
    for (dirpath, _, filenames) in os.walk(local_dir):
        # Destination subdirectory mirrors the local directory structure,
        # translated to POSIX separators.
        artifact_subdir = artifact_path
        if dirpath != local_dir:
            rel_path = os.path.relpath(dirpath, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            artifact_subdir = posixpath.join(artifact_path, rel_path)
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            upload_future = self.thread_pool.submit(
                self.log_artifact, file_path, artifact_subdir)
            inflight_uploads[file_path] = upload_future
    # Join futures to ensure that all artifacts have been uploaded prior to
    # returning. Collect failures instead of failing fast so one bad file
    # does not mask the others.
    failed_uploads = {}
    for (src_file_path, upload_future) in inflight_uploads.items():
        try:
            upload_future.result()
        except Exception as e:
            failed_uploads[src_file_path] = repr(e)
    if len(failed_uploads) > 0:
        raise MlflowException(message=(
            "The following failures occurred while uploading one or more artifacts"
            " to {artifact_root}: {failures}".format(
                artifact_root=self.artifact_uri,
                failures=failed_uploads,
            )))
def log_artifacts(self, local_dir, artifact_path=None):
    """Log every file under ``local_dir``, preserving its directory layout.

    :param local_dir: Local directory to walk recursively.
    :param artifact_path: Optional artifact-root-relative destination
        directory; defaults to the artifact root.
    """
    base_path = artifact_path or ""
    for current_dir, _, file_names in os.walk(local_dir):
        if current_dir == local_dir:
            dest_subdir = base_path
        else:
            # Translate the local relative path into POSIX form for the
            # artifact destination.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, local_dir))
            dest_subdir = posixpath.join(base_path, sub_path)
        for file_name in file_names:
            self.log_artifact(os.path.join(current_dir, file_name), dest_subdir)
def log_artifacts(self, local_dir, artifact_path=None):
    """Log every file under ``local_dir``, preserving its directory layout.

    :param local_dir: Local directory to walk recursively.
    :param artifact_path: Optional artifact-root-relative destination
        directory; when omitted, subdirectory paths are used directly.
    """
    walk_root = os.path.abspath(local_dir)
    for current_root, _, names in os.walk(walk_root):
        if current_root == walk_root:
            target_dir = artifact_path
        else:
            # Convert the local relative path to POSIX separators before
            # joining it onto the artifact destination.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_root, walk_root))
            if artifact_path:
                target_dir = posixpath.join(artifact_path, sub_path)
            else:
                target_dir = sub_path
        for name in names:
            self.log_artifact(os.path.join(current_root, name), target_dir)
def copy_artifacts(artifact_uri, artifact_path):
    """Copy every file under a DBFS-mounted artifact directory.

    :param artifact_uri: ``dbfs:/``-prefixed artifact root URI.
    :param artifact_path: Artifact-root-relative subdirectory to copy;
        falsy values are treated as the root itself.
    """
    # Normalize BEFORE building the local path: the original ordering passed
    # a possibly-None artifact_path to strip_prefix first, which would raise
    # before the `or ''` guard could take effect.
    artifact_path = artifact_path or ''
    local_dir = "/dbfs/%s/%s" % (
        strip_prefix(artifact_uri.rstrip('/'), 'dbfs:/'),
        strip_prefix(artifact_path, '/'))
    for (dirpath, _, filenames) in os.walk(local_dir):
        # Destination subdirectory mirrors the local layout in POSIX form.
        artifact_subdir = artifact_path
        if dirpath != local_dir:
            rel_path = os.path.relpath(dirpath, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            artifact_subdir = posixpath.join(artifact_path, rel_path)
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            _copy_artifact(file_path, artifact_uri, artifact_subdir)
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload the contents of ``local_dir`` to S3 under the artifact root.

    :param local_dir: Local directory whose files are uploaded recursively.
    :param artifact_path: Optional destination directory relative to the
        artifact root.
    """
    bucket, dest_path = data.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    s3_client = self._get_s3_client()
    walk_root = os.path.abspath(local_dir)
    for current_dir, _, file_names in os.walk(walk_root):
        if current_dir == walk_root:
            upload_prefix = dest_path
        else:
            # Object keys use POSIX separators regardless of platform.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, walk_root))
            upload_prefix = posixpath.join(dest_path, sub_path)
        for file_name in file_names:
            s3_client.upload_file(
                os.path.join(current_dir, file_name),
                bucket,
                posixpath.join(upload_prefix, file_name))
def list_artifacts(self, path=None):
    """List artifact files under ``path``, sorted by artifact path.

    :param path: Optional artifact-root-relative directory, expected in
        POSIX form. POSIX paths work on Windows too, but we normalize
        just in case.
    :return: Sorted list of file-info objects; empty when the directory
        does not exist.
    """
    target_dir = self.artifact_dir
    if path:
        target_dir = os.path.join(self.artifact_dir, os.path.normpath(path))
    if not os.path.isdir(target_dir):
        return []
    infos = []
    for file_path in list_all(target_dir, full_path=True):
        rel_path = os.path.relpath(file_path, self.artifact_dir)
        infos.append(
            get_file_info(file_path, relative_path_to_artifact_path(rel_path)))
    return sorted(infos, key=lambda info: info.path)
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload the contents of ``local_dir`` to Aliyun OSS under the
    artifact root.

    :param local_dir: Local directory whose files are uploaded recursively.
    :param artifact_path: Optional destination directory relative to the
        artifact root.
    """
    bucket, dest_path = self.parse_oss_uri(self.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    # Presumably initializes self.oss_bucket for the uploads below —
    # the return value is unused, matching the original call.
    self._get_oss_bucket(bucket)
    walk_root = os.path.abspath(local_dir)
    for current_dir, _, file_names in os.walk(walk_root):
        if current_dir == walk_root:
            upload_prefix = dest_path
        else:
            # Object keys use POSIX separators regardless of platform.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, walk_root))
            upload_prefix = posixpath.join(dest_path, sub_path)
        for file_name in file_names:
            self.oss_bucket.put_object_from_file(
                posixpath.join(upload_prefix, file_name),
                os.path.join(current_dir, file_name))
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload the contents of ``local_dir`` to GCS under the artifact root.

    :param local_dir: Local directory whose files are uploaded recursively.
    :param artifact_path: Optional destination directory relative to the
        artifact root.
    """
    bucket_name, dest_path = self.parse_gcs_uri(self.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    gcs_bucket = self.gcs.Client().get_bucket(bucket_name)
    walk_root = os.path.abspath(local_dir)
    for current_dir, _, file_names in os.walk(walk_root):
        if current_dir == walk_root:
            upload_prefix = dest_path
        else:
            # Blob names use POSIX separators regardless of platform.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, walk_root))
            upload_prefix = posixpath.join(dest_path, sub_path)
        for file_name in file_names:
            blob_name = posixpath.join(upload_prefix, file_name)
            gcs_bucket.blob(blob_name).upload_from_filename(
                os.path.join(current_dir, file_name))
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload every regular file under ``local_dir`` to the FTP server.

    Empty local directories are mirrored on the server via an explicit
    ``_mkdir``, since no file upload would create them implicitly.

    :param local_dir: Local directory to walk recursively.
    :param artifact_path: Optional destination directory; when given,
        root-level files land under ``posixpath.join(self.path, artifact_path)``.
    """
    dest_path = posixpath.join(self.path, artifact_path) \
        if artifact_path else self.path
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = os.path.relpath(root, local_dir)
            # NOTE(review): for subdirectories, upload_path becomes just the
            # local-relative path — dropping both self.path and
            # artifact_path — whereas at the root it is the full dest_path
            # (which already includes self.path). _mkdir below re-joins
            # self.path onto upload_path, and log_artifact presumably
            # resolves its destination similarly. Confirm this asymmetry is
            # intentional before changing it.
            upload_path = relative_path_to_artifact_path(rel_path)
        if not filenames:
            # Empty directory: create it on the server explicitly.
            with self.get_ftp_client() as ftp:
                self._mkdir(ftp, posixpath.join(self.path, upload_path))
        for f in filenames:
            # Skip non-regular entries (e.g. broken symlinks).
            if os.path.isfile(os.path.join(root, f)):
                self.log_artifact(os.path.join(root, f), upload_path)
def _get_resource_files(self, root_dir, subfolder_name):
    """Find the first ``subfolder_name`` directory under ``root_dir`` and
    list the files it contains.

    :param root_dir: Directory to search beneath.
    :param subfolder_name: Name of the resource subfolder to locate.
    :return: Tuple of (resource directory — or ``root_dir`` when no match
        exists, list of contained file names in POSIX form).
    """
    matches = find(root_dir, subfolder_name, full_path=True)
    if not matches:
        return root_dir, []
    resource_dir = matches[0]
    relative_names = [
        os.path.relpath(os.path.join(parent, file_name), resource_dir)
        for parent, _, contained in os.walk(resource_dir)
        for file_name in contained
    ]
    if sys.platform == "win32":
        # Turn metric relative path into metric name. Metrics can have '/'
        # in the name; on Windows os.path.relpath renders separators as
        # '\', so translate back into POSIX form.
        from mlflow.utils.file_utils import relative_path_to_artifact_path
        relative_names = [
            relative_path_to_artifact_path(name) for name in relative_names
        ]
    return resource_dir, relative_names
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload the contents of ``local_dir`` to GCS under the artifact root,
    using the configured chunk size and timeout for each blob.

    :param local_dir: Local directory whose files are uploaded recursively.
    :param artifact_path: Optional destination directory relative to the
        artifact root.
    """
    bucket_name, dest_path = self.parse_gcs_uri(self.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    gcs_bucket = self._get_bucket(bucket_name)
    walk_root = os.path.abspath(local_dir)
    for current_dir, _, file_names in os.walk(walk_root):
        if current_dir == walk_root:
            upload_prefix = dest_path
        else:
            # Blob names use POSIX separators regardless of platform.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, walk_root))
            upload_prefix = posixpath.join(dest_path, sub_path)
        for file_name in file_names:
            blob = gcs_bucket.blob(
                posixpath.join(upload_prefix, file_name),
                chunk_size=self._GCS_UPLOAD_CHUNK_SIZE)
            blob.upload_from_filename(
                os.path.join(current_dir, file_name),
                timeout=self._GCS_DEFAULT_TIMEOUT)
def log_artifacts_minio(
    run: mlflow.entities.Run,
    local_dir: str,
    artifact_path: str = None,
    delete_local: bool = True,
) -> None:
    """Upload local artefacts via Minio client

    This is needed as boto3 and Minio have problems with empty files. See
    - https://github.com/minio/minio/issues/5150
    - https://github.com/boto/botocore/pull/1328

    :param run: an active Mlflow Run
    :param local_dir: the path to the local directory with artifacts to log
        to Mlflow
    :param artifact_path: relative path of logged artifacts in Mlflow Run
        assets
    :param delete_local: whether to delete the local assets after logging
        them to Mlflow
    """
    bucket, dest_path = parse_s3_uri(run.info.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    # Credentials and endpoint come from the same environment variables the
    # Mlflow S3 client would use.
    client = Minio(
        urlparse(os.environ["MLFLOW_S3_ENDPOINT_URL"]).netloc,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        secure=False,
    )
    walk_root = os.path.abspath(local_dir)
    for current_dir, _, file_names in os.walk(walk_root):
        if current_dir == walk_root:
            upload_prefix = dest_path
        else:
            # Object keys use POSIX separators regardless of platform.
            sub_path = relative_path_to_artifact_path(
                os.path.relpath(current_dir, walk_root))
            upload_prefix = posixpath.join(dest_path, sub_path)
        for file_name in file_names:
            client.fput_object(
                bucket,
                posixpath.join(upload_prefix, file_name),
                os.path.join(current_dir, file_name))
    if delete_local:
        shutil.rmtree(walk_root)
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload the contents of ``local_dir`` to OCI object storage under
    the artifact root.

    :param local_dir: Local directory whose files are uploaded recursively.
    :param artifact_path: Optional destination directory relative to the
        artifact root.
    """
    # Local import keeps this fix self-contained in case the module does
    # not already import posixpath at the top of the file.
    import posixpath
    bucket, ns, dest_path = self.parse_os_uri(self.artifact_uri)
    if artifact_path:
        # Remote object keys must use POSIX separators; os.path.join (as
        # originally used here) would emit backslashes on Windows, unlike
        # every other artifact repository in this codebase.
        dest_path = posixpath.join(dest_path, artifact_path)
    os_client = self._get_os_client()
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = os.path.relpath(root, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            upload_path = posixpath.join(dest_path, rel_path)
        for f in filenames:
            self._upload_file(
                os_client,
                local_file=os.path.join(root, f),
                bucket=bucket,
                ns=ns,
                dest_path=posixpath.join(upload_path, f),
            )
def log_artifacts(self, local_dir, artifact_path=None):
    """Upload every regular file under ``local_dir``, nesting the uploads
    inside a directory named after ``local_dir``'s final path component.

    Empty local directories are mirrored remotely via ``_mkdir``.

    :param local_dir: Local directory to walk recursively.
    :param artifact_path: Optional destination directory relative to
        ``self.path``.
    """
    dest_path = posixpath.join(self.path, artifact_path) \
        if artifact_path else self.path
    # Root-level files go under <dest_path>/<basename(local_dir)>.
    dest_path = posixpath.join(dest_path, os.path.split(local_dir)[1])
    # dest_path_re is the same destination WITHOUT the self.path prefix;
    # it is used for subdirectory uploads below, which re-join self.path
    # only in the _mkdir call.
    dest_path_re = os.path.split(local_dir)[1]
    if artifact_path:
        dest_path_re = posixpath.join(
            artifact_path, os.path.split(local_dir)[1])
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = os.path.relpath(root, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            # NOTE(review): subdirectory uploads use the self.path-less
            # dest_path_re while root uploads use the full dest_path —
            # log_artifact presumably resolves the two forms consistently;
            # confirm before restructuring.
            upload_path = posixpath.join(dest_path_re, rel_path)
        if not filenames:
            # Empty directory: create it remotely, as no upload will.
            self._mkdir(posixpath.join(self.path, upload_path))
        for f in filenames:
            # Skip non-regular entries (e.g. broken symlinks).
            if os.path.isfile(os.path.join(root, f)):
                self.log_artifact(os.path.join(root, f), upload_path)
def _relative_path_local(base_dir, subdir_path):
    """Return ``subdir_path`` relative to ``base_dir`` as an artifact
    (POSIX-form) path, or ``None`` when no relative path exists.
    """
    local_rel = _relative_path(base_dir, subdir_path, os.path)
    if local_rel is None:
        return None
    return relative_path_to_artifact_path(local_rel)
def log_artifacts(self, local_dir, artifact_path=None):
    """
    Parallelized implementation of `log_artifacts` for Databricks.

    Stages every file under ``local_dir``, fetches write credentials for
    all destinations in one batched call, uploads in parallel on the
    thread pool, then joins every future and reports all failures
    together.

    :param local_dir: Local directory whose contents are uploaded.
    :param artifact_path: Optional artifact-root-relative destination
        directory; defaults to the artifact root itself.
    :raises MlflowException: After all uploads complete, if any of them
        failed.
    """
    StagedArtifactUpload = namedtuple(
        "StagedArtifactUpload",
        [
            # Local filesystem path of the source file to upload
            "src_file_path",
            # Run-relative artifact path specifying the upload destination
            "dst_run_relative_artifact_path",
        ],
    )
    artifact_path = artifact_path or ""
    staged_uploads = []
    for (dirpath, _, filenames) in os.walk(local_dir):
        # Destination subdirectory mirrors the local layout in POSIX form.
        artifact_subdir = artifact_path
        if dirpath != local_dir:
            rel_path = os.path.relpath(dirpath, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            artifact_subdir = posixpath.join(artifact_path, rel_path)
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            dst_run_relative_artifact_path = (
                self._get_run_relative_artifact_path_for_upload(
                    src_file_path=file_path,
                    dst_artifact_dir=artifact_subdir,
                ))
            staged_uploads.append(
                StagedArtifactUpload(
                    src_file_path=file_path,
                    dst_run_relative_artifact_path=dst_run_relative_artifact_path,
                ))
    # One batched credential request for all staged destinations avoids a
    # round trip per file.
    write_credential_infos = self._get_write_credential_infos(
        run_id=self.run_id,
        paths=[
            staged_upload.dst_run_relative_artifact_path
            for staged_upload in staged_uploads
        ],
    )
    inflight_uploads = {}
    for staged_upload, write_credential_info in zip(
            staged_uploads, write_credential_infos):
        upload_future = self.thread_pool.submit(
            self._upload_to_cloud,
            cloud_credential_info=write_credential_info,
            src_file_path=staged_upload.src_file_path,
            dst_run_relative_artifact_path=(
                staged_upload.dst_run_relative_artifact_path),
        )
        inflight_uploads[staged_upload.src_file_path] = upload_future
    # Join futures to ensure that all artifacts have been uploaded prior to
    # returning. Collect failures instead of failing fast so one bad file
    # does not mask the others.
    failed_uploads = {}
    for (src_file_path, upload_future) in inflight_uploads.items():
        try:
            upload_future.result()
        except Exception as e:
            failed_uploads[src_file_path] = repr(e)
    if len(failed_uploads) > 0:
        raise MlflowException(message=(
            "The following failures occurred while uploading one or more artifacts"
            " to {artifact_root}: {failures}".format(
                artifact_root=self.artifact_uri,
                failures=failed_uploads,
            )))