def describe_runs(n=0, notebook=None, rule=None, session=None):
    """Returns a generator of descriptions for all the notebook runs.

    See :meth:`describe_run` for details of the description.

    Args:
      n (int): The number of runs to return or all runs if 0 (default: 0)
      notebook (str): If not None, return only runs of this notebook (default: None)
      rule (str): If not None, return only runs invoked by this rule (default: None)
      session (boto3.Session): The boto3 session to use. Will create a default session
          if not supplied (default: None).
    """
    session = ensure_session(session)
    client = session.client("sagemaker")

    paginator = client.get_paginator("list_processing_jobs")
    page_iterator = paginator.paginate(NameContains="papermill-")

    for page in page_iterator:
        for item in page["ProcessingJobSummaries"]:
            job_name = item["ProcessingJobName"]
            if not job_name.startswith("papermill-"):
                continue

            d = describe_run(job_name, session)
            if notebook is not None and notebook != d["Notebook"]:
                continue
            if rule is not None and rule != d["Rule"]:
                continue

            yield d

            if n > 0:
                n = n - 1
                if n == 0:
                    return
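
# A minimal usage sketch for describe_runs. The notebook filename below is a
# placeholder and example_recent_runs is an illustrative name, not part of the
# library; describe_runs matches the filename against the PAPERMILL_NOTEBOOK_NAME
# recorded on each Processing job.
def example_recent_runs():
    for run in describe_runs(n=5, notebook="analysis.ipynb"):
        print(run["Job"], run["Status"], run["Elapsed"])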
def wait_for_complete(job_name, progress=True, sleep_time=10, session=None):
    """Wait for a notebook execution job to complete.

    Args:
      job_name (str): The name of the SageMaker Processing Job executing the notebook. (Required)
      progress (boolean): If True, print a period after every poll attempt. (Default: True)
      sleep_time (int): The number of seconds between polls. (Default: 10)
      session (boto3.Session): A boto3 session to use. Will create a default session
          if not supplied. (Default: None)

    Returns:
      A tuple with the job status and the failure message, if any.
    """
    session = ensure_session(session)
    client = session.client("sagemaker")

    done = False
    while not done:
        if progress:
            print(".", end="")
        desc = client.describe_processing_job(ProcessingJobName=job_name)
        status = desc["ProcessingJobStatus"]
        if status != "InProgress":
            done = True
        else:
            time.sleep(sleep_time)
    if progress:
        print()
    return status, desc.get("ExitMessage")
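
# A minimal usage sketch for wait_for_complete, assuming a job started by
# execute_notebook or run_notebook. The job name default is a placeholder and
# example_wait is an illustrative name, not part of the library.
def example_wait(job_name="papermill-analysis-2020-10-21-20-00-11"):
    status, exit_message = wait_for_complete(job_name, progress=False, sleep_time=30)
    if status != "Completed":
        print(f"Job ended with status {status}: {exit_message}")
    return status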
def upload_notebook(notebook, session=None):
    """Uploads a notebook (and the other files in its directory) to S3 in the default
    SageMaker Python SDK bucket for this user.

    The files are uploaded under the prefix
    "s3://<bucket>/papermill_input/YYYY-MM-DD-hh-mm-ss/".

    Args:
      notebook (str): The filename of the notebook you want to upload. (Required)
      session (boto3.Session): A boto3 session to use. Will create a default session
          if not supplied. (Default: None)

    Returns:
      The resulting S3 prefix in URI format.
    """
    session = ensure_session(session)
    s3 = session.client("s3")
    bucket = default_bucket(session)
    prefix = f"papermill_input/{time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())}"

    directory, nb_filename = os.path.split(notebook)
    for root, dirs, files in os.walk(directory, followlinks=True):
        for filename in files:
            local_path = os.path.join(root, filename)
            relative_path = os.path.relpath(local_path, directory)
            s3_path = os.path.join(prefix, relative_path)
            try:
                # Skip files that already exist at this key.
                s3.head_object(Bucket=bucket, Key=s3_path)
            except botocore.exceptions.ClientError:
                s3.upload_file(local_path, bucket, s3_path)

    return f"s3://{bucket}/{prefix}/"
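
# A minimal usage sketch for upload_notebook. "analysis.ipynb" is a placeholder
# filename and example_upload is an illustrative name; the returned prefix is what
# execute_notebook expects as its input_path.
def example_upload():
    input_path = upload_notebook("analysis.ipynb")
    print(f"Uploaded to {input_path}")
    return input_path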
def __init__(self, max_jobs=20, session=None, log=None):
    self.session = ensure_session(session)
    self.client = self.session.client("sagemaker")
    self.log = log or logging.getLogger(__name__)
    self.max_jobs = max_jobs
    self.new_jobs = NewJobs(self.client)
    self.run_list = []
    self.in_progress = {}
def stop_run(job_name, session=None):
    """Stop the named processing job.

    Args:
      job_name (string): The name of the job to stop.
      session (boto3.Session): The boto3 session to use. Will create a default session
          if not supplied (default: None).
    """
    session = ensure_session(session)
    client = session.client("sagemaker")
    client.stop_processing_job(ProcessingJobName=job_name)
def save_csv_to_s3(df, csv_name):
    """Write the DataFrame to a local CSV file, upload it to the full_repo_scan prefix
    in the default SageMaker bucket, and return the resulting S3 URI."""
    session = ensure_session()
    df.to_csv(csv_name, index=False)

    s3 = session.client("s3")
    bucket = default_bucket(session)
    prefix = "full_repo_scan"
    s3_path = os.path.join(prefix, csv_name)
    s3.upload_file(csv_name, bucket, s3_path)

    return f"s3://{bucket}/{prefix}/{csv_name}"
def describe(job_name, session):
    """Get the status and exit message for a Processing job.

    Args:
      job_name (str): The name of the Processing job to describe.
      session (boto3.Session): The boto3 session to use. A default session is created
          if None is passed.

    Returns:
      (str, str): A tuple with the status and the exit message.
    """
    session = ensure_session(session)
    client = session.client("sagemaker")

    response = client.describe_processing_job(ProcessingJobName=job_name)
    return response["ProcessingJobStatus"], response.get("ExitMessage")
def run_notebook(
    image,
    notebook,
    parameters={},
    role=None,
    instance_type="ml.m5.large",
    output_prefix=None,
    output=".",
    session=None,
):
    """Run a notebook in SageMaker Processing producing a new output notebook.

    Args:
      image (str): The ECR image that defines the environment to run the job (required).
      notebook (str): The local notebook to upload and run (required).
      parameters (dict): The dictionary of parameters to pass to the notebook (default: {}).
      role (str): The name of a role to use to run the notebook (default: calls get_execution_role()).
      instance_type (str): The SageMaker instance to use for executing the job (default: ml.m5.large).
      output_prefix (str): The prefix path in S3 for where to store the output notebook
          (default: determined by get_output_prefix()).
      output (str): The directory to copy the output file to (default: the current working directory).
      session (boto3.Session): The boto3 session to use. Will create a default session
          if not supplied (default: None).

    Returns:
      A tuple with the processing job name, the job status, the path to the result
      notebook (or None if the job did not complete), and the failure reason (or None).
      The output notebook name is formed by adding a timestamp to the original notebook name.
    """
    session = ensure_session(session)
    if output_prefix is None:
        output_prefix = get_output_prefix()
    s3path = upload_notebook(notebook, session)
    job_name = execute_notebook(
        image=image,
        input_path=s3path,
        output_prefix=output_prefix,
        notebook=notebook,
        parameters=parameters,
        role=role,
        instance_type=instance_type,
        session=session,
    )
    print(f"Job {job_name} started")
    status, failure_reason = wait_for_complete(job_name, session=session)
    if status == "Completed":
        local = download_notebook(job_name, output=output, session=session)
    else:
        local = None
    return (job_name, status, local, failure_reason)
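
# An end-to-end usage sketch for run_notebook. The image and notebook names are
# placeholders that must exist in your account and working directory, and
# example_run is an illustrative name, not part of the library.
def example_run():
    job, status, local_nb, reason = run_notebook(
        image="notebook-runner",
        notebook="analysis.ipynb",
        parameters={"sample_count": 100},
    )
    if status == "Completed":
        print(f"{job} wrote {local_nb}")
    else:
        print(f"{job} ended with status {status}: {reason}")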
def get_output_notebook(job_name, session=None):
    """Get the name and S3 uri for an output notebook from a previously completed job.

    Args:
      job_name (str): The name of the SageMaker Processing Job that executed the notebook. (Required)
      session (boto3.Session): A boto3 session to use. Will create a default session
          if not supplied. (Default: None)

    Returns:
      (str, str): A tuple with the notebook name and S3 uri to the output notebook.
    """
    session = ensure_session(session)
    client = session.client("sagemaker")
    desc = client.describe_processing_job(ProcessingJobName=job_name)

    prefix = desc["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
    notebook = os.path.basename(desc["Environment"]["PAPERMILL_OUTPUT"])
    return notebook, f"{prefix}/{notebook}"
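
# A minimal usage sketch for get_output_notebook. The job name default is a
# placeholder and example_output_location is an illustrative name; the job must have
# recorded a PAPERMILL_OUTPUT value in its environment.
def example_output_location(job_name="papermill-analysis-2020-10-21-20-00-11"):
    notebook, uri = get_output_notebook(job_name)
    print(notebook, uri)
    return uri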
def download_all(lis, output=".", session=None):
    """Download each of the output notebooks from a list of previously completed jobs.

    Args:
      lis (list, pandas.Series, or pandas.DataFrame): A list of jobs or a pandas DataFrame
          with a "Job" column (as returned by :meth:`list_runs`). (Required)
      output (str): The directory to copy the output files to. (Default: the current working directory)
      session (boto3.Session): A boto3 session to use. Will create a default session
          if not supplied. (Default: None)

    Returns:
      The list of the filenames of the downloaded notebooks.
    """
    import pandas as pd  # pylint: disable=import-error

    if isinstance(lis, pd.DataFrame):
        lis = list(lis["Job"])
    elif isinstance(lis, pd.Series):
        lis = list(lis)

    session = ensure_session(session)
    return [download_notebook(job, output, session) for job in lis]
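
# A usage sketch for download_all, assuming the companion list_runs helper referenced
# in the docstring returns a DataFrame with a "Job" column; any such DataFrame or a
# plain list of job names works. example_download_recent is an illustrative name.
def example_download_recent():
    runs = list_runs()  # assumed companion helper; substitute your own job list if needed
    files = download_all(runs, output="./outputs")
    print(f"Downloaded {len(files)} notebooks")
    return files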
def main():
    """Submit a notebook-execution Processing job for every notebook in the repository
    and record the jobs in a CSV uploaded to S3."""
    args = parse_args(sys.argv[1:])
    skip_args = {
        "docker": args.skip_docker,
        "local_mode": args.skip_local,
        "fsx_esx": args.skip_filesystem,
    }

    notebook_names = parse.all_notebook_filenames()
    job_names = []
    kernel_names = []

    session = ensure_session()
    instance_type = args.instance or "ml.m5.xlarge"

    for notebook in notebook_names:
        if parse.is_notebook_skipped(notebook, skip_args):
            job_name = None
        else:
            image = kernels.kernel_image_for(notebook)
            s3path = upload_notebook(notebook, session)
            parameters = {"kms_key": kms_key()}
            job_name = execute_notebook(
                image=image,
                input_path=s3path,
                notebook=notebook,
                instance_type=instance_type,
                session=session,
                output_prefix=get_output_prefix(),
                parameters=parameters,
            )
            time.sleep(1)

        print(job_name)
        job_names.append(str(job_name))
        kernel_names.append(kernels.kernel_type_for(notebook))

    print("\n" * 2)
    print("-" * 100)
    print("\n" * 2)
    print(save_csv_to_s3(notebook_names, job_names, kernel_names))
def save_csv_to_s3(notebooks, job_names, kernels):
    """Build a CSV mapping each notebook to its Processing job and kernel, upload it to
    the full_repo_scan prefix in the default SageMaker bucket, and return the S3 URI."""
    session = ensure_session()
    df = pd.DataFrame({
        "filename": notebooks,
        "processing-job-name": job_names,
        "kernel": kernels,
    })
    csv_name = f"{time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())}.csv"
    df.to_csv(csv_name, index=False)

    s3 = session.client("s3")
    bucket = default_bucket(session)
    prefix = "full_repo_scan"
    s3_path = os.path.join(prefix, csv_name)
    try:
        # Only upload if an object doesn't already exist at this key.
        s3.head_object(Bucket=bucket, Key=s3_path)
    except botocore.exceptions.ClientError:
        s3.upload_file(csv_name, bucket, s3_path)

    return f"s3://{bucket}/{prefix}/{csv_name}"
def upload_fileobj(notebook_fileobj, session=None):
    """Uploads a file object to S3 in the default SageMaker Python SDK bucket for this user.

    The resulting S3 object will be named
    "s3://<bucket>/papermill_input/notebook-YYYY-MM-DD-hh-mm-ss.ipynb".

    Args:
      notebook_fileobj (fileobj): A file object (as returned from open) that is reading
          from the notebook you want to upload. (Required)
      session (boto3.Session): A boto3 session to use. Will create a default session
          if not supplied. (Default: None)

    Returns:
      The resulting object name in S3 in URI format.
    """
    session = ensure_session(session)
    snotebook = f"notebook-{time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())}.ipynb"

    s3 = session.client("s3")
    key = "papermill_input/" + snotebook
    bucket = default_bucket(session)
    s3path = f"s3://{bucket}/{key}"
    s3.upload_fileobj(notebook_fileobj, bucket, key)

    return s3path
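
# A minimal usage sketch for upload_fileobj. "analysis.ipynb" is a placeholder; the
# file is opened in binary mode, which is what boto3's upload_fileobj expects.
# example_upload_fileobj is an illustrative name, not part of the library.
def example_upload_fileobj():
    with open("analysis.ipynb", "rb") as f:
        return upload_fileobj(f)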
def main():
    """Read the CSV of submitted jobs, look up each Processing job's status, runtime,
    and error, and upload the augmented CSV to S3."""
    args = parse_args(sys.argv[1:])
    session = ensure_session()

    csv_filename = args.csv
    df = pd.read_csv(csv_filename, index_col=False)

    output_notebooks = []
    runtimes = []
    statuses = []
    errors = []
    dates = []

    sagemaker = session.client("sagemaker")

    for index, row in df.iterrows():
        job_name = row["processing-job-name"]
        if job_name == "None":
            uri = "None"
            runtime = 0
            status = "Skipped"
            error = "UsesDocker"
            date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        else:
            response = sagemaker.describe_processing_job(ProcessingJobName=job_name)
            notebook, uri = get_output_notebook(job_name, session)
            runtime = (
                response.get("ProcessingEndTime", datetime.now(timezone.utc))
                - response.get("ProcessingStartTime", datetime.now(timezone.utc))
            ).total_seconds()
            status = response.get("ProcessingJobStatus")
            date = response.get("ProcessingEndTime", datetime.now(timezone.utc)).strftime(
                "%Y-%m-%d"
            )
            error = response.get("ExitMessage")
            if error == "Kernel died":
                error = "KernelDied"
            elif error:
                lines = error.splitlines()
                error_message = lines[-1]
                # split(":", 1)[0] copes with messages that contain no ":" separator.
                error_type = error_message.split(":", 1)[0]
                error = error_type or "Uncategorized"

        output_notebooks.append(uri)
        runtimes.append(runtime)
        statuses.append(status)
        errors.append(error)
        dates.append(date)

        print(job_name)
        time.sleep(1)

    df["output"] = output_notebooks
    df["runtime"] = runtimes
    df["status"] = statuses
    df["error"] = errors
    df.insert(loc=0, column="date", value=dates)

    print("\n" * 2)
    print("-" * 100)
    print("\n" * 2)
    print(save_csv_to_s3(df, csv_filename))
def main():
    """Read the CSV of submitted jobs, look up each Processing job's status, runtime,
    error category, and error detail, and upload a new CSV with those columns to S3."""
    args = parse_args(sys.argv[1:])
    session = ensure_session()

    csv_filename = args.csv
    dataframe = pd.read_csv(csv_filename, index_col=False)

    output_notebooks = []
    runtimes = []
    statuses = []
    errors = []
    dates = []
    error_details = []

    sagemaker = session.client("sagemaker")

    for index, row in dataframe.iterrows():
        job_name = row["processing-job-name"]
        detail = None
        if job_name == "None":
            uri = "None"
            runtime = 0
            status = "Skipped"
            error = None
            date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        else:
            response = sagemaker.describe_processing_job(ProcessingJobName=job_name)
            date = response.get("ProcessingEndTime", datetime.now(timezone.utc)).strftime("%Y-%m-%d")
            notebook, uri = get_output_notebook(job_name, session)
            status = response.get("ProcessingJobStatus")
            runtime = (
                response.get("ProcessingEndTime", datetime.now(timezone.utc))
                - response.get("ProcessingStartTime", datetime.now(timezone.utc))
            ).total_seconds()
            if runtime < 0:
                runtime = 0

            error = response.get("ExitMessage")
            if error == "Kernel died":
                error = "KernelDied"
                detail = "kernel died"
            elif error:
                found_error_type = False
                valid_error_types = ("Exception:", "Error:", "InvalidArn:", "NotFound:", "InUse:")
                lines = error.splitlines()
                for line in reversed(lines):
                    if any(error_type in line for error_type in valid_error_types):
                        error_parsed = line.split(":", 1)
                        print("The following error was encountered while executing the notebook")
                        print(line)
                        error = error_parsed[0]
                        detail = error_parsed[1]
                        found_error_type = True
                        break
                if not found_error_type:
                    error = "Uncategorized"

            if status == "Stopped":
                error = "TimedOut"
                detail = "Notebook execution timed out"

        output_notebooks.append(uri)
        runtimes.append(runtime)
        statuses.append(status)
        errors.append(error)
        dates.append(date)
        error_details.append(detail)

        print(job_name)
        time.sleep(1)

    new_dataframe = pd.DataFrame({
        "date": dates,
        "filename": dataframe["filename"],
        "processing-job-name": dataframe["processing-job-name"],
        "kernel": dataframe["kernel"],
        "output": output_notebooks,
        "runtime": runtimes,
        "status": statuses,
        "error": errors,
        "error_detail": error_details,
    })

    print("\n" * 2)
    print("-" * 100)
    print("\n" * 2)
    print(save_csv_to_s3(new_dataframe, csv_filename))
def main():
    """Run every notebook touched by a pull request in SageMaker Processing, report the
    result of each job, and raise if any notebook fails to complete."""
    args = parse_args(sys.argv[1:])
    skip_args = {
        "docker": args.skip_docker,
        "local_mode": args.skip_local,
        "fsx_esx": args.skip_filesystem,
    }

    jobs = {}
    session = ensure_session()
    instance_type = args.instance or "ml.m5.xlarge"

    for notebook in parse.pr_notebook_filenames(args.pr):
        if parse.is_notebook_skipped(notebook, skip_args):
            job_name = None
        else:
            image = kernels.kernel_image_for(notebook)
            s3path = upload_notebook(notebook, session)
            parameters = {"kms_key": kms_key()}
            job_name = execute_notebook(
                image=image,
                input_path=s3path,
                notebook=notebook,
                role="SageMakerRole",
                instance_type=instance_type,
                session=session,
                output_prefix=get_output_prefix(),
                parameters=parameters,
            )
            time.sleep(1)
        jobs[notebook] = job_name

    failures = {}

    while jobs:
        for notebook in list(jobs):
            job_name = jobs[notebook]
            if not is_running(job_name, session):
                if job_name:
                    status, failure_reason = wait_for_complete(
                        job_name, progress=False, session=session
                    )
                else:
                    status, failure_reason = (
                        "Skipped",
                        "This notebook was skipped because it either uses Docker or Local Mode.",
                    )

                basename = os.path.basename(notebook)
                print("\n" * 2)
                print(f"* {basename} " + "*" * (97 - len(basename)))
                print("*")
                print(f"* {'job name':>11}: {str(job_name):<11}")
                print("*")
                print(f"* {'kernel':>11}: {kernels.kernel_type_for(notebook):<11}")
                print("*")
                print(f"* {'status':>11}: {status:<11}")
                print("*")
                if status != "Completed":
                    print(failure_reason)
                    if status != "Skipped":
                        failures[notebook] = failure_reason
                jobs.pop(notebook)
        time.sleep(10)

    print("\n" * 2)
    print("-" * 100)
    if failures:
        raise Exception(
            "One or more notebooks failed to execute. Please see above for error messages. "
            "If you need more information, please see the CloudWatch logs for the corresponding Processing job."
        )
def describe_run(job_name, session=None):
    """Describe a particular notebook run.

    Args:
      job_name (str): The name of the processing job that ran the notebook.
      session (boto3.Session): The boto3 session to use. Will create a default session
          if not supplied (default: None).

    Returns:
      A dictionary with keys for each element of the job description. For example::

      {'Notebook': 'scala-spark-test.ipynb',
       'Rule': '',
       'Parameters': '{"input": "s3://notebook-testing/const.txt"}',
       'Job': 'papermill-scala-spark-test-2020-10-21-20-00-11',
       'Status': 'Completed',
       'Failure': None,
       'Created': datetime.datetime(2020, 10, 21, 13, 0, 12, 817000, tzinfo=tzlocal()),
       'Start': datetime.datetime(2020, 10, 21, 13, 4, 1, 58000, tzinfo=tzlocal()),
       'End': datetime.datetime(2020, 10, 21, 13, 4, 55, 710000, tzinfo=tzlocal()),
       'Elapsed': datetime.timedelta(seconds=54, microseconds=652000),
       'Result': 's3://sagemaker-us-west-2-1234567890/papermill_output/scala-spark-test-2020-10-21-20-00-11.ipynb',
       'Input': 's3://sagemaker-us-west-2-1234567890/papermill_input/notebook-2020-10-21-20-00-08.ipynb',
       'Image': 'spark-scala-notebook-runner',
       'Instance': 'ml.m5.large',
       'Role': 'BasicExecuteNotebookRole-us-west-2'}
    """
    session = ensure_session(session)
    client = session.client("sagemaker")

    while True:
        try:
            desc = client.describe_processing_job(ProcessingJobName=job_name)
            break
        except botocore.exceptions.ClientError as e:
            # Back off and retry if the SageMaker API is throttling us.
            if e.response["Error"]["Code"] == "ThrottlingException":
                time.sleep(1)
            else:
                raise e

    status = desc["ProcessingJobStatus"]
    if status == "Completed":
        output_prefix = desc["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        notebook_name = os.path.basename(desc["Environment"]["PAPERMILL_OUTPUT"])
        result = f"{output_prefix}/{notebook_name}"
    else:
        result = None

    if status == "Failed":
        failure = desc["ExitMessage"]
    else:
        failure = None

    d = {}
    d["Notebook"] = desc["Environment"].get("PAPERMILL_NOTEBOOK_NAME", "")
    d["Rule"] = desc["Environment"].get("AWS_EVENTBRIDGE_RULE", "")
    d["Parameters"] = desc["Environment"].get("PAPERMILL_PARAMS", "")
    d["Job"] = job_name
    d["Status"] = status
    d["Failure"] = failure
    d["Created"] = desc["CreationTime"]
    d["Start"] = desc.get("ProcessingStartTime")
    d["End"] = desc.get("ProcessingEndTime")
    elapsed = None
    if d.get("Start") is not None and d.get("End") is not None:
        elapsed = d["End"] - d["Start"]
    d["Elapsed"] = elapsed
    d["Result"] = result
    d["Input"] = desc["ProcessingInputs"][0]["S3Input"]["S3Uri"]
    d["Image"] = abbreviate_image(desc["AppSpecification"]["ImageUri"])
    d["Instance"] = desc["ProcessingResources"]["ClusterConfig"]["InstanceType"]
    d["Role"] = abbreviate_role(desc["RoleArn"])

    return d
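
# A minimal usage sketch for describe_run, using a placeholder job name modeled on
# the docstring example. example_describe is an illustrative name, not part of the
# library.
def example_describe(job_name="papermill-scala-spark-test-2020-10-21-20-00-11"):
    run = describe_run(job_name)
    if run["Status"] == "Completed":
        print("Output notebook:", run["Result"])
    else:
        print("Status:", run["Status"], "failure:", run["Failure"])
    return run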
def execute_notebook(
    *,
    image,
    input_path,
    output_prefix,
    notebook,
    parameters,
    role=None,
    instance_type,
    session,
):
    """Start a SageMaker Processing job that runs the notebook at input_path with
    papermill and writes the executed copy under output_prefix. Returns the
    Processing job name."""
    session = ensure_session(session)

    if not role:
        role = get_execution_role(session)
    elif "/" not in role:
        account = session.client("sts").get_caller_identity()["Account"]
        role = f"arn:aws:iam::{account}:role/{role}"

    if "/" not in image:
        account = session.client("sts").get_caller_identity()["Account"]
        region = session.region_name
        image = f"{account}.dkr.ecr.{region}.amazonaws.com/{image}:latest"

    if notebook is None:
        notebook = input_path

    base = os.path.basename(notebook)
    nb_name, nb_ext = os.path.splitext(base)
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

    job_name = (
        ("papermill-" + re.sub(r"[^-a-zA-Z0-9]", "-", nb_name))[: 62 - len(timestamp)]
        + "-"
        + timestamp
    )
    input_directory = "/opt/ml/processing/input/"
    local_input = os.path.join(input_directory, os.path.basename(notebook))
    result = f"{nb_name}-{timestamp}{nb_ext}"
    local_output = "/opt/ml/processing/output/"

    api_args = {
        "ProcessingInputs": [
            {
                "InputName": "notebook",
                "S3Input": {
                    "S3Uri": input_path,
                    "LocalPath": input_directory,
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                },
            },
        ],
        "ProcessingOutputConfig": {
            "Outputs": [
                {
                    "OutputName": "result",
                    "S3Output": {
                        "S3Uri": output_prefix,
                        "LocalPath": local_output,
                        "S3UploadMode": "EndOfJob",
                    },
                },
            ],
        },
        "ProcessingJobName": job_name,
        "ProcessingResources": {
            "ClusterConfig": {
                "InstanceCount": 1,
                "InstanceType": instance_type,
                "VolumeSizeInGB": 40,
            }
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 7200},
        "AppSpecification": {
            "ImageUri": image,
            "ContainerArguments": [
                "run_notebook",
            ],
        },
        "RoleArn": role,
        "Environment": {},
    }

    api_args["Environment"]["PAPERMILL_INPUT"] = local_input
    api_args["Environment"]["PAPERMILL_OUTPUT"] = local_output + result
    if os.environ.get("AWS_DEFAULT_REGION") is not None:
        api_args["Environment"]["AWS_DEFAULT_REGION"] = os.environ["AWS_DEFAULT_REGION"]
    api_args["Environment"]["PAPERMILL_PARAMS"] = json.dumps(parameters)
    api_args["Environment"]["PAPERMILL_NOTEBOOK_NAME"] = notebook

    client = session.client("sagemaker")
    response = client.create_processing_job(**api_args)
    job_arn = response["ProcessingJobArn"]
    job = re.sub("^.*/", "", job_arn)
    return job
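
# A usage sketch for execute_notebook, which only starts the Processing job. The
# image, S3 paths, and notebook name below are placeholders, and example_execute is
# an illustrative name; pair the call with wait_for_complete to block until the job
# finishes.
def example_execute():
    session = ensure_session(None)
    job = execute_notebook(
        image="notebook-runner",
        input_path="s3://my-bucket/papermill_input/analysis.ipynb",
        output_prefix="s3://my-bucket/papermill_output",
        notebook="analysis.ipynb",
        parameters={"sample_count": 100},
        role=None,
        instance_type="ml.m5.large",
        session=session,
    )
    return wait_for_complete(job, session=session)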