def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> int:
    """Prepare volumes, Spark config, and environment, then run the Spark
    driver inside a docker container.

    Builds the bind-mount list from the instance/system config (skipping host
    paths that don't exist locally), derives a Spark app name and UI port,
    fetches AWS credentials, renders the Spark config into a CLI string,
    optionally reports resource requirements to Clusterman, and finally
    launches the container.

    :param args: parsed spark-run CLI arguments
    :param docker_img: docker image to run the driver in
    :param instance_config: PaaSTA instance config for the service
    :param system_paasta_config: cluster-wide PaaSTA configuration
    :returns: exit code of the docker container run
    :raises Boto3Error: re-raised from Clusterman metrics emission unless
        --suppress-clusterman-metrics-errors was passed
    """
    volumes = list()
    for volume in instance_config.get_volumes(system_paasta_config.get_volumes()):
        # Only bind paths that actually exist on this host; warn on stderr
        # about the rest instead of failing the whole run.
        if os.path.exists(volume["hostPath"]):
            volumes.append(
                "{}:{}:{}".format(
                    volume["hostPath"], volume["containerPath"], volume["mode"].lower()
                )
            )
        else:
            print(
                PaastaColors.yellow(
                    "Warning: Path %s does not exist on this host. Skipping this binding."
                    % volume["hostPath"]
                ),
                file=sys.stderr,
            )

    original_docker_cmd = args.cmd or instance_config.get_cmd()
    # UI port is seeded per service+pid — presumably to keep concurrent runs
    # on one host from colliding; TODO confirm in pick_random_port.
    spark_ui_port = pick_random_port(args.service + str(os.getpid()))
    spark_app_name = get_spark_app_name(original_docker_cmd, spark_ui_port)

    access_key, secret_key = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    # NOTE: `volumes` is intentionally passed here BEFORE the client-specific
    # appends below — only the instance/system volumes go into the executor
    # Spark config; the extra mounts are driver-container-only.
    spark_config_dict = get_spark_config(
        args=args,
        spark_app_name=spark_app_name,
        spark_ui_port=spark_ui_port,
        docker_img=docker_img,
        system_paasta_config=system_paasta_config,
        volumes=volumes,
        access_key=access_key,
        secret_key=secret_key,
    )
    spark_conf_str = create_spark_config_str(spark_config_dict, is_mrjob=args.mrjob)

    # Spark client specific volumes
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/etc/passwd:/etc/passwd:ro")
    volumes.append("/etc/group:/etc/group:ro")
    volumes.append("/nail/home:/nail/home:rw")

    environment = instance_config.get_env_dictionary()
    environment.update(
        get_spark_env(args, spark_conf_str, spark_ui_port, access_key, secret_key)
    )

    webui_url = get_webui_url(spark_ui_port)

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    # Tell the user where to watch the job, depending on what is being run.
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        print(f"\nSpark monitoring URL {webui_url}\n")

    # Best-effort resource-requirement reporting to Clusterman; failures are
    # fatal unless explicitly suppressed via CLI flag.
    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            emit_resource_requirements(spark_config_dict, args.cluster, webui_url)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    return run_docker_container(
        container_name=spark_app_name,
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
    cluster_manager: str,
    pod_template_path: str,
) -> int:
    """Run the Spark driver in a docker container against a pre-computed
    Spark configuration.

    Derives the driver's bind mounts from the executor volume settings in
    ``spark_conf`` (Mesos or Kubernetes form depending on
    ``cluster_manager``), builds the environment, prints monitoring /
    Signalfx / history-server URLs, estimates the job's hourly cost via
    Clusterman, and launches the container with memory/cpu limits.

    :param args: parsed spark-run CLI arguments
    :param docker_img: docker image to run the driver in
    :param instance_config: PaaSTA instance config for the service
    :param system_paasta_config: cluster-wide PaaSTA configuration
    :param spark_conf: fully-resolved Spark properties for this run
    :param aws_creds: (access key, secret key, session token) triple —
        presumably in that order; verify against get_aws_credentials
    :param cluster_manager: CLUSTER_MANAGER_MESOS or CLUSTER_MANAGER_K8S
    :param pod_template_path: host path of the executor pod template,
        mounted when compact bin packing is enabled
    :returns: exit code of the docker container run
    :raises Boto3Error: re-raised from Clusterman cost reporting unless
        --suppress-clusterman-metrics-errors was passed
    """
    # driver specific volumes
    volumes: List[str] = []

    docker_memory_limit = _calculate_docker_memory_limit(
        spark_conf, args.docker_memory_limit
    )
    docker_cpu_limit = _calculate_docker_cpu_limit(
        spark_conf,
        args.docker_cpu_limit,
    )

    if cluster_manager == CLUSTER_MANAGER_MESOS:
        # Mesos form: a single comma-separated property.
        volumes = (
            spark_conf.get("spark.mesos.executor.docker.volumes", "").split(",")
            if spark_conf.get("spark.mesos.executor.docker.volumes", "") != ""
            else []
        )
    elif cluster_manager == CLUSTER_MANAGER_K8S:
        # K8s form: one property triplet per volume, keyed by a numeric index.
        # NOTE(review): re.match(...).group(1) raises AttributeError if a
        # matching key's index is not purely numeric (`(\d+)` won't match) —
        # assumes upstream config generation only emits numeric indices;
        # TODO confirm.
        volume_names = [
            re.match(
                r"spark.kubernetes.executor.volumes.hostPath.(\d+).mount.path", key
            ).group(1)
            for key in spark_conf.keys()
            if "spark.kubernetes.executor.volumes.hostPath." in key
            and ".mount.path" in key
        ]
        for volume_name in volume_names:
            read_only = (
                "ro"
                if spark_conf.get(
                    f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.readOnly"
                )
                == "true"
                else "rw"
            )
            container_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.path"
            )
            host_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.options.path"
            )
            volumes.append(f"{host_path}:{container_path}:{read_only}")

    # Driver-only mounts, added after the executor-derived ones.
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    if args.enable_compact_bin_packing:
        volumes.append(f"{pod_template_path}:{pod_template_path}:rw")

    environment = instance_config.get_env_dictionary()  # type: ignore
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, spark_conf["spark.ui.port"])
    )  # type:ignore

    webui_url = get_webui_url(spark_conf["spark.ui.port"])
    webui_url_msg = f"\nSpark monitoring URL {webui_url}\n"

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    # Print (and log) the URLs relevant to what is being run.
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        signalfx_url = get_signalfx_url(spark_conf)
        signalfx_url_msg = f"\nSignalfx dashboard: {signalfx_url}\n"
        print(webui_url_msg)
        print(signalfx_url_msg)
        log.info(webui_url_msg)
        log.info(signalfx_url_msg)

        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            history_server_url_msg = (
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
                "Check y/spark-recent-history for faster access to prod logs\n"
            )
            print(history_server_url_msg)
            log.info(history_server_url_msg)
    print(f"Selected cluster manager: {cluster_manager}\n")

    # Estimate hourly cost from the requested resources; only the Mesos path
    # actually emits metrics to Clusterman.
    if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob):
        try:
            if cluster_manager == CLUSTER_MANAGER_MESOS:
                print("Sending resource request metrics to Clusterman")
                hourly_cost, resources = send_and_calculate_resources_cost(
                    clusterman_metrics, spark_conf, webui_url, args.pool
                )
            else:
                resources = get_resources_requested(spark_conf)
                hourly_cost = get_spark_hourly_cost(
                    clusterman_metrics,
                    resources,
                    spark_conf["spark.executorEnv.PAASTA_CLUSTER"],
                    args.pool,
                )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    final_spark_submit_cmd_msg = f"Final command: {docker_cmd}"
    print(PaastaColors.grey(final_spark_submit_cmd_msg))
    log.info(final_spark_submit_cmd_msg)
    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
        docker_memory_limit=docker_memory_limit,
        docker_cpu_limit=docker_cpu_limit,
    )
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
) -> int:
    """Launch the Spark driver inside a docker container.

    Reuses the executor volume list from the Mesos Spark config for the
    driver, adds driver-only mounts, builds the environment from the
    instance config plus the Spark settings, prints monitoring/Signalfx/
    history URLs, reports estimated cost to Clusterman, and runs the
    container.

    :returns: exit code of the docker container run
    :raises Boto3Error: re-raised from Clusterman reporting unless
        --suppress-clusterman-metrics-errors was passed
    """
    # driver specific volumes
    raw_volume_spec = spark_conf.get("spark.mesos.executor.docker.volumes", "")
    volumes = raw_volume_spec.split(",") if raw_volume_spec != "" else []
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    ui_port = spark_conf["spark.ui.port"]
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)

    environment = instance_config.get_env_dictionary()  # type: ignore
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, ui_port)
    )  # type:ignore

    webui_url = get_webui_url(ui_port)
    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)

    # Surface the right URLs for what is about to run.
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(token in docker_cmd for token in ("pyspark", "spark-shell", "spark-submit")):
        signalfx_url = get_signalfx_url(spark_conf)
        print(f"\nSpark monitoring URL {webui_url}\n")
        print(f"\nSignalfx dashboard: {signalfx_url}\n")
        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            print(
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
            )

    # Best-effort cost estimate via Clusterman; fatal unless suppressed.
    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            print("Sending resource request metrics to Clusterman")
            hourly_cost, resources = send_and_calculate_resources_cost(
                clusterman_metrics, spark_conf, webui_url, args.pool
            )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if not args.suppress_clusterman_metrics_errors:
                raise
            print(
                "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
            )

    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )