def get_env(self):
    env = super().get_env()
    if self.get_executor() == "spark":
        env["EXECUTOR_CLUSTER"] = self.get_spark_paasta_cluster()
        env["EXECUTOR_POOL"] = self.get_spark_paasta_pool()
        # Run spark (and mesos framework) as root.
        env["SPARK_USER"] = "******"
        env["SPARK_OPTS"] = stringify_spark_env(self.get_spark_config_dict())
        env.update(get_mesos_spark_auth_env())
        env["CLUSTERMAN_RESOURCES"] = json.dumps(
            dict(
                get_spark_resource_requirements(
                    spark_config_dict=self.get_spark_config_dict(),
                    webui_url=get_webui_url(self.spark_ui_port),
                ).values()
            )
        )
        if "AWS_ACCESS_KEY_ID" not in env or "AWS_SECRET_ACCESS_KEY" not in env:
            try:
                access_key, secret_key = get_aws_credentials(
                    service=self.get_service(),
                    aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml"),
                )
                env["AWS_ACCESS_KEY_ID"] = access_key
                env["AWS_SECRET_ACCESS_KEY"] = secret_key
            except Exception:
                log.warning(
                    f"Cannot set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment "
                    f"variables for tron action {self.get_instance()} of service "
                    f"{self.get_service()} via credential file. Traceback:\n"
                    f"{traceback.format_exc()}"
                )
        if "AWS_DEFAULT_REGION" not in env:
            env["AWS_DEFAULT_REGION"] = DEFAULT_AWS_REGION
    return env

def get_env(self):
    env = super().get_env()
    if self.get_executor() == "spark":
        spark_config_dict = self.get_spark_config_dict()
        env["EXECUTOR_CLUSTER"] = self.get_spark_paasta_cluster()
        env["EXECUTOR_POOL"] = self.get_spark_paasta_pool()
        env["SPARK_OPTS"] = stringify_spark_env(spark_config_dict)
        # The actual mesos secret will be decrypted and injected on mesos master when assigning
        # tasks.
        env["SPARK_MESOS_SECRET"] = "SHARED_SECRET(SPARK_MESOS_SECRET)"
        if clusterman_metrics:
            env["CLUSTERMAN_RESOURCES"] = json.dumps(
                generate_clusterman_metrics_entries(
                    clusterman_metrics,
                    get_resources_requested(spark_config_dict),
                    spark_config_dict["spark.app.name"],
                    get_webui_url(spark_config_dict["spark.ui.port"]),
                )
            )
        else:
            env["CLUSTERMAN_RESOURCES"] = "{}"
        if "AWS_ACCESS_KEY_ID" not in env or "AWS_SECRET_ACCESS_KEY" not in env:
            try:
                access_key, secret_key, session_token = get_aws_credentials(
                    service=self.get_service(),
                    aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml"),
                )
                env["AWS_ACCESS_KEY_ID"] = access_key
                env["AWS_SECRET_ACCESS_KEY"] = secret_key
            except Exception:
                log.warning(
                    f"Cannot set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment "
                    f"variables for tron action {self.get_instance()} of service "
                    f"{self.get_service()} via credential file. Traceback:\n"
                    f"{traceback.format_exc()}"
                )
        if "AWS_DEFAULT_REGION" not in env:
            env["AWS_DEFAULT_REGION"] = DEFAULT_AWS_REGION
    return env

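# Both get_env variants above serialize the Spark conf dict into the SPARK_OPTS
# environment variable via stringify_spark_env, whose body is not shown here.
# The helper below is a minimal, hypothetical sketch of that behavior (assuming one
# "--conf key=value" pair per entry), not the actual paasta_tools implementation.
from typing import Mapping


def stringify_spark_env_sketch(spark_env: Mapping[str, str]) -> str:
    # Render each Spark configuration entry as a spark-submit style --conf flag.
    return " ".join(f"--conf {key}={value}" for key, value in spark_env.items())


# Example (illustrative values):
# stringify_spark_env_sketch({"spark.app.name": "my_job", "spark.ui.port": "33000"})
# -> "--conf spark.app.name=my_job --conf spark.ui.port=33000"
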
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
    cluster_manager: str,
    pod_template_path: str,
) -> int:
    # driver specific volumes
    volumes: List[str] = []

    docker_memory_limit = _calculate_docker_memory_limit(
        spark_conf, args.docker_memory_limit
    )
    docker_cpu_limit = _calculate_docker_cpu_limit(
        spark_conf,
        args.docker_cpu_limit,
    )

    if cluster_manager == CLUSTER_MANAGER_MESOS:
        volumes = (
            spark_conf.get("spark.mesos.executor.docker.volumes", "").split(",")
            if spark_conf.get("spark.mesos.executor.docker.volumes", "") != ""
            else []
        )
    elif cluster_manager == CLUSTER_MANAGER_K8S:
        volume_names = [
            re.match(
                r"spark.kubernetes.executor.volumes.hostPath.(\d+).mount.path", key
            ).group(1)
            for key in spark_conf.keys()
            if "spark.kubernetes.executor.volumes.hostPath." in key
            and ".mount.path" in key
        ]
        for volume_name in volume_names:
            read_only = (
                "ro"
                if spark_conf.get(
                    f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.readOnly"
                )
                == "true"
                else "rw"
            )
            container_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.path"
            )
            host_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.options.path"
            )
            volumes.append(f"{host_path}:{container_path}:{read_only}")

    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    if args.enable_compact_bin_packing:
        volumes.append(f"{pod_template_path}:{pod_template_path}:rw")

    environment = instance_config.get_env_dictionary()  # type: ignore
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, spark_conf["spark.ui.port"])
    )  # type:ignore

    webui_url = get_webui_url(spark_conf["spark.ui.port"])
    webui_url_msg = f"\nSpark monitoring URL {webui_url}\n"

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        signalfx_url = get_signalfx_url(spark_conf)
        signalfx_url_msg = f"\nSignalfx dashboard: {signalfx_url}\n"
        print(webui_url_msg)
        print(signalfx_url_msg)
        log.info(webui_url_msg)
        log.info(signalfx_url_msg)

        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            history_server_url_msg = (
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
                "Check y/spark-recent-history for faster access to prod logs\n"
            )
            print(history_server_url_msg)
            log.info(history_server_url_msg)
    print(f"Selected cluster manager: {cluster_manager}\n")

    if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob):
        try:
            if cluster_manager == CLUSTER_MANAGER_MESOS:
                print("Sending resource request metrics to Clusterman")
                hourly_cost, resources = send_and_calculate_resources_cost(
                    clusterman_metrics, spark_conf, webui_url, args.pool
                )
            else:
                resources = get_resources_requested(spark_conf)
                hourly_cost = get_spark_hourly_cost(
                    clusterman_metrics,
                    resources,
                    spark_conf["spark.executorEnv.PAASTA_CLUSTER"],
                    args.pool,
                )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    final_spark_submit_cmd_msg = f"Final command: {docker_cmd}"
    print(PaastaColors.grey(final_spark_submit_cmd_msg))
    log.info(final_spark_submit_cmd_msg)
    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
        docker_memory_limit=docker_memory_limit,
        docker_cpu_limit=docker_cpu_limit,
    )

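# The CLUSTER_MANAGER_K8S branch above rebuilds docker-style "host:container:mode"
# bindings from spark.kubernetes.executor.volumes.hostPath.<n>.* keys. Below is a
# self-contained sketch of that parsing; the function name and sample conf are
# illustrative, not part of the paasta_tools code.
import re
from typing import Dict, List


def extract_hostpath_volumes(spark_conf: Dict[str, str]) -> List[str]:
    volumes: List[str] = []
    volume_names = [
        re.match(
            r"spark.kubernetes.executor.volumes.hostPath.(\d+).mount.path", key
        ).group(1)
        for key in spark_conf
        if "spark.kubernetes.executor.volumes.hostPath." in key
        and ".mount.path" in key
    ]
    for name in volume_names:
        prefix = f"spark.kubernetes.executor.volumes.hostPath.{name}"
        # readOnly is stored as the string "true"/"false" in the Spark conf.
        mode = "ro" if spark_conf.get(f"{prefix}.mount.readOnly") == "true" else "rw"
        host_path = spark_conf.get(f"{prefix}.options.path")
        container_path = spark_conf.get(f"{prefix}.mount.path")
        volumes.append(f"{host_path}:{container_path}:{mode}")
    return volumes


# Example (made-up conf entries):
sample_conf = {
    "spark.kubernetes.executor.volumes.hostPath.0.mount.path": "/etc/passwd",
    "spark.kubernetes.executor.volumes.hostPath.0.options.path": "/etc/passwd",
    "spark.kubernetes.executor.volumes.hostPath.0.mount.readOnly": "true",
}
assert extract_hostpath_volumes(sample_conf) == ["/etc/passwd:/etc/passwd:ro"]
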
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> int:
    volumes = list()
    for volume in instance_config.get_volumes(system_paasta_config.get_volumes()):
        if os.path.exists(volume["hostPath"]):
            volumes.append(
                "{}:{}:{}".format(
                    volume["hostPath"], volume["containerPath"], volume["mode"].lower()
                )
            )
        else:
            print(
                PaastaColors.yellow(
                    "Warning: Path %s does not exist on this host. Skipping this binding."
                    % volume["hostPath"]
                ),
                file=sys.stderr,
            )

    original_docker_cmd = args.cmd or instance_config.get_cmd()
    spark_ui_port = pick_random_port(args.service + str(os.getpid()))
    spark_app_name = get_spark_app_name(original_docker_cmd, spark_ui_port)

    access_key, secret_key = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    spark_config_dict = get_spark_config(
        args=args,
        spark_app_name=spark_app_name,
        spark_ui_port=spark_ui_port,
        docker_img=docker_img,
        system_paasta_config=system_paasta_config,
        volumes=volumes,
        access_key=access_key,
        secret_key=secret_key,
    )
    spark_conf_str = create_spark_config_str(spark_config_dict, is_mrjob=args.mrjob)

    # Spark client specific volumes
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/etc/passwd:/etc/passwd:ro")
    volumes.append("/etc/group:/etc/group:ro")
    volumes.append("/nail/home:/nail/home:rw")

    environment = instance_config.get_env_dictionary()
    environment.update(
        get_spark_env(args, spark_conf_str, spark_ui_port, access_key, secret_key)
    )

    webui_url = get_webui_url(spark_ui_port)

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        print(f"\nSpark monitoring URL {webui_url}\n")

    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            emit_resource_requirements(spark_config_dict, args.cluster, webui_url)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    return run_docker_container(
        container_name=spark_app_name,
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )

def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
) -> int:
    # driver specific volumes
    volumes = (
        spark_conf.get("spark.mesos.executor.docker.volumes", "").split(",")
        if spark_conf.get("spark.mesos.executor.docker.volumes", "") != ""
        else []
    )
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    environment = instance_config.get_env_dictionary()  # type: ignore
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, spark_conf["spark.ui.port"])
    )  # type:ignore

    webui_url = get_webui_url(spark_conf["spark.ui.port"])

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        signalfx_url = get_signalfx_url(spark_conf)
        print(f"\nSpark monitoring URL {webui_url}\n")
        print(f"\nSignalfx dashboard: {signalfx_url}\n")
        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            print(
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
            )

    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            print("Sending resource request metrics to Clusterman")
            hourly_cost, resources = send_and_calculate_resources_cost(
                clusterman_metrics, spark_conf, webui_url, args.pool
            )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )

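# Neither send_and_calculate_resources_cost nor get_resources_requested is shown in this
# section; both report the total "cpus" and "mem" the job requests, which feed the cost
# message above. The helper below is a rough, hypothetical sketch of how such totals
# could be derived from a Spark conf (assuming instances * cores and
# instances * executor memory), not necessarily the formula the real helpers use.
from typing import Dict, Mapping


def sketch_resources_requested(spark_conf: Mapping[str, str]) -> Dict[str, int]:
    # Defaults here are illustrative, not Spark's or paasta's actual defaults.
    instances = int(spark_conf.get("spark.executor.instances", "2"))
    cores = int(spark_conf.get("spark.executor.cores", "1"))
    # Assume executor memory is given in gigabytes, e.g. "4g".
    memory_gb = int(spark_conf.get("spark.executor.memory", "1g").rstrip("g"))
    return {
        "cpus": instances * cores,
        "mem": instances * memory_gb * 1024,  # MB, to match the cost message above
    }


# Example (illustrative values):
# sketch_resources_requested({"spark.executor.instances": "10",
#                             "spark.executor.cores": "2",
#                             "spark.executor.memory": "4g"})
# -> {"cpus": 20, "mem": 40960}
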
def test_get_webui_url():
    with mock.patch("socket.getfqdn", return_value="1.2.3.4"):
        assert spark_tools.get_webui_url("1234") == "http://1.2.3.4:1234"