def test_set_yarn_spark_resource_config_fallback(
    patched_virtual_memory,
    patched_cpu_count,
    patched_yarn_config,
    patched_spark_config,
    default_bootstrapper: Bootstrapper,
) -> None:
    mocked_virtual_memory_total = PropertyMock(return_value=123 * 1024 * 1024)
    type(patched_virtual_memory.return_value).total = mocked_virtual_memory_total
    patched_cpu_count.return_value = 456

    default_bootstrapper.load_processing_job_config = MagicMock(
        return_value=None)
    default_bootstrapper.load_instance_type_info = MagicMock(return_value=None)
    default_bootstrapper.get_yarn_spark_resource_config = MagicMock(
        return_value=(patched_yarn_config, patched_spark_config))

    default_bootstrapper.set_yarn_spark_resource_config()

    patched_virtual_memory.assert_called_once()
    mocked_virtual_memory_total.assert_called_once()
    patched_cpu_count.assert_called_once()

    default_bootstrapper.load_processing_job_config.assert_called_once()
    default_bootstrapper.load_instance_type_info.assert_called_once()
    default_bootstrapper.get_yarn_spark_resource_config.assert_called_once_with(
        1, 123, 456)
    patched_yarn_config.write_config.assert_called_once()
    patched_spark_config.write_config.assert_called_once()
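# Note: the patched_* arguments above are injected by mock patches that this
# snippet does not show. Below is a minimal sketch of how such a fallback test
# could be wired, assuming the bootstrapper falls back to psutil.virtual_memory()
# and psutil.cpu_count(); the patch targets and import paths are assumptions,
# not the project's actual decorators.
from unittest.mock import MagicMock, PropertyMock, patch

from smspark.bootstrapper import Bootstrapper  # assumed import path


@patch("smspark.bootstrapper.psutil.cpu_count")       # assumed patch target
@patch("smspark.bootstrapper.psutil.virtual_memory")  # assumed patch target
def test_fallback_wiring_sketch(patched_virtual_memory, patched_cpu_count) -> None:
    # virtual_memory() returns an object whose .total (bytes) the bootstrapper reads.
    type(patched_virtual_memory.return_value).total = PropertyMock(return_value=123 * 1024 * 1024)
    patched_cpu_count.return_value = 456

    bootstrapper = Bootstrapper({"current_host": "algo-1", "hosts": ["algo-1"]})
    bootstrapper.load_processing_job_config = MagicMock(return_value=None)
    bootstrapper.load_instance_type_info = MagicMock(return_value=None)
    bootstrapper.get_yarn_spark_resource_config = MagicMock(return_value=(MagicMock(), MagicMock()))

    bootstrapper.set_yarn_spark_resource_config()

    # With no job config or instance-type info, instance count defaults to 1 and
    # memory (MiB) / cores come from the patched psutil values.
    bootstrapper.get_yarn_spark_resource_config.assert_called_once_with(1, 123, 456)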
def test_set_regional_configs(patched_config,
                              default_bootstrapper: Bootstrapper) -> None:
    default_bootstrapper.get_regional_configs = MagicMock(
        return_value=[patched_config])
    default_bootstrapper.set_regional_configs()
    default_bootstrapper.get_regional_configs.assert_called_once()
    patched_config.write_config.assert_called_once()
def test_start_hadoop_daemons_on_worker(patched_popen, patched_call) -> None:
    worker_bootstrapper = Bootstrapper(resource_config={"current_host": "algo-2", "hosts": ["algo-1", "algo-2"]})
    worker_bootstrapper.start_hadoop_daemons()

    expected_subprocess_calls = [
        call("rm -rf /opt/amazon/hadoop/hdfs/datanode && mkdir -p /opt/amazon/hadoop/hdfs/datanode", shell=True,),
    ]

    assert patched_call.call_args_list == expected_subprocess_calls

    expected_subprocess_popens = [
        call("hdfs datanode", shell=True),
        call("yarn nodemanager", shell=True),
    ]

    assert patched_popen.call_args_list == expected_subprocess_popens
def test_get_regional_configs_gov(patched_getenv, default_bootstrapper: Bootstrapper) -> None:
    patched_getenv.return_value = "us-gov-west-1"
    regional_configs_list = default_bootstrapper.get_regional_configs()
    assert len(regional_configs_list) == 1
    assert regional_configs_list[0] == Configuration(
        Classification="core-site", Properties={"fs.s3a.endpoint": "s3.us-gov-west-1.amazonaws.com"}
    )
    patched_getenv.assert_called_once_with("AWS_REGION")
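# A minimal sketch of get_regional_configs consistent with this test and the
# missing-region test further down: read AWS_REGION, return nothing when it is
# unset, and emit a core-site endpoint override for GovCloud regions. The import
# path and the exact set of regions needing an override are assumptions; the
# behavior for other regions is not pinned down by these tests.
import os
from typing import List

from smspark.config import Configuration  # assumed import path


def get_regional_configs_sketch() -> List[Configuration]:
    region = os.getenv("AWS_REGION")
    if not region:
        # No region in the environment: nothing to override.
        return []
    if region.startswith("us-gov-"):
        # GovCloud needs an explicit S3 endpoint for s3a.
        return [
            Configuration(
                Classification="core-site",
                Properties={"fs.s3a.endpoint": f"s3.{region}.amazonaws.com"},
            )
        ]
    return []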
def test_load_processing_job_config(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    exp_config = {"ProcessingResources": {"ClusterConfig": {"InstanceType": "foo.xbar", "InstanceCount": 123}}}

    patched_exists.return_value = True
    with patch("smspark.bootstrapper.open", mock_open(read_data=json.dumps(exp_config))) as m:
        actual_config = default_bootstrapper.load_processing_job_config()
    assert actual_config == exp_config
    patched_exists.assert_called_once_with(Bootstrapper.PROCESSING_JOB_CONFIG_PATH)
    m.assert_called_once_with(Bootstrapper.PROCESSING_JOB_CONFIG_PATH, "r")
def test_set_yarn_spark_resource_config(
    patched_yarn_config, patched_spark_config, default_bootstrapper: Bootstrapper
) -> None:
    processing_job_config = {
        "ProcessingResources": {"ClusterConfig": {"InstanceType": "foo.xbar", "InstanceCount": 123}}
    }
    instance_type_info = {"foo.xbar": {"MemoryInfo": {"SizeInMiB": 456}, "VCpuInfo": {"DefaultVCpus": 789}}}
    default_bootstrapper.load_processing_job_config = MagicMock(return_value=processing_job_config)
    default_bootstrapper.load_instance_type_info = MagicMock(return_value=instance_type_info)
    default_bootstrapper.get_yarn_spark_resource_config = MagicMock(
        return_value=(patched_yarn_config, patched_spark_config)
    )

    default_bootstrapper.set_yarn_spark_resource_config()

    default_bootstrapper.load_processing_job_config.assert_called_once()
    default_bootstrapper.load_instance_type_info.assert_called_once()
    default_bootstrapper.get_yarn_spark_resource_config.assert_called_once_with(123, 456, 789)
    patched_yarn_config.write_config.assert_called_once()
    patched_spark_config.write_config.assert_called_once()
def test_load_instance_type_info(patched_exists, default_bootstrapper: Bootstrapper) -> None:
    raw_config = [
        {"InstanceType": "foo.xlarge", "foo": "bar"},
        {"InstanceType": "bar.xlarge", "bar": "foo",},
    ]
    exp_config = {"foo.xlarge": {"foo": "bar"}, "bar.xlarge": {"bar": "foo"}}

    patched_exists.return_value = True
    with patch("smspark.bootstrapper.open", mock_open(read_data=json.dumps(raw_config))) as m:
        actual_config = default_bootstrapper.load_instance_type_info()
    assert actual_config == exp_config
    patched_exists.assert_called_once_with(Bootstrapper.INSTANCE_TYPE_INFO_PATH)
    m.assert_called_once_with(Bootstrapper.INSTANCE_TYPE_INFO_PATH, "r")
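# The expected re-keying above (a list of per-instance dicts turned into a
# mapping keyed by InstanceType) is a small transformation; the sketch below
# mirrors what the test expects rather than the library's actual code, and the
# default path is a placeholder.
import json
import os
from typing import Any, Dict


def load_instance_type_info_sketch(path: str = "/opt/ml/instance-type-info.json") -> Dict[str, Any]:
    # Fallback behavior (see the *_fallback test below): missing file -> empty dict.
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        raw = json.load(f)
    # Use InstanceType as the key and keep the remaining fields as the value.
    return {
        entry["InstanceType"]: {k: v for k, v in entry.items() if k != "InstanceType"}
        for entry in raw
    }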
def start_history_server(event_logs_s3_uri: str) -> None:
    """Bootstrap the history server instance and starts the Spark history server instance."""
    bootstrapper = Bootstrapper()
    log.info("copying aws jars")
    bootstrapper.copy_aws_jars()
    log.info("copying cluster config")
    bootstrapper.copy_cluster_config()
    log.info("setting regional configs")
    bootstrapper.set_regional_configs()
    log.info("copying history server config")
    config_history_server(event_logs_s3_uri)
    log.info("bootstrap master node")
    bootstrapper.start_spark_standalone_primary()

    try:
        subprocess.run("sbin/start-history-server.sh", check=True)
    except subprocess.CalledProcessError as e:
        raise AlgorithmError(message=e.stderr.decode(sys.getfilesystemencoding()), caused_by=e, exit_code=e.returncode)
    except Exception as e:
        log.error("Exception during processing: " + str(e) + "\n" + traceback.format_exc())
        raise AlgorithmError(
            message="error occurred during start-history-server execution. Please see logs for details.", caused_by=e,
        )
def test_get_yarn_spark_resource_config(
        default_bootstrapper: Bootstrapper) -> None:
    # Using a cluster with one single m5.xlarge instance, calculate Yarn and Spark configs, and double check the math
    instance_mem_mb = 16384
    instance_cores = 4
    yarn_config, spark_config = default_bootstrapper.get_yarn_spark_resource_config(
        1, instance_mem_mb, instance_cores)

    exp_yarn_max_mem_mb = 15892  # = int(instance_mem_mb * .97) = int(16384 * .97) = int(15892.48)

    exp_yarn_config_props = {
        "yarn.scheduler.minimum-allocation-mb": "1",
        "yarn.scheduler.maximum-allocation-mb": str(exp_yarn_max_mem_mb),
        "yarn.scheduler.minimum-allocation-vcores": "1",
        "yarn.scheduler.maximum-allocation-vcores": str(instance_cores),
        "yarn.nodemanager.resource.memory-mb": str(exp_yarn_max_mem_mb),
        "yarn.nodemanager.resource.cpu-vcores": str(instance_cores),
    }

    assert yarn_config.Classification == "yarn-site"
    assert yarn_config.Properties == exp_yarn_config_props

    exp_executor_cores = 4  # = instance_cores = 4
    exp_executor_count_total = 1  # = instance_count * executor_count_per_instance = 1 * 1
    exp_default_parallelism = 8  # = instance_count * instance_cores * 2 = 1 * 4 * 2

    exp_driver_mem_mb = 2048  # = 2 * 1024
    exp_driver_mem_ovr_mb = 204  # = int(driver_mem_mb * driver_mem_ovr_pct) = int(2048 * 0.1) = int(204.8)
    # = int((instance_mem_mb - driver_mem_mb - driver_mem_ovr_mb) /
    #       (executor_count_per_instance + executor_count_per_instance * executor_mem_ovr_pct))
    # = int((15892 - 2048 - 204) / (1 + 1 * 0.1))
    # = int(13640 / 1.1)
    exp_executor_mem_mb = 12399
    exp_executor_mem_ovr_mb = 1239  # = int(executor_mem_mb * executor_mem_ovr_pct) = int(12399 * 0.1) = int(1239.9)

    exp_driver_gc_config = (
        "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "
        "-XX:+CMSClassUnloadingEnabled")
    exp_driver_java_opts = "-XX:OnOutOfMemoryError='kill -9 %p' " f"{exp_driver_gc_config}"

    # ConcGCThreads = max(int(executor_cores / 4), 1) = max(int(4 / 4), 1) = max(1, 1) = 1
    # ParallelGCThreads = max(int(3 * executor_cores / 4), 1) = max(int(3 * 4 / 4), 1) = max(3, 1) = 3
    exp_executor_gc_config = (
        "-XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 "
        "-XX:ConcGCThreads=1 "
        "-XX:ParallelGCThreads=3 ")
    exp_executor_java_opts = (
        "-verbose:gc -XX:OnOutOfMemoryError='kill -9 %p' "
        "-XX:+PrintGCDetails -XX:+PrintGCDateStamps "
        f"{exp_executor_gc_config}")

    exp_spark_config_props = {
        "spark.driver.memory": f"{exp_driver_mem_mb}m",
        "spark.driver.memoryOverhead": f"{exp_driver_mem_ovr_mb}m",
        "spark.driver.defaultJavaOptions": f"{exp_driver_java_opts}",
        "spark.executor.memory": f"{exp_executor_mem_mb}m",
        "spark.executor.memoryOverhead": f"{exp_executor_mem_ovr_mb}m",
        "spark.executor.cores": f"{exp_executor_cores}",
        "spark.executor.defaultJavaOptions": f"{exp_executor_java_opts}",
        "spark.executor.instances": f"{exp_executor_count_total}",
        "spark.default.parallelism": f"{exp_default_parallelism}",
    }

    assert spark_config.Classification == "spark-defaults"
    assert spark_config.Properties == exp_spark_config_props

    # Using the same instance type, increase the instance count by 10x
    yarn_config, spark_config = default_bootstrapper.get_yarn_spark_resource_config(
        10, instance_mem_mb, instance_cores)

    # Yarn config should be the same
    assert yarn_config.Properties == exp_yarn_config_props

    # Spark config should be the same with more 10x executors and parallelism
    exp_spark_config_props[
        "spark.executor.instances"] = f"{exp_executor_count_total * 10}"
    exp_spark_config_props[
        "spark.default.parallelism"] = f"{exp_default_parallelism * 10}"
    assert spark_config.Properties == exp_spark_config_props
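# The expected values asserted above can be re-derived standalone from the
# formulas spelled out in the inline comments (this mirrors the comments, not
# the bootstrapper's actual implementation):
def _recheck_m5_xlarge_math() -> None:
    instance_mem_mb, instance_cores, instance_count = 16384, 4, 1

    yarn_max_mem_mb = int(instance_mem_mb * 0.97)                    # 15892
    driver_mem_mb = 2 * 1024                                         # 2048
    driver_mem_ovr_mb = int(driver_mem_mb * 0.1)                     # 204
    executor_count_per_instance = 1
    executor_mem_mb = int(
        (yarn_max_mem_mb - driver_mem_mb - driver_mem_ovr_mb)
        / (executor_count_per_instance + executor_count_per_instance * 0.1)
    )                                                                # int(13640 / 1.1) = 12399
    executor_mem_ovr_mb = int(executor_mem_mb * 0.1)                 # 1239
    default_parallelism = instance_count * instance_cores * 2        # 8

    assert (yarn_max_mem_mb, executor_mem_mb, executor_mem_ovr_mb) == (15892, 12399, 1239)
    assert default_parallelism == 8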
def test_load_instance_type_info_fallback(patched_exists,
                                          default_bootstrapper: Bootstrapper) -> None:
    patched_exists.return_value = False
    assert default_bootstrapper.load_instance_type_info() == {}
    patched_exists.assert_called_once_with(
        Bootstrapper.INSTANCE_TYPE_INFO_PATH)
def test_load_processing_job_config_fallback(
        patched_exists, default_bootstrapper: Bootstrapper) -> None:
    patched_exists.return_value = False
    assert default_bootstrapper.load_processing_job_config() == {}
    patched_exists.assert_called_once_with(
        Bootstrapper.PROCESSING_JOB_CONFIG_PATH)
def test_get_regional_configs_missing_region(
        patched_getenv, default_bootstrapper: Bootstrapper) -> None:
    patched_getenv.return_value = None
    regional_configs_list = default_bootstrapper.get_regional_configs()
    assert len(regional_configs_list) == 0
    patched_getenv.assert_called_once_with("AWS_REGION")
def default_bootstrapper() -> Bootstrapper:
    return Bootstrapper(default_resource_config)
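# The tests above receive default_bootstrapper as a pytest fixture, so in the
# original test module the function above is presumably registered with
# @pytest.fixture and default_resource_config is defined nearby. A sketch of
# that wiring follows; the single-host resource config is an assumption, chosen
# to match the only keys the code reads ("current_host" and "hosts").
import pytest

from smspark.bootstrapper import Bootstrapper  # assumed import path

# Assumed shape of the shared resource config used by the fixture.
default_resource_config = {"current_host": "algo-1", "hosts": ["algo-1"]}


@pytest.fixture(name="default_bootstrapper")
def default_bootstrapper_fixture() -> Bootstrapper:
    return Bootstrapper(default_resource_config)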
class ProcessingJobManager(object):
    """Manages the lifecycle of a Spark job."""

    def __init__(
        self,
        resource_config: Dict[str, Any] = None,  # type: ignore
        processing_job_config: Dict[str, Any] = None,  # type: ignore
    ) -> None:
        """Initialize a ProcessingJobManager, loading configs if not provided."""
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger("smspark-submit")

        try:
            resource_config_path = "/opt/ml/config/resourceconfig.json"
            with open(resource_config_path, "r") as f:
                self._resource_config = json.load(f)
        except Exception:
            self.logger.warning(
                "Could not read resource config file at {}. Using default resourceconfig.".format(resource_config_path)
            )
            self._resource_config = default_resource_config

        self.logger.info(self._resource_config)

        try:
            processing_job_config_path = "/opt/ml/config/processingjobconfig.json"
            with open(processing_job_config_path, "r") as f:
                self._processing_job_config = json.load(f)
        except Exception:
            self.logger.warning(
                "Could not read processing job config file at {}. Using default processingjobconfig.".format(
                    processing_job_config_path
                )
            )
            self._processing_job_config = default_processing_job_config

        self.logger.info(self._processing_job_config)
        self.bootstrapper = Bootstrapper(self._resource_config)
        self.waiter = Waiter()
        self.status_app = StatusApp()
        self.status_client = StatusClient()

    @property
    def hostname(self) -> str:
        """Return the current host's hostname."""
        return self._resource_config["current_host"]

    @property
    def hosts(self) -> Sequence[str]:
        """Return a sequence of all the hostnames in the cluster."""
        return self._resource_config["hosts"]

    @property
    def _is_primary_host(self) -> bool:
        current_host = self.hostname
        return current_host == self._cluster_primary_host

    @property
    def _cluster_primary_host(self) -> str:
        return sorted(self._resource_config["hosts"])[0]

    def _wait_for_hostname_resolution(self) -> None:
        for host in self._resource_config["hosts"]:
            self._dns_lookup(host)

    @retry(stop=stop_after_delay(60))
    def _dns_lookup(self, host: str) -> None:
        socket.gethostbyname(host)

    def run(self, spark_submit_cmd: str, spark_event_logs_s3_uri: str, local_spark_event_logs_dir: str) -> None:
        """Run a Spark job.

        First, waits for workers to come up and bootstraps the cluster.
        Then runs spark-submit and waits until the job succeeds or fails.
        Worker nodes are shut down gracefully.

        Args:
          spark_submit_cmd (str): spark-submit command to run
          spark_event_logs_s3_uri (str): S3 URI where Spark event logs are published
          local_spark_event_logs_dir (str): local directory where Spark writes event logs
        """
        self.logger.info("waiting for hosts")
        self._wait_for_hostname_resolution()
        self.logger.info("starting status server")
        self._start_status_server()
        self.logger.info("bootstrapping cluster")
        self._bootstrap_yarn()
        self.logger.info("starting executor logs watcher")
        self._start_executor_logs_watcher()

        if self._is_primary_host:
            self.logger.info("start log event log publisher")
            spark_log_publisher = self._start_spark_event_log_publisher(
                spark_event_logs_s3_uri, local_spark_event_logs_dir
            )

            self.logger.info(f"Waiting for hosts to bootstrap: {self.hosts}")

            def all_hosts_have_bootstrapped() -> bool:
                try:
                    host_statuses: Mapping[str, StatusMessage] = self.status_client.get_status(self.hosts)

                except ConnectionError as e:
                    self.logger.info(
                        f"Got ConnectionError when polling hosts for status. Host may not have come up: {str(e)}.\nTraceback: {traceback.format_exc()}"
                    )
                    return False
                self.logger.info(f"Received host statuses: {host_statuses.items()}")
                has_bootstrapped = [message.status == Status.WAITING for message in host_statuses.values()]
                return all(has_bootstrapped)

            self.waiter.wait_for(predicate_fn=all_hosts_have_bootstrapped, timeout=180.0, period=5.0)

            try:
                subprocess.run(spark_submit_cmd, check=True, shell=True)
                self.logger.info("spark submit was successful. primary node exiting.")
            except subprocess.CalledProcessError as e:
                self.logger.error(
                    f"spark-submit command failed with exit code {e.returncode}: {str(e)}\n{traceback.format_exc()}"
                )
                raise AlgorithmError("spark failed with a non-zero exit code", caused_by=e, exit_code=e.returncode)
            except Exception as e:
                self.logger.error("Exception during processing: " + str(e) + "\n" + traceback.format_exc())
                raise AlgorithmError(
                    message="error occurred during spark-submit execution. Please see logs for details.", caused_by=e,
                )

            finally:
                spark_log_publisher.down()
                spark_log_publisher.join(timeout=20)

        else:
            # workers wait until the primary is up, then wait until it's down.
            def primary_is_up() -> bool:
                try:
                    self.status_client.get_status([self._cluster_primary_host])
                    return True
                except Exception:
                    return False

            def primary_is_down() -> bool:
                return not primary_is_up()

            self.logger.info("waiting for the primary to come up")
            self.waiter.wait_for(primary_is_up, timeout=60.0, period=1.0)
            self.logger.info("waiting for the primary to go down")
            self.waiter.wait_for(primary_is_down, timeout=float("inf"), period=5.0)
            self.logger.info("primary is down, worker now exiting")

    def _bootstrap_yarn(self) -> None:
        self.status_app.status = Status.BOOTSTRAPPING
        self.bootstrapper.bootstrap_smspark_submit()
        self.status_app.status = Status.WAITING

    def _start_executor_logs_watcher(self, log_dir: str = "/var/log/yarn") -> None:
        # TODO: check Yarn configs for yarn.log.dir/YARN_LOG_DIR, in case of overrides
        spark_executor_logs_watcher = SparkExecutorLogsWatcher(log_dir)
        spark_executor_logs_watcher.daemon = True
        spark_executor_logs_watcher.start()

    def _start_status_server(self) -> None:
        server = StatusServer(self.status_app, self.hostname)
        server.daemon = True
        server.start()

    def _start_spark_event_log_publisher(
        self, spark_event_logs_s3_uri: str, local_spark_event_logs_dir: str
    ) -> SparkEventLogPublisher:
        spark_log_publisher = SparkEventLogPublisher(spark_event_logs_s3_uri, local_spark_event_logs_dir)
        spark_log_publisher.daemon = True
        spark_log_publisher.start()
        return spark_log_publisher
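# A minimal driver sketch showing how run() is intended to be invoked on every
# node: workers block until the primary finishes, while the primary waits for
# the cluster to bootstrap, runs spark-submit, and publishes event logs. The
# command, S3 URI, and local directory below are placeholders.
if __name__ == "__main__":
    manager = ProcessingJobManager()
    manager.run(
        spark_submit_cmd="spark-submit --master yarn --deploy-mode client /opt/ml/processing/input/code/app.py",
        spark_event_logs_s3_uri="s3://example-bucket/spark-events/",  # placeholder
        local_spark_event_logs_dir="/tmp/spark-events",               # placeholder
    )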