Example #1
    def test_get_spark_conf_aws_session(self):
        other_spark_opts = {
            'spark.driver.memory': '2g',
            'spark.executor.memoryOverhead': '1024'
        }
        not_allowed_opts = {
            'spark.executorEnv.PAASTA_SERVICE': 'random-service'
        }
        user_spark_opts = {
            **not_allowed_opts,
            **other_spark_opts,
        }

        aws_creds = ('key', 'secret', 'token')

        output = spark_config.get_spark_conf(
            cluster_manager='kubernetes',
            spark_app_base_name=self.spark_app_base_name,
            user_spark_opts=user_spark_opts,
            paasta_cluster=self.cluster,
            paasta_pool=self.pool,
            paasta_service=self.service,
            paasta_instance=self.instance,
            docker_img=self.docker_image,
            extra_volumes=self.extra_volumes,
            aws_creds=aws_creds,
        )
        assert self.aws_provider_key in output
        assert (
            output[self.aws_provider_key] ==
            'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
        )
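
Note: the assertions above imply that get_spark_conf selects the temporary s3a credentials provider whenever the AWS credentials tuple carries a session token. Below is a minimal sketch of that selection logic; the helper name and configuration key are assumptions for illustration, not the library's confirmed internals.

# Hypothetical sketch of the provider selection implied by the test above.
# The helper name and S3A_PROVIDER_KEY constant are assumptions for illustration.
S3A_PROVIDER_KEY = 'spark.hadoop.fs.s3a.aws.credentials.provider'


def _pick_s3a_credentials_provider(aws_creds):
    access_key, secret_key, session_token = aws_creds
    if session_token is not None:
        # Temporary (STS) credentials require the temporary provider.
        return 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
    return 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider'


assert _pick_s3a_credentials_provider(('key', 'secret', 'token')) == \
    'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
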
Example #2
    def test_get_spark_conf_kubernetes(
        self,
        user_spark_opts,
        spark_opts_from_env,
        ui_port,
        mock_append_event_log_conf,
        mock_append_sql_shuffle_partitions_conf,
        mock_adjust_spark_requested_resources_kubernetes,
        mock_time,
        assert_ui_port,
        assert_app_name,
        assert_kubernetes_conf,
        mock_log,
    ):
        other_spark_opts = {
            'spark.driver.memory': '2g',
            'spark.executor.memoryOverhead': '1024'
        }
        user_spark_opts = {
            **(user_spark_opts or {}),
            **other_spark_opts,
        }

        aws_creds = (None, None, None)

        output = spark_config.get_spark_conf(
            cluster_manager='kubernetes',
            spark_app_base_name=self.spark_app_base_name,
            user_spark_opts=user_spark_opts,
            paasta_cluster=self.cluster,
            paasta_pool=self.pool,
            paasta_service=self.service,
            paasta_instance=self.instance,
            docker_img=self.docker_image,
            extra_volumes=self.extra_volumes,
            aws_creds=aws_creds,
            spark_opts_from_env=spark_opts_from_env,
        )

        verified_keys = set(
            assert_ui_port(output) +
            assert_app_name(output) +
            assert_kubernetes_conf(output) +
            list(other_spark_opts.keys()) +
            list(mock_adjust_spark_requested_resources_kubernetes.return_value.keys()) +
            list(mock_append_event_log_conf.return_value.keys()) +
            list(mock_append_sql_shuffle_partitions_conf.return_value.keys())
        )
        assert len(set(output.keys()) - verified_keys) == 0
        mock_adjust_spark_requested_resources_kubernetes.mocker.assert_called_once_with(
            mock.ANY,
            'kubernetes',
            self.pool,
        )
        mock_append_event_log_conf.mocker.assert_called_once_with(
            mock.ANY,
            *aws_creds,
        )
        mock_append_sql_shuffle_partitions_conf.mocker.assert_called_once_with(
            mock.ANY,
        )
Example #3
    def test_get_spark_conf_mesos_error(self, reason, monkeypatch,
                                        mock_request_mesos_leader):
        if reason == 'mesos_leader':
            mock_request_mesos_leader.side_effect = \
                spark_config.requests.RequestException()
        else:
            monkeypatch.setattr(
                spark_config,
                'DEFAULT_SPARK_MESOS_SECRET_FILE',
                '/not_exist',
            )
        with pytest.raises(ValueError):
            spark_config.get_spark_conf(
                cluster_manager='mesos',
                spark_app_base_name=self.spark_app_base_name,
                user_spark_opts={},
                paasta_cluster=self.cluster,
                paasta_pool=self.pool,
                paasta_service=self.service,
                paasta_instance=self.instance,
                docker_img=self.docker_image,
                aws_creds=(None, None, None),
                extra_volumes=[],
            )
Example #4
    def get_spark_config_dict(self):
        spark_config_dict = getattr(self, "_spark_config_dict", None)
        # Cache the created dict so that we don't need to process it multiple
        # times and end up with inconsistent results.
        if spark_config_dict is not None:
            return spark_config_dict

        if self.get_spark_cluster_manager() == "mesos":
            mesos_leader = (
                f"zk://{load_system_paasta_config().get_zk_hosts()}"
                if not self.for_validation else "N/A")
        else:
            mesos_leader = None

        aws_creds = get_aws_credentials(
            aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml"))
        self._spark_config_dict = get_spark_conf(
            cluster_manager=self.get_spark_cluster_manager(),
            spark_app_base_name=f"tron_spark_{self.get_service()}_{self.get_instance()}",
            user_spark_opts=self.config_dict.get("spark_args", {}),
            paasta_cluster=self.get_spark_paasta_cluster(),
            paasta_pool=self.get_spark_paasta_pool(),
            paasta_service=self.get_service(),
            paasta_instance=self.get_instance(),
            docker_img=self.get_docker_url(),
            aws_creds=aws_creds,
            extra_volumes=self.get_volumes(
                load_system_paasta_config().get_volumes()),
            # Tron uses environment variables to load the required credentials
            with_secret=False,
            mesos_leader=mesos_leader,
            # load_system_paasta_config already loads the default volumes
            load_paasta_default_volumes=False,
        )
        return self._spark_config_dict
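
Note: get_spark_config_dict above memoizes the computed configuration on self._spark_config_dict, so repeated calls return one consistent result. Below is a minimal, standalone sketch of the same caching idea using functools.cached_property; the class and config value are illustrative, not part of the Tron/PaaSTA code.

from functools import cached_property


class SparkConfigHolder:
    """Illustrative stand-in for a config object that builds an expensive dict once."""

    @cached_property
    def spark_config_dict(self):
        # Computed on first access, then cached on the instance so that every
        # later caller sees the same result.
        return {'spark.app.name': 'tron_spark_example'}


holder = SparkConfigHolder()
assert holder.spark_config_dict is holder.spark_config_dict  # built only once
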
Example #5
File: spark_run.py  Project: Yelp/paasta
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=system_paasta_config.get_cluster_aliases().get(
                args.cluster, args.cluster
            ),
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
                "You can generate this by running:"
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    pod_template_path = generate_pod_template_path()
    args.enable_compact_bin_packing = should_enable_compact_bin_packing(
        args.disable_compact_bin_packing, args.cluster_manager
    )

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())

    if args.enable_compact_bin_packing:
        document = POD_TEMPLATE.format(
            spark_pod_label=limit_size_with_hash(f"exec-{app_base_name}"),
        )
        parsed_pod_template = yaml.safe_load(document)
        with open(pod_template_path, "w") as f:
            yaml.dump(parsed_pod_template, f)

    needs_docker_cfg = not args.build
    user_spark_opts = _parse_user_spark_args(
        args.spark_args, pod_template_path, args.enable_compact_bin_packing
    )

    args.cmd = _auto_add_timeout_for_job(args.cmd, args.timeout_job_runtime)

    # This is required if configs are provided as part of `spark-submit`;
    # the other way to provide them is with --spark-args
    sub_cmds = args.cmd.split(" ")  # e.g. a token like spark.driver.memory=10g
    for cmd in sub_cmds:
        if cmd.startswith("spark.driver.memory") or cmd.startswith(
            "spark.driver.cores"
        ):
            key, value = cmd.split("=")
            user_spark_opts[key] = value

    paasta_instance = get_smart_paasta_instance_name(args)
    auto_set_temporary_credentials_provider = (
        args.disable_temporary_credentials_provider is False
    )
    spark_conf = get_spark_conf(
        cluster_manager=args.cluster_manager,
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
        auto_set_temporary_credentials_provider=auto_set_temporary_credentials_provider,
    )
    # Experimental: TODO: Move to service_configuration_lib once confirmed that there are no issues
    # Enable AQE: Adaptive Query Execution
    if "spark.sql.adaptive.enabled" not in spark_conf:
        spark_conf["spark.sql.adaptive.enabled"] = "true"
        aqe_msg = "Spark performance improving feature Adaptive Query Execution (AQE) is enabled. Set spark.sql.adaptive.enabled as false to disable."
        log.info(aqe_msg)
        print(PaastaColors.blue(aqe_msg))
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
        cluster_manager=args.cluster_manager,
        pod_template_path=pod_template_path,
    )
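
Note: the spark-submit handling above pulls spark.driver.memory and spark.driver.cores overrides out of the raw command string by splitting on spaces and '='. Below is a small standalone version of just that parsing step, with a hypothetical sample command, shown for illustration only.

# Standalone sketch of the driver-resource extraction shown in the function above.
def extract_driver_overrides(cmd: str) -> dict:
    overrides = {}
    for token in cmd.split(' '):
        if token.startswith('spark.driver.memory') or token.startswith('spark.driver.cores'):
            key, value = token.split('=')
            overrides[key] = value
    return overrides


cmd = 'spark-submit --conf spark.driver.memory=10g --conf spark.driver.cores=2 job.py'
assert extract_driver_overrides(cmd) == {
    'spark.driver.memory': '10g',
    'spark.driver.cores': '2',
}
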
Example #6
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []},
                                                  "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
                "You can generate this by running:"
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {
                    "soa_dir": args.yelpsoa_config_root,
                    "service": args.service
                }),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)
    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
Example #7
    def test_get_spark_conf_mesos(
        self,
        user_spark_opts,
        spark_opts_from_env,
        ui_port,
        with_secret,
        mesos_leader,
        needs_docker_cfg,
        extra_docker_params,
        mock_get_mesos_docker_volumes_conf,
        mock_append_event_log_conf,
        mock_append_sql_shuffle_partitions_conf,
        mock_adjust_spark_requested_resources_mesos,
        mock_time,
        assert_mesos_leader,
        assert_docker_parameters,
        assert_mesos_secret,
        assert_docker_cfg,
        assert_mesos_conf,
        assert_ui_port,
        assert_app_name,
        mock_log,
    ):
        other_spark_opts = {
            'spark.driver.memory': '2g',
            'spark.executor.memoryOverhead': '1024'
        }
        not_allowed_opts = {
            'spark.executorEnv.PAASTA_SERVICE': 'random-service'
        }
        user_spark_opts = {
            **(user_spark_opts or {}),
            **not_allowed_opts,
            **other_spark_opts,
        }

        aws_creds = (None, None, None)

        output = spark_config.get_spark_conf(
            cluster_manager='mesos',
            spark_app_base_name=self.spark_app_base_name,
            user_spark_opts=user_spark_opts,
            paasta_cluster=self.cluster,
            paasta_pool=self.pool,
            paasta_service=self.service,
            paasta_instance=self.instance,
            docker_img=self.docker_image,
            extra_volumes=self.extra_volumes,
            aws_creds=aws_creds,
            extra_docker_params=extra_docker_params,
            with_secret=with_secret,
            needs_docker_cfg=needs_docker_cfg,
            mesos_leader=mesos_leader,
            spark_opts_from_env=spark_opts_from_env,
            load_paasta_default_volumes=True,
        )

        verified_keys = set(
            assert_mesos_leader(output) +
            assert_docker_parameters(output) +
            assert_mesos_secret(output) +
            assert_docker_cfg(output) +
            assert_mesos_conf(output) +
            assert_ui_port(output) +
            assert_app_name(output) +
            list(other_spark_opts.keys()) +
            list(mock_get_mesos_docker_volumes_conf.return_value.keys()) +
            list(mock_adjust_spark_requested_resources_mesos.return_value.keys()) +
            list(mock_append_event_log_conf.return_value.keys()) +
            list(mock_append_sql_shuffle_partitions_conf.return_value.keys())
        )
        assert len(set(output.keys()) - verified_keys) == 0
        mock_get_mesos_docker_volumes_conf.mocker.assert_called_once_with(
            mock.ANY,
            self.extra_volumes,
            True,
        )
        mock_adjust_spark_requested_resources_mesos.mocker.assert_called_once_with(
            mock.ANY,
            'mesos',
            self.pool,
        )
        mock_append_event_log_conf.mocker.assert_called_once_with(
            mock.ANY,
            *aws_creds,
        )
        mock_append_sql_shuffle_partitions_conf.mocker.assert_called_once_with(
            mock.ANY,
        )
        (warning_msg, ), _ = mock_log.warning.call_args
        assert next(iter(not_allowed_opts.keys())) in warning_msg
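
Note: the final assertions above expect get_spark_conf to ignore user-supplied keys such as spark.executorEnv.PAASTA_SERVICE and to log a warning naming them. Below is a minimal sketch of such a filter; the function name and key set are assumptions for illustration, not the library's actual implementation.

import logging

log = logging.getLogger(__name__)

# Assumed, illustrative set of keys the user may not override.
NON_USER_CONFIGURABLE_OPTS = {'spark.executorEnv.PAASTA_SERVICE'}


def filter_user_spark_opts(user_spark_opts: dict) -> dict:
    allowed = {}
    for key, value in user_spark_opts.items():
        if key in NON_USER_CONFIGURABLE_OPTS:
            # Warn so the test's mock_log.warning assertion would see the key name.
            log.warning(f'Ignoring {key}: this option is managed by the library.')
            continue
        allowed[key] = value
    return allowed
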