def test_get_spark_conf_aws_session(self):
    other_spark_opts = {
        'spark.driver.memory': '2g',
        'spark.executor.memoryOverhead': '1024',
    }
    not_allowed_opts = {
        'spark.executorEnv.PAASTA_SERVICE': 'random-service',
    }
    user_spark_opts = {
        **not_allowed_opts,
        **other_spark_opts,
    }
    aws_creds = ('key', 'secret', 'token')
    output = spark_config.get_spark_conf(
        cluster_manager='kubernetes',
        spark_app_base_name=self.spark_app_base_name,
        user_spark_opts=user_spark_opts,
        paasta_cluster=self.cluster,
        paasta_pool=self.pool,
        paasta_service=self.service,
        paasta_instance=self.instance,
        docker_img=self.docker_image,
        extra_volumes=self.extra_volumes,
        aws_creds=aws_creds,
    )
    # A session token in aws_creds should select the temporary credentials provider.
    assert self.aws_provider_key in output
    assert output[self.aws_provider_key] == 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
def test_get_spark_conf_kubernetes(
    self,
    user_spark_opts,
    spark_opts_from_env,
    ui_port,
    mock_append_event_log_conf,
    mock_append_sql_shuffle_partitions_conf,
    mock_adjust_spark_requested_resources_kubernetes,
    mock_time,
    assert_ui_port,
    assert_app_name,
    assert_kubernetes_conf,
    mock_log,
):
    other_spark_opts = {
        'spark.driver.memory': '2g',
        'spark.executor.memoryOverhead': '1024',
    }
    user_spark_opts = {
        **(user_spark_opts or {}),
        **other_spark_opts,
    }
    aws_creds = (None, None, None)
    output = spark_config.get_spark_conf(
        cluster_manager='kubernetes',
        spark_app_base_name=self.spark_app_base_name,
        user_spark_opts=user_spark_opts,
        paasta_cluster=self.cluster,
        paasta_pool=self.pool,
        paasta_service=self.service,
        paasta_instance=self.instance,
        docker_img=self.docker_image,
        extra_volumes=self.extra_volumes,
        aws_creds=aws_creds,
        spark_opts_from_env=spark_opts_from_env,
    )
    # Every key in the output should be accounted for by one of the
    # assertion helpers or mocked config functions.
    verified_keys = set(
        assert_ui_port(output)
        + assert_app_name(output)
        + assert_kubernetes_conf(output)
        + list(other_spark_opts.keys())
        + list(mock_adjust_spark_requested_resources_kubernetes.return_value.keys())
        + list(mock_append_event_log_conf.return_value.keys())
        + list(mock_append_sql_shuffle_partitions_conf.return_value.keys()),
    )
    assert len(set(output.keys()) - verified_keys) == 0
    mock_adjust_spark_requested_resources_kubernetes.mocker.assert_called_once_with(
        mock.ANY, 'kubernetes', self.pool,
    )
    mock_append_event_log_conf.mocker.assert_called_once_with(
        mock.ANY, *aws_creds,
    )
    mock_append_sql_shuffle_partitions_conf.mocker.assert_called_once_with(
        mock.ANY,
    )
def test_get_spark_conf_mesos_error(self, reason, monkeypatch, mock_request_mesos_leader):
    if reason == 'mesos_leader':
        mock_request_mesos_leader.side_effect = spark_config.requests.RequestException()
    else:
        monkeypatch.setattr(spark_config, 'DEFAULT_SPARK_MESOS_SECRET_FILE', '/not_exist')
    with pytest.raises(ValueError):
        spark_config.get_spark_conf(
            cluster_manager='mesos',
            spark_app_base_name=self.spark_app_base_name,
            user_spark_opts={},
            paasta_cluster=self.cluster,
            paasta_pool=self.pool,
            paasta_service=self.service,
            paasta_instance=self.instance,
            docker_img=self.docker_image,
            aws_creds=(None, None, None),
            extra_volumes=[],
        )
def get_spark_config_dict(self):
    # Cache the created dict so we don't need to build it multiple
    # times and risk getting inconsistent results.
    spark_config_dict = getattr(self, "_spark_config_dict", None)
    if spark_config_dict is not None:
        return spark_config_dict

    if self.get_spark_cluster_manager() == "mesos":
        mesos_leader = (
            f"zk://{load_system_paasta_config().get_zk_hosts()}"
            if not self.for_validation
            else "N/A"
        )
    else:
        mesos_leader = None

    aws_creds = get_aws_credentials(
        aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml")
    )
    self._spark_config_dict = get_spark_conf(
        cluster_manager=self.get_spark_cluster_manager(),
        spark_app_base_name=f"tron_spark_{self.get_service()}_{self.get_instance()}",
        user_spark_opts=self.config_dict.get("spark_args", {}),
        paasta_cluster=self.get_spark_paasta_cluster(),
        paasta_pool=self.get_spark_paasta_pool(),
        paasta_service=self.get_service(),
        paasta_instance=self.get_instance(),
        docker_img=self.get_docker_url(),
        aws_creds=aws_creds,
        extra_volumes=self.get_volumes(load_system_paasta_config().get_volumes()),
        # Tron uses environment variables to load the required credentials.
        with_secret=False,
        mesos_leader=mesos_leader,
        # load_system_paasta_config already loads the default volumes.
        load_paasta_default_volumes=False,
    )
    return self._spark_config_dict
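# Illustrative note (not part of this class): because of the getattr-based cache above,
# repeated calls return the same dict object. `action_config` below is a hypothetical
# configured instance used only to make the caching behavior concrete.
#
#   first = action_config.get_spark_config_dict()
#   second = action_config.get_spark_config_dict()
#   assert first is second  # the second call returns the cached _spark_config_dict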
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=system_paasta_config.get_cluster_aliases().get(
                args.cluster, args.cluster
            ),
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    pod_template_path = generate_pod_template_path()
    args.enable_compact_bin_packing = should_enable_compact_bin_packing(
        args.disable_compact_bin_packing, args.cluster_manager
    )

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())

    if args.enable_compact_bin_packing:
        document = POD_TEMPLATE.format(
            spark_pod_label=limit_size_with_hash(f"exec-{app_base_name}"),
        )
        parsed_pod_template = yaml.load(document)
        with open(pod_template_path, "w") as f:
            yaml.dump(parsed_pod_template, f)

    needs_docker_cfg = not args.build
    user_spark_opts = _parse_user_spark_args(
        args.spark_args, pod_template_path, args.enable_compact_bin_packing
    )

    args.cmd = _auto_add_timeout_for_job(args.cmd, args.timeout_job_runtime)

    # This is required if configs are provided as part of `spark-submit`.
    # The other way to provide them is with --spark-args.
    sub_cmds = args.cmd.split(" ")  # e.g. a token like spark.driver.memory=10g
    for cmd in sub_cmds:
        if cmd.startswith("spark.driver.memory") or cmd.startswith(
            "spark.driver.cores"
        ):
            key, value = cmd.split("=")
            user_spark_opts[key] = value

    paasta_instance = get_smart_paasta_instance_name(args)
    auto_set_temporary_credentials_provider = (
        args.disable_temporary_credentials_provider is False
    )
    spark_conf = get_spark_conf(
        cluster_manager=args.cluster_manager,
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
        auto_set_temporary_credentials_provider=auto_set_temporary_credentials_provider,
    )

    # Experimental: TODO: Move to service_configuration_lib once confirmed that there are no issues
    # Enable AQE: Adaptive Query Execution
    if "spark.sql.adaptive.enabled" not in spark_conf:
        spark_conf["spark.sql.adaptive.enabled"] = "true"
        aqe_msg = (
            "Spark performance improving feature Adaptive Query Execution (AQE) is enabled. "
            "Set spark.sql.adaptive.enabled as false to disable."
        )
        log.info(aqe_msg)
        print(PaastaColors.blue(aqe_msg))

    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
        cluster_manager=args.cluster_manager,
        pod_template_path=pod_template_path,
    )
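# Illustrative sketch (not part of the original module): the loop above pulls
# spark.driver.memory=... / spark.driver.cores=... tokens out of the job command so
# that overrides passed directly to `spark-submit` still reach get_spark_conf. The helper
# name and the sample command below are hypothetical, shown only to make the parsing
# behavior concrete.
def _extract_driver_overrides_example(cmd: str) -> dict:
    """Collect driver memory/cores settings found in a spark-submit command string."""
    overrides = {}
    for token in cmd.split(" "):
        if token.startswith("spark.driver.memory") or token.startswith("spark.driver.cores"):
            key, value = token.split("=")
            overrides[key] = value
    return overrides


# _extract_driver_overrides_example(
#     "spark-submit --conf spark.driver.memory=10g --conf spark.driver.cores=2 job.py"
# ) == {'spark.driver.memory': '10g', 'spark.driver.cores': '2'}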
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)
    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
def test_get_spark_conf_mesos(
    self,
    user_spark_opts,
    spark_opts_from_env,
    ui_port,
    with_secret,
    mesos_leader,
    needs_docker_cfg,
    extra_docker_params,
    mock_get_mesos_docker_volumes_conf,
    mock_append_event_log_conf,
    mock_append_sql_shuffle_partitions_conf,
    mock_adjust_spark_requested_resources_mesos,
    mock_time,
    assert_mesos_leader,
    assert_docker_parameters,
    assert_mesos_secret,
    assert_docker_cfg,
    assert_mesos_conf,
    assert_ui_port,
    assert_app_name,
    mock_log,
):
    other_spark_opts = {
        'spark.driver.memory': '2g',
        'spark.executor.memoryOverhead': '1024',
    }
    not_allowed_opts = {
        'spark.executorEnv.PAASTA_SERVICE': 'random-service',
    }
    user_spark_opts = {
        **(user_spark_opts or {}),
        **not_allowed_opts,
        **other_spark_opts,
    }
    aws_creds = (None, None, None)
    output = spark_config.get_spark_conf(
        cluster_manager='mesos',
        spark_app_base_name=self.spark_app_base_name,
        user_spark_opts=user_spark_opts,
        paasta_cluster=self.cluster,
        paasta_pool=self.pool,
        paasta_service=self.service,
        paasta_instance=self.instance,
        docker_img=self.docker_image,
        extra_volumes=self.extra_volumes,
        aws_creds=aws_creds,
        extra_docker_params=extra_docker_params,
        with_secret=with_secret,
        needs_docker_cfg=needs_docker_cfg,
        mesos_leader=mesos_leader,
        spark_opts_from_env=spark_opts_from_env,
        load_paasta_default_volumes=True,
    )
    # Every key in the output should be accounted for by one of the
    # assertion helpers or mocked config functions.
    verified_keys = set(
        assert_mesos_leader(output)
        + assert_docker_parameters(output)
        + assert_mesos_secret(output)
        + assert_docker_cfg(output)
        + assert_mesos_conf(output)
        + assert_ui_port(output)
        + assert_app_name(output)
        + list(other_spark_opts.keys())
        + list(mock_get_mesos_docker_volumes_conf.return_value.keys())
        + list(mock_adjust_spark_requested_resources_mesos.return_value.keys())
        + list(mock_append_event_log_conf.return_value.keys())
        + list(mock_append_sql_shuffle_partitions_conf.return_value.keys()),
    )
    assert len(set(output.keys()) - verified_keys) == 0
    mock_get_mesos_docker_volumes_conf.mocker.assert_called_once_with(
        mock.ANY, self.extra_volumes, True,
    )
    mock_adjust_spark_requested_resources_mesos.mocker.assert_called_once_with(
        mock.ANY, 'mesos', self.pool,
    )
    mock_append_event_log_conf.mocker.assert_called_once_with(
        mock.ANY, *aws_creds,
    )
    mock_append_sql_shuffle_partitions_conf.mocker.assert_called_once_with(
        mock.ANY,
    )
    # The disallowed spark.executorEnv.PAASTA_SERVICE option should trigger a warning.
    (warning_msg,), _ = mock_log.warning.call_args
    assert next(iter(not_allowed_opts.keys())) in warning_msg