Example No. 1
 def test_use_aws_credentials_json(self, tmpdir):
     fp = tmpdir.join('test.json')
     fp.write(
         json.dumps({
             'accessKeyId': self.access_key,
             'secretAccessKey': self.secret_key
         }))
     assert spark_config.get_aws_credentials(
         aws_credentials_json=str(fp)) == self.expected_creds
Example No. 2
 def test_use_service_credentials(self, tmpdir, monkeypatch):
     test_service = 'test_service'
     creds_dir = tmpdir.mkdir('creds')
     creds_file = creds_dir.join(f'{test_service}.yaml')
     creds_file.write(yaml.dump(self.creds))
     monkeypatch.setattr(spark_config, 'AWS_CREDENTIALS_DIR',
                         str(creds_dir))
     assert spark_config.get_aws_credentials(
         service=test_service) == self.expected_creds
Example No. 3
 def test_use_service_credentials_missing_file(self, tmpdir, monkeypatch,
                                               mock_session, mock_log):
     test_service = 'not_exist'
     creds_dir = tmpdir.mkdir('creds')
     monkeypatch.setattr(spark_config, 'AWS_CREDENTIALS_DIR',
                         str(creds_dir))
     assert spark_config.get_aws_credentials(
         service=test_service) == self.expected_temp_creds
     (warning_msg, ), _ = mock_log.warning.call_args
     expected_message = f"Did not find service AWS credentials at {os.path.join(creds_dir, test_service + '.yaml')}"
     assert expected_message in warning_msg
Example No. 4
    def get_env(self):
        env = super().get_env()
        if self.get_executor() == "spark":
            spark_config_dict = self.get_spark_config_dict()
            env["EXECUTOR_CLUSTER"] = self.get_spark_paasta_cluster()
            env["EXECUTOR_POOL"] = self.get_spark_paasta_pool()
            env["SPARK_OPTS"] = stringify_spark_env(spark_config_dict)
            # The actual mesos secret will be decrypted and injected on mesos master when assigning
            # tasks.
            env["SPARK_MESOS_SECRET"] = "SHARED_SECRET(SPARK_MESOS_SECRET)"
            if clusterman_metrics:
                env["CLUSTERMAN_RESOURCES"] = json.dumps(
                    generate_clusterman_metrics_entries(
                        clusterman_metrics,
                        get_resources_requested(spark_config_dict),
                        spark_config_dict["spark.app.name"],
                        get_webui_url(spark_config_dict["spark.ui.port"]),
                    ))
            else:
                env["CLUSTERMAN_RESOURCES"] = "{}"

            if "AWS_ACCESS_KEY_ID" not in env or "AWS_SECRET_ACCESS_KEY" not in env:
                try:
                    access_key, secret_key, session_token = get_aws_credentials(
                        service=self.get_service(),
                        aws_credentials_yaml=self.config_dict.get(
                            "aws_credentials_yaml"),
                    )
                    env["AWS_ACCESS_KEY_ID"] = access_key
                    env["AWS_SECRET_ACCESS_KEY"] = secret_key
                except Exception:
                    log.warning(
                        f"Cannot set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment "
                        f"variables for tron action {self.get_instance()} of service "
                        f"{self.get_service()} via credentail file. Traceback:\n"
                        f"{traceback.format_exc()}")
            if "AWS_DEFAULT_REGION" not in env:
                env["AWS_DEFAULT_REGION"] = DEFAULT_AWS_REGION

        return env
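
In Example No. 4, get_env packs the whole Spark configuration into a single SPARK_OPTS environment variable via stringify_spark_env. As a rough illustration only (an assumption about the output format, not the library's actual implementation), such a stringification commonly joins each entry into --conf key=value pairs:

def stringify_spark_env_sketch(spark_config_dict):
    # Hypothetical stand-in for stringify_spark_env: join every config entry
    # as a --conf flag, the form spark-submit accepts on the command line.
    return " ".join(
        f"--conf {key}={value}" for key, value in spark_config_dict.items()
    )

# {"spark.app.name": "demo", "spark.ui.port": "4040"}
# -> "--conf spark.app.name=demo --conf spark.ui.port=4040"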
Example No. 5
    def get_spark_config_dict(self):
        spark_config_dict = getattr(self, "_spark_config_dict", None)
        # Cache the created dict so we don't need to process it multiple
        # times and end up with inconsistent results.
        if spark_config_dict is not None:
            return spark_config_dict

        if self.get_spark_cluster_manager() == "mesos":
            mesos_leader = (
                f"zk://{load_system_paasta_config().get_zk_hosts()}"
                if not self.for_validation else "N/A")
        else:
            mesos_leader = None

        aws_creds = get_aws_credentials(
            aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml"))
        self._spark_config_dict = get_spark_conf(
            cluster_manager=self.get_spark_cluster_manager(),
            spark_app_base_name=f"tron_spark_{self.get_service()}_{self.get_instance()}",
            user_spark_opts=self.config_dict.get("spark_args", {}),
            paasta_cluster=self.get_spark_paasta_cluster(),
            paasta_pool=self.get_spark_paasta_pool(),
            paasta_service=self.get_service(),
            paasta_instance=self.get_instance(),
            docker_img=self.get_docker_url(),
            aws_creds=aws_creds,
            extra_volumes=self.get_volumes(
                load_system_paasta_config().get_volumes()),
            # tron uses environment variables to load the required creds
            with_secret=False,
            mesos_leader=mesos_leader,
            # load_system_paasta_config already loads the default volumes
            load_paasta_default_volumes=False,
        )
        return self._spark_config_dict
Example No. 6
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=system_paasta_config.get_cluster_aliases().get(
                args.cluster, args.cluster
            ),
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
                "You can generate this by running:"
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    pod_template_path = generate_pod_template_path()
    args.enable_compact_bin_packing = should_enable_compact_bin_packing(
        args.disable_compact_bin_packing, args.cluster_manager
    )

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())

    if args.enable_compact_bin_packing:
        document = POD_TEMPLATE.format(
            spark_pod_label=limit_size_with_hash(f"exec-{app_base_name}"),
        )
        parsed_pod_template = yaml.safe_load(document)
        with open(pod_template_path, "w") as f:
            yaml.dump(parsed_pod_template, f)

    needs_docker_cfg = not args.build
    user_spark_opts = _parse_user_spark_args(
        args.spark_args, pod_template_path, args.enable_compact_bin_packing
    )

    args.cmd = _auto_add_timeout_for_job(args.cmd, args.timeout_job_runtime)

    # This is required if configs are provided as part of `spark-submit`;
    # the other way to provide them is with --spark-args.
    sub_cmds = args.cmd.split(" ")  # e.g. ["spark.driver.memory=10g", ...]
    for cmd in sub_cmds:
        if cmd.startswith("spark.driver.memory") or cmd.startswith(
            "spark.driver.cores"
        ):
            key, value = cmd.split("=")
            user_spark_opts[key] = value

    paasta_instance = get_smart_paasta_instance_name(args)
    auto_set_temporary_credentials_provider = (
        args.disable_temporary_credentials_provider is False
    )
    spark_conf = get_spark_conf(
        cluster_manager=args.cluster_manager,
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
        auto_set_temporary_credentials_provider=auto_set_temporary_credentials_provider,
    )
    # Experimental: TODO: Move to service_configuration_lib once confirmed that there are no issues
    # Enable AQE: Adaptive Query Execution
    if "spark.sql.adaptive.enabled" not in spark_conf:
        spark_conf["spark.sql.adaptive.enabled"] = "true"
        aqe_msg = "Spark's Adaptive Query Execution (AQE) performance feature is enabled. Set spark.sql.adaptive.enabled to false to disable it."
        log.info(aqe_msg)
        print(PaastaColors.blue(aqe_msg))
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
        cluster_manager=args.cluster_manager,
        pod_template_path=pod_template_path,
    )
Example No. 7
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []},
                                                  "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
                "You can generate this by running:"
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {
                    "soa_dir": args.yelpsoa_config_root,
                    "service": args.service
                }),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)
    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
Example No. 8
 def test_aws_credentials_yaml(self, tmpdir, aws_creds):
     fp = tmpdir.join('test.yaml')
     fp.write(yaml.dump(aws_creds))
     expected_output = self.expected_temp_creds if aws_creds == self.temp_creds else self.expected_creds
     assert spark_config.get_aws_credentials(
         aws_credentials_yaml=str(fp)) == expected_output
Example No. 9
 def test_no_aws_creds(self):
     assert spark_config.get_aws_credentials(
         no_aws_credentials=True) == (None, None, None)
Example No. 10
 def test_fail(self, tmpdir):
     fp = tmpdir.join('test.yaml')
     fp.write('not yaml file')
     with pytest.raises(ValueError):
         spark_config.get_aws_credentials(aws_credentials_yaml=str(fp))
Example No. 11
 def test_use_profile(self, mock_session):
     assert spark_config.get_aws_credentials(
         profile_name='test_profile') == self.expected_temp_creds
Example No. 12
 def test_use_session(self, mock_session):
     assert spark_config.get_aws_credentials(
         session=mock_session) == self.expected_temp_creds
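
Several of the tests above (Examples No. 3, 11 and 12) rely on a mock_session fixture and pre-computed expected credentials that are not shown here. Below is a minimal sketch of how such a fixture could be built, assuming the session mimics boto3's Session.get_credentials() interface; the attribute values are placeholders, and the real suite's fixture may differ.

from unittest import mock

import pytest


@pytest.fixture
def mock_session():
    # Fake boto3-style session: get_credentials() returns an object exposing
    # access_key, secret_key and token, like botocore's Credentials object.
    session = mock.Mock()
    session.get_credentials.return_value = mock.Mock(
        access_key='temp_access_key',
        secret_key='temp_secret_key',
        token='temp_token',
    )
    return session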