Example #1
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_service_name, jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker', json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    # only attach a JAAS config when Kafka is Kerberized; upload the default one if none was supplied
    if kerberized:
        if jaas_uri is None:
            jaas_uri = upload_jaas()
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = get_kerberized_kafka_spark_conf(spark_service_name, keytab_secret)

    producer_config = ["--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
                       "--class", "KafkaFeeder"] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = ["--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
                       "--class", "KafkaConsumer"] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        service_name=spark_service_name,
                        args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_service_name)
Example #2
def test_executor_gpus_exceeds_available_gpus():
    """
    Checks: if executor.gpus exceeds the available gpus, the job never runs.
    """
    num_executors = 2
    executor_gpus = 2
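    # request more GPUs per executor than the cluster can offer, so the driver should never complete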
    driver_task_id = _submit_gpu_app(num_executors=num_executors,
                                     executor_gpus=executor_gpus,
                                     gpus_max=num_executors * executor_gpus)
    try:
        log.info("Waiting for job to complete.")
        shakedown.wait_for_task_completion(driver_task_id, timeout_sec=240)
    except TimeoutExpired:
        log.info("Job failed to complete, as expected.")
        spark_utils.kill_driver(driver_task_id, spark_utils.SPARK_APP_NAME)
        return

    pytest.fail("Did not expect this job to complete.")
Example #3
def _verify_submission_rejected(service_name, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    submission_id = None
    error = None
    try:
        submission_id = utils.submit_job(service_name=service_name,
                                         app_url=utils.dcos_test_jar_url(),
                                         driver_role=driver_role,
                                         app_args="1 300",
                                         args=submit_args)
    except Exception as err:
        error = err
    finally:
        if submission_id:
            utils.kill_driver(submission_id, service_name=service_name)

    assert error is not None, "Expected the submission to be rejected"
Example #4
def _submit_job_and_verify_role(service_name, expected_role, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    submission_id = utils.submit_job(service_name=service_name,
                                     app_url=utils.dcos_test_jar_url(),
                                     app_args="1 300",
                                     driver_role=driver_role,
                                     args=submit_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_framework = dcos_utils.get_framework_json(app_name,
                                                         completed=False)
        log.info("Driver framework:\n{}".format(driver_framework))
        assert expected_role == driver_framework["role"], \
            "Expected role '{}' but got '{}'".format(expected_role, driver_framework["role"])
    finally:
        # always clean up the driver and let assertion failures propagate
        log.info(f"Cleaning up. Attempting to kill driver: {submission_id}")
        utils.kill_driver(submission_id, service_name=service_name)
Example #5
def test_supervise_conflict_frameworkid():
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class", "MockTaskRunner", "--conf",
        "spark.cores.max=1", "--conf", "spark.executors.cores=1"
    ]

    driver_id = None
    try:
        driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                     app_args="1 1800",
                                     service_name=utils.SPARK_SERVICE_NAME,
                                     args=job_args)
        log.info("Started supervised driver {}".format(driver_id))

        wait_job_present(True)
        log.info("Job has registered")

        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(
            service_info['id'])
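        # kill the driver JVM on its host; --supervise should relaunch it under a new framework Id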
        sdk_cmd.kill_task_with_pattern(driver_regex,
                                       service_info['hostname'])

        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info['id'], \
            "Job has restarted with the same framework Id"
    finally:
        if driver_id:
            kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
            log.info("{}".format(kill_info))
            assert json.loads(kill_info)["success"], "Failed to kill spark job"
            wait_job_present(False)
Example #6
def test_shuffle_job(submit_args=None,
                     use_ucr_for_spark_submit=True,
                     use_cli_for_spark_submit=True,
                     check_network_labels=False):
    # avoid a mutable default argument; normalize to a fresh list
    submit_args = submit_args or []

    if not use_ucr_for_spark_submit:
        # without UCR, fall back to the Docker containerizer and run executor containers as user 99
        submit_args = submit_args + [
            "--conf spark.mesos.containerizer=docker",
            "--conf spark.mesos.executor.docker.parameters=user=99",
        ]

    driver_task_id = _submit_shuffle_job(use_cli=use_cli_for_spark_submit,
                                         sleep=300,
                                         extra_args=submit_args)

    sdk_tasks.check_running(SHUFFLE_JOB_FW_NAME,
                            SHUFFLE_JOB_NUM_EXECUTORS,
                            timeout_seconds=600)
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    _check_task_network(driver_task, is_ucr=use_ucr_for_spark_submit)

    if check_network_labels and use_ucr_for_spark_submit:
        _check_task_network_labels(driver_task)

    executor_tasks = shakedown.get_service_tasks(SHUFFLE_JOB_FW_NAME)
    for task in executor_tasks:
        _check_task_network(task, is_ucr=use_ucr_for_spark_submit)
        if check_network_labels and use_ucr_for_spark_submit:
            _check_task_network_labels(task)

    try:
        utils.wait_for_running_job_output(
            driver_task_id,
            "Groups count: {}".format(SHUFFLE_JOB_EXPECTED_GROUPS_COUNT))
    finally:
        log.info("Cleaning up. Attempting to kill driver: {}".format(
            driver_task_id))
        utils.kill_driver(driver_task_id,
                          service_name=CNI_DISPATCHER_SERVICE_NAME)
Example #7
def _submit_job_and_verify_users(user, use_ucr_for_spark_submit, extra_args=None):
    app_name = "MockTaskRunner"

    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)] + (extra_args or [])

    driver_task_id = utils.submit_job(service_name=SERVICE_NAME,
                                      app_url=utils.dcos_test_jar_url(),
                                      app_args="1 300",
                                      args=submit_args)
    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_task = shakedown.get_task(driver_task_id, completed=False)
        executor_tasks = shakedown.get_service_tasks(app_name)

        for task in [driver_task] + executor_tasks:
            log.info(f"Checking task '{task['id']}'")
            _check_task_user(task, user, use_ucr_for_spark_submit)

    finally:
        log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
        utils.kill_driver(driver_task_id, service_name=SERVICE_NAME)
Example #8
def test_supervise():
    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
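    # kill the driver process on its host; with --supervise it should be relaunched and re-register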
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
Example #9
def _kill_driver_task(driver_task_id):
    log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
    utils.kill_driver(driver_task_id)
Example #10
def test_pipeline(kerberos_flag,
                  stop_count,
                  jar_uri,
                  keytab_secret,
                  spark_app_name,
                  jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = _kafka_broker_dns()
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris=http://norvig.com/big.txt"

    # only attach a JAAS config when Kafka is Kerberized; upload the bundled one if none was supplied
    if kerberized:
        if jaas_uri is None:
            jaas_path = os.path.join(THIS_DIR, "resources",
                                     "spark-kafka-client-jaas.conf")
            s3.upload_file(jaas_path)
            jaas_uri = s3.s3_http_url("spark-kafka-client-jaas.conf")
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ]

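    # confs that mount the keytab secret into the driver and executors and point them at the JAAS/krb5 configs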
    kerberos_args = [
        "--conf",
        "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.task.labels=DCOS_SPACE:{}".format(utils.SPARK_APP_NAME),
        "--conf",
        "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
        "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf",
        "spark.executor.extraJavaOptions="
        "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = [
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder"
    ] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   app_name=spark_app_name,
                                   args=producer_config)

    shakedown.wait_for(lambda: _producer_launched(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    shakedown.wait_for(lambda: utils.is_service_ready(KAFKA_SERVICE_NAME, 1),
                       ignore_exceptions=False,
                       timeout_seconds=600)

    consumer_config = [
        "--conf", "spark.cores.max=4", "--class", "KafkaConsumer"
    ] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        app_name=spark_app_name,
                        args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_app_name)
Example #11
def test_supervise():
    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs:///users/alice"
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    host = shakedown.get_service(JOB_SERVICE_NAME).dict()["hostname"]
    id = shakedown.get_service(JOB_SERVICE_NAME).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Example #12
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])

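    # find the driver's PID(s) on the agent and kill them to simulate a crash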
    status, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)

    pids = [p.strip().split()[1] for p in stdout.splitlines()]

    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)

        if status:
            log.info("Killed pid: {}".format(pid))
        else:
            log.warning("Unable to kill pid: {}".format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)