Example #1
def test_deploy():
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # sdk_marathon.json.mustache. verify that tasks are failing for 30s before continuing.
    log.info('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # we can get brief blips of TASK_RUNNING but they shouldn't last more than 2-3s:
    consecutive_task_running = 0
    def fn():
        nonlocal consecutive_task_running
        svc_tasks = shakedown.get_service_tasks(config.SERVICE_NAME)
        states = [t['state'] for t in svc_tasks]
        log.info('Task states: {}'.format(states))
        if 'TASK_RUNNING' in states:
            consecutive_task_running += 1
            assert consecutive_task_running <= 3
        else:
            consecutive_task_running = 0
        return False

    try:
        shakedown.wait_for(lambda: fn(), timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        log.info('Timeout reached as expected')

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    env = marathon_config['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    config.check_running()
Example #2
def run_job(job_dict, timeout_seconds=600, raise_on_failure=True):
    job_name = job_dict['id']

    sdk_cmd.run_cli('job run {}'.format(job_name))

    def wait_for_run_id():
        runs = json.loads(sdk_cmd.run_cli('job show runs {} --json'.format(job_name)))
        if len(runs) > 0:
            return runs[0]['id']
        return ''
    run_id = shakedown.wait_for(wait_for_run_id, noisy=True, timeout_seconds=timeout_seconds, ignore_exceptions=False)

    def fun():
        # catch errors from CLI: ensure that the only error raised is our own:
        try:
            runs = json.loads(sdk_cmd.run_cli(
                'job history --show-failures --json {}'.format(job_name), print_output=False))
        except:
            log.info(traceback.format_exc())
            return False

        successful_ids = [r['id'] for r in runs['history']['successfulFinishedRuns']]
        failed_ids = [r['id'] for r in runs['history']['failedFinishedRuns']]

        log.info('Job {} run history (waiting for successful {}): successful={} failed={}'.format(
            job_name, run_id, successful_ids, failed_ids))
        # note: if a job has restart.policy=ON_FAILURE, it won't show up in failed_ids if it fails
        if raise_on_failure and run_id in failed_ids:
            raise Exception('Job {} with id {} has failed, exiting early'.format(job_name, run_id))
        return run_id in successful_ids
    shakedown.wait_for(fun, noisy=True, timeout_seconds=timeout_seconds, ignore_exceptions=False)

    return run_id
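
A usage sketch for the helper above (hedged: the job definition is hypothetical, and only its 'id' field is assumed, since that is all run_job reads; `log` is the same module-level logger used in the example):

# Hypothetical caller of run_job(): trigger a run of an already-added Metronome job
# and wait for that run to finish successfully.
job = {'id': 'example.hello-job'}  # assumed minimal job dict; run_job only reads 'id'
run_id = run_job(job, timeout_seconds=600, raise_on_failure=True)
log.info('Run {} of job {} finished successfully'.format(run_id, job['id']))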
Example #3
def wait_for_any_metrics(package_name, service_name, task_name, timeout):
    def metrics_exist():
        log.info("verifying metrics exist for {}".format(service_name))
        service_metrics = get_metrics(package_name, service_name, task_name)
        # there are 2 generic metrics that are always emitted
        return len(service_metrics) > 2

    shakedown.wait_for(metrics_exist, timeout)
Example #4
def write_some_data(data_node_name, file_name):
    def write_data_to_hdfs():
        write_command = "echo '{}' | ./bin/hdfs dfs -put - /{}".format(
            TEST_CONTENT_SMALL, file_name)
        rc, _ = run_hdfs_command(data_node_name, write_command)
        # rc is True when the underlying command exited successfully (status 0)
        return rc

    shakedown.wait_for(lambda: write_data_to_hdfs(),
                       timeout_seconds=HDFS_CMD_TIMEOUT_SEC)
Example #5
def broker_count_check(count, service_name=SERVICE_NAME):
    def fun():
        try:
            if len(service_cli('broker list', service_name=service_name)) == count:
                return True
        except:
            pass
        return False

    shakedown.wait_for(fun)
Example #6
def broker_count_check(count, service_name=config.SERVICE_NAME):
    def fun():
        try:
            if len(sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'broker list', json=True)) == count:
                return True
        except:
            pass
        return False

    shakedown.wait_for(fun)
Example #7
def _wait_for_spark(service_name=None):
    def pred():
        dcos_url = dcos.config.get_config_val("core.dcos_url")
        path = "/service{}".format(
            service_name) if service_name else "service/spark"
        spark_url = urllib.parse.urljoin(dcos_url, path)
        status_code = dcos.http.get(spark_url).status_code
        return status_code == 200

    shakedown.wait_for(pred)
Example #8
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service("HdfsWordCount") is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service("HdfsWordCount")
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    driver_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="file:///mnt/mesos/sandbox/",
        app_name="/spark",
        args=[
            "--supervise", "--class",
            "org.apache.spark.examples.streaming.HdfsWordCount", "--conf",
            "spark.cores.max=8", "--conf", "spark.executors.cores=4"
        ])
    LOGGER.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has running executors")

    host = shakedown.get_service("HdfsWordCount").dict()["hostname"]
    id = shakedown.get_service("HdfsWordCount").dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-started")
    out = utils.kill_driver(driver_id, "/spark")
    LOGGER.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Example #9
def test_suppress():
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    suppressed_url = urllib.parse.urljoin(
        dcos_url, 'service/{}/v1/state/properties/suppressed'.format(FOLDERED_SERVICE_NAME))

    def fun():
        response = dcos.http.get(suppressed_url)
        response.raise_for_status()
        return response.text == "true"

    shakedown.wait_for(fun)
Example #10
def wait_for_failover_to_complete(namenode):
    """
    Inspects the name node logs to make sure ZK signals a complete failover.
    The given namenode is the one to become active after the failover is complete.
    """
    def failover_detection():
        status = get_name_node_status(PACKAGE_NAME, namenode)
        return status == "active"

    shakedown.wait_for(lambda: failover_detection(),
                       timeout_seconds=DEFAULT_HDFS_TIMEOUT)
Example #11
def _upgrade_or_downgrade(package_name, to_package_version, service_name,
                          running_task_count, additional_options,
                          timeout_seconds, wait_for_deployment):
    task_ids = sdk_tasks.get_task_ids(service_name, '')
    if shakedown.dcos_version_less_than(
            "1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(package_name,
                            service_name,
                            running_task_count,
                            additional_options=additional_options,
                            package_version=to_package_version,
                            timeout_seconds=timeout_seconds,
                            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush()  # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name, service_name,
                    'update start --package-version={} --options={}'.format(
                        to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name, service_name,
                'update start --package-version={}'.format(to_package_version))

    if wait_for_deployment:
        log.info('Checking that all tasks have restarted')
        sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info(
                "Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name,
                     service_name)
            shakedown.wait_for(lambda: sdk_api.is_suppressed(service_name),
                               noisy=True,
                               timeout_seconds=5 * 60)
Example #12
def _run_job_and_wait(metronome_client, job_name, timeout_seconds):
    metronome_client.run_job(job_name)

    shakedown.wait_for(
        lambda: (
            'Successful runs: 1' in
            _run_cli('job history {}'.format(job_name))
        ),
        timeout_seconds=timeout_seconds,
        ignore_exceptions=False
    )
Example #13
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):
    task_ids = sdk_tasks.get_task_ids(service_name, '')
    if shakedown.dcos_version_less_than("1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush() # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name, service_name,
                    'update start --package-version={} --options={}'.format(to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name, service_name,
                'update start --package-version={}'.format(to_package_version))

    if wait_for_deployment:
        log.info('Checking that all tasks have restarted')
        sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info("Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                     package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)
Example #14
def wait_for_scheduler_counter_value(service_name,
                                     counter_name,
                                     min_value,
                                     timeout_seconds=15 * 60):
    """Waits for the specified counter value to be reached by the scheduler
    For example, check that `offers.processed` is equal or greater to 1."""
    def check_for_value():
        value = get_scheduler_counter(service_name, counter_name,
                                      timeout_seconds)
        return value >= min_value

    shakedown.wait_for(check_for_value, timeout_seconds)
Example #15
def test_suppress():
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    suppressed_url = urllib.parse.urljoin(
        dcos_url, 'service/{}/v1/state/properties/suppressed'.format(
            sdk_utils.get_foldered_name(config.SERVICE_NAME)))

    def fun():
        response = dcos.http.get(suppressed_url)
        response.raise_for_status()
        return response.text == "true"

    shakedown.wait_for(fun)
Example #16
def test_integrity_on_name_node_failure():
    """
    The first name node (name-0-node) is the active name node by default when HDFS gets installed.
    This test checks that it is possible to write and read data after the first name node fails.
    """
    tasks.kill_task_with_pattern("NameNode", 'name-0-node.hdfs.mesos')
    time.sleep(1)  # give NameNode a chance to die

    shakedown.wait_for(lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_2_NAME), HDFS_CMD_TIMEOUT_SEC)

    shakedown.wait_for(lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_2_NAME), HDFS_CMD_TIMEOUT_SEC)

    check_running()
Example #17
def test_integrity_on_name_node_failure():
    """
    The first name node (name-0-node) is the active name node by default when HDFS gets installed.
    This test checks that it is possible to write and read data after the first name node fails.
    """
    tasks.kill_task_with_pattern("NameNode", 'name-0-node.hdfs.mesos')
    time.sleep(1)  # give NameNode a chance to die

    shakedown.wait_for(lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_2_NAME), HDFS_CMD_TIMEOUT_SEC)

    shakedown.wait_for(lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_2_NAME), HDFS_CMD_TIMEOUT_SEC)

    check_healthy()
Example #18
def test_integrity_on_data_node_failure():
    shakedown.wait_for(lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_1_NAME), HDFS_CMD_TIMEOUT_SEC)

    # gives chance for write to succeed and replication to occur
    time.sleep(5)

    tasks.kill_task_with_pattern("DataNode", 'data-0-node.hdfs.mesos')
    tasks.kill_task_with_pattern("DataNode", 'data-1-node.hdfs.mesos')
    time.sleep(1)  # give DataNode a chance to die

    shakedown.wait_for(lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_1_NAME), HDFS_CMD_TIMEOUT_SEC)

    check_healthy()
Example #19
def test_tasks_updated():
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    old_task_ids = tasks.get_task_ids(SERVICE_NAME, 'cockroach')
    for service_ip in service_ips:
        shakedown.kill_process_on_host(
            service_ip, "cockroach start")  # Kill CockroachDB node
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT,
                            5 * 60)  # Wait for new CockroachDB node to run
        shakedown.wait_for(lambda: cockroach_nodes_healthy(),
                           noisy=True,
                           timeout_seconds=5 * 60)  # Wait for healthy CockroachDB cluster
    tasks.check_tasks_updated(SERVICE_NAME, 'cockroach', old_task_ids)
Example #20
def test_integrity_on_data_node_failure():
    shakedown.wait_for(lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_1_NAME), HDFS_CMD_TIMEOUT_SEC)

    # gives chance for write to succeed and replication to occur
    time.sleep(5)

    tasks.kill_task_with_pattern("DataNode", 'data-0-node.hdfs.mesos')
    tasks.kill_task_with_pattern("DataNode", 'data-1-node.hdfs.mesos')
    time.sleep(1)  # give DataNode a chance to die

    shakedown.wait_for(lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_1_NAME), HDFS_CMD_TIMEOUT_SEC)

    check_running()
Example #21
def broker_count_check(count, service_name=config.SERVICE_NAME):
    def fun():
        try:
            if len(
                    sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                    service_name,
                                    'broker list',
                                    json=True)) == count:
                return True
        except:
            pass
        return False

    shakedown.wait_for(fun)
Example #22
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')

    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                             'state refresh_cache')
    assert "Received cmd: refresh" in stdout

    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                            'state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False

    shakedown.wait_for(lambda: check_cache_refresh_fails_409conflict(),
                       timeout_seconds=120.)

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                               'state refresh_cache')

    stdout = shakedown.wait_for(lambda: check_cache_refresh(),
                                timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
Example #23
def test_canary_init():
    def fn():
        # check for empty list internally rather than returning empty list.
        # otherwise shakedown.wait_for() will keep going...
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == []
    assert shakedown.wait_for(fn, noisy=True, timeout_seconds=10 * 60)

    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy', 'WAITING')
    log.info(pl)

    assert pl['status'] == 'WAITING'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
Example #24
def test_canary_init():
    def fn():
        return sdk_cmd.run_cli('hello-world pod list')

    assert json.loads(shakedown.wait_for(fn, noisy=True)) == []

    pl = sdk_plan.wait_for_plan_status(config.PACKAGE_NAME, 'deploy',
                                       'WAITING')
    log.info(pl)

    assert pl['status'] == 'WAITING'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
Example #25
def check_plugin_installed(plugin_name, service_name=SERVICE_NAME):
    curl_api = _curl_api(service_name, "GET")
    def fun():
        result = _get_hosts_with_plugin(curl_api, plugin_name)
        return result is not None and len(result) == DEFAULT_TASK_COUNT

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #26
def check_elasticsearch_index_health(index_name, color, service_name=SERVICE_NAME):
    curl_api = _curl_api(service_name, "GET")
    def fun():
        result = _get_elasticsearch_index_health(curl_api, index_name)
        return result and result["status"] == color

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #27
def install(
        package_name,
        expected_running_tasks,
        service_name=None,
        additional_options={},
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True):
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)

    log.info('Installing {}/{} with options={} version={}'.format(
        package_name, service_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    retried_shakedown_install(
        package_name,
        package_version,
        service_name,
        merged_options,
        timeout_seconds,
        expected_running_tasks)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete state.
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info("Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                     package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)

    log.info('Installed {}/{} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))
Example #28
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    check_running(FOLDERED_SERVICE_NAME)
    task_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, '')

    # caching enabled by default:
    stdout = cmd.run_cli('hello-world --name={} state refresh_cache'.format(
        FOLDERED_SERVICE_NAME))
    assert "Received cmd: refresh" in stdout

    config = marathon.get_config(FOLDERED_SERVICE_NAME)
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    marathon.update_app(FOLDERED_SERVICE_NAME, config)

    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    check_running(FOLDERED_SERVICE_NAME)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            cmd.run_cli('hello-world --name={} state refresh_cache'.format(
                FOLDERED_SERVICE_NAME))
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False

    shakedown.wait_for(lambda: check_cache_refresh_fails_409conflict(),
                       timeout_seconds=120.)

    config = marathon.get_config(FOLDERED_SERVICE_NAME)
    del config['env']['DISABLE_STATE_CACHE']
    marathon.update_app(FOLDERED_SERVICE_NAME, config)

    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    check_running(FOLDERED_SERVICE_NAME)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return cmd.run_cli('hello-world --name={} state refresh_cache'.format(
            FOLDERED_SERVICE_NAME))

    stdout = shakedown.wait_for(lambda: check_cache_refresh(),
                                timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
Example #29
def check_kibana_adminrouter_integration(path):
    curl_cmd = "curl -I -k -H \"Authorization: token={}\" -s {}/{}".format(
        DCOS_TOKEN, shakedown.dcos_url().rstrip('/'), path.lstrip('/'))
    def fun():
        exit_status, output = shakedown.run_command_on_master(curl_cmd)
        return output and "HTTP/1.1 200" in output

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_KIBANA_TIMEOUT, noisy=True)
Example #30
def check_plugin_installed(plugin_name, service_name=PACKAGE_NAME):
    curl_api = _curl_api(service_name, "GET")

    def fun():
        result = _get_hosts_with_plugin(curl_api, plugin_name)
        return result is not None and len(result) == DEFAULT_TASK_COUNT

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #31
def check_plugin_uninstalled(plugin_name, service_name=PACKAGE_NAME):
    curl_api = _curl_api(service_name, "GET")

    def fun():
        result = _get_hosts_with_plugin(curl_api, plugin_name)
        return result is not None and result == []

    return shakedown.wait_for(fun, timeout_seconds=WAIT_TIME_IN_SECONDS)
Example #32
def get_plan(service_name, plan):
    def fn():
        output = sdk_api.get(service_name, '/v1/plans/{}'.format(plan))
        try:
            return output.json()
        except:
            return False
    return shakedown.wait_for(fn)
Example #33
def get_plan(service_name, plan):
    def fn():
        output = sdk_api.get(service_name, '/v1/plans/{}'.format(plan))
        try:
            return output.json()
        except:
            return False
    return shakedown.wait_for(fn)
Example #34
def check_elasticsearch_index_health(index_name, color, service_name=SERVICE_NAME):
    curl_api = _curl_api(service_name, "GET")

    def fun():
        result = _get_elasticsearch_index_health(curl_api, index_name)
        return result and result["status"] == color

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #35
def check_plugin_uninstalled(plugin_name, service_name=SERVICE_NAME):
    curl_api = _curl_api(service_name, "GET")

    def fun():
        result = _get_hosts_with_plugin(curl_api, plugin_name)
        return result is not None and result == []

    return shakedown.wait_for(fun, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #36
def check_tasks_updated(service_name,
                        prefix,
                        old_task_ids,
                        timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    # TODO: strongly consider merging the use of checking that tasks have been replaced (this method)
    # and checking that the deploy/upgrade/repair plan has completed. Each serves a part in the bigger
    # atomic test, that the plan completed properly where properly includes that no old tasks remain.
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            log.info(
                'Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        prefix_clause = ''
        if prefix:
            prefix_clause = ' starting with "{}"'.format(prefix)

        old_set = set(old_task_ids)
        new_set = set(task_ids)
        newly_launched_set = new_set.difference(old_set)
        old_remaining_set = old_set.intersection(new_set)
        # Whether the old and new task counts should match is really determined by completion of the
        # deploy/recovery/whatever plan, not by task cardinality alone. However, some callers of this
        # method don't consult the plan, so this check is not definitive and will fail when the
        # finished state of a plan yields more or fewer tasks per pod.
        all_updated = len(newly_launched_set) == len(new_set) and len(
            old_remaining_set) == 0 and len(new_set) >= len(old_set)
        if all_updated:
            log.info(
                'All of the tasks{} have updated\n- Old tasks: {}\n- New tasks: {}'
                .format(prefix_clause, old_set, new_set))
            return all_updated

        # forgive the language a bit, but len('remained') == len('launched'),
        # and similar for the rest of the label for task ids in the log line,
        # so makes for easier reading
        log.info(
            'Waiting for tasks{} to have updated ids:\n- Old tasks (remaining): {}\n- New tasks (launched): {}'
            .format(prefix_clause, old_remaining_set, newly_launched_set))

    shakedown.wait_for(lambda: fn(),
                       noisy=True,
                       timeout_seconds=timeout_seconds)
Example #37
def test_node_replace_replaces_node():
    pod_to_replace = 'node-2'
    pod_host = get_pod_host(pod_to_replace)
    utils.out('avoid host for pod {}: {}'.format(pod_to_replace, pod_host))

    # Update the placement constraints so the new node doesn't end up on the same host
    config = marathon.get_config(PACKAGE_NAME)
    config['env']['PLACEMENT_CONSTRAINT'] = 'hostname:UNLIKE:{}'.format(
        pod_host)
    marathon.update_app(PACKAGE_NAME, config)

    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # start replace and wait for it to finish
    cmd.run_cli('cassandra pods replace {}'.format(pod_to_replace))
    plan.wait_for_completed_recovery(PACKAGE_NAME)

    # get an exact task id to run 'task exec' against... just in case there are multiple cassandras
    pod_statuses = json.loads(cmd.run_cli('cassandra pods status node-0'))
    task_id = [
        task['id'] for task in pod_statuses if task['name'] == 'node-0-server'
    ][0]

    # wait for 'nodetool status' to reflect the replacement:
    def fun():
        stdout = cmd.run_cli(
            'task exec {} /bin/bash -c "JAVA_HOME=$(ls -d jre*/) apache-cassandra-*/bin/nodetool -p 7199 status"'
            .format(task_id))
        up_ips = []
        for line in stdout.split('\n'):
            words = list(filter(None, line.split()))
            if len(words) < 2:
                continue
            if not 'UN' == words[0]:
                continue
            up_ips.append(words[1])
        utils.out('UN nodes (want {} entries without {}): {}'.format(
            DEFAULT_TASK_COUNT, pod_host, up_ips))
        return len(up_ips) == DEFAULT_TASK_COUNT and pod_host not in up_ips

    # observed to take 2-3mins in practice:
    shakedown.wait_for(lambda: fun(),
                       timeout_seconds=600,
                       sleep_seconds=15,
                       noisy=True)
Example #38
def test_lock():
    '''This test verifies that a second scheduler fails to start up when
    an existing scheduler is running.  Without locking, the scheduler
    would fail during registration, but after writing its config to ZK.
    So in order to verify that the scheduler fails immediately, we ensure
    that the ZK config state is unmodified.'''

    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(foldered_name)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app = marathon_client.get_app(foldered_name)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    original_labels = labels.copy()
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(foldered_name, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(foldered_name, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(foldered_name).get(
            "lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp

    shakedown.wait_for(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new

    # In order to prevent the second scheduler instance from obtaining a lock, we undo the "scale-up" operation
    marathon_client.update_app(foldered_name, {
        "labels": original_labels,
        "instances": 1
    },
                               force=True)
    shakedown.deployment_wait()
Example #39
def install(
        package_name,
        service_name,
        expected_running_tasks,
        additional_options={},
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True):
    start = time.time()
    merged_options = get_package_options(additional_options)

    log.info('Installing {}/{} with options={} version={}'.format(
        package_name, service_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    retried_shakedown_install(
        package_name,
        service_name,
        package_version,
        merged_options,
        timeout_seconds,
        expected_running_tasks)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete state.
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info("Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                     package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)

    log.info('Installed {}/{} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))
Example #40
def get_name_node_status(service_name, name_node):
    def get_status():
        rc, output = run_hdfs_command(service_name, "./bin/hdfs haadmin -getServiceState {}".format(name_node))
        if not rc:
            return rc

        return output.strip()

    return shakedown.wait_for(lambda: get_status(), timeout_seconds=DEFAULT_HDFS_TIMEOUT)
Example #41
def get_plan(service_name, plan, timeout_seconds=TIMEOUT_SECONDS):
    def fn():
        output = sdk_api.get(service_name, '/v1/plans/{}'.format(plan))
        try:
            return output.json()
        except:
            return False

    return shakedown.wait_for(fn, noisy=True, timeout_seconds=timeout_seconds)
Example #42
def check_task_relaunched(task_name,
                          old_task_id,
                          timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    def fn():
        try:
            task_ids = set([
                t['id'] for t in shakedown.get_tasks(completed=True)
                if t['name'] == task_name
            ])
        except dcos.errors.DCOSHTTPException:
            log.info(
                'Failed to get task ids for task {}'.format(task_name))
            task_ids = set([])

        return len(task_ids) > 0 and (old_task_id not in task_ids
                                      or len(task_ids) > 1)

    shakedown.wait_for(fn, noisy=True, timeout_seconds=timeout_seconds)
Example #43
def get_name_node_status(svc_name, name_node):
    def get_status():
        rc, output = run_hdfs_command(svc_name, "./bin/hdfs haadmin -getServiceState {}".format(name_node))
        if not rc:
            return rc

        return output.strip()

    return shakedown.wait_for(lambda: get_status(), timeout_seconds=DEFAULT_HDFS_TIMEOUT)
Example #44
def kill_task_with_pattern(pattern, agent_host=None, timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    exit_status = 0
    def fn():
        # bind the enclosing exit_status so the final check below sees the last result
        nonlocal exit_status
        command = (
            "sudo kill -9 "
            "$(ps ax | grep {} | grep -v grep | tr -s ' ' | sed 's/^ *//g' | "
            "cut -d ' ' -f 1)".format(pattern))
        if agent_host is None:
            exit_status, _ = shakedown.run_command_on_master(command)
        else:
            exit_status, _ = shakedown.run_command_on_agent(agent_host, command)

        return exit_status

    # might not be able to connect to the agent on first try so we repeat until we can
    shakedown.wait_for(lambda: fn(), noisy=True, timeout_seconds=timeout_seconds)

    if exit_status != 0:
        raise RuntimeError('Failed to kill task with pattern "{}", exit status: {}'.format(pattern, exit_status))
Example #45
def check_elasticsearch_index_health(index_name,
                                     color,
                                     service_name=PACKAGE_NAME):
    curl_api = _curl_api(service_name, "GET")

    def fun():
        result = _get_elasticsearch_index_health(curl_api, index_name)
        return result and result["status"] == color

    return shakedown.wait_for(fun, timeout_seconds=WAIT_TIME_IN_SECONDS)
Example #46
def test_indexing(default_populated_index):
    def fun():
        indices_stats = config.get_elasticsearch_indices_stats(config.DEFAULT_INDEX_NAME)
        observed_count = indices_stats["_all"]["primaries"]["docs"]["count"]
        assert observed_count == 1, "Indices has incorrect count: should be 1, got {}".format(observed_count)
        doc = config.get_document(config.DEFAULT_INDEX_NAME, config.DEFAULT_INDEX_TYPE, 1)
        observed_name = doc["_source"]["name"]
        return observed_name == "Loren"

    return shakedown.wait_for(fun, timeout_seconds=config.DEFAULT_ELASTIC_TIMEOUT)
Example #47
def test_toxic_sidecar_doesnt_trigger_recovery():
    # 1. Run the toxic sidecar plan that will never succeed.
    # 2. Restart the scheduler.
    # 3. Verify that its recovery plan is empty, as a failed FINISHED task should
    # never trigger recovery
    recovery_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert(len(recovery_plan['phases']) == 0)
    log.info(recovery_plan)
    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    shakedown.wait_for(ToxicSidecarCheck(), timeout_seconds=10 * 60)

    # Restart the scheduler and wait for it to come up.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # Now, verify that its recovery plan is empty.
    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'recovery')
    recovery_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert(len(recovery_plan['phases']) == 0)
Example #48
def kill_task_with_pattern(pattern, agent_host=None, timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    exit_status = 0
    def fn():
        # bind the enclosing exit_status so the final check below sees the last result
        nonlocal exit_status
        command = (
            "sudo kill -9 "
            "$(ps ax | grep {} | grep -v grep | tr -s ' ' | sed 's/^ *//g' | "
            "cut -d ' ' -f 1)".format(pattern))
        if agent_host is None:
            exit_status, _ = shakedown.run_command_on_master(command)
        else:
            exit_status, _ = shakedown.run_command_on_agent(agent_host, command)

        return exit_status

    # might not be able to connect to the agent on first try so we repeat until we can
    shakedown.wait_for(lambda: fn(), noisy=True, timeout_seconds=timeout_seconds)

    if exit_status != 0:
        raise RuntimeError('Failed to kill task with pattern "{}", exit status: {}'.format(pattern, exit_status))
Example #49
def wait_for_expected_nodes_to_exist(service_name=SERVICE_NAME, task_count=DEFAULT_TASK_COUNT):
    curl_api = _curl_api(service_name, "GET")
    def expected_nodes():
        result = _get_elasticsearch_cluster_health(curl_api)
        if result is None:
            return False
        node_count = result["number_of_nodes"]
        log.info('Waiting for {} healthy nodes, got {}'.format(task_count, node_count))
        return node_count == task_count

    return shakedown.wait_for(expected_nodes, timeout_seconds=DEFAULT_ELASTIC_TIMEOUT)
Example #50
def wait_for_phase_status(service_name, plan_name, phase_name, status, timeout_seconds=TIMEOUT_SECONDS):
    def fn():
        plan = get_plan(service_name, plan_name)
        phase = get_phase(plan, phase_name)
        log.info('Waiting for {}.{} phase to have {} status:\n{}'.format(
            plan_name, phase_name, status, plan_string(plan_name, plan)))
        if phase and phase['status'] == status:
            return plan
        else:
            return False
    return shakedown.wait_for(fn, noisy=True, timeout_seconds=timeout_seconds)
Example #51
def test_state_properties_get():
    # 'suppressed' could be missing if the scheduler recently started, so loop for a bit just in case:
    def check_for_nonempty_properties():
        jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state properties', json=True)
        return len(jsonobj) > 0

    shakedown.wait_for(lambda: check_for_nonempty_properties(), timeout_seconds=30)

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state properties', json=True)
    assert len(jsonobj) == 6
    # alphabetical ordering:
    assert jsonobj[0] == "hello-0-server:task-status"
    assert jsonobj[1] == "hello-1-server:task-status"
    assert jsonobj[2] == "last-completed-update-type"
    assert jsonobj[3] == "suppressed"
    assert jsonobj[4] == "world-0-server:task-status"
    assert jsonobj[5] == "world-1-server:task-status"

    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state property suppressed')
    assert stdout == "true\n"
Example #52
def get_elasticsearch_master(service_name=SERVICE_NAME):
    # re-fetch _curl_api on each attempt, in case the elasticsearch master has moved:
    def get_master():
        exit_status, output = shakedown.run_command_on_master(
            "{}/_cat/master'".format(_curl_api(service_name, "GET")))
        if exit_status and len(output.split()) > 0:
            return output.split()[-1]

        return False

    return shakedown.wait_for(get_master)
Example #53
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    config.check_running(FOLDERED_SERVICE_NAME)
    task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, '')

    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
    assert "Received cmd: refresh" in stdout

    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    config.check_running(FOLDERED_SERVICE_NAME)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False

    shakedown.wait_for(lambda: check_cache_refresh_fails_409conflict(), timeout_seconds=120.)

    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    config.check_running(FOLDERED_SERVICE_NAME)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')

    stdout = shakedown.wait_for(lambda: check_cache_refresh(), timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
Example #54
def test_lock():
    '''This test verifies that a second scheduler fails to start up when
    an existing scheduler is running.  Without locking, the scheduler
    would fail during registration, but after writing its config to ZK.
    So in order to verify that the scheduler fails immediately, we ensure
    that the ZK config state is unmodified.'''

    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(FOLDERED_SERVICE_NAME)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app = marathon_client.get_app(FOLDERED_SERVICE_NAME)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    original_labels = labels.copy()
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(FOLDERED_SERVICE_NAME, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(FOLDERED_SERVICE_NAME, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(FOLDERED_SERVICE_NAME).get("lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp

    shakedown.wait_for(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new

    # In order to prevent the second scheduler instance from obtaining a lock, we undo the "scale-up" operation
    marathon_client.update_app(FOLDERED_SERVICE_NAME, {"labels": original_labels, "instances": 1}, force=True)
    shakedown.deployment_wait()
Example #55
def check_running(service_name, expected_task_count, timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    def fn():
        try:
            tasks = shakedown.get_service_tasks(service_name)
        except dcos.errors.DCOSHTTPException:
            log.info('Failed to get tasks for service {}'.format(service_name))
            tasks = []
        running_task_names = []
        other_tasks = []
        for t in tasks:
            if t['state'] == 'TASK_RUNNING':
                running_task_names.append(t['name'])
            else:
                other_tasks.append('{}={}'.format(t['name'], t['state']))
        log.info('Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
            expected_task_count,
            len(running_task_names), len(tasks),
            sorted(running_task_names),
            sorted(other_tasks)))
        return len(running_task_names) >= expected_task_count

    shakedown.wait_for(lambda: fn(), noisy=True, timeout_seconds=timeout_seconds)
Example #56
def check_tasks_updated(service_name, prefix, old_task_ids, timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            log.info('Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        prefix_clause = ''
        if prefix:
            prefix_clause = ' starting with "{}"'.format(prefix)
        log.info('Waiting for {} tasks{} to have updated ids:\n- Old tasks: {}\n- Current tasks: {}'.format(
            len(old_task_ids), prefix_clause, sorted(old_task_ids), sorted(task_ids)))

        all_updated = True
        for id in task_ids:
            if id in old_task_ids:
                all_updated = False
        if len(task_ids) < len(old_task_ids):
            all_updated = False
        return all_updated

    shakedown.wait_for(lambda: fn(), noisy=True, timeout_seconds=timeout_seconds)
Example #57
def wait_for_plan_status(service_name, plan_name, status, timeout_seconds=TIMEOUT_SECONDS):
    '''Wait for a plan to have one of the specified statuses'''
    if isinstance(status, str):
        statuses = [status, ]
    else:
        statuses = status

    def fn():
        plan = get_plan(service_name, plan_name)
        log.info('Waiting for {} plan to have {} status:\nFound:\n{}'.format(
            plan_name, status, plan_string(plan_name, plan)))
        if plan and plan['status'] in statuses:
            return plan
        else:
            return False
    return shakedown.wait_for(fn, noisy=True, timeout_seconds=timeout_seconds)
Example #58
def get_config(app_name):
    # Be permissive of flakes when fetching the app content:
    def fn():
        return sdk_cmd.request('get', api_url('apps/{}'.format(app_name)), retry=False, log_args=False)
    config = shakedown.wait_for(lambda: fn()).json()['app']

    # The configuration JSON that marathon returns doesn't match the configuration JSON it accepts,
    # so we have to remove some offending fields to make it re-submittable, since it's not possible to
    # submit a partial config with only the desired fields changed.
    if 'uris' in config:
        del config['uris']

    if 'version' in config:
        del config['version']

    return config
Example #59
def test_endpoints_address():
    def fun():
        ret = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
            'endpoints {}'.format(config.DEFAULT_TASK_NAME), json=True)
        if len(ret['address']) == config.DEFAULT_BROKER_COUNT:
            return ret
        return False
    endpoints = shakedown.wait_for(fun)
    # NOTE: do NOT assert an exact len(endpoints) here; keep this check open to extension
    assert len(endpoints['address']) == config.DEFAULT_BROKER_COUNT
    assert len(endpoints['dns']) == config.DEFAULT_BROKER_COUNT
    for i in range(len(endpoints['dns'])):
        assert sdk_hosts.autoip_host(
            FOLDERED_SERVICE_NAME, 'kafka-{}-broker'.format(i)) in endpoints['dns'][i]
    assert endpoints['vip'] == sdk_hosts.vip_host(
        FOLDERED_SERVICE_NAME, 'broker', 9092)
Example #60
def check_plugin_uninstalled(plugin_name):
    return shakedown.wait_for(lambda: plugins_uninstalled_success_predicate(plugin_name),
                              timeout_seconds=WAIT_TIME_IN_SECONDS)
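
All of the examples above share one polling idiom. The sketch below distills it; the wait_for behavior it assumes (a predicate plus optional timeout_seconds, sleep_seconds, noisy, and ignore_exceptions keywords, the predicate's truthy return value passed back to the caller, and shakedown.TimeoutExpired raised on timeout) is inferred from the calls in these examples rather than taken from shakedown's documentation.

import logging

import shakedown

log = logging.getLogger(__name__)


def wait_until_ready(check_ready, timeout_seconds=120):
    """Minimal sketch of the polling pattern used throughout the examples above."""
    def predicate():
        # Return a falsy value to keep polling; any truthy value stops the wait
        # and is passed back to the caller by shakedown.wait_for().
        try:
            return check_ready()
        except Exception:
            # Swallow transient errors so the poll simply retries, mirroring the
            # broad try/except blocks used in several of the examples above.
            log.info('check_ready raised an exception, retrying', exc_info=True)
            return False

    try:
        return shakedown.wait_for(
            predicate,
            timeout_seconds=timeout_seconds,
            sleep_seconds=1,
            noisy=True,
            ignore_exceptions=False)
    except shakedown.TimeoutExpired:
        log.info('Condition not met within {}s'.format(timeout_seconds))
        raise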