Example #1
0
def destroy_app(app_name):
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))
    # Make sure the scheduler has been destroyed

    def fn():
        return shakedown.get_service(app_name) is None
    sdk_spin.time_wait_noisy(lambda: fn())
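Every example in this file polls a condition through spin.time_wait_noisy / sdk_spin.time_wait_noisy. A minimal sketch of that polling pattern, assuming (as the TimeoutExpired handling in Examples #7 and #28 suggests) that the real helper raises shakedown.TimeoutExpired when the condition never becomes truthy:

import time


class TimeoutExpired(Exception):
    # Stand-in for shakedown.TimeoutExpired; an assumption, not the real class.
    pass


def time_wait_sketch(predicate, timeout_seconds=600, interval_seconds=5):
    # Hypothetical stand-in for spin.time_wait_noisy: evaluate the predicate
    # repeatedly, logging each result, until it is truthy or the timeout expires.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        result = predicate()
        print('Polled condition, got: {}'.format(result))
        if result:
            return result
        time.sleep(interval_seconds)
    raise TimeoutExpired('Condition still unmet after {}s'.format(timeout_seconds))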
Example #2
0
def test_joins_overlay_network():
    """Verify that the container joined the dcos subnet at 9.0.0.0/24.

    The logic for this is in the task itself, which will check the container IP address
    and fail if incorrect, thus preventing the plan from reaching the COMPLETE state."""
    spin.time_wait_noisy(lambda: (
        plan.get_deployment_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'))
Example #3
0
def install(package_name, running_task_count, service_name=None, additional_options={}, package_version=None):
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)
    print('Installing {} with options={} version={}'.format(package_name, merged_options, package_version))
    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(package_name, package_version=package_version, options_json=merged_options)
    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)
    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def fn():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set([])
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for d in deployments:
            print("Deployment: {}".format(d))
            for a in d.get('affectedApps', []):
                print("Adding {}".format(a))
                deploying_apps.add(a)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps
    sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=30)
    print('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
Example #4
0
def test_joins_overlay_network():
    """Verify that the container joined the dcos subnet at 9.0.0.0/24.

    The logic for this is in the task itself, which will check the container IP address
    and fail if incorrect, thus preventing the plan from reaching the COMPLETE state."""
    spin.time_wait_noisy(lambda: (
        plan.get_deployment_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'))
Example #5
0
def run_backup_and_restore(backup_plan, restore_plan, plan_parameters):
    # Write data to Cassandra with a metronome job
    launch_and_verify_job(WRITE_DATA_JOB)

    # Verify that the data was written
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3
    plan.start_plan(PACKAGE_NAME, backup_plan, parameters=plan_parameters)
    spin.time_wait_noisy(lambda: (
        plan.get_plan(PACKAGE_NAME, backup_plan).json()['status'] == 'COMPLETE'))

    # Delete all keyspaces and tables with a metronome job
    launch_and_verify_job(DELETE_DATA_JOB)

    # Verify that the keyspaces and tables were deleted
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3
    plan.start_plan(PACKAGE_NAME, restore_plan, parameters=plan_parameters)
    spin.time_wait_noisy(lambda: (
        plan.get_plan(PACKAGE_NAME, restore_plan).json()['status'] == 'COMPLETE'))

    # Verify that the data we wrote and then deleted has been restored
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Delete data in preparation for any other backup tests
    launch_and_verify_job(DELETE_DATA_JOB)
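A hypothetical usage sketch for the helper above, reusing the plan names and parameter dict from the test_backup_and_restore_flow example later in this file ('backup-s3' / 'restore-s3', S3 settings read from the environment):

import os
import uuid

plan_parameters = {
    'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
    'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
    'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
    'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
    'SNAPSHOT_NAME': str(uuid.uuid1()),
    'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
}
# Back up to S3, wipe the keyspaces, then restore and re-verify the data.
run_backup_and_restore('backup-s3', 'restore-s3', plan_parameters)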
Example #6
0
def test_lock():
    '''This test verifies that a second scheduler fails to start up when
    an existing scheduler is running.  Without locking, the scheduler
    would fail during registration, but after writing its config to ZK.
    So in order to verify that the scheduler fails immediately, we ensure
    that the ZK config state is unmodified.'''

    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(app_id).get("lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp
    spin.time_wait_noisy(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new
Example #7
0
def test_deploy():
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # marathon.json.mustache. Verify that tasks are failing for 30s before continuing.
    print('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # we can get brief blips of TASK_RUNNING but they shouldn't last more than 2-3s:
    consecutive_task_running = 0
    def fn():
        nonlocal consecutive_task_running
        svc_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
        states = [t['state'] for t in svc_tasks]
        print('Task states: {}'.format(states))
        if 'TASK_RUNNING' in states:
            consecutive_task_running += 1
            assert consecutive_task_running <= 3
        else:
            consecutive_task_running = 0
        return False

    try:
        spin.time_wait_noisy(lambda: fn(), timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    config = marathon.get_config(PACKAGE_NAME)
    env = config['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    marathon.update_app(PACKAGE_NAME, config)

    check_running()
Example #8
0
def check_running(service_name, expected_task_count, timeout_seconds=-1):
    def fn():
        try:
            tasks = shakedown.get_service_tasks(service_name)
        except dcos.errors.DCOSHTTPException:
            sdk_utils.out(
                'Failed to get tasks for service {}'.format(service_name))
            tasks = []
        running_task_names = []
        other_tasks = []
        for t in tasks:
            if t['state'] == 'TASK_RUNNING':
                running_task_names.append(t['name'])
            else:
                other_tasks.append('{}={}'.format(t['name'], t['state']))
        msg = 'Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
            expected_task_count, len(running_task_names), len(tasks),
            running_task_names, other_tasks)
        sdk_utils.out(msg)
        return len(running_task_names) >= expected_task_count

    if timeout_seconds <= 0:
        sdk_spin.time_wait_noisy(lambda: fn())
    else:
        sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=timeout_seconds)
Example #9
0
def check_tasks_updated(service_name,
                        prefix,
                        old_task_ids,
                        timeout_seconds=-1):
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        print(
            'Waiting for tasks starting with "{}" to be updated:\n- Old tasks: {}\n- Current tasks: {}'
            .format(prefix, old_task_ids, task_ids))
        all_updated = True
        for task_id in task_ids:
            if task_id in old_task_ids:
                all_updated = False
        if len(task_ids) < len(old_task_ids):
            all_updated = False
        return all_updated

    if timeout_seconds <= 0:
        sdk_spin.time_wait_noisy(lambda: fn())
    else:
        sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=timeout_seconds)
Example #10
0
def test_lock():
    '''This test verifies that a second scheduler fails to start up when
    an existing scheduler is running.  Without locking, the scheduler
    would fail during registration, but after writing its config to ZK.
    So in order to verify that the scheduler fails immediately, we ensure
    that the ZK config state is unmodified.'''

    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(app_id).get("lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp

    spin.time_wait_noisy(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new
Example #11
0
def verify_job_succeeded(job_name, run_id):
    # Verify that our most recent run succeeded
    spin.time_wait_noisy(lambda: (run_id in [
        r['id'] for r in json.loads(
            cmd.run_cli('job history --show-failures --json {}'.format(
                job_name)))['history']['successfulFinishedRuns']
    ]))
Example #12
0
def install(
        package_name,
        running_task_count,
        service_name=None,
        additional_options={},
        package_version=None,
        check_suppression=True):
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)

    sdk_utils.out('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))

    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(
        package_name,
        package_version=package_version,
        options_json=merged_options)

    # 2. wait for expected tasks to come up
    sdk_utils.out("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)
    sdk_plan.wait_for_completed_deployment(service_name)

    # 3. check service health
    marathon_client = dcos.marathon.create_client()
    def is_deployment_finished():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set([])
        sdk_utils.out("Getting deployments")
        deployments = marathon_client.get_deployments()
        sdk_utils.out("Found {} deployments".format(len(deployments)))
        for deployment in deployments:
            sdk_utils.out("Deployment: {}".format(deployment))
            for app in deployment.get('affectedApps', []):
                sdk_utils.out("Adding {}".format(app))
                deploying_apps.add(app)
        sdk_utils.out('Checking that deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps
    sdk_utils.out("Waiting for marathon deployment to finish...")
    sdk_spin.time_wait_noisy(is_deployment_finished)

    # 4. Ensure the framework is suppressed.
    #
    # This is only configurable in order to support installs from
    # Universe during the upgrade_downgrade tests, because currently
    # the suppression endpoint isn't supported by all frameworks in
    # Universe.  It can be removed once all frameworks rely on
    # dcos-commons >= 0.13.
    if check_suppression:
        sdk_utils.out("Waiting for framework to be suppressed...")
        sdk_spin.time_wait_noisy(
            lambda: sdk_api.is_suppressed(service_name))

    sdk_utils.out('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
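A hedged sketch of what the sdk_api.is_suppressed check used above might look like, modeled on the /v1/state/properties/suppressed endpoint that the test_suppress example later in this file queries; the real helper may differ:

import urllib.parse

import dcos.config
import dcos.http


def is_suppressed_sketch(service_name):
    # Assumption: the suppression flag is exposed at this service path, as in
    # the test_suppress example below.
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    url = urllib.parse.urljoin(
        dcos_url,
        'service/{}/v1/state/properties/suppressed'.format(service_name))
    response = dcos.http.get(url)
    response.raise_for_status()
    return response.text == "true"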
Example #13
0
def verify_job_finished(job_name, run_id):
    spin.time_wait_noisy(lambda: (run_id in [
        r['id']
        for r in get_runs(job_name)['history']['successfulFinishedRuns']
    ] or run_id in [
        r['id'] for r in get_runs(job_name)['history']['failedFinishedRuns']
    ]))
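The get_runs helper referenced above is not shown in this file. A hedged sketch based on the 'job history --show-failures --json' CLI call that the other job examples parse, reusing their cmd.run_cli wrapper; the real helper may differ:

import json


def get_runs_sketch(job_name):
    # Returns the parsed history document; its 'history' key holds the
    # 'successfulFinishedRuns' / 'failedFinishedRuns' lists checked above.
    return json.loads(cmd.run_cli(
        'job history --show-failures --json {}'.format(job_name)))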
Example #14
0
def check_running(service_name, expected_task_count, timeout_seconds=-1):
    def fn():
        try:
            tasks = shakedown.get_service_tasks(service_name)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get tasks for service {}'.format(service_name))
            tasks = []
        running_task_names = []
        other_tasks = []
        for t in tasks:
            if t['state'] == 'TASK_RUNNING':
                running_task_names.append(t['name'])
            else:
                other_tasks.append('{}={}'.format(t['name'], t['state']))
        print('Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
            expected_task_count,
            len(running_task_names), len(tasks),
            running_task_names,
            other_tasks))
        return len(running_task_names) >= expected_task_count

    if timeout_seconds <= 0:
        sdk_spin.time_wait_noisy(lambda: fn())
    else:
        sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=timeout_seconds)
Example #15
0
def test_backup_and_restore_flow():
    backup_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME',
                                    'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    # Write data to Cassandra with a metronome job
    launch_and_verify_job(WRITE_DATA_JOB)

    # Verify that the data was written
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3
    plan.start_plan(PACKAGE_NAME, 'backup-s3', parameters=backup_parameters)
    spin.time_wait_noisy(lambda: (
        plan.get_plan(PACKAGE_NAME, 'backup-s3').json()['status'] == 'COMPLETE'))

    # Delete all keyspaces and tables with a metronome job
    launch_and_verify_job(DELETE_DATA_JOB)

    # Verify that the keyspaces and tables were deleted
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3
    plan.start_plan(PACKAGE_NAME, 'restore-s3', parameters=backup_parameters)
    spin.time_wait_noisy(lambda: (
        plan.get_plan(PACKAGE_NAME, 'restore-s3').json()['status'] == 'COMPLETE'))

    # Verify that the data we wrote and then deleted has been restored
    launch_and_verify_job(VERIFY_DATA_JOB, expected_successes=2)
Example #16
0
def upgrade_or_downgrade(package_name, running_task_count):
    task_ids = tasks.get_task_ids(package_name, '')
    marathon.destroy_app(package_name)
    install.install(package_name, running_task_count)
    print('Waiting for upgrade / downgrade deployment to complete')
    spin.time_wait_noisy(lambda: (
        plan.get_deployment_plan(package_name).json()['status'] == 'COMPLETE'))
    print('Checking that all tasks have restarted')
    tasks.check_tasks_updated(package_name, '', task_ids)
Example #17
0
def test_task_dns_prefix_points_to_all_tasks():
    pod_info = dcos.http.get(
        shakedown.dcos_service_url(PACKAGE_NAME) +
        "/v1/pods/{}/info".format("hello-0")).json()

    # Assert that DiscoveryInfo is correctly set on tasks.
    assert(all(p["info"]["discovery"]["name"] == "hello-0" for p in pod_info))
    # Assert that the hello-0.hello-world.mesos DNS entry points to the right IP.
    spin.time_wait_noisy(lambda: (
        plan.get_deployment_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'))
Example #18
0
def test_repair_plan_completes():
    repair_parameters = {'CASSANDRA_KEYSPACE': 'testspace1'}

    plan.start_plan(PACKAGE_NAME, 'repair', parameters=repair_parameters)
    spin.time_wait_noisy(
        lambda: (
            plan.get_plan(PACKAGE_NAME, 'repair').json()['status'] ==
            'COMPLETE'
        )
    )
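The plan.start_plan / plan.get_plan helpers used throughout these examples are not defined in this file. A hedged sketch, assuming the scheduler exposes its plans at v1/plans/<plan> behind the service URL (the endpoint layout and helper signatures are assumptions; the real module may differ):

import dcos.http
import shakedown


def start_plan_sketch(service_name, plan_name, parameters=None):
    # Assumption: POSTing to .../v1/plans/<plan>/start kicks off the plan, with
    # any plan parameters sent as the JSON body.
    return dcos.http.post(
        '{}v1/plans/{}/start'.format(
            shakedown.dcos_service_url(service_name), plan_name),
        json=parameters if parameters is not None else {})


def get_plan_sketch(service_name, plan_name):
    # Callers read .json()['status'] from the returned response, as above.
    return dcos.http.get('{}v1/plans/{}'.format(
        shakedown.dcos_service_url(service_name), plan_name))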
Example #19
0
def test_cassandra_migration():
    backup_service_name = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    restore_service_name = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('BACKUP_NODE_ADDRESS',
                                         'node-0.cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('BACKUP_NODE_PORT', '9042'))
    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME',
                                    'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    data_context = DataContext(
        init_jobs=[WRITE_DATA_JOB, VERIFY_DATA_JOB],
        cleanup_jobs=[DELETE_DATA_JOB, VERIFY_DELETION_JOB])
    # Install and run the write/delete data jobs against backup cluster,
    # running dcos-cassandra-service
    with env, JobContext(TEST_JOBS), data_context:
        # Back this cluster up to S3
        backup_parameters = {
            'backup_name':
            plan_parameters['SNAPSHOT_NAME'],
            's3_access_key':
            plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key':
            plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location':
            's3://{}'.format(plan_parameters['S3_BUCKET_NAME']),
        }
        dcos.http.put('{}v1/backup/start'.format(
            shakedown.dcos_service_url(backup_service_name)),
                      json=backup_parameters)
        spin.time_wait_noisy(lambda: get_dcos_cassandra_plan(
            backup_service_name).json()['status'] == 'COMPLETE')

    env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('RESTORE_NODE_ADDRESS',
                                         'node-0-server.sdk-cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('RESTORE_NODE_PORT', '9052'))

    data_context = DataContext(
        cleanup_jobs=[VERIFY_DATA_JOB, DELETE_DATA_JOB, VERIFY_DELETION_JOB])
    with env, JobContext(TEST_JOBS), data_context:
        plan.start_plan(restore_service_name,
                        'restore-s3',
                        parameters=plan_parameters)
        spin.time_wait_noisy(
            lambda: (plan.get_plan(restore_service_name, 'restore-s3')
                     .json()['status'] == 'COMPLETE'))
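The EnvironmentContext helper used above is not defined in this file. A hedged sketch of a context manager that temporarily exports the given variables and restores the previous environment on exit (the real implementation may differ):

import os


class EnvironmentContextSketch(object):
    def __init__(self, **env_vars):
        self.env_vars = env_vars
        self.saved = {}

    def __enter__(self):
        # Remember current values, then export the overrides.
        for name, value in self.env_vars.items():
            self.saved[name] = os.environ.get(name)
            os.environ[name] = value
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore (or remove) every variable that was touched.
        for name, previous in self.saved.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous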
Example #20
0
def test_sidecar():
    plan.start_sidecar_plan(PACKAGE_NAME)
    sidecar_plan = plan.get_sidecar_plan(PACKAGE_NAME).json()
    sdk_utils.out("sidecar_plan: " + str(sidecar_plan))

    assert (len(sidecar_plan['phases']) == 1)
    assert (sidecar_plan['phases'][0]['name'] == 'sidecar-deploy')
    assert (len(sidecar_plan['phases'][0]['steps']) == 2)

    spin.time_wait_noisy(lambda: (
        plan.get_sidecar_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'))
Example #21
0
def test_sidecar():
    plan.start_sidecar_plan(PACKAGE_NAME, {'PLAN_PARAMETER': 'parameterized'})
    sidecar_plan = plan.get_sidecar_plan(PACKAGE_NAME).json()
    print("sidecar_plan: " + str(sidecar_plan))

    assert(len(sidecar_plan['phases']) == 1)
    assert(sidecar_plan['phases'][0]['name'] == 'sidecar-deploy')
    assert(len(sidecar_plan['phases'][0]['steps']) == 2)

    spin.time_wait_noisy(lambda: (
        plan.get_sidecar_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'))
Example #22
0
def test_suppress():
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    suppressed_url = urllib.parse.urljoin(dcos_url,
                                          'service/{}/v1/state/properties/suppressed'.format(PACKAGE_NAME))

    def fun():
        response = dcos.http.get(suppressed_url)
        response.raise_for_status()
        return response.text == "true"

    spin.time_wait_noisy(fun)
Example #23
0
def test_state_properties_get():
    # 'suppressed' could be missing if the scheduler recently started, loop for a bit just in case:
    def check_for_nonempty_properties():
        stdout = cmd.run_cli('hello-world state properties')
        return len(json.loads(stdout)) > 0
    spin.time_wait_noisy(lambda: check_for_nonempty_properties(), timeout_seconds=30.)

    stdout = cmd.run_cli('hello-world state properties')
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 1
    assert jsonobj[0] == "suppressed"

    stdout = cmd.run_cli('hello-world state property suppressed')
    assert stdout == "true\n"
Example #24
0
def launch_and_verify_job(job_name):
    job_name = qualified_job_name(job_name)

    output = cmd.run_cli('job run {}'.format(job_name))
    # Get the id of the run we just initiated
    run_id = json.loads(cmd.run_cli(
        'job show runs {} --json'.format(job_name)))[0]['id']

    # Verify that our most recent run succeeded
    spin.time_wait_noisy(lambda: (run_id in [
        r['id'] for r in json.loads(
            cmd.run_cli('job history --show-failures --json {}'.format(
                job_name)))['history']['successfulFinishedRuns']
    ]))
Example #25
0
def test_state_properties_get():
    # 'suppressed' could be missing if the scheduler recently started, loop for a bit just in case:
    def check_for_nonempty_properties():
        stdout = cmd.run_cli('hello-world state properties')
        return len(json.loads(stdout)) > 0

    spin.time_wait_noisy(lambda: check_for_nonempty_properties(), timeout_seconds=30)

    stdout = cmd.run_cli('hello-world state properties')
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 1
    assert jsonobj[0] == "suppressed"

    stdout = cmd.run_cli('hello-world state property suppressed')
    assert stdout == "true\n"
Example #26
0
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    check_running()
    task_ids = tasks.get_task_ids(PACKAGE_NAME, '')

    # caching enabled by default:
    stdout = cmd.run_cli('hello-world state refresh_cache')
    assert "Received cmd: refresh" in stdout

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            cmd.run_cli('hello-world state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False

    spin.time_wait_noisy(lambda: check_cache_refresh_fails_409conflict(),
                         timeout_seconds=120.)

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    del config['env']['DISABLE_STATE_CACHE']
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return cmd.run_cli('hello-world state refresh_cache')

    stdout = spin.time_wait_return(lambda: check_cache_refresh(),
                                   timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
Example #27
0
def check_tasks_updated(service_name, prefix, old_task_ids):
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        print('Waiting for tasks starting with "{}" to be updated:\n- Old tasks: {}\n- Current tasks: {}'.format(
            prefix, old_task_ids, task_ids))
        all_updated = True
        for task_id in task_ids:
            if task_id in old_task_ids:
                all_updated = False
        if len(task_ids) < len(old_task_ids):
            all_updated = False
        return all_updated

    sdk_spin.time_wait_noisy(lambda: fn())
Example #28
0
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        print('Checking prior tasks starting with "{}" are undisturbed:\n- Old tasks: {}\n- Current tasks: {}'.format(
            prefix, old_task_ids, task_ids))
        for task_id in task_ids:
            if task_id not in old_task_ids:
                return False
        return True

    try:
        sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=60)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')
Example #29
0
def install(package_name,
            running_task_count,
            service_name=None,
            additional_options={},
            package_version=None):
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)
    print('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))
    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(package_name,
                              package_version=package_version,
                              options_json=merged_options)
    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)
    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def fn():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set([])
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for d in deployments:
            print("Deployment: {}".format(d))
            for a in d.get('affectedApps', []):
                print("Adding {}".format(a))
                deploying_apps.add(a)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.
              format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps

    sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=30)
    print('Install done after {}'.format(
        sdk_spin.pretty_time(time.time() - start)))
Example #30
0
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    def fn():
        try:
            task_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            task_ids = []

        print(
            'Checking prior tasks starting with "{}" are undisturbed:\n- Old tasks: {}\n- Current tasks: {}'
            .format(prefix, old_task_ids, task_ids))
        for task_id in task_ids:
            if task_id not in old_task_ids:
                return False
        return True

    try:
        sdk_spin.time_wait_noisy(lambda: fn(), timeout_seconds=60)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')
Example #31
0
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    check_running()
    task_ids = tasks.get_task_ids(PACKAGE_NAME, '')

    # caching enabled by default:
    stdout = cmd.run_cli('hello-world state refresh_cache')
    assert "Received cmd: refresh" in stdout

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            cmd.run_cli('hello-world state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False
    spin.time_wait_noisy(lambda: check_cache_refresh_fails_409conflict(), timeout_seconds=120.)

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    del config['env']['DISABLE_STATE_CACHE']
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return cmd.run_cli('hello-world state refresh_cache')
    stdout = spin.time_wait_return(lambda: check_cache_refresh(), timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
Example #32
0
def test_node_replace_replaces_node():
    tasks = cmd.run_cli('task')
    node_ip = [t for t in tasks.split('\n')
               if t.startswith('node-2-server')].pop().split()[1]

    # Update the placement constraints so the new node doesn't end up on the
    # same host
    config = marathon.get_config(PACKAGE_NAME)
    config['env']['PLACEMENT_CONSTRAINT'] = 'hostname:UNLIKE:{}'.format(
        node_ip)
    marathon.update_app(PACKAGE_NAME, config)

    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # start replace and wait for it to finish
    cmd.run_cli('cassandra pods replace node-2')
    plan.wait_for_completed_recovery(PACKAGE_NAME)

    # Install replace verification job with correct node IP templated
    # (the job checks for that IP's absence in the peers list and also verifies
    # that the expected number of peers is present, meaning that the node was
    # replaced from Cassandra's perspective)
    with JobContext([VERIFY_REPLACE_JOB], NODE_IP=node_ip):
        spin.time_wait_noisy(lambda: try_job(VERIFY_REPLACE_JOB))
Example #33
0
def check_default_version_available(package_name, prev_version):
    def fn():
        return get_pkg_version(package_name) != prev_version
    sdk_spin.time_wait_noisy(lambda: fn())
Example #34
0
def launch_and_verify_job(job_name, expected_successes=1):
    cmd.run_cli('job run {}'.format(qualified_job_name(job_name)))

    spin.time_wait_noisy(lambda: ('Successful runs: {}'.format(
        expected_successes) in cmd.run_cli('job history {}'.format(
            qualified_job_name(job_name)))))
Example #35
0
def new_default_version_available(prev_version, package_name):
    spin.time_wait_noisy(lambda: get_pkg_version(package_name) != prev_version)
Example #36
0
def new_default_version_available(prev_version):
    spin.time_wait_noisy(lambda: get_pkg_version() != prev_version)
Example #37
0
def new_default_version_available(prev_version):
    spin.time_wait_noisy(lambda: get_pkg_version() != prev_version)