Example #1
 def set_ca_cert(self):
     """ If security is permissive or strict, and the API session is not configured with verify=False,
     then the custom CA cert for the desired cluster must be attached to the session, which this method will do
     """
     log.info('Attempt to get CA bundle via Admin Router')
     r = self.get('/ca/dcos-ca.crt', verify=False)
     r.raise_for_status()
     self.session.verify = helpers.session_tempfile(r.content)
def mount_volumes():
    """ Will create 200MB partions on clusters launched by dcos-launch
    """
    script = """
#!/bin/bash
sudo systemctl stop dcos-mesos-slave.service
sudo rm -f /var/lib/dcos/mesos-resources
sudo rm -f /var/lib/mesos/slave/meta/slaves/latest
"""
    for i in range(2):
        script += """
sudo mkdir -p /dcos/volume{idx}
sudo dd if=/dev/zero of=/root/volume{idx}.img bs=1M count={size}
sudo losetup /dev/loop{idx} /root/volume{idx}.img
sudo mkfs -t ext4 /dev/loop{idx}
sudo losetup -d /dev/loop{idx}
echo "/root/volume{idx}.img /dcos/volume{idx} auto loop 0 2" | sudo tee -a /etc/fstab
sudo mount /dcos/volume{idx}
""".format(idx=i, size=200)

    script += """
sudo systemctl restart dcos-mesos-slave.service
"""

    cluster_info_path = os.getenv('CLUSTER_INFO_PATH', 'cluster_info.json')
    if not os.path.exists(cluster_info_path):
        raise Exception('No cluster info to work with!')
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description['masters'][0]['public_ip']) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), 'ssh_key')
        t.copy_file(helpers.session_tempfile(script), 'volume_script.sh')
        t.command(['chmod', '600', 'ssh_key'])
        ssh_command = ['ssh', '-i', 'ssh_key'] + ssh_client.SHARED_SSH_OPTS
        scp_command = ['scp', '-i', 'ssh_key'] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description['private_agents']:
            target = '{}@{}'.format(ssh.user, private_agent['private_ip'])
            t.command(scp_command +
                      ['volume_script.sh', target + ':~/volume_script.sh'])
            t.command(ssh_command + [target, 'bash', 'volume_script.sh'])
        # nasty hack until we add a better post-flight
        time.sleep(60)
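
Every example on this page passes data through helpers.session_tempfile, which writes the given content to a temporary file that survives for the rest of the test session and returns its path. Below is a minimal sketch of such a helper using only the standard library; it illustrates the contract implied by the examples and is not the dcos-test-utils implementation itself.

import atexit
import os
import tempfile


def session_tempfile(data):
    """ Write `data` to a temp file, return its path, delete it at interpreter exit.

    Illustrative sketch only; the real helpers.session_tempfile may differ.
    """
    if isinstance(data, str):
        data = data.encode()
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as f:
        f.write(data)
    # Best-effort cleanup when the test session (interpreter) exits.
    atexit.register(lambda: os.path.exists(path) and os.remove(path))
    return path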
Example #3
def tunnel_args(sshd_manager, tmpdir):
    with sshd_manager.run(1) as sshd_ports:
        yield {
            'user': getpass.getuser(),
            'control_path':
            str(tmpdir.join('x')),  # use as short a name as possible
            'key_path': helpers.session_tempfile(sshd_manager.key),
            'host': '127.0.0.1',
            'port': sshd_ports[0]
        }
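
The dict yielded by this fixture maps directly onto OpenSSH options. A hypothetical test consuming it might assemble an ssh invocation from those keys; only the key names come from the fixture above, the test itself is illustrative.

def test_tunnel_args_shape(tunnel_args):
    # Build an ssh command line from the fixture's keys; 'true' is just a
    # no-op remote command for the illustration.
    cmd = [
        'ssh',
        '-i', tunnel_args['key_path'],
        '-o', 'ControlPath={}'.format(tunnel_args['control_path']),
        '-p', str(tunnel_args['port']),
        '{}@{}'.format(tunnel_args['user'], tunnel_args['host']),
        'true',
    ]
    assert '-i' in cmd and '-p' in cmd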
Example #5
def upgraded_dcos(dcos_api_session, launcher, setup_workload, onprem_cluster,
                  is_enterprise):
    """ This test is intended to test upgrades between versions so use
    the same config as the original launch
    """
    # Check for previous installation artifacts first
    bootstrap_host = onprem_cluster.bootstrap_host.public_ip
    upgrade.reset_bootstrap_host(onprem_cluster.ssh_client, bootstrap_host)

    upgrade_config_overrides = dict()
    if 'TEST_UPGRADE_CONFIG_PATH' in os.environ:
        with open(os.environ['TEST_UPGRADE_CONFIG_PATH'], 'r') as f:
            upgrade_config_overrides = yaml.load(f.read())

    upgrade_config = copy.copy(launcher.config['dcos_config'])

    upgrade_config.update({
        'cluster_name': 'My Upgraded DC/OS',
        'ssh_user': onprem_cluster.ssh_client.user,  # can probably drop this field
        'bootstrap_url': 'http://' + onprem_cluster.bootstrap_host.private_ip,
        'master_list': [h.private_ip for h in onprem_cluster.masters],
        'agent_list': [h.private_ip for h in onprem_cluster.private_agents],
        'public_agent_list': [h.private_ip for h in onprem_cluster.public_agents]
    })
    upgrade_config.update(upgrade_config_overrides)
    # if it was a ZK-backed install, make sure ZK is still running
    if upgrade_config.get('exhibitor_storage_backend') == 'zookeeper':
        upgrade_config['exhibitor_zk_hosts'] = onprem_cluster.start_bootstrap_zk()
    # if ip_detect_public_contents was not provided, go ahead and inject it
    if 'ip_detect_public_contents' not in upgrade_config:
        upgrade_config['ip_detect_public_contents'] = yaml.dump(
            pkg_resources.resource_string('dcos_test_utils',
                                          'ip-detect/aws_public.sh').decode())

    bootstrap_home = onprem_cluster.ssh_client.get_home_dir(bootstrap_host)
    genconf_dir = os.path.join(bootstrap_home, 'genconf')
    with onprem_cluster.ssh_client.tunnel(bootstrap_host) as tunnel:
        log.info('Setting up upgrade config on bootstrap host')
        tunnel.command(['mkdir', genconf_dir])
        # transfer the config file
        tunnel.copy_file(
            helpers.session_tempfile(yaml.dump(upgrade_config).encode()),
            os.path.join(bootstrap_home, 'genconf/config.yaml'))
        # FIXME: we don't need the SSH key when the upgrade isn't being orchestrated
        tunnel.copy_file(
            helpers.session_tempfile(onprem_cluster.ssh_client.key.encode()),
            os.path.join(bootstrap_home, 'genconf/ssh_key'))
        tunnel.command(
            ['chmod', '600',
             os.path.join(bootstrap_home, 'genconf/ssh_key')])
        # Move the ip-detect script to the expected default path
        # FIXME: can we just send the contents in the config and skip this?
        tunnel.copy_file(
            pkg_resources.resource_filename('dcos_test_utils',
                                            'ip-detect/aws.sh'),
            os.path.join(bootstrap_home, 'genconf/ip-detect'))

    # API object may need to be updated
    upgrade_session = make_dcos_api_session(
        onprem_cluster, launcher, is_enterprise,
        upgrade_config_overrides.get('security'))

    # use the Auth session from the previous API session
    upgrade_session.session.auth = dcos_api_session.session.auth

    # do the actual upgrade
    upgrade.upgrade_dcos(upgrade_session, onprem_cluster,
                         dcos_api_session.get_version(),
                         os.environ['TEST_UPGRADE_INSTALLER_URL'],
                         os.environ['TEST_UPGRADE_USE_CHECKS'] == 'true')

    # this can be set after the fact because the upgrade metrics snapshot
    # endpoint is polled with verify=False
    if upgrade_session.default_url.scheme == 'https':
        upgrade_session.set_ca_cert()

    # Now Re-auth with the new session
    upgrade_session.wait_for_dcos()
    return upgrade_session
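
This fixture is driven by environment variables referenced in the body above. The snippet below is a hedged illustration of the environment a harness might set before using it; only the variable names come from the code, the values are placeholders.

import os

os.environ.setdefault('TEST_UPGRADE_INSTALLER_URL',
                      'https://example.com/dcos_generate_config.sh')  # placeholder URL
os.environ.setdefault('TEST_UPGRADE_USE_CHECKS', 'true')
# Optional: path to a YAML file of config overrides, e.g. {'security': 'strict'}
os.environ.setdefault('TEST_UPGRADE_CONFIG_PATH', 'upgrade_overrides.yaml')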
Example #6
def temp_ssh_key(key: str) -> str:
    """ Dumps an SSH key string to a temp file that will be deleted at session close and returns the path
    """
    key_path = helpers.session_tempfile(key)
    os.chmod(str(key_path), stat.S_IREAD | stat.S_IWRITE)
    return key_path
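
A hypothetical caller of the helper above, showing why the chmod to owner read/write matters: ssh refuses private key files that are readable by other users. The key material and target host are placeholders.

import subprocess

pem = '-----BEGIN RSA PRIVATE KEY-----\n...\n-----END RSA PRIVATE KEY-----\n'  # placeholder
key_path = temp_ssh_key(pem)
# ssh would reject the key file if it were group- or world-readable.
subprocess.check_call(['ssh', '-i', key_path, 'centos@10.0.0.1', 'uptime'])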
Example #7
def test_installer_cli(onprem_cluster, onprem_launcher):
    """ This test will step through the CLI install proceder for on-prem DC/OS

    This test has an environment variable switch: TEST_INSTALL_PREREQS
    If set to 'true', the --install-prereqs option on the installer will
    be run, and if it raises an error code, the test will fail.
    """
    host = onprem_cluster.bootstrap_host.public_ip
    ssh = onprem_launcher.get_ssh_client()

    log.info('Verifying SSH-connectivity to cluster')
    for h in onprem_cluster.hosts:
        ssh.wait_for_ssh_connection(h.public_ip)

    log.info('Setting up installer host')
    home_dir = ssh.get_home_dir(host)
    ssh.add_ssh_user_to_docker_users(host)

    genconf_dir = os.path.join(home_dir, 'genconf')
    ssh.command(host, ['mkdir', '-p', genconf_dir])

    installer_path = os.path.join(home_dir, 'dcos_generate_config.sh')
    onprem.download_dcos_installer(
        ssh,
        host,
        installer_path,
        onprem_launcher.config['installer_url'])
    cli_installer = DcosCliInstaller(host, installer_path, ssh)
    log.info('Installer is ready for use!')

    # Start with minimal, default config, and then inject user settings
    test_config = {
        'cluster_name': 'SSH Installed DC/OS',
        'bootstrap_url': 'file:///opt/dcos_install_tmp',
        'master_discovery': 'static',
        'master_list': [m.private_ip for m in onprem_cluster.masters],
        'ssh_user': onprem_launcher.config['ssh_user'],
        'agent_list': [a.private_ip for a in onprem_cluster.private_agents],
        'platform': 'aws',
        'rexray_config_preset': 'aws',
        'public_agent_list': [a.private_ip for a in onprem_cluster.public_agents],
        'exhibitor_storage_backend': 'static'}
    test_config.update(onprem_launcher.config['dcos_config'])

    # explicitly transfer the files to be in the designated paths on the host
    log.info('Transferring config.yaml')
    cli_installer.copy_to_host(
        helpers.session_tempfile(
            yaml.dump(test_config).encode()), os.path.join(genconf_dir, 'config.yaml'))

    log.info('Transferring ip-detect script')
    ip_detect_script = pkg_resources.resource_string('dcos_test_utils', 'ip-detect/aws.sh')
    cli_installer.copy_to_host(
        helpers.session_tempfile(ip_detect_script), os.path.join(genconf_dir, 'ip-detect'))

    log.info('Transferring deployment SSH key')
    cli_installer.copy_to_host(
        helpers.session_tempfile(
            onprem_launcher.config['ssh_private_key'].encode()), os.path.join(genconf_dir, 'ssh_key'))
    cli_installer.ssh_command(['chmod', '600', os.path.join(genconf_dir, 'ssh_key')])

    log.info('Running installation procedure')
    cli_installer.genconf()
    if os.environ['TEST_INSTALL_PREREQS'] == 'true':
        cli_installer.install_prereqs()
    cli_installer.preflight()
    cli_installer.deploy()
    cli_installer.postflight()
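
The config assembly above relies on dict.update: whatever the launcher provides in dcos_config overrides the minimal defaults built by the test. A tiny self-contained illustration of that precedence (the values here are made up):

defaults = {'cluster_name': 'SSH Installed DC/OS', 'rexray_config_preset': 'aws'}
user_dcos_config = {'cluster_name': 'my-cluster'}  # hypothetical user setting
defaults.update(user_dcos_config)
assert defaults['cluster_name'] == 'my-cluster'
assert defaults['rexray_config_preset'] == 'aws'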
Example #8
 def set_ca_cert(self):
     log.info('Attempt to get CA bundle via Admin Router')
     r = self.get('ca/dcos-ca.crt', verify=False)
     r.raise_for_status()
     self.session.verify = helpers.session_tempfile(r.content)
def mount_volumes():
    """ Will create 200MB partions on clusters launched by dcos-launch
    """
    volume_script = """#!/bin/bash
set -e

if [ {dcos_mounts} ]; then
    echo 'Volumes already exist, exiting early'
    exit 0
fi

echo 'Stopping agent and clearing state...'

systemctl stop dcos-mesos-slave.service

cat /var/lib/dcos/mesos-resources || echo 'No resources file found'
ls -l /var/lib/mesos/slave/meta/slaves/latest || echo 'No latest agent symlink found'
rm -f /var/lib/dcos/mesos-resources
rm -f /var/lib/mesos/slave/meta/slaves/latest

losetup -a
""".format(dcos_mounts=" -a ".join(
        ["-e /dcos/volume{}".format(i) for i in range(MOUNT_VOLUME_COUNT)]))

    for i in range(MOUNT_VOLUME_COUNT):
        volume_script += """
if [ ! -e {loop_file} ]; then
    echo 'Creating loopback device {loop_dev}...'

    dd if=/dev/zero of={loop_file} bs=1M count={size_mb}
    losetup {loop_dev} {loop_file}
    mkfs -t ext4 {loop_dev}
    losetup -d {loop_dev}
fi

if [ ! -e {dcos_mount} ]; then
    echo 'Creating loopback volume {dcos_mount}...'

    mkdir -p {dcos_mount}
    echo \"{loop_file} {dcos_mount} auto loop 0 2\" | tee -a /etc/fstab
    mount {dcos_mount}
fi
""".format(
            size_mb=MOUNT_VOLUME_SIZE_MB,
            dcos_mount="/dcos/volume{}".format(i),
            loop_dev="/dev/loop{}".format(i),
            loop_file="/root/volume{}.img".format(i),
        )

    volume_script += """
echo 'Restarting agent...'
systemctl restart dcos-mesos-slave.service"""

    cluster_info_path = os.getenv("CLUSTER_INFO_PATH", "cluster_info.json")
    if not os.path.exists(cluster_info_path):
        raise Exception("No cluster info to work with!")
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description["masters"][0]["public_ip"]) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), "ssh_key")
        t.copy_file(helpers.session_tempfile(volume_script),
                    "volume_script.sh")
        t.command(["chmod", "600", "ssh_key"])
        ssh_command = ["ssh", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        scp_command = ["scp", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description["private_agents"]:
            target = "{}@{}".format(ssh.user, private_agent["private_ip"])
            t.command(scp_command +
                      ["volume_script.sh", target + ":~/volume_script.sh"])
            t.command(ssh_command +
                      [target, "sudo", "bash", "volume_script.sh"])
        # nasty hack until we add a better post-flight
        time.sleep(60)
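
This variant references module-level constants that are not part of the snippet. Plausible values, inferred from the 200MB docstring and the two-volume loop in the earlier examples, are shown below; these are assumptions, not the actual module source.

# Assumed constants; the real module may define them differently.
MOUNT_VOLUME_COUNT = 2
MOUNT_VOLUME_SIZE_MB = 200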
Example #10
def mount_volumes():
    """ Will create 200MB partions on clusters launched by dcos-launch
    """
    volume_script = """#!/bin/bash
set -e

if [ {dcos_mounts} ]; then
    echo 'Volumes already exist, exiting early'
    exit 0
fi

echo 'Stopping agent and clearing state...'

systemctl stop dcos-mesos-slave.service

cat /var/lib/dcos/mesos-resources || echo 'No resources file found'
ls -l /var/lib/mesos/slave/meta/slaves/latest || echo 'No latest agent symlink found'
rm -f /var/lib/dcos/mesos-resources
rm -f /var/lib/mesos/slave/meta/slaves/latest

losetup -a
""".format(dcos_mounts=" -a ".join([
        "-e /dcos/volume{}".format(i)
        for i, _ in enumerate(MOUNT_VOLUME_PROFILES)
    ]))

    for i, p in enumerate(MOUNT_VOLUME_PROFILES):
        volume_script += """
if [ ! -e {loop_file} ]; then
    echo 'Creating loopback device {loop_dev}...'

    dd if=/dev/zero of={loop_file} bs=1M count={size_mb}
    losetup {loop_dev} {loop_file}
    mkfs -t {fs_type} {loop_dev}
    losetup -d {loop_dev}
fi

if [ ! -e {dcos_mount} ]; then
    echo 'Creating loopback volume {dcos_mount}...'

    mkdir -p {dcos_mount}
    echo \"{loop_file} {dcos_mount} auto loop 0 2\" | tee -a /etc/fstab
    mount {dcos_mount}
fi
""".format(size_mb=MOUNT_VOLUME_SIZE_MB,
           dcos_mount="/dcos/volume{}".format(i),
           loop_dev="/dev/loop{}".format(i),
           loop_file="/root/volume{}.img".format(i),
           fs_type=p or "ext4")

    # To create profile mount volumes, we manually run `make_disk_resources.py`
    # to generate disk resources, then parse the result and set the
    # `disk.source.profile` field for each profile mount volume.
    volume_script += """
echo 'Updating disk resources...'

export MESOS_WORK_DIR MESOS_RESOURCES
eval $(sed -E "s/^([A-Z_]+)=(.*)$/\\1='\\2'/" /opt/mesosphere/etc/mesos-slave-common)  # Set up `MESOS_WORK_DIR`.
eval $(sed -E "s/^([A-Z_]+)=(.*)$/\\1='\\2'/" /opt/mesosphere/etc/mesos-slave)         # Set up `MESOS_RESOURCES`.
source /opt/mesosphere/etc/mesos-slave-common
/opt/mesosphere/bin/make_disk_resources.py /var/lib/dcos/mesos-resources
source /var/lib/dcos/mesos-resources
/opt/mesosphere/bin/python -c "
import json;
import os;

profiles = {profiles}
resources = json.loads(os.environ['MESOS_RESOURCES'])

for r in resources:
    try:
        disk_source = r['disk']['source']
        disk_source['profile'] = profiles[disk_source['mount']['root']]
    except KeyError:
        pass

print('MESOS_RESOURCES=\\'' + json.dumps(resources) + '\\'')
" > /var/lib/dcos/mesos-resources

echo 'Restarting agent...'

systemctl restart dcos-mesos-slave.service
""".format(
        profiles={
            "/dcos/volume{}".format(i): p
            for i, p in enumerate(MOUNT_VOLUME_PROFILES) if p
        })

    cluster_info_path = os.getenv("CLUSTER_INFO_PATH", "cluster_info.json")
    if not os.path.exists(cluster_info_path):
        raise Exception("No cluster info to work with!")
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description["masters"][0]["public_ip"]) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), "ssh_key")
        t.copy_file(helpers.session_tempfile(volume_script),
                    "volume_script.sh")
        t.command(["chmod", "600", "ssh_key"])
        ssh_command = ["ssh", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        scp_command = ["scp", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description["private_agents"]:
            target = "{}@{}".format(ssh.user, private_agent["private_ip"])
            t.command(scp_command +
                      ["volume_script.sh", target + ":~/volume_script.sh"])
            t.command(ssh_command +
                      [target, "sudo", "bash", "volume_script.sh"])
        # nasty hack until we add a better post-flight
        time.sleep(60)
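
Here the constant is a list of filesystem profiles, one entry per volume: a falsy entry falls back to ext4 and gets no disk.source.profile, while a truthy entry selects both the filesystem type and the profile name injected by the embedded Python above. An assumed definition consistent with that usage:

# Assumed constants; volume0 would be a plain ext4 mount volume, volume1 an
# xfs volume carrying the 'xfs' profile.
MOUNT_VOLUME_PROFILES = [None, 'xfs']
MOUNT_VOLUME_SIZE_MB = 200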
Example #11
def upgrade_dcos(
        dcos_api_session: dcos_test_utils.dcos_api_session.DcosApiSession,
        onprem_cluster: dcos_test_utils.onprem.OnpremCluster,
        starting_version: str, installer_url: str, user_config: dict,
        platform: str) -> None:
    """ Performs the documented upgrade process on a cluster

    Note: This is intended for testing purposes only and is an irreversible process

    Args:
        dcos_api_session: API session object capable of authenticating with the
            upgraded DC/OS cluster
        onprem_cluster: SSH-backed onprem abstraction for the cluster to be upgraded
        starting_version: DC/OS version currently installed on the cluster; passed to
            the installer's --generate-node-upgrade-script option
        installer_url: URL for the installer to drive the upgrade
        user_config: this function already creates a viable upgrade config based on
            the onprem_cluster, but overrides can be provided via this dict
        platform: this must be `aws` as no other platform is currently supported
    """
    assert platform == 'aws', 'AWS is the only supported platform backend currently'

    ssh_client = onprem_cluster.ssh_client

    # kill previous genconf on bootstrap host if it is still running
    bootstrap_host = onprem_cluster.bootstrap_host.public_ip
    log.info('Killing any previous installer before starting upgrade')
    previous_installer = ssh_client.command(bootstrap_host, [
        'docker', 'ps', '--quiet', '--filter', 'name=dcos-genconf', '--filter',
        'status=running'
    ]).decode().strip()
    if previous_installer:
        ssh_client.command(bootstrap_host,
                           ['docker', 'kill', previous_installer])

    bootstrap_home = ssh_client.get_home_dir(bootstrap_host)

    log.info('Clearing out old installation files')
    genconf_dir = os.path.join(bootstrap_home, 'genconf')
    ssh_client.command(bootstrap_host, ['sudo', 'rm', '-rf', genconf_dir])
    ssh_client.command(bootstrap_host, ['mkdir', genconf_dir])
    installer_path = os.path.join(bootstrap_home, 'dcos_generate_config.sh')
    dcos_test_utils.onprem.download_dcos_installer(ssh_client, bootstrap_host,
                                                   installer_path,
                                                   installer_url)

    log.info('Starting ZooKeeper on the bootstrap node')
    zk_host = onprem_cluster.start_bootstrap_zk()
    # start the nginx that will host the bootstrap files
    bootstrap_url = 'http://' + onprem_cluster.start_bootstrap_nginx()

    with ssh_client.tunnel(bootstrap_host) as tunnel:
        log.info('Setting up upgrade config on bootstrap host')
        upgrade_config = {
            'cluster_name': 'My Upgraded DC/OS',
            'ssh_user': ssh_client.user,
            'master_discovery': 'static',
            'exhibitor_storage_backend': 'zookeeper',
            'exhibitor_zk_hosts': zk_host,
            'exhibitor_zk_path': '/exhibitor',
            'bootstrap_url': bootstrap_url,
            'rexray_config_preset': platform,
            'platform': platform,
            'master_list': [h.private_ip for h in onprem_cluster.masters],
            'agent_list': [h.private_ip for h in onprem_cluster.private_agents],
            'public_agent_list': [h.private_ip for h in onprem_cluster.public_agents]
        }
        upgrade_config.update(user_config)

        # transfer ip-detect and ssh key
        tunnel.copy_file(session_tempfile(yaml.dump(upgrade_config).encode()),
                         os.path.join(bootstrap_home, 'genconf/config.yaml'))
        tunnel.copy_file(session_tempfile(ssh_client.key.encode()),
                         os.path.join(bootstrap_home, 'genconf/ssh_key'))
        tunnel.command(
            ['chmod', '600',
             os.path.join(bootstrap_home, 'genconf/ssh_key')])
        ip_detect_script = pkg_resources.resource_string(
            'dcos_test_utils',
            'ip-detect/{}.sh'.format(platform)).decode('utf-8')
        tunnel.copy_file(session_tempfile(ip_detect_script.encode()),
                         os.path.join(bootstrap_home, 'genconf/ip-detect'))

        log.info('Generating node upgrade script')
        installer_output = tunnel.command([
            'bash', installer_path,
            '--generate-node-upgrade-script ' + starting_version
        ]).decode('utf-8')
        upgrade_script_path = installer_output.splitlines()[-1].split(
            "Node upgrade script URL: ", 1)[1]

        log.info('Editing node upgrade script...')
        # Remove docker (and associated journald) restart from the install
        # script. This prevents Docker-containerized tasks from being killed
        # during agent upgrades.
        tunnel.command([
            'sudo', 'sed', '-i', '-e',
            '"s/systemctl restart systemd-journald//g"', '-e',
            '"s/systemctl restart docker//g"',
            bootstrap_home + '/genconf/serve/dcos_install.sh'
        ])
        tunnel.command(['docker', 'restart', 'dcos-bootstrap-nginx'])
    # upgrading can finally start
    master_list = [host.public_ip for host in onprem_cluster.masters]
    private_agent_list = [
        host.public_ip for host in onprem_cluster.private_agents
    ]
    public_agent_list = [
        host.public_ip for host in onprem_cluster.public_agents
    ]
    upgrade_ordering = [
        # Upgrade masters in a random order.
        ('master', 'master', random.sample(master_list, len(master_list))),
        ('slave', 'agent', private_agent_list),
        ('slave_public', 'public agent', public_agent_list)
    ]
    logging.info('\n'.join(['Upgrade plan:'] + [
        '{} ({})'.format(host, role_name)
        for _, role_name, hosts in upgrade_ordering for host in hosts
    ]))
    for role, role_name, hosts in upgrade_ordering:
        log.info('Upgrading {} nodes: {}'.format(role_name, repr(hosts)))
        for host in hosts:
            log.info('Upgrading {}: {}'.format(role_name, repr(host)))
            ssh_client.command(host, [
                'curl', '--silent', '--verbose', '--show-error', '--fail',
                '--location', '--keepalive-time', '2', '--retry', '20',
                '--speed-limit', '100000', '--speed-time', '60',
                '--remote-name', upgrade_script_path
            ])
            ssh_client.command(host, ['sudo', 'bash', 'dcos_node_upgrade.sh'])
            wait_metric = {
                'master': 'registrar/log/recovered',
                'slave': 'slave/registered',
                'slave_public': 'slave/registered',
            }[role]
            log.info(
                'Waiting for {} to rejoin the cluster...'.format(role_name))
            try:
                wait_for_mesos_metric(dcos_api_session, host, wait_metric, 1)
            except retrying.RetryError as exc:
                raise Exception(
                    'Timed out waiting for {} to rejoin the cluster after upgrade: {}'
                    .format(role_name, repr(host))) from exc
    dcos_api_session.wait_for_dcos()
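
A hedged sketch of how a harness might invoke this function, reusing the signature above and the installer URL environment variable seen in the earlier upgrade example; everything except the parameter names and get_version() is illustrative.

# dcos_api_session and onprem_cluster come from the surrounding test harness.
upgrade_dcos(
    dcos_api_session,
    onprem_cluster,
    starting_version=dcos_api_session.get_version(),
    installer_url=os.environ['TEST_UPGRADE_INSTALLER_URL'],
    user_config={'security': 'permissive'},  # illustrative override
    platform='aws',
)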