def wait_for_dcos_oss(
    cluster: Cluster,
    request: SubRequest,
    log_dir: Path,
) -> None:
    """
    Wait for DC/OS OSS on ``cluster`` and, if a ``DCOSTimeoutError`` is hit,
    dump the cluster journals before re-raising the error.

    The journals are written to a directory below ``log_dir`` whose name is
    derived from the name of the requesting test node.
    """
    try:
        cluster.wait_for_dcos_oss()
    except DCOSTimeoutError:
        # The dump only works if DC/OS has already started the systemd units
        # that the logs are retrieved from. Per the original note this is
        # not currently a problem: the wait timeout is set to one hour and
        # the units are expected to have started by then.
        journal_dir = log_dir / artifact_dir_format(request.node.name)
        dump_cluster_journals(cluster=cluster, target_dir=journal_dir)
        raise
def test_no_live_logging(
    self,
    caplog: LogCaptureFixture,
    cluster_backend: ClusterBackend,
    oss_installer: Path,
) -> None:
    """
    Without an explicit request for it, the error output of a failing DC/OS
    installation is not found in the captured log records.
    """
    # DC/OS cannot be installed with two master nodes, so the installation
    # below is expected to fail with a ``CalledProcessError``.
    with pytest.raises(CalledProcessError), Cluster(
        masters=2,
        cluster_backend=cluster_backend,
    ) as cluster:
        base_config = cluster.base_config
        ip_detect_path = cluster_backend.ip_detect_path
        cluster.install_dcos_from_path(
            dcos_installer=oss_installer,
            dcos_config=base_config,
            ip_detect_path=ip_detect_path,
        )

    error_logged = self._two_masters_error_logged(log_records=caplog.records)
    assert not error_logged
def test_install_cluster_from_url(
    self,
    cluster_backend: ClusterBackend,
    oss_installer_url: str,
    tmpdir: local,
) -> None:
    """
    A DC/OS cluster can be installed from a URL with a custom ``ip-detect``
    script that is copied into the installer's ``genconf`` directory.
    """
    script_template = dedent(
        """\
        #!/bin/bash
        echo {ip_address}
        """,
    )
    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        (master, ) = cluster.masters

        # The custom script simply echoes the master's private IP address.
        ip_detect_file = tmpdir.join('ip-detect')
        ip_detect_contents = script_template.format(
            ip_address=master.private_ip_address,
        )
        ip_detect_file.write(ip_detect_contents)

        base_config = cluster.base_config
        backend_ip_detect_path = cluster_backend.ip_detect_path
        genconf_files = [
            (Path(str(ip_detect_file)), Path('/genconf/ip-detect')),
        ]
        cluster.install_dcos_from_url(
            dcos_installer=oss_installer_url,
            dcos_config=base_config,
            ip_detect_path=backend_ip_detect_path,
            files_to_copy_to_genconf_dir=genconf_files,
        )
        cluster.wait_for_dcos_oss()

        # The script installed on the node must be the custom one.
        cat_args = ['cat', '/opt/mesosphere/bin/detect_ip']
        cat_result = master.run(args=cat_args)
        installed_script = cat_result.stdout.decode()
        assert installed_script == ip_detect_contents
def test_install_dcos_from_node(self, oss_installer_url: str) -> None:
    """
    DC/OS can be installed on an AWS cluster one node at a time.
    """
    aws_backend = AWS()
    with Cluster(
        cluster_backend=aws_backend,
        agents=0,
        public_agents=0,
    ) as cluster:
        # The cluster has exactly one master, which is installed directly
        # through the node API rather than the cluster API.
        (only_master, ) = cluster.masters
        base_config = cluster.base_config
        ip_detect_path = aws_backend.ip_detect_path
        only_master.install_dcos_from_url(
            dcos_installer=oss_installer_url,
            dcos_config=base_config,
            role=Role.MASTER,
            output=Output.LOG_AND_CAPTURE,
            ip_detect_path=ip_detect_path,
        )
        cluster.wait_for_dcos_oss()
def test_install_dcos_from_node(self, oss_artifact_url: str) -> None:
    """
    DC/OS can be installed on an AWS cluster one node at a time.
    """
    aws_backend = AWS()
    with Cluster(
        cluster_backend=aws_backend,
        agents=0,
        public_agents=0,
    ) as cluster:
        # The cluster has exactly one master, which is installed directly
        # through the node API rather than the cluster API.
        (only_master, ) = cluster.masters
        base_config = cluster.base_config
        ip_detect_path = aws_backend.ip_detect_path
        only_master.install_dcos_from_url(
            build_artifact=oss_artifact_url,
            dcos_config=base_config,
            role=Role.MASTER,
            log_output_live=True,
            ip_detect_path=ip_detect_path,
        )
        cluster.wait_for_dcos_oss()
def test_run_pytest(
    self,
    cluster_backend: ClusterBackend,
    enterprise_artifact: Path,
    license_key_contents: str,
) -> None:
    """
    Integration tests can be run on a cluster with `pytest`.

    Only the successful path is exercised here: a passing `pytest` command
    must not raise an error.
    """
    username = str(uuid.uuid4())
    password = str(uuid.uuid4())
    config = {
        'superuser_username': username,
        'superuser_password_hash': sha512_crypt.hash(password),
        'fault_domain_enabled': False,
        'license_key_contents': license_key_contents,
    }
    # The integration tests log in with the same superuser credentials.
    test_env = {
        'DCOS_LOGIN_UNAME': username,
        'DCOS_LOGIN_PW': password,
    }
    pytest_command = ['pytest', '-vvv', '-s', '-x', 'test_tls.py']

    with Cluster(cluster_backend=cluster_backend) as cluster:
        cluster.install_dcos_from_path(
            build_artifact=enterprise_artifact,
            extra_config=config,
            log_output_live=True,
        )
        cluster.wait_for_dcos_ee(
            superuser_username=username,
            superuser_password=password,
        )
        # No error is raised with a successful command.
        cluster.run_integration_tests(
            pytest_command=pytest_command,
            env=test_env,
            log_output_live=True,
        )
def test_enterprise(
    self,
    cluster_backend: ClusterBackend,
    enterprise_1_9_installer: Path,
) -> None:
    """
    A DC/OS Enterprise 1.9 cluster can be started, and every node reports
    a 1.9 Enterprise build.
    """
    username = str(uuid.uuid4())
    password = str(uuid.uuid4())
    credentials_config = {
        'superuser_username': username,
        'superuser_password_hash': sha512_crypt.hash(password),
    }

    with Cluster(cluster_backend=cluster_backend) as cluster:
        full_config = {**cluster.base_config, **credentials_config}
        ip_detect_path = cluster_backend.ip_detect_path
        cluster.install_dcos_from_path(
            dcos_installer=enterprise_1_9_installer,
            dcos_config=full_config,
            output=Output.LOG_AND_CAPTURE,
            ip_detect_path=ip_detect_path,
        )
        cluster.wait_for_dcos_ee(
            superuser_username=username,
            superuser_password=password,
        )

        # Check the build information reported by every node of every role.
        all_nodes = set().union(
            cluster.masters,
            cluster.agents,
            cluster.public_agents,
        )
        for node in all_nodes:
            build = node.dcos_build_info()
            assert build.version.startswith('1.9')
            assert build.commit
            assert build.variant == DCOSVariant.ENTERPRISE
def test_install_dcos_with_custom_ip_detect(
    self,
    oss_installer_url: str,
    tmp_path: Path,
) -> None:
    """
    DC/OS can be installed on AWS with a custom IP detect script, and that
    script (not the backend's default one) ends up on the node.
    """
    script_template = dedent(
        """\
        #!/bin/bash
        echo {ip_address}
        """,
    )
    aws_backend = AWS()
    with Cluster(
        cluster_backend=aws_backend,
        agents=0,
        public_agents=0,
    ) as cluster:
        (master, ) = cluster.masters

        # The custom script simply echoes the master's private IP address.
        ip_detect_file = tmp_path / 'ip-detect'
        ip_detect_contents = script_template.format(
            ip_address=master.private_ip_address,
        )
        ip_detect_file.write_text(ip_detect_contents)

        base_config = cluster.base_config
        cluster.install_dcos_from_url(
            dcos_installer=oss_installer_url,
            dcos_config=base_config,
            output=Output.LOG_AND_CAPTURE,
            ip_detect_path=ip_detect_file,
        )
        cluster.wait_for_dcos_oss()

        cat_args = ['cat', '/opt/mesosphere/bin/detect_ip']
        node_script_contents = master.run(args=cat_args).stdout.decode()
        assert node_script_contents == ip_detect_contents

        # The installed script must differ from the backend's default one,
        # otherwise the check above would prove nothing.
        backend_script_contents = aws_backend.ip_detect_path.read_text()
        assert node_script_contents != backend_script_contents
def test_install_dcos_from_path(self) -> None:
    """
    Installing DC/OS from a build artifact path is not implemented for the
    AWS backend, which requires a build artifact URL instead.
    """
    # NOTE(review): there is no space between "efficient" and
    # "installation". This presumably mirrors the message raised by the
    # backend - confirm before changing either side.
    expected_error = (
        'The AWS backend does not support the installation of build '
        'artifacts passed via path. This is because a more efficient'
        'installation method exists in ``install_dcos_from_url``.'
    )
    aws_backend = AWS()
    with Cluster(
        cluster_backend=aws_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        with pytest.raises(NotImplementedError) as excinfo:
            cluster.install_dcos_from_path(
                build_artifact=Path('/foo'),
                dcos_config=cluster.base_config,
            )

    actual_error = str(excinfo.value)
    assert actual_error == expected_error
def cluster(
    self,
    oss_artifact: Path,
    cluster_backend: ClusterBackend,
) -> Iterator[Cluster]:
    """
    Yield a `Cluster` with DC/OS installed and running.

    This is class scoped as we do not intend to modify the cluster in ways
    that make tests interfere with one another.
    """
    with Cluster(cluster_backend=cluster_backend) as running_cluster:
        base_config = running_cluster.base_config
        running_cluster.install_dcos_from_path(
            dcos_config=base_config,
            build_artifact=oss_artifact,
            log_output_live=True,
        )
        # The first wait exercises the "http_checks=False" code path without
        # testing its functionality. It is a temporary measure while we wait
        # for more thorough dcos-checks. The second wait is the full one.
        for wait_options in ({'http_checks': False}, {}):
            running_cluster.wait_for_dcos_oss(**wait_options)
        yield running_cluster
def test_live_logging(
    self,
    caplog: LogCaptureFixture,
    cluster_backend: ClusterBackend,
    oss_artifact: Path,
) -> None:
    """
    With `log_output_live=True`, the error output of a failing DC/OS
    installation is found in the captured log records.
    """
    # DC/OS cannot be installed with two master nodes, so the installation
    # below is expected to fail with a ``CalledProcessError``.
    with pytest.raises(CalledProcessError), Cluster(
        masters=2,
        cluster_backend=cluster_backend,
    ) as cluster:
        base_config = cluster.base_config
        cluster.install_dcos_from_path(
            build_artifact=oss_artifact,
            dcos_config=base_config,
            log_output_live=True,
        )

    error_logged = self._two_masters_error_logged(log_records=caplog.records)
    assert error_logged
def test_host_driver_not_supported(self) -> None:
    """
    `aufs` is used when the host reports a storage driver that is not
    supported.
    """
    docker_client = docker.from_env(version='auto')
    # Take the real Docker info and override only the reported driver.
    fake_info = {**docker_client.info(), 'Driver': 'not_supported'}

    with Mocker(real_http=True) as mock:
        mock.get(url=self._docker_info_endpoint, json=fake_info)
        backend = Docker()

    expected_driver = DockerStorageDriver.AUFS
    assert backend.docker_storage_driver == expected_driver

    with Cluster(
        cluster_backend=backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        (master, ) = cluster.masters
        node_driver = self._get_storage_driver(node=master)

    assert node_driver == expected_driver
def test_install_dcos_from_url(
    self,
    oss_installer_url: str,
    cluster_backend: ClusterBackend,
) -> None:
    """
    DC/OS can be installed from a URL on each node individually, using the
    role that matches the node's place in the cluster.
    """
    with Cluster(cluster_backend=cluster_backend) as cluster:
        role_assignments = [
            (Role.MASTER, cluster.masters),
            (Role.AGENT, cluster.agents),
            (Role.PUBLIC_AGENT, cluster.public_agents),
        ]
        for role, nodes in role_assignments:
            for node in nodes:
                base_config = cluster.base_config
                ip_detect_path = cluster_backend.ip_detect_path
                node.install_dcos_from_url(
                    dcos_installer=oss_installer_url,
                    dcos_config=base_config,
                    ip_detect_path=ip_detect_path,
                    role=role,
                    output=Output.LOG_AND_CAPTURE,
                )
        cluster.wait_for_dcos_oss()
def test_set_false_exception_raised(
    self,
    cluster_backend: ClusterBackend,
    oss_artifact: Path,
) -> None:
    """
    A cluster created with `destroy_on_error=False` is not destroyed when
    an exception is raised inside its context.
    """
    cluster_options = {
        'generate_config_path': oss_artifact,
        'agents': 0,
        'public_agents': 0,
        'destroy_on_error': False,
        'cluster_backend': cluster_backend,
    }
    with pytest.raises(Exception):
        with Cluster(**cluster_options) as cluster:
            (master, ) = cluster.masters
            cluster.wait_for_dcos()
            raise Exception()

    # No exception is raised here, so the node still exists after the
    # context was left with an error.
    master.run_as_root(args=['echo', 'hello'], log_output_live=True)
    # Nothing destroyed the cluster automatically, so clean it up by hand.
    cluster.destroy()
def test_extend_config(
    self,
    path: str,
    cluster_backend: ClusterBackend,
    oss_artifact: Path,
) -> None:
    """
    A cluster can be created with an extended configuration file.

    The file at ``path`` is expected to exist on the master once DC/OS runs
    with the extra configuration. See ``test_default`` for evidence that
    the custom configuration is used.
    """
    registry_auth = {'auth': 'redacted'}
    docker_credentials = {
        'auths': {
            'https://index.docker.io/v1/': registry_auth,
        },
    }
    config = {
        'cluster_docker_credentials': docker_credentials,
        'cluster_docker_credentials_enabled': True,
    }

    with Cluster(
        agents=0,
        public_agents=0,
        cluster_backend=cluster_backend,
    ) as cluster:
        cluster.install_dcos_from_path(oss_artifact, extra_config=config)
        cluster.wait_for_dcos_oss()
        (master, ) = cluster.masters
        # ``test -f`` exits non-zero if the file does not exist.
        file_check = ['test', '-f', path]
        master.run(args=file_check, user=cluster.default_ssh_user)
def test_wait_for_dcos_ee(
    self,
    cluster_backend: ClusterBackend,
    enterprise_artifact: Path,
    license_key_contents: str,
) -> None:
    """
    Waiting for DC/OS Enterprise succeeds on a cluster that is started in
    security disabled mode.
    """
    username = str(uuid.uuid4())
    password = str(uuid.uuid4())
    enterprise_config = {
        'superuser_username': username,
        'superuser_password_hash': sha512_crypt.hash(password),
        'fault_domain_enabled': False,
        'license_key_contents': license_key_contents,
        'security': 'disabled',
    }

    with Cluster(
        cluster_backend=cluster_backend,
        agents=0,
        public_agents=0,
    ) as cluster:
        full_config = {**cluster.base_config, **enterprise_config}
        cluster.install_dcos_from_path(
            build_artifact=enterprise_artifact,
            dcos_config=full_config,
            log_output_live=True,
        )
        cluster.wait_for_dcos_ee(
            superuser_username=username,
            superuser_password=password,
        )
def test_custom_key_pair(self, tmp_path: Path) -> None:
    """
    A custom key pair can be passed to the AWS backend, and the private key
    can then be used to run commands on a node.
    """
    key_name = 'e2e-test-' + uuid.uuid4().hex
    private_key_path = tmp_path / 'private_key'
    public_key_path = tmp_path / 'public_key'
    _write_key_pair(
        public_key_path=public_key_path,
        private_key_path=private_key_path,
    )

    backend = AWS(aws_key_pair=(key_name, private_key_path))
    ec2 = boto3.client('ec2', region_name=backend.aws_region)
    # Register the public key with EC2 under the generated name so that the
    # backend can refer to it.
    public_key_material = public_key_path.read_bytes()
    ec2.import_key_pair(
        KeyName=key_name,
        PublicKeyMaterial=public_key_material,
    )

    try:
        with Cluster(
            cluster_backend=backend,
            agents=0,
            public_agents=0,
        ) as cluster:
            (master, ) = cluster.masters
            # Build a separate ``Node`` that connects with the custom
            # private key.
            node_with_custom_key = Node(
                public_ip_address=master.public_ip_address,
                private_ip_address=master.private_ip_address,
                default_user=master.default_user,
                ssh_key_path=private_key_path,
            )
            node_with_custom_key.run(args=['echo', '1'])
    finally:
        # Always remove the imported key pair, even if the test failed.
        ec2.delete_key_pair(KeyName=key_name)
def static_three_master_cluster(
    artifact_path: Path,
    docker_backend: Docker,
    request: SubRequest,
    log_dir: Path,
) -> Generator[Cluster, None, None]:
    """
    Yield a Docker-backed DC/OS cluster with three master nodes and no
    agents, once DC/OS OSS has been installed and waited for.
    """
    cluster_shape = {'masters': 3, 'agents': 0, 'public_agents': 0}
    with Cluster(cluster_backend=docker_backend, **cluster_shape) as cluster:
        base_config = cluster.base_config
        ip_detect_path = docker_backend.ip_detect_path
        cluster.install_dcos_from_path(
            dcos_installer=artifact_path,
            dcos_config=base_config,
            ip_detect_path=ip_detect_path,
        )
        wait_for_dcos_oss(cluster=cluster, request=request, log_dir=log_dir)
        yield cluster
def test_mismatched_agents(
    self,
    dcos_cluster: Cluster,
    existing_cluster_backend: ClusterBackend,
) -> None:
    """
    A ``ValueError`` is raised if `agents` differs from the number of
    agents.
    """
    expected_error = (
        'The number of agent nodes is `1`. '
        'Therefore, `agents` must be set to `1`.'
    )

    with pytest.raises(ValueError) as excinfo:
        master_count = len(dcos_cluster.masters)
        # Deliberately ask for one agent more than the cluster has.
        wrong_agent_count = len(dcos_cluster.agents) + 1
        public_agent_count = len(dcos_cluster.public_agents)
        with Cluster(
            cluster_backend=existing_cluster_backend,
            generate_config_path=None,
            masters=master_count,
            agents=wrong_agent_count,
            public_agents=public_agent_count,
            destroy_on_error=False,
            destroy_on_success=False,
        ):
            pass  # pragma: no cover

    actual_error = str(excinfo.value)
    assert actual_error == expected_error
def test_destroy_on_success(
    self,
    dcos_cluster: Cluster,
    existing_cluster_backend: ClusterBackend,
) -> None:
    """
    A ``ValueError`` is raised if `destroy_on_success` is set to `True`.
    """
    expected_error = (
        'The given cluster backend does not support being destroyed.'
        ' Therefore, `destroy_on_success` must be set to `False`.'
    )

    with pytest.raises(ValueError) as excinfo:
        master_count = len(dcos_cluster.masters)
        agent_count = len(dcos_cluster.agents)
        public_agent_count = len(dcos_cluster.public_agents)
        with Cluster(
            cluster_backend=existing_cluster_backend,
            masters=master_count,
            agents=agent_count,
            public_agents=public_agent_count,
            destroy_on_error=False,
            destroy_on_success=True,
        ):
            pass  # pragma: no cover

    actual_error = str(excinfo.value)
    assert actual_error == expected_error
def test_install_dcos_from_url(self, oss_artifact_url: str) -> None:
    """
    Installing DC/OS from a build artifact URL is not implemented for the
    Docker backend, which requires a build artifact path instead.
    """
    expected_error = (
        'The Docker backend does not support the installation of DC/OS '
        'by build artifacts passed via URL string. This is because a more '
        'efficient installation method exists in `install_dcos_from_path`.'
    )
    docker_backend = Docker()
    with Cluster(
        cluster_backend=docker_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        with pytest.raises(NotImplementedError) as excinfo:
            cluster.install_dcos_from_url(
                build_artifact=oss_artifact_url,
                dcos_config=cluster.base_config,
            )

    actual_error = str(excinfo.value)
    assert actual_error == expected_error
def test_extra_config(
    self,
    dcos_cluster: Cluster,
    existing_cluster_backend: ClusterBackend,
) -> None:
    """
    A ``ValueError`` is raised if `extra_config` is not empty.
    """
    expected_error = (
        'Nodes are already configured. '
        'Therefore, `extra_config` must be empty.'
    )

    with pytest.raises(ValueError) as excinfo:
        master_count = len(dcos_cluster.masters)
        agent_count = len(dcos_cluster.agents)
        public_agent_count = len(dcos_cluster.public_agents)
        with Cluster(
            cluster_backend=existing_cluster_backend,
            masters=master_count,
            agents=agent_count,
            public_agents=public_agent_count,
            destroy_on_error=False,
            destroy_on_success=False,
            extra_config={'foo': 'bar'},
        ):
            pass  # pragma: no cover

    actual_error = str(excinfo.value)
    assert actual_error == expected_error
def test_live_logging(
    self,
    caplog: LogCaptureFixture,
    cluster_backend: ClusterBackend,
    oss_installer: Path,
) -> None:
    """
    With ``output=Output.LOG_AND_CAPTURE``, the error output of a failing
    DC/OS installation is found in the captured log records.
    """
    # DC/OS cannot be installed with two master nodes, so the installation
    # below is expected to fail with a ``CalledProcessError``.
    with pytest.raises(CalledProcessError), Cluster(
        masters=2,
        cluster_backend=cluster_backend,
    ) as cluster:
        ip_detect_path = cluster_backend.ip_detect_path
        base_config = cluster.base_config
        cluster.install_dcos_from_path(
            dcos_installer=oss_installer,
            ip_detect_path=ip_detect_path,
            dcos_config=base_config,
            output=Output.LOG_AND_CAPTURE,
        )

    error_logged = self._two_masters_error_logged(log_records=caplog.records)
    assert error_logged
def test_replace_all_static(
    artifact_path: Path,
    docker_network_three_available_addresses: Network,
    tmp_path: Path,
    request: SubRequest,
    log_dir: Path,
) -> None:
    """
    In a cluster with an Exhibitor backend consisting of a static ZooKeeper
    ensemble, after removing one master, and then adding another master with
    the same IP address, the cluster will get to a healthy state. This is
    repeated until all masters in the original cluster have been replaced.
    The purpose of this test is to assert that the ``node-poststart``
    procedure correctly prevents a master node replacement from being
    performed too quickly. A new master node should only become part of the
    cluster if there are no more underreplicated ranges reported by
    CockroachDB.

    Permanent CockroachDB data loss and a potential breakage of DC/OS occurs
    when a second master node is taken down for replacement while CockroachDB
    is recovering and there are still underreplicated ranges due to a recent
    other master node replacement.
    """
    docker_backend = Docker(network=docker_network_three_available_addresses)

    with Cluster(
        cluster_backend=docker_backend,
        # Allocate all 3 available IP addresses in the subnet.
        masters=3,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        master = next(iter(original_cluster.masters))
        # Find the name of the network interface that carries the master's
        # public IP address. The regular expression is a raw string so that
        # ``\w`` is not treated as an (invalid) string escape sequence.
        result = master.run(
            args=[
                'ifconfig',
                '|',
                'grep',
                '-B1',
                str(master.public_ip_address),
                '|',
                'grep',
                '-o',
                r'"^\w*"',
            ],
            output=Output.LOG_AND_CAPTURE,
            shell=True,
        )
        interface = result.stdout.strip().decode()
        ip_detect_contents = textwrap.dedent(
            """\
            #!/bin/bash -e
            if [ -f /sbin/ip ]; then
               IP_CMD=/sbin/ip
            else
               IP_CMD=/bin/ip
            fi

            $IP_CMD -4 -o addr show dev {interface} | awk '{{split($4,a,"/");print a[1]}}'
            """.format(interface=interface),
        )
        ip_detect_path = tmp_path / 'ip-detect'
        ip_detect_path.write_text(data=ip_detect_contents)
        static_config = {
            'master_discovery': 'static',
            'master_list': [str(master.private_ip_address)
                            for master in original_cluster.masters],
        }
        dcos_config = {
            **original_cluster.base_config,
            **static_config,
        }
        original_cluster.install_dcos_from_path(
            dcos_installer=artifact_path,
            dcos_config=dcos_config,
            ip_detect_path=ip_detect_path,
        )
        wait_for_dcos_oss(
            cluster=original_cluster,
            request=request,
            log_dir=log_dir,
        )
        current_cluster = original_cluster
        tmp_clusters = set()

        original_masters = original_cluster.masters

        try:
            for master_to_be_replaced in original_masters:
                # Destroy a master and free one IP address.
                original_cluster.destroy_node(node=master_to_be_replaced)

                temporary_cluster = Cluster(
                    cluster_backend=docker_backend,
                    # Allocate one container with the now free IP address.
                    masters=1,
                    agents=0,
                    public_agents=0,
                )
                tmp_clusters.add(temporary_cluster)

                # Install a new master on a new container with the same IP
                # address.
                (new_master, ) = temporary_cluster.masters
                new_master.install_dcos_from_path(
                    dcos_installer=artifact_path,
                    dcos_config=dcos_config,
                    role=Role.MASTER,
                    ip_detect_path=ip_detect_path,
                )
                # Form a new cluster with the newly created master node.
                new_cluster = Cluster.from_nodes(
                    masters=current_cluster.masters.union({new_master}),
                    agents=current_cluster.agents,
                    public_agents=current_cluster.public_agents,
                )
                # The `wait_for_dcos_oss` function waits until the new master
                # has joined the cluster and all masters are healthy. Without
                # the cockroachdb check, this succeeds before all cockroachdb
                # ranges have finished replicating to the new master. That
                # meant that the next master would be replaced too quickly,
                # while it had data that was not present elsewhere in the
                # cluster. This led to irrecoverable dataloss. This function
                # waits until the master node is "healthy". This is a
                # requirement for replacing the next master node.
                #
                # We don't call the cockroachdb ranges check directly as the
                # purpose of this test is to ensure that when an operator
                # follows our documented procedure for replacing a master
                # node multiple times in a row (e.g. during a cluster
                # upgrade) then the cluster remains healthy throughout and
                # afterwards.
                #
                # If we called the check directly here, we would be
                # sure the check is being called, but we would not be sure
                # that "wait_for_dcos_oss", i.e., the standard procedure for
                # determining whether a node is healthy, is sufficient to
                # prevent the cluster from breaking.
                #
                # We perform this check after every master is replaced, as
                # that is what we tell operators to do: "After installing the
                # new master node, wait until it becomes healthy before
                # proceeding to the next."
                #
                # The procedure for replacing multiple masters is documented
                # here:
                # https://docs.mesosphere.com/1.12/installing/production/upgrading/#dcos-masters
                wait_for_dcos_oss(
                    cluster=new_cluster,
                    request=request,
                    log_dir=log_dir,
                )
                # Use the new cluster object in the next replacement
                # iteration.
                current_cluster = new_cluster

        finally:
            # The temporary clusters are not managed by a context manager,
            # so they have to be destroyed explicitly.
            for cluster in tmp_clusters:
                cluster.destroy()
def test_custom_mounts(self, tmpdir: local) -> None:
    """
    Local files can be bind-mounted into nodes: one file into all nodes and
    one file each into only the master, agent and public agent nodes.

    The mounts are checked by writing fresh content to each local file and
    reading it back from the mount target on the node.
    """
    def _empty_file(name: str) -> local:
        # Create an empty file in the temporary directory.
        new_file = tmpdir.join(name)
        new_file.write('')
        return new_file

    def _bind_mount(source: local, target: Path) -> Mount:
        return Mount(source=str(source), target=str(target), type='bind')

    local_all_file = _empty_file('all_file.txt')
    local_master_file = _empty_file('master_file.txt')
    local_agent_file = _empty_file('agent_file.txt')
    local_public_agent_file = _empty_file('public_agent_file.txt')

    master_path = Path('/etc/on_master_nodes.txt')
    agent_path = Path('/etc/on_agent_nodes.txt')
    public_agent_path = Path('/etc/on_public_agent_nodes.txt')
    all_path = Path('/etc/on_all_nodes.txt')

    backend = Docker(
        custom_container_mounts=[_bind_mount(local_all_file, all_path)],
        custom_master_mounts=[_bind_mount(local_master_file, master_path)],
        custom_agent_mounts=[_bind_mount(local_agent_file, agent_path)],
        custom_public_agent_mounts=[
            _bind_mount(local_public_agent_file, public_agent_path),
        ],
    )

    with Cluster(
        cluster_backend=backend,
        masters=1,
        agents=1,
        public_agents=1,
    ) as cluster:
        # Every role is checked twice: once for its role-specific mount and
        # once for the mount shared by all nodes.
        expectations = [
            (cluster.masters, master_path, local_master_file),
            (cluster.masters, all_path, local_all_file),
            (cluster.agents, agent_path, local_agent_file),
            (cluster.agents, all_path, local_all_file),
            (
                cluster.public_agents,
                public_agent_path,
                local_public_agent_file,
            ),
            (cluster.public_agents, all_path, local_all_file),
        ]
        for nodes, node_path, host_file in expectations:
            for node in nodes:
                fresh_content = str(uuid.uuid4())
                host_file.write(fresh_content)
                cat_result = node.run(args=['cat', str(node_path)])
                assert cat_result.stdout.decode() == fresh_content
def run_command(
    args: List[str],
    cluster: Cluster,
    host: Node,
    transport: Transport,
    use_test_env: bool,
    dcos_login_uname: str,
    dcos_login_pw: str,
    env: Dict[str, str],
) -> None:
    """
    Run a command on a given cluster / host.

    If the command fails with a ``subprocess.CalledProcessError``, the
    process exits with the return code of the failed command.

    Args:
        args: The arguments to run on a node.
        cluster: The cluster to run a command on.
        host: the node to run a command on.
        transport: The transport to use to communicate with the cluster.
        use_test_env: Whether to use the DC/OS integration test environment
            to run the command in.
        dcos_login_uname: The DC/OS login username. This is only used if using
            the test environment and DC/OS Enterprise.
        dcos_login_pw: The DC/OS login password. This is only used if using
            the test environment and DC/OS Enterprise.
        env: Environment variables to set before running the command.
    """
    # ``click.get_terminal_size`` is deprecated in newer Click releases in
    # favour of the standard library function, which also unpacks as
    # ``(columns, lines)``.
    import shutil

    columns, rows = shutil.get_terminal_size()

    env = {
        # LINES and COLUMNS are needed if using the ``DOCKER_EXEC`` transport.
        # See https://github.com/moby/moby/issues/35407.
        'COLUMNS': str(columns),
        'LINES': str(rows),
        'DCOS_LOGIN_UNAME': dcos_login_uname,
        'DCOS_LOGIN_PW': dcos_login_pw,
        # Caller-supplied variables come last so that they take precedence.
        **env,
    }

    try:
        if use_test_env:
            cluster.run_integration_tests(
                pytest_command=args,
                tty=True,
                env=env,
                test_host=host,
                transport=transport,
            )
        else:
            host.run(
                args=args,
                log_output_live=False,
                tty=True,
                shell=True,
                env=env,
                transport=transport,
            )
    except subprocess.CalledProcessError as exc:
        # Propagate the exit code of the failed command to our caller.
        sys.exit(exc.returncode)
def test_replace_all_static(
    artifact_path: Path,
    docker_network_three_available_addresses: Network,
    tmp_path: Path,
    request: SubRequest,
    log_dir: Path,
) -> None:
    """
    In a cluster with an Exhibitor backend consisting of a static ZooKeeper
    ensemble, after removing one master, and then adding another master with
    the same IP address, the cluster will get to a healthy state. This is
    repeated until all masters in the original cluster have been replaced.
    The purpose of this test is to assert that the ``node-poststart``
    procedure correctly prevents a master node replacement from being
    performed too quickly. A new master node should only become part of the
    cluster if there are no more underreplicated ranges reported by
    CockroachDB.

    Permanent CockroachDB data loss and a potential breakage of DC/OS occurs
    when a second master node is taken down for replacement while CockroachDB
    is recovering and there are still underreplicated ranges due to a recent
    other master node replacement.
    """
    docker_backend = Docker(network=docker_network_three_available_addresses)

    with Cluster(
        cluster_backend=docker_backend,
        # Allocate all 3 available IP addresses in the subnet.
        masters=3,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        master = next(iter(original_cluster.masters))
        # Find the name of the network interface that carries the master's
        # public IP address. The regular expression is a raw string so that
        # ``\w`` is not treated as an (invalid) string escape sequence.
        result = master.run(
            args=[
                'ifconfig',
                '|',
                'grep',
                '-B1',
                str(master.public_ip_address),
                '|',
                'grep',
                '-o',
                r'"^\w*"',
            ],
            shell=True,
        )
        interface = result.stdout.strip().decode()
        ip_detect_contents = textwrap.dedent(
            """\
            #!/bin/bash -e
            if [ -f /sbin/ip ]; then
               IP_CMD=/sbin/ip
            else
               IP_CMD=/bin/ip
            fi

            $IP_CMD -4 -o addr show dev {interface} | awk '{{split($4,a,"/");print a[1]}}'
            """.format(interface=interface),
        )
        ip_detect_path = tmp_path / 'ip-detect'
        ip_detect_path.write_text(data=ip_detect_contents)
        static_config = {
            'master_discovery': 'static',
            'master_list': [
                str(master.private_ip_address)
                for master in original_cluster.masters
            ],
        }
        dcos_config = {
            **original_cluster.base_config,
            **static_config,
        }
        original_cluster.install_dcos_from_path(
            dcos_installer=artifact_path,
            dcos_config=dcos_config,
            ip_detect_path=ip_detect_path,
        )
        wait_for_dcos_oss(
            cluster=original_cluster,
            request=request,
            log_dir=log_dir,
        )
        current_cluster = original_cluster
        tmp_clusters = set()

        original_masters = original_cluster.masters

        try:
            for master_to_be_replaced in original_masters:
                # Destroy a master and free one IP address. The node is
                # destroyed through the original cluster, which is the
                # cluster it was created in. From the second iteration on,
                # ``current_cluster`` is a cluster built with
                # ``Cluster.from_nodes``, which presumably cannot destroy
                # nodes - TODO confirm against the backend.
                original_cluster.destroy_node(node=master_to_be_replaced)

                temporary_cluster = Cluster(
                    cluster_backend=docker_backend,
                    # Allocate one container with the now free IP address.
                    masters=1,
                    agents=0,
                    public_agents=0,
                )
                tmp_clusters.add(temporary_cluster)

                # Install a new master on a new container with the same IP
                # address.
                (new_master, ) = temporary_cluster.masters
                new_master.install_dcos_from_path(
                    dcos_installer=artifact_path,
                    dcos_config=dcos_config,
                    role=Role.MASTER,
                    ip_detect_path=ip_detect_path,
                )
                # Form a new cluster with the newly created master node.
                # ``union`` returns a new set; ``set.add`` must not be used
                # here as it returns ``None``.
                new_cluster = Cluster.from_nodes(
                    masters=current_cluster.masters.union({new_master}),
                    agents=current_cluster.agents,
                    public_agents=current_cluster.public_agents,
                )
                # The `wait_for_dcos_oss` function waits until the new master
                # has joined the cluster and all masters are healthy. Without
                # the cockroachdb check, this succeeds before all cockroachdb
                # ranges have finished replicating to the new master. That
                # meant that the next master would be replaced too quickly,
                # while it had data that was not present elsewhere in the
                # cluster. This led to irrecoverable dataloss. This function
                # waits until the master node is "healthy". This is a
                # requirement for replacing the next master node.
                #
                # We don't call the cockroachdb ranges check directly as the
                # purpose of this test is to ensure that when an operator
                # follows our documented procedure for replacing a master
                # node multiple times in a row (e.g. during a cluster
                # upgrade) then the cluster remains healthy throughout and
                # afterwards.
                #
                # If we called the check directly here, we would be
                # sure the check is being called, but we would not be sure
                # that "wait_for_dcos_oss", i.e., the standard procedure for
                # determining whether a node is healthy, is sufficient to
                # prevent the cluster from breaking.
                #
                # We perform this check after every master is replaced, as
                # that is what we tell operators to do: "After installing the
                # new master node, wait until it becomes healthy before
                # proceeding to the next."
                #
                # The procedure for replacing multiple masters is documented
                # here:
                # https://docs.mesosphere.com/1.12/installing/production/upgrading/#dcos-masters
                wait_for_dcos_oss(
                    cluster=new_cluster,
                    request=request,
                    log_dir=log_dir,
                )
                # Use the new cluster object in the next replacement
                # iteration.
                current_cluster = new_cluster

        finally:
            # The temporary clusters are not managed by a context manager,
            # so they have to be destroyed explicitly.
            for cluster in tmp_clusters:
                cluster.destroy()
from passlib.hash import sha512_crypt if len(sys.argv) != 2: print("Please specify the installer URL as argument.", file=sys.stderr) sys.exit(1) test_license = os.environ.get('DCOS_TEST_LICENSE') if not test_license: print("Please specify a license in $DCOS_TEST_LICENSE.", file=sys.stderr) sys.exit(1) private_key_path = os.environ.get('DCOS_TEST_SSH_KEY_PATH') aws_key_pair = ('default', private_key_path) if private_key_path else None cluster_backend = AWS(aws_key_pair=aws_key_pair) cluster = Cluster(cluster_backend=cluster_backend, agents=0, public_agents=0) username = '******' password = ''.join( random.choice(string.ascii_letters + string.digits) for i in range(12)) extra_config = { 'superuser_username': username, 'superuser_password_hash': sha512_crypt.hash(password), 'fault_domain_enabled': False, 'license_key_contents': test_license, } dcos_config = {**cluster.base_config, **extra_config} cluster.install_dcos_from_url(
from passlib.hash import sha512_crypt

# The installer URL is the single required command line argument.
if len(sys.argv) != 2:
    print("Please specify the installer URL as argument.", file=sys.stderr)
    sys.exit(1)

dcos_variant = os.environ.get('DCOS_TEST_VARIANT')
if not dcos_variant:
    print(
        "Please set DCOS_TEST_VARIANT to 'open' or 'enterprise'.",
        file=sys.stderr,
    )
    sys.exit(1)

# A custom SSH key pair is optional; without it no key pair is passed to the
# backend.
private_key_path = os.environ.get('DCOS_TEST_SSH_KEY_PATH')
aws_key_pair = ('default', private_key_path) if private_key_path else None
cluster_backend = AWS(aws_region='us-east-1', aws_key_pair=aws_key_pair)
cluster = Cluster(cluster_backend=cluster_backend, agents=0, public_agents=0)

username = '******'
# The superuser password protects a real cluster, so it is drawn from the
# operating system's CSPRNG instead of the default (predictable) generator.
_password_rng = random.SystemRandom()
_password_alphabet = string.ascii_letters + string.digits
password = ''.join(
    _password_rng.choice(_password_alphabet) for _ in range(12)
)

extra_config = {
    'superuser_username': username,
    'superuser_password_hash': sha512_crypt.hash(password),
    # 'fault_domain_enabled': False,
}

# The license is only added to the configuration when one is provided.
test_license = os.environ.get('DCOS_TEST_LICENSE')
if test_license:
    extra_config['license_key_contents'] = test_license

dcos_config = {**cluster.base_config, **extra_config}
def run_tests(e2e_backend, installer_url, dcos_license, dcos_url,
              admin_username, admin_password, ssh_user, ssh_key_path):
    """
    Run the tests against a cluster provided by the chosen backend.

    Args:
        e2e_backend: One of ``'dcos_launch'`` (a new cluster on AWS),
            ``'dcos_docker'`` (a new cluster on Docker) or ``'existing'``
            (an already running cluster). Any other value runs nothing.
        installer_url: The URL of the DC/OS installer. For the Docker backend
            it is downloaded to the current working directory unless the
            file already exists there.
        dcos_license: The license key contents put in the DC/OS config.
        dcos_url: The IPv4 address, URL or hostname of the master of an
            existing cluster. Only used by the ``'existing'`` backend.
        admin_username: The superuser username.
        admin_password: The superuser password.
        ssh_user: The SSH user, exported as ``CLI_TEST_SSH_USER``.
        ssh_key_path: The SSH key path, exported as
            ``CLI_TEST_SSH_KEY_PATH``.
    """
    os.environ["CLI_TEST_SSH_USER"] = ssh_user
    os.environ["CLI_TEST_MASTER_PROXY"] = "1"
    os.environ["CLI_TEST_SSH_KEY_PATH"] = ssh_key_path

    # extra dcos_config (for dcos_launch and dcos_docker backends)
    extra_config = {
        'superuser_username': admin_username,
        'superuser_password_hash': sha512_crypt.hash(admin_password),
        'fault_domain_enabled': False,
        'license_key_contents': dcos_license,
    }

    if e2e_backend == 'dcos_launch':
        cluster_backend = AWS()

        with Cluster(cluster_backend=cluster_backend, agents=1) as cluster:
            dcos_config = {**cluster.base_config, **extra_config}

            cluster.install_dcos_from_url(
                build_artifact=installer_url,
                dcos_config=dcos_config,
                log_output_live=True,
            )

            # The AWS cluster uses its own SSH key, which replaces the one
            # exported above.
            os.environ["CLI_TEST_SSH_KEY_PATH"] = str(
                cluster._cluster._ssh_key_path)

            _run_tests(cluster, admin_username, admin_password)
    elif e2e_backend == 'dcos_docker':
        dcos_ee_installer_filename = 'dcos_generate_config.ee.sh'
        dcos_ee_installer_path = Path.cwd() / Path(dcos_ee_installer_filename)

        # Reuse a previously downloaded installer if there is one.
        if not dcos_ee_installer_path.exists():
            urllib.request.urlretrieve(installer_url,
                                       dcos_ee_installer_filename)

        with Cluster(cluster_backend=Docker(), agents=1) as cluster:
            dcos_config = {**cluster.base_config, **extra_config}

            cluster.install_dcos_from_path(
                build_artifact=dcos_ee_installer_path,
                dcos_config=dcos_config,
                log_output_live=True,
            )

            _run_tests(cluster, admin_username, admin_password)
    elif e2e_backend == 'existing':
        try:
            dcos_ip = IPv4Address(dcos_url)
        except ValueError:
            # ``urlparse`` only extracts a hostname when the value has a
            # network location (e.g. ``https://host``). For a bare hostname
            # it yields ``None``, so fall back to the value itself.
            dcos_hostname = urlparse(dcos_url).hostname or dcos_url
            dcos_ip = IPv4Address(socket.gethostbyname(dcos_hostname))

        # The same address is used as both the public and the private one.
        masters = {
            Node(
                public_ip_address=dcos_ip,
                private_ip_address=dcos_ip,
                ssh_key_path=Path(ssh_key_path),
                default_ssh_user=ssh_user,
            ),
        }

        cluster = Cluster.from_nodes(
            masters=masters,
            agents=set(),
            public_agents=set(),
        )

        _run_tests(cluster, admin_username, admin_password)
def test_copy_files_to_installer(
    self,
    cluster_backend: ClusterBackend,
    enterprise_artifact: Path,
    license_key_contents: str,
) -> None:
    """
    Files on the host can be copied to the installer node at creation
    time.

    DC/OS Docker removes the installer container shortly after creating
    it, so the copied files cannot be inspected directly. Instead we check
    the symptoms: a custom CA certificate can be used. See the CA
    certificate tests in Enterprise DC/OS for more details.
    """
    cert_name = 'dcos-ca-certificate.crt'
    key_name = 'dcos-ca-certificate-key.key'

    # Destination paths inside the installer's ``genconf`` directory.
    genconf_dir = Path('/genconf')
    genconf_cert = genconf_dir / cert_name
    genconf_key = genconf_dir / key_name

    # Source paths on the host.
    host_cert_dir = Path('tests/test_dcos_e2e/certificates').resolve()
    host_cert = host_cert_dir / cert_name
    host_key = host_cert_dir / key_name

    key_path_on_master = Path(
        '/var/lib/dcos/pki/tls/CA/private/custom_ca.key',
    )

    credentials = {
        'superuser_username': str(uuid.uuid4()),
        'superuser_password': str(uuid.uuid4()),
    }

    enterprise_config = {
        'superuser_username': credentials['superuser_username'],
        'superuser_password_hash': sha512_crypt.hash(
            credentials['superuser_password'],
        ),
        'security': 'strict',
        'ca_certificate_path': str(genconf_cert),
        'ca_certificate_key_path': str(genconf_key),
        'fault_domain_enabled': False,
        'license_key_contents': license_key_contents,
    }

    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        [master] = cluster.masters
        master.send_file(
            local_path=host_key,
            remote_path=key_path_on_master,
        )

        cluster.install_dcos_from_path(
            build_artifact=enterprise_artifact,
            dcos_config={**cluster.base_config, **enterprise_config},
            log_output_live=True,
            ip_detect_path=cluster_backend.ip_detect_path,
            files_to_copy_to_genconf_dir=[
                (host_cert, genconf_cert),
                (host_key, genconf_key),
            ],
        )

        # We exercise the "http_checks=False" code here but we do not test
        # its functionality. It is a temporary measure while we wait for
        # more thorough dcos-checks.
        cluster.wait_for_dcos_ee(http_checks=False, **credentials)
        cluster.wait_for_dcos_ee(**credentials)

        # Verifying TLS against the host's copy of the certificate only
        # succeeds if the master serves the custom CA.
        requests.get(
            'https://' + str(master.public_ip_address),
            verify=str(host_cert),
        ).raise_for_status()
def test_copy_directory_to_node_installer_genconf_dir(
    self,
    cluster_backend: ClusterBackend,
    enterprise_artifact: Path,
    license_key_contents: str,
) -> None:
    """
    A whole directory on the host can be copied into the ``genconf``
    directory of the installing node when installing DC/OS.

    A custom CA certificate directory is a good example of this
    capability. See the CA certificate tests in Enterprise DC/OS for more
    details.
    """
    cert_name = 'dcos-ca-certificate.crt'
    key_name = 'dcos-ca-certificate-key.key'

    # The host directory is copied into ``/genconf``; the configuration
    # below refers to the files under ``/genconf/certificates``.
    genconf_dir = Path('/genconf')
    genconf_cert = genconf_dir / 'certificates' / cert_name
    genconf_key = genconf_dir / 'certificates' / key_name

    host_cert_dir = Path('tests/test_dcos_e2e/certificates').resolve()
    host_cert = host_cert_dir / cert_name
    host_key = host_cert_dir / key_name

    key_path_on_master = Path(
        '/var/lib/dcos/pki/tls/CA/private/custom_ca.key',
    )

    username = str(uuid.uuid4())
    password = str(uuid.uuid4())

    enterprise_config = {
        'superuser_username': username,
        'superuser_password_hash': sha512_crypt.hash(password),
        'security': 'strict',
        'ca_certificate_path': str(genconf_cert),
        'ca_certificate_key_path': str(genconf_key),
        'fault_domain_enabled': False,
        'license_key_contents': license_key_contents,
    }

    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as cluster:
        [master] = cluster.masters
        master.send_file(
            local_path=host_key,
            remote_path=key_path_on_master,
        )

        # Install through the node rather than the cluster.
        master.install_dcos_from_path(
            build_artifact=enterprise_artifact,
            dcos_config={**cluster.base_config, **enterprise_config},
            ip_detect_path=cluster_backend.ip_detect_path,
            role=Role.MASTER,
            files_to_copy_to_genconf_dir=[(host_cert_dir, genconf_dir)],
            log_output_live=True,
        )

        cluster.wait_for_dcos_ee(
            superuser_username=username,
            superuser_password=password,
        )

        # Verifying TLS against the host's copy of the certificate only
        # succeeds if the master serves the custom CA.
        requests.get(
            'https://' + str(master.public_ip_address),
            verify=str(host_cert),
        ).raise_for_status()
def create(
    agents: int,
    artifact: str,
    extra_config: Dict[str, Any],
    masters: int,
    public_agents: int,
    variant: str,
    workspace_dir: Optional[Path],
    license_key: Optional[str],
    security_mode: Optional[str],
    copy_to_master: List[Tuple[Path, Path]],
    cluster_id: str,
) -> None:
    """
    Create a DC/OS cluster.

        DC/OS Enterprise

            \b
            DC/OS Enterprise clusters require different configuration variables to DC/OS OSS.
            For example, enterprise clusters require the following configuration parameters:

            ``superuser_username``, ``superuser_password_hash``, ``fault_domain_enabled``, ``license_key_contents``

            \b
            These can all be set in ``--extra-config``.
            However, some defaults are provided for all but the license key.

            \b
            The default superuser username is ``admin``.
            The default superuser password is ``admin``.
            The default ``fault_domain_enabled`` is ``false``.

            \b
            ``license_key_contents`` must be set for DC/OS Enterprise 1.11 and above.
            This is set to one of the following, in order:

            \b
            * The ``license_key_contents`` set in ``--extra-config``.
            * The contents of the path given with ``--license-key``.
            * The contents of the path set in the ``DCOS_LICENSE_KEY_PATH`` environment variable.

            \b
            If none of these are set, ``license_key_contents`` is not given.
    """  # noqa: E501
    doctor_message = 'Try `dcos-vagrant doctor` for troubleshooting help.'

    # Every cluster gets its own freshly created directory below the chosen
    # (or the system temporary) base directory.
    workspace_root = workspace_dir or Path(tempfile.gettempdir())
    workspace_dir = workspace_root / uuid.uuid4().hex
    workspace_dir.mkdir(parents=True)

    artifact_path = Path(artifact).resolve()
    if variant == 'auto':
        variant = get_variant(
            artifact_path=artifact_path,
            workspace_dir=workspace_dir,
            doctor_message=doctor_message,
        )
    is_enterprise = variant == 'enterprise'

    # Metadata serialised to JSON and handed to the backend as the
    # VirtualBox description.
    vm_description = json.dumps(
        obj={
            CLUSTER_ID_DESCRIPTION_KEY: cluster_id,
            WORKSPACE_DIR_DESCRIPTION_KEY: str(workspace_dir),
            VARIANT_DESCRIPTION_KEY: 'ee' if is_enterprise else '',
        },
    )
    cluster_backend = Vagrant(
        workspace_dir=workspace_dir,
        virtualbox_description=vm_description,
    )

    if is_enterprise:
        superuser_username = '******'
        superuser_password = '******'

        enterprise_defaults = {
            'superuser_username': superuser_username,
            'superuser_password_hash': sha512_crypt.hash(superuser_password),
            'fault_domain_enabled': False,
        }
        if license_key is not None:
            enterprise_defaults['license_key_contents'] = Path(
                license_key,
            ).read_text()

        # Values given by the user take precedence over the defaults.
        extra_config = {**enterprise_defaults, **extra_config}

    if security_mode is not None:
        extra_config['security'] = security_mode

    try:
        cluster = Cluster(
            cluster_backend=cluster_backend,
            masters=masters,
            agents=agents,
            public_agents=public_agents,
            files_to_copy_to_installer=[],
        )
    except CalledProcessError as exc:
        click.echo('Error creating cluster.', err=True)
        click.echo(doctor_message)
        sys.exit(exc.returncode)

    # Each requested file is sent to every master.
    for master in cluster.masters:
        for local_path, remote_path in copy_to_master:
            master.send_file(
                local_path=local_path,
                remote_path=remote_path,
            )

    full_config = {**cluster.base_config, **extra_config}
    try:
        with click_spinner.spinner():
            cluster.install_dcos_from_path(
                build_artifact=artifact_path,
                dcos_config=full_config,
            )
    except CalledProcessError as exc:
        click.echo('Error installing DC/OS.', err=True)
        click.echo(doctor_message)
        # Destroy the cluster before exiting after a failed installation.
        cluster.destroy()
        sys.exit(exc.returncode)