Example #1
def test_ssh_tunnel(sshd_manager):
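    # run(1) presumably launches a single sshd instance and yields its listening ports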
    with sshd_manager.run(1) as sshd_ports:
        tunnel_args = {
            'ssh_user': getpass.getuser(),
            'ssh_key_path': sshd_manager.key_path,
            'host': '127.0.0.1',
            'port': sshd_ports[0]
        }
        with closing(SSHTunnel(**tunnel_args)) as tunnel:
            tunnel_write_and_run(tunnel.write_to_remote, tunnel.remote_cmd)
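
Example #1 drives SSHTunnel through contextlib.closing, which guarantees close() is called even if the body raises. A minimal standalone sketch of the same pattern, assuming SSHTunnel is importable and exposes remote_cmd() as in the examples below (run_remote_echo and its arguments are illustrative placeholders):

import getpass
from contextlib import closing

def run_remote_echo(key_path, host, port):
    # closing() adapts any object with a close() method into a context
    # manager, so the tunnel is torn down on success and on error alike.
    with closing(SSHTunnel(ssh_user=getpass.getuser(),
                           ssh_key_path=key_path,
                           host=host,
                           port=port)) as tunnel:
        # remote_cmd takes an argv-style list, per the examples in this file
        return tunnel.remote_cmd(['echo', 'hello'])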
Example #2
def main():
    options = check_environment()

    host_list = None
    vpc = None  # Set if the test owns the VPC

    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list

    assert os.path.exists(
        'ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)

    # Create custom SSH Runner to help orchestrate the test
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'

    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we don't want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()

    host_list_w_port = [i + ':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(
            TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(
                    ['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # use first node as bootstrap node, second node as master, all others as agents
    test_host = host_list[0]
    test_host_local = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test host public/private IP: ' + test_host + '/' +
             test_host_local)

    with closing(SSHTunnel(ssh_user, ssh_key_path,
                           test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')

        installer.setup_remote(tunnel=test_host_tunnel,
                               installer_path=remote_dir +
                               '/dcos_generate_config.sh',
                               download_url=options.installer_url)
        if options.do_setup:
            # Only run during setup so this test can be rerun against a live installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(
                test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()

        with open(pkg_resources.resource_filename(
                "gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()
        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info(
                'Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = test_host_local + ':2181'
            zk_cmd = [
                'sudo', 'docker', 'run', '-d', '-p', '2181:2181', '-p',
                '2888:2888', '-p', '3888:3888', 'jplock/zookeeper'
            ]
            test_host_tunnel.remote_cmd(zk_cmd)

        log.info("Configuring install...")
        installer.genconf(zk_host=zk_host,
                          master_list=master_list,
                          agent_list=agent_list,
                          public_agent_list=public_agent_list,
                          ip_detect_script=ip_detect_script,
                          ssh_user=ssh_user,
                          ssh_key=ssh_key,
                          add_config_path=options.add_config_path)

        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()

        log.info("Running Deploy...")
        installer.deploy()

        log.info("Running Postflight")
        installer.postflight()

        # Runs dcos-image/integration_test.py inside the cluster
        test_util.test_runner.prepare_test_registry(tunnel=test_host_tunnel,
                                                    test_dir=remote_dir)
        result = test_util.test_runner.integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            provider='onprem',
            # Setting dns_search: mesos not currently supported in API
            test_dns_search=not options.use_api,
            ci_flags=options.ci_flags,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key,
            add_env=options.add_env)

    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.

    if result:
        log.info("Test successsful!")
        # Delete the cluster if all was successful to minimize potential costs.
        # Failed clusters the hosts will continue running
        if vpc is not None:
            vpc.delete()
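
The @retry(stop_max_delay=120000) decorator in this example looks like the `retrying` library, whose delays are given in milliseconds. A hedged sketch of the same retry-until-deadline idea (wait_for_port is illustrative, not part of the original code):

import socket

from retrying import retry

@retry(stop_max_delay=120000)  # keep retrying for up to 2 minutes (ms)
def wait_for_port(host, port):
    # Any exception counts as a failed attempt and triggers another retry
    # until the deadline passes, mirroring establish_host_connectivity above.
    socket.create_connection((host, port), timeout=5).close()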
Example #3
def run():
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(
        random.choice('01234567890abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves'][
            'value'] >= 2, 'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves'][
            'value'] >= 1, 'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name
    }
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print(
            "ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(group_name,
                                         ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name,
                                          deployment_name,
                                          properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error),
                  file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(
            group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(
                res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), \
            "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)

        print(
            "Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444"
        )
        print(
            "For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}"
            .format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []
        }

        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    all_ips = []
                    for config in nic.ip_configurations:
                        all_ips.append(config.private_ip_address)
                    bucket.extend(all_ips)

        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/core',
                dcos_dns=master_lb,
                master_list=ip_buckets['masterNodeNic'],
                agent_list=ip_buckets['slavePrivateNic'],
                public_agent_list=ip_buckets['slavePublicNic'],
                provider='azure',
                test_dns_search=False,
                pytest_dir=os.getenv(
                    'DCOS_PYTEST_DIR',
                    '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD',
                                     "py.test -vv -m 'not ccm' ") +
                os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()

        print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
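
Both Azure examples poll a long-running deployment by combining result(timeout=60) with a done() check under @retry. The same loop written out explicitly, assuming an LROPoller-like object with those two methods (wait_until_done is illustrative):

def wait_until_done(poller, attempts=45, step_seconds=60):
    # Mirrors @retry(stop_max_attempt_number=45) around result(timeout=60):
    # each result() call blocks for up to step_seconds before we re-check.
    for _ in range(attempts):
        result = poller.result(timeout=step_seconds)
        if poller.done():
            return result
    raise TimeoutError('deployment did not finish within the allotted attempts')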
Example #4
def main():
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(
        random.choice('01234567890abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)

    rmc = ResourceManagementClient(credentials, subscription_id)

    template_parameters = get_env_params()

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))

    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name
    }
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print(
            "ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(group_name,
                                         ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)

        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name,
                                          deployment_name,
                                          properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error),
                  file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(
            group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(
                res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), \
            "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']

        print(
            "Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444"
        )
        print(
            "For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}"
            .format(get_value('linuxAdminUsername'), master_lb))

        # Run test now, so grab IPs
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {'master': [], 'private': [], 'public': []}

        for resource in rmc.resource_groups.list_resources(
                group_name,
                filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                        "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(
                    nmc.network_interfaces.
                    list_virtual_machine_scale_set_network_interfaces(
                        virtual_machine_scale_set_name=resource.name,
                        resource_group_name=group_name))
            else:
                raise Exception('Unexpected resourceType: {}'.format(resource.type))

            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(
                                config.private_ip_address)

        print('Detected IP configuration: {}'.format(ip_buckets))

        with SSHTunnel(get_value('linuxAdminUsername'),
                       'ssh_key',
                       master_lb,
                       port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_dir=os.getenv(
                    'DCOS_PYTEST_DIR',
                    '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD',
                                     "py.test -rs -vv -m 'not ccm' ") +
                os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()

            print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
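
Example #4 buckets NIC private IPs by substring match between the bucket name ('master', 'private', 'public') and the Azure resource name. The same idea in isolation, with a hypothetical nics_by_resource mapping standing in for the ResourceManagementClient/NetworkManagementClient calls:

def bucket_ips(nics_by_resource, roles=('master', 'private', 'public')):
    # nics_by_resource: {resource_name: [private_ip, ...]} (illustrative)
    buckets = {role: [] for role in roles}
    for name, ips in nics_by_resource.items():
        for role in roles:
            if role in name:
                buckets[role].extend(ips)
    return buckets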
Example #5
File: test_aws_cf.py Project: enst/dcos
def main():
    options = check_environment()

    random_identifier = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(10))
    unique_cluster_id = 'CF-integration-test-{}'.format(random_identifier)
    log.info(
        'Spinning up AWS CloudFormation with ID: {}'.format(unique_cluster_id))
    bw = test_util.aws.BotoWrapper(
        region=options.aws_region,
        aws_access_key_id=options.aws_access_key_id,
        aws_secret_access_key=options.aws_secret_access_key)
    # TODO(mellenburg): use randomly generated keys; this key is delivered by CI or the user
    ssh_key_path = 'default_ssh_key'
    cf = test_util.aws.DcosCfSimple.create(stack_name=unique_cluster_id,
                                           template_url=options.template_url,
                                           private_agents=2,
                                           public_agents=1,
                                           admin_location='0.0.0.0/0',
                                           key_pair_name='default',
                                           boto_wrapper=bw)
    cf.wait_for_stack_creation()

    # key must be chmod 600 for test_runner to use
    os.chmod(ssh_key_path, stat.S_IREAD | stat.S_IWRITE)

    # Create custom SSH Runner to help orchestrate the test
    ssh_user = '******'
    remote_dir = '/home/core'

    master_ips = cf.get_master_ips()
    public_agent_ips = cf.get_public_agent_ips()
    private_agent_ips = cf.get_private_agent_ips()
    test_host = master_ips[0].public_ip
    log.info('Running integration test from: ' + test_host)
    master_list = [i.private_ip for i in master_ips]
    log.info('Master private IPs: ' + repr(master_list))
    agent_list = [i.private_ip for i in private_agent_ips]
    log.info('Private agent private IPs: ' + repr(agent_list))
    public_agent_list = [i.private_ip for i in public_agent_ips]
    log.info('Public agent private IPs: ' + repr(public_agent_list))

    log.info(
        'To access this cluster, use the Mesosphere default shared AWS key '
        '(https://mesosphere.onelogin.com/notes/16670) and SSH with:\n'
        'ssh -i default_ssh_key {}@{}'.format(ssh_user, test_host))
    with closing(SSHTunnel(ssh_user, ssh_key_path,
                           test_host)) as test_host_tunnel:
        # Allow docker use w/o sudo
        result = test_util.test_runner.integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=options.aws_region,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            provider='aws',
            test_dns_search=False,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key,
            add_env=options.add_env,
            pytest_dir=options.pytest_dir,
            pytest_cmd=options.pytest_cmd)
    if result == 0:
        log.info('Test successful! Deleting CloudFormation...')
        cf.delete()
    else:
        log.info('Test failed! CloudFormation stack will remain alive for debugging')
    if options.ci_flags:
        result = 0  # Wipe the return code so that tests can be muted in CI
    sys.exit(result)
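
The os.chmod call in this example is the stat-module spelling of `chmod 600`, which OpenSSH requires before it will use a private key file. As a standalone sketch:

import os
import stat

# stat.S_IREAD | stat.S_IWRITE == 0o600: read/write for the owner only;
# ssh rejects private keys that are accessible to group or others.
os.chmod('default_ssh_key', stat.S_IREAD | stat.S_IWRITE)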
Example #6
def tunnel(self, host):
    return SSHTunnel(self.user, self.key_path, host.public_ip)
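
A possible call site for this helper, assuming the owning class stores `user` and `key_path` and that `host` objects expose `public_ip` as in Example #5 (`runner` and `host` are placeholders):

from contextlib import closing

with closing(runner.tunnel(host)) as t:
    t.remote_cmd(['uname', '-a'])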