def test_ssh_tunnel(sshd_manager):
    with sshd_manager.run(1) as sshd_ports:
        tunnel_args = {
            'ssh_user': getpass.getuser(),
            'ssh_key_path': sshd_manager.key_path,
            'host': '127.0.0.1',
            'port': sshd_ports[0]}
        with closing(SSHTunnel(**tunnel_args)) as tunnel:
            tunnel_write_and_run(tunnel.write_to_remote, tunnel.remote_cmd)
def main():
    options = check_environment()
    host_list = None
    vpc = None  # Set if the test owns the VPC

    if options.host_list is None:
        log.info('CCM_VPC_HOSTS not provided, requesting new VPC from CCM...')
        vpc = make_vpc(use_bare_os=options.test_install_prereqs)
        host_list = vpc.hosts()
    else:
        host_list = options.host_list

    assert os.path.exists('ssh_key'), 'Valid SSH key for hosts must be in working dir!'
    # key must be chmod 600 for test_runner to use
    os.chmod('ssh_key', stat.S_IREAD | stat.S_IWRITE)

    # Create custom SSH Runner to help orchestrate the test
    ssh_user = '******'
    ssh_key_path = 'ssh_key'
    remote_dir = '/home/centos'

    if options.use_api:
        installer = test_util.installer_api_test.DcosApiInstaller()
        if not options.test_install_prereqs:
            # If we don't want to test the prereq install, use offline mode to avoid it
            installer.offline_mode = True
    else:
        installer = test_util.installer_api_test.DcosCliInstaller()

    host_list_w_port = [i + ':22' for i in host_list]

    @retry(stop_max_delay=120000)
    def establish_host_connectivity():
        """Continually try to recreate the SSH Tunnels to all hosts for 2 minutes
        """
        return closing(TunnelCollection(ssh_user, ssh_key_path, host_list_w_port))

    log.info('Checking that hosts are accessible')
    with establish_host_connectivity() as tunnels:
        local_ip = {}
        for tunnel in tunnels.tunnels:
            local_ip[tunnel.host] = get_local_address(tunnel, remote_dir)
            if options.do_setup:
                # Make the default user privileged to use docker
                tunnel.remote_cmd(['sudo', 'usermod', '-aG', 'docker', ssh_user])

    # use first node as bootstrap node, second node as master, all others as agents
    test_host = host_list[0]
    test_host_local = local_ip[host_list[0]]
    master_list = [local_ip[host_list[1]]]
    agent_list = [local_ip[host_list[2]]]
    public_agent_list = [local_ip[host_list[3]]]
    log.info('Test host public/private IP: ' + test_host + '/' + test_host_local)

    with closing(SSHTunnel(ssh_user, ssh_key_path, test_host)) as test_host_tunnel:
        log.info('Setting up installer on test host')
        installer.setup_remote(
            tunnel=test_host_tunnel,
            installer_path=remote_dir + '/dcos_generate_config.sh',
            download_url=options.installer_url)
        if options.do_setup:
            # only do on setup so you can rerun this test against a living installer
            log.info('Verifying installer password hashing')
            test_pass = '******'
            hash_passwd = installer.get_hashed_password(test_pass)
            assert passlib.hash.sha512_crypt.verify(test_pass, hash_passwd), 'Hash does not match password'
            if options.use_api:
                installer.start_web_server()

        with open(pkg_resources.resource_filename("gen", "ip-detect/aws.sh")) as ip_detect_fh:
            ip_detect_script = ip_detect_fh.read()
        with open('ssh_key', 'r') as key_fh:
            ssh_key = key_fh.read()

        # Using static exhibitor is the only option in the GUI installer
        if options.use_api:
            log.info('Installer API is selected, so configure for static backend')
            zk_host = None  # causes genconf to use static exhibitor backend
        else:
            log.info('Installer CLI is selected, so configure for ZK backend')
            zk_host = test_host_local + ':2181'
            zk_cmd = [
                'sudo', 'docker', 'run', '-d', '-p', '2181:2181',
                '-p', '2888:2888', '-p', '3888:3888', 'jplock/zookeeper']
            test_host_tunnel.remote_cmd(zk_cmd)

        log.info("Configuring install...")
        installer.genconf(
            zk_host=zk_host,
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            ip_detect_script=ip_detect_script,
            ssh_user=ssh_user,
            ssh_key=ssh_key,
            add_config_path=options.add_config_path)

        log.info("Running Preflight...")
        if options.test_install_prereqs:
            # Runs preflight in --web or --install-prereqs for CLI
            # This may take up to 15 minutes...
            installer.install_prereqs()
            if options.test_install_prereqs_only:
                if vpc:
                    vpc.delete()
                sys.exit(0)
        else:
            # Will not fix errors detected in preflight
            installer.preflight()

        log.info("Running Deploy...")
        installer.deploy()
        log.info("Running Postflight")
        installer.postflight()

        # Runs dcos-image/integration_test.py inside the cluster
        test_util.test_runner.prepare_test_registry(tunnel=test_host_tunnel, test_dir=remote_dir)
        result = test_util.test_runner.integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=vpc.get_region() if vpc else DEFAULT_AWS_REGION,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            provider='onprem',
            # Setting dns_search: mesos is not currently supported in the API test
            test_dns_search=not options.use_api,
            ci_flags=options.ci_flags,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key,
            add_env=options.add_env)

    # TODO(cmaloney): add a `--healthcheck` option which runs dcos-diagnostics
    # on every host to see if they are working.
    if result:
        log.info("Test successful!")
        # Delete the cluster if all was successful to minimize potential costs.
        # Hosts of failed clusters will continue running for debugging.
        if vpc is not None:
            vpc.delete()
def run():
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('01234567890abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()
    if template_parameters.get('numberOfPrivateSlaves'):
        assert template_parameters['numberOfPrivateSlaves']['value'] >= 2, \
            'Test requires at least 2 private slaves!'
    else:
        template_parameters['numberOfPrivateSlaves'] = {'value': 2}
    if template_parameters.get('numberOfPublicSlaves'):
        assert template_parameters['numberOfPublicSlaves']['value'] >= 1, \
            'Test requires at least 1 public slave!'
    else:
        template_parameters['numberOfPublicSlaves'] = {'value': 1}

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(group_name, ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)
        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['dnsAddress']['value']
        master_url = "http://{}".format(master_lb)

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 core@{}".format(master_lb))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(15 * 60 * 1000))
        def poll_on_dcos_ui_up():
            r = get_dcos_ui(master_url)
            assert r is not None and r.status_code == requests.codes.ok, \
                "Unable to reach DC/OS UI: {}".format(master_url)

        print("Waiting for DC/OS UI at: {} ...".format(master_url))
        poll_on_dcos_ui_up()

        # The test runs from inside the cluster, so grab the private IPs of all nodes
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {
            'masterNodeNic': [],
            'slavePrivateNic': [],
            'slavePublicNic': []}

        for resource in rmc.resource_groups.list_resources(group_name):
            for bucket_name, bucket in ip_buckets.items():
                if resource.name.startswith(bucket_name):
                    nic = nmc.network_interfaces.get(group_name, resource.name)
                    all_ips = []
                    for config in nic.ip_configurations:
                        all_ips.append(config.private_ip_address)
                    bucket.extend(all_ips)

        with closing(SSHTunnel('core', 'ssh_key', master_lb, port=2200)) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/core',
                dcos_dns=master_lb,
                master_list=ip_buckets['masterNodeNic'],
                agent_list=ip_buckets['slavePrivateNic'],
                public_agent_list=ip_buckets['slavePublicNic'],
                provider='azure',
                test_dns_search=False,
                pytest_dir=os.getenv('DCOS_PYTEST_DIR', '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        # Send a delete request
        # TODO(cmaloney): The old code had a retry around this:
        # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
        poller = rmc.resource_groups.delete(group_name)

        # poll for the delete to complete
        print("Deleting resource group: {} ...".format(group_name))

        @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
        def wait_for_delete():
            assert poller.done(), "Timed out waiting for delete"

        print("Waiting for delete ...")
        wait_for_delete()

        print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def main():
    validate_env()
    location = os.getenv('AZURE_LOCATION', 'East US')
    credentials = azure.common.credentials.ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])
    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    template = TemplateLink(uri=os.environ['AZURE_TEMPLATE_URL'])
    # tenant_id = os.environ.get('AZURE_TENANT_ID')
    # client_id = os.environ.get('AZURE_CLIENT_ID')
    # client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    group_name = 'testing' + ''.join(random.choice('01234567890abcdef') for n in range(10))
    deployment_name = 'deployment{}'.format(uuid.uuid4().hex)
    rmc = ResourceManagementClient(credentials, subscription_id)
    template_parameters = get_env_params()

    # Output resource group
    print("Resource group name: {}".format(group_name))
    print("Deployment name: {}".format(deployment_name))
    azure_cluster = {
        'resource_group_name': group_name,
        'deployment_name': deployment_name}
    pkgpanda.util.write_json('azure-cluster.json', azure_cluster)

    # Create a new resource group
    print("Creating new resource group in location: {}".format(location))
    if rmc.resource_groups.check_existence(group_name):
        print("ERROR: Group name already exists / taken: {}".format(group_name))
    rmc.resource_groups.create_or_update(group_name, ResourceGroup(location=location))

    test_successful = False

    try:
        deployment_properties = DeploymentProperties(
            template_link=template,
            mode=DeploymentMode.incremental,
            parameters=template_parameters)
        # Use RPC against azure to validate the ARM template is well-formed
        result = rmc.deployments.validate(group_name, deployment_name, properties=deployment_properties)
        if result.error:
            print("Template verification failed\n{}".format(result.error), file=sys.stderr)
            sys.exit(1)

        # Actually create a template deployment
        print("Creating template deployment ...")
        deploy_poller = rmc.deployments.create_or_update(group_name, deployment_name, deployment_properties)

        # Stop after 45 attempts (each one takes up to one minute)
        @retry(stop_max_attempt_number=45)
        def poll_deploy():
            res = deploy_poller.result(timeout=60)
            print("Current deploy state: {}".format(res.properties.provisioning_state))
            assert deploy_poller.done(), "Not done deploying."

        print("Waiting for template to deploy ...")
        try:
            poll_deploy()
        except:
            print("Current deploy status:\n{}".format(deploy_poller.result(0)))
            raise
        print("Template deployed successfully")

        assert deploy_poller.done(), "Deployment failed / polling didn't reach deployment done."
        deployment_result = deploy_poller.result()
        print(deployment_result.properties.outputs)
        master_lb = deployment_result.properties.outputs['masterFQDN']['value']

        print("Template deployed using SSH private key: https://mesosphere.onelogin.com/notes/18444")
        print("For troubleshooting, master0 can be reached using: ssh -p 2200 {}@{}".format(
            get_value('linuxAdminUsername'), master_lb))

        # The test runs from inside the cluster, so grab the private IPs of all nodes
        nmc = NetworkManagementClient(credentials, subscription_id)
        ip_buckets = {'master': [], 'private': [], 'public': []}

        for resource in rmc.resource_groups.list_resources(
                group_name,
                filter=("resourceType eq 'Microsoft.Network/networkInterfaces' or "
                        "resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'")):
            if resource.type == 'Microsoft.Network/networkInterfaces':
                nics = [nmc.network_interfaces.get(group_name, resource.name)]
            elif resource.type == 'Microsoft.Compute/virtualMachineScaleSets':
                nics = list(nmc.network_interfaces.list_virtual_machine_scale_set_network_interfaces(
                    virtual_machine_scale_set_name=resource.name,
                    resource_group_name=group_name))
            else:
                raise Exception('Unexpected resourceType: {}'.format(resource.type))
            for bucket_name in ip_buckets.keys():
                if bucket_name in resource.name:
                    for n in nics:
                        for config in n.ip_configurations:
                            ip_buckets[bucket_name].append(config.private_ip_address)

        print('Detected IP configuration: {}'.format(ip_buckets))

        with SSHTunnel(get_value('linuxAdminUsername'), 'ssh_key', master_lb, port=2200) as t:
            integration_test(
                tunnel=t,
                test_dir='/home/{}'.format(get_value('linuxAdminUsername')),
                dcos_dns=ip_buckets['master'][0],
                master_list=ip_buckets['master'],
                agent_list=ip_buckets['private'],
                public_agent_list=ip_buckets['public'],
                provider='azure',
                test_dns_search=False,
                add_env=get_test_config(),
                pytest_dir=os.getenv('DCOS_PYTEST_DIR', '/opt/mesosphere/active/dcos-integration-test'),
                pytest_cmd=os.getenv('DCOS_PYTEST_CMD', "py.test -rs -vv -m 'not ccm' ") + os.getenv('CI_FLAGS', ''))
        test_successful = True
    except Exception as ex:
        traceback.print_exc()
        print("ERROR: exception {}".format(ex))
        raise
    finally:
        if os.getenv('AZURE_CLEANUP') == 'false':
            print("Cluster must be cleaned up manually")
            print("Cluster details: {}".format(azure_cluster))
        else:
            # Send a delete request
            # TODO(cmaloney): The old code had a retry around this:
            # @retry(wait_exponential_multiplier=1000, wait_exponential_max=60*1000, stop_max_delay=(30*60*1000))
            poller = rmc.resource_groups.delete(group_name)

            # poll for the delete to complete
            print("Deleting resource group: {} ...".format(group_name))

            @retry(wait_fixed=(5 * 1000), stop_max_delay=(60 * 60 * 1000))
            def wait_for_delete():
                assert poller.done(), "Timed out waiting for delete"

            print("Waiting for delete ...")
            wait_for_delete()

            print("Clean up successful")

    if test_successful:
        print("Azure test deployment succeeded")
    else:
        print("ERROR: Azure test deployment failed", file=sys.stderr)
        sys.exit(2)
def main():
    options = check_environment()
    random_identifier = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
    unique_cluster_id = 'CF-integration-test-{}'.format(random_identifier)
    log.info('Spinning up AWS CloudFormation with ID: {}'.format(unique_cluster_id))
    bw = test_util.aws.BotoWrapper(
        region=options.aws_region,
        aws_access_key_id=options.aws_access_key_id,
        aws_secret_access_key=options.aws_secret_access_key)
    # TODO(mellenburg): use randomly generated keys; this key is delivered by CI or the user
    ssh_key_path = 'default_ssh_key'
    cf = test_util.aws.DcosCfSimple.create(
        stack_name=unique_cluster_id,
        template_url=options.template_url,
        private_agents=2,
        public_agents=1,
        admin_location='0.0.0.0/0',
        key_pair_name='default',
        boto_wrapper=bw)
    cf.wait_for_stack_creation()

    # key must be chmod 600 for test_runner to use
    os.chmod(ssh_key_path, stat.S_IREAD | stat.S_IWRITE)

    # Create custom SSH Runner to help orchestrate the test
    ssh_user = '******'
    remote_dir = '/home/core'

    master_ips = cf.get_master_ips()
    public_agent_ips = cf.get_public_agent_ips()
    private_agent_ips = cf.get_private_agent_ips()
    test_host = master_ips[0].public_ip
    log.info('Running integration test from: ' + test_host)
    master_list = [i.private_ip for i in master_ips]
    log.info('Master private IPs: ' + repr(master_list))
    agent_list = [i.private_ip for i in private_agent_ips]
    log.info('Private agent private IPs: ' + repr(agent_list))
    public_agent_list = [i.private_ip for i in public_agent_ips]
    log.info('Public agent private IPs: ' + repr(public_agent_list))
    log.info(
        'To access this cluster, use the Mesosphere default shared AWS key '
        '(https://mesosphere.onelogin.com/notes/16670) and SSH with:\n'
        'ssh -i default_ssh_key {}@{}'.format(ssh_user, test_host))

    with closing(SSHTunnel(ssh_user, ssh_key_path, test_host)) as test_host_tunnel:
        # Allow docker use w/o sudo
        result = test_util.test_runner.integration_test(
            tunnel=test_host_tunnel,
            test_dir=remote_dir,
            region=options.aws_region,
            dcos_dns=master_list[0],
            master_list=master_list,
            agent_list=agent_list,
            public_agent_list=public_agent_list,
            provider='aws',
            test_dns_search=False,
            aws_access_key_id=options.aws_access_key_id,
            aws_secret_access_key=options.aws_secret_access_key,
            add_env=options.add_env,
            pytest_dir=options.pytest_dir,
            pytest_cmd=options.pytest_cmd)

    if result == 0:
        log.info('Test successful! Deleting CloudFormation...')
        cf.delete()
    else:
        log.info('Test failed! VPC will remain alive for debugging')
    if options.ci_flags:
        result = 0  # Wipe the return code so that tests can be muted in CI
    sys.exit(result)
def tunnel(self, host):
    return SSHTunnel(self.user, self.key_path, host.public_ip)
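# Usage sketch (the `runner` and `host` names below are hypothetical): elsewhere in
# these scripts the returned SSHTunnel is wrapped in contextlib.closing() so the
# connection is torn down even if the remote command fails, e.g.:
#
#     with closing(runner.tunnel(host)) as t:
#         t.remote_cmd(['uname', '-a'])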