def doUpdateEnvironmentVariables(self, ssh, master_node, worker_nodes):
    """Write StratusLab cluster environment variables on every host.

    Creates /etc/profile.d/stratuslab_cluster.sh on each host, exporting
    the per-host node counter, the master's DNS name, the cluster size
    and the total number of cores across the active nodes.
    """
    printStep('Updating environment variables')

    # Nodes that count towards the cluster size / core total.
    active_nodes = self.hosts if self.include_master else worker_nodes
    total_cores = sum(int(node.cores) for node in active_nodes)

    profile = '/etc/profile.d/stratuslab_cluster.sh'
    for counter, node in enumerate(self.hosts):
        command = (
            "'echo export STRATUSLAB_NC=%s > %s && "
            "echo export STRATUSLAB_CMASTER=%s >> %s && "
            "echo export STRATUSLAB_CSIZE=%s >> %s && "
            "echo export STRATUSLAB_CMAX_CORES=%s >> %s'"
            % (counter, profile,
               master_node.public_dns, profile,
               len(active_nodes), profile,
               total_cores, profile))
        ssh.run_remote_command([node], command)
def _installPackages(self, section):
    """Install the packages configured for *section* under the current profile.

    Does nothing when the section lists no packages.
    """
    to_install = self.packages[self.profile].get(section, [])
    if not to_install:
        return
    printStep('Installing packages on %s for section "%s": %s'
              % (self.profile, section, ', '.join(to_install)))
    self.system.installNodePackages(to_install)
def _sshPDisk(self, cmd, errorMsg, dontRaiseOnError=False):
    """Run *cmd* on the persistent-disk host over ssh and return its output.

    Raises Exception carrying *errorMsg* and the command output on a
    non-zero exit code, unless dontRaiseOnError is set.
    """
    command = ' '.join(cmd)
    printStep("Executing: %s" % command)
    # The private key path is derived by stripping the '.pub' suffix
    # from the configured public key.
    key = self.pdisk.persistentDiskPrivateKey.replace('.pub', '')
    rc, output = sshCmdWithOutput(command, self.pdisk.persistentDiskIp,
                                  user=getuser(), sshKey=key)
    if rc != 0 and not dontRaiseOnError:
        raise Exception('%s\n: Error: %s' % (errorMsg, output))
    return output
def _setupFrontend(self):
    """Write the policy validation configuration, unless a backup shows it was already done."""
    already_configured = self._backupConfigFileExists()
    if already_configured:
        # The backup file is created on first configuration; do not clobber it.
        printWarning("Policy validation backup file %s already exists, skipping configuration"
                     % PolicyValidator.CONFIG_SAV)
        return
    printStep('Creating policy validation configuration file')
    self._backup()
    self._writeConfigFromTemplate()
def doPrepareMPImachineFile(self, ssh, worker_nodes):
    """Build the MPI machine file locally and copy it to every host."""
    printStep('Preparing MPI machine file')
    machinefile = "/tmp/machinefile"
    # The master takes part in MPI runs only when include_master is set.
    nodes = self.hosts if self.include_master else worker_nodes
    self.create_machine_file(nodes, machinefile, isForMPI=True)
    ssh.copy_file_to_hosts(self.hosts, machinefile, "/tmp")
    # Remove the local copy once distributed.
    os.unlink(machinefile)
def _createLvmGroup(self):
    """Create the LVM physical volume and volume group, unless the group already exists."""
    vgdisplay_cmd = '%s %s' % (self.persistentDiskLvmVgdisplay,
                               self.persistentDiskLvmDevice)
    if self.system._nodeShell(vgdisplay_cmd) == 0:
        # vgdisplay succeeded: the volume group is already there.
        return
    printStep('Creating LVM volume group...')
    self.system._nodeShell('%s %s' % (self.persistentDiskLvmPvcreate,
                                      self.persistentDiskPhysicalDevices))
    self.system._nodeShell('%s %s %s' % (self.persistentDiskLvmVgcreate,
                                         self.persistentDiskLvmDevice,
                                         self.persistentDiskPhysicalDevices))
def _configureNodeSudo(self):
    """Grant the ONE admin passwordless sudo for the commands the node needs
    and disable requiretty for both that account and root."""
    printStep('Configuring sudo rights...')
    sudoers_lines = [
        '%s ALL = NOPASSWD: /sbin/iscsiadm, /usr/sbin/lsof, /usr/bin/virsh' % self.oneUsername,
        'Defaults:%s !requiretty' % self.oneUsername,
        'Defaults:root !requiretty',
    ]
    for line in sudoers_lines:
        # Search pattern and replacement are the same line: append it if
        # missing, leave it untouched if already present.
        self.system._remoteAppendOrReplaceInFile('/etc/sudoers', line, line)
def doUpdateHostsFile(self, ssh, master_node, worker_nodes):
    """Append cluster name-resolution entries to /etc/hosts on every host."""
    printStep('Updating hosts file')
    # Master entry plus a section header comment.
    header_cmd = ("'echo >> /etc/hosts && "
                  " echo \"# Cluster nodes\" >> /etc/hosts && "
                  " echo %s %s master >> /etc/hosts'"
                  % (master_node.public_ip, master_node.public_dns))
    ssh.run_remote_command(self.hosts, header_cmd)
    # One worker-N alias per worker, numbered from zero.
    for counter, host in enumerate(worker_nodes):
        ssh.run_remote_command(self.hosts,
                               " 'echo %s %s worker-%s >> /etc/hosts'"
                               % (host.public_ip, host.public_dns, counter))
def doWork(self):
    """Instantiate the cluster VMs and deploy the cluster configuration on them."""
    config = ConfigHolder(self.options.__dict__)
    runner = VmManagerFactory.create(self.image, config)
    cluster = Cluster(config, runner, self.options.master_vmid)
    printAction('Starting cluster')
    runner.runInstance()
    cluster.deploy()
    printStep('Done!')
def doSetupSSHHostBasedCluster(self, ssh):
    """Enable passwordless host-based ssh authentication across the cluster."""
    printStep('Configuring passwordless host-based ssh authentication')
    # Relax sshd, restart it, and enable host-based auth client-side.
    enable_cmd = ("'echo \"IgnoreRhosts no\" >> /etc/ssh/sshd_config && "
                  "service sshd restart &> /dev/null && "
                  "echo \"HostbasedAuthentication yes\n"
                  "StrictHostKeyChecking no\n"
                  "EnableSSHKeysign yes\" >> /etc/ssh/ssh_config'")
    ssh.run_remote_command(self.hosts, enable_cmd)
    # Collect every host's rsa key and trust it on all hosts.
    for host in self.hosts:
        ssh.run_remote_command(self.hosts,
                               "'ssh-keyscan -t rsa " + host.public_dns +
                               " 2>/dev/null >> /etc/ssh/ssh_known_hosts && "
                               "echo " + host.public_dns + " root >> /root/.shosts'")
def _writePdiskConfig(self):
    """Write every persistent-disk service setting via _overrideConfig."""
    printStep('Writing configuration...')
    # Ordered (key, value) pairs; applied in the original declaration order.
    settings = (
        ('disk.store.share', self.persistentDiskShare),
        ('disk.store.nfs.location', self.persistentDiskNfsMountPoint),
        ('disk.store.iscsi.type', self.persistentDiskStorage),
        ('disk.store.iscsi.file.location', self.persistentDiskFileLocation),
        ('disk.store.lvm.device', self.persistentDiskLvmDevice),
        ('disk.store.lvm.create', self.persistentDiskLvmCreate),
        ('disk.store.lvm.remove', self.persistentDiskLvmRemove),
        ('disk.store.cloud.node.admin', self.oneUsername),
        ('disk.store.cloud.node.ssh_keyfile', self.cloudNodeKey),
        ('disk.store.cloud.node.vm_dir', self.persistentDiskCloudVmDir),
    )
    for key, value in settings:
        self._overrideConfig(key, value)
def _configureNfsServer(self):
    """Configure NFS sharing.

    Reuses a pre-existing share when one is configured; on a node profile,
    mounts the share exported by the persistent-disk server; otherwise
    exports a new share from this host.
    """
    printStep('Configuring NFS sharing...')
    if self._nfsShareAlreadyExists():
        self.system.configureExistingNfsShare(self.persistentDiskExistingNfs,
                                              self.persistentDiskNfsMountPoint)
        return
    if self.profile == 'node':
        share = '%s:%s' % (VolumeManager.getFQNHostname(self.persistentDiskIp),
                           self.persistentDiskNfsMountPoint)
        self.system.configureExistingNfsShare(share,
                                              self.persistentDiskNfsMountPoint)
        return
    self.system.configureNewNfsServer(self.persistentDiskNfsMountPoint,
                                      self.networkAddr, self.networkMask)
def doCreateClusterUser(self, ssh, master_node):
    # Create the cluster user on every host and prepare password-less ssh
    # for it: an rsa key pair is generated on the master and self-authorized.
    printStep('Creating additional user')
    master_only = []
    master_only.append(master_node)
    ssh.run_remote_command(self.hosts, "useradd -m " + self.cluster_user)
    # Key generation runs as the cluster user on the master only; the
    # escaped \\"\\" passes an empty passphrase through both quoting layers.
    ssh.run_remote_command(master_only, ' "su - ' + self.cluster_user +
                           " -c 'ssh-keygen -q -t rsa -N " + '\\"\\"'
                           " -f ~/.ssh/id_rsa' " + '"')
    # Authorize the freshly generated key for the user itself.
    ssh.run_remote_command(master_only, ' "su - ' + self.cluster_user +
                           " -c 'cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys' " + '"')
    #if self.shared_folder !="home":
    #    for host in self.hosts:
    #        ssh.run_remote_command(master_only, "scp -r /home/"+ self.cluster_user+"/.ssh " + host.public_ip + ":/home/" + self.cluster_user)
    if self.ssh_hostbased:
        # For host-based auth, list every cluster host in the user's ~/.shosts.
        for host in self.hosts:
            ssh.run_remote_command(self.hosts, ' "su - ' + self.cluster_user +
                                   " -c 'echo " + host.public_dns + " " +
                                   self.cluster_user + " >> ~/.shosts'" + '"')
def _mergeAuthWithProxy(self):
    # Merge the pdisk JAAS login configuration with the one-proxy one so
    # both services share authentication. No-op unless merging is enabled.
    loginConf = os.path.join(Defaults.ETC_DIR, '%s/login.conf')  # '%s' = service subdir
    pdiskDir = 'storage/pdisk'
    oneproxyDir = 'one-proxy'
    confLine = '<Arg>%s</Arg>'
    configFile = os.path.join(self.pdiskHomeDir, 'etc/jetty-jaas-stratuslab.xml')
    if not self.persistentDiskMergeAuthWithProxy:
        return
    printStep('Merging pdisk and one-proxy auth configuration...')
    if not self.system._remoteFileExists(loginConf % oneproxyDir):
        # Chained '%': the message's %s is first filled with the loginConf
        # template, whose own %s is then filled with pdiskDir.
        printWarning('Not merging login configuration with one proxy, '
                     'not able to find one-proxy configuration file.\n'
                     'Edit %s to do it.' % loginConf % pdiskDir)
        return
    # Already merged if the one-proxy <Arg> line is present in the config.
    if 0 == self.system._nodeShell(['grep', '"%s"' % confLine % loginConf % oneproxyDir, configFile]):
        return
    # Replace the pdisk <Arg> entry with the one-proxy one.
    self.system._remoteAppendOrReplaceInFile(
        configFile,
        confLine % loginConf % pdiskDir,
        confLine % loginConf % oneproxyDir)
def doAddPackages(self, ssh):
    """Install the user-requested extra packages, trying yum then apt-get."""
    printStep('Installing additional software packages')
    # The comma-separated option value becomes a space-separated list.
    package_list = self.add_packages.replace(",", " ")
    printStep('Trying to configure new apps with yum...')
    yum_failed = ssh.run_remote_command(self.hosts,
                                        "yum -q -y install " + package_list)
    if yum_failed:
        # yum reported failure (presumably a non-RedHat system) — fall
        # back to apt-get.
        printStep('Trying to configure new apps with apt-get...')
        ssh.run_remote_command(self.hosts,
                               "apt-get -q -y install " + package_list)
def _installFrontend(self):
    """Install the frontend prerequisites: CAs, sendmail and OpenNebula."""
    install_steps = (
        ('Installing CAs', self._installCAs),
        ('Installing sendmail', self._installSendmail),
        ('Installing OpenNebula', self._installOpenNebula),
    )
    for message, action in install_steps:
        printStep(message)
        action()
    self._printInstallCompleted(self.frontend.stdout.name,
                                self.frontend.stderr.name)
def doPrepareNFSSharedFolder(self, ssh, master_node, worker_nodes):
    """Export the shared folder from the master over NFS and mount it on workers."""
    printStep('Preparing NFS shared folder')
    master_only = [master_node]
    ssh.run_remote_command(self.hosts, "mkdir -p " + self.shared_folder)
    # Export the folder read-write to everyone, without root squashing.
    ssh.run_remote_command(master_only,
                           "'echo " + self.shared_folder +
                           " \"*(rw,no_root_squash)\" >> /etc/exports'")
    printStep('\tTrying RedHat configuration...')
    redhat_failed = ssh.run_remote_command(master_only,
                                           "service nfs restart &> /dev/null")
    if redhat_failed:
        # RedHat-style service name did not work; try the Debian one.
        printStep('\tTrying debian configuration...')
        ssh.run_remote_command(master_only,
                               "service nfs-kernel-server restart &> /dev/null")
    ssh.run_remote_command(worker_nodes,
                           "mount " + master_node.public_ip + ":" +
                           self.shared_folder + " " + self.shared_folder)
def runInstance(self, details=False):
    # Resolve the image reference, build the VM template and start
    # self.instanceNumber machines on the cloud endpoint.
    #
    # Returns the list of VM ids, or (id, networkName, ip) tuples when
    # *details* is true. Raises ValidationException for an unrecognized
    # image reference.
    self._printContacting()
    # Normalize self.vm_image: Marketplace image id, disk id, or alias URL.
    if Image.isImageId(self.vm_image):
        self._checkImageExists(self.vm_image)
        self.vm_image = self._prependMarketplaceUrlIfImageId(self.vm_image)
    elif Image.isDiskId(self.vm_image):
        self.vm_image = self._createDiskUrlIfDiskId(self.vm_image)
    elif self._isAliasUrl(self.vm_image):
        self.vm_image = self._resolveUrl(self.vm_image)
    else:
        raise Exceptions.ValidationException('Image reference must be an '
                                             'Alias URL, Marketplace Image ID or Disk ID: %s' %
                                             self.vm_image)
    printAction('Starting machine(s)')
    self.printDetail('Using VM template file: %s' % self.vmTemplateFile)
    vmTpl = self._buildVmTemplate(self.vmTemplateFile)
    # Pre-2.5 conditional-expression idiom: 'machines' when plural.
    label = (self.instanceNumber > 1) and 'machines' or 'machine'
    printStep('Starting %s %s' % (self.instanceNumber, label))
    self.printDetail('on endpoint: %s' % self.endpoint, Util.VERBOSE_LEVEL_DETAILED)
    self.printDetail('with template:\n%s' % vmTpl, Util.VERBOSE_LEVEL_DETAILED)
    # Start each instance and record its id and network details.
    for vmNb in range(self.instanceNumber):
        vmId = self.cloud.vmStart(vmTpl)
        self.vmIds.append(vmId)
        networkName, ip = self.getNetworkDetail(vmId)
        self.vmIdsAndNetwork.append((vmId, networkName, ip))
        vmIpPretty = '\t%s ip: %s' % (networkName.title(), ip)
        printStep('Machine %s (vm ID: %s)\n%s' % (vmNb + 1, vmId, vmIpPretty))
        self.instancesDetail.append({'id': vmId, 'ip': ip, 'networkName': networkName})
    self._saveVmIds()
    printStep('Done!')
    if not details:
        return self.vmIds
    else:
        return self.vmIdsAndNetwork
def _setupNode(self):
    """Prepare a node host and register it with the cloud frontend."""
    step_sequence = [
        ('Checking node connectivity', lambda: self._checkNodeConnectivity()),
        ('Creating cloud admin account', lambda: self._createCloudAdmin(self.node)),
        ('Configuring cloud admin account', lambda: self._configureCloudAdminNode()),
        ('Configuring hypervisor', lambda: self._configureVirtualization()),
        ('Configuring bridge', lambda: self._configureBridgeOnNode()),
        ('Configuring file sharing', lambda: self._setupFileSharingClient()),
    ]
    for message, action in step_sequence:
        printStep(message)
        action()
    # Registration is a two-call step under a single banner.
    printStep('Adding node to cloud')
    self._assignDrivers()
    self._addCloudNode()
def _installNode(self):
    """Install node package dependencies, then warn when Xen needs a reboot."""
    printStep('Installing node dependencies')
    self._installNodeDependencies()
    self._warmXenNeedReboot()
def _printInstallCompleted(self, stdoutFilename, stderrFilename):
    """Announce completion and point at the captured stdout/stderr log files."""
    printStep('Installation completed')
    details = '\tInstallation details: %s, %s' % (stdoutFilename, stderrFilename)
    printInfo(details)
def _startServicesNode(self):
    """Start the virtualization services on the node."""
    printStep('Starting virtualization services')
    # NOTE(review): '_startVrtualization' looks like a typo ('Virtualization')
    # in the callee's name; kept as-is since it is defined elsewhere.
    self._startVrtualization()
def _setupFrontend(self):
    """Generate monitoring.cfg from its template into the system config dir."""
    printStep('Creating monitoring configuration file')
    template = Util.get_template_file(['monitoring.cfg.tpl'])
    destination = os.path.join(Defaults.ETC_DIR, 'monitoring.cfg')
    self._writeConfigFromTemplate(destination, template)
def _installFrontend(self):
    """Install the configured frontend packages through the system adapter."""
    printStep('Installing packages')
    self.system.installPackages(self.packages)
def _setupFrontend(self):
    """Validate parameters, then generate registration.cfg from its template."""
    self._validateParameters()
    printStep('Creating registration configuration file')
    template = Util.get_template_file(['registration.cfg.tpl'])
    destination = os.path.join(Defaults.ETC_DIR, 'registration.cfg')
    self._writeConfigFromTemplate(destination, template)
def _setupFrontend(self):
    """Configure and start every frontend service, then report completion."""
    # (banner message, zero-arg action); a None message means the action
    # runs without its own printStep banner.
    step_sequence = (
        ('Configuring file sharing', self._setupFileSharingServer),
        ('Configuring quarantine', self._configureQuarantine),
        ('Configuring cloud proxy service', self._configureCloudProxyService),
        ('Configuring firewall', self._configureFirewall),
        ('Configuring DHCP server', self._configureDhcpServer),
        ('Configuring database', self._configureDatabase),
        ('Configuring cloud admin account', self._configureCloudAdminFrontend),
        ('Configuring cloud system', self._configureCloudSystem),
        ('Applying local policies', self._configurePolicies),
        (None, self._setupMarketplacePolicyValidator),
        ('Starting cloud', self._startServicesFrontend),
        ('Adding default ONE vnet', self._addDefaultNetworks),
        ('Adding default ACLs', self._addDefaultAcls),
    )
    for message, action in step_sequence:
        if message is not None:
            printStep(message)
        action()
    self._printInstallCompleted(self.frontend.stdout.name,
                                self.frontend.stderr.name)
def deploy(self):
    # Wait for every cluster VM to run and become reachable over ssh,
    # then apply the full node configuration sequence.
    #
    # Returns 0 on success, 128 when some nodes failed to instantiate and
    # failures are not tolerated.
    ssh = SSHUtil(self._runner.userPrivateKeyFile, self.cluster_admin)

    # Wait until all the images are up and running
    vmNetworkDetails = []
    vmStartTimeout = 600

    # wait until the each machine is up or timeout after 15 minutes
    printStep("Waiting for all cluster VMs to be instantiated...")
    if self._is_heterogeneous:
        # Heterogeneous clusters track the master VM separately.
        printStep("Waiting for master")
        self._runner.waitUntilVmRunningOrTimeout(self._master_vmid, vmStartTimeout)
        vmNetworkDetails.append(self._runner.getNetworkDetail(self._master_vmid))
    for vmId in self._runner.vmIds:
        printDebug('Waiting for instance to start running %s' % str(vmId))
        self._runner.waitUntilVmRunningOrTimeout(vmId, vmStartTimeout)
        vmNetworkDetails.append(self._runner.getNetworkDetail(vmId))

    vm_cpu, vm_ram, vm_swap = self._runner.getInstanceResourceValues()

    # Build the Host list from VMs that received a public address.
    for vmNetwork in vmNetworkDetails:
        if vmNetwork[0] == 'public':
            host = Host()
            host.public_ip = vmNetwork[1]
            try:
                host.public_dns = socket.gethostbyaddr(host.public_ip)[0]
            except:
                # Reverse DNS failed; fall back to the raw IP address.
                host.public_dns = host.public_ip
            host.cores = vm_cpu
            host.ram = vm_ram
            host.swap = vm_swap
            self.hosts.append(host)

    printStep("Waiting for all instances to become accessible...")

    failedHosts = []
    for host in self.hosts:
        hostReady = False
        hostFailed = False
        while not hostReady and not hostFailed:
            if not ssh.waitForConnectivity(host, vmStartTimeout):
                printError('Timed out while connecting to %s. Removing from target config. list.' % host.public_ip)
                failedHosts.append(host)
                hostFailed = True
            else:
                hostReady = True

    if len(failedHosts) > 0:
        if self.tolerate_failures:
            # Drop unreachable hosts and continue with the rest.
            for host in failedHosts:
                self.hosts.remove(host)
        else:
            # NOTE(review): this message was split mid-string in the source
            # by whitespace mangling; reconstructed with a single space.
            printError('Error instantiating some or all of the nodes. Bailing out...')
            if self.clean_after_failure:
                self._runner.killInstances(self._runner.vmIds)
            return 128

    # First host is the master; the rest are workers.
    master_node = self.hosts[0]
    worker_nodes = list(self.hosts)
    worker_nodes.remove(master_node)

    printInfo('\tMaster is %s' % master_node.public_dns)
    for node in worker_nodes:
        printInfo('\tWorker: %s' % node.public_dns)

    # Configure the hosts
    printAction('Configuring nodes')

    # Try to install the missing packages
    if self.add_packages:
        self.doAddPackages(ssh)

    # For MPI clusters prepare the machinefile for mpirun
    if self.mpi_machine_file:
        self.doPrepareMPImachineFile(ssh, worker_nodes)

    if self.cluster_user:
        # Create a new user and prepare the environments for password-less ssh
        self.doCreateClusterUser(ssh, master_node)

    # Initialize the shared storage in NFS
    if self.shared_folder:
        self.doPrepareNFSSharedFolder(ssh, master_node, worker_nodes)

    if self.ssh_hostbased:
        self.doSetupSSHHostBasedCluster(ssh)

    # Update /etc/profile with StratusLab specific environment variables
    self.doUpdateEnvironmentVariables(ssh, master_node, worker_nodes)

    # Store the list of cluster nodes in a file under /tmp
    self.doPrepareNodeList(ssh, worker_nodes)

    # Update the /etc/hosts file for all hosts
    self.doUpdateHostsFile(ssh, master_node, worker_nodes)

    # Start any services defined in rc.cluster-services
    self.doStartClusterServices(ssh, master_node)

    return 0
def doStartClusterServices(self, ssh, master_node):
    """Run /etc/rc.cluster-services on the master, if that script exists."""
    printStep("Applying user defined cluster services")
    ssh.run_remote_command(
        [master_node],
        "'if [ -e /etc/rc.cluster-services ]; then /etc/rc.cluster-services; fi'")
def persistentDiskStorageHotplugTest(self):
    """Ensure that a disk hot-plugged to a VM and then hot-unplugged can be
    re-attached and finally deleted, with the volume's available-user
    count tracking each attach/detach."""
    pdiskDevice = "/dev/%s"
    pdiskMountPoint = "/mnt/pdisk-test"
    testFile = "%s/pdisk.txt" % pdiskMountPoint
    testFileCmp = "/tmp/pdisk.cmp"
    testString = "pdiskTest"

    # Authenticate against the pdisk service with the test credentials.
    configHolder = Testor.configHolder.copy()
    configHolder.pdiskUsername = Testor.configHolder.testUsername
    configHolder.pdiskPassword = Testor.configHolder.testPassword
    pdisk = VolumeManagerFactory.create(configHolder)

    runner = self._startVmWithPDiskAndWaitUntilUp(image=self.ubuntuImg)

    Util.printAction("Creating a new persistent disk")
    diskUUID = pdisk.createVolume(1, "test %s" % datetime.datetime.today(), False)

    Util.printAction("Checking persistent disk exists")
    if not pdisk.volumeExists(diskUUID):
        self.fail("An error occurred while creating a persistent disk")

    # Load the PCI hotplug module in the guest so the attach is visible.
    self._modeprobe(runner, "acpiphp")

    vmId = self.vmIds[0]
    node = runner.cloud.getVmNode(vmId)

    printStep("Attaching pdisk to VM")
    availableUserBeforeAttach, _ = pdisk.getVolumeUsers(diskUUID)
    device = pdisk.hotAttach(node, vmId, diskUUID)
    availableUserAfterAttach, _ = pdisk.getVolumeUsers(diskUUID)

    # Attaching must consume exactly one available slot on the volume.
    if availableUserAfterAttach != (availableUserBeforeAttach - 1):
        self.fail(
            "Available users on persistent disk have to decrease by "
            "one; before=%s, after=%s" % (availableUserBeforeAttach,
                                          availableUserAfterAttach)
        )

    # Write a marker file on the attached disk, then unmount it.
    self._formatDisk(runner, pdiskDevice % device)
    self._mountDisk(runner, pdiskDevice % device, pdiskMountPoint)
    self._writeToFile(runner, testFile, testString)
    self._umountDisk(runner, pdiskDevice % device)

    printStep("Detaching pdisk of VM")
    pdisk.hotDetach(node, vmId, diskUUID)
    availableUserAfterDetach, _ = pdisk.getVolumeUsers(diskUUID)
    # Detaching must return the slot consumed by the attach.
    if availableUserAfterDetach != availableUserBeforeAttach:
        self.fail(
            "Available users on persistent disk have to be the "
            "same as when VM has started; before=%s, after=%s"
            % (availableUserBeforeAttach, availableUserAfterDetach)
        )

    printStep("Re-attaching pdisk to VM")
    device = pdisk.hotAttach(node, vmId, diskUUID)
    # Verify the marker written before the detach survived the cycle.
    self._mountDisk(runner, pdiskDevice % device, pdiskMountPoint)
    self._writeToFile(runner, testFileCmp, testString)
    self._compareFiles(runner, testFile, testFileCmp)
    self._umountPDiskAndStopVm(runner, pdiskDevice % device)

    availableUserAfterStop, _ = pdisk.getVolumeUsers(diskUUID)
    if availableUserAfterStop != availableUserBeforeAttach:
        self.fail(
            "Available users on persistent disk have to be the "
            "same as when VM has started; before=%s, after=%s"
            % (availableUserBeforeAttach, availableUserAfterStop)
        )

    Util.printAction("Removing persistent disk...")
    pdisk.deleteVolume(diskUUID)
    try:
        if pdisk.volumeExists(diskUUID):
            self.fail("The persistent disk %s is still present" % diskUUID)
    except ClientException, ex:  # Python 2 except syntax; file predates Python 3.
        # A 404 from the service means the volume is gone — that is success.
        if not re.match("404", ex.status):
            self.fail("The persistent disk %s is still present" % diskUUID)
def save_instance_as_new_image(self, vm_id): self._printContacting() self._checkInstanceExists(vm_id) printStep('Instructing cloud to save instance as new image on shutdown')