def abort_dependents(self, thread):
    """Recursively abort every thread that depends on ``thread``.

    A registered thread is treated as a dependent when its ``depends``
    attribute compares equal to ``thread``.  Each dependent is given
    status ``3``, counted in ``self.done_threads``, and then has its own
    dependents aborted the same way.

    This method does not acquire any lock itself.
    """
    # Collect the direct dependents first, then walk them.
    dependents = []
    for candidate in self.threads.values():
        if candidate.depends == thread:
            dependents.append(candidate)

    for dependent in dependents:
        message = "%s thread is being aborted because it depends on failed %s thread." % (dependent.name, thread.name)
        log.debug(message)
        dependent.status = 3
        self.done_threads += 1
        # Cascade: anything waiting on this dependent can never run either.
        self.abort_dependents(dependent)
def thread_success(self, thread):
    """Record that ``thread`` finished successfully and release its dependents.

    While holding ``self.lock`` this increments ``self.done_threads``,
    logs the names of the threads whose status is still ``-1``, starts
    every registered thread whose ``depends`` attribute equals ``thread``,
    and sets ``self.all_done`` once ``self.done_threads`` reaches
    ``self.num_threads``.
    """
    with self.lock:
        self.done_threads += 1
        log.debug("%s thread has finished successfully." % thread.name)

        # Threads still at status -1 are reported as "remaining".
        remaining_names = []
        for registered in self.threads.values():
            if registered.status == -1:
                remaining_names.append(registered.name)
        log.debug("%i threads are done. Remaining: %s" % (self.done_threads, ",".join(remaining_names)))

        # Start everything that was waiting on this thread.
        waiting = [registered for registered in self.threads.values() if registered.depends == thread]
        for dependent in waiting:
            dependent.start()

        if self.done_threads == self.num_threads:
            self.all_done.set()
def ssh_connect(self, username, hostname, keyfile):
    """Open an SSH connection to ``hostname`` and return it.

    Parameters:
        username -- account to log in as.
        hostname -- host to connect to.
        keyfile  -- key file handed to the ``SSH`` constructor.

    Returns the opened ``SSH`` object (built with no default output or
    error files).

    Any exception raised by ``ssh.open()`` is logged and re-raised
    unchanged so that the multi-thread manager can handle it.
    """
    node = self.node
    log.debug("Establishing SSH connection", node)
    ssh = SSH(username, hostname, keyfile, default_outf = None, default_errf = None)
    try:
        ssh.open()
    except Exception:
        log.debug("SSH connection timed out", node)
        # Raise exception and let multi-thread manager handle it.
        # A bare ``raise`` keeps the original traceback; ``raise e`` would
        # replace it with this line on Python 2.
        raise
    # Hand the connection back; otherwise the caller has no way to use it.
    return ssh
def scp_dir(self, fromdir, todir):
    """Recursively upload the local directory ``fromdir`` to ``todir``.

    The local tree is walked top-down.  For each directory the matching
    remote path is built by appending the part of the local path that
    follows ``fromdir`` to ``todir``; the remote directory is created with
    ``self.sftp.mkdir`` when ``self.sftp.stat`` raises ``IOError``.  Every
    file is then sent with ``self.sftp.put`` and the transfer is logged.
    """
    prefix_len = len(fromdir)
    for local_dir, _subdirs, filenames in walk(fromdir):
        remote_dir = todir + "/" + local_dir[prefix_len:]

        # stat() failing with IOError is taken to mean "not there yet".
        try:
            self.sftp.stat(remote_dir)
        except IOError:
            self.sftp.mkdir(remote_dir)

        for filename in filenames:
            local_path = local_dir + "/" + filename
            remote_path = remote_dir + "/" + filename
            self.sftp.put(local_path, remote_path)
            log.debug("scp %s -> %s:%s" % (local_path, self.hostname, remote_path))
def __allocate_vms(self, deployer, nodes, resuming, sequential=False):
    """Allocate (or resume) one VM per node and wait for them to come up.

    Parameters:
        deployer   -- deployer used to allocate/resume VMs and to build
                      the ``NodeWaitThread`` objects.
        nodes      -- nodes that need a VM.
        resuming   -- if true, call ``deployer.resume_vm`` and wait for
                      ``Node.STATE_RESUMED_UNCONFIGURED``; otherwise call
                      ``deployer.allocate_vm`` and wait for
                      ``Node.STATE_RUNNING_UNCONFIGURED``.
        sequential -- if true, wait for each VM right after requesting it
                      instead of waiting for all of them in parallel.
                      Defaults to ``False`` (the previous hard-coded value).

    Returns a ``(success, message, node_vm)`` tuple.  On success
    ``node_vm`` maps each node to its VM; on failure it is ``None``.

    In sequential mode the wait is done by calling ``run2()`` directly, so
    an exception raised while waiting propagates to the caller rather than
    being turned into a failure tuple.
    """
    topology = deployer.instance.topology

    if not resuming:
        log.info("Allocating %i VMs." % len(nodes))
        transitional_state = Node.STATE_STARTING
        next_state = Node.STATE_RUNNING_UNCONFIGURED
        request_vm = deployer.allocate_vm
    else:
        log.info("Resuming %i VMs" % len(nodes))
        transitional_state = Node.STATE_RESUMING
        next_state = Node.STATE_RESUMED_UNCONFIGURED
        request_vm = deployer.resume_vm

    node_vm = {}
    for n in nodes:
        try:
            # Persist the transitional state before asking for the VM.
            n.set_property("state", transitional_state)
            topology.save()
            vm = request_vm(n)
            node_vm[n] = vm
        except Exception:
            message = self.__unexpected_exception_to_text()
            return (False, message, None)

        if sequential:
            log.debug("Waiting for instance to start.")
            wait = deployer.NodeWaitThread(None, "wait-%s" % str(vm), n, vm, deployer, state = next_state)
            # Run synchronously in this thread rather than starting it.
            wait.run2()

    if not sequential:
        log.debug("Waiting for instances to start.")
        mt_instancewait = MultiThread()
        for node, vm in node_vm.items():
            mt_instancewait.add_thread(deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = next_state))
        mt_instancewait.run()

        if not mt_instancewait.all_success():
            message = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
            return (False, message, None)

    return (True, "Success", node_vm)
def __stop_vms(self, deployer, nodes):
    """Run pre-shutdown configuration on ``nodes`` and then stop their VMs.

    Steps:
      1. Mark every node as ``Node.STATE_STOPPING`` and save the topology.
      2. Build one ``NodeConfigureThread`` per node, in launch order, each
         depending on the threads of the nodes returned by
         ``topology.get_depends``, and run them all.
      3. Stop the VMs one node at a time in reverse launch order.
      4. Wait until every VM reaches ``Node.STATE_STOPPED``.

    Returns ``(True, "Success")``, or ``(False, message)`` if step 2 or
    step 4 reports a failure.
    """
    node_vm = deployer.get_node_vm(nodes)
    topology = deployer.instance.topology
    mt_configure = MultiThread()
    order = topology.get_launch_order(nodes)

    for stopping_node in node_vm:
        stopping_node.state = Node.STATE_STOPPING
    topology.save()

    # Built in launch order so a node's dependencies already have a thread.
    threads = {}
    for node in order:
        prerequisites = [threads[dep] for dep in topology.get_depends(node)]
        threads[node] = deployer.NodeConfigureThread(mt_configure, "stop-configure-%s" % node.id, node, node_vm[node], deployer, depends=prerequisites)

    for configure_thread in threads.values():
        mt_configure.add_thread(configure_thread)
    mt_configure.run()

    if not mt_configure.all_success():
        return (False, self.__mt_exceptions_to_text(mt_configure.get_exceptions(), "Globus Provision was unable to configure the instances."))

    # Shut down in the opposite order to launch.
    order.reverse()
    for node in order:
        deployer.stop_vms([node])

    log.debug("Waiting for instances to stop.")
    mt_instancewait = MultiThread()
    for node, vm in node_vm.items():
        waiter = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_STOPPED)
        mt_instancewait.add_thread(waiter)
    mt_instancewait.run()

    if not mt_instancewait.all_success():
        return (False, self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances."))

    return (True, "Success")
def __terminate_vms(self, deployer, nodes):
    """Terminate the VMs of ``nodes`` and wait for them to be gone.

    Asks the deployer to terminate the VMs, then runs one
    ``NodeWaitThread`` per node/VM pair until each one reaches
    ``Node.STATE_TERMINATED``.

    Returns ``(True, "Success")``, or ``(False, message)`` if any wait
    thread did not succeed.
    """
    topology = deployer.instance.topology  # looked up but not used below

    deployer.terminate_vms(nodes)
    node_vm = deployer.get_node_vm(nodes)

    log.debug("Waiting for instances to terminate.")
    mt_instancewait = MultiThread()
    for node, vm in node_vm.items():
        waiter = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_TERMINATED)
        mt_instancewait.add_thread(waiter)
    mt_instancewait.run()

    if mt_instancewait.all_success():
        return (True, "Success")

    message = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
    return (False, message)
def __stop_vms(self, deployer, nodes):
    """Stop the VMs of ``nodes`` in reverse launch order and wait for them.

    Each entry of the reversed launch order is passed as-is to
    ``deployer.stop_vms``.  Afterwards one ``NodeWaitThread`` per node/VM
    pair is run until each reaches ``Node.STATE_STOPPED``.

    Returns ``(True, "Success")``, or ``(False, message)`` if any wait
    thread did not succeed.
    """
    topology = deployer.instance.topology

    # Shut down in the opposite order to launch.
    order = topology.get_launch_order(nodes)
    order.reverse()
    for nodeset in order:
        deployer.stop_vms(nodeset)

    node_vm = deployer.get_node_vm(nodes)

    log.debug("Waiting for instances to stop.")
    mt_instancewait = MultiThread()
    for node, vm in node_vm.items():
        waiter = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_STOPPED)
        mt_instancewait.add_thread(waiter)
    mt_instancewait.run()

    if mt_instancewait.all_success():
        return (True, "Success")

    message = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
    return (False, message)
def pre_configure(self, ssh):
    """Make sure the EC2 instance has Chef installed before configuring it.

    Runs ``ls -l /chef`` over ``ssh``.  If that command raises
    ``SSHCommandFailureException`` the image is treated as not prepared:
    the local Chef directory is uploaded to ``/chef``, Chef is installed
    from the Opscode apt repository, the ``provision::ec2`` recipe is run
    with chef-solo, the nis/condor/chef-client init links are removed and
    shell history files are deleted.

    Parameters:
        ssh -- open SSH connection to the instance; must provide ``run``
               and ``scp_dir``.
    """
    node = self.node
    instance = self.ec2_instance

    log.info("Setting up instance %s. Hostname: %s" % (instance.id, instance.public_dns_name), node)

    try:
        ssh.run("ls -l /chef")
    except SSHCommandFailureException:
        #The image is not properly setup, so do all pre-configuration for globus-provision
        log.info("Image is not configured with Chef, so installing...")

        ssh.run("sudo chown -R %s /chef" % self.config.get("ec2-username"))
        ssh.scp_dir("%s" % self.chef_dir, "/chef")

        # Best effort: the command's failure is not treated as an error.
        ssh.run("addgroup admin", exception_on_error = False)
        ssh.run("echo \"%s `hostname`\" | sudo tee -a /etc/hosts" % instance.private_ip_address)

        # Install Chef from the Opscode apt repository.
        ssh.run("sudo apt-get install lsb-release wget")
        ssh.run("echo \"deb http://apt.opscode.com/ `lsb_release -cs` main\" | sudo tee /etc/apt/sources.list.d/opscode.list")
        # The key file name is literally "packages@opscode.com.gpg.key".
        ssh.run("wget -qO - http://apt.opscode.com/packages@opscode.com.gpg.key | sudo apt-key add -")
        ssh.run("sudo apt-get update")
        ssh.run("echo 'chef chef/chef_server_url string http://127.0.0.1:4000' | sudo debconf-set-selections")
        ssh.run("sudo apt-get -q=2 install chef")

        # Write the chef-solo configuration and run the EC2 recipe.
        ssh.run("echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf")
        ssh.run("echo '{ \"run_list\": \"recipe[provision::ec2]\", \"scratch_dir\": \"%s\" }' > /tmp/chef.json" % self.scratch_dir)
        ssh.run("sudo chef-solo -c /tmp/chef.conf -j /tmp/chef.json")

        ssh.run("sudo update-rc.d -f nis remove")
        ssh.run("sudo update-rc.d -f condor remove")
        ssh.run("sudo update-rc.d -f chef-client remove")

        log.debug("Removing private data...")
        # "\\;" yields the two characters \; that find(1) needs to end -exec.
        ssh.run("sudo find /root/.*history /home/*/.*history -exec rm -f {} \\;", exception_on_error = False)
def __connect(self):
    """Create the EC2 connection and store it in ``self.conn``.

    Reads ``ec2-server-hostname``, ``ec2-server-port`` and
    ``ec2-server-path`` from the instance configuration.  When a hostname
    is configured the connection is created against that server;
    otherwise ``create_ec2_connection()`` is called with no arguments.

    Raises:
        DeploymentException -- if no connection object could be created
            (reported as missing AWS credential environment variables) or
            if a ``BotoClientError`` is raised while connecting.
    """
    config = self.instance.config

    try:
        log.debug("Connecting to EC2...")
        ec2_server_hostname = config.get("ec2-server-hostname")
        ec2_server_port = config.get("ec2-server-port")
        ec2_server_path = config.get("ec2-server-path")

        if ec2_server_hostname is not None:
            self.conn = create_ec2_connection(ec2_server_hostname,
                                              ec2_server_path,
                                              ec2_server_port)
        else:
            self.conn = create_ec2_connection()

        # Identity test: a connection object's own __eq__ must not decide this.
        if self.conn is None:
            raise DeploymentException("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are not set.")

        log.debug("Connected to EC2.")
    except BotoClientError as exc:
        raise DeploymentException("Could not connect to EC2. %s" % exc.reason)
def __recv(self, f, ready_func, recv_func, log_label, rem):
    """Drain whatever is currently readable and log it line by line.

    Parameters:
        f          -- file-like object that receives the raw data, or
                      ``None`` to skip writing.
        ready_func -- callable returning true while data is available.
        recv_func  -- callable taking a byte count and returning data.
        log_label  -- prefix used for each logged line.
        rem        -- unterminated text left over from the previous call.

    Data is read in chunks of up to 4096 bytes.  Complete lines are
    logged as ``"<log_label>: <line>"``; text after the last newline is
    carried over in the returned remainder.

    Returns ``(nbytes, rem)``: the number of bytes read during this call
    and the new remainder.
    """
    total = 0
    while ready_func():
        chunk = recv_func(4096)
        if len(chunk) == 0:
            continue

        total += len(chunk)
        if f is not None:
            f.write(chunk)

        pieces = chunk.split('\n')
        if len(pieces) == 1:
            # No newline in this chunk: keep accumulating the partial line.
            rem += pieces[0]
            continue

        # First piece completes the pending partial line.
        log.debug(log_label + (": %s" % (rem + pieces[0])))
        for whole_line in pieces[1:-1]:
            log.debug(log_label + (": %s" % whole_line))
        # Last piece is whatever followed the final newline (maybe empty).
        rem = pieces[-1]

    if f is not None:
        f.flush()

    return total, rem
def run2(self):
    """Configure this thread's node according to its current state.

    * ``STATE_RUNNING_UNCONFIGURED``, ``STATE_RUNNING`` and
      ``STATE_RESUMED_UNCONFIGURED``: the node is moved to the matching
      "configuring" state, the pre-configure, configure and post-configure
      steps are run over a new connection, and the node ends in
      ``STATE_RUNNING``.
    * ``STATE_STOPPING``: the node is moved to
      ``STATE_STOPPING_CONFIGURING``, ``configure_stop`` is run, and the
      node ends in ``STATE_STOPPING_CONFIGURED``.
    * Any other state: nothing is done.

    The topology is saved after every state change.  When ``self.dryrun``
    is true, no connection is made and only the state changes happen.
    ``check_continue()`` is called after connecting and after each step.
    """
    topology = self.deployer.instance.topology
    state = self.node.state

    if state == Node.STATE_STOPPING:
        log.debug("Doing pre-shutdown configuration", self.node)
        self.node.state = Node.STATE_STOPPING_CONFIGURING
        topology.save()

        if not self.dryrun:
            ssh = self.connect()
            self.check_continue()
            self.configure_stop(ssh)
            self.check_continue()

        self.node.state = Node.STATE_STOPPING_CONFIGURED
        topology.save()
        return

    # Pick the transitional state; anything unrecognised is left alone.
    if state == Node.STATE_RUNNING_UNCONFIGURED:
        log.debug("Configuring node for the first time", self.node)
        self.node.state = Node.STATE_CONFIGURING
    elif state == Node.STATE_RUNNING:
        log.debug("Reconfiguring already-running node", self.node)
        self.node.state = Node.STATE_RECONFIGURING
    elif state == Node.STATE_RESUMED_UNCONFIGURED:
        log.debug("Reconfiguring resumed node", self.node)
        self.node.state = Node.STATE_RESUMED_RECONFIGURING
    else:
        return

    # All three configuration paths end in the same state.
    next_state = Node.STATE_RUNNING
    topology.save()

    if not self.dryrun:
        ssh = self.connect()
        self.check_continue()
        for step in (self.pre_configure, self.configure, self.post_configure):
            step(ssh)
            self.check_continue()

    self.node.state = next_state
    topology.save()
def thread_failure(self, thread):
    """Record that ``thread`` did not finish successfully.

    While holding ``self.lock``:

    * if ``thread.exception`` is a ``ThreadAbortException`` the thread is
      logged as aborted and given status ``2``;
    * otherwise the failure is logged and ``self.abort`` is set;
    * in both cases the thread is counted as done, its dependents are
      aborted through ``abort_dependents``, the remaining threads are
      logged, and ``self.all_done`` is set once every thread is accounted
      for.
    """
    with self.lock:
        was_aborted = isinstance(thread.exception, ThreadAbortException)
        if was_aborted:
            log.debug("%s thread is being aborted." % thread.name)
            thread.status = 2
        else:
            log.debug("%s thread has failed: %s" % (thread.name, thread.exception))
            self.abort.set()

        self.done_threads += 1
        self.abort_dependents(thread)

        # Threads still at status -1 are reported as "remaining".
        remaining_names = []
        for registered in self.threads.values():
            if registered.status == -1:
                remaining_names.append(registered.name)
        log.debug("%i threads are done. Remaining: %s" % (self.done_threads, ",".join(remaining_names)))

        if self.done_threads == self.num_threads:
            self.all_done.set()
def configure(self, ssh):
    """Configure the node over ``ssh``.

    Depending on the thread's flags this:

    * ``self.basic``: uploads the hosts file, installs it as
      ``/etc/hosts`` and updates the hostname;
    * ``self.chef``: uploads the topology file, the certificates and the
      deployer's extra files, then runs chef-solo with the node's run
      list, retrying up to three times;
    * ``self.basic``: re-enables the nis init links;
    * always: runs every command in ``self.deployer.run_cmds``.

    ``check_continue()`` is called between the phases.

    Parameters:
        ssh -- open SSH connection to the node; must provide ``run``,
               ``scp`` and ``scp_dir``.

    Raises:
        DeploymentException -- if chef-solo still fails after all
            attempts.
    """
    domain = self.domain
    node = self.node
    instance_dir = self.deployer.instance.instance_dir

    if self.basic:
        # Upload host file and update hostname
        log.debug("Uploading host file and updating hostname", node)
        ssh.scp("%s/hosts" % instance_dir,
                "/chef/cookbooks/provision/files/default/hosts")
        ssh.run("sudo cp /chef/cookbooks/provision/files/default/hosts /etc/hosts", expectnooutput=True)

        ssh.run("sudo bash -c \"echo %s > /etc/hostname\"" % node.hostname, expectnooutput=True)
        ssh.run("sudo /etc/init.d/hostname.sh || sudo /etc/init.d/hostname restart", expectnooutput=True)

    self.check_continue()

    if self.chef:
        # Upload topology file
        log.debug("Uploading topology file", node)
        ssh.scp("%s/topology.rb" % instance_dir,
                "/chef/cookbooks/provision/attributes/topology.rb")

        # Copy certificates
        log.debug("Copying certificates", node)
        ssh.scp_dir("%s/certs" % instance_dir,
                    "/chef/cookbooks/provision/files/default/")

        # Upload extra files
        log.debug("Copying extra files", node)
        for src, dst in self.deployer.extra_files:
            ssh.scp(src, dst)

        self.check_continue()

        # Temporarily add admin group
        log.debug("Create new admin group")
        try:
            ssh.run("addgroup admin")
        except SSHCommandFailureException:
            log.debug("Admin group already exists, skipping..")

        # Run chef
        log.debug("Running chef", node)
        ssh.run("echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf", expectnooutput=True)
        ssh.run("echo '{ \"run_list\": [ %s ], \"scratch_dir\": \"%s\", \"domain_id\": \"%s\", \"node_id\": \"%s\"  }' > /tmp/chef.json" % (",".join("\"%s\"" % r for r in node.run_list), self.config.get("scratch-dir"), domain.id, node.id), expectnooutput=True)

        # Sometimes, Chef will fail because a service didn't start or restart
        # properly (NFS-related services seem to do this occasionally).
        # In most cases, the problem just "goes away" if you try to restart the
        # service again. So, if Chef fails, we don't give up and try again
        # (since the recipes are idempotent, there's no harm to running them
        # multiple times)
        chef_tries = 3
        while chef_tries > 0:
            rc = ssh.run("sudo -i chef-solo -c /tmp/chef.conf -j /tmp/chef.json", exception_on_error=False)
            if rc != 0:
                chef_tries -= 1
                # Interpolate the counter here; the second argument to
                # log.debug is the node, not a format argument.
                log.debug("chef-solo failed. %i attempts left" % chef_tries, node)
            else:
                break

        if chef_tries == 0:
            raise DeploymentException("Failed to configure node %s" % node.id)

        self.check_continue()

    if self.basic:
        ssh.run("sudo update-rc.d nis defaults")

    for cmd in self.deployer.run_cmds:
        ssh.run(cmd)

    log.info("Configuration done.", node)
def run(self, command, outf=None, errf=None, exception_on_error = True, expectnooutput=False):
    """Run ``command`` on the remote host and return its exit status.

    Parameters:
        command            -- command line to execute remotely.
        outf               -- path of a local file to receive standard
                              output; ``None`` uses ``self.default_outf``.
        errf               -- path of a local file to receive standard
                              error; ``None`` uses ``self.default_errf``.
        exception_on_error -- accepted for interface compatibility.
                              NOTE(review): nothing in this method acts on
                              it; confirm where a non-zero exit status is
                              meant to be turned into an exception.
        expectnooutput     -- if true, the command's output is not read.

    Returns the exit status reported by the channel.

    The channel is always closed, and the output/error streams are closed
    unless they are ``sys.stdout`` / ``sys.stderr``, even when an
    exception is raised while the command runs.
    """
    channel = self.client.get_transport().open_session()
    log.debug("%s - Running %s" % (self.hostname,command))

    if outf is not None:
        outf = open(outf, "w")
    else:
        outf = self.default_outf

    if errf is not None:
        errf = open(errf, "w")
    else:
        errf = self.default_errf

    try:
        channel.exec_command(command)

        if expectnooutput:
            log.debug("Ignoring output from command (not expecting any)")
        else:
            all_out_nbytes = 0
            all_err_nbytes = 0
            rem_out = ""
            rem_err = ""
            while True:
                rl, wl, xl = select.select([channel],[],[], 0.1)
                if len(rl) > 0:
                    out_nbytes, rem_out = self.__recv(outf, channel.recv_ready, channel.recv, "SSH_OUT", rem_out)
                    err_nbytes, rem_err = self.__recv(errf, channel.recv_stderr_ready, channel.recv_stderr, "SSH_ERR", rem_err)

                    # Channel readable but nothing to read: output is finished.
                    if out_nbytes + err_nbytes == 0:
                        break

                    all_out_nbytes += out_nbytes
                    all_err_nbytes += err_nbytes

            if all_out_nbytes == 0:
                log.debug("Command did not write to standard output.")

            if all_err_nbytes == 0:
                log.debug("Command did not write to standard error.")

        log.debug("%s - Waiting for exit status: %s" % (self.hostname,command))
        rc = channel.recv_exit_status()
        log.debug("%s - Ran %s" % (self.hostname,command))
    finally:
        try:
            # NOTE(review): this also closes self.default_outf/default_errf
            # when they are used, as the previous code did for outf.
            if outf is not None and outf != sys.stdout:
                outf.close()
            # Close errf itself (previously outf was closed a second time).
            if errf is not None and errf != sys.stderr:
                errf.close()
        finally:
            channel.close()

    return rc
except IOError, e: pdirs = get_parent_directories(tof) for d in pdirs: try: self.sftp.stat(d) except IOError, e: self.sftp.mkdir(d) try: self.sftp.put(fromf, tof) except Exception, e: traceback.print_exc() try: self.close() except: pass log.debug("scp %s -> %s:%s" % (fromf, self.hostname, tof)) def scp_dir(self, fromdir, todir): for root, dirs, files in walk(fromdir): todir_full = todir + "/" + root[len(fromdir):] try: self.sftp.stat(todir_full) except IOError, e: self.sftp.mkdir(todir_full) for f in files: fromfile = root + "/" + f tofile = todir_full + "/" + f self.sftp.put(fromfile, tofile) log.debug("scp %s -> %s:%s" % (fromfile, self.hostname, tofile)) def __recv(self, f, ready_func, recv_func, log_label, rem):