def update_current_list(self):
    """Refresh self.list with the jobs currently in the remote queue.

    Runs self.command_job_list on the master over SSH, parses the
    whitespace-separated output into Job objects, and prints a per-cloud
    tally (nodes whose name contains "uc" vs "sdcs").
    """
    rcmd = RemoteCommand(
        config=self.config,
        hostname=self.master_dns,
        ssh_private_key=self.config.globals.priv_path,
        user=self.config.workload.user,
        command=self.command_job_list)
    rcmd.execute()
    queue_state = rcmd.stdout
    self.list = []
    # Fix: identity comparison with None instead of "!= None"
    if queue_state is not None:
        items = queue_state.split()
        if self.config.workload.user in items:
            # The listing repeats 6 whitespace-separated fields per job;
            # the first record begins one token before the first
            # occurrence of the user name.
            start = items.index(self.config.workload.user) - 1
            uc = 0
            sdcs = 0
            total = 0
            for i in range(start, len(items), 6):
                # fields used: job id (i), runtime (i+4), execute node (i+5)
                self.list.append(Job(items[i], items[i + 4], items[i + 5]))
                node = items[i + 5]
                if "uc" in node:
                    uc += 1
                if "sdcs" in node:
                    sdcs += 1
                total += 1
            # Fix: call-form print works under both Python 2 and Python 3
            print("Jobs: total: %d, Hotel: %d, Sierra: %d" % (total, uc, sdcs))
def get_cloud_termination_list(self):
    """Build the list of instances in this cloud that are safe to terminate.

    Queries condor_status on the master to learn which workers have
    checked in, then matches those (partial) names against the instances
    actually running in the cloud.  The master itself is never included.

    Returns:
        list: instance objects eligible for termination.
    """
    condor_list = []
    command = "condor_status"
    rcmd = RemoteCommand(
        config=self.config,
        hostname=self.master.dns,
        ssh_private_key=self.config.globals.priv_path,
        user=self.config.workload.user,
        command=command)
    rcmd.execute()
    if rcmd.stdout:
        # condor status will be lines so split them
        all_lines = rcmd.stdout.split("\n")
        for line in all_lines:
            stripped = line.strip()
            # skip empty lines
            if not stripped:
                continue
            # if its the first line then go to the next one
            if stripped.startswith("Name"):
                continue
            # if we find a line that starts with Total then we are done
            if stripped.startswith("Total"):
                break
            # it must be a line of interest, parse it by spaces, e.g.:
            # "vm-148-102.uc.futu LINUX X86_64 Unclaimed Idle 0.150 2048 0+00:00:04"
            line_columns = line.split()
            try:
                tmp_fqdn = line_columns[0].strip()
                condor_list.append(tmp_fqdn)
            except Exception as expt:
                LOG.info("Error parsing condor status, line says : %s and the expt says : %s" % (line, str(expt)))
    LOG.info("Condor worker names: %s" % (str(condor_list)))
    # instances running in this cloud
    cloud = Cloud(self.cloud_name, self.config)
    vms = cloud.get_running_instances()
    # add to the termination list only workers (no master) that have checked in with the master
    termination_list = []
    for instance in vms:
        if instance.public_dns_name == self.master.dns:
            continue  # never terminate the master
        for worker_partial_name in condor_list:
            if worker_partial_name in instance.public_dns_name:
                termination_list.append(instance)
                condor_list.remove(worker_partial_name)
                # Fix: stop scanning after a match -- removing from
                # condor_list while still iterating over it skipped the
                # element following the removed one.
                break
    termination_list_names = []
    for instance in termination_list:
        termination_list_names.append(instance.public_dns_name)
    LOG.info("Termination list for %s: %s" % (self.cloud_name, str(termination_list_names)))
    return termination_list
def get_current_number(self):
    """Return the current job count reported by the master.

    Executes self.command_job_count remotely; an empty/absent output is
    treated as zero jobs.
    """
    counter_cmd = RemoteCommand(
        config=self.config,
        hostname=self.master_dns,
        ssh_private_key=self.config.globals.priv_path,
        user=self.config.workload.user,
        command=self.command_job_count)
    counter_cmd.execute()
    output = counter_cmd.stdout
    return int(output) if output else 0
def deploy_software(self):
    """Install the BioPerf benchmark suite on eligible floating-IP instances.

    For each floating IP in every cloud that is registered for this
    benchmark in the database, verifies SSH (port 22) is reachable and
    then downloads, unpacks, and installs BioPerf over SSH.  Failures of
    individual remote commands are logged and the loop continues.
    """
    ssh_priv_key = self.config.globals.ssh_priv_key
    ssh_timeout = int(self.config.globals.ssh_timeout)
    # Fix: removed unused locals (ssh_username, reservations, not_available)
    for cloud in self.clouds:
        for instance in cloud.get_all_floating_ips():
            if not self.database.check_benchmark(self.benchmark.name, instance.instance_id):
                continue
            if not check_port_status(instance.ip, 22, ssh_timeout):
                LOG.error("Deploy_software: the port 22 is not available right now. please try it later")
                continue
            cmds = [
                "wget %s" % (self.url),
                "sudo apt-get update",
                # NOTE(review): no -y flag -- apt-get may prompt; confirm the
                # remote shell is safe for non-interactive installs
                "sudo apt-get install unzip",
                "unzip BioPerf.zip",
                # disable the interactive "read BIOPERF" prompt in the installer
                "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
                "./install-BioPerf.sh",
            ]
            for c in cmds:
                command = RemoteCommand(instance.ip, ssh_priv_key, c)
                if command.execute() != 0:
                    LOG.error("Deploy_software: " + command.stdout)
                    LOG.error("Deploy_software error: " + command.stderr)
def excute_benchmarks(self, dataset_size):
    """Launch the BioPerf run on every registered instance.

    dataset_size selects the input class: "large", "medium", or any other
    value for the small data set.  Answers to run-bioperf.sh's interactive
    prompts are written to a file "c" and piped in.
    (Method name kept as-is, including the "excute" spelling, because
    external callers use it.)
    """
    ssh_priv_key = self.config.globals.ssh_priv_key
    ssh_username = self.config.globals.ssh_username
    reservations = list()
    if self.reservations:
        reservations = self.reservations
    else:
        # NOTE(review): each iteration overwrites the previous value, so only
        # the last cloud's reservations survive -- confirm this is intended.
        for cloud in self.clouds:
            reservations = cloud.conn.get_all_instances()
    # NOTE(review): the first echoed letter per size (C/B/A) differs from the
    # commented-out sed lines in the original (A/B/C) -- verify mapping.
    if dataset_size == "large":
        answers = "echo C>c;echo H>>c;echo '1'>>c;echo Y>>c"
    elif dataset_size == "medium":
        answers = "echo B>c;echo H>>c;echo '1'>>c;echo Y>>c"
    else:
        answers = "echo A>c;echo H>>c;echo '1'>>c;echo Y>>c"
    for reservation in reservations:
        for instance in reservation.instances:
            if not self.database.check_benchmark(self.benchmark.name, instance.id):
                continue
            cmds = [
                "sed -i '13c rm -f $BIOPERF/Outputs/log' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
                "sed -i '60c FASTA=0' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                "sed -i '62c GRAPPA=0' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                "./BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
                answers,
                "cat c|./BioPerf/Scripts/Run-scripts/run-bioperf.sh > ~/BioPerf/Outputs/log",
            ]
            for c in cmds:
                command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                if command.execute() != 0:
                    LOG.error("Excute_benchmarks: " + command.stdout)
                    LOG.error("Excute_benchmarks: " + command.stderr)
def excute_benchmarks(self):
    """Launch the default (class-A) BioPerf run on every registered instance.

    Patches CleanOutputs.sh and run-bioperf.sh in place via sed so the run
    is fully non-interactive, then starts the benchmarks with canned
    'Y' answers and redirects output to ~/BioPerf/Outputs/log.
    (Method name kept as-is, including the "excute" spelling, because
    external callers use it.)
    """
    ssh_priv_key = self.config.globals.ssh_priv_key
    ssh_username = self.config.globals.ssh_username
    reservations = list()
    if self.reservations:
        reservations = self.reservations
    else:
        # NOTE(review): each iteration overwrites the previous value, so only
        # the last cloud's reservations survive -- confirm this is intended.
        for cloud in self.clouds:
            reservations = cloud.conn.get_all_instances()
    # sed patches that pre-answer the interactive prompts, then the run itself
    remote_cmds = [
        "sed -i '5c input='y'' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
        "sed -i '13c rm -f $BIOPERF/Outputs/log' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
        "sed -i '21c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "sed -i '26c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "sed -i '10c arch='X'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "sed -i '71c input3='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "sed -i '134c input='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "sed -i '145c user1='y'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
        "./BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
        "echo 'Y' 'Y'|./BioPerf/Scripts/Run-scripts/run-bioperf.sh > ~/BioPerf/Outputs/log",
    ]
    for reservation in reservations:
        for instance in reservation.instances:
            if not self.database.check_benchmark(self.benchmark.name, instance.id):
                continue
            for c in remote_cmds:
                command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                if command.execute() != 0:
                    LOG.error("Excute_benchmarks: " + command.stdout)
                    LOG.error("Excute_benchmarks: " + command.stderr)
def deploy_software(self):
    """Download and install BioPerf on every registered reservation instance.

    Walks all reservations (either cached on self or fetched from the
    clouds), skips instances whose SSH port is unreachable, and runs the
    install commands over SSH.  apt-get update/install are deliberately
    absent in this variant (they were commented out in the original).
    """
    ssh_priv_key = self.config.globals.ssh_priv_key
    ssh_timeout = int(self.config.globals.ssh_timeout)
    # Fix: removed unused locals (ssh_username, not_available)
    if self.reservations:
        reservations = self.reservations
    else:
        reservations = list()
        for cloud in self.clouds:
            # Fix: accumulate instead of overwriting so reservations from
            # every cloud (not only the last one) are deployed to.
            reservations.extend(cloud.conn.get_all_instances())
    for reservation in reservations:
        for instance in reservation.instances:
            if not self.database.check_benchmark(self.benchmark.name, instance.id):
                continue
            if not check_port_status(instance.ip_address, 22, ssh_timeout):
                LOG.error("Deploy_software: the port 22 is not available right now. please try it later")
                continue
            cmds = [
                "wget %s" % (self.url),
                "unzip BioPerf.zip",
                # disable the interactive "read BIOPERF" prompt in the installer
                "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
                "./install-BioPerf.sh",
            ]
            for c in cmds:
                command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                if command.execute() != 0:
                    LOG.error("Deploy_software: " + command.stdout)
                    LOG.error("Deploy_software error: " + command.stderr)
def idle_workers_count(self):
    """Return the number of idle condor workers reported by the master.

    Runs "condor_status | grep Idle" on the master and divides the token
    count by 8, since condor_status normally prints 8 columns per worker
    line.  A non-multiple-of-8 count is logged as an error but still
    floor-divided.
    """
    command = "condor_status | grep Idle"
    rcmd = RemoteCommand(
        config=self.config,
        hostname=self.master.dns,
        ssh_private_key=self.config.globals.priv_path,
        user=self.config.workload.user,
        command=command)
    rcmd.execute()
    out = rcmd.stdout
    # Fix: identity comparison with None instead of "== None"
    if out is None:
        return 0
    items = out.split()
    item_count = len(items)
    # there are normally 8 items per line (i.e. per worker) in the condor_status output
    if item_count % 8 != 0:
        LOG.error("Number of items in the output of condor_status is not a multiple of 8")
    # Fix: explicit floor division -- "/" would yield a float under Python 3
    return item_count // 8
def exec_cmd_on_condor_master(self, cmd_string):
    """Run cmd_string on the condor master over SSH.

    Returns the command's stdout on success, or None (after logging the
    failure and its stderr) when the remote command exits non-zero.
    """
    remote = RemoteCommand(
        config=self.config,
        hostname=self.master_dns,
        ssh_private_key=self.config.globals.priv_path,
        user=self.config.workload.user,
        command=cmd_string)
    exit_code = remote.execute()
    if exit_code:
        LOG.error("Command failed: %s" % (cmd_string))
        LOG.error("Stderr: %s" % (remote.stderr))
        return None
    return remote.stdout
def deploy_software(self):
    """Install BioPerf plus its reference databases on every registered instance.

    Wipes the remote home directory, installs BioPerf, downloads the
    Swissprot/Pfam/nr databases, patches the run scripts and the Phylip
    promlk source so everything runs non-interactively, then rebuilds and
    installs promlk.  Individual remote-command failures are logged and
    deployment continues.
    """
    ssh_priv_key = self.config.globals.ssh_priv_key
    ssh_timeout = int(self.config.globals.ssh_timeout)
    # Fix: removed unused locals (ssh_username, not_available)
    if self.reservations:
        reservations = self.reservations
    else:
        reservations = list()
        for cloud in self.clouds:
            # Fix: accumulate instead of overwriting so reservations from
            # every cloud (not only the last one) are deployed to.
            reservations.extend(cloud.conn.get_all_instances())
    for reservation in reservations:
        for instance in reservation.instances:
            if not self.database.check_benchmark(self.benchmark.name, instance.id):
                continue
            if not check_port_status(instance.ip_address, 22, ssh_timeout):
                LOG.error("Deploy_software: the port 22 is not available right now. please try it later")
                continue
            cmds = [
                "rm -rf ~/*",
                "wget %s" % (self.url),
                "apt-get update",
                "apt-get install unzip",
                "unzip BioPerf.zip",
                # disable the interactive "read BIOPERF" prompt in the installer
                "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
                "./install-BioPerf.sh",
                # fetch the reference databases used by the benchmarks
                "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/swissprot.tar.gz",
                "tar -xvf swissprot.tar.gz",
                "mv Swissprot/* .",
                "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/Pfam",
                "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/nr",
                # export DATABASES=~/ via the login profile
                "sed -i '10 i\DATABASES=~/' ~/.profile",
                "sed -i '10 i\export DATABASES' ~/.profile",
                "sed -i '5c input='y'' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
                "sed -i '21c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                "sed -i '26c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                "sed -i '10c arch='X'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                "sed -i '71c input3='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
                # patch promlk so it no longer blocks on interactive scanf
                '''sed -i "659c ch='Y';" ./BioPerf/Source-codes/Phylip/src/promlk.c''',
                "sed -i '/scanf/c //' ./BioPerf/Source-codes/Phylip/src/promlk.c",
                "cd ./BioPerf/Source-codes/Phylip/src;make promlk;cd ~",
                # Fix: this mv command's string literal was broken across a
                # physical line in the source; rejoined into one command.
                "mv ./BioPerf/Source-codes/Phylip/src/promlk ./BioPerf/Binaries/x86-Binaries/Phylip",
            ]
            for c in cmds:
                command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                if command.execute() != 0:
                    LOG.error("Deploy_software: " + command.stdout)
                    LOG.error("Deploy_software error: " + command.stderr)
def terminate_condor(self, master_dns):
    """Hard-stop the Condor daemon on this worker.

    Issues "condor_off -fast" for this worker's DNS name, executed
    remotely on the master node given by master_dns, and logs the result.
    """
    off_command = "condor_off -fast %s" % (self.dns)
    remote = RemoteCommand(
        config=self.config,
        hostname=master_dns,
        ssh_private_key=self.config.globals.priv_path,
        user="******",
        command=off_command,
    )
    if remote.execute() == 0:
        LOG.info("Successfully stopped Condor daemon on worker %s instance id : %s" % (self.dns, self.instance))
    else:
        LOG.error(
            "Error occurred during Condor daemon termination on worker %s instance: %s"
            % (self.dns, self.instance)
        )
def offline(self, master_dns):
    """Peacefully take this worker offline via "condor_off -peaceful".

    Marking a node offline has to be done from the master side, so
    master_dns is required; without it only an error is logged.
    """
    if not master_dns:
        LOG.error("Can't mark instance offline without master's dns")
        return
    off_command = "condor_off -peaceful %s" % (self.dns)
    remote = RemoteCommand(
        config=self.config,
        hostname=master_dns,
        ssh_private_key=self.config.globals.priv_path,
        user="******",
        command=off_command,
    )
    if remote.execute() == 0:
        LOG.info("Successfully marked instance offline: %s" % (self.instance))
    else:
        LOG.error("Error occurred during marking instance offline: %s" % (self.instance))
def run(self):
    """Submit every batch file to the condor queue, then monitor until it drains.

    For each batch file: read an optional trailing SLEEP directive from
    its last line, archive the file into the log directory, scp it to the
    master, and condor_submit it there.  Logs are scp'd back every 100
    submissions.  After all batches are submitted the thread polls the
    queue until it is empty, double-checking emptiness once more after a
    120-second grace period.
    """
    batch_index = 0
    for batch in self.batch_files:
        last_line = os.popen("tail -n 1 %s" % batch).read()
        # if sleep time is specified it is the last item of the last line
        if ("SLEEP" in last_line) or ("sleep" in last_line):
            # Fix: simplified "[-1:][0]" to the equivalent "[-1]"
            sleep_time = int(last_line.split()[-1])
        else:
            sleep_time = 0
        # Copy the batch file to the log directory (best effort)
        copy_string = "cp %s %s/" % (batch, self.config.log_dir)
        copy_cmd = Command(copy_string)
        copy_cmd.execute()
        # Scp this file to the master
        scp_string = "scp %s %s@%s:~/%s" % (batch, self.config.workload.user, self.master.dns, self.config.workload.submit_remote)
        scp_cmd = Command(scp_string)
        if scp_cmd.execute() != 0:
            LOG.error("Error occurred during copying batch file %s to the master node" % (batch))
        # Send this batch to the work queue
        exec_cmd = RemoteCommand(
            config=self.config,
            hostname=self.master.dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command='condor_submit %s' % (self.config.workload.submit_remote))
        if exec_cmd.execute() != 0:
            LOG.error("Error occurred during submission of batch file %s" % (batch))
        LOG.info("%s" % (batch))
        batch_index += 1
        # Sleep for a while if this is specified in the batch file
        time.sleep(sleep_time)
        # Periodic log saving, every 100 jobs
        if batch_index == 100:
            self.scp_log_back()
            batch_index = 0
        # To give it enough time so the jobs are scheduled; unless specified otherwise
        if sleep_time == 0:
            time.sleep(60)
    # After this for loop, go into monitor mode (run while there are jobs in
    # the queue).  Fix: this log message's string literal was broken across a
    # physical line in the source; rejoined into one message.
    LOG.info("Workload turns into monitor mode: this thread will stop when there are no more jobs in the queue. Sleep interval: %d" % (self.interval))
    jobs = Jobs(self.config, self.master.dns)
    count = jobs.get_current_number()
    # Fix: call-form print works under both Python 2 and Python 3
    print("Initial job count: %d" % (count))
    counter = 0
    while count > 0:
        time.sleep(self.interval)
        count = jobs.get_current_number()
        print("Job count: %d" % (count))
        counter += 1
        # Periodic log saving, every 20 iterations
        if counter == 20:
            self.scp_log_back()
            counter = 0
    # check that the queue is empty one more time
    time.sleep(120)
    LOG.info("Checking that the queue is empty one more time after 120 seconds")
    count = jobs.get_current_number()
    while count > 0:
        time.sleep(self.interval)
        count = jobs.get_current_number()
        print("Job count: %d" % (count))
    LOG.info("Workload completed")