def download_logs(self):
    reservations = list()
    ssh_username = self.config.globals.ssh_username
    if self.reservations:
        reservations = self.reservations
    else:
        for cloud in self.clouds:
            # Accumulate reservations from every cloud instead of
            # overwriting the list on each iteration
            reservations.extend(cloud.conn.get_all_instances())
    for reservation in reservations:
        for instance in reservation.instances:
            if self.database.check_benchmark(self.benchmark.name, instance.id):
                local_dir = os.path.join(self.config.globals.log_local_path,
                                         self.benchmark.name, instance.id)
                if not os.path.exists(local_dir):
                    os.makedirs(local_dir)
                for path in self.path:
                    file_name = os.path.basename(path)
                    now = datetime.datetime.now().strftime("%H%M%S")
                    # Build a fresh destination path for each remote file so
                    # the previous file's path is not reused as the base
                    local_path = os.path.join(local_dir, file_name)
                    local_path = local_path + '_' + now + '_' + instance.instance_type
                    com = "scp -r " + ssh_username + "@" + \
                        instance.public_dns_name + ":" + path + " " + local_path
                    LOG.debug("Download logs: [%s] download %s into %s"
                              % (self.benchmark.name, file_name, local_path))
                    command = Command(com)
                    command_return = command.execute()
                    if command_return != 0:
                        LOG.error("Download logs: " + command.stdout)
                        LOG.error("Download logs error: " + command.stderr)
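# The functions in this section shell out through a Command wrapper whose
# implementation is not shown here. Below is a minimal sketch of what it is
# assumed to look like (the real class may differ): it runs a command line
# through a shell and records the return code, stdout, and stderr.
import subprocess

class Command(object):
    """Hypothetical sketch of the Command helper used throughout this code."""

    def __init__(self, args):
        # Callers pass either a plain string or a single-element list
        self.args = args if isinstance(args, str) else " ".join(args)
        self.stdout = ""
        self.stderr = ""

    def execute(self):
        # shell=True because callers build full command lines (scp, pipelines)
        process = subprocess.Popen(self.args, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        self.stdout, self.stderr = process.communicate()
        return process.returncode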
def download_logs(self):
    ssh_username = self.config.globals.ssh_username
    for cloud in self.clouds:
        for instance in cloud.get_all_floating_ips():
            if self.database.check_benchmark(self.benchmark.name, instance.instance_id):
                local_dir = os.path.join(self.config.globals.log_local_path,
                                         self.benchmark.name, instance.instance_id)
                if not os.path.exists(local_dir):
                    os.makedirs(local_dir)
                for path in self.path:
                    file_name = os.path.basename(path)
                    now = datetime.datetime.now().strftime("%H%M%S")
                    # Build a fresh destination path for each remote file
                    local_path = os.path.join(local_dir, file_name)
                    local_path = local_path + '_' + now + '_' + instance.instance_id
                    com = "scp -r " + ssh_username + "@" + \
                        instance.ip + ":" + path + " " + local_path
                    LOG.debug("Download logs: [%s] download %s into %s"
                              % (self.benchmark.name, file_name, local_path))
                    command = Command(com)
                    command_return = command.execute()
                    if command_return != 0:
                        LOG.error("Download logs: " + command.stdout)
                        LOG.error("Download logs error: " + command.stderr)
def _update_node_info(self):
    self.nodes = []
    self.num_total_nodes = 0
    self.num_total_cores = 0
    self.num_free_cores = 0
    self.num_down_cores = 0
    pbsnodes_cmd = str(self._pbsnodes_cmd) + " -a"
    pbsnodes = Command([pbsnodes_cmd])
    pbsnodes_rc = pbsnodes.execute()
    if pbsnodes_rc != 0:
        LOG.error("pbsnodes returned %d" % pbsnodes_rc)
        return
    node_line = r"\n(\S+)\n\s+state\s=\s(\S+)\n\s+np\s=\s(\d+)\n"
    node_pattern = re.compile(node_line)
    matches = re.findall(node_pattern, pbsnodes.stdout)
    for match in matches:
        n = Node(match[0], int(match[2]), match[1])
        self.num_total_nodes += 1
        self.num_total_cores += int(match[2])
        if match[1] == "free":
            self.num_free_cores += int(match[2])
        if "down" in match[1]:
            self.num_down_cores += int(match[2])
        else:
            if n.public_dns_name not in self._has_booted:
                self._has_booted.append(n.public_dns_name)
        self.nodes.append(n)
    LOG.debug("Nodes updated: %s total nodes and %s total cores."
              % (self.num_total_nodes, self.num_total_cores))
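# For reference, a quick standalone check of the node-parsing regex above,
# run against a hand-written sample of `pbsnodes -a` output. The sample
# format is an assumption inferred from the pattern, not captured output.
import re

sample = """
node01
     state = free
     np = 8

node02
     state = down,offline
     np = 8
"""

pattern = re.compile(r"\n(\S+)\n\s+state\s=\s(\S+)\n\s+np\s=\s(\d+)\n")
for name, state, np in pattern.findall(sample):
    print "%s: state=%s cores=%s" % (name, state, np)
# Expected: node01 is free with 8 cores, node02 is down with 8 cores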
def _add_new_node(self, public_dns_name, np):
    qmgr_cmd = str(self._qmgr_cmd) + " -c \"create node %s np=%d\""
    qmgr_cmd = qmgr_cmd % (public_dns_name, np)
    add_node = Command([qmgr_cmd])
    add_node_rc = add_node.execute()
    if add_node_rc != 0:
        LOG.error("qmgr returned %d" % add_node_rc)
        return
    LOG.debug("Successfully added node: %s" % public_dns_name)
def scp_log_back(self):
    scp_string = "scp %s@%s:~/%s %s/sleep.log" \
        % (self.config.workload.user, self.master.dns,
           self.config.workload.log_remote, self.config.log_dir)
    scp_cmd = Command(scp_string)
    code = scp_cmd.execute()
    if code == 0:
        LOG.info("Successfully obtained the log from the master node")
    else:
        LOG.error("Error occurred while obtaining the log from the master node")
def _remove_node(self, public_dns_name):
    qmgr_cmd = str(self._qmgr_cmd) + " -c \"delete node %s\""
    qmgr_cmd = qmgr_cmd % public_dns_name
    remove_node = Command([qmgr_cmd])
    remove_node_rc = remove_node.execute()
    if remove_node_rc != 0:
        LOG.error("qmgr returned %d" % remove_node_rc)
        return
    if public_dns_name in self._has_booted:
        self._has_booted.remove(public_dns_name)
    LOG.debug("Successfully removed node: %s" % public_dns_name)
def offline_node(self, public_dns_name):
    pbsnodes_cmd = str(self._pbsnodes_cmd) + " -o %s"
    pbsnodes_cmd = pbsnodes_cmd % public_dns_name
    offline_node = Command([pbsnodes_cmd])
    offline_node_rc = offline_node.execute()
    if offline_node_rc != 0:
        LOG.error("pbsnodes returned %d" % offline_node_rc)
        return
    LOG.debug("Successfully marked node offline: %s" % public_dns_name)
    for node in self.nodes:
        if node.public_dns_name == public_dns_name:
            node.terminate_me = True
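# For illustration, the command lines the three helpers above generate,
# assuming self._qmgr_cmd is "qmgr" and self._pbsnodes_cmd is "pbsnodes":
#   qmgr -c "create node node01 np=8"   (_add_new_node)
#   qmgr -c "delete node node01"        (_remove_node)
#   pbsnodes -o node01                  (offline_node)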
def obtain_end_time(self):
    # Extract the end time from the last "terminated" line of sleep.log,
    # i.e.: cat <experiment_dir>/sleep.log | grep terminated | tail -n 1
    file_name = "%s/%s" % (self.directory, "sleep.log")
    end_time_string = "cat %s | grep terminated | tail -n 1" % (file_name)
    cmd = Command(end_time_string)
    code = cmd.execute()
    if code:
        print "Can't obtain the end time"
        print cmd.stderr
        self.end = None
    else:
        line = cmd.stdout
        items = line.split()
        # No time zone conversion; timestamps are in Chicago time
        self.end = self.convert_time(items[3], False)
        print "End time: %s" % (self.end)
def __init__(self, options):
    self.globals = GlobalConfig(options.global_file)
    self.master = MasterConfig(options.master_file)
    self.clouds = CloudsConfig(options.clouds_file)
    self.phantom_config = PhantomConfig(options.phantom_file)
    self.workers = WorkersConfig(options.workers_file)
    self.workload = WorkloadConfig(options.workload_file)
    self.policy = PolicyConfig(options.policy_file)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    self.phantom_config.domain_name = "%s_%s" % (
        self.phantom_config.domain_name_prefix, timestamp)
    self.experiment_id = timestamp

    # Create a directory for all the logs of this experiment
    self.log_dir = "log/%s" % (self.experiment_id)
    os.makedirs(self.log_dir)  # also creates log/ if it does not exist yet
    self.remote_log = "%s/%s" % (self.log_dir, options.remote_log)
    self.node_log = "%s/%s" % (self.log_dir, options.node_log)
    self.worker_pool_log = "%s/%s" % (self.log_dir, options.worker_pool_log)
    self.discarded_work_log = "%s/%s" % (self.log_dir, options.discarded_work_log)
    self.failure_log = "%s/%s" % (self.log_dir, options.failure_log)

    # To keep current code running for now
    self.threshold = self.policy.threshold
    self.downscaler_interval = self.policy.downscaler_interval

    # Copy config files to the log directory for the current experiment
    copy_string = "cp etc/* %s/" % (self.log_dir)
    copy_cmd = Command(copy_string)
    code = copy_cmd.execute()
    if code == 0:
        LOG.info("Config files have been copied successfully to the log directory")
    else:
        LOG.error("Failed to copy config files to the log directory")
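# For example, a run started at 2013-02-24 21:53:17 gets experiment_id
# "20130224_215317" and writes its logs under "log/20130224_215317/".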
def _update_job_info(self):
    qstat = Command([self._qstat_cmd])
    qstat_rc = qstat.execute()
    if qstat_rc != 0:
        LOG.error("qstat returned %d" % qstat_rc)
        return
    # Matches one job line of qstat output; group 7 is the core count,
    # group 10 is the single-letter job state
    job_line = r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+"
    job_line += r"(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+([A-Z])\s+(\S+)"
    job_pattern = re.compile(job_line)
    queued_cores = 0
    queued_jobs = 0
    total_jobs = 0
    for line in qstat.stdout.split('\n'):
        match = job_pattern.match(line)
        if match:
            if match.group(10) == 'Q':
                queued_cores += int(match.group(7))
                queued_jobs += 1
            total_jobs += 1
    self.num_queued_jobs = queued_jobs
    self.num_queued_cores = queued_cores
    self.num_total_jobs = total_jobs
    LOG.debug("Jobs updated: %s total jobs and %s queued cores."
              % (self.num_total_jobs, self.num_queued_cores))
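# A similar standalone check of the job-parsing regex, against a sample line
# in the 11-column format the pattern expects. The exact qstat output format
# here is an assumption inferred from the regex, not captured output.
import re

job_pattern = re.compile(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+"
                         r"(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+([A-Z])\s+(\S+)")
line = "123.master user batch job1 5678 1 4 2gb 01:00:00 Q 00:10:00"
match = job_pattern.match(line)
if match:
    print "state=%s cores=%s" % (match.group(10), match.group(7))
# Expected output: state=Q cores=4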
def run(self):
    batch_index = 0
    for batch in self.batch_files:
        last_line = os.popen("tail -n 1 %s" % batch).read()
        # Use the sleep time if one is specified on the last line
        if ("SLEEP" in last_line) or ("sleep" in last_line):
            # Last item in the line
            sleep_time = int(last_line.split()[-1])
        else:
            sleep_time = 0
        # Copy the batch file to the log directory
        copy_string = "cp %s %s/" % (batch, self.config.log_dir)
        copy_cmd = Command(copy_string)
        copy_cmd.execute()
        # Scp this file to the master
        scp_string = "scp %s %s@%s:~/%s" % (
            batch, self.config.workload.user, self.master.dns,
            self.config.workload.submit_remote)
        scp_cmd = Command(scp_string)
        code = scp_cmd.execute()
        if code != 0:
            LOG.error("Error occurred during copying batch file %s to the master node" % (batch))
        # Send this batch to the work queue
        exec_cmd = RemoteCommand(
            config=self.config,
            hostname=self.master.dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command='condor_submit %s' % (self.config.workload.submit_remote))
        code = exec_cmd.execute()
        if code != 0:
            LOG.error("Error occurred during submission of batch file %s" % (batch))
        LOG.info("%s" % (batch))
        batch_index += 1
        # Sleep for a while if this is specified in the batch file
        time.sleep(sleep_time)
        # Periodic log saving, every 100 jobs
        if batch_index == 100:
            self.scp_log_back()
            batch_index = 0
        # Give the scheduler enough time to place the jobs,
        # unless a sleep time was specified in the batch file
        if sleep_time == 0:
            time.sleep(60)
    # After this loop, go into monitor mode (run while there are jobs in the queue)
    LOG.info("Workload turns into monitor mode: this thread will stop "
             "when there are no more jobs in the queue. Sleep interval: %d"
             % (self.interval))
    jobs = Jobs(self.config, self.master.dns)
    count = jobs.get_current_number()
    print "Initial job count: %d" % (count)
    counter = 0
    while count > 0:
        time.sleep(self.interval)
        count = jobs.get_current_number()
        print "Job count: %d" % (count)
        counter += 1
        # Periodic log saving, every 20 iterations
        if counter == 20:
            self.scp_log_back()
            counter = 0
    # Check that the queue is empty one more time
    time.sleep(120)
    LOG.info("Checking that the queue is empty one more time after 120 seconds")
    count = jobs.get_current_number()
    while count > 0:
        time.sleep(self.interval)
        count = jobs.get_current_number()
        print "Job count: %d" % (count)
    LOG.info("Workload completed")
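# run() also relies on a RemoteCommand helper that executes a command on the
# master over SSH. Its real implementation is not shown in this section; the
# following is a minimal sketch under the assumption that it simply wraps the
# ssh client, mirroring the Command sketch earlier.
import subprocess

class RemoteCommand(object):
    """Hypothetical sketch: runs a command on a remote host via ssh."""

    def __init__(self, config, hostname, ssh_private_key, user, command):
        self.config = config
        self.ssh_string = "ssh -i %s %s@%s '%s'" % (
            ssh_private_key, user, hostname, command)
        self.stdout = ""
        self.stderr = ""

    def execute(self):
        process = subprocess.Popen(self.ssh_string, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        self.stdout, self.stderr = process.communicate()
        return process.returncode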