Exemplo n.º 1
0
 def download_logs(self):
     """Download the benchmark's log paths from matching instances via scp.

     Uses ``self.reservations`` when it is non-empty; otherwise collects the
     reservations of every cloud in ``self.clouds``. For each instance that
     ``self.database.check_benchmark`` accepts, every entry of ``self.path``
     is copied into ``<log_local_path>/<benchmark name>/<instance id>/`` as
     ``<basename>_<HHMMSS>_<instance type>``. Failed copies are logged and
     do not stop the remaining downloads.
     """
     ssh_username = self.config.globals.ssh_username
     if self.reservations:
         reservations = self.reservations
     else:
         # Accumulate across clouds; a plain assignment inside the loop
         # would keep only the last cloud's reservations.
         reservations = list()
         for cloud in self.clouds:
             reservations.extend(cloud.conn.get_all_instances())
     for reservation in reservations:
         for instance in reservation.instances:
             if not self.database.check_benchmark(self.benchmark.name,
                                                  instance.id):
                 continue
             instance_dir = os.path.join(
                 self.config.globals.log_local_path,
                 self.benchmark.name, instance.id)
             if not os.path.exists(instance_dir):
                 os.makedirs(instance_dir)
             for path in self.path:
                 file_name = os.path.basename(path)
                 now = datetime.datetime.now().strftime("%H%M%S")
                 # Always start from the instance directory so the previous
                 # file's destination never becomes part of the next one.
                 local_path = os.path.join(
                     instance_dir,
                     file_name + '_' + now + '_' + instance.instance_type)
                 com = "scp -r " + ssh_username + "@" + \
                       instance.public_dns_name + ":" + path + " " + \
                       local_path
                 LOG.debug("Download logs: [%s] download %s into %s" %
                           (self.benchmark.name, file_name, local_path))
                 command = Command(com)
                 command_return = command.execute()
                 if command_return != 0:
                     LOG.error("Download logs: " + command.stdout)
                     LOG.error("Download logs error: " + command.stderr)
Exemplo n.º 2
0
 def download_logs(self):
     """Download the benchmark's log paths via scp using floating IPs.

     Iterates the entries returned by ``cloud.get_all_floating_ips()`` for
     every cloud in ``self.clouds``. For each entry whose ``instance_id`` is
     accepted by ``self.database.check_benchmark``, every path in
     ``self.path`` is copied from ``<ssh_username>@<ip>`` into
     ``<log_local_path>/<benchmark name>/<instance id>/`` as
     ``<basename>_<HHMMSS>_<instance id>``. Failed copies are logged and do
     not stop the remaining downloads.
     """
     ssh_username = self.config.globals.ssh_username
     for cloud in self.clouds:
         for instance in cloud.get_all_floating_ips():
             if not self.database.check_benchmark(self.benchmark.name,
                                                  instance.instance_id):
                 continue
             instance_dir = os.path.join(
                 self.config.globals.log_local_path,
                 self.benchmark.name, instance.instance_id)
             if not os.path.exists(instance_dir):
                 os.makedirs(instance_dir)
             for path in self.path:
                 file_name = os.path.basename(path)
                 now = datetime.datetime.now().strftime("%H%M%S")
                 # Always start from the instance directory so the previous
                 # file's destination never becomes part of the next one.
                 local_path = os.path.join(
                     instance_dir,
                     file_name + '_' + now + '_' + instance.instance_id)
                 com = "scp -r " + ssh_username + "@" + \
                       instance.ip + ":" + path + " " + \
                       local_path
                 LOG.debug("Download logs: [%s] download %s into %s" %
                           (self.benchmark.name, file_name, local_path))
                 command = Command(com)
                 command_return = command.execute()
                 if command_return != 0:
                     LOG.error("Download logs: " + command.stdout)
                     LOG.error("Download logs error: " + command.stderr)
Exemplo n.º 3
0
 def _update_node_info(self):
     """Rebuild ``self.nodes`` and the core counters from ``pbsnodes -a``.

     All counters and the node list are reset first. If the command exits
     with a non-zero status the error is logged and the method returns with
     the reset (empty) state. Otherwise each ``name / state / np`` record
     found in the command's stdout becomes a ``Node`` and updates the
     totals. Nodes whose state does not contain "down" are also recorded in
     ``self._has_booted`` (once each).
     """
     self.nodes = []
     self.num_total_nodes = 0
     self.num_total_cores = 0
     self.num_free_cores = 0
     self.num_down_cores = 0
     pbsnodes_cmd = str(self._pbsnodes_cmd) + " -a"
     pbsnodes = Command([pbsnodes_cmd])
     pbsnodes_rc = pbsnodes.execute()
     if pbsnodes_rc != 0:
         LOG.error("pbsnodes returned %d" % pbsnodes_rc)
         return
     # Raw string: the regex engine interprets \n, \s, \S and \d itself, so
     # the literal no longer relies on invalid string escape sequences.
     node_pattern = re.compile(
         r"\n(\S+)\n\s+state\s=\s(\S+)\n\s+np\s=\s(\d+)\n")
     for name, state, np_text in node_pattern.findall(pbsnodes.stdout):
         cores = int(np_text)
         node = Node(name, cores, state)
         self.num_total_nodes += 1
         self.num_total_cores += cores
         if state == "free":
             self.num_free_cores += cores
         if "down" in state:
             self.num_down_cores += cores
         elif node.public_dns_name not in self._has_booted:
             # A node reported in any non-down state counts as booted.
             self._has_booted.append(node.public_dns_name)
         self.nodes.append(node)
     LOG.debug("Nodes updated: %s total nodes and %s total cores." % (
         self.num_total_nodes, self.num_total_cores))
Exemplo n.º 4
0
 def download_logs(self):
     """Download the benchmark's log paths from matching instances via scp.

     Uses ``self.reservations`` when it is non-empty; otherwise collects the
     reservations of every cloud in ``self.clouds``. For each instance that
     ``self.database.check_benchmark`` accepts, every entry of ``self.path``
     is copied into ``<log_local_path>/<benchmark name>/<instance id>/`` as
     ``<basename>_<HHMMSS>_<instance type>``. Failed copies are logged and
     do not stop the remaining downloads.
     """
     ssh_username = self.config.globals.ssh_username
     if self.reservations:
         reservations = self.reservations
     else:
         # Accumulate across clouds; a plain assignment inside the loop
         # would keep only the last cloud's reservations.
         reservations = list()
         for cloud in self.clouds:
             reservations.extend(cloud.conn.get_all_instances())
     for reservation in reservations:
         for instance in reservation.instances:
             if not self.database.check_benchmark(self.benchmark.name, instance.id):
                 continue
             instance_dir = os.path.join(self.config.globals.log_local_path, self.benchmark.name, instance.id)
             if not os.path.exists(instance_dir):
                 os.makedirs(instance_dir)
             for path in self.path:
                 file_name = os.path.basename(path)
                 suffix = '_' + datetime.datetime.now().strftime("%H%M%S") + '_' + instance.instance_type
                 # Always start from the instance directory so the previous
                 # file's destination never becomes part of the next one.
                 local_path = os.path.join(instance_dir, file_name + suffix)
                 com = "scp -r " + ssh_username + "@" + instance.public_dns_name + ":" + path + " " + local_path
                 LOG.debug("Download logs: [%s] download %s into %s" % (self.benchmark.name, file_name, local_path))
                 command = Command(com)
                 command_return = command.execute()
                 if command_return != 0:
                     LOG.error("Download logs: " + command.stdout)
                     LOG.error("Download logs error: " + command.stderr)
Exemplo n.º 5
0
 def _add_new_node(self, public_dns_name, np):
     """Run the qmgr "create node" command for ``public_dns_name``.

     ``np`` is formatted as an integer into the command's ``np=`` attribute.
     A non-zero exit status is logged as an error; success is logged at
     debug level. Nothing is returned in either case.
     """
     template = str(self._qmgr_cmd) + " -c \"create node %s np=%d\""
     creator = Command([template % (public_dns_name, np)])
     status = creator.execute()
     if status == 0:
         LOG.debug("Successfully added node: %s" % public_dns_name)
     else:
         LOG.error("qmgr returned %d" % status)
Exemplo n.º 6
0
    def scp_log_back(self):
        """Copy the remote workload log from the master node with scp.

        The file ``~/<log_remote>`` of the workload user on the master is
        written to ``<log_dir>/sleep.log``. The outcome is only logged.
        """
        workload = self.config.workload
        scp_string = "scp %s@%s:~/%s %s/sleep.log" % (
            workload.user,
            self.master.dns,
            workload.log_remote,
            self.config.log_dir,
        )
        if Command(scp_string).execute() != 0:
            LOG.error("Error occurred during obtaining the log from the master node")
            return
        LOG.info("Successfully obtained the log from the master node")
Exemplo n.º 7
0
 def _remove_node(self, public_dns_name):
     """Run the qmgr "delete node" command for ``public_dns_name``.

     On a non-zero exit status the error is logged and nothing else
     changes. On success the name is also dropped from
     ``self._has_booted`` if it is present there.
     """
     template = str(self._qmgr_cmd) + " -c \"delete node %s\""
     deleter = Command([template % public_dns_name])
     status = deleter.execute()
     if status != 0:
         LOG.error("qmgr returned %d" % status)
         return
     booted = self._has_booted
     if public_dns_name in booted:
         booted.remove(public_dns_name)
     LOG.debug("Successfully removed node: %s" % public_dns_name)
Exemplo n.º 8
0
 def offline_node(self, public_dns_name):
     """Run ``pbsnodes -o`` for a node and flag it for termination.

     On a non-zero exit status the error is logged and no node is touched.
     On success every entry of ``self.nodes`` whose ``public_dns_name``
     matches gets ``terminate_me = True``.
     """
     template = str(self._pbsnodes_cmd) + " -o %s"
     marker = Command([template % public_dns_name])
     status = marker.execute()
     if status != 0:
         LOG.error("pbsnodes returned %d" % status)
         return
     LOG.debug("Successfully marked node offline: %s" % public_dns_name)
     for candidate in self.nodes:
         if candidate.public_dns_name == public_dns_name:
             candidate.terminate_me = True
Exemplo n.º 9
0
    def obtain_end_time(self):
        """Set ``self.end`` from the last "terminated" line of sleep.log.

        Runs ``cat <directory>/sleep.log | grep terminated | tail -n 1`` and
        passes the fourth whitespace-separated field of the output to
        ``self.convert_time`` (with ``False`` as second argument).
        ``self.end`` is set to ``None`` when the command fails or when its
        output has fewer than four fields, e.g. because no line matched.
        """
        file_name = "%s/%s" % (self.directory, "sleep.log")
        end_time_string = "cat %s | grep terminated | tail -n 1" % (file_name)
        cmd = Command(end_time_string)
        code = cmd.execute()
        if code:
            print("Can't obtain the end time")
            print(cmd.stderr)
            self.end = None
            return

        items = cmd.stdout.split()
        if len(items) < 4:
            # NOTE(review): presumably the pipeline is run by a shell, in
            # which case its status is that of `tail` and a zero code does
            # not guarantee that grep matched anything - confirm.
            print("Can't obtain the end time")
            self.end = None
            return

        # no time conversion; Chicago time
        self.end = self.convert_time(items[3], False)
        print("End time: %s" % (self.end))
Exemplo n.º 10
0
    def __init__(self, options):
        """Load all configuration sections and prepare the experiment log dir.

        ``options`` supplies the config file locations (``global_file``,
        ``master_file``, ``clouds_file``, ``phantom_file``, ``workers_file``,
        ``workload_file``, ``policy_file``) and the log file names
        (``remote_log``, ``node_log``, ``worker_pool_log``,
        ``discarded_work_log``, ``failure_log``).

        A timestamp taken at construction time is used as the experiment id,
        as the suffix of the phantom domain name, and as the name of the
        ``log/<experiment id>`` directory, which is created here. The
        contents of ``etc/`` are then copied into that directory.

        Raises:
            OSError: if the log directory cannot be created (for example
                because it already exists).
        """
        self.globals = GlobalConfig(options.global_file)
        self.master = MasterConfig(options.master_file)
        self.clouds = CloudsConfig(options.clouds_file)
        self.phantom_config = PhantomConfig(options.phantom_file)
        self.workers = WorkersConfig(options.workers_file)
        self.workload = WorkloadConfig(options.workload_file)
        self.policy = PolicyConfig(options.policy_file)

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        self.phantom_config.domain_name = "%s_%s" % (self.phantom_config.domain_name_prefix, timestamp)

        self.experiment_id = timestamp
        # Create directory for all the logs. makedirs also creates the
        # parent "log" directory on a first run, where os.mkdir would fail.
        self.log_dir = "log/%s" % (self.experiment_id)
        os.makedirs(self.log_dir)
        self.remote_log = "%s/%s" % (self.log_dir, options.remote_log)
        self.node_log = "%s/%s" % (self.log_dir, options.node_log)
        self.worker_pool_log = "%s/%s" % (self.log_dir, options.worker_pool_log)
        self.discarded_work_log = "%s/%s" % (self.log_dir, options.discarded_work_log)
        self.failure_log = "%s/%s" % (self.log_dir, options.failure_log)

        # to keep current code running for now
        self.threshold = self.policy.threshold
        self.downscaler_interval = self.policy.downscaler_interval

        # Copy config files to the log directory for current experiment
        copy_string = "cp etc/* %s/" % (self.log_dir)
        copy_cmd = Command(copy_string)
        code = copy_cmd.execute()
        if code == 0:
            LOG.info("Config files have been copied successfully to the log directory")
        else:
            # Best effort: keep going, but do not hide the failure.
            LOG.error("Config files could not be copied to the log directory")
Exemplo n.º 11
0
 def _update_job_info(self):
     """Refresh the job counters from the output of the qstat command.

     If the command exits with a non-zero status the error is logged and
     the counters are left untouched. Otherwise every stdout line matching
     the eleven-field job pattern counts as a job. Lines whose tenth field
     is ``Q`` also count as queued jobs, and their seventh field is added
     to the queued core total.
     """
     qstat = Command([self._qstat_cmd])
     qstat_rc = qstat.execute()
     if qstat_rc != 0:
         LOG.error("qstat returned %d" % qstat_rc)
         return
     # Raw strings keep the same pattern text without relying on invalid
     # string escape sequences such as "\S".
     job_pattern = re.compile(
         r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+"
         r"(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+([A-Z])\s+(\S+)")
     queued_cores = 0
     queued_jobs = 0
     total_jobs = 0
     for line in qstat.stdout.split('\n'):
         match = job_pattern.match(line)
         if not match:
             continue
         total_jobs += 1
         if match.group(10) == 'Q':
             queued_cores += int(match.group(7))
             queued_jobs += 1
     self.num_queued_jobs = queued_jobs
     self.num_queued_cores = queued_cores
     self.num_total_jobs = total_jobs
     LOG.debug("Jobs updated: %s total jobs and %s queued cores." % (
         self.num_total_jobs, self.num_queued_cores))
Exemplo n.º 12
0
    def run(self):
        """Submit every batch file to the master, then wait for the queue to drain.

        For each file in ``self.batch_files``: read its last line to find an
        optional sleep time, copy the file into the local log directory, scp
        it to the master node, submit it there with ``condor_submit`` and
        sleep for the requested time. After every 100 batches the remote log
        is fetched with ``scp_log_back``.

        Afterwards the job count is polled every ``self.interval`` seconds
        until it reaches zero (fetching the log every 20 polls). After a
        further 120 seconds the queue is polled again until it is empty.
        """
        batch_index = 0
        # Defined before the loop so the post-loop check below also works
        # when there are no batch files at all.
        sleep_time = 0
        for batch in self.batch_files:

            tail = os.popen("tail -n 1 %s" % batch)
            try:
                last_line = tail.read()
            finally:
                tail.close()
            # if sleep time is specified
            if ("SLEEP" in last_line) or ("sleep" in last_line):
                # last item in the line
                sleep_time = int(last_line.split()[-1])
            else:
                sleep_time = 0

            # Copy the batch file to the log directory
            copy_string = "cp %s %s/" % (batch, self.config.log_dir)
            copy_cmd = Command(copy_string)
            code = copy_cmd.execute()
            if code != 0:
                LOG.error("Error occurred during copying batch file %s to the log directory" % (batch))

            # Scp this file to the master
            scp_string = "scp %s %s@%s:~/%s" % (batch, self.config.workload.user, self.master.dns, self.config.workload.submit_remote)
            scp_cmd = Command(scp_string)
            code = scp_cmd.execute()
            if code != 0:
                LOG.error("Error occurred during copying batch file %s to the master node" % (batch))

            # Send this batch to the work queue
            exec_cmd = RemoteCommand(
                config = self.config,
                hostname = self.master.dns,
                ssh_private_key = self.config.globals.priv_path,
                user = self.config.workload.user,
                command = 'condor_submit %s' % (self.config.workload.submit_remote))
            code = exec_cmd.execute()
            if code != 0:
                LOG.error("Error occurred during submission of batch file %s" % (batch))

            LOG.info("%s" % (batch))

            batch_index += 1

            # Sleep for a while if this is specified in the batch file
            time.sleep(sleep_time)

            # Periodic log saving, every 100 jobs
            if batch_index == 100:
                self.scp_log_back()
                batch_index = 0

        # To give it enough time so the jobs are scheduled; unless specified otherwise
        if sleep_time == 0:
            time.sleep(60)

        # After this for loop, go into monitor mode (run while there are jobs in the queue)
        LOG.info("Workload turns into monitor mode: this thread will stop when there are no more jobs in the queue. Sleep interval: %d" % (self.interval))
        jobs = Jobs(self.config, self.master.dns)
        count = jobs.get_current_number()
        print("Initial job count: %d" % (count))

        counter = 0
        while count > 0:
            time.sleep(self.interval)
            count = jobs.get_current_number()
            print("Job count: %d" % (count))
            counter += 1

            # Periodic log saving, every 20 iterations
            if counter == 20:
                self.scp_log_back()
                counter = 0

        # check that the queue is empty one more time
        time.sleep(120)
        LOG.info("Checking that the queue is empty one more time after 120 seconds")
        count = jobs.get_current_number()
        while count > 0:
            time.sleep(self.interval)
            count = jobs.get_current_number()
            print("Job count: %d" % (count))

        LOG.info("Workload completed")