예제 #1
0
    def update_current_list(self):
        """Refresh ``self.list`` from the job listing reported by the master.

        Runs ``self.command_job_list`` on ``self.master_dns`` as the workload
        user and splits its stdout on whitespace. Parsing starts one token
        before the first occurrence of the workload user name and consumes
        the output in records of six tokens; for each record a ``Job`` is
        built from tokens 0, 4 and 5 of the record. Token 5 is treated as the
        node name: nodes containing "uc" are counted as Hotel and nodes
        containing "sdcs" as Sierra in the printed summary.

        ``self.list`` is always reset to an empty list first, so it stays
        empty when there is no output or the user does not appear in it.
        """
        rcmd = RemoteCommand(
            config=self.config,
            hostname=self.master_dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command=self.command_job_list)
        rcmd.execute()
        queue_state = rcmd.stdout

        self.list = []
        if queue_state is None:
            return

        items = queue_state.split()
        user = self.config.workload.user
        if user not in items:
            return

        start = items.index(user) - 1
        if start < 0:
            # The user name is the very first token, so there is no leading
            # token to start a record from; a negative start would silently
            # wrap around to the end of the list.
            return

        record_width = 6
        uc = 0
        sdcs = 0
        # Stop before a trailing partial record so items[i + 5] always exists.
        for i in range(start, len(items) - record_width + 1, record_width):
            node = items[i + 5]
            self.list.append(Job(items[i], items[i + 4], node))
            if "uc" in node:
                uc += 1
            if "sdcs" in node:
                sdcs += 1
        total = len(self.list)
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Jobs: total: %d, Hotel: %d, Sierra: %d" % (total, uc, sdcs))
예제 #2
0
    def get_cloud_termination_list(self):
        """Return running instances of this cloud that are known Condor workers.

        Runs ``condor_status`` on the master and collects the first column of
        every non-blank line, skipping lines that start with "Name" and
        stopping at the first line that starts with "Total". Each running
        instance of ``self.cloud_name`` (other than the master) whose public
        DNS name contains one of those worker names is added to the result.

        Returns:
            list: the matching instance objects, as returned by
            ``Cloud.get_running_instances()``.
        """
        condor_list = []
        rcmd = RemoteCommand(
            config=self.config,
            hostname=self.master.dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command="condor_status")
        rcmd.execute()
        if rcmd.stdout:
            # condor_status prints one worker per line, e.g.
            # "vm-148-102.uc.futu LINUX      X86_64 Unclaimed Idle     0.150  2048  0+00:00:04"
            for line in rcmd.stdout.split("\n"):
                stripped = line.strip()
                if not stripped:
                    continue
                if stripped.startswith("Name"):
                    # header line
                    continue
                if stripped.startswith("Total"):
                    # summary section: no more worker lines follow
                    break
                # A non-blank line always has at least one column.
                condor_list.append(stripped.split()[0])
        LOG.info("Condor worker names: %s" % (str(condor_list)))

        # instances running in this cloud
        cloud = Cloud(self.cloud_name, self.config)
        vms = cloud.get_running_instances()

        # Only workers (never the master) that have checked in with the master
        # are eligible for termination.
        termination_list = []
        for instance in vms:
            if instance.public_dns_name == self.master.dns:
                continue
            for worker_partial_name in condor_list:
                if worker_partial_name in instance.public_dns_name:
                    termination_list.append(instance)
                    # Consume the name so it cannot match another instance.
                    # Breaking right after the removal keeps the mutation safe
                    # (the list is no longer being iterated) and guarantees an
                    # instance is appended at most once.
                    condor_list.remove(worker_partial_name)
                    break

        termination_list_names = [
            instance.public_dns_name for instance in termination_list]
        LOG.info("Termination list for %s: %s" % (self.cloud_name, str(termination_list_names)))

        return termination_list
예제 #3
0
    def get_current_number(self):
        """Return the job count reported by ``self.command_job_count``.

        The command is executed on ``self.master_dns`` as the workload user.
        Its stdout is converted with ``int()``; when stdout is empty or
        missing the result is 0.
        """
        remote = RemoteCommand(
            config=self.config,
            hostname=self.master_dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command=self.command_job_count)
        remote.execute()

        if remote.stdout:
            return int(remote.stdout)
        return 0
예제 #4
0
 def deploy_software(self):
     """Download and install BioPerf on every selected floating-IP instance.

     For each instance returned by ``cloud.get_all_floating_ips()`` for
     which ``self.database.check_benchmark(self.benchmark.name,
     instance.instance_id)`` is truthy, the instance is skipped (with an
     error log) if port 22 is not reachable within the configured
     ``ssh_timeout``; otherwise the deployment commands are run one by one
     through ``RemoteCommand``. A failing command is logged and the
     remaining commands are still attempted.
     """
     ssh_priv_key = self.config.globals.ssh_priv_key
     ssh_timeout = int(self.config.globals.ssh_timeout)

     # The command sequence does not depend on the instance, so build it once.
     cmds = [
         "wget %s" % (self.url),
         "sudo apt-get update",
         "sudo apt-get install unzip",
         "unzip BioPerf.zip",
         "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
         "./install-BioPerf.sh",
     ]

     for cloud in self.clouds:
         for instance in cloud.get_all_floating_ips():
             if not self.database.check_benchmark(self.benchmark.name,
                                                  instance.instance_id):
                 continue
             if not check_port_status(instance.ip, 22, ssh_timeout):
                 LOG.error("Deploy_software: the port 22 is not "
                           "available right now. please try it later")
                 continue
             for c in cmds:
                 command = RemoteCommand(instance.ip,
                                         ssh_priv_key, c)
                 command_return = command.execute()
                 if command_return != 0:
                     # %-formatting instead of "+": stdout/stderr may be
                     # None, which would make string concatenation raise
                     # TypeError while reporting the failure.
                     LOG.error("Deploy_software: %s" % (command.stdout,))
                     LOG.error("Deploy_software error: %s" %
                               (command.stderr,))
예제 #5
0
파일: clusters.py 프로젝트: suiy/automaton
 def excute_benchmarks(self,dataset_size):
     """Configure and run BioPerf on every selected instance.

     Args:
         dataset_size: "large" writes "C" and "medium" writes "B" as the
             first answer fed to ``run-bioperf.sh``; any other value
             writes "A".

     Instances are taken from ``self.reservations`` when it is non-empty,
     otherwise from ``cloud.conn.get_all_instances()`` of every cloud in
     ``self.clouds``. Only instances for which
     ``self.database.check_benchmark(self.benchmark.name, instance.id)`` is
     truthy are used. Each command is run through ``RemoteCommand``; a
     failing command is logged and the remaining commands still run.
     """
     ssh_priv_key = self.config.globals.ssh_priv_key

     if self.reservations:
         reservations = self.reservations
     else:
         # Accumulate across clouds; plain assignment inside the loop kept
         # only the reservations of the last cloud.
         reservations = list()
         for cloud in self.clouds:
             reservations.extend(cloud.conn.get_all_instances())

     # First answer piped into run-bioperf.sh, chosen by dataset size.
     answer = {"large": "C", "medium": "B"}.get(dataset_size, "A")

     # The command sequence does not depend on the instance, so build it once.
     cmds = [
         "sed -i '13c rm -f $BIOPERF/Outputs/log' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
         "sed -i '60c FASTA=0' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '62c GRAPPA=0' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "./BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
         "echo %s>c;echo H>>c;echo '1'>>c;echo Y>>c" % answer,
         "cat c|./BioPerf/Scripts/Run-scripts/run-bioperf.sh > ~/BioPerf/Outputs/log",
     ]

     for reservation in reservations:
         for instance in reservation.instances:
             if not self.database.check_benchmark(self.benchmark.name, instance.id):
                 continue
             for c in cmds:
                 command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                 command_return = command.execute()
                 if command_return != 0:
                     # %-formatting tolerates stdout/stderr being None,
                     # where "+" concatenation would raise TypeError.
                     LOG.error("Excute_benchmarks: %s" % (command.stdout,))
                     LOG.error("Excute_benchmarks: %s" % (command.stderr,))
예제 #6
0
파일: clusters.py 프로젝트: dmdu/automaton
 def excute_benchmarks(self):
     """Patch the BioPerf scripts and run the benchmark on selected instances.

     Instances are taken from ``self.reservations`` when it is non-empty,
     otherwise from ``cloud.conn.get_all_instances()`` of every cloud in
     ``self.clouds``. Only instances for which
     ``self.database.check_benchmark(self.benchmark.name, instance.id)`` is
     truthy are used. The ``sed`` commands overwrite fixed line numbers of
     ``CleanOutputs.sh`` and ``run-bioperf.sh``, after which the outputs are
     cleaned and ``run-bioperf.sh`` is run with its output redirected to
     ``~/BioPerf/Outputs/log``. A failing command is logged and the
     remaining commands still run.
     """
     ssh_priv_key = self.config.globals.ssh_priv_key

     if self.reservations:
         reservations = self.reservations
     else:
         # Accumulate across clouds; plain assignment inside the loop kept
         # only the reservations of the last cloud.
         reservations = list()
         for cloud in self.clouds:
             reservations.extend(cloud.conn.get_all_instances())

     # The command sequence does not depend on the instance, so build it once.
     cmds = [
         "sed -i '5c input='y'' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
         "sed -i '13c rm -f $BIOPERF/Outputs/log' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
         "sed -i '21c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '26c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '10c arch='X'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '71c input3='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '134c input='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "sed -i '145c user1='y'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
         "./BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
         "echo 'Y' 'Y'|./BioPerf/Scripts/Run-scripts/run-bioperf.sh > ~/BioPerf/Outputs/log",
     ]

     for reservation in reservations:
         for instance in reservation.instances:
             if not self.database.check_benchmark(self.benchmark.name, instance.id):
                 continue
             for c in cmds:
                 command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                 command_return = command.execute()
                 if command_return != 0:
                     # %-formatting tolerates stdout/stderr being None,
                     # where "+" concatenation would raise TypeError.
                     LOG.error("Excute_benchmarks: %s" % (command.stdout,))
                     LOG.error("Excute_benchmarks: %s" % (command.stderr,))
예제 #7
0
파일: clusters.py 프로젝트: dmdu/automaton
 def deploy_software(self):
     """Download and install BioPerf on every selected instance.

     Instances are taken from ``self.reservations`` when it is non-empty,
     otherwise from ``cloud.conn.get_all_instances()`` of every cloud in
     ``self.clouds``. Only instances for which
     ``self.database.check_benchmark(self.benchmark.name, instance.id)`` is
     truthy are used. An instance whose port 22 is not reachable within the
     configured ``ssh_timeout`` is skipped with an error log. A failing
     command is logged and the remaining commands still run.
     """
     ssh_priv_key = self.config.globals.ssh_priv_key
     ssh_timeout = int(self.config.globals.ssh_timeout)

     if self.reservations:
         reservations = self.reservations
     else:
         # Accumulate across clouds; plain assignment inside the loop kept
         # only the reservations of the last cloud.
         reservations = list()
         for cloud in self.clouds:
             reservations.extend(cloud.conn.get_all_instances())

     # The command sequence does not depend on the instance, so build it once.
     cmds = [
         "wget %s" % (self.url),
         "unzip BioPerf.zip",
         "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
         "./install-BioPerf.sh",
     ]

     for reservation in reservations:
         for instance in reservation.instances:
             if not self.database.check_benchmark(self.benchmark.name, instance.id):
                 continue
             if not check_port_status(instance.ip_address, 22, ssh_timeout):
                 LOG.error("Deploy_software: the port 22 is not available right now. please try it later")
                 continue
             for c in cmds:
                 command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                 command_return = command.execute()
                 if command_return != 0:
                     # %-formatting tolerates stdout/stderr being None,
                     # where "+" concatenation would raise TypeError.
                     LOG.error("Deploy_software: %s" % (command.stdout,))
                     LOG.error("Deploy_software error: %s" % (command.stderr,))
예제 #8
0
    def idle_workers_count(self):
        """Return the number of idle Condor workers reported by the master.

        Runs ``condor_status | grep Idle`` on the master and counts the
        whitespace-separated tokens of the output, assuming eight tokens per
        worker line. An error is logged when the token count is not a
        multiple of eight; the count is still returned, rounded down.

        Returns:
            int: 0 when the command produced no stdout, otherwise the token
            count divided by eight.
        """
        command = "condor_status | grep Idle"
        rcmd = RemoteCommand(
            config=self.config,
            hostname=self.master.dns,
            ssh_private_key=self.config.globals.priv_path,
            user=self.config.workload.user,
            command=command)
        rcmd.execute()
        out = rcmd.stdout

        if out is None:
            return 0

        # there are normally 8 items per line (i.e. per worker) in the condor_status output
        item_count = len(out.split())
        if item_count % 8 != 0:
            LOG.error("Number of items in the output of condor_status is not a multiple of 8")
        # Floor division keeps the result an integer on Python 2 and 3 alike.
        return item_count // 8
예제 #9
0
 def exec_cmd_on_condor_master(self, cmd_string):
     """Run ``cmd_string`` on the Condor master as the workload user.

     Returns the command's stdout when ``execute()`` reports a falsy exit
     code. Otherwise logs the command and its stderr as errors and returns
     None.
     """
     remote = RemoteCommand(
         config=self.config,
         hostname=self.master_dns,
         ssh_private_key=self.config.globals.priv_path,
         user=self.config.workload.user,
         command=cmd_string)
     exit_code = remote.execute()
     if exit_code:
         # Failure path: report what was attempted and what went wrong.
         LOG.error("Command failed: %s" % (cmd_string))
         LOG.error("Stderr: %s" % (remote.stderr))
         return None
     return remote.stdout
예제 #10
0
파일: clusters.py 프로젝트: suiy/automaton
    def deploy_software(self):
        """Install BioPerf and its databases on every selected instance.

        Instances are taken from ``self.reservations`` when it is non-empty,
        otherwise from ``cloud.conn.get_all_instances()`` of every cloud in
        ``self.clouds``. Only instances for which
        ``self.database.check_benchmark(self.benchmark.name, instance.id)`` is
        truthy are used. An instance whose port 22 is not reachable within the
        configured ``ssh_timeout`` is skipped with an error log.

        The command sequence wipes the remote home directory, downloads and
        installs BioPerf, fetches the Swissprot, Pfam and nr files, adds
        ``DATABASES`` to ``~/.profile``, patches the run scripts, and rebuilds
        ``promlk`` from the patched Phylip source. A failing command is logged
        and the remaining commands still run.
        """
        ssh_priv_key = self.config.globals.ssh_priv_key
        ssh_timeout = int(self.config.globals.ssh_timeout)

        if self.reservations:
            reservations = self.reservations
        else:
            # Accumulate across clouds; plain assignment inside the loop kept
            # only the reservations of the last cloud.
            reservations = list()
            for cloud in self.clouds:
                reservations.extend(cloud.conn.get_all_instances())

        # The command sequence does not depend on the instance, so build it once.
        cmds = [
            # NOTE: destructive - removes everything in the remote home directory.
            "rm -rf ~/*",
            "wget %s" % (self.url),
            "apt-get update",
            "apt-get install unzip",
            "unzip BioPerf.zip",
            "sed -i 's/read BIOPERF/#read BIOPERF/g' install-BioPerf.sh",
            "./install-BioPerf.sh",
            "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/swissprot.tar.gz",
            "tar -xvf swissprot.tar.gz",
            "mv Swissprot/* .",
            "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/Pfam",
            "wget ftp://ftp.cc.gatech.edu/pub/people/bader/BioPerf/nr",
            # The backslash is escaped explicitly; the resulting string is the
            # same as before but no longer relies on an invalid escape sequence.
            "sed -i '10 i\\DATABASES=~/' ~/.profile",
            "sed -i '10 i\\export DATABASES' ~/.profile",
            "sed -i '5c input='y'' ~/BioPerf/Scripts/Run-scripts/CleanOutputs.sh",
            "sed -i '21c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
            "sed -i '26c #' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
            "sed -i '10c arch='X'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
            "sed -i '71c input3='A'' ~/BioPerf/Scripts/Run-scripts/run-bioperf.sh",
            '''sed -i "659c ch='Y';" ./BioPerf/Source-codes/Phylip/src/promlk.c''',
            "sed -i '/scanf/c //' ./BioPerf/Source-codes/Phylip/src/promlk.c",
            "cd ./BioPerf/Source-codes/Phylip/src;make promlk;cd ~",
            "mv ./BioPerf/Source-codes/Phylip/src/promlk ./BioPerf/Binaries/x86-Binaries/Phylip",
        ]

        for reservation in reservations:
            for instance in reservation.instances:
                if not self.database.check_benchmark(self.benchmark.name, instance.id):
                    continue
                if not check_port_status(instance.ip_address, 22, ssh_timeout):
                    LOG.error("Deploy_software: the port 22 is not available right now. please try it later")
                    continue
                for c in cmds:
                    command = RemoteCommand(instance.public_dns_name, ssh_priv_key, c)
                    command_return = command.execute()
                    if command_return != 0:
                        # %-formatting tolerates stdout/stderr being None,
                        # where "+" concatenation would raise TypeError.
                        LOG.error("Deploy_software: %s" % (command.stdout,))
                        LOG.error("Deploy_software error: %s" % (command.stderr,))
예제 #11
0
    def terminate_condor(self, master_dns):
        """Stop the Condor daemon of this worker via ``condor_off -fast``.

        The command is executed on ``master_dns`` and targets ``self.dns``.
        The outcome is logged: info when the exit code is 0, error otherwise.
        Nothing is returned.
        """
        condor_off = "condor_off -fast %s" % (self.dns)
        remote = RemoteCommand(
            config=self.config,
            hostname=master_dns,
            ssh_private_key=self.config.globals.priv_path,
            user="******",
            command=condor_off,
        )
        exit_code = remote.execute()
        if exit_code != 0:
            LOG.error(
                "Error occurred during Condor daemon termination on worker %s instance: %s" % (self.dns, self.instance)
            )
            return
        LOG.info("Successfully stopped Condor daemon on worker %s instance id : %s" % (self.dns, self.instance))
예제 #12
0
    def offline(self, master_dns):
        """Mark this worker offline via ``condor_off -peaceful``.

        Marking a node offline has to be done from the master side, so the
        command is executed on ``master_dns`` and targets ``self.dns``. When
        ``master_dns`` is falsy only an error is logged. Otherwise the outcome
        is logged: info when the exit code is 0, error otherwise.
        """
        if not master_dns:
            LOG.error("Can't mark instance offline without master's dns")
            return

        condor_off = "condor_off -peaceful %s" % (self.dns)
        remote = RemoteCommand(
            config=self.config,
            hostname=master_dns,
            ssh_private_key=self.config.globals.priv_path,
            user="******",
            command=condor_off,
        )
        exit_code = remote.execute()
        if exit_code != 0:
            LOG.error("Error occurred during marking instance offline: %s" % (self.instance))
            return
        LOG.info("Successfully marked instance offline: %s" % (self.instance))
예제 #13
0
    def run(self):
        """Submit every batch file to the master, then wait for the queue to drain.

        For each file in ``self.batch_files``:
          * read its last line; when it contains "SLEEP" or "sleep", the last
            token of that line is the number of seconds to sleep after
            submitting this batch (otherwise 0);
          * copy the file to ``self.config.log_dir``;
          * scp it to the master as ``self.config.workload.submit_remote``;
          * run ``condor_submit`` on it on the master;
          * call ``self.scp_log_back()`` after every 100 submitted batches.

        Afterwards the method polls ``Jobs.get_current_number()`` every
        ``self.interval`` seconds until it reports 0 (calling
        ``self.scp_log_back()`` every 20 polls), waits 120 seconds and polls
        again until the count is 0 once more.
        """
        batch_index = 0
        # Defined before the loop so the check after it does not raise
        # NameError when there are no batch files at all.
        sleep_time = 0
        for batch in self.batch_files:

            # Close the pipe explicitly instead of leaking it on every batch.
            pipe = os.popen("tail -n 1 %s" % batch)
            try:
                last_line = pipe.read()
            finally:
                pipe.close()

            # if sleep time is specified it is the last item in the line
            if ("SLEEP" in last_line) or ("sleep" in last_line):
                sleep_time = int(last_line.split()[-1])
            else:
                sleep_time = 0

            # Copy the batch file to the log directory
            copy_string = "cp %s %s/" % (batch, self.config.log_dir)
            copy_cmd = Command(copy_string)
            code = copy_cmd.execute()
            if code != 0:
                # Previously ignored silently; report it like the other steps.
                LOG.error("Error occurred during copying batch file %s to the log directory" % (batch))

            # Scp this file to the master
            scp_string = "scp %s %s@%s:~/%s" % (batch, self.config.workload.user, self.master.dns, self.config.workload.submit_remote)
            scp_cmd = Command(scp_string)
            code = scp_cmd.execute()
            if code != 0:
                LOG.error("Error occurred during copying batch file %s to the master node" % (batch))

            # Send this batch to the work queue
            exec_cmd = RemoteCommand(
                config=self.config,
                hostname=self.master.dns,
                ssh_private_key=self.config.globals.priv_path,
                user=self.config.workload.user,
                command='condor_submit %s' % (self.config.workload.submit_remote))
            code = exec_cmd.execute()
            if code != 0:
                LOG.error("Error occurred during submission of batch file %s" % (batch))

            LOG.info("%s" % (batch))

            batch_index += 1

            # Sleep for a while if this is specified in the batch file
            time.sleep(sleep_time)

            # Periodic log saving, every 100 jobs
            if batch_index == 100:
                self.scp_log_back()
                batch_index = 0

        # To give it enough time so the jobs are scheduled; unless the last
        # batch already specified its own sleep time
        if sleep_time == 0:
            time.sleep(60)

        # After this for loop, go into monitor mode (run while there are jobs in the queue)
        LOG.info("Workload turns into monitor mode: this thread will stop when there are no more jobs in the queue. Sleep interval: %d" % (self.interval))
        jobs = Jobs(self.config, self.master.dns)
        count = jobs.get_current_number()
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Initial job count: %d" % (count))

        counter = 0
        while count > 0:
            time.sleep(self.interval)
            count = jobs.get_current_number()
            print("Job count: %d" % (count))
            counter += 1

            # Periodic log saving, every 20 iterations
            if counter == 20:
                self.scp_log_back()
                counter = 0

        # check that the queue is empty one more time
        time.sleep(120)
        LOG.info("Checking that the queue is empty one more time after 120 seconds")
        count = jobs.get_current_number()
        while count > 0:
            time.sleep(self.interval)
            count = jobs.get_current_number()
            print("Job count: %d" % (count))

        LOG.info("Workload completed")