示例#1
0
 def troubleshoot_server(self):
     """
     Try to recover a job whose server status indicates a problem.

     Nothing is done unless ``self.settings['ssh']`` is truthy. The cluster
     software of ``self.server`` is matched case-insensitively:

     - OGE: the job is deleted from the queue, the server is queried for
       nodes, the first node whose line contains ``0/0/8`` and which is not
       already listed in ``self.server_nodes`` is recorded there, the remote
       submit file is edited to request that node, and ``self.run()`` is
       called. If no such node is found an error is logged and the method
       returns without resubmitting.
     - Slurm: the job is deleted from the queue and ``self.run()`` is called
       (changing the node is not implemented for Slurm).

     Any other cluster software is left untouched.
     """
     if not self.settings['ssh']:
         return
     # Look the scheduler name up once; the raw value is the key of the
     # per-scheduler command / filename tables, the lowered value is only
     # used to pick the branch.
     cluster_soft = servers[self.server]['cluster_soft']
     scheduler = cluster_soft.lower()
     if scheduler == 'oge':
         # delete present server run
         logging.error(
             'Job {name} has server status {stat} on {server}. Troubleshooting by changing node.'
             .format(name=self.job_name,
                     stat=self.job_status[0],
                     server=self.server))
         ssh = SSH_Client(self.server)
         ssh.send_command_to_server(
             command=delete_command[cluster_soft] + ' ' + str(self.job_id))
         # find available nodes
         stdout, _ = ssh.send_command_to_server(
             command=list_available_nodes_command[cluster_soft])
         node = None
         for line in stdout:
             fields = line.split()
             if not fields:
                 # blank line: nothing to parse
                 continue
             # The first token is expected to look like '<...>node<N>.<domain>';
             # skip lines (headers, separators) that do not contain 'node'.
             pieces = fields[0].split('.')[0].split('node')
             if len(pieces) < 2:
                 continue
             candidate = pieces[1]
             # We are already inside the OGE branch, so no further check of
             # the cluster software is needed here.
             # NOTE(review): '0/0/8' is presumably the slot usage column of
             # a completely free 8-slot node - confirm against the server.
             if '0/0/8' in line and candidate not in self.server_nodes:
                 node = candidate
                 self.server_nodes.append(node)
                 break
         else:
             logging.error(
                 'Could not find an available node on the server')
             # TODO: continue troubleshooting; if all else fails, put job to sleep for x min and try again searching for a node
             return
         # modify submit file
         submit_name = submit_filename[cluster_soft]
         content = ssh.read_remote_file(remote_path=self.remote_path,
                                        filename=submit_name)
         directive = '#$ -l h=node{0}.cluster'.format(node)
         for i, line in enumerate(content):
             if '#$ -l h=node' in line:
                 # Keep the line terminator of the line being replaced;
                 # the list is joined with '' below, so dropping it would
                 # fuse the directive with the following line.
                 ending = line[len(line.rstrip('\r\n')):]
                 content[i] = directive + ending
                 break
         else:
             # No node request present yet: add one, terminated like the
             # existing lines so it stays on a line of its own.
             ending = '\n' if any(
                 existing.endswith('\n') for existing in content) else ''
             content.insert(7, directive + ending)
         content = ''.join(
             content
         )  # convert list into a single string, not to upset paramiko
         # resubmit
         ssh.upload_file(remote_file_path=os.path.join(self.remote_path,
                                                       submit_name),
                         file_string=content)
         self.run()
     elif scheduler == 'slurm':
         # TODO: change node on Slurm
         # delete present server run
         logging.error(
             'Job {name} has server status {stat} on {server}. Re-running job.'
             .format(name=self.job_name,
                     stat=self.job_status[0],
                     server=self.server))
         ssh = SSH_Client(self.server)
         ssh.send_command_to_server(
             command=delete_command[cluster_soft] + ' ' + str(self.job_id))
         # resubmit
         self.run()
示例#2
0
 def _upload_submit_file(self):
     """
     Upload the submit script held in ``self.submit`` to the server.

     The remote directory ``self.remote_path`` is created first (with
     ``mkdir -p``); the file name is taken from ``submit_filename`` for the
     cluster software of ``self.server``.
     """
     client = SSH_Client(self.server)
     # make sure the target directory exists before uploading into it
     make_dir_command = 'mkdir -p {0}'.format(self.remote_path)
     client.send_command_to_server(command=make_dir_command)
     cluster_soft = servers[self.server]['cluster_soft']
     destination = os.path.join(self.remote_path,
                                submit_filename[cluster_soft])
     client.upload_file(remote_file_path=destination,
                        file_string=self.submit)
示例#3
0
 def _upload_input_file(self):
     """
     Upload the input file held in ``self.input`` to the server.

     The remote directory ``self.remote_path`` is created first (with
     ``mkdir -p``); the file name is taken from ``input_filename`` for
     ``self.software``. After the upload, ``self.initial_time`` is set to
     the last-modified time the server reports for the uploaded file.
     """
     client = SSH_Client(self.server)
     # make sure the target directory exists before uploading into it
     make_dir_command = 'mkdir -p {0}'.format(self.remote_path)
     client.send_command_to_server(command=make_dir_command)
     destination = os.path.join(self.remote_path,
                                input_filename[self.software])
     client.upload_file(remote_file_path=destination,
                        file_string=self.input)
     # remember when the input file landed on the server
     self.initial_time = client.get_last_modified_time(
         remote_file_path=destination)