def start_node(node):
    """Deploy the ``node`` executable on a remote host and run it there.

    Connects over SSH, removes any previous ``node`` file, downloads a fresh
    copy, makes it executable and runs it with this node's id, its
    neighbours (JSON-encoded) and the monitor address. Progress and errors
    are printed; nothing is returned.

    Args:
        node: mapping providing at least the keys ``"id"`` and ``"host"``.
    """
    print("Connecting to node%s with hostname %s." % (node["id"], node["host"]))
    try:
        remote = SshMachine(
            node["host"],
            port=22022,
            user=username,
            keyfile=path_to_keyfile,
            ssh_opts=["-o", "StrictHostKeyChecking=no"],
        )
    except Exception as e:
        print("Could not connect to %s: %s" % (node["host"], e))
        return

    # The download and chmod steps may raise; the SSH session must be
    # released in every case, hence the try/finally.
    try:
        print("[%s]Connected" % node["id"])
        try:
            remote["rm"]("node")
        except commands.processes.ProcessExecutionError:
            # Best effort: presumably there is simply no old file to remove.
            pass

        print("[%s]Downloading application..." % node["id"])
        remote["wget"]("-O", "node", "https://www.dropbox.com/s/mjw7dic2ywk5jrp/node")
        remote["chmod"]("u+x", "node")

        print("[%s]Starting python node..." % node["id"])
        try:
            remote["./node"](
                "--id", "%s" % (node["id"]),
                "--neighbours", json.dumps(neighbourhood[node["id"]]),
                "%s:%s" % (monitor["host"], monitor["tcp_port"]),
            )
        except commands.processes.ProcessExecutionError as e:
            print("[%s]Got an exception: %s" % (node["id"], e))
    finally:
        remote.close()
def open_db(urlstr=None):
    """Yield ``(urlstr, machine, path)`` for a database location.

    When the URL has a scheme, an ``SshMachine`` is opened to the URL's host
    and closed again once the generator finishes; otherwise the ``local``
    machine is used and left untouched.

    NOTE(review): the generator shape suggests this is wrapped by
    ``contextlib.contextmanager`` where it is defined - confirm; no decorator
    is visible here.

    Args:
        urlstr: database URL or plain path. When falsy, the value is taken
            from ``_get_db_url``.

    Yields:
        A 3-tuple of the resolved URL string, the machine object and the
        machine-specific path for the URL's path component.
    """
    urlstr = urlstr or _get_db_url(urlstr)
    url = urlparse(urlstr)
    is_remote = bool(url.scheme)
    if is_remote:
        machine = SshMachine(url.hostname, user=url.username, port=url.port)
    else:
        machine = local
    try:
        yield urlstr, machine, machine.path(url.path)
    finally:
        # Runs even when the consumer raises, so the SSH session never leaks.
        if is_remote:
            machine.close()
def initializeDBTier(self):
    """Restart and reload the PostgreSQL server on the remote ``dbNode``.

    Opens an SSH session to ``self.dbNode``, invokes the ``REMOTE_POSTGRES``
    command with ``restart`` and then ``reload``, and prints the remote
    working directory and its ``ls`` output. The session is closed even if
    one of the remote commands fails.
    """
    remote = SshMachine(self.dbNode, user="******")
    try:
        logger.info("Remote connection established to dbNode")
        r_postgres = remote[REMOTE_POSTGRES]
        r_postgres("restart")
        r_postgres("reload")
        logger.info("Remote postgresql server restarted")
        print(remote.cwd)
        r_ls = remote["ls"]
        print(r_ls())
    finally:
        remote.close()
class SshHost(BaseHost):
    """Host reached over SSH; the deployment is created on first connect."""

    def __init__(self, **kwargs):
        BaseHost.__init__(self, **kwargs)
        self._mach = None
        self.deployment = None

    def connect(self):
        """Return ``classic_connect()`` of the deployment, creating it on demand.

        ``self.kwargs`` is presumably stored by ``BaseHost.__init__`` and is
        forwarded unchanged to ``SshMachine`` - verify against the base class.
        """
        if self.deployment is None:
            mach = SshMachine(**self.kwargs)
            try:
                self.deployment = DeployedServer(mach)
            except Exception:
                # Do not leak the SSH session when deployment fails.
                mach.close()
                raise
            self._mach = mach
        return self.deployment.classic_connect()

    def close(self):
        """Close the deployment and the SSH machine, if any; safe to repeat."""
        deployment, mach = self.deployment, self._mach
        # Reset first so a second close() is a no-op and a later connect()
        # builds a fresh deployment instead of reusing a closed one.
        self.deployment = None
        self._mach = None
        try:
            if deployment is not None:
                deployment.close()
        finally:
            if mach is not None:
                mach.close()
def kill_node(node):
    """Run ``killall node`` on the remote host of ``node`` and report the result.

    Connection and kill failures are printed rather than raised.

    Args:
        node: mapping providing at least the keys ``"id"`` and ``"host"``.
    """
    print("Killing node%s" % node["id"])
    try:
        remote = SshMachine(
            node["host"],
            port=22022,
            user=username,
            keyfile=path_to_keyfile,
            ssh_opts=["-o StrictHostKeyChecking=no"],
        )
    except Exception as e:
        print("Could not connect to %s: %s" % (node["host"], e))
        return

    try:
        print(remote["killall"]("node"))
    except Exception:
        # Deliberately best effort, but no longer swallows KeyboardInterrupt
        # or SystemExit the way the former bare ``except:`` did.
        print("Could not kill node%s" % node["id"])
    else:
        print("Node%s killed!" % node["id"])
    finally:
        remote.close()
def startExperiment(gpu, session, command, project_dir, repo_ssh_string,
                    update_repo=True, rebuild_docker=False, branch=None):
    """
    Helper function to start an experiment remotely.

    Requirements:
        <project_dir>/docker/run.sh must exist and, as invoked here, receives
        three arguments:
            1. The GPU number (as a string)
            2. The name of the docker container to be created
            3. The command to be executed
        <project_dir>/docker/build.sh must exist

        Also, the project directory should be located in the home directory.

    Args:
        gpu (dict): Must provide 'server' (host to connect to) and 'gpu_nr'
            (GPU number handed to run.sh)
        session (string): Name of the container to be created
        command (string): Command to be executed
        project_dir (string): Name of the project directory
        repo_ssh_string (string): Forwarded to `updateRepo`
        update_repo (bool): If True, call `updateRepo` before starting
        rebuild_docker (bool): If True, run docker/build.sh before starting
        branch (string): Forwarded to `updateRepo`
    """
    remote = SshMachine(gpu['server'])
    try:
        # Working directory right after login; the project is expected below it.
        home_dir = remote.cwd
        project_path = home_dir / project_dir
        r_runDocker = remote[project_path / "docker/run.sh"]
        r_buildDocker = remote[project_path / "docker/build.sh"]

        killRunningSession(remote, session)

        if update_repo:
            updateRepo(remote, project_dir, repo_ssh_string, branch=branch)

        if rebuild_docker:
            print("Building container...")
            with remote.cwd(project_path):
                r_buildDocker()
            print("Done.")

        with remote.cwd(project_path):
            r_runDocker(str(gpu['gpu_nr']), session, command)
    finally:
        # Release the SSH session even if one of the remote steps failed.
        remote.close()
def test_deploy(self):
    """Deploy a server on localhost over SSH and check it dies with the deployment.

    While the ``DeployedServer`` context is open, a remote call must work;
    after the context has exited, the same call is expected to raise
    ``EOFError``.
    """
    rem = SshMachine("localhost")
    try:
        # Class-level override, kept from the original test.
        SshMachine.python = rem[sys.executable]
        with DeployedServer(rem) as dep:
            conn = dep.classic_connect()
            print(conn.modules.sys)
            func = conn.modules.os.getcwd
            print(func())

        try:
            func()
        except EOFError:
            pass
        else:
            self.fail("expected an EOFError")
    finally:
        # Close the machine even when an assertion above fails.
        rem.close()
def start_node(node):
    """Connect to the host of ``node`` and run ``killall node`` there.

    Connection and kill failures are printed rather than raised.

    Args:
        node: mapping providing at least the keys ``"id"`` and ``"host"``.
    """
    print("Connecting to node%s with hostname %s" % (node["id"], node["host"]))
    try:
        remote = SshMachine(node["host"], port=22022, user=username,
                            keyfile=path_to_keyfile)
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C still interrupts.
        print("Could not connect to %s" % node)
        return

    try:
        print("[%s]Connected" % node["id"])
        print("[%s]Killing python..." % node["id"])
        try:
            remote["killall"]("node")
        except Exception as e:
            print("[%s]Exception: %s" % (node["id"], e))
            print("[%s]Python could not get killed" % node["id"])
    finally:
        remote.close()
class SshHost(BaseHost):
    """Host reached over SSH; the deployment is created on first connect."""

    def __init__(self, **kwargs):
        BaseHost.__init__(self, **kwargs)
        self._mach = None
        self.deployment = None

    def connect(self):
        """Return ``classic_connect()`` of the deployment, creating it on demand.

        ``self.kwargs`` is presumably stored by ``BaseHost.__init__`` and is
        forwarded unchanged to ``SshMachine`` - verify against the base class.
        """
        if self.deployment is None:
            mach = SshMachine(**self.kwargs)
            try:
                self.deployment = DeployedServer(mach)
            except Exception:
                # Do not leak the SSH session when deployment fails.
                mach.close()
                raise
            self._mach = mach
        return self.deployment.classic_connect()

    def close(self):
        """Close the deployment and the SSH machine, if any; safe to repeat."""
        deployment, mach = self.deployment, self._mach
        # Reset first so a second close() is a no-op and a later connect()
        # builds a fresh deployment instead of reusing a closed one.
        self.deployment = None
        self._mach = None
        try:
            if deployment is not None:
                deployment.close()
        finally:
            if mach is not None:
                mach.close()
def updateRepo(servername, project_dir, repo_ssh_string, branch):
    """
    Helper function to pull newest commits to remote repo.

    Fetches `origin` in the remote project directory, checks out `branch`,
    hard-resets it to `origin/<branch>` and then verifies that the remote
    HEAD equals the HEAD reported by the local `git` command.

    Args:
        servername (string): Host passed to `SshMachine`
        project_dir (string): Project directory, relative to the remote
            working directory right after login
        repo_ssh_string (string): Not used by this function
        branch (string): Branch to check out and reset

    Raises:
        AssertionError: If the local and remote commit hashes differ.
    """
    remote = SshMachine(servername)
    try:
        r_git = remote['git']
        project_path = remote.cwd / project_dir

        print("Updating repo...", end='')
        with remote.cwd(project_path):
            r_git('fetch', '-q', 'origin')
            r_git('checkout', branch)
            r_git('reset', '--hard', 'origin/{}'.format(branch), '-q')
            r_head = r_git('rev-parse', 'HEAD')

        # Check that the remote checkout is at the same commit as the local one.
        l_head = git('rev-parse', 'HEAD')
        if l_head != r_head:
            # Raised explicitly (same exception type as before) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(
                "Local git hash != pushed git hash. Did you forget to push changes?"
            )
        print("Repo updated")
    finally:
        # Release the SSH session even when git or the hash check fails.
        remote.close()
class SshRunner:
    """Context manager that opens an SSH session to a server.

    On entry it connects, optionally changes the remote working directory
    and, for non-root sessions, looks up the ``docker-compose``, ``git`` and
    ``pip`` remote commands. On exit the session is closed.
    """

    _server: Server
    _chdir: str
    _root: bool
    # Set by __enter__:
    cmd: SshMachine
    # Set by __enter__ for non-root sessions only:
    docker_compose: RemoteCommand
    git: RemoteCommand
    pip: RemoteCommand

    def __init__(self, server: Server, chdir="", root=False):
        self._server = server
        self._chdir = chdir
        self._root = root

    def __enter__(self):
        username = "******" if self._root else self._server.username
        self.cmd = SshMachine(self._server.host, user=username)
        try:
            if self._chdir:
                if self._chdir.startswith("/"):
                    self.cmd.cwd.chdir(self._chdir)
                else:
                    self.cmd.cwd.chdir(self.cmd.cwd / self._chdir)
            if not self._root:
                self.docker_compose = self.cmd["docker-compose"]
                self.git = self.cmd["git"]
                self.pip = self.cmd["pip"]
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so the session
            # has to be released here before re-raising.
            self.cmd.close()
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cmd.close()
def inspect_gpus(servers,
                 own_username='******',
                 verbose=True,
                 needed_gpus=-1,
                 memory_threshold=1200,
                 gpu_util_threshold=5,
                 allow_lightly_used_gpus=True,
                 share_with=('all',),
                 max_nr_processes=6,
                 upper_memory_threshold=5500,
                 upper_gpu_util_threshold=60,
                 average=3,
                 get_all=False):
    """
    Scan servers for free GPUs, print availability and return a list of free
    GPUs that can be used to start jobs on them.

    Requirements:
        ~/.ssh/config needs to be set up so that connecting via `ssh <server>`
        works. For OSX, an entry can look like this:

        Host mulga
            User maxigl
            HostName mulga.cs.ox.ac.uk
            BatchMode yes
            ForwardAgent yes
            StrictHostKeyChecking no
            AddKeysToAgent yes
            UseKeychain yes
            IdentityFile ~/.ssh/id_rsa

    Args:
        servers (list of strings): List of servers to scan
        own_username (string): Currently unused
        verbose (bool): If True, also print who is using the GPUs
        needed_gpus (int): Stop scanning once this many usable GPUs were
            found. A negative value scans all servers.
        memory_threshold (int):
        gpu_util_threshold (int):
            When used memory < memory_threshold and
            GPU utilisation < gpu_util_threshold,
            then the GPU is regarded as free.
        allow_lightly_used_gpus (bool):
        share_with (sequence of strings):
        max_nr_processes (int):
        upper_memory_threshold (int):
        upper_gpu_util_threshold (int):
            If `allow_lightly_used_gpus=True`, memory and gpu utilisation are
            under the upper thresholds, fewer than `max_nr_processes`
            processes run on that GPU and either 'all' is in `share_with` or
            the first user found on the GPU is in `share_with`, then the GPU
            is added to the list of GPUs that can be used to start jobs.
        average (int): Number of nvidia-smi samples (one second apart) that
            utilisation and used memory are averaged over
        get_all (bool): If True, scan every server regardless of `needed_gpus`

    Return:
        free_gpus: List of dictionaries, each containing the following keys:
            'server': Name of the server
            'gpu_nr': Number of the free GPU
            'occupation': Number of processes already found on that GPU
                (0 for a free GPU, see `allow_lightly_used_gpus`)
    """
    print((color_free | "Free" + " | ") + (color_me | "Own" + " | ") +
          (color_other | "Other" + " | ") +
          (color_other_light | "Other (light)"))

    fieldnames = [
        'index', 'gpu_uuid', 'memory.total', 'memory.used',
        'utilization.gpu', 'gpu_name'
    ]

    all_free_gpus = []
    for server in servers:
        # Stop early once enough GPUs were found, unless a full scan is wanted.
        if (not get_all and needed_gpus >= 0
                and len(all_free_gpus) >= needed_gpus):
            break

        print("{:7}: ".format(server), end='')
        try:
            remote = SshMachine(server)
        except plumbum.machines.session.SSHCommsError:
            print(
                "ssh fail - server not in .ssh/config? See doctring of this function."
            )
            continue

        # Everything below talks to the server; always release the session.
        try:
            r_smi = remote["nvidia_smi"]
            r_ps = remote["ps"]

            # Sample the GPUs `average` times and accumulate the mean of
            # utilisation and used memory in the first sample's records.
            averaged_gpu_data = []
            for avg_idx in range(average):
                output = r_smi("--query-gpu=" + ",".join(fieldnames),
                               "--format=csv,noheader,nounits").replace(" ", "")
                gpu_data = [
                    {name: int(x) if x.strip().isdigit() else x
                     for x, name in zip(line.split(","), fieldnames)}
                    for line in output.splitlines()
                ]
                if avg_idx == 0:
                    averaged_gpu_data = gpu_data
                    for data in averaged_gpu_data:
                        data['utilization.gpu'] /= average
                        data['memory.used'] /= average
                else:
                    for gpu_idx, data in enumerate(gpu_data):
                        averaged = averaged_gpu_data[gpu_idx]
                        averaged['utilization.gpu'] += (
                            data['utilization.gpu'] / average)
                        averaged['memory.used'] += data['memory.used'] / average
                # Wait between samples only; no need to sleep after the last.
                if avg_idx < average - 1:
                    time.sleep(1)
            gpu_data = averaged_gpu_data

            # Find processes and users
            for data in gpu_data:
                data['nr_processes'] = 0
                data['users'] = []

            output = r_smi("--query-compute-apps=pid,gpu_uuid",
                           "--format=csv,noheader,nounits").replace(" ", "")
            for line in output.splitlines():
                process = [
                    int(x) if x.strip().isdigit() else x
                    for x in line.split(",")
                ]
                pid = process[0]
                serial = process[1]
                # Second line of `ps -u -p <pid>`, first column: the owner.
                user = (r_ps['-u', '-p'] | sed['-n', '2p']
                        | awk['{{print $1}}'])(pid)
                for data in gpu_data:
                    if data['gpu_uuid'] == serial:
                        data['users'].append(user.strip())
                        data['nr_processes'] += 1

            gpu_numbers = []
            gpu_status = []
            free_gpus = []

            for data in gpu_data:
                status = "\t" + str(data['index']) + ": "
                if (data['memory.used'] < memory_threshold
                        and data['utilization.gpu'] < gpu_util_threshold):
                    # Below both lower thresholds: regarded as free.
                    status += "free"
                    status = color_free | status
                    gpu_numbers.append(color_free | str(data['index']))
                    free_gpus.append({
                        'server': server,
                        'gpu_nr': data['index'],
                        'occupation': 0
                    })
                else:
                    status += "in use - {:4}% gpu - {:5}% mem - {}".format(
                        str(data['utilization.gpu'])[:4],
                        str(data['memory.used'] / data['memory.total'])[:4],
                        str(data['users']))

                    if 'all' in share_with:
                        share = True
                    else:
                        # A busy GPU may have no user attributed to it (no
                        # compute process listed); do not share in that case
                        # instead of failing with an IndexError.
                        share = (bool(data['users'])
                                 and data['users'][0] in share_with)

                    if (allow_lightly_used_gpus
                            and data['memory.used'] < upper_memory_threshold
                            and data['utilization.gpu'] < upper_gpu_util_threshold
                            and data['nr_processes'] < max_nr_processes
                            and share):
                        free_gpus.append({
                            'server': server,
                            'gpu_nr': data['index'],
                            'occupation': data['nr_processes']
                        })
                        gpu_numbers.append(
                            color_other_light | str(data['index']))
                        status = color_other_light | status
                    else:
                        gpu_numbers.append(color_other | str(data['index']))
                        status = color_other | status
                gpu_status.append(status)

            all_free_gpus += free_gpus
            print(" ".join(gpu_numbers) +
                  " | {} free | {} total".format(len(free_gpus),
                                                 len(all_free_gpus)))

            if verbose:
                # Guard against a server that reported no GPU at all.
                if gpu_data:
                    print("\t [{} - {} GB]".format(
                        gpu_data[0]['gpu_name'],
                        gpu_data[0]['memory.total'] // 1000))
                for s in gpu_status:
                    print(s)
        finally:
            remote.close()

    return all_free_gpus
class VM:
    """
    Manage a qemu-system virtual machine.

    It will be started with `__init__` and the ssh connection will be
    established with `__enter__`, so any ssh operation should be done inside
    a `with` block. A CPU allocation can also be provided, to bind the cores
    of the virtual machine to physical cores.

    :ivar ssh: plumbum.SshMachine object, useful to run commands on the VM.
               It should only be used inside a `with` block.
    :ivar process: popen process of qemu, useful to send signals or input to
                   qemu.

    :example:
        with VM('bzImage', 'debian.img', '~/.ssh/id_rsa') as vm:
            vm.ssh['ls']
    """

    def __init__(self, kernel_path, filesystem_img_path, keyfile,
                 cpu_allocation=None, isolcpus=()):
        """Start the qemu VM (non blocking)

        :param kernel_path: Path of the kernel's bzImage
        :param filesystem_img_path: Path of the filesystem image (.img)
        :param keyfile: Path of rsa key that is authorized on the image
        :param cpu_allocation: CpuAllocation for qemu and the vm's cores,
                               or None to not assign CPUs
        :param isolcpus: list of CPU ID that should be isolated at boot time
        """
        qemu_args = VM.__construct_qemu_args(
            kernel_path=kernel_path,
            filesystem_img_path=filesystem_img_path,
            isolcpus=isolcpus)
        self.process = local['qemu-system-x86_64'].popen(qemu_args)
        self.ssh = None
        self.key = keyfile
        if cpu_allocation:
            VM.__qemu_affinity_setup(self.process.pid, cpu_allocation)

    def __enter__(self):
        """Initialize the ssh connection (blocks until success)

        One connection attempt is made per second, at most SSH_MAX_RETRY
        times.

        :raises VMException: when every attempt failed; the last error is
                             passed along as second argument
        """
        err = None
        for _ in range(SSH_MAX_RETRY):
            time.sleep(1)
            try:
                self.ssh = SshMachine('127.0.0.1', user='******',
                                      port=HOST_PORT, keyfile=self.key)
                break
            except (EOFError, plumbum.machines.session.SSHCommsError) as e:
                err = e
        else:
            # Reached maximum retries
            raise VMException('SSH connection failed after too many retries',
                              err)
        return self

    def __exit__(self, type, value, traceback):
        """Stop the SSH connection and the VM"""
        try:
            if self.ssh is not None:
                self.ssh.close()
        finally:
            # The VM must be stopped even if closing the ssh session fails.
            self.ssh = None
            self.process.terminate()

    def scp_to(self, src_local, dst_remote):
        """Send a file from the host to the VM

        :param src_local: local path of the file to send
        :param dst_remote: destination path on the vm
        :raises VMException: when the ssh connection is not established,
                             i.e. when not used inside a `with` block
        """
        if self.ssh is None:
            raise VMException(
                '`VM.scp_to` must be used with an established SSH connection, '
                'i.e. inside a `with` block.')
        src = local.path(src_local)
        dst = self.ssh.path(dst_remote)
        plumbum.path.utils.copy(src, dst)

    @staticmethod
    def __construct_qemu_args(kernel_path, filesystem_img_path, isolcpus=()):
        """Qemu arguments similar to what `vm start` produces"""
        kernel_opt = ''
        if isolcpus:
            kernel_opt = ' isolcpus=' + ','.join(map(str, isolcpus))

        return [
            '-nographic', '-s',
            '-machine', 'accel=kvm',
            '-cpu', 'host',
            '-device', 'e1000,netdev=net0',
            '-netdev', 'user,id=net0,hostfwd=tcp::%d-:22' % HOST_PORT,
            '-append',
            'console=ttyS0,115200 root=/dev/sda rw nokaslr' + kernel_opt,
            '-smp', '2',
            '-m', '4G',
            '-drive', 'if=none,id=hd,file=%s,format=raw' % filesystem_img_path,
            '-device', 'virtio-scsi-pci,id=scsi',
            '-device', 'scsi-hd,drive=hd',
            '-device', 'virtio-rng-pci,max-bytes=1024,period=1000',
            '-qmp', 'tcp:localhost:4444,server,nowait',
            '-serial', 'mon:stdio',
            '-kernel', '%s' % kernel_path,
            '-name', 'lsm_perf_vm,debug-threads=on'
        ]

    @staticmethod
    def __qemu_affinity_setup(qemu_pid, cpu_alloc):
        """Run qemu_affinity.py to allocate CPUs based on the CpuAllocation"""
        system_affinities = (
            '-p %(sys)d -i *:%(sys)d -q %(sys)d -w *:%(sys)d' % {
                'sys': cpu_alloc.qemu_sys
            }).split(' ')
        kvm_affinities = [
            '-k', str(cpu_alloc.host_kvm0), str(cpu_alloc.host_kvm1)
        ]
        args = system_affinities + kvm_affinities + ['--', str(qemu_pid)]
        cmd = plumbum.cmd.sudo[sys.executable][QEMU_AFFINITY_PATH][args]
        return cmd()