Пример #1
0
def start_node(node):
    """Deploy and launch the ``node`` binary on a remote host over SSH.

    Connects to ``node["host"]`` on port 22022, removes any previous ``node``
    file, downloads a fresh copy, marks it executable and runs it with the
    node id, its JSON-encoded neighbour list and the monitor address.

    Relies on the module-level names ``username``, ``path_to_keyfile``,
    ``neighbourhood`` and ``monitor``.

    Args:
        node: Mapping with at least the keys ``"id"`` and ``"host"``.

    Returns:
        None. A failed connection is reported on stdout and the function
        returns early.
    """
    print("Connecting to node%s with hostname %s." % (node["id"], node["host"]))
    try:
        remote = SshMachine(node["host"], port=22022, user=username,
                            keyfile=path_to_keyfile,
                            ssh_opts=["-o", "StrictHostKeyChecking=no"])
    except Exception as e:
        print("Could not connect to %s: %s" % (node["host"], e))
        return

    # Everything after a successful connect runs under try/finally so the SSH
    # session is released even if the download or chmod step fails.
    try:
        print("[%s]Connected" % node["id"])
        try:
            remote["rm"]("node")
        except commands.processes.ProcessExecutionError:
            # Best effort: a failing ``rm`` (presumably because no previous
            # binary exists) must not abort the deployment.
            pass
        print("[%s]Downloading application..." % node["id"])
        remote["wget"]("-O", "node",
                       "https://www.dropbox.com/s/mjw7dic2ywk5jrp/node")
        remote["chmod"]("u+x", "node")
        print("[%s]Starting python node..." % node["id"])
        try:
            remote["./node"]("--id", "%s" % (node["id"]), "--neighbours",
                             json.dumps(neighbourhood[node["id"]]),
                             "%s:%s" % (monitor["host"], monitor["tcp_port"]))
        except commands.processes.ProcessExecutionError as e:
            print("[%s]Got an exception: %s" % (node["id"], e))
    finally:
        remote.close()
Пример #2
0
def open_db(urlstr=None):
    """Yield ``(urlstr, machine, path)`` for a local or remote database URL.

    When ``urlstr`` is falsy, the URL is obtained from ``_get_db_url``. A URL
    with a scheme is opened through a new ``SshMachine`` built from the URL's
    hostname, username and port; a URL without a scheme uses the ``local``
    machine.

    This is a single-yield generator; presumably it is wrapped by
    ``contextlib.contextmanager`` where it is defined -- TODO confirm, the
    decorator is not visible here.

    Args:
        urlstr: Database URL string, or ``None`` to use the default one.

    Yields:
        A tuple ``(urlstr, machine, machine.path(url.path))``.
    """
    urlstr = urlstr or _get_db_url(urlstr)
    url = urlparse(urlstr)
    is_remote = bool(url.scheme)
    if is_remote:
        machine = SshMachine(url.hostname, user=url.username, port=url.port)
    else:
        machine = local
    try:
        yield urlstr, machine, machine.path(url.path)
    finally:
        # Close the SSH session even when the consumer raised; the shared
        # ``local`` machine is never closed.
        if is_remote:
            machine.close()
Пример #3
0
    def initializeDBTier(self):
        """Restart and reload the PostgreSQL server on ``self.dbNode``.

        Opens an SSH connection to ``self.dbNode``, invokes the remote command
        named by ``REMOTE_POSTGRES`` with ``restart`` and then ``reload``, and
        finally prints the remote working directory and its ``ls`` listing.
        The connection is closed even if one of the remote commands fails.
        """
        # Run postgresql Server on remote machine 'dbNode'
        remote = SshMachine(self.dbNode, user="******")
        try:
            logger.info("Remote connection established to dbNode")
            r_postgres = remote[REMOTE_POSTGRES]
            r_postgres("restart")
            r_postgres("reload")
            logger.info("Remote postgresql server restarted")

            print(remote.cwd)
            r_ls = remote["ls"]
            print(r_ls())
        finally:
            remote.close()
Пример #4
0
class SshHost(BaseHost):
    """Host reached over SSH with a lazily created ``DeployedServer``.

    The SSH machine and the deployment are created on the first call to
    :meth:`connect` and torn down by :meth:`close`. After :meth:`close` the
    instance is back in its initial state, so :meth:`connect` may be called
    again and :meth:`close` is idempotent.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``BaseHost`` and start disconnected."""
        BaseHost.__init__(self, **kwargs)
        self._mach = None
        self.deployment = None

    def connect(self):
        """Return a classic connection, deploying the server on first use.

        ``self.kwargs`` is passed to ``SshMachine``; presumably it is set by
        ``BaseHost.__init__`` -- TODO confirm.
        """
        if self.deployment is None:
            mach = SshMachine(**self.kwargs)
            try:
                self.deployment = DeployedServer(mach)
            except Exception:
                # Do not leak the SSH session when the deployment fails.
                mach.close()
                raise
            self._mach = mach
        return self.deployment.classic_connect()

    def close(self):
        """Close the deployment and the SSH machine, if any."""
        if self.deployment is None:
            return
        deployment, mach = self.deployment, self._mach
        # Reset first so that a failing close cannot leave stale handles that
        # a later connect() would try to reuse.
        self.deployment = None
        self._mach = None
        try:
            deployment.close()
        finally:
            mach.close()
Пример #5
0
def kill_node(node):
    """Run ``killall node`` on the remote host described by ``node``.

    Connects to ``node["host"]`` on port 22022 using the module-level
    ``username`` and ``path_to_keyfile``, prints the output of ``killall`` and
    reports success or failure on stdout.

    Args:
        node: Mapping with at least the keys ``"id"`` and ``"host"``.

    Returns:
        None. A failed connection is reported and the function returns early.
    """
    print("Killing node%s" % node["id"])
    try:
        # Option flag and value are passed as two separate arguments.
        remote = SshMachine(node["host"], port=22022, user=username,
                            keyfile=path_to_keyfile,
                            ssh_opts=["-o", "StrictHostKeyChecking=no"])
    except Exception as e:
        print("Could not connect to %s: %s" % (node["host"], e))
        return
    try:
        print(remote["killall"]("node"))
    except Exception:
        # ``Exception`` rather than a bare except, so that Ctrl-C and
        # interpreter shutdown are not swallowed.
        print("Could not kill node%s" % node["id"])
    else:
        print("Node%s killed!" % node["id"])
    finally:
        remote.close()
Пример #6
0
def startExperiment(gpu,
                    session,
                    command,
                    project_dir,
                    repo_ssh_string,
                    update_repo=True,
                    rebuild_docker=False,
                    branch=None):
    """
    Helper function to start an experiment remotely.

    Requirements:
        <project_dir>/docker/run.sh must exist and take three arguments
             1. The number of the GPU to use
             2. The name of the docker container to be created
             3. The command to be executed
        <project_dir>/docker/build.sh must exist

        Also, the project directory should be located in the home directory
        (it is resolved relative to the working directory of a fresh SSH
        session).

    Args:
        gpu (dict): Mapping with the keys 'server' (host to connect to) and
            'gpu_nr' (number of the GPU, passed to run.sh as a string)
        session (string): Name of the container to be created
        command (string): Command to be executed
        project_dir (string): Name of the project directory
        repo_ssh_string (string): Forwarded unchanged to `updateRepo`
        update_repo (bool): If True, call `updateRepo` before starting
        rebuild_docker (bool): If True, run docker/build.sh before starting
        branch (string): Forwarded to `updateRepo` as `branch`

    """
    remote = SshMachine(gpu['server'])
    # Release the SSH session even when one of the remote steps fails.
    try:
        home_dir = remote.cwd
        project_path = home_dir / project_dir
        r_runDocker = remote[project_path / "docker/run.sh"]
        r_buildDocker = remote[project_path / "docker/build.sh"]

        killRunningSession(remote, session)
        if update_repo:
            updateRepo(remote, project_dir, repo_ssh_string, branch=branch)

        if rebuild_docker:
            # Build docker
            print("Building container...")
            with remote.cwd(project_path):
                r_buildDocker()
                print("Done.")

        with remote.cwd(project_path):
            r_runDocker(str(gpu['gpu_nr']), session, command)
    finally:
        remote.close()
Пример #7
0
    def test_deploy(self):
        """Deploy a server over SSH to localhost and check its lifetime.

        Inside the ``DeployedServer`` context a classic connection must work;
        once the context has exited, calling a remote function obtained
        earlier must raise ``EOFError``.
        """
        rem = SshMachine("localhost")
        # Close the SSH session even when an assertion below fails.
        try:
            # Point the deployment at the interpreter that runs the tests.
            SshMachine.python = rem[sys.executable]
            with DeployedServer(rem) as dep:
                conn = dep.classic_connect()
                print(conn.modules.sys)
                func = conn.modules.os.getcwd
                print(func())

            try:
                func()
            except EOFError:
                pass
            else:
                self.fail("expected an EOFError")
        finally:
            rem.close()
Пример #8
0
def start_node(node):
    """Connect to a node's host and kill its running ``node`` process.

    Despite its name, this function only runs ``killall node`` on the remote
    host; it does not start anything. It uses the module-level ``username``
    and ``path_to_keyfile`` for the connection on port 22022.

    Args:
        node: Mapping with at least the keys ``"id"`` and ``"host"``.

    Returns:
        None. A failed connection is reported and the function returns early.
    """
    print("Connecting to node%s with hostname %s" % (node["id"], node["host"]))
    try:
        remote = SshMachine(node["host"], port=22022, user=username,
                            keyfile=path_to_keyfile)
    except Exception:
        # One-element tuple keeps the formatting unambiguous whatever the
        # type of ``node`` is.
        print("Could not connect to %s" % (node,))
        return
    try:
        print("[%s]Connected" % node["id"])
        print("[%s]Killing python..." % node["id"])
        try:
            remote["killall"]("node")
        except Exception as e:
            print("[%s]Exception: %s" % (node["id"], e))
            print("[%s]Python could not get killed" % node["id"])
    finally:
        remote.close()
Пример #9
0
class SshHost(BaseHost):
    """Host reached over SSH with a lazily created ``DeployedServer``.

    The SSH machine and the deployment are created on the first call to
    :meth:`connect` and torn down by :meth:`close`. After :meth:`close` the
    instance is back in its initial state, so :meth:`connect` may be called
    again and :meth:`close` is idempotent.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``BaseHost`` and start disconnected."""
        BaseHost.__init__(self, **kwargs)
        self._mach = None
        self.deployment = None

    def connect(self):
        """Return a classic connection, deploying the server on first use.

        ``self.kwargs`` is passed to ``SshMachine``; presumably it is set by
        ``BaseHost.__init__`` -- TODO confirm.
        """
        if self.deployment is None:
            mach = SshMachine(**self.kwargs)
            try:
                self.deployment = DeployedServer(mach)
            except Exception:
                # Do not leak the SSH session when the deployment fails.
                mach.close()
                raise
            self._mach = mach
        return self.deployment.classic_connect()

    def close(self):
        """Close the deployment and the SSH machine, if any."""
        if self.deployment is None:
            return
        deployment, mach = self.deployment, self._mach
        # Reset first so that a failing close cannot leave stale handles that
        # a later connect() would try to reuse.
        self.deployment = None
        self._mach = None
        try:
            deployment.close()
        finally:
            mach.close()
Пример #10
0
def updateRepo(servername, project_dir, repo_ssh_string, branch):
    """
    Helper function to pull newest commits to remote repo.

    Connects to `servername`, fetches `origin` inside `project_dir` (resolved
    relative to the remote working directory), checks out `branch`,
    hard-resets it to `origin/<branch>` and verifies that the remote HEAD
    equals the HEAD reported by the local `git` command.

    Args:
        servername (string): Host passed to `SshMachine`
        project_dir (string): Name of the project directory on the remote
        repo_ssh_string (string): Currently unused by this function
        branch (string): Branch to check out and reset to

    Raises:
        AssertionError: If the local and remote git hashes differ.
    """
    remote = SshMachine(servername)
    # Release the SSH session even when a git command or the check fails.
    try:
        r_git = remote['git']
        repo_path = remote.cwd / project_dir
        # Update repository
        print("Updating repo...", end='')
        with remote.cwd(repo_path):
            r_git('fetch', '-q', 'origin')
            r_git('checkout', branch)
            r_git('reset', '--hard', 'origin/{}'.format(branch), '-q')
            r_head = r_git('rev-parse', 'HEAD')

        # Check that we have the same git hash remote than local
        l_head = git('rev-parse', 'HEAD')
        if l_head != r_head:
            # Raised explicitly so the check also runs under `python -O`.
            raise AssertionError(
                "Local git hash != pushed git hash. Did you forget to push changes?")
        print("Repo updated")
    finally:
        remote.close()
Пример #11
0
class SshRunner:
    """Context manager that opens an SSH session to a server.

    On entry it connects, optionally changes the remote working directory,
    and (for non-root sessions) binds the ``docker-compose``, ``git`` and
    ``pip`` remote commands as attributes. On exit it closes the session.
    """

    _server: Server
    _chdir: str
    _root: bool

    # SSH machine; only set between __enter__ and __exit__.
    cmd: SshMachine

    # Remote commands; only bound for non-root sessions.
    docker_compose: RemoteCommand
    git: RemoteCommand
    pip: RemoteCommand

    def __init__(self, server: Server, chdir="", root=False):
        """Store the connection settings; no connection is opened yet.

        Args:
            server: Object providing ``host`` and ``username``.
            chdir: Directory to change into after connecting. A value
                starting with ``/`` is used as is, anything else is resolved
                against the initial remote working directory. Empty means no
                directory change.
            root: If true, connect with the fixed user name set in
                ``__enter__`` instead of ``server.username`` and skip binding
                the remote commands.
        """
        self._server = server
        self._chdir = chdir
        self._root = root

    def __enter__(self):
        """Open the SSH session and return ``self``."""
        if self._root:
            username = "******"
        else:
            username = self._server.username
        self.cmd = SshMachine(self._server.host, user=username)
        try:
            if self._chdir:
                if self._chdir.startswith("/"):
                    self.cmd.cwd.chdir(self._chdir)
                else:
                    self.cmd.cwd.chdir(self.cmd.cwd / self._chdir)

            if not self._root:
                self.docker_compose = self.cmd["docker-compose"]
                self.git = self.cmd["git"]
                self.pip = self.cmd["pip"]
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so the session
            # has to be closed here to avoid leaking it.
            self.cmd.close()
            raise

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the SSH session; exceptions from the body propagate."""
        self.cmd.close()
Пример #12
0
def _sample_gpu_stats(r_smi, average):
    """Query the GPU stats `average` times and return the per-GPU averages."""
    fieldnames = [
        'index', 'gpu_uuid', 'memory.total', 'memory.used',
        'utilization.gpu', 'gpu_name'
    ]
    averaged_gpu_data = []
    for avg_idx in range(average):
        output = r_smi("--query-gpu=" + ",".join(fieldnames),
                       "--format=csv,noheader,nounits").replace(" ", "")

        gpu_data = []
        for line in output.splitlines():
            # Purely numeric fields become ints, everything else stays text.
            gpu_data.append({
                name: int(x) if x.strip().isdigit() else x
                for x, name in zip(line.split(","), fieldnames)
            })
        if avg_idx == 0:
            # The first sample seeds the running average.
            averaged_gpu_data = gpu_data
            for data in averaged_gpu_data:
                data['utilization.gpu'] /= average
                data['memory.used'] /= average
        else:
            for gpu_idx, data in enumerate(gpu_data):
                averaged_gpu_data[gpu_idx][
                    'utilization.gpu'] += data['utilization.gpu'] / average
                averaged_gpu_data[gpu_idx][
                    'memory.used'] += data['memory.used'] / average
        time.sleep(1)
    return averaged_gpu_data


def _attach_gpu_users(r_smi, r_ps, gpu_data):
    """Add 'nr_processes' and 'users' to every entry of `gpu_data` in place."""
    for data in gpu_data:
        data['nr_processes'] = 0
        data['users'] = []

    output = r_smi("--query-compute-apps=pid,gpu_uuid",
                   "--format=csv,noheader,nounits").replace(" ", "")

    for line in output.splitlines():
        process = [
            int(x) if x.strip().isdigit() else x for x in line.split(",")
        ]
        pid = process[0]
        # Second line of `ps -u -p <pid>`, first column: the process owner.
        user = (r_ps['-u', '-p'] | sed['-n', '2p']
                | awk['{{print $1}}'])(pid)
        serial = process[1]
        for data in gpu_data:
            if data['gpu_uuid'] == serial:
                data['users'].append(user.strip())
                data['nr_processes'] += 1


def inspect_gpus(servers,
                 own_username='******',
                 verbose=True,
                 needed_gpus=-1,
                 memory_threshold=1200,
                 gpu_util_threshold=5,
                 allow_lightly_used_gpus=True,
                 share_with=['all'],
                 max_nr_processes=6,
                 upper_memory_threshold=5500,
                 upper_gpu_util_threshold=60,
                 average=3,
                 get_all=False):
    """
    Scan servers for free GPUs, print availability and return a list of free GPUs that can be
    used to start jobs on them.

    Requirements:
        ~/.ssh/config needs to be set up so that connecting via `ssh <server>` works. For OSX,
        an entry can look like this:

        Host mulga
            User maxigl
            HostName mulga.cs.ox.ac.uk
            BatchMode yes
            ForwardAgent yes
            StrictHostKeyChecking no
            AddKeysToAgent yes
            UseKeychain yes
            IdentityFile ~/.ssh/id_rsa

    Args:
        servers (list of strings): List of servers to scan
        own_username (string):    Currently unused
        verbose (bool):           If True, also print who is using the GPUs
        needed_gpus (int):        Stop scanning once this many GPUs were found; a negative
                                  value scans all servers
        get_all (bool):           If True, scan all servers regardless of `needed_gpus`
        average (int):            Number of samples (taken one second apart) over which memory
                                  usage and utilisation are averaged

        memory_threshold (int):
        gpu_util_threshold (int): When used memory < memory_threshold and
                                  GPU utilisation < gpu_util_threshold,
                                  then the GPU is regarded as free.

        allow_lightly_used_gpus (bool):
        share_with (list of strings):
        max_nr_processes (int):
        upper_memory_threshold (int):
        upper_gpu_util_threshold (int): If `allow_lightly_used_gpus=True`, memory and gpu
                                        utilisation are under the upper thresholds, fewer than
                                        `max_nr_processes` processes run on that GPU, and
                                        `share_with` contains 'all' or the user of the first
                                        listed process, then the GPU will be added to the list
                                        of GPUs that can be used to start jobs. A busy GPU
                                        without any listed process is only shared when
                                        `share_with` contains 'all'.

    Return:
        free_gpus: List of dictionaries, each containing the following keys:
                   'server': Name of the server
                   'gpu_nr': Number of the free GPU
                   'occupation': Number of processes already running on that GPU (0 for a
                                 free GPU, see `allow_lightly_used_gpus`)


    """
    print((color_free | "Free" + " | ") + (color_me | "Own" + " | ") +
          (color_other | "Other" + " | ") +
          (color_other_light | "Other (light)"))

    all_free_gpus = []
    server_id = 0

    while ((needed_gpus < 0 and server_id < len(servers))
           or len(all_free_gpus) < needed_gpus) or get_all:
        try:
            server = servers[server_id]
        except IndexError:
            # Ran out of servers before the request could be satisfied.
            break
        server_id += 1
        print("{:7}: ".format(server), end='')
        try:
            remote = SshMachine(server)
        except plumbum.machines.session.SSHCommsError:
            print(
                "ssh fail - server not in .ssh/config? See doctring of this function."
            )
            continue

        # Release the SSH session even when a remote query fails.
        try:
            r_smi = remote["nvidia_smi"]
            r_ps = remote["ps"]

            gpu_data = _sample_gpu_stats(r_smi, average)

            # Find processes and users
            _attach_gpu_users(r_smi, r_ps, gpu_data)

            gpu_numbers = []
            gpu_status = []
            free_gpus = []

            for data in gpu_data:
                status = "\t" + str(data['index']) + ": "

                # Is it free?
                if (data['memory.used'] < memory_threshold
                        and data['utilization.gpu'] < gpu_util_threshold):

                    status += "free"
                    status = color_free | status
                    gpu_numbers.append(color_free | str(data['index']))
                    free_gpus.append({
                        'server': server,
                        'gpu_nr': data['index'],
                        'occupation': 0
                    })
                else:
                    status += "in use - {:4}% gpu - {:5}% mem - {}".format(
                        str(data['utilization.gpu'])[:4],
                        str(data['memory.used'] / data['memory.total'])[:4],
                        str(data['users']))

                    if 'all' in share_with:
                        share = True
                    else:
                        # A busy GPU may have no listed compute process; it
                        # is then not attributable to anyone to share with.
                        share = (bool(data['users'])
                                 and data['users'][0] in share_with)

                    if (allow_lightly_used_gpus
                            and data['memory.used'] < upper_memory_threshold
                            and data['utilization.gpu'] < upper_gpu_util_threshold
                            and data['nr_processes'] < max_nr_processes
                            and share):

                        free_gpus.append({
                            'server': server,
                            'gpu_nr': data['index'],
                            'occupation': data['nr_processes']
                        })
                        gpu_numbers.append(
                            color_other_light | str(data['index']))
                        status = color_other_light | status
                    else:
                        gpu_numbers.append(color_other | str(data['index']))
                        status = color_other | status

                gpu_status.append(status)

            all_free_gpus += free_gpus
            print(
                " ".join(gpu_numbers) +
                " | {} free | {} total".format(len(free_gpus),
                                               len(all_free_gpus)))

            if verbose:
                # The summary line needs at least one reported GPU.
                if gpu_data:
                    print("\t [{} - {} GB]".format(
                        gpu_data[0]['gpu_name'],
                        gpu_data[0]['memory.total'] // 1000))
                for s in gpu_status:
                    print(s)
        finally:
            remote.close()
    return all_free_gpus
Пример #13
0
class VM:
    """
    Manage a qemu-system virtual machine.

    It will be started with `__init__` and the ssh connection will be
    established with `__enter__`, so any ssh operation should be done
    inside a `with` block.

    A CPU allocation can also be provided, to bind the cores of the
    virtual machine to physical cores.

    :ivar ssh: plumbum.SshMachine object, useful to run commands on
               the VM. It should only be used inside a `with` block.
    :ivar process: popen process of qemu, useful to send signals
                   or input to qemu.

    :example:
        with VM('bzImage', 'debian.img', '~/.ssh/id_rsa') as vm:
            vm.ssh['ls']
    """
    def __init__(self,
                 kernel_path,
                 filesystem_img_path,
                 keyfile,
                 cpu_allocation=None,
                 isolcpus=None):
        """Start the qemu VM (non blocking)

        :param kernel_path: Path of the kernel's bzImage
        :param filesystem_img_path: Path of the filesystem image (.img)
        :param keyfile: Path of rsa key that is authorized on the image
        :param cpu_allocation: CpuAllocation for qemu and the vm's cores,
                               or None to not assign CPUs
        :param isolcpus: list of CPU ID that should be isolated at boot time;
                         None or an empty list isolates no CPU
        """
        qemu_args = VM.__construct_qemu_args(
            kernel_path=kernel_path,
            filesystem_img_path=filesystem_img_path,
            isolcpus=isolcpus)
        self.process = local['qemu-system-x86_64'].popen(qemu_args)
        self.ssh = None
        self.key = keyfile
        if cpu_allocation:
            VM.__qemu_affinity_setup(self.process.pid, cpu_allocation)

    def __enter__(self):
        """Initialize the ssh connection (blocks until success)

        Tries up to SSH_MAX_RETRY times, sleeping one second before each
        attempt.

        :raises VMException: when every attempt failed; the last connection
                             error is passed as second argument
        """
        err = None
        for _ in range(SSH_MAX_RETRY):
            time.sleep(1)
            try:
                self.ssh = SshMachine('127.0.0.1',
                                      user='******',
                                      port=HOST_PORT,
                                      keyfile=self.key)
                break
            except (EOFError, plumbum.machines.session.SSHCommsError) as e:
                err = e
                continue
        else:  # Reached maximum retries
            raise VMException('SSH connection failed after too many retries',
                              err)
        return self

    def __exit__(self, type, value, traceback):
        """Stop the SSH connection and the VM"""
        try:
            if self.ssh is not None:
                self.ssh.close()
        finally:
            # The VM must be stopped even when closing the ssh session fails.
            self.ssh = None
            self.process.terminate()

    def scp_to(self, src_local, dst_remote):
        """Send a file from the host to the VM

        :param src_local: local path of the file to send
        :param dst_remote: destination path on the vm
        :raises VMException: when the ssh connection is not established,
                             i.e. when not used inside a `with` block
        """
        if self.ssh is None:
            raise VMException(
                '`VM.scp_to` must be used with an established SSH connection, '
                'i.e. inside a `with` block.')
        src = local.path(src_local)
        dst = self.ssh.path(dst_remote)
        plumbum.path.utils.copy(src, dst)

    @staticmethod
    def __construct_qemu_args(kernel_path, filesystem_img_path, isolcpus=None):
        """Qemu arguments similar to what `vm start` produces

        :param kernel_path: Path given to qemu's `-kernel` option
        :param filesystem_img_path: Raw image used as the VM's drive
        :param isolcpus: CPU IDs appended to the kernel command line as
                         `isolcpus=...`; None or empty adds nothing
        :return: list of command line arguments for qemu
        """
        kernel_opt = ''
        if isolcpus:
            kernel_opt = ' isolcpus=' + ','.join(map(str, isolcpus))

        return [
            '-nographic', '-s', '-machine', 'accel=kvm', '-cpu', 'host',
            '-device', 'e1000,netdev=net0', '-netdev',
            'user,id=net0,hostfwd=tcp::%d-:22' % HOST_PORT, '-append',
            'console=ttyS0,115200 root=/dev/sda rw nokaslr' + kernel_opt,
            '-smp', '2', '-m', '4G', '-drive',
            'if=none,id=hd,file=%s,format=raw' % filesystem_img_path,
            '-device', 'virtio-scsi-pci,id=scsi', '-device',
            'scsi-hd,drive=hd', '-device',
            'virtio-rng-pci,max-bytes=1024,period=1000', '-qmp',
            'tcp:localhost:4444,server,nowait', '-serial', 'mon:stdio',
            '-kernel',
            '%s' % kernel_path, '-name', 'lsm_perf_vm,debug-threads=on'
        ]

    @staticmethod
    def __qemu_affinity_setup(qemu_pid, cpu_alloc):
        """Run qemu_affinity.py to allocate CPUs based on the CpuAllocation

        :param qemu_pid: pid of the running qemu process
        :param cpu_alloc: object providing `qemu_sys`, `host_kvm0` and
                          `host_kvm1`
        :return: output of the affinity script, which is run through sudo
        """
        system_affinities = (
            '-p %(sys)d -i *:%(sys)d -q %(sys)d -w *:%(sys)d' % {
                'sys': cpu_alloc.qemu_sys
            }).split(' ')
        kvm_affinities = [
            '-k', str(cpu_alloc.host_kvm0),
            str(cpu_alloc.host_kvm1)
        ]
        args = system_affinities + kvm_affinities + ['--', str(qemu_pid)]
        cmd = plumbum.cmd.sudo[sys.executable][QEMU_AFFINITY_PATH][args]
        return cmd()