Пример #1
0
class Machine:
    """
    A remote GPU host, reached over SSH, on which queued jobs are started.

    Queued jobs are read from `jobs_db`; the job fields this class reads are
    "_id", "machine", "util", "mem" and "cmd". A job is launched on the
    eligible GPU with the most free utilization.
    """

    def __init__(
        self,
        _id: str,
        address: str,
        username: str,
        ssh_password: str,
        jobs_db,
        skip_gpus: Sequence[int] = (),
        gpu_runner_on: bool = False,
        app=None,
    ):
        """
        :param _id: identifier of this machine; jobs are matched on it
        :param address: address passed to the SSH connection
        :param username: user name passed to the SSH connection
        :param ssh_password: password passed to the SSH connection (not stored
          on the Machine itself)
        :param jobs_db: collection-like object providing `find_one` and `remove`
        :param skip_gpus: indices of GPUs that must never receive a job
        :param gpu_runner_on: whether the polling thread should start jobs
        :param app: optional object exposing `.logger`; logging is skipped if falsy
        """
        self._id = _id
        self.address = address
        self.username = username
        self.jobs_db = jobs_db
        self.skip_gpus = skip_gpus
        self.gpu_runner_on = gpu_runner_on
        self.app = app
        # Recently started processes whose resources are treated as reserved
        # until `keep_time` (see `start_jobs`) has elapsed.
        self.new_processes = []
        self._client = SSHConnection(self.address,
                                     self.username,
                                     ssh_password,
                                     auto_add_host=True)
        # Serializes access to the single SSH client.
        self._client_lock = Lock()

    def dashboard_data(self) -> Dict[str, Any]:
        """Return the fields of this machine that are shown on the dashboard."""
        return {
            "_id": self._id,
            "address": self.address,
            "username": self.username,
            "gpu_runner_on": self.gpu_runner_on,
        }

    def execute(self, command: str, codec: str = "utf-8") -> str:
        """
        Runs `command` using the SSHConnection for this Machine and returns stdout
        :param command: *single-line* command to run
        :param codec: codec to use to decode the standard output from running `command`
        :returns: decoded stdout
        :raises: whatever the underlying connection raises; the traceback is
          logged first when an app is configured
        """
        try:
            with self._client_lock:
                return self._client.execute(command, codec)
        except Exception:
            if self.app:
                self.app.logger.info(traceback.format_exc())
            raise

    def start(self, sleep_time: int = 30):
        """
        Start a daemon thread that tries to place queued jobs on this machine.

        :param sleep_time: seconds to sleep between polls
        """
        def poll_forever():
            while True:
                if self.gpu_runner_on:
                    if self.app:
                        self.app.logger.info(
                            f"Checking for jobs on {self.address}.")
                    try:
                        self.start_jobs()
                    except Exception:
                        # One failed poll (e.g. an SSH error) must not kill the
                        # polling thread; report it and retry on the next poll.
                        if self.app:
                            self.app.logger.info(traceback.format_exc())
                        else:
                            traceback.print_exc()
                sleep(sleep_time)

        Thread(target=poll_forever, daemon=True).start()

    @staticmethod
    def _reserved_resources(processes, gpu_ids):
        """
        Sum the memory and utilization requested by `processes` per GPU.

        :param processes: objects with `gpu_num`, `mem_needed`, `util_needed`
        :param gpu_ids: iterable of GPU indices to report on
        :returns: `(mem_by_gpu, util_by_gpu)` dicts keyed by GPU index;
          processes on a GPU not in `gpu_ids` are ignored
        """
        mem_by_gpu = {gpu_id: 0 for gpu_id in gpu_ids}
        util_by_gpu = {gpu_id: 0 for gpu_id in gpu_ids}
        for process in processes:
            if process.gpu_num not in mem_by_gpu:
                continue
            mem_by_gpu[process.gpu_num] += process.mem_needed
            util_by_gpu[process.gpu_num] += process.util_needed
        return mem_by_gpu, util_by_gpu

    @staticmethod
    def _summarize_samples(samples, reserved_mem, reserved_util):
        """
        Collapse several samples of one GPU into a single usage estimate.

        :param samples: non-empty list of objects with `mem_used`,
          `mem_total` and `util_used`
        :param reserved_mem: memory reserved by recently started processes
        :param reserved_util: utilization reserved by recently started processes
        :returns: `(mem_used, mem_total, util_used)` where memory is the max
          over the samples and utilization is their mean, each increased by
          the reserved amount
        """
        mem_used = max(sample.mem_used for sample in samples) + reserved_mem
        mean_util = sum(sample.util_used for sample in samples) / len(samples)
        # Reserved resources count as *used*, so they are added, not subtracted.
        return mem_used, samples[0].mem_total, mean_util + reserved_util

    def start_jobs(self, n_passes: int = 2, keep_time: int = 60) -> None:
        """
        Start queued jobs for this machine until none are left or none fit.

        :param n_passes: number of times to query the state of the GPUs; the utilization
          currently used on each GPU is assumed to be the mean across the passes in
          which it was reported, and the memory used is the max
        :param keep_time: how long to keep a started process in the new_processes list
          before removing it. Leaving a process in the new_processes list means that
          we assume that the resources it has requested are already "reserved" even
          though we don't see them being used yet on the GPU. Removing a process from
          this list lets other processes be started with those resources.
          (`keep_time` is in seconds)
        """
        while True:  # place jobs for this machine until you can't place any more
            job = self.jobs_db.find_one({"machine": self._id},
                                        sort=[("util", 1)])
            if not job:  # no more queued jobs for this machine
                break

            # sample every usable GPU `n_passes` times
            samples_by_gpu = {}
            for _ in range(n_passes):
                gpu_info = get_gpus_from_info_string(
                    self.execute(_smi_command))
                for gpu in gpu_info:
                    if gpu.idx in self.skip_gpus:
                        continue
                    samples_by_gpu.setdefault(gpu.idx, []).append(gpu)

            # TODO - remove processes that have shown up on the GPU
            # if a process doesn't show up on the GPU after enough time,
            # assume it had an error and crashed; remove
            now = time()
            self.new_processes = [
                process for process in self.new_processes
                if now - process.timestamp < keep_time
            ]

            # resources requested by new processes count as already in use
            reserved_mem, reserved_util = self._reserved_resources(
                self.new_processes, samples_by_gpu)

            candidates = []
            for num, samples in samples_by_gpu.items():
                mem_used, mem_total, util_used = self._summarize_samples(
                    samples, reserved_mem[num], reserved_util[num])
                gpu = _GPU(
                    idx=num,
                    mem_used=mem_used,
                    mem_total=mem_total,
                    util_used=util_used,
                )
                if (gpu.mem_free >= job["mem"]
                        and gpu.util_free >= job["util"]):
                    candidates.append(gpu)

            if not candidates:  # none have enough mem_free and util_free
                if self.app:
                    self.app.logger.info(
                        f"No free GPUs to start jobs on {self.address}!")
                break  # can't place anything on this machine

            best_gpu = max(candidates, key=lambda gpu: gpu.util_free)
            job_cmd = job["cmd"].format(best_gpu.idx)

            if self.app:
                self.app.logger.info(f"Starting job: {job_cmd} ({self._id})")

            # make sure to background the script
            # surrounding w/ () executes in a subshell so "Done ..."
            # isn't printed when the job finishes
            self.execute(f"({job_cmd} >> ~/.gpu_log 2>&1 &)")

            self.new_processes.append(
                _Process(
                    job_cmd,
                    best_gpu.idx,
                    mem_needed=job["mem"],
                    util_needed=job["util"],
                    timestamp=time(),
                ))

            # this job is running, so remove it from the list
            # NOTE(review): `remove` looks like the legacy pymongo Collection
            # API - confirm the driver version before upgrading it.
            self.jobs_db.remove({"_id": job["_id"]})
Пример #2
0
class PostgresManager(object):
    """
    Manages one PostgreSQL/repmgr node, either on this host or over SSH.

    `node` is a dict of settings. The keys this class reads directly are
    'local', 'hostname', 'username', 'password', 'pgversion', 'cluster',
    'pguser', 'privkey', 'pubkey' and 'type'; the configuration templates
    are formatted with the same dict and may read further keys.
    """
    def __init__(self, node):
        super(PostgresManager, self).__init__()
        self.node = node
        # Only remote nodes get an SSH connection; local nodes go through
        # the `local` helpers instead.
        self.connection = None
        if not self._is_local():
            self.connection = SSHConnection(
                node['hostname'],
                node['username'],
                node['password'])

    def _is_local(self):
        """Return True when the node is flagged as local ('local' == 'true')."""
        return self.node['local'] == 'true'

    @staticmethod
    def _replication_entry(node):
        """
        Return a copy of `node` whose 'hostname' is its resolved IPv4
        address with a '/32' suffix. The caller's dict is left untouched so
        it can be resolved again later.
        """
        entry = dict(node)
        entry['hostname'] = socket.gethostbyname(node['hostname']) + '/32'
        return entry

    def run(self, cmd):
        """Run shell command `cmd` on the node and return the runner's result."""
        if self._is_local():
            return local.shell(cmd)
        return self.connection.execute(cmd)

    def start(self):
        """Start PostgreSQL through its init script."""
        self.run('sudo /etc/init.d/postgresql start')

    def stop(self):
        """Stop PostgreSQL through its init script."""
        self.run('sudo /etc/init.d/postgresql stop')

    def restart(self):
        """Restart PostgreSQL through its init script."""
        self.run('sudo /etc/init.d/postgresql restart')

    def reload(self):
        """Reload the PostgreSQL configuration through its init script."""
        self.run('sudo /etc/init.d/postgresql reload')

    def write_file(self, path, data):
        """Write `data` to `path` on the node."""
        if self._is_local():
            local.write_file(path, data)
        else:
            self.connection.upload_data(data, path)

    def read_file(self, path):
        """Return the contents of `path` on the node."""
        if self._is_local():
            return local.read_file(path)
        return self.connection.download(path)

    def init(self, master):
        """
        Recreate the PostgreSQL cluster on this node and register it with repmgr.

        The cluster is dropped and recreated, configuration files and the SSH
        identity of the postgres user are installed, and then the node is
        registered as master or cloned from `master` and registered as
        standby, depending on `self.node['type']`.

        :param master: node dict of the master; only its 'hostname' is read,
          and only when this node is not the master
        """
        self.stop()
        self.run('sudo pg_dropcluster %(pgversion)s %(cluster)s' % self.node)
        self.run('sudo pg_createcluster %(pgversion)s %(cluster)s' % self.node)
        self.run('sudo chown -R %(pguser)s:%(pguser)s /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)

        # Configuration files are staged in /tmp and copied into place with sudo.
        self.write_file('/tmp/pgpostgresql.conf', template.PG_CONFIG_TEMPLATE % self.node)
        self.write_file('/tmp/pgpg_hba.conf', template.PG_HBA_CONFIG_TEMPLATE % self.node)
        self.write_file('/tmp/pgrepmgr.conf', template.REPMGR_CONFIG_TEMPLATE % self.node)
        self.run('sudo cp /tmp/pgpostgresql.conf /etc/postgresql/%(pgversion)s/%(cluster)s/postgresql.conf' % self.node)
        self.run('sudo cp /tmp/pgpg_hba.conf /etc/postgresql/%(pgversion)s/%(cluster)s/pg_hba.conf' % self.node)
        self.run('sudo cp /tmp/pgrepmgr.conf /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf' % self.node)
        self.run('sudo chmod 644 /etc/postgresql/%(pgversion)s/%(cluster)s/*' % self.node)
        self.run('sudo rm /tmp/pgpostgresql.conf /tmp/pgpg_hba.conf /tmp/pgrepmgr.conf')
        self.run('sudo chown %(pguser)s:%(pguser)s -R /etc/postgresql' % self.node)

        # NOTE(review): the private key is staged under a fixed name in /tmp;
        # its permissions there depend on write_file/upload_data, which are
        # not visible here - confirm it is not readable by other users.
        self.write_file('/tmp/sshid_rsa', local.read_file(self.node['privkey']))
        self.write_file('/tmp/sshid_rsa.pub', local.read_file(self.node['pubkey']))
        self.write_file('/tmp/sshconfig', template.SSH_CONFIG)
        self.run('sudo -u %(pguser)s mkdir -p ~%(pguser)s/.ssh' % self.node)
        self.run('sudo cp /tmp/sshid_rsa ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo cp /tmp/sshid_rsa.pub ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo cp /tmp/sshconfig ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo rm /tmp/sshconfig /tmp/sshid_rsa /tmp/sshid_rsa.pub')
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/id_rsa.pub' % self.node)

        if self.node['type'] == 'master':
            self.start()
            self.run('sudo -u %(pguser)s createuser --login --superuser repmgr' % self.node)
            self.run('sudo -u %(pguser)s createdb repmgr' % self.node)
            self.run('sudo -u %(pguser)s repmgr --verbose -f /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf master register' % self.node)
        else:
            # NOTE(review): this is the PATH of the *controlling* process,
            # injected into a command that may run on a remote node - confirm
            # that is intended.
            clone_path = '/usr/lib/pgclust:' + os.environ['PATH']
            self.run('sudo -u %(pguser)s rm -rf /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            self.run('sudo -u %(pguser)s mkdir /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            self.run('sudo -u %(pguser)s chmod 700 /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            # Each template piece is formatted on its own so that a '%' inside
            # PATH is never interpreted as a format specifier.
            self.run(
                'sudo -u %(pguser)s PATH="' % self.node
                + clone_path
                + '" repmgr --verbose --force -D /var/lib/postgresql/%(pgversion)s/%(cluster)s -d repmgr -p 5432 -U repmgr -R %(pguser)s standby clone ' % self.node
                + master['hostname'])
            self.start()
            self.run('sudo -u %(pguser)s repmgr -f /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf --verbose standby register' % self.node)

    def update_nodes(self, nodes):
        """
        Rewrite pg_hba.conf with one replication record per node in `nodes`,
        followed by the base pg_hba template for this node.

        :param nodes: iterable of node dicts; they are not modified
        """
        records = [
            template.REPLICATION_NODE_TEMPLATE % self._replication_entry(node)
            for node in nodes
        ]
        pg_hba = '\n\n'.join(records) + template.PG_HBA_CONFIG_TEMPLATE % self.node
        self.write_file('/tmp/pgpg_hba.conf', pg_hba)
        self.run('sudo cp /tmp/pgpg_hba.conf /etc/postgresql/%(pgversion)s/%(cluster)s/pg_hba.conf' % self.node)
        self.run('sudo rm /tmp/pgpg_hba.conf')

    def update_keys(self, nodes):
        """
        Install the public keys of all `nodes` as authorized_keys of the
        postgres user and (re)install this node's own key pair.

        :param nodes: iterable of node dicts; each one's 'pubkey' path is
          read from the local filesystem
        """
        self.run('sudo -u %(pguser)s mkdir -p ~%(pguser)s/.ssh' % self.node)
        self.run('sudo chmod 700 ~%(pguser)s/.ssh' % self.node)
        keys = [local.read_file(node['pubkey']) for node in nodes]
        self.write_file('/tmp/sshauthorized_keys', '\n'.join(keys))
        self.run('sudo cp /tmp/sshauthorized_keys ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.write_file('/tmp/sshid_rsa', local.read_file(self.node['privkey']))
        self.write_file('/tmp/sshid_rsa.pub', local.read_file(self.node['pubkey']))
        self.run('sudo cp /tmp/sshid_rsa ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo cp /tmp/sshid_rsa.pub ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo rm /tmp/sshauthorized_keys /tmp/sshid_rsa /tmp/sshid_rsa.pub')
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/id_rsa.pub' % self.node)

    def update(self, nodes):
        """
        Refresh pg_hba.conf and SSH keys for the given cluster members, then
        reload PostgreSQL.

        :param nodes: mapping whose values are node dicts
        """
        self.update_nodes(nodes.values())
        self.update_keys(nodes.values())
        self.reload()