class Machine:
    """A remote GPU machine, reached over SSH, on which queued jobs are started.

    Pulls queued jobs for this machine from ``jobs_db`` (lowest ``util`` first)
    and launches each one on a GPU with enough free memory and utilization.
    """

    def __init__(
        self,
        _id: str,
        address: str,
        username: str,
        ssh_password: str,
        jobs_db,
        skip_gpus: Sequence[int] = (),
        gpu_runner_on: bool = False,
        app=None,
    ):
        """
        :param _id: identifier for this machine; jobs are queued under this key in `jobs_db`
        :param address: hostname/IP used for the SSH connection
        :param username: SSH username
        :param ssh_password: SSH password (used only to open the connection; not stored)
        :param jobs_db: collection of queued jobs (queried with find_one / remove)
        :param skip_gpus: GPU indices to avoid
            (NOTE(review): stored but not read anywhere in this class — confirm it is used elsewhere)
        :param gpu_runner_on: whether the background loop started by `start` should place jobs
        :param app: optional app whose `.logger` is used for logging (e.g. a Flask app)
        """
        self._id = _id
        self.address = address
        self.username = username
        self.jobs_db = jobs_db
        self.skip_gpus = skip_gpus
        self.gpu_runner_on = gpu_runner_on
        self.app = app
        self.new_processes = []
        self._client = SSHConnection(
            self.address, self.username, ssh_password, auto_add_host=True
        )
        # Serializes access to the SSH connection across threads.
        self._client_lock = Lock()

    def dashboard_data(self) -> Dict[str, Any]:
        """Return the subset of this machine's state shown on the dashboard."""
        return {
            "_id": self._id,
            "address": self.address,
            "username": self.username,
            "gpu_runner_on": self.gpu_runner_on,
        }

    def execute(self, command: str, codec: str = "utf-8") -> str:
        """
        Runs `command` using the SSHConnection for this Machine and returns stdout

        :param command: *single-line* command to run
        :param codec: codec to use to decode the standard output from running `command`
        :returns: decoded stdout
        :raises: re-raises anything raised by the underlying SSH execute, after logging it
        """
        try:
            with self._client_lock:
                return self._client.execute(command, codec)
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # propagate untouched. Errors are logged, then re-raised.
        except Exception:
            if self.app:
                self.app.logger.info(traceback.format_exc())
            raise

    def start(self, sleep_time: int = 30):
        """Start a daemon thread that tries to place queued jobs every `sleep_time` seconds.

        The loop only acts while `gpu_runner_on` is truthy; it keeps polling otherwise.

        :param sleep_time: seconds to sleep between placement passes
        """

        def handle_machine(machine: "Machine", sleep_time: int):
            while True:
                if machine.gpu_runner_on:
                    if self.app:
                        self.app.logger.info(f"Checking for jobs on {self.address}.")
                    machine.start_jobs()
                sleep(sleep_time)

        thread = Thread(target=lambda: handle_machine(self, sleep_time), daemon=True)
        thread.start()

    def start_jobs(self, n_passes: int = 2, keep_time: int = 60) -> None:
        """
        :param n_passes: number of times to query the state of the GPUs; the utilization
            currently used on each GPU is assumed to be the mean across these passes, and
            the memory used is the max
        :param keep_time: how long to keep a started process in the new_processes list
            before removing it (a process is removed immediately if we see it running).
            Leaving a process in the new_processes list means that we assume that the
            resources it has requested are already "reserved" even though we don't see
            them being used yet on the GPU. Removing a processes from this list lets
            other processes be started with those resources. (`keep_time` is in seconds)
        """
        while True:  # place jobs for this machine until you can't place any more
            job = self.jobs_db.find_one({"machine": self._id}, sort=[("util", 1)])
            if not job:  # no more queued jobs for this machine
                break

            # check if there's a gpu you can run this job on (enough memory and util free)
            gpus = {}
            new_processes = self.new_processes
            for _ in range(n_passes):
                gpu_info = get_gpus_from_info_string(self.execute(_smi_command))
                for gpu in gpu_info:
                    try:
                        gpus[gpu.idx].append(gpu)
                    except KeyError:
                        gpus[gpu.idx] = [gpu]

            # TODO - remove processes that have shown up on the GPU

            # if a process doesn't show up on the GPU after enough time, assume it
            # had an error and crashed; remove
            now = time()
            new_processes = [
                process
                for process in new_processes
                if now - process.timestamp < keep_time
            ]

            # subtract mem and util used by new processes from that which is shown to be free
            mem_newly_used = {gpu_num: 0 for gpu_num in gpus}
            util_newly_used = {gpu_num: 0 for gpu_num in gpus}
            for process in new_processes:
                mem_newly_used[process.gpu_num] += process.mem_needed
                util_newly_used[process.gpu_num] += process.util_needed

            # set mem_used to max from each pass, util_used to mean
            gpus = [
                _GPU(
                    idx=num,
                    mem_used=max([gpu.mem_used for gpu in gpu_list]) + mem_newly_used[num],
                    mem_total=gpu_list[0].mem_total,
                    # BUG FIX: utilization reserved by pending processes must be
                    # *added* to observed usage (mirroring mem_used above); it was
                    # previously subtracted, which made GPUs with pending processes
                    # look more free instead of less.
                    util_used=sum([gpu.util_used for gpu in gpu_list]) / n_passes
                    + util_newly_used[num],
                )
                for (num, gpu_list) in gpus.items()
            ]
            gpus = [
                gpu
                for gpu in gpus
                if gpu.mem_free >= job["mem"] and gpu.util_free >= job["util"]
            ]

            try:
                best_gpu = max(gpus, key=lambda gpu: gpu.util_free)
            except ValueError:  # max gets no gpus because none have enough mem_free and util_free
                if self.app:
                    self.app.logger.info(f"No free GPUs to start jobs on {self.address}!")
                break  # can't place anything on this machine

            job_cmd = job["cmd"].format(best_gpu.idx)
            if self.app:
                self.app.logger.info(f"Starting job: {job_cmd} ({self._id})")
            # make sure to background the script
            # surrounding w/ () executes in a subshell so "Done ..."
            # isn't printed when the job finishes
            self.execute(f"({job_cmd} >> ~/.gpu_log 2>&1 &)")
            new_processes.append(
                _Process(
                    job_cmd,
                    best_gpu.idx,
                    mem_needed=job["mem"],
                    util_needed=job["util"],
                    timestamp=time(),
                )
            )
            self.new_processes = new_processes
            # this job is running, so remove it from the list
            # NOTE(review): Collection.remove is deprecated in modern pymongo
            # (delete_one is the replacement) — confirm the driver version before changing.
            self.jobs_db.remove({"_id": job["_id"]})
class PostgresManager(object):
    """Manages a PostgreSQL/repmgr node, running commands locally or over SSH.

    ``node`` is a dict of settings (``hostname``, ``username``, ``password``,
    ``pgversion``, ``cluster``, ``pguser``, ``type``, ``local``, ``privkey``,
    ``pubkey``) used to fill in shell commands and config templates.
    """

    def __init__(self, node):
        super(PostgresManager, self).__init__()
        self.node = node
        # Remote nodes get an SSH connection; local nodes use the `local` helpers.
        if node['local'] != 'true':
            self.connection = SSHConnection(
                node['hostname'], node['username'], node['password'])

    def run(self, cmd):
        """Run a shell command on this node (locally or over SSH) and return its output."""
        if self.node['local'] == 'true':
            return local.shell(cmd)
        else:
            return self.connection.execute(cmd)

    def start(self):
        """Start the PostgreSQL service on this node."""
        self.run('sudo /etc/init.d/postgresql start')

    def stop(self):
        """Stop the PostgreSQL service on this node."""
        self.run('sudo /etc/init.d/postgresql stop')

    def restart(self):
        """Restart the PostgreSQL service on this node."""
        self.run('sudo /etc/init.d/postgresql restart')

    def reload(self):
        """Reload the PostgreSQL configuration on this node."""
        self.run('sudo /etc/init.d/postgresql reload')

    def write_file(self, path, data):
        """Write `data` to `path` on this node."""
        if self.node['local'] == 'true':
            local.write_file(path, data)
        else:
            self.connection.upload_data(data, path)

    def read_file(self, path):
        """Return the contents of `path` on this node."""
        if self.node['local'] == 'true':
            return local.read_file(path)
        else:
            return self.connection.download(path)

    def init(self, master):
        """(Re)create this node's cluster, install config/keys, and register with repmgr.

        :param master: node dict of the master; standby nodes clone from its hostname.
        """
        environ = os.environ.copy()
        environ['PATH'] = '/usr/lib/pgclust:' + environ['PATH']

        # Recreate the cluster from scratch.
        self.stop()
        self.run('sudo pg_dropcluster %(pgversion)s %(cluster)s' % self.node)
        self.run('sudo pg_createcluster %(pgversion)s %(cluster)s' % self.node)
        self.run('sudo chown -R %(pguser)s:%(pguser)s /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)

        # Render the config templates, stage them in /tmp, install, then clean up.
        self.write_file('/tmp/pgpostgresql.conf', template.PG_CONFIG_TEMPLATE % self.node)
        self.write_file('/tmp/pgpg_hba.conf', template.PG_HBA_CONFIG_TEMPLATE % self.node)
        self.write_file('/tmp/pgrepmgr.conf', template.REPMGR_CONFIG_TEMPLATE % self.node)
        self.run('sudo cp /tmp/pgpostgresql.conf /etc/postgresql/%(pgversion)s/%(cluster)s/postgresql.conf' % self.node)
        self.run('sudo cp /tmp/pgpg_hba.conf /etc/postgresql/%(pgversion)s/%(cluster)s/pg_hba.conf' % self.node)
        self.run('sudo cp /tmp/pgrepmgr.conf /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf' % self.node)
        self.run('sudo chmod 644 /etc/postgresql/%(pgversion)s/%(cluster)s/*' % self.node)
        self.run('sudo rm /tmp/pgpostgresql.conf /tmp/pgpg_hba.conf /tmp/pgrepmgr.conf')
        self.run('sudo chown %(pguser)s:%(pguser)s -R /etc/postgresql' % self.node)

        # Install the SSH keypair + config for the postgres user. (The staged
        # /tmp paths previously had a no-op `% self.node` applied — they contain
        # no format placeholders, so it has been dropped.)
        self.write_file('/tmp/sshid_rsa', local.read_file(self.node['privkey']))
        self.write_file('/tmp/sshid_rsa.pub', local.read_file(self.node['pubkey']))
        self.write_file('/tmp/sshconfig', template.SSH_CONFIG)
        self.run('sudo -u %(pguser)s mkdir -p ~%(pguser)s/.ssh' % self.node)
        self.run('sudo cp /tmp/sshid_rsa ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo cp /tmp/sshid_rsa.pub ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo cp /tmp/sshconfig ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo rm /tmp/sshconfig /tmp/sshid_rsa /tmp/sshid_rsa.pub')
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/config' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/id_rsa.pub' % self.node)

        if self.node['type'] == 'master':
            # Master: start, create the repmgr role/db, and register as master.
            self.start()
            self.run('sudo -u %(pguser)s createuser --login --superuser repmgr' % self.node)
            self.run('sudo -u %(pguser)s createdb repmgr' % self.node)
            self.run('sudo -u %(pguser)s repmgr --verbose -f /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf master register' % self.node)
        else:
            # Standby: wipe the data directory, clone from the master, register.
            self.run('sudo -u %(pguser)s rm -rf /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            self.run('sudo -u %(pguser)s mkdir /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            self.run('sudo -u %(pguser)s chmod 700 /var/lib/postgresql/%(pgversion)s/%(cluster)s' % self.node)
            self.run(('sudo -u %(pguser)s PATH="' + environ['PATH'] + '" repmgr --verbose --force -D /var/lib/postgresql/%(pgversion)s/%(cluster)s -d repmgr -p 5432 -U repmgr -R %(pguser)s standby clone ') % self.node + master['hostname'])
            self.start()
            self.run('sudo -u %(pguser)s repmgr -f /etc/postgresql/%(pgversion)s/%(cluster)s/repmgr.conf --verbose standby register' % self.node)

    def update_nodes(self, nodes):
        """Rewrite this node's pg_hba.conf with a replication entry per node.

        NOTE(review): this resolves each node's hostname and overwrites its
        'hostname' entry in place (appending '/32'), mutating the caller's
        dicts — confirm that side effect is intended before changing it.
        """
        records = []
        for node in nodes:
            node['hostname'] = socket.gethostbyname(node['hostname']) + '/32'
            records.append(template.REPLICATION_NODE_TEMPLATE % node)
        pg_hba = '\n\n'.join(records) + template.PG_HBA_CONFIG_TEMPLATE % self.node
        self.write_file('/tmp/pgpg_hba.conf', pg_hba)
        self.run('sudo cp /tmp/pgpg_hba.conf /etc/postgresql/%(pgversion)s/%(cluster)s/pg_hba.conf' % self.node)
        self.run('sudo rm /tmp/pgpg_hba.conf')

    def update_keys(self, nodes):
        """Install authorized_keys (every node's pubkey) and this node's keypair."""
        self.run('sudo -u %(pguser)s mkdir -p ~%(pguser)s/.ssh' % self.node)
        self.run('sudo chmod 700 ~%(pguser)s/.ssh' % self.node)
        keys = []
        for node in nodes:
            key = local.read_file(node['pubkey'])
            keys.append(key)
        self.write_file('/tmp/sshauthorized_keys', '\n'.join(keys))
        self.run('sudo cp /tmp/sshauthorized_keys ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.write_file('/tmp/sshid_rsa', local.read_file(self.node['privkey']))
        self.write_file('/tmp/sshid_rsa.pub', local.read_file(self.node['pubkey']))
        self.run('sudo cp /tmp/sshid_rsa ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo cp /tmp/sshid_rsa.pub ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo rm /tmp/sshauthorized_keys /tmp/sshid_rsa /tmp/sshid_rsa.pub')
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chown %(pguser)s:%(pguser)s ~%(pguser)s/.ssh/id_rsa.pub' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/authorized_keys' % self.node)
        self.run('sudo chmod 600 ~%(pguser)s/.ssh/id_rsa' % self.node)
        self.run('sudo chmod 644 ~%(pguser)s/.ssh/id_rsa.pub' % self.node)

    def update(self, nodes):
        """Push refreshed pg_hba entries and SSH keys for all nodes, then reload."""
        self.update_nodes(nodes.values())
        self.update_keys(nodes.values())
        self.reload()