def __init__(self):
    """Initialise dispatcher state and register Pyro deserialization hooks.

    Registers dict-to-class converters for the ``worker_node.WorkerNode``
    and ``job.Job`` class keys, and creates a certificate-checking proxy to
    the NFS exporter service at ``localhost:9091``.
    """
    # Class key -> converter used to rebuild objects received as dicts.
    deserializers = {
        "worker_node.WorkerNode": WorkerNode.node_dict_to_class,
        "job.Job": Job.job_dict_to_class,
    }

    self.daemon = None  # no daemon object yet
    self.workers = []
    for class_key, converter in deserializers.items():
        SerializerBase.register_dict_to_class(class_key, converter)
    self.nfs_exporter = CertCheckingProxy('PYRO:NfsExporter@localhost:9091')
    self.first_run = True
class JobDispatcher:
    """Pyro4-exposed job dispatcher that workers connect to.

    In this class ``report`` and ``get_job`` are empty placeholders that
    return ``None``; ``join_work`` simply records the joining node.

    NOTE: the statements in the class body configure Pyro4 globally as a
    side effect of defining the class.
    """

    Pyro4.config.SSL = True
    Pyro4.config.SSL_REQUIRECLIENTCERT = True  # 2-way ssl
    Pyro4.config.SSL_SERVERCERT = f"{SSL_CERTS_DIR}{LOCAL_HOSTNAME}.crt"
    Pyro4.config.SSL_SERVERKEY = f"{SSL_CERTS_DIR}{LOCAL_HOSTNAME}.key"
    # to make ssl accept the self-signed master cert
    Pyro4.config.SSL_CACERTS = f"{SSL_CERTS_DIR}rootCA.crt"

    # The dispatcher is also a Pyro client (it talks to a local NFS service),
    # so the server certificate/key pair is reused as the client pair.
    Pyro4.config.SSL_CLIENTCERT = Pyro4.config.SSL_SERVERCERT
    Pyro4.config.SSL_CLIENTKEY = Pyro4.config.SSL_SERVERKEY

    def __init__(self):
        """Initialise state, register the WorkerNode hook, create the NFS proxy."""
        self.daemon = None  # assigned in start()
        self.workers = []
        SerializerBase.register_dict_to_class(
            "worker_node.WorkerNode",
            WorkerNode.node_dict_to_class,
        )
        self.nfs_exporter = CertCheckingProxy(
            'PYRO:NfsExporter@localhost:9091')
        self.first_run = True

    @Pyro4.expose
    def test(self):
        """Return a fixed string so callers can check the connection."""
        return "connection ok"

    @Pyro4.expose
    def get_worker_options(self):
        """Return the options dict sent to workers (NFS mount tuning flags)."""
        # An alternative tuning that was tried: 'noacl,nocto,noatime,nodiratime'
        return {"nfs_tuning": ['-o', 'async']}

    @Pyro4.expose
    def join_work(self, node):
        """Record ``node`` in the list of joined workers."""
        self.workers.append(node)

    @Pyro4.expose
    def report(self, node, job, exit_status, export_range=None):
        """Placeholder: accepts a job result and does nothing."""
        pass

    @Pyro4.expose
    def get_job(self, n):
        """Placeholder: does nothing and returns ``None``."""
        pass

    def start(self):
        """Check the NFS exporter, then serve the dispatcher on port 9090.

        Prints a hint and returns early when the local NFS exporter cannot
        be reached. Otherwise blocks inside the daemon's request loop.
        """
        print("selected workflow:", settings.dispatcher["workflow"])

        try:
            self.nfs_exporter.test()
        except Pyro4.errors.CommunicationError as err:
            print(
                "Can't connect to local NFS exporter service, make sure it's running.\n",
                err)
        else:
            self.daemon = CertValidatingDaemon(host=LOCAL_HOSTNAME, port=9090)
            uri = self.daemon.register(self, "JobDispatcher")
            print("Job dispatcher ready. URI:", uri)
            self.daemon.requestLoop()
def setup(self):
    """Install the SIGINT handler and create the node's runtime connections.

    Reads the master address from the ``whoismaster`` file, sets the default
    mount point under the user's home directory, builds the proxies to the
    job dispatcher (port 9090 on the master) and to the local NFS mounter
    (port 9092), and registers the ExportRange and Job deserialization hooks.
    """
    signal.signal(signal.SIGINT, self.termination_handler)

    master_file_path = OD_FOLDER + SSL_CERTS_DIR + 'whoismaster'
    with open(master_file_path) as master_file:
        master_address = master_file.read().strip()

    self.MASTER_ADDRESS = master_address
    self.MOUNTPOINT_DEFAULT = f"{Path.home()}/olive-share/"

    self.job_dispatcher = CertCheckingProxy(
        f"PYRO:JobDispatcher@{master_address}:9090")
    self.nfs_mounter = CertCheckingProxy('PYRO:NfsMounter@localhost:9092')

    # Converters used to rebuild objects that arrive as plain dicts.
    SerializerBase.register_dict_to_class(
        "job.ExportRange", ExportRange.export_range_dict_to_class)
    SerializerBase.register_dict_to_class("job.Job", Job.job_dict_to_class)
def __init__(self, address):
    """Create a worker node identified by ``address``.

    Besides initialising the bookkeeping attributes, the constructor reads
    the master address from the ``whoismaster`` file, creates the proxies to
    the job dispatcher and the local NFS mounter, and registers the
    ExportRange and Job deserialization hooks.
    """
    with open(SSL_CERTS_DIR + 'whoismaster') as master_file:
        master_address = master_file.read().strip()
    self.MASTER_ADDRESS = master_address
    self.MOUNTPOINT_DEFAULT = f"{Path.home()}/olive-share"

    # Identity and benchmark scores.
    self.address = address
    self.cpu_score = 0
    self.net_score = 0

    # Current job and the last timing sample.
    self._job_start_time = None
    self._job = None
    self.sample_weight = None
    self.sample_time = None

    self.worker_options = {}

    self.job_dispatcher = CertCheckingProxy(
        f"PYRO:JobDispatcher@{master_address}:9090")
    self.nfs_mounter = CertCheckingProxy('PYRO:NfsMounter@localhost:9092')

    # Converters used to rebuild objects that arrive as plain dicts.
    SerializerBase.register_dict_to_class(
        "job.ExportRange", ExportRange.export_range_dict_to_class)
    SerializerBase.register_dict_to_class("job.Job", Job.job_dict_to_class)
class WorkerNode:
    """Worker that pulls export jobs from the dispatcher and runs them.

    NOTE: the statements in the class body configure Pyro4 (and replace
    ``sys.excepthook``) as a side effect of defining the class.
    """

    Pyro4.config.SSL = True
    # to make ssl accept the self-signed node cert
    Pyro4.config.SSL_CACERTS = SSL_CERTS_DIR + "rootCA.crt"
    Pyro4.config.SSL_CLIENTCERT = SSL_CERTS_DIR + LOCAL_HOSTNAME + ".crt"
    Pyro4.config.SSL_CLIENTKEY = SSL_CERTS_DIR + LOCAL_HOSTNAME + ".key"
    sys.excepthook = Pyro4.util.excepthook

    def __init__(self, address):
        """Create a node identified by ``address`` and connect its proxies.

        Reads the master address from the ``whoismaster`` file, creates the
        proxies to the job dispatcher and the local NFS mounter, and
        registers the ExportRange and Job deserialization hooks.
        """
        with open(SSL_CERTS_DIR + 'whoismaster') as f:
            self.MASTER_ADDRESS = f.read().strip()
        self.MOUNTPOINT_DEFAULT = str(Path.home()) + '/olive-share'
        self.address = address
        self.cpu_score = 0
        self.net_score = 0
        self._job_start_time = None
        self._job = None
        self.sample_weight = None
        self.sample_time = None
        self.worker_options = dict()
        self.job_dispatcher = CertCheckingProxy('PYRO:JobDispatcher@' +
                                                self.MASTER_ADDRESS + ':9090')
        self.nfs_mounter = CertCheckingProxy('PYRO:NfsMounter@' + 'localhost' +
                                             ':9092')
        SerializerBase.register_dict_to_class(
            "job.ExportRange", ExportRange.export_range_dict_to_class)
        SerializerBase.register_dict_to_class("job.Job", Job.job_dict_to_class)

    def job_eta(self, j=None):
        """Estimate a duration in seconds from the last completed job.

        With ``j`` given, scale the last sample linearly by job weight.
        Without ``j``, return the estimated time left for the current job,
        or 0 when no job is running. When no sample exists yet the largest
        signed 64-bit integer is returned as an "unknown" value.
        """
        if self.sample_time is None or self.sample_weight is None:
            return 9223372036854775807
        if j is not None:
            t = (j.job_weight * self.sample_time) / self.sample_weight
        elif self._job is not None:
            # Full estimate for the running job minus the time already spent.
            t = self.job_eta(self._job) - (time.time() - self._job_start_time)
        else:
            t = 0
        return t

    def run_benchmark(self):
        """Fill ``cpu_score`` from the benchmark script and print it.

        ``net_score`` is a random value in [1, 10). The random ``cpu_score``
        is a placeholder that the script output replaces right after.
        """
        import random
        self.cpu_score = random.randrange(1, 10)
        self.net_score = random.randrange(1, 10)
        self.cpu_score = float(
            subprocess.run(['bench/bench-host.sh'],
                           stdout=subprocess.PIPE).stdout)
        print("node", self.address, "\t\tCPU:", self.cpu_score)

    def run(self):
        """Connect to the dispatcher, join, and process jobs; never returns.

        Retries once per second while the dispatcher is unreachable, and
        rejoins one second after the job loop returns.
        """
        while True:
            try:
                print(self.job_dispatcher.test())
            except Pyro4.errors.CommunicationError as e:
                print(e, "\nCan't connect to dispatcher, retrying...")
                time.sleep(1)
                continue
            if self.cpu_score is None or self.cpu_score == 0:
                self.run_benchmark()
            self.worker_options.update(
                self.job_dispatcher.get_worker_options())
            self.job_dispatcher.join_work(self)
            self.__run()
            time.sleep(1)

    def __run(self):
        """Fetch and execute jobs until aborted or the connection fails.

        ``job_path`` values ``"abort"`` and ``"retry"`` are control messages:
        abort unmounts the share and returns; retry sleeps for
        ``job_weight`` seconds and asks again.
        """
        while True:
            try:
                j, export_range = self.job_dispatcher.get_job(self)
            except Pyro4.errors.CommunicationError:
                return
            print("got job:", j,
                  (export_range if export_range is not None else ""))
            if j.job_path == "abort":
                print(self.address, "\tterminating...")
                self.nfs_mounter.umount(self.MOUNTPOINT_DEFAULT)
                return
            if j.job_path == "retry":
                time.sleep(j.job_weight)
                continue
            # mount the NFS share before starting
            if self.nfs_mounter.mount(j.job_path, self.MASTER_ADDRESS,
                                      self.MOUNTPOINT_DEFAULT,
                                      self.worker_options["nfs_tuning"]) != 0:
                self.job_dispatcher.report(self, j, -1, export_range)
                return
            self.run_job(j, export_range)

    def run_job(self, j, export_range):
        """Run ``olive-editor`` for job ``j`` and report the exit code.

        ``export_range`` may be ``None`` (whole project), an ExportRange, or
        its dict form. The exporter runs with the mount point as its working
        directory. After a successful report the job's weight and elapsed
        time are stored as the sample used by ``job_eta``.
        """
        self._job_start_time = time.time()
        self._job = j
        project_path = j.job_path[j.job_path.rfind("/") + 1:]
        olive_args = ['olive-editor', project_path, '-e']
        if export_range is not None:
            # Here we need to call deserialization manually because of dynamic typing
            # ( not all implementations of dispatcher return (Job, ExportRange) )
            if isinstance(export_range, dict):
                export_range = ExportRange.export_range_dict_to_class(
                    "job.ExportRange", export_range)
            olive_args.append(str(export_range.instance_id))
            olive_args.append('--export-start')
            olive_args.append(str(export_range.start))
            olive_args.append('--export-end')
            olive_args.append(str(export_range.end))

        # Pass cwd= rather than chdir-ing this process there and back: the
        # working directory of the worker itself is then never changed, so it
        # cannot be left pointing at the share if the call raises.
        olive_export = subprocess.run(olive_args,
                                      cwd=self.MOUNTPOINT_DEFAULT,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)

        if olive_export.returncode == 0:
            print("Exported successfully:", j.job_path,
                  (export_range.number if export_range is not None else ""))
        else:
            print("Error exporting", j.job_path, "\n", olive_export.stdout,
                  olive_export.stderr)

        # If we completed a full job, umount. Otherwise umount on abort
        if not j.split:
            self.nfs_mounter.umount(self.MOUNTPOINT_DEFAULT)
        try:
            self.job_dispatcher.report(self, j, olive_export.returncode,
                                       export_range)
        except Pyro4.errors.ConnectionClosedError:
            return
        self.sample_weight = j.job_weight
        self.sample_time = time.time() - self._job_start_time
        self._job = None

    @staticmethod
    def node_dict_to_class(classname, d):
        """Rebuild a WorkerNode from its serialized dict ``d``.

        ``classname`` is accepted because the hook is called with it, but it
        is not used. Note that this runs the full constructor.
        """
        r = WorkerNode(d["address"])
        r.cpu_score = d["cpu_score"]
        r.net_score = d["net_score"]
        r._job_start_time = d["_job_start_time"]
        r.sample_weight = d["sample_weight"]
        r.sample_time = d["sample_time"]
        return r
class WorkerNode:
    """Worker that pulls export jobs from the dispatcher and runs them.

    Construction only initialises attributes; ``setup`` must be called
    before ``run`` to create the proxies and read the master address.

    NOTE: the statements in the class body configure Pyro4 (and replace
    ``sys.excepthook``) as a side effect of defining the class.
    """

    Pyro4.config.SSL = True
    # to make ssl accept the self-signed node cert
    Pyro4.config.SSL_CACERTS = OD_FOLDER + SSL_CERTS_DIR + "rootCA.crt"
    Pyro4.config.SSL_CLIENTCERT = OD_FOLDER + SSL_CERTS_DIR + LOCAL_HOSTNAME + ".crt"
    Pyro4.config.SSL_CLIENTKEY = OD_FOLDER + SSL_CERTS_DIR + LOCAL_HOSTNAME + ".key"
    sys.excepthook = Pyro4.util.excepthook

    def __init__(self, address):
        """Create a node identified by ``address`` without doing any I/O."""
        self.TEMP_DIR = '/tmp/olive'
        self.address = address
        self.cpu_score = 0
        self.net_score = 0
        self._job_start_time = None
        self._job = None
        self.sample_weight = None
        self.sample_time = None
        self.worker_options = dict()
        self.olive_export_process = None
        self.terminating = False
        # Filled in by setup().
        self.MASTER_ADDRESS = None
        self.MOUNTPOINT_DEFAULT = None
        self.job_dispatcher = None
        self.nfs_mounter = None

    def setup(self):
        """Install the SIGINT handler and create the runtime connections.

        Reads the master address from the ``whoismaster`` file, sets the
        default mount point, creates the proxies to the job dispatcher and
        the local NFS mounter, and registers the ExportRange and Job
        deserialization hooks.
        """
        signal.signal(signal.SIGINT, self.termination_handler)
        with open(OD_FOLDER + SSL_CERTS_DIR + 'whoismaster') as f:
            self.MASTER_ADDRESS = f.read().strip()
        self.MOUNTPOINT_DEFAULT = str(Path.home()) + '/olive-share/'
        self.job_dispatcher = CertCheckingProxy('PYRO:JobDispatcher@' +
                                                self.MASTER_ADDRESS + ':9090')
        self.nfs_mounter = CertCheckingProxy('PYRO:NfsMounter@' + 'localhost' +
                                             ':9092')
        SerializerBase.register_dict_to_class(
            "job.ExportRange", ExportRange.export_range_dict_to_class)
        SerializerBase.register_dict_to_class("job.Job", Job.job_dict_to_class)

    def job_eta(self, j=None):
        """Estimate a duration in seconds from the last completed job.

        With ``j`` given, scale the last sample linearly by job weight.
        Without ``j``, return the estimated time left for the current job,
        or 0 when no job is running. When no sample exists yet the largest
        signed 64-bit integer is returned as an "unknown" value.
        """
        if self.sample_time is None or self.sample_weight is None:
            return 9223372036854775807
        if j is not None:
            t = (j.job_weight * self.sample_time) / self.sample_weight
        elif self._job is not None:
            # Full estimate for the running job minus the time already spent.
            t = self.job_eta(self._job) - (time.time() - self._job_start_time)
        else:
            t = 0
        return t

    def termination_handler(self, signum, frame):
        """SIGINT handler: flag the loops to stop and exit with status 0."""
        print("stopping threads and clean termination...")
        self.terminating = True
        # sys.exit instead of quit(): quit is an interactive helper installed
        # by the site module and is not guaranteed to exist.
        sys.exit(0)

    def __connection_watchdog(self):
        """Every 5 seconds ping the dispatcher; abort the export if it is gone."""
        while not self.terminating:
            time.sleep(5)
            try:
                self.job_dispatcher.test()
            except Pyro4.errors.CommunicationError:
                # Snapshot the attribute: run_job() resets it to None from the
                # main thread, which could happen between check and use.
                export_process = self.olive_export_process
                if export_process is not None:
                    print(
                        "Lost connection to the master, aborting ongoing exports..."
                    )
                    export_process.terminate()

    def run_benchmark(self):
        """Fill ``cpu_score`` from the benchmark script and print it.

        ``net_score`` is a random value in [1, 10). The random ``cpu_score``
        is a placeholder that the script output replaces right after.
        """
        import random
        self.cpu_score = random.randrange(1, 10)
        self.net_score = random.randrange(1, 10)
        self.cpu_score = float(
            subprocess.run([OD_FOLDER + 'bench/bench-host.sh'],
                           stdout=subprocess.PIPE).stdout)
        print("node", self.address, "\t\tCPU:", self.cpu_score)

    def run(self):
        """Start the watchdog, then join the dispatcher and process jobs.

        Loops until ``terminating`` is set: retries once per second while
        the dispatcher is unreachable, and rejoins one second after the job
        loop returns.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.TEMP_DIR, exist_ok=True)
        threading.Thread(target=self.__connection_watchdog).start()
        while not self.terminating:
            try:
                print(self.job_dispatcher.test())
            except Pyro4.errors.CommunicationError as e:
                print(e, "\nCan't connect to dispatcher, retrying...")
                time.sleep(1)
                continue
            if self.cpu_score is None or self.cpu_score == 0:
                self.run_benchmark()
            self.worker_options.update(
                self.job_dispatcher.get_worker_options())
            self.job_dispatcher.join_work(self)
            self.__run()
            time.sleep(1)

    def __run(self):
        """Fetch and execute jobs until aborted or the connection fails.

        ``job_path`` values ``"abort"`` and ``"retry"`` are control messages:
        abort unmounts the share and returns; retry sleeps for
        ``job_weight`` seconds and asks again.
        """
        while True:
            try:
                j, export_range = self.job_dispatcher.get_job(self)
            except Pyro4.errors.CommunicationError:
                return
            print("got job:", j,
                  (export_range if export_range is not None else ""))
            if j.job_path == "abort":
                print(self.address, "\tterminating...")
                self.nfs_mounter.umount(self.MOUNTPOINT_DEFAULT)
                return
            if j.job_path == "retry":
                time.sleep(j.job_weight)
                continue
            # mount the NFS share before starting
            if self.nfs_mounter.mount(j.job_path, self.MASTER_ADDRESS,
                                      self.MOUNTPOINT_DEFAULT,
                                      self.worker_options["nfs_tuning"]) != 0:
                self.job_dispatcher.report(self, j, -1, export_range)
                return
            self.run_job(j, export_range)

    def run_job(self, j, export_range):
        """Export job ``j`` locally, move the result to the share, report.

        ``export_range`` may be ``None`` (whole project), an ExportRange, or
        its dict form. The exporter runs with ``TEMP_DIR`` as its working
        directory; the resulting ``.mp4`` is then moved to the mount point
        and every file left in ``TEMP_DIR`` is deleted. The reported code is
        the exporter's return code, or 1 when it succeeded but the move
        failed. After a successful report the job's weight and elapsed time
        are stored as the sample used by ``job_eta``.
        """
        self._job_start_time = time.time()
        self._job = j
        project_name = j.job_path[j.job_path.rfind("/") + 1:]
        olive_args = [
            'olive-editor', self.MOUNTPOINT_DEFAULT + project_name, '-e'
        ]
        if export_range is not None:
            # Here we need to call deserialization manually because of dynamic typing
            # ( not all implementations of dispatcher return (Job, ExportRange) )
            if isinstance(export_range, dict):
                export_range = ExportRange.export_range_dict_to_class(
                    "job.ExportRange", export_range)
            olive_args.append(str(export_range.instance_id))
            olive_args.append('--export-start')
            olive_args.append(str(export_range.start))
            olive_args.append('--export-end')
            olive_args.append(str(export_range.end))

        # Do the actual export with the given parameters. cwd= keeps the
        # working directory of this (multi-threaded) process untouched.
        export_process = subprocess.Popen(olive_args,
                                          cwd=self.TEMP_DIR,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        self.olive_export_process = export_process
        # communicate() drains both pipes while waiting; a plain wait() can
        # deadlock once the child fills a pipe nobody is reading.
        export_process.communicate()

        if export_range is not None:
            # str(): the id is not guaranteed to be a string (see olive_args).
            export_name = str(export_range.instance_id) + ".mp4"
        else:
            export_name = project_name + ".mp4"

        # Move the exported video to the NFS share
        try:
            shutil.move(os.path.join(self.TEMP_DIR, export_name),
                        self.MOUNTPOINT_DEFAULT)
            file_moved = True
        except OSError:
            file_moved = False

        # cleanup partial files; os.walk yields bare names, so join with root
        for root, _dirs, files in os.walk(self.TEMP_DIR):
            for leftover in files:
                os.remove(os.path.join(root, leftover))

        range_label = export_range.number if export_range is not None else ""
        if export_process.returncode == 0 and file_moved:
            print("Job done:", j.job_path, range_label)
        else:
            print("Error exporting", j.job_path, range_label)

        return_code = int(export_process.returncode or not file_moved)
        self.olive_export_process = None

        # If we completed a full job, umount. Otherwise umount on abort
        if not j.split:
            self.nfs_mounter.umount(self.MOUNTPOINT_DEFAULT)
        try:
            self.job_dispatcher.report(self, j, return_code, export_range)
        except (Pyro4.errors.ConnectionClosedError,
                Pyro4.errors.CommunicationError, ConnectionRefusedError):
            return
        self.sample_weight = j.job_weight
        self.sample_time = time.time() - self._job_start_time
        self._job = None

    @staticmethod
    def node_dict_to_class(classname, d):
        """Rebuild a WorkerNode from its serialized dict ``d``.

        ``classname`` is accepted because the hook is called with it, but it
        is not used. ``setup`` is not called on the rebuilt node.
        """
        r = WorkerNode(d["address"])
        r.cpu_score = d["cpu_score"]
        r.net_score = d["net_score"]
        r._job_start_time = d["_job_start_time"]
        r.sample_weight = d["sample_weight"]
        r.sample_time = d["sample_time"]
        return r