import os
import time
import cdsw


def fit_models_parallel():
    '''
    Use the CDSW Workers API (via Python SDK) to launch each model fitting
    script in parallel.
    Docs - https://docs.cloudera.com/machine-learning/cloud/distributed-computing/topics/ml-workers-api.html
    '''
    # Launch a separate worker to run each script independently
    base_path = os.getcwd()
    script_path = base_path + '/scripts'
    scripts = os.listdir(script_path)
    # Only run scripts whose names start with 'fit' or 'mak'
    scripts = [
        script_path + '/' + script
        for script in scripts
        if script[0:3] in ['fit', 'mak']
    ]

    for script in scripts:
        cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)

    # Force the session to persist until each worker job has completed.
    # Check for completion every minute.
    complete = False
    while not complete:
        time.sleep(60)
        workers = cdsw.list_workers()
        workers_status = [wkr['status'] for wkr in workers]
        if all(status == 'succeeded' for status in workers_status):
            complete = True
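# A minimal alternative sketch (not from the source): cdsw.await_workers,
# which appears in the TensorFlow example further down, can replace the
# one-minute polling loop above by blocking until every fitting worker exits.
# Assumes launch_workers returns the worker descriptors whose 'id' fields
# await_workers expects.
import cdsw


def fit_models_parallel_blocking(scripts):
    # Launch one worker per fitting script and collect their descriptors
    workers = []
    for script in scripts:
        workers += cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)
    ids = [worker['id'] for worker in workers]
    # Block until all fitting workers have completed
    cdsw.await_workers(ids, wait_for_completion=True)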
def run_dask_workers(n, cpu, memory, nvidia_gpu=0, scheduler_port=default_scheduler_port):
    """
    Run a CDSW worker, and run a Dask worker inside it.
    Assumes that the scheduler is running on the CDSW master.
    """
    worker_code = """
import cdsw_dask_utils
worker_proc = cdsw_dask_utils._run_dask_worker_in_worker(scheduler_port=%d)

# Keep the CDSW worker alive until the Dask worker exits.
print(worker_proc.wait())
""" % scheduler_port

    workers = cdsw.launch_workers(
        n=n,
        cpu=cpu,
        memory=memory,
        nvidia_gpu=nvidia_gpu,
        kernel="python3",
        code=worker_code
    )

    ids = [worker['id'] for worker in workers]
    print("IDs", ids)

    # Wait for the workers to start running, but don't wait for them to exit -
    # we want them to stay up for use as daemons.
    cdsw_await_workers.await_workers(ids, wait_for_completion=False)
    return workers
def run_dask_workers(n, cpu, memory, nvidia_gpu=0, scheduler_port=default_scheduler_port):
    """
    Run a CDSW worker, and run a Dask worker inside it.
    Assumes that the scheduler is running on the CDSW master.
    """
    worker_code = """
import cdsw_dask_utils
worker_proc = cdsw_dask_utils._run_dask_worker_in_worker(scheduler_port=%d)

# Keep the CDSW worker alive until the Dask worker exits.
print(worker_proc.wait())
""" % scheduler_port

    workers = cdsw.launch_workers(
        n=n,
        cpu=cpu,
        memory=memory,
        nvidia_gpu=nvidia_gpu,
        kernel="python3",
        code=worker_code
    )

    try:
        ids = [worker['id'] for worker in workers]
    except KeyError:
        # Surface the Kubernetes error message for every worker that failed to launch
        errors = [[worker['k8sMessage'], worker['engineId']] for worker in workers]
        for error in errors:
            print('worker {} failed to launch with err message : {}'.format(error[1], error[0]))
        raise RuntimeError("failed to launch workers with err : " + error[0])

    print("IDs", ids)

    # Wait for the workers to start running, but don't wait for them to exit -
    # we want them to stay up for use as daemons.
    cdsw_await_workers.await_workers(ids, wait_for_completion=False)
    return workers
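# Usage sketch for run_dask_workers (an assumption-laden illustration, not
# part of the source module): it presumes a Dask scheduler is already
# listening on default_scheduler_port in this master session and that the
# dask.distributed package is installed. CDSW_IP_ADDRESS is the same master
# address used in the scheduler example further down.
import os
from dask.distributed import Client

workers = run_dask_workers(n=2, cpu=1, memory=2)
client = Client("tcp://%s:%d" % (os.environ["CDSW_IP_ADDRESS"],
                                 default_scheduler_port))
print(client.scheduler_info())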
import cdsw
import json
import time

worker_code = '''
import os
engine_id = os.environ.get('CDSW_ENGINE_ID')
print('executing a whole bunch of code inside worker: {}'.format(engine_id))
'''

workers = cdsw.launch_workers(n=2, cpu=1, memory=1, code=worker_code)

# Get each worker's ID
for worker in workers:
    print(worker['id'])

# Get each worker's full description.
# Wait 10 secs for the workers to come up.
time.sleep(10)
for worker in workers:
    print(json.dumps(worker, indent=4))
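# Cleanup sketch (assumes the `workers` list from the snippet above is still
# in scope): cdsw.stop_workers, also used in the TensorFlow example below,
# tears the launched engines down by ID once they are no longer needed.
worker_ids = [worker['id'] for worker in workers]
cdsw.stop_workers(*worker_ids)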
# master.py
import cdsw, socket

# Launch a CDSW worker. Workers are engines that run in the same
# project, execute a given piece of code or script, and exit.
workers = cdsw.launch_workers(n=1, cpu=2, memory=4, kernel="python3",
                              script="worker.py")

# Listen on TCP port 6001
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0", 6001))
s.listen(1)

# Accept a connection from the worker, which executes worker.py
conn, addr = s.accept()
for i in range(1):
    # Receive a message from the worker and return a response
    data = conn.recv(20)
    if not data:
        break
    print("Master received:", data)
    conn.send("Hello From Server!".encode())
conn.close()
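# worker.py - a companion sketch for master.py above, not taken from the
# source. Assumes the worker engine exposes the master session's address via
# the CDSW_MASTER_IP environment variable and that the master is listening on
# port 6001 as bound above.
import os, socket

# Open a TCP connection back to the master session
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((os.environ["CDSW_MASTER_IP"], 6001))

# Send a greeting and print the master's reply
s.send("Hello From Worker!".encode())
data = s.recv(20)
print("Worker received:", data)
s.close()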
def run_cluster(n_workers, n_ps, cpu, memory, nvidia_gpu=0,
                worker_script=None, timeout_seconds=60):
    try:
        os.mkdir("/home/cdsw/.tmp", mode=0o755)
    except OSError:
        # The directory may already exist
        pass
    fname = tempfile.mkdtemp(prefix="/home/cdsw/.tmp/clusterspec")

    # Launch the TensorFlow workers as CDSW workers
    worker_code = tensorflow_worker_code(fname, "worker", worker_script)
    workers = cdsw.launch_workers(n_workers,
                                  cpu=cpu,
                                  memory=memory,
                                  nvidia_gpu=nvidia_gpu,
                                  code=worker_code)
    worker_ids = [worker["id"] for worker in workers]

    # Launch the parameter servers, if any
    if n_ps > 0:
        ps_code = tensorflow_worker_code(fname, "ps", None)
        parameter_servers = cdsw.launch_workers(n_ps,
                                                cpu=cpu,
                                                memory=memory,
                                                code=ps_code)
        ps_ids = [ps["id"] for ps in parameter_servers]
    else:
        parameter_servers = []
        ps_ids = []

    # Get the IP addresses of the workers. First, wait for them all to run.
    running_workers = cdsw.await_workers(worker_ids,
                                         wait_for_completion=False,
                                         timeout_seconds=timeout_seconds)
    if running_workers["failures"]:
        raise RuntimeError("Some workers failed to run")

    # Then extract the IPs from the dictionary describing them.
    worker_ips = [worker["ip_address"]
                  for worker in running_workers["workers"]]

    # Get the IP addresses of the parameter servers, if any
    ps_ips = []
    if n_ps > 0:
        running_ps = cdsw.await_workers(ps_ids,
                                        wait_for_completion=False,
                                        timeout_seconds=timeout_seconds)
        if running_ps["failures"]:
            raise RuntimeError("Some parameter servers failed to run")
        ps_ips = [ps["ip_address"] for ps in running_ps["workers"]]

    # Write the cluster spec atomically so workers never read a partial file
    cspec = {
        "worker": [ip + (":%d" % tf_port) for ip in worker_ips],
        "ps": [ip + (":%d" % tf_port) for ip in ps_ips]
    }
    tmpf = fname + "/cluster.json.tmp"
    f = open(tmpf, 'w')
    f.write(json.dumps(cspec))
    f.flush()
    os.fsync(f.fileno())
    f.close()
    os.rename(tmpf, fname + "/cluster.json")

    if worker_script is not None:
        # If a script has been provided for the TensorFlow workers,
        # wait for them all to exit.
        cdsw.await_workers(worker_ids, wait_for_completion=True)
        cdsw.stop_workers(*ps_ids)
        return None, None
    else:
        # If no script has been provided, wait for the TensorFlow
        # cluster to come up, then return a handle to the lead worker
        # so the user can create a TensorFlow session.

        # Wait for the workers and parameter servers to be up
        for ip in worker_ips:
            wait.tcp.open(tf_port, host=ip)
        for ip in ps_ips:
            wait.tcp.open(tf_port, host=ip)
        return cspec, "grpc://%s:%d" % (worker_ips[0], tf_port)
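# Usage sketch (assumptions: TensorFlow 1.x, the helpers and tf_port defined
# in this module, and no worker_script so run_cluster returns a cluster spec
# plus a gRPC target for the lead worker). Not part of the source module.
import tensorflow as tf

cluster_spec, session_addr = run_cluster(n_workers=2, n_ps=1, cpu=1, memory=2)

# Run a trivial op against the remote lead worker to confirm the cluster is up
sess = tf.Session(session_addr)
print(sess.run(tf.constant("cluster is up")))
sess.close()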
import os
import time
import cdsw

# The Dask scheduler runs in this master session
schedulerip = os.environ["CDSW_IP_ADDRESS"]
print(" Scheduler IP: " + schedulerip)

# Scheduler protocol and port - defaults from Dask
schproto = "tcp://"
schport = ":8786"
schloc = schproto + schedulerip + schport
print(" Scheduler URL: " + schloc)

# Launch the first Dask worker, passing it the scheduler URL through an
# environment variable
dask_client = cdsw.launch_workers(n=1, cpu=4, memory=8, kernel="python3",
                                  script="daskworker.py",
                                  env={"DASKSCHURL": schloc})
time.sleep(10)

# Launch a second Dask worker
dask_client = cdsw.launch_workers(n=1, cpu=4, memory=8, kernel="python3",
                                  script="daskworker.py",
                                  env={"DASKSCHURL": schloc})
# Wait for a while until the container is launched successfully
time.sleep(10)
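# daskworker.py - a sketch of the worker script referenced above, not taken
# from the source. Assumes the `distributed` package is installed so the
# dask-worker CLI is on the PATH, and that DASKSCHURL was passed in via the
# env argument to launch_workers as shown above.
import os
import subprocess

scheduler_url = os.environ["DASKSCHURL"]
print("Connecting Dask worker to: " + scheduler_url)

# Run the Dask worker in the foreground so the CDSW engine stays alive
# for as long as the worker does
subprocess.run(["dask-worker", scheduler_url], check=True)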