def get_server_state(srv):
    """Return the provider-reported state string for *srv*.

    Cached node objects go stale, so the node list is re-fetched on every
    call. If the server is no longer listed, assume it was terminated.
    """
    fresh_nodes = config.driver.list_nodes()  # need to refresh node to get state
    matches = [n for n in fresh_nodes if n.uuid.find(srv.uuid) == 0]
    if not matches:
        vprint("Cannot find server to determine state; assuming terminated")
        return 'terminated'
    return matches[0].state
def terminate_server(srv):
    """Destroy *srv* and block until the provider reports it terminated.

    Returns:
        "success" once the node reaches the 'terminated' state, or
        "error terminating server" if the destroy call itself fails.
    """
    result = config.driver.destroy_node(srv)
    if not result:
        return "error terminating server"
    while True:
        state = get_server_state(srv)
        vprint("server state:", state)
        if state == 'terminated':
            break
        # Fix: sleep only *between* polls. The original slept an extra 2s
        # after the node had already reached the terminal state, and only
        # reported the state after sleeping.
        time.sleep(2)
    return "success"
def stop_server(srv):
    """Stop *srv* and block until the provider reports it stopped.

    Returns:
        "success" once the node reaches the 'stopped' state, or
        "error stopping server" if the stop call itself fails.
    """
    result = srv.stop_node()
    if not result:
        return "error stopping server"
    while True:
        state = get_server_state(srv)
        vprint("server state:", state)
        if state == 'stopped':
            break
        # Fix: sleep only *between* polls (the original slept one extra
        # 2s interval after the target state had been reached).
        time.sleep(2)
    return "success"
def start_server(srv):
    """Start a stopped node and block until it is running with a usable public IP.

    Returns the refreshed node object on success, or the string
    "error starting server" if the provider's start call fails.
    """
    result = srv.start()
    if not result:
        return "error starting server"
    state = None
    # Poll until the provider reports the node as running.
    while state != 'running':
        state = get_server_state(srv)
        time.sleep(2)
        vprint("server state:", state)
    vprint("Waiting for public IP address to be assigned")
    config.driver.wait_until_running([srv])
    vprint("Public IPs:", srv.public_ips)
    # An all-None IP list is treated the same as an empty one (GCE has been
    # observed to briefly report [None]).
    while len(srv.public_ips) == 0 or srv.public_ips.count(None) == len(
            srv.public_ips):  #Really? Google? [None]????
        # srv = config.driver.list_nodes(ex_node_ids=[srv.id])[0]
        srv = get_server(
            uuid=srv.uuid)  #seems necessary to refresh to update state
        vprint("Public IPs:", srv.public_ips)
        time.sleep(5)
    return srv
def burst(args,
          sshuser=None,
          url=None,
          uuid=None,
          burst_user=None,
          gpu=False,
          ports=None,
          stop=False,
          image=None,
          vmtype=None,
          pubkey=None,
          dockerfile="Dockerfile",
          cloudmap="",
          dockerdport=2376,
          bgd=False,
          sync_only=False,
          conf=None):
    """Build, synchronize and run the user's dockerized workload on a burst
    server (remote when url/uuid/burst_user is given, local otherwise).

    Returns None on success, or the string "Exception" on any failure;
    errors are printed rather than re-raised.
    NOTE(review): relies on project helpers defined elsewhere in this
    package (get_server, launch_server, ssh_tunnel, run, do_ssh, vprint,
    vvprint, v0print, get_config, ...).
    """
    error = None
    tunnel = None  # ssh tunnel process; killed in the epilogue if opened
    try:
        # Refuse to run without the files that keep docker/rsync traffic sane.
        if not os.path.exists(dockerfile):
            raise Exception("Dockerfile not found")
        if not os.path.exists(".dockerignore"):
            raise Exception("""
.dockerignore file not found. Burst requires a .dockerignore to avoid sending excess data to docker build.
Because the working directory is rsync'd to the remote host, you typically only need to send the Dockerfile
and files that are referred to (such as requirements.txt) to the build daemon.

#Template .dockerignore

#Default to ignoring everything:
**
#exceptions (These will be sent to the docker daemon for building):
!/Dockerfile*
!requirements.txt
""")
        if not os.path.exists(".burstignore"):
            raise Exception("""
.burstignore file not found. Burst requires a .burstignore to avoid synchronizing irrelevant data
(such as hidden files) with the remote server.

Here is a template, copy this to .burstignore in your project directory:
.*
venv
__pycache__
""")

        #if url specified, split into user & IP
        if url:
            if not sshuser:
                sshuser, url = url.split('@')

        #launch, restart, or reconnect to node
        node = None
        #unless running --local:
        if url or uuid or burst_user:
            #if server does not exist, launch a fresh one
            fresh = False
            restart = False
            node = get_server(url=url, uuid=uuid, name=burst_user, conf=conf)
            if burst_user and not node:
                # print ("PUBKEY:", pubkey)
                node = launch_server(burst_user,
                                     pubkey=pubkey,
                                     vmtype=vmtype,
                                     image=image,
                                     conf=conf,
                                     user=sshuser,
                                     gpu=gpu)
                fresh = True
                restart = True
            if node:
                #if stopped, restart
                if node.state.lower() != "running":
                    restart = True
                    vprint("Starting server")
                    node = start_server(node)
                #by now we must have a public IP address
                url = node.public_ips[0]
                #wait for ssh daemon to be ready
                vprint("Waiting for sshd")
                cmd = [
                    "ssh", "-o StrictHostKeyChecking=no",
                    "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error",
                    "{0}@{1}".format(sshuser, url), "echo", "'sshd responding'"
                ]
                vvprint(cmd)
                good = False
                for z in range(10, -1, -1):
                    ret = run(cmd, timeout=15)
                    if ret[0].strip()[-15:] == 'sshd responding':
                        good = True
                        break
                    vprint(
                        "still waiting on sshd (this can take a while) -- will try %d more times"
                        % z)
                    if z:
                        time.sleep(5)
                if not good:
                    raise Exception("error in ssh call: %s" % ret[0].strip())
                vvprint("SSH returns -->%s|%s<--" % ret)
            else:
                raise Exception("Error: node not found")

        docker_port_args = ""
        #we have a url unless running --local:
        if url:
            #if just launched, install docker
            if fresh:
                vprint("Configuring Docker")
                # 'sudo apt-get -y update; sudo apt-get -y install docker.io; ' \
                #images have docker installed
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"sudo usermod -a -G docker ubuntu; ' \
                      'sudo systemctl unmask docker; sudo service docker start"'.format(sshuser, url)
                vvprint(cmd)
                os.system(cmd)
            vprint("Connecting through ssh")
            tunnel, docker_port_args = ssh_tunnel(url, sshuser, ports,
                                                  dockerdport)

            #path = absolute working directory on host
            relpath = os.path.abspath('.')[len(os.path.expanduser('~')):]
            relpath = "/_BURST" + relpath.replace('/', '_')  #I can exlain
            locpath = os.path.abspath('.')
            path = "/home/{0}{1}".format(sshuser, relpath)

            if not sync_only:
                # part of check to see if docker is installed and running
                remote = "-H localhost:%s" % dockerdport
                cmd = [
                    "docker", "{0}".format(remote), "ps", "--format",
                    '{{json .}}'
                ]
                vvprint(cmd)
                out, err = run(cmd)
                vvprint("PS returns:", out)
                running = len([x for x in out.strip().split("\n") if x])
                if running:
                    raise Exception(
                        "docker process already running -- burst does not support multiple processes"
                    )

                #prepare to build docker container
                vprint("Removing topmost layer")  #to avoid running stale image
                cmd = [
                    "docker", "{0}".format(remote), "rmi", "--no-prune",
                    DEFAULT_IMAGE
                ]
                vvprint(cmd)
                out, err = run(cmd)
                if "no such image" in out.lower():
                    out = "Creating new burst_image"
                vvprint(out)

                vmtype, image = fix_vmtype_and_image(vmtype, image)
                if vmtype and vmtype != get_server_vmtype(node):  #FIXME
                    raise Exception(
                        "Cannot change vmtype (instance type) or gpu status -- need to re-launch"
                    )
                # get_server_image is broken, need to prompt better here
                # if image and image != get_server_image(node):
                #     raise Exception("FIXME: cannot change host image -- need to terminate & re-launch server")
                vprint("burst: name %s vmtype %s image %s url %s" %
                       (node.name, vmtype, image, url))

                #if using cloud storage (s3 etc), set up config & auth for rclone
                if cloudmap:
                    if remote:
                        stor = get_config()['storage']
                        if stor['provider'] == 'GCS':
                            #create a keyfile & point to it
                            srvacctf = ".rclone_key_%s.json" % stor['settings'][
                                'private_key']['private_key_id']
                            f = open(srvacctf, 'w')
                            json.dump(stor['settings']['private_key'], f)
                            f.close()
                            stor['settings']['service_account_file'] = srvacctf

                        # build & save rclone.conf
                        s = f"[{stor['config']}]\n"
                        for k, v in stor.items():
                            if k != 'settings':
                                s += f"{k} = {v}\n"
                        for k, v in stor['settings'].items():
                            s += f"{k} = {v}\n"
                        f = open(".rclone.conf", 'w')
                        f.write(s)
                        f.close()

            rsync_ignore_path = os.path.abspath("./.burstignore")
            if not sync_only:  #sync_only means from remote to local
                #sync local working data to host
                if not os.path.exists(rsync_ignore_path):
                    vprint("creating empty .burstignore")
                    os.system("touch .burstignore")
                cmd = 'rsync -rltzu{4} --del --include=.rclone.conf --exclude-from {5} -e "ssh -o StrictHostKeyChecking=no ' \
                      '-o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/. {3}@{1}:{2}/'.format(
                          locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
                vprint("Synchronizing project folders")
                vvprint(cmd)
                os.system(cmd)

                if get_config().provider == 'GCE':
                    # sync service acct creds (for shutdown)
                    cmd = 'rsync -rltzu{4} --relative -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/./.burst/{5} {3}@{1}:{2}/'.format(
                        os.path.expanduser('~'), url, path, sshuser,
                        get_rsync_v(),
                        get_config().raw_secret)
                    vprint("Synchronizing credentials for shutdown")
                    vvprint(cmd)
                    os.system(cmd)

                #if fresh launch, clone burst locally for monitor
                if fresh:
                    vprint("Installing burst on server")
                    vvprint("Delay for apt-get to settle")
                    time.sleep(30)  #trust me this helps
                    vvprint("Delay done")
                    err = do_ssh(f"{sshuser}@{url}",
                                 '"%s"' % install_burst_sh)  #notable quoteables
                    if err:
                        raise Exception(
                            "Failed to install burst on remote server")
                if restart:
                    vprint("updating burst installation for monitor")
                    err = do_ssh(f"{sshuser}@{url}", '"%s"' % update_burst_sh)
                    if err:
                        raise Exception(
                            "Failed to update burst on remote server")
                    vprint("Starting monitor process for shutdown++")
                    #run monitor (in detached screen) to check if user's burst OR rsync is still running
                    conf = get_config()
                    if conf.provider == "GCE":
                        secret = ".burst/" + conf.raw_secret
                    else:
                        secret = conf.secret
                    proj = ('--project ' + conf.project) if conf.project else ''
                    cmd = f"screen -md bash -c 'cd {path}; /usr/bin/python3 ~/burst/burst/monitor/monitor.py" \
                          f" --ip {url} --access {conf.access} --provider {conf.provider}" \
                          f" --secret={secret} --region {conf.region} {proj} >> ~/burst_monitor.log'"
                    vvprint(cmd)
                    err = do_ssh(f"{sshuser}@{url}", '"%s"' % cmd)
                    if err:
                        raise Exception("Failed to initialize timeout monitor")
        else:
            vprint("burst: running locally")
            remote = ""
            path = os.path.abspath('.')

        if not sync_only:
            #actually build container -- for reals
            vprint("Building docker container")
            cmd = "docker {1} build . --file {2} -t {0} {3}".format(
                DEFAULT_IMAGE, remote, dockerfile, get_piper())
            vvprint(cmd)
            os.system(cmd)

            jupyter = False
            if len(args):
                jupyter = args[0] == 'jupyter'
            #build argument list -- re-quote if whitespace
            s = ""
            for a in args:
                a = a.strip()
                if " " in a:
                    if '"' in a:
                        s += f"'{a}' "
                    else:
                        s += f'"{a}" '
                else:
                    s += f"{a} "
            args = s.rstrip()
            # print ("FINAL args:", args)
            # exit()

            if gpu:
                gpu_args = "--gpus all"
            else:
                gpu_args = ""

            #if mounting storage, add arguments & insert commands before (to mount) and after (to unmount) user-specified args
            cloud_args = ""
            if cloudmap:
                cloud, host = cloudmap.split(":")
                args = f"bash -c 'mkdir -p {host}; rclone mount --vfs-cache-mode writes --vfs-write-back 0 --config .rclone.conf {cloud}: {host} & sleep 3; {args}; umount {host}'"
                cloud_args = " --privileged"

            vprint("Running docker container")
            background_args = "-td" if bgd else "-ti"
            if jupyter:
                if len(ports) == 0:
                    raise Exception("jupyter requires -p (usually 8888)")
                jupargs = f"--label ai.burstable.jupyter={ports[0]}"  #FIXME: document that 1st port is jupyter
            else:
                jupargs = ""
            cmd = f"docker {remote} run {gpu_args} {docker_port_args} --rm {background_args}" \
                  f" --label ai.burstable.shutdown={stop} {jupargs}" \
                  f" -v {path}:/home/burst/work {cloud_args} {DEFAULT_IMAGE} {args}"

            #run main task
            vvprint(cmd)
            vprint("")
            v0print("---------------------OUTPUT-----------------------")
            sys.stdout.flush()
            if bgd:
                cmd = cmd.split()
                docker_container, err = run(cmd)
                print("Running in background mode. Container =",
                      docker_container[:11])
            else:
                os.system(cmd)
                sys.stdout.flush()
            v0print("----------------------END-------------------------")
            sys.stdout.flush()

        #sync data on host back to local (note: we do not delete in this direction lest a fresh machine wipes our local workspace)
        if url and not bgd:
            vprint("Synchronizing folders")
            cmd = "rsync -rltzu{4} --exclude-from {5} -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error' '{3}@{1}:{2}/.' {0}/".format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vvprint(cmd)
            err = os.system(cmd + " " + get_piper())
            # print ("RSYNC err:", err)
            if err:
                vvprint("rsync returns:", err)
                vprint(
                    "Your session has timed out. Run 'burst sync' to synchronize data"
                )
    except Exception as ex:
        # verbosity bit 64 => show full traceback, otherwise just the message
        if get_verbosity() & 64:
            v0print("--------------------------------")
            traceback.print_exc()
            v0print("--------------------------------")
        else:
            print()
            print(ex)
        error = "Exception"
    if tunnel:
        tunnel.kill()
    return error
def launch_server(name,
                  vmtype=None,
                  image=None,
                  pubkey=None,
                  conf=None,
                  user=None,
                  gpu=False):
    """Create a new cloud node named *name* and wait for a public IP.

    Handles provider-specific quirks: EC2 vs GCE image lookup, ssh public
    key injection, and optional GPU accelerators on GCE. Raises if the
    image or vmtype cannot be resolved, or if disksize is unconfigured.
    Returns the (refreshed) libcloud node object.
    """
    init(conf)
    vmtype, image = fix_vmtype_and_image(vmtype, image)
    image_full_path = image  # keep the requested name for the launch log below
    if config.provider == 'EC2':
        images = config.driver.list_images(ex_filters={'name': image})
    elif config.provider == 'GCE':
        #note: GCE libcloud driver list_images is hella borke, list is incomplete so...
        images = []
        for proj in ["deeplearning-platform-release", "ubuntu-os-cloud"]:
            try:
                im = config.driver.ex_get_image(image, ex_project_list=proj)
                images = [im]
                break
            except ResourceNotFoundError:
                pass
    else:
        ims = config.driver.list_images()
        images = [x for x in ims if x.name == image]
    if not images:
        raise Exception("Image %s not found" % image)
    image = images[0]

    vmtypes = [x for x in config.driver.list_sizes() if x.name == vmtype]
    if not vmtypes:
        raise Exception("Instance vmtype %s not found" % vmtype)
    vmtype = vmtypes[0]

    if 'disksize' not in config or config.disksize == None:
        raise Exception(
            "Need to add disksize to config or specify (in gigabytes, eg --disksize=150)"
        )
    vprint(
        "Launching instance image=%s, id=%s, session=%s, type=%s ram=%s disk=%s"
        % (image_full_path, image.id, name, vmtype.id, vmtype.ram,
           config.disksize))

    if pubkey:
        if config.provider == 'EC2':
            #Everybody makes it up
            auth = NodeAuthSSHKey(pubkey)
            node = config.driver.create_node(
                name,
                vmtype,
                image,
                auth=auth,
                ex_blockdevicemappings=[  #So sue me
                    {
                        'Ebs.VolumeSize': config.disksize,
                        'DeviceName': '/dev/sda1'
                    }
                ])
        elif config.provider == 'GCE':
            # GCE takes the ssh key through instance metadata
            meta = {
                'items': [{
                    'key': 'sshKeys',
                    'value': '%s: %s' % (user, pubkey)
                }]
            }
            if gpu:
                vprint("Launching with GPU")
                node = config.driver.create_node(
                    name,
                    vmtype,
                    image,
                    ex_metadata=meta,
                    ex_accelerator_type=config.default_gpu,
                    ex_accelerator_count=1,
                    ex_on_host_maintenance="TERMINATE")
            else:
                vprint("Launching without GPU")
                node = config.driver.create_node(name,
                                                 vmtype,
                                                 image,
                                                 ex_metadata=meta)
        else:
            raise Exception("Unsupported clown provider: %s" % config.provider)
    else:
        node = config.driver.create_node(name, vmtype, image)

    vprint("Waiting for public IP address to be active")
    config.driver.wait_until_running([node])
    while len(node.public_ips) == 0:
        # node = config.driver.list_nodes(ex_node_ids=[node.id])[0] #refresh node -- is this really necessary
        node = get_server(
            uuid=node.uuid)  #seems necessary to refresh to update state
        vprint("Public IPs:", node.public_ips)
        time.sleep(5)
    vprint("Public IPs:", node.public_ips)
    return node
def init(conf=None):
    """Load configuration and create the libcloud driver (idempotent).

    Merges, in priority order: explicit entries in the *conf* dict, then
    the selected configuration from ~/.burst/config.yml. Populates the
    module-global `config` object and sets `config.driver`.
    """
    #init is a one-time thang
    if 'driver' in config:
        return
    if conf == None:
        conf = {}
    yam = conf.get('configfile', os.environ['HOME'] + "/.burst/config.yml")
    if os.path.exists(yam):
        #FIXME: check for local overriding .burst
        f = open(yam)
        yconf = yaml.load(f, Loader=yaml.FullLoader)
        f.close()
        # print("DBBG 1", yconf['compute']['configurations']['Ec2Beta']['disksize'])
        if 'compute_config' in conf:
            compute_config = conf['compute_config']
        else:
            compute_config = yconf['compute']['settings']['default_compute']

        #this got a bit strained. sorry
        storage_config = None
        if 'storage_config' in conf:  #if storage_config passed in, use
            storage_config = conf['storage_config']
        else:
            if 'storage' in yconf:  #otherwise check in config.yml
                storage_config = yconf['storage']['settings'][
                    'default_storage']
        if storage_config:  #if it exists,
            storage = yconf['storage']['configurations'][
                storage_config]  #use it
            storage['config'] = storage_config  #and store the config name too

        yconf = yconf['compute']['configurations'][compute_config]
        # print ("DBBG 2", yconf['disksize'])
        yconf.update(yconf['settings']
                     )  #easier to deal with all attributes at top level
        yconf['compute_config'] = compute_config
        if storage_config:  #if specified,
            yconf['storage'] = storage  #pull storage to top level for ease
    else:
        vprint("config.yml not found")
        yconf = {}  #dummy yconf

    if 'provider' in conf:
        config.provider = conf['provider']
    else:
        if 'provider' in yconf:
            config.provider = yconf['provider']
        else:
            raise Exception(
                "Configuration file %s not available. Try running:\nburst configure"
                % yam)

    # copy the known parameters; an explicit conf entry wins over config.yml
    for param in [
            'access', 'secret', 'region', 'project', 'default_image',
            'default_vmtype', 'default_gpu_image', 'default_gpu_vmtype',
            'default_gpu', 'storage', 'compute_config', 'disksize'
    ]:
        if param in conf:
            config[param] = conf[param]
        else:
            config[param] = yconf.get(param, None)

    if config.default_vmtype == None or config.default_gpu_vmtype == None:
        vprint("""config.yml syntax has changed:
rename default_size --> default_vmtype
default_gpu_size-->default_gpu_vmtype""")

    cls = get_driver(Provider[config.provider])
    if config.provider == 'EC2':
        config.driver = cls(config.access, config.secret, region=config.region)
    elif config.provider == 'GCE':
        if hasattr(config.secret, 'lower'):  #string points to key file
            privkeypath = config.secret
            config.raw_secret = config.secret
        else:  #if dict, create key file
            config.raw_secret = "%s.json" % config.secret['private_key_id']
            privkeypath = "%s/.burst/%s.json" % (
                os.path.expanduser("~"), config.secret['private_key_id'])
            if not os.path.exists(privkeypath):
                fp = open(privkeypath, 'w')
                json.dump(config.secret, fp)
                fp.close()
        config.driver = cls(config.access,
                            privkeypath,
                            datacenter=config.region,
                            project=config.project)
    else:
        vprint("ERROR: unknown cloud provider", config.provider)
def burst(args,
          sshuser=None,
          url=None,
          uuid=None,
          burst_user=None,
          gpus="",
          ports=None,
          stop=False,
          image=None,
          size=None,
          pubkey=None,
          dockerfile="Dockerfile",
          cloudmap="",
          dockerdport=2376,
          conf=None):
    """Build, synchronize and run the dockerized workload (older variant).

    NOTE(review): this is a second `def burst` in the same module and
    shadows the earlier definition at import time -- it appears to be an
    older revision left in place; confirm which one is intended.
    NOTE(review): docker_port_args / locpath / rsync_ignore_path are bound
    only inside the `if url:` branch but referenced later even when running
    locally -- a --local run looks like it would raise NameError; confirm.
    """
    tunnel = None  # ssh tunnel process; killed in the epilogue if opened
    try:
        if not os.path.exists(dockerfile):
            raise Exception("Dockerfile not found")
        if not os.path.exists(".dockerignore"):
            raise Exception("""
.dockerignore file not found. Burst requires a .dockerignore to avoid sending excess data to docker build.
Because the working directory is rsync'd to the remote host, you typically only need to send the Dockerfile
and files that are referred to (such as requirements.txt) to the build daemon.

#Template .dockerignore

#Default to ignoring everything:
**
#exceptions (These will be sent to the docker daemon for building):
!/Dockerfile*
!requirements.txt
""")

        #if url specified, split into user & IP
        if url:
            if not sshuser:
                sshuser, url = url.split('@')

        #launch, restart, or reconnect to node
        node = None
        #unless running --local:
        if url or uuid or burst_user:
            #if server does not exist, launch a fresh one
            fresh = False
            restart = False
            node = get_server(url=url, uuid=uuid, name=burst_user, conf=conf)
            if burst_user and not node:
                node = launch_server(burst_user,
                                     pubkey=pubkey,
                                     size=size,
                                     image=image,
                                     conf=conf,
                                     user=sshuser,
                                     gpus=gpus)
                fresh = True
                restart = True
            if node:
                #if stopped, restart
                if node.state.lower() != "running":
                    restart = True
                    vprint("Starting server")
                    node = start_server(node)
                #by now we must have a public IP address
                url = node.public_ips[0]
                #wait for ssh daemon to be ready
                vprint("Waiting for sshd")
                cmd = [
                    "ssh", "-o StrictHostKeyChecking=no",
                    "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error",
                    "{0}@{1}".format(sshuser, url), "echo", "'sshd responding'"
                ]
                vvprint(cmd)
                good = False
                for z in range(10, -1, -1):
                    ret = run(cmd, timeout=15)
                    if ret[0].strip()[-15:] == 'sshd responding':
                        good = True
                        break
                    vprint("sshd not responding; %d attempts left" % z)
                    if z:
                        time.sleep(5)
                if not good:
                    raise Exception("error in ssh call: %s" % ret[0].strip())
                vvprint("SSH returns -->%s|%s<--" % ret)
            else:
                raise Exception("Error: node not found")

        #we have a url unless running --local:
        if url:
            #if just launched, install docker
            if fresh:
                print("Configuring Docker")
                # 'sudo apt-get -y update; sudo apt-get -y install docker.io; ' \
                #images have docker installed
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"sudo usermod -a -G docker ubuntu; ' \
                      'sudo systemctl unmask docker; sudo service docker start"'.format(sshuser, url)
                vvprint(cmd)
                os.system(cmd)
            vprint("Connecting through ssh")

            #set up ssh tunnel mapping docker socket, ports
            host_port_args = []
            docker_port_args = ""
            if ports:
                for pa in ports:
                    if ':' in pa:
                        local_port, remote_port = pa.split(':')
                    else:
                        remote_port = local_port = pa
                    docker_port_args += " -p {0}:{0}".format(remote_port)
                    host_port_args.append("-L {0}:localhost:{1}".format(
                        local_port, remote_port))
            # print ("PORTS: |%s|%s|" % (docker_port_args, host_port_args)); exit()
            remote = "-H localhost:%s" % dockerdport
            ssh_args = [
                "ssh", "-o StrictHostKeyChecking=no",
                "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error", "-NL",
                "{0}:/var/run/docker.sock".format(dockerdport),
                "{0}@{1}".format(sshuser, url)
            ]
            for arg in host_port_args:
                ssh_args.insert(3, arg)
            vvprint(ssh_args)
            tunnel = subprocess.Popen(ssh_args)
            time.sleep(2)  # give the tunnel a moment to come up

            #path = absolute working directory on host
            relpath = os.path.abspath('.')[len(os.path.expanduser('~')):]
            relpath = "/_BURST" + relpath.replace('/', '_')  #I can exlain
            locpath = os.path.abspath('.')
            path = "/home/{0}{1}".format(sshuser, relpath)

            #part of check to see if docker is installed and running
            cmd = [
                "docker", "{0}".format(remote), "ps", "--format", '{{json .}}'
            ]
            vvprint(cmd)
            out = run(cmd)
            vvprint("PS returns -->%s|%s<--" % out)
            monitor_running = False
            if out[1]:
                for line in out[0].split("\n"):
                    if not line:
                        continue
                    j = json.loads(line)
                    # pprint(j)
                    # print ("RUNNING:", j['Image'], j['Labels'])
                    for x in j['Labels'].split(','):
                        if 'ai.burstable.monitor=' == x:
                            monitor_running = True
            vprint("monitor_running: %s" % monitor_running)

            #if restarted (including fresh launch), start monitor docker process
            if restart or not monitor_running:
                #put sentinel script in working dir; gets rsync'd to host
                if not os.path.exists(".burst-sentinel.py"):
                    vvprint("creating .burst-sentinel.py in",
                            os.path.abspath('.'))
                    f = open(".burst-sentinel.py", 'w')
                    f.write(burst_sentinel_py)
                    f.close()
                vprint("Starting monitor process for shutdown++")
                #run monitor (in docker container) to check if user's burst OR rsync is still running
                conf = get_config()
                if conf.provider == "GCE":
                    secret = ".burst/" + conf.raw_secret
                else:
                    secret = conf.secret
                # print("SECRET 1:", secret)
                cmd = f"docker {remote} run --label 'ai.burstable.monitor' " \
                      f"--rm {get_dockrunflags()} -v /var/run/docker.sock:/var/run/docker.sock" \
                      f" {MONITOR_IMAGE} burst-monitor" \
                      f" --ip {url} --access {conf.access} --provider {conf.provider} {get_piper()}" \
                      f" --secret={secret} --region {conf.region} {('--project ' + conf.project) if conf.project else ''}"
                vvprint(cmd)
                vvprint("Shutdown process container ID:")
                os.system(cmd)

            #prepare to build docker container
            vprint("Removing topmost layer")  #to avoid running stale image
            cmd = [
                "docker", "{0}".format(remote), "rmi", "--no-prune",
                DEFAULT_IMAGE
            ]
            vvprint(cmd)
            out, err = run(cmd)
            vvprint(out)

            size, image = fix_size_and_image(size, image)
            if size and size != get_server_size(node):  #FIXME
                raise Exception(
                    "Cannot change size (instance type) -- need to re-launch")
            # get_server_image is broken, need to prompt better here
            # if image and image != get_server_image(node):
            #     raise Exception("FIXME: cannot change host image -- need to terminate & re-launch server")
            vprint("burst: name %s size %s image %s url %s" %
                   (node.name, size, image, url))

            #if using cloud storage (s3 etc), set up config & auth for rclone
            if cloudmap:
                if remote:
                    stor = get_config()['storage']
                    if stor['provider'] == 'GCS':
                        #create a keyfile & point to it
                        srvacctf = ".rclone_key_%s.json" % stor['settings'][
                            'private_key']['private_key_id']
                        f = open(srvacctf, 'w')
                        json.dump(stor['settings']['private_key'], f)
                        f.close()
                        stor['settings']['service_account_file'] = srvacctf

                    # build & save rclone.conf
                    s = f"[{stor['config']}]\n"
                    for k, v in stor.items():
                        if k != 'settings':
                            s += f"{k} = {v}\n"
                    for k, v in stor['settings'].items():
                        s += f"{k} = {v}\n"
                    f = open(".rclone.conf", 'w')
                    f.write(s)
                    f.close()

            #sync local working data to host
            rsync_ignore_path = os.path.abspath("./.burstignore")
            if not os.path.exists(rsync_ignore_path):
                vprint("creating empty .burstignore")
                os.system("touch .burstignore")
            cmd = 'rsync -rltzu{4} --exclude-from {5} -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/. {3}@{1}:{2}/'.format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vprint("Synchronizing project folders")
            vvprint(cmd)
            os.system(cmd)

            # if get_config().provider == 'GCE':
            #     # sync service acct creds (for shutdown)
            #     cmd = 'rsync -rltzu{4} --relative -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/./.burst/{5} {3}@{1}:{2}/'.format(os.path.expanduser('~'),
            #              url, path, sshuser, get_rsync_v(), get_config().raw_secret)
            #     vprint("Synchronizing credentials for shutdown")
            #     vvprint (cmd)
            #     os.system(cmd)

            if restart or not monitor_running:
                vprint("Starting host sentinel process for shutdown")
                #set up sentinel script in detached screen on host (not docker) to help check on rsync
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"cd {2} ; screen -md python3 .burst-sentinel.py"'.format(sshuser, url, path)
                vvprint(cmd)
                os.system(cmd)
        else:
            vprint("burst: running locally")
            remote = ""
            path = os.path.abspath('.')

        #actually build container -- for reals
        vprint("Building docker container")
        cmd = "docker {1} build . --file {2} -t {0} {3}".format(
            DEFAULT_IMAGE, remote, dockerfile, get_piper())
        vvprint(cmd)
        os.system(cmd)

        args = " ".join(args)
        gpu_args = "--gpus " + gpus if gpus else ""

        #if mounting storage, add arguments & insert commands before (to mount) and after (to unmount) user-specified args
        cloud_args = ""
        if cloudmap:
            cloud, host = cloudmap.split(":")
            args = f"bash -c 'mkdir -p {host}; rclone mount --vfs-cache-mode writes --vfs-write-back 0 --config .rclone.conf {cloud}: {host} & sleep 3; {args}; umount {host}'"
            cloud_args = " --privileged"

        vprint("Running docker container")
        cmd = "docker {3} run {4} {5} --rm -ti --label ai.burstable.shutdown={7} -v {2}:/home/burst/work {6} {0} {1}".format(
            DEFAULT_IMAGE, args, path, remote, gpu_args, docker_port_args,
            cloud_args, stop)
        #run user-specified args
        vvprint(cmd)
        vprint("")
        v0print("---------------------OUTPUT-----------------------")
        sys.stdout.flush()
        os.system(cmd)
        sys.stdout.flush()
        v0print("----------------------END-------------------------")
        sys.stdout.flush()

        #sync data on host back to local
        if url:
            vprint("Synchronizing folders")
            cmd = "rsync -rltzu{4} --exclude-from {5} -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error' '{3}@{1}:{2}/.' {0}/".format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vvprint(cmd)
            os.system(cmd)
    except Exception as ex:
        # verbosity >= 256 => show full traceback, otherwise just the message
        if get_verbosity() >= 256:
            v0print("--------------------------------")
            traceback.print_exc()
            v0print("--------------------------------")
        else:
            print()
            print(ex)

    # if url and node:
    #     # set up shutdown process
    #     if stop == 0:
    #         vprint ("Stopping VM at %s immediately as instructed" % url)
    #         stop_server(node)
    #     else:
    #         vprint ("Scheduling shutdown of VM at %s for %d seconds from now" % (url, stop))
    #         conf = get_config()
    #         if conf.provider == "GCE":
    #             secret = ".burst/" + conf.raw_secret
    #         else:
    #             secret = conf.secret
    #         # print("SECRET 1:", secret)
    #         cmd = f"docker {remote} run --rm {get_dockrunflags()} -v {path}:/home/burst/work {MONITOR_IMAGE} burst" \
    #               f" --verbosity {get_verbosity()} --stop_instance_by_url {url} --delay {stop} --access {conf.access}" \
    #               f" --secret={secret} --region {conf.region} {('--project ' + conf.project) if conf.project else ''}" \
    #               f" --provider {conf.provider} {get_piper()}"
    #         vvprint (cmd)
    #         vvprint ("Shutdown process container ID:")
    #         os.system(cmd)

    if tunnel:
        tunnel.kill()
args_compute.provider = args.provider else: burst_conf = {} #command line overrides: if args.compute_config: burst_conf['compute_config'] = args.compute_config if args.storage_config: burst_conf['storage_config'] = args.storage_config if args.configfile: burst_conf['configfile'] = args.configfile if args.local and (args.uuid or args.url): vprint(args) parser.error( "when specifying --local, do not set --sshuser, --burst_user, --uuid, or --url" ) exit() t0 = time.time() while time.time() - t0 < args.delay: vprint("%d seconds till action" % (args.delay + .5 + t0 - time.time())) time.sleep(5) #set default burst_user if necessary: if not (args.burst_user or args.uuid or args.url or args.local or args.version): burst_user = getpass.getuser() args.burst_user = "******" + burst_user vprint("Session: %s" % args.burst_user)