def test_fail_connect_with_retries(self):
    """Connecting to a non-existent host must raise ConnectionError."""
    caught = None  # type: Optional[ConnectionError]
    try:
        spurplus.connect_with_retries(
            hostname="some-nonexisting-hostname.com", retries=2, retry_period=1)
    except ConnectionError as err:
        caught = err
    self.assertIsNotNone(caught)
def connect(self, hostname: Optional[str] = None, username: Optional[str] = None,
            password: Optional[str] = None, use_key: bool = False,
            login: bool = False, port: Optional[int] = None, timeout: int = 60,
            retries: int = 0, retry_delay: int = 1) -> None:
    """Connect to a remote host and initialize the shell's environment.

    Missing connection fields fall back to the values stored on the
    instance, and finally to an interactive prompt.

    :param hostname: host to connect to (prompted for if unavailable)
    :param username: user to connect as (prompted for if unavailable)
    :param password: password for ``username`` (prompted for if unavailable
        and ``use_key`` is False)
    :param use_key: if True, authenticate with a private key instead of
        looking for a password
    :param login: if True, read the environment from a login shell so that
        profile-set variables are included
    :param port: SSH port (defaults to the stored port, then 22)
    :param timeout: connection timeout in seconds
    :param retries: number of connection retries
    :param retry_delay: seconds to wait between retries
    :raises SSHConnectionError: if a connection is already open
    """
    from getpass import getpass  # stdlib; local import keeps prompt dependency contained

    # TODO: deal with prompting for required fields if not provided
    if self.connected:
        raise SSHConnectionError(
            'already connected to a remote host. use `SshShell.disconnect` '
            'to disconnect from the current host before connecting to a new '
            'one, or `SshShell.reconnect` to reset the connection to the '
            'current host')
    port = port or self.port or 22
    hostname = hostname or self.hostname or input("Hostname: ")
    username = username or self.username or input("Username: ")
    # NOTE(review): the original credential-prompt line was corrupted by a
    # secret-redaction artifact; reconstructed to prompt (without echoing)
    # only when no password was supplied and key auth is not requested --
    # TODO confirm against upstream history.
    if password is None and not use_key:
        password = getpass("Password: ")
    self._shell = spurplus.connect_with_retries(
        hostname=hostname,
        username=username,
        password=password,
        look_for_private_keys=(not use_key),
        port=port,
        connect_timeout=timeout,
        retries=retries,
        retry_period=retry_delay)
    # only update attrs if connection is successful
    self.hostname = hostname
    self.username = username
    self.port = port
    self.connected = True
    if self._environ is None:
        # read environment variables from the remote host
        tmp_exe = self.executable or '/bin/bash'
        if login:
            printenv_command = [tmp_exe, '-lc', 'printenv']
        else:
            printenv_command = ['printenv']
        # TODO: a more robust solution for this in case BASH_FUNC_module isn't last
        initial_env = self.shell.check_output(printenv_command).split(
            '\nBASH_FUNC_module()')[0]
        initial_env = dict(
            map(lambda x: x.split('=', maxsplit=1), initial_env.splitlines()))
        self._environ = PseudoEnviron(initial_env=initial_env,
                                      custom_vars=self._env_additions)
    # initial validation of properties that depend on environment
    self.cwd = self._cwd
    self.executable = self._executable
def remote_submit(sync_changes=False, config_path=None):
    """Submit jobs on the cluster from your local machine.

    :param sync_changes: (bool, default: False) if True, upload any local
        changes to cluster scripts before submitting jobs
    :param config_path: (str, optional, default: None) path to your config
        file. If you created your config following the instructions in
        configs/template_config.ini, you can simply leave this empty
    :return: None (other than some hopefully some results, eventually!)
    """
    config = attempt_load_config() if config_path is None else parse_config(config_path)
    hostname = config['hostname']
    username = config['username']
    password = config['password']
    confirm_overwrite = config['confirm_overwrite_on_upload']
    # f00* accounts submit through mksub; everyone else through qsub
    job_cmd = 'mksub' if username.startswith('f00') else 'qsub'
    # NOTE(review): `job_config` is not defined locally -- presumably a
    # module-level global; verify before refactoring.
    if job_config['env_type'] != 'conda':
        # TODO: add commands for venv & virtualenv activation
        raise ValueError("Only conda environments are currently supported")
    activate_cmd = 'source activate'
    deactivate_cmd = 'conda deactivate'
    with connect_with_retries(hostname=hostname, username=username,
                              password=password) as cluster:
        if sync_changes:
            # push local copies of the cluster scripts to the remote host
            script_dir = opj(dirname(realpath(__file__)), 'cluster_scripts')
            upload_scripts(cluster, script_dir, job_config, confirm_overwrite)
        # bash script that submits and runs submit.py from a compute node
        submitter_filepath = write_remote_submitter(cluster, job_config,
                                                    activate_cmd, deactivate_cmd)
        remote_command = fmt_remote_commands([job_cmd, submitter_filepath])
        cluster.run(remote_command)
def destination_exists(self, destination: str, ssh_connection: str=None) -> bool:
    """Return whether *destination* exists, locally or on a remote server.

    :param destination: path to check
    :param ssh_connection: optional ``ssh://username:password@server:port``
        URL; when given, the check is performed on that remote host
    :return: True if the path exists
    """
    if not ssh_connection:
        # local check -- NOTE(review): this tests for a *directory*, while the
        # remote branch tests for any path; confirm the asymmetry is intended
        return os.path.isdir(destination)
    parsed = urllib.parse.urlparse(ssh_connection)
    with spurplus.connect_with_retries(
            retries=5,
            hostname=parsed.hostname,
            username=parsed.username,
            password=parsed.password if parsed.password else None,
            port=parsed.port if parsed.port else None) as remote_shell:
        return remote_shell.exists(destination)
def get_shell(production=False) -> SshShell:
    # Lazily create and cache a single SSH shell (module-level singleton `sh`);
    # `production` selects which host to connect to.
    global sh
    if sh is not None:
        # reuse the already-open connection
        return sh
    if production:
        hostname = "safeisland.hesusruiz.org"
        print("=== Operating in PRODUCTION!!")
    else:
        hostname = "safeislandtest.hesusruiz.org"
        print("=== Operating in DEVELOPMENT!!")
    # NOTE(review): the username below is a redaction placeholder ('******')
    # left by secret scrubbing of the published source -- the real value (and
    # ideally a non-hard-coded credential mechanism) must be restored before use.
    sh = spurplus.connect_with_retries(
        hostname=hostname,
        username='******',
        private_key_file='../telsiusin2/awsnode/AWSAlastriaIN2.pem',
        retries=5,
        connect_timeout=5,
    )
    return sh
def set_up_test_shell() -> spurplus.SshShell:
    """Set up and return a shell connected to the testing instance.

    Connection parameters come from the environment; a failure to connect is
    re-raised as a ConnectionError carrying the attempted parameters.
    """
    conn = params_from_environ()
    try:
        return spurplus.connect_with_retries(
            hostname=conn.hostname,
            port=conn.port,
            username=conn.username,
            password=conn.password,
            private_key_file=conn.private_key_file,
            missing_host_key=spur.ssh.MissingHostKey.accept,
            retries=2,
            retry_period=1)
    except ConnectionError as err:
        raise ConnectionError(
            "Failed to connect to {}@{}:{}, private key file: {}, password is not None: {}"
            .format(conn.username, conn.hostname, conn.port,
                    conn.private_key_file, conn.password is not None)) from err
def get_variables_from_startup_script(self, ssh_connection: str) -> dict:
    """Read remote variables directly from the server's startup script.

    :param ssh_connection: ``ssh://username:password@server:port`` URL
    :return: mapping of variable name to value, taken from `zset` lines
    """
    url = urllib.parse.urlparse(ssh_connection)
    with spurplus.connect_with_retries(
            retries=5,
            hostname=url.hostname,
            username=url.username,
            password=url.password if url.password else None,
            port=url.port if url.port else None) as shell:
        shell.run(['ls'])
        # assumes the script lives in /home/<user>/ -- TODO confirm
        script = os.sep + 'home' + os.sep + url.username + os.sep + STARTUP_SCRIPT_NAME
        content = shell.read_text(script)
    variables = {}
    for line in content.split('\n'):
        if line.startswith('zset'):
            # "zset NAME VALUE ..." -> variables[NAME] = VALUE
            tokens = line.split()
            variables[tokens[1]] = tokens[2]
    return variables
def resubmit_failed(confirm_resubmission=False, config_path=None):
    """Find failed cluster jobs and resubmit their job scripts.

    A job is considered successful when its stdout file contains
    'job script finished'; scripts with no successful run are resubmitted,
    and the stdout/stderr files of failed, non-running jobs are removed.

    :param confirm_resubmission: (bool, default: False) if True, prompt
        before resubmitting (also enabled by the config file)
    :param config_path: (str, optional, default: None) path to your config
        file; leave empty to auto-load the default config
    :return: None
    """
    if config_path is None:
        config = attempt_load_config()
    else:
        config = parse_config(config_path)
    hostname = config['hostname']
    username = config['username']
    password = config['password']
    confirm = config['confirm_resubmission']
    # NOTE(review): `job_config` is not defined locally -- presumably a
    # module-level global; verify before refactoring.
    workingdir = job_config['workingdir']
    scriptdir = job_config['scriptdir']
    job_name = job_config['jobname']
    # set confirmation option from config if not set here
    if confirm and not confirm_resubmission:
        confirm_resubmission = True
    # set submission command (f00* accounts use mksub)
    if username.startswith('f00'):
        job_cmd = 'mksub'
    else:
        job_cmd = 'qsub'
    with connect_with_retries(hostname=hostname, username=username,
                              password=password) as cluster:
        cluster_sftp = cluster.as_sftp()
        # get all created bash scripts
        all_scripts = cluster_sftp.listdir(scriptdir)
        print(f"found {len(all_scripts)} job scripts")
        stdout_files = [
            f for f in cluster_sftp.listdir(workingdir)
            if f.startswith(f'{job_name}.o')
        ]
        print(f"found {len(stdout_files)} job stdout files")
        # get output of qstat command (header lines don't start with a digit)
        running_jobs = [
            line for line in get_qstat(cluster)
            if len(line) > 0 and line[0].isnumeric()
        ]
        # filter out completed jobs ('C' state), isolate jobid
        running_jobids = [
            line.split('.')[0] for line in running_jobs
            if line.split()[-2] != 'C'
        ]
        print(f"found {len(running_jobids)} running jobs")
        print("parsing stdout files...")
        successful_jobs = {}
        for outfile in stdout_files:
            jobid = outfile.split('.o')[1]
            # read stdout file
            stdout_path = opj(workingdir, outfile)
            stdout = cluster.read_text(stdout_path)
            try:
                job_script = stdout.split('script name: ')[1].splitlines()[0]
                # track successfully finished jobs
                if 'job script finished' in stdout:
                    successful_jobs[job_script] = jobid
            except (IndexError, ValueError):
                print(f"failed to find corresponding script for {outfile}...")
                continue
        # membership test directly on the dict -- O(1) per lookup, and avoids
        # materializing the key list on every iteration
        to_resubmit = [s for s in all_scripts if s not in successful_jobs]
        if confirm_resubmission:
            # FIX: prompt strings previously contained literal backslashes and
            # stray whitespace from backslash line-continuations inside the
            # string literals; now clean single-line messages.
            view_scripts = prompt_input(
                "View jobs to be resubmitted before proceeding?")
            if view_scripts:
                print('\n'.join(to_resubmit))
            resubmit_confirmed = prompt_input(
                "Do you want to resubmit these jobs?")
            if not resubmit_confirmed:
                sys.exit()
        print("Removing failed jobs' stdout/stderr files...")
        for outfile in stdout_files:
            jobid = outfile.split('.o')[1]
            if not (jobid in successful_jobs.values()
                    or jobid in running_jobids):
                stdout_path = opj(workingdir, outfile)
                stderr_path = opj(workingdir, f'{job_name}.e{jobid}')
                cluster.remove(stdout_path)
                cluster.remove(stderr_path)
        print(f"resubmitting {len(to_resubmit)} jobs")
        for job in to_resubmit:
            script_path = opj(scriptdir, job)
            print(f"resubmitting {job}")
            cmd = fmt_remote_commands([f'{job_cmd} {script_path}'])
            cluster.run(cmd)
# NOTE(review): the statements below are the tail of an `upload_scripts` loop
# whose definition begins before this excerpt; indentation reconstructed from
# context -- TODO confirm against the full file.
        if not overwrite_confirmed:
            # user declined the overwrite prompt -- leave the remote copy as-is
            print(f"skipping {file} (overwrite declined)")
            continue
        remote_shell.put(src_path, dest_path, create_directories=False)
        print(f"uploaded {file}")
    print("finished uploading scripts")


# setup for running as a stand-alone script
if __name__ == '__main__':
    # load connection credentials/options from the user's config file
    config = attempt_load_config()
    hostname = config['hostname']
    username = config['username']
    password = config['password']
    confirm_overwrite = config['confirm_overwrite_on_upload']
    # local directory holding the cluster-side helper scripts
    script_dir = opj(dirname(realpath(__file__)), 'cluster_scripts')
    # NOTE(review): `job_config` is not defined in this excerpt -- presumably a
    # module-level global; verify.
    with connect_with_retries(
        hostname=hostname, username=username, password=password
    ) as cluster:
        upload_scripts(
            cluster, script_dir, job_config, confirm_overwrite=confirm_overwrite
        )
def do_zremoteupgrade(self, args):
    """Upgrade Z server in remote server.

    Usage:  zremoteupgrade sshconnection
            zremoteupgrade sshconnection version_name

            sshconnection --> ssh://username:password@server:port
    """
    arglist = args.split()
    # Expect 1 or 2 arguments: sshconnection, and optionally version_name
    if len(arglist) < 1 or len(arglist) > 2:
        self.perror('zremoteupgrade requires 1 or 2 argument:', traceback_war=False)
        self.do_help('zremoteupgrade')
        self._last_result = cmd2.CommandResult('', 'Bad arguments')
        return
    # Recover sshconnection
    ssh_connection = arglist[0]
    # Recover install path, git url and version name from the remote startup script
    remote_variables = self.get_variables_from_startup_script(ssh_connection)
    install_path = remote_variables.get(VARIABLE_INSTALL_PATH, None)
    older_version_name = remote_variables.get(VARIABLE_VERSION_NAME, None)
    giturl = remote_variables.get(VARIABLE_GIT_URL, None)
    if len(arglist) == 2:
        version_name_arg = arglist[1]
    else:
        # no explicit version: upgrade to the latest tag of the repository
        version_name_arg = self.get_last_tag(giturl)
    # Convert version_name if "dev" is selected (maps to the developer branch)
    # NOTE(review): `version_name` is computed here but never used below --
    # `do_zremoteinstall` receives the raw `version_name_arg`; verify intent.
    version_name = version_name_arg
    if version_name_arg.lower() == DEVELOPER_BRANCH[0]:
        version_name = DEVELOPER_BRANCH[1]
    if install_path and older_version_name and giturl:
        # Do some actions in remote server: stop, backup, delete and install
        # ssh_connection --> ssh://username:password@server:port
        url = urllib.parse.urlparse(ssh_connection)
        with spurplus.connect_with_retries(
            retries=5,
            hostname=url.hostname,
            username=url.username,
            password=url.password if url.password else None,
            port=url.port if url.port else None
        ) as shell:
            # Stop server via the zctl control script
            zctl_path = install_path + os.sep + ZCTL_NAME
            command = [zctl_path] + ['zstop'] + ['quit']
            shell.run(command)
            # Compress actual deployment for backup, generating a compressed
            # file in the sibling backup folder
            # NOTE(review): paths for the *remote* host are built with the
            # local `os.sep` -- only correct when the client is also POSIX.
            backup_folder = install_path + os.sep + '..' + os.sep + 'backup' + os.sep
            filename = '{}-{}.tar.gz'.format(self.get_timestamp(), older_version_name)
            compressed_file = backup_folder + filename
            # Execute targz command
            command = [zctl_path] + ['targz {} {}'.format(install_path, compressed_file)] + ['quit']
            shell.run(command)
            # Delete installation
            command = ['rm'] + ['-rf'] + [install_path]
            shell.run(command)
            # Install the new version
            self.poutput('Upgrading Z server')
            self.do_zremoteinstall(install_path + ' ' + giturl + ' ' + version_name_arg + ' ' + ssh_connection)
    else:
        self.poutput('No installation folder found')
def do_zremoteinstall(self, args):
    """Install Z server in remote server.

    Usage:  zremoteinstall destination giturl version_name sshconnection

            sshconnection --> ssh://username:password@server:port
    """
    arglist = args.split()
    # Expect exactly 4 arguments
    if not arglist or len(arglist) != 4:
        self.perror('zremoteinstall requires exactly 4 argument:', traceback_war=False)
        self.do_help('zremoteinstall')
        self._last_result = cmd2.CommandResult('', 'Bad arguments')
        return
    # Recover arguments
    destination = arglist[0]
    giturl_raw = arglist[1]
    version_name_arg = arglist[2]
    ssh_connection = arglist[3]
    # If credentials exist, convert username and password to url-encoded form
    # to build the git url
    giturl = self.get_git_url_encoded(giturl_raw)
    # Check if version_name exists
    if not self.version_exists(giturl_raw, version_name_arg):
        self.perror('Version does not exist. Please check it.', traceback_war=False)
        return
    # If version name is "dev", it is necessary to convert to the correct branch
    version_name = self.convert_version_name(version_name_arg)
    # Check if destination folder exists
    # NOTE(review): 'ifolder' in the message below looks like a typo for
    # 'folder' -- runtime string left untouched; fix upstream.
    if self.destination_exists(destination, ssh_connection):
        self.perror('Destination ifolder exists. Please check it and delete it', traceback_war=False)
        return
    # Define, and delete if they exist, destination_tmp and venv_dir_tmp
    destination_tmp = tempfile.gettempdir() + os.sep + 'install'
    venv_dir_tmp = os.path.abspath(destination_tmp + os.sep + '..' + os.sep + 'venv')
    if self.destination_exists(destination_tmp):
        shutil.rmtree(destination_tmp)
    if self.destination_exists(venv_dir_tmp):
        shutil.rmtree(venv_dir_tmp)
    # Clone repository to temporal destination folder
    self.poutput('Installing Z server')
    self.clone_from_repository(giturl, version_name, destination_tmp)
    # Create virtual environment and install requirements
    # NOTE(review): os.system with concatenated paths breaks (and is
    # injection-prone) if any path contains spaces/shell metacharacters;
    # consider subprocess.run([...], shell=False) upstream.
    bin_dir_tmp = os.path.abspath(venv_dir_tmp + os.sep + 'bin')
    os.system('python3 -m venv ' + venv_dir_tmp)
    os.system(bin_dir_tmp + os.sep + 'pip install --no-cache-dir -r ' + destination_tmp + os.sep + 'requirements.txt')
    # Define venv_dir (final remote location) and its python binary folder
    venv_dir = os.path.abspath(destination + os.sep + '..' + os.sep + 'venv')
    bin_dir = os.path.abspath(venv_dir + os.sep + 'bin')
    # Replace all occurrences of venv_dir_tmp with the new venv folder so the
    # copied virtualenv works at its final location
    self.find_replace(bin_dir_tmp, venv_dir_tmp, venv_dir)
    # Replace shebang (#!) with the venv python in destination_tmp files
    old_shebang = '#!/usr/bin/python3'
    new_shebang = '#!{}'.format(bin_dir) + os.sep + 'python'
    zctl_path_tmp = destination_tmp + os.sep + ZCTL_NAME
    self.find_replace(zctl_path_tmp, old_shebang, new_shebang, first_occurrence=True)
    # Perform operations in the remote server
    url = urllib.parse.urlparse(ssh_connection)
    with spurplus.connect_with_retries(
        retries=5,
        hostname=url.hostname,
        username=url.username,
        password=url.password if url.password else None,
        port=url.port if url.port else None
    ) as shell:
        # Delete virtual environment if it exists
        if shell.exists(venv_dir):
            shell.remove(venv_dir, recursive=True)
        # Copy temporal virtual environment and temporal installation folder
        # to remote server
        shell.mkdir(remote_path=destination, parents=True, exist_ok=True)
        shell.mkdir(remote_path=venv_dir, parents=True, exist_ok=True)
        self.poutput('Copying Z server')
        shell.sync_to_remote(
            local_path=destination_tmp,
            remote_path=destination,
            delete=spurplus.Delete.BEFORE,
            preserve_permissions=True
        )
        self.poutput('Copying virtual environment')
        shell.sync_to_remote(
            local_path=venv_dir_tmp,
            remote_path=venv_dir,
            delete=spurplus.Delete.BEFORE,
            preserve_permissions=True
        )
        # Set install path and version name variables and store them by
        # executing the zctl control script on the remote server
        zctl_path = destination + os.sep + ZCTL_NAME
        command_1 = 'zset {} {} store'.format(VARIABLE_INSTALL_PATH, destination)
        command_2 = 'zset {} {} store'.format(VARIABLE_VERSION_NAME, version_name_arg)
        command_3 = 'zset {} {} store'.format(VARIABLE_GIT_URL, giturl_raw)
        command_4 = 'zset {} {} store'.format(VARIABLE_VENV_BIN_PATH, bin_dir)
        command_5 = 'quit'
        # NOTE(review): `zcommand` is assembled but never used -- the list
        # form `command` below is what actually runs; verify and remove.
        zcommand = '{} {} {} {} {}'.format(command_1, command_2, command_3, command_4, command_5)
        chmod_command = ['chmod'] + ['+x'] + [zctl_path]
        command = [zctl_path] + [command_1] + [command_2] + [command_3] + [command_4] + [command_5]
        shell.run(chmod_command)
        shell.run(command)
    self.poutput('Z server installed')
def main() -> int:
    """Execute the main routine: deploy code (and optionally data) to a
    remote training instance over SSH.

    :return: process exit code (0 on success)
    """
    ##
    # Specify command-line arguments
    ##
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--hostname", help="Host name of the remote instance", required=True)
    parser.add_argument("--user", help="Remote user used for the deployment", default="devop")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--data_dir",
                       help="Root directory where data sets reside; "
                       "if not specified, the data is not synced")
    group.add_argument(
        "--data_url",
        help="URL where data sets reside; "
        "if not specified, the data is not downloaded on the remote host")
    parser.add_argument(
        "--dont_install_requirements",
        help="If set, requirements are not installed on the remote machine.",
        action='store_true')

    ##
    # Parse command-line arguments
    ##
    args = parser.parse_args()
    hostname = str(args.hostname)
    remote_user = str(args.user)
    data_dir = None if args.data_dir is None else pathlib.Path(args.data_dir)
    data_url = None if args.data_url is None else str(args.data_url)
    # data_dir and data_url are mutually exclusive (enforced by argparse too)
    assert ((data_dir is None and data_url is None)
            or ((data_dir is None) ^ (data_url is None)))
    if data_dir is not None and not data_dir.exists():
        raise FileNotFoundError(
            "Data directory does not exist: {}".format(data_dir))
    dont_install_requirements = bool(args.dont_install_requirements)

    ##
    # Connect
    ##
    print("Connecting to the remote instance at {}@{} ...".format(
        remote_user, hostname))
    with spurplus.connect_with_retries(
            hostname=hostname,
            username=remote_user,
            missing_host_key=spur.ssh.MissingHostKey.warn) as ssh:
        ##
        # Specify common paths
        ##
        remote_home_dir = pathlib.Path(
            ssh.check_output(['/bin/bash', '-c', 'echo $HOME']).strip())
        remote_mediti_dir = remote_home_dir / "mediti-train"
        script_dir = pathlib.Path(os.path.realpath(__file__)).parent

        ##
        # Install requirements
        ##
        if not dont_install_requirements:
            print("Installing the requirements...")
            ssh.put(local_path=script_dir / "requirements-gpu.txt",
                    remote_path=remote_mediti_dir / "requirements-gpu.txt")
            install_pth = remote_mediti_dir / 'install.sh'
            ssh.write_text(remote_path=install_pth,
                           text=textwrap.dedent('''#!/bin/bash
set -e
echo "sudo apt-get install'ing ..."
sudo apt-get install -y python3-venv wget unzip
echo "Creating the virtual environment ..."
python3 -m venv venv
source venv/bin/activate
echo "Installing the python requirements ..."
pip3 install -r requirements-gpu.txt
'''))
            ssh.chmod(remote_path=install_pth, mode=0o700)
            ssh.run(command=[install_pth.as_posix()], cwd=remote_mediti_dir)

        ##
        # Sync the data
        ##
        if data_dir is not None:
            print("Syncing the data...")
            remote_data_dir = remote_mediti_dir / "data"
            ssh.mkdir(remote_path=remote_data_dir, exist_ok=True)
            ssh.sync_to_remote(local_path=data_dir,
                               remote_path=remote_data_dir,
                               delete=spurplus.Delete.BEFORE)
        if data_url is not None:
            remote_data_dir = remote_mediti_dir / "data"
            # download into a unique limbo dir first so a partial download
            # never pollutes the real data directory
            remote_limbo_dir = remote_mediti_dir / "data-limbo.{}".format(
                uuid.uuid4())

            def remove_limbo_dir() -> None:
                """Delete the temporary data limbo directory."""
                if ssh.exists(remote_limbo_dir):
                    print("Removing the limbo directory: {}".format(
                        remote_limbo_dir))
                    # sanity checks before an rm -rf on the remote host
                    assert remote_limbo_dir != pathlib.Path()
                    assert remote_mediti_dir in remote_limbo_dir.parents
                    assert remote_limbo_dir != remote_mediti_dir
                    ssh.run(['rm', '-rf', remote_limbo_dir.as_posix()])

            with contextlib.ExitStack() as exit_stack:
                exit_stack.callback(remove_limbo_dir)
                ssh.mkdir(remote_limbo_dir, exist_ok=True, parents=True)
                ssh.mkdir(remote_data_dir, exist_ok=True, parents=True)
                print("Downloading the data to the remote limbo: {}".format(
                    remote_limbo_dir))
                # FIX: wget's lowercase '-o' writes the *log* to the given
                # file and saves the download under its URL-derived name, so
                # the subsequent unzip of data.zip would fail; '-O' writes
                # the downloaded document itself to data.zip.
                ssh.run([
                    'wget', data_url,
                    '-O', (remote_limbo_dir / "data.zip").as_posix()])
                ssh.run([
                    'unzip', (remote_limbo_dir / "data.zip").as_posix(),
                    '-d', (remote_data_dir).as_posix()
                ])

        ##
        # Sync the code
        ##
        print("Syncing the code...")
        rel_pths = [
            pathlib.Path("fine_tune.py"),
            pathlib.Path("evaluate.py"),
            pathlib.Path("file_iterator.py"),
            pathlib.Path("specsmod.py"),
        ]
        remote_src_pth = remote_mediti_dir / "src"
        for rel_pth in rel_pths:
            ssh.put(local_path=script_dir / rel_pth,
                    remote_path=remote_src_pth / rel_pth)
        # only the entry-point scripts need to be executable
        for rel_pth in [
                pathlib.Path("fine_tune.py"),
                pathlib.Path("evaluate.py")
        ]:
            ssh.chmod(remote_path=remote_src_pth / rel_pth, mode=0o700)

        ##
        # Goodbye
        ##
        print("The deployment has finished.")
        print("Execute manually to train and evaluate on: {}@{}".format(
            remote_user, hostname))
    return 0