def test_single_tunnel_multi_hosts(self):
    """Three hosts sharing one tunnel target should all return output."""
    remote_host = '127.0.0.8'
    remote_server = ThreadedOpenSSHServer(listen_ip=remote_host, port=self.port)
    remote_server.start()
    remote_server.wait_for_port()
    hosts = [remote_host] * 3
    try:
        client = ParallelSSHClient(
            hosts, port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_port=self.port,
            num_retries=1, proxy_pkey=self.user_key)
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        for _host, host_out in output.items():
            self.assertListEqual(list(host_out.stdout), [self.resp])
        self.assertEqual(len(hosts), len(list(output.keys())))
        del client
    finally:
        remote_server.stop()
        remote_server.join()
def new_parallel_ssh_client(cls, config, key_path=None) -> ParallelSSHClient:
    """Build a ParallelSSHClient for the hosts described by *config*.

    Routes through the configured SSH proxy when one is set.  Returns
    None (after logging the error) when the private key cannot be read.
    """
    hostnames = config.keys()
    try:
        if not SSH.PROXY:
            return ParallelSSHClient(hosts=hostnames, host_config=config,
                                     timeout=SSH.TIMEOUT, pkey=key_path,
                                     num_retries=SSH.NUM_RETRIES)
        # Ignore timeout and num_retires for proxy
        return ParallelSSHClient(
            hosts=hostnames,
            host_config=config,
            pkey=key_path,
            proxy_host=SSH.PROXY['proxy_host'],
            proxy_user=SSH.PROXY['proxy_user'],
            proxy_port=SSH.PROXY['proxy_port'],
        )
    except PKeyFileError as e:
        log.error('[✘] {}'.format(str(e)))
        return None
def test_tunnel_remote_host_timeout(self):
    """Once both servers are down, re-running must raise a gevent timeout."""
    remote_host = '127.0.0.18'
    proxy_host = '127.0.0.19'
    server = ThreadedOpenSSHServer(listen_ip=proxy_host, port=self.port)
    remote_server = ThreadedOpenSSHServer(listen_ip=remote_host, port=self.port)
    servers = (server, remote_server)
    for _server in servers:
        _server.start()
        _server.wait_for_port()
    try:
        client = ParallelSSHClient(
            [remote_host], port=self.port, pkey=self.user_key,
            proxy_host=proxy_host, proxy_port=self.port,
            num_retries=1, proxy_pkey=self.user_key)
        output = client.run_command(self.cmd)
        client.join(output)
        client._tunnel.cleanup()
        for _server in servers:
            _server.stop()
            _server.join()
        # Gevent timeout cannot be caught by stop_on_errors
        self.assertRaises(GTimeout, client.run_command, self.cmd,
                          greenlet_timeout=1, stop_on_errors=False)
    finally:
        for _server in servers:
            _server.stop()
def test_tunnel_parallel_client_part_failure(self):
    """Only the first host's server runs; the others must carry exceptions."""
    hosts = ['127.0.0.11', '127.0.0.12', '127.0.0.13']
    servers = [OpenSSHServer(listen_ip=h, port=self.port) for h in hosts]
    servers[0].start_server()
    try:
        client = ParallelSSHClient(
            hosts,
            port=self.port,
            pkey=self.user_key,
            proxy_host=self.proxy_host,
            proxy_pkey=self.user_key,
            proxy_port=self.proxy_port,
            num_retries=1,
            retry_delay=.1,
        )
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        self.assertEqual(len(hosts), len(output))
        self.assertTrue(output[1].exception is not None)
        self.assertTrue(output[2].exception is not None)
        self.assertListEqual(list(output[0].stdout), [self.resp])
    finally:
        for server in servers:
            server.stop()
def test_tunnel_remote_host_timeout(self):
    """Stopping proxy and target servers should make a re-run time out."""
    remote_host = '127.0.0.18'
    proxy_host = '127.0.0.19'
    server = ThreadedOpenSSHServer(listen_ip=proxy_host, port=self.port)
    remote_server = ThreadedOpenSSHServer(listen_ip=remote_host, port=self.port)
    server.start()
    server.wait_for_port()
    remote_server.start()
    remote_server.wait_for_port()
    try:
        client = ParallelSSHClient(
            [remote_host],
            port=self.port,
            pkey=self.user_key,
            proxy_host=proxy_host,
            proxy_port=self.port,
            num_retries=1,
            proxy_pkey=self.user_key)
        output = client.run_command(self.cmd)
        client.join(output)
        client._tunnel.cleanup()
        server.stop()
        server.join()
        remote_server.stop()
        remote_server.join()
        # stop_on_errors cannot swallow a gevent-level timeout.
        self.assertRaises(GTimeout, client.run_command, self.cmd,
                          greenlet_timeout=1, stop_on_errors=False)
    finally:
        server.stop()
        remote_server.stop()
def test_proxy_error(self):
    """An unreachable proxy must surface as a per-host ProxyError."""
    client = ParallelSSHClient(
        [self.proxy_host], self.port, pkey=self.user_key,
        proxy_host='127.0.0.155', proxy_port=123, num_retries=1)
    output = client.run_command(self.cmd, stop_on_errors=False)
    client.join(output)
    self.assertIsInstance(output[0].exception, ProxyError)
def test_tunnel_init_failure(self):
    """A dead proxy yields ProxyError wrapping a connection error."""
    proxy_host = '127.0.0.20'
    client = ParallelSSHClient(
        [self.host], port=self.port, pkey=self.user_key,
        proxy_host=proxy_host, proxy_port=self.port,
        num_retries=1, proxy_pkey=self.user_key)
    output = client.run_command(self.cmd, stop_on_errors=False)
    client.join(output)
    exc = output[self.host].exception
    self.assertIsInstance(exc, ProxyError)
    self.assertIsInstance(exc.args[1], ConnectionErrorException)
def push_docker_image(hosts: Iterable[RemoteHost], local_image, local_file: str, remote_file: str) -> None:
    """Push a local Docker image to a set of remote hosts.

    Does not verify host keys.

    :param hosts: Set of remote host to upload the image to.
    :param local_image: Instance of a Docker SDK image object corresponding to the image to distribute.
    :param local_file: Path a temporary file the image is dumped to.
    :param remote_file: Path to use for a temporary image file on the remote host.
    """
    # Save local image to file.
    # Running "docker save %s | gzip > %s" %(local_image.id, local_file) on the host might be
    # faster, because the Python API seems to write a temporary file and then return it instead
    # of streaming the data.
    log.info("Writing image %s to file '%s'.", local_image.short_id, local_file)
    with gzip.open(local_file, 'wb') as file:
        for chunk in local_image.save(named=True):
            file.write(chunk)
    # NOTE(review): *hosts* is iterated three times below, so it must be a
    # re-iterable collection, not a one-shot generator — confirm at call sites.
    host_ips = [str(host.ssh_host) for host in hosts]
    host_config = {
        str(host.ssh_host): host.get_host_config()
        for host in hosts
    }
    log.info("Copying image %s to %s.", local_image.short_id,
             ", ".join("%s (%s)" % (h.name, h.ssh_host) for h in hosts))
    try:
        ssh_client = ParallelSSHClient(host_ips, host_config=host_config)
        # Copy image file to remote hosts.
        greenlets = ssh_client.scp_send(local_file, remote_file)
        # Not ideal: Waits until all hosts have the image before proceeding.
        gevent.joinall(greenlets, raise_error=True)
        # Load image from file.
        output = ssh_client.run_command("gunzip -c %s | docker image load" % remote_file)
        ssh_client.join(output)
        # Relay each host's load output at INFO (success) or WARNING (failure).
        for host, host_output in output.items():
            buffer = io.StringIO()
            for line in host_output.stdout:
                print(line, file=buffer)
            level = logging.INFO if host_output.exit_code == 0 else logging.WARNING
            log.log(level, "%s responds:\n%s", host, buffer.getvalue())
        # Delete the image file.
        output = ssh_client.run_command("rm %s" % remote_file)
        ssh_client.join(output)
    except Exception:
        log.error("Pushing docker image to remote hosts failed.")
        raise
def ssh_host(host='', directory=''):
    """Verify SSH + SCP access to *directory* on *host*.

    Writes a small marker file locally and copies it as root to
    ``<directory>/cephfs_sync_<timestamp>`` on the remote host.

    :param host: Remote hostname/IP; check is skipped when empty.
    :param directory: Remote target directory; check is skipped when empty.
    :return: True when the copy succeeds, False when it fails;
        implicitly None when host/directory are not both given
        (preserved for backward compatibility).
    """
    if host and directory:
        with tempfile.NamedTemporaryFile() as local_test_file:
            local_test_file.write(b'Testing CephFS_Sync')
            # Bug fix: flush so the bytes are on disk before scp_send reads
            # the file; otherwise an empty/partial file may be transferred.
            local_test_file.flush()
            ssh_client = ParallelSSHClient([host], 'root')
            target_file = directory + '/cephfs_sync_' + log_timefy()
            ssh_copy = ssh_client.scp_send(local_test_file.name, target_file)
            # enable_host_logger()
            try:
                joinall(ssh_copy, raise_error=True)
            except Exception:
                return False
            return True
def test_tunnel_init_failure(self):
    """No proxy server listening: the host must report a ProxyError."""
    dead_proxy = '127.0.0.20'
    client = ParallelSSHClient(
        [self.host],
        port=self.port,
        pkey=self.user_key,
        proxy_host=dead_proxy,
        proxy_port=self.port,
        num_retries=1,
        proxy_pkey=self.user_key)
    output = client.run_command(self.cmd, stop_on_errors=False)
    client.join(output)
    host_error = output[self.host].exception
    self.assertIsInstance(host_error, ProxyError)
    self.assertIsInstance(host_error.args[1], ConnectionErrorException)
def test_tunnel_parallel_client_running_fail(self):
    """Hosts whose servers die mid-session must error on the next run."""
    hosts = ['127.0.0.11', '127.0.0.12', '127.0.0.13', '127.0.0.14']
    servers = [OpenSSHServer(listen_ip=h, port=self.port) for h in hosts]
    for server in servers:
        server.start_server()
    try:
        client = ParallelSSHClient(
            hosts, port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_pkey=self.user_key,
            proxy_port=self.proxy_port, num_retries=1,
        )
        output = client.run_command(self.cmd)
        client.join(output)
        # Kill the last two servers, then drop their client connections.
        for server in (servers[2], servers[3]):
            server.stop()
            server.server_proc.communicate()
        client._host_clients[(2, hosts[2])].disconnect()
        client._host_clients[(3, hosts[3])].disconnect()
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        self.assertEqual(len(hosts), len(output))
        self.assertTrue(output[2].exception is not None)
        self.assertTrue(output[3].exception is not None)
        self.assertListEqual(list(output[0].stdout), [self.resp])
        self.assertListEqual(list(output[1].stdout), [self.resp])
    finally:
        for server in servers:
            server.stop()
def test_tunnel(self):
    """Single host reachable only through the proxy tunnel."""
    remote_host = '127.0.0.8'
    remote_server = OpenSSHServer(listen_ip=remote_host, port=self.port)
    remote_server.start_server()
    try:
        client = ParallelSSHClient(
            [remote_host], port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_port=self.port,
            num_retries=1, proxy_pkey=self.user_key)
        output = client.run_command(self.cmd)
        client.join(output)
        for _host, host_out in output.items():
            self.assertListEqual(list(host_out.stdout), [self.resp])
        self.assertEqual(remote_host, list(output.keys())[0])
        del client
    finally:
        remote_server.stop()
def main():
    """Collect vmstat samples from the kb-w* worker nodes, then download
    and post-process each node's output file.

    CLI args: exp_name, total_duration (seconds), output_dir,
    start_position, end_position.
    """
    exp_name = sys.argv[1]
    total_duration = int(sys.argv[2])
    output_dir = sys.argv[3]
    start_position = int(sys.argv[4])
    end_position = int(sys.argv[5])
    interval = 1
    # One vmstat sample per second for the whole run.
    cpu_frequency = total_duration // 1
    user = "******"
    # Hosts kb-w11 .. kb-w44, built with a comprehension instead of
    # nested append loops.
    hosts = ["kb-w{}{}".format(i, j)
             for i in range(1, 5) for j in range(1, 5)]
    client = ParallelSSHClient(hosts, user)
    try:
        output = client.run_command(
            'vmstat {} {} > {}_vmfile.tmp '.format(
                interval, cpu_frequency, exp_name))
    except Exception as e:
        print(e)
    # Wait for the remote vmstat runs to finish.
    time.sleep(total_duration)
    # Create the dir if not exist (fix: drop the "== True" comparison).
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for vm in hosts:
        src_file_name = "{}_vmfile.tmp".format(exp_name)
        dst_file_name = "{}_vmfile.tmp".format(vm)
        # Fix: the format string had 3 placeholders for 4 args, silently
        # dropping output_dir from the debug message.
        print("trying to copy: {} {} {} {}".format(
            vm, src_file_name, dst_file_name, output_dir))
        copy_single_remote_to_local(vm, src_file_name, dst_file_name, output_dir)
        # post_process_outfile
        output_file = os.path.join(output_dir, "{}_vmfile.csv".format(vm))
        input_file = os.path.join(output_dir, "{}_vmfile.tmp".format(vm))
        print("debug: process_vmstat {} {} {} {}".format(
            input_file, start_position, end_position, output_file))
        process_vmstat(input_file, start_position, end_position, output_file)
def is_snapshot_enabled(src_path='', host=None):
    """Check whether CephFS snapshots are enabled for *src_path*.

    Locally (``host is None``) the check is whether the ``.snap``
    pseudo-directory exists.  Remotely, ``stat -f`` is run over SSH as
    root and the output is scanned for the error text that indicates
    the directory is unreadable.

    :param src_path: Path on the CephFS mount; empty disables the check.
    :param host: Remote host, or None for a local check.
    :return: True when snapshots appear enabled, False otherwise.
    """
    if not src_path:
        return False
    # Fix: identity comparison with None ("is"/"is not") per PEP 8.
    if host is None:
        snap_dir = src_path + '/.snap'
        return bool(does_dir_exist(snap_dir))
    read_file_check = 'cannot read file system information'
    snap_dir = src_path + '/.snap'
    cmd = 'stat -f {}'.format(snap_dir)
    ssh_client = ParallelSSHClient([host], 'root')
    # enable_host_logger()
    ssh_client_output = ssh_client.run_command(cmd)
    for host_name, host_output in ssh_client_output.items():
        for output_line in host_output.stdout:
            if read_file_check in output_line:
                return False
    return True
def test_single_tunnel_multi_hosts_timeout(self):
    """A tiny client timeout over the shared tunnel raises Timeout per host."""
    remote_host = '127.0.0.8'
    remote_server = ThreadedOpenSSHServer(listen_ip=remote_host, port=self.port)
    remote_server.start()
    remote_server.wait_for_port()
    hosts = [remote_host] * 3
    try:
        client = ParallelSSHClient(
            hosts, port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_port=self.port,
            num_retries=1, proxy_pkey=self.user_key, timeout=.001)
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        for host, host_out in output.items():
            self.assertIsInstance(output[host].exception, Timeout)
    finally:
        remote_server.stop()
        remote_server.join()
def test_tunnel(self):
    """One tunnelled host: expected stdout and correct output key."""
    target = '127.0.0.8'
    target_server = OpenSSHServer(listen_ip=target, port=self.port)
    target_server.start_server()
    try:
        client = ParallelSSHClient(
            [target],
            port=self.port,
            pkey=self.user_key,
            proxy_host=self.proxy_host,
            proxy_port=self.port,
            num_retries=1,
            proxy_pkey=self.user_key)
        output = client.run_command(self.cmd)
        client.join(output)
        for _host, host_out in output.items():
            self.assertListEqual(list(host_out.stdout), [self.resp])
        self.assertEqual(target, list(output.keys())[0])
        del client
    finally:
        target_server.stop()
def is_it_cephfs(src_path='', host=None):
    """Return True when *src_path* lives on a CephFS filesystem.

    Runs ``stat -f`` on the path and looks for ``Type: ceph`` in the
    output, e.g.::

        ... ID: 9892f072c60473e3 Namelen: 255 Type: ceph ...

    versus a non-CephFS path::

        ... ID: b333aa8538b776cf Namelen: 255 Type: btrfs ...

    When *host* is given the check runs remotely over SSH as root;
    otherwise the command runs locally via cmd_launcher.

    :raises Exception: when the local command exits non-zero.
    """
    cephfs_check = 'Type: ceph'
    cmd = 'stat -f {}'.format(src_path)
    # Fix: identity comparison with None per PEP 8.
    if host is not None:
        ssh_client = ParallelSSHClient([host], 'root')
        # enable_host_logger()
        ssh_client_output = ssh_client.run_command(cmd)
        for host_name, host_output in ssh_client_output.items():
            for output_line in host_output.stdout:
                if cephfs_check in output_line:
                    return True
        return False
    rc, stdout, stderr = cmd_launcher(cmd=cmd)
    if rc != 0:
        # Fix: the original backslash line-continuation embedded raw
        # indentation whitespace inside the message; use adjacent
        # string literals instead.
        raise Exception('Error while executing the command {}. '
                        'Error message: {}'.format(cmd, stderr))
    return (cephfs_check in stdout)
def test_single_tunnel_multi_hosts(self):
    """Every host multiplexed over one tunnel gets its own output entry."""
    remote_host = '127.0.0.8'
    remote_server = ThreadedOpenSSHServer(
        listen_ip=remote_host, port=self.port)
    remote_server.start()
    remote_server.wait_for_port()
    hosts = [remote_host, remote_host, remote_host]
    try:
        client = ParallelSSHClient(
            hosts,
            port=self.port,
            pkey=self.user_key,
            proxy_host=self.proxy_host,
            proxy_port=self.port,
            num_retries=1,
            proxy_pkey=self.user_key)
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        for host, host_out in output.items():
            lines = list(host_out.stdout)
            self.assertListEqual(lines, [self.resp])
        self.assertEqual(len(hosts), len(list(output.keys())))
        del client
    finally:
        remote_server.stop()
        remote_server.join()
def test_tunnel_host_config(self):
    """Per-host proxy config: good proxy succeeds, bad proxy errors."""
    hosts = ['127.0.0.11', '127.0.0.12']
    servers = [OpenSSHServer(listen_ip=h, port=self.port) for h in hosts]
    for server in servers:
        server.start_server()
    host_config = [
        HostConfig(proxy_host=self.proxy_host,
                   proxy_port=self.proxy_port,
                   proxy_pkey=self.user_key),
        HostConfig(proxy_host='127.0.0.155', proxy_port=123),
    ]
    client = ParallelSSHClient(
        hosts, port=self.port, pkey=self.user_key,
        host_config=host_config, num_retries=1)
    output = client.run_command(self.cmd, stop_on_errors=False)
    client.join(output)
    self.assertIsInstance(output[1].exception, ProxyError)
    self.assertListEqual(list(output[0].stdout), [self.resp])
def test_single_tunnel_multi_hosts_timeout(self):
    """With timeout=.001 every tunnelled host must carry a Timeout."""
    remote_host = '127.0.0.8'
    remote_server = ThreadedOpenSSHServer(
        listen_ip=remote_host, port=self.port)
    remote_server.start()
    remote_server.wait_for_port()
    hosts = [remote_host, remote_host, remote_host]
    try:
        client = ParallelSSHClient(
            hosts,
            port=self.port,
            pkey=self.user_key,
            proxy_host=self.proxy_host,
            proxy_port=self.port,
            num_retries=1,
            proxy_pkey=self.user_key,
            timeout=.001)
        output = client.run_command(self.cmd, stop_on_errors=False)
        client.join(output)
        for host, host_out in output.items():
            self.assertIsInstance(output[host].exception, Timeout)
    finally:
        remote_server.stop()
        remote_server.join()
def test_tunnel_parallel_client(self):
    """Tunnelled runs over 5 vs 10 hosts; ordering of output preserved."""
    hosts = ['127.0.0.1%s' % (d,) for d in range(10)]
    servers = [OpenSSHServer(listen_ip=h, port=self.port) for h in hosts]
    for server in servers:
        server.start_server()
    hosts_5 = hosts[:5]
    try:
        client = ParallelSSHClient(
            hosts_5, port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_pkey=self.user_key,
            proxy_port=self.proxy_port, num_retries=1,
        )
        start = datetime.now()
        output = client.run_command(self.cmd)
        dt_5 = datetime.now() - start
        client = ParallelSSHClient(
            hosts, port=self.port, pkey=self.user_key,
            proxy_host=self.proxy_host, proxy_pkey=self.user_key,
            proxy_port=self.proxy_port, num_retries=1,
        )
        start = datetime.now()
        output = client.run_command(self.cmd)
        dt_10 = datetime.now() - start
        # Ratio kept for the (currently disabled) scaling assertion.
        dt = dt_10.total_seconds() / dt_5.total_seconds()
        # self.assertTrue(dt < 2)
        client.join(output)
        self.assertEqual(len(hosts), len(output))
        for i, host_out in enumerate(output):
            self.assertListEqual(list(host_out.stdout), [self.resp])
            self.assertEqual(hosts[i], host_out.host)
    finally:
        for server in servers:
            server.stop()
def test_tunnel_remote_host_timeout(self):
    """Target host absent: the command must fail or time out, never succeed."""
    remote_host = '127.0.0.18'
    proxy_host = '127.0.0.19'
    server = OpenSSHServer(listen_ip=proxy_host, port=self.port)
    server.start_server()
    try:
        client = ParallelSSHClient(
            [remote_host], port=self.port, pkey=self.user_key,
            proxy_host=proxy_host, proxy_port=self.port,
            num_retries=1, proxy_pkey=self.user_key, timeout=2)
        try:
            client.run_command(self.cmd)
        except (GTimeout, Exception):
            # GTimeout is listed explicitly since it derives from
            # BaseException in gevent, not Exception.
            pass
        else:
            raise Exception("Command neither failed nor timeout raised")
        client._tunnel.cleanup()
        server.stop()
    finally:
        server.stop()
def run_command(client: ParallelSSHClient, command: str) -> CommandResult:
    """Executes identical command on all hosts attached to client.

    Will wait until all hosts complete the command execution or timeout
    is reached. Re-raises pssh exceptions.
    # TODO Handle more specific exceptions
    """
    try:
        # stop_on_errors -> allows others hosts to execute when one
        # crashes, combine exceptions.
        # output is like: (hostname, host_output)
        result = client.run_command(command, stop_on_errors=False)
        client.join(result)
    except pssh.exceptions.Timeout:
        log.warning('Command `{}` reached time limit'.format(command))
        raise
    except pssh.exceptions.ProxyError as e:
        log.error('Could not connect to proxy server, reason: {}'.format(e))
        raise
    except Exception as e:
        # FIXME Find out what throws this exception
        log.critical(e)
        raise
    # Reached only on the success path (every handler above re-raises).
    log.debug('Command `{}` finished'.format(command))
    return result
def connectClientToJoinableHosts(self):
    """Connect to all configured hosts and filter out unreachable ones.

    Probes every host by listing ``.kdbx`` files in the passwords
    directory; hosts whose probe raised an exception are dropped and,
    when any were dropped, a fresh client is built for the remainder.

    :return: tuple of (client for joinable hosts, host -> probe output,
        filtered host_config, filtered host names)
    """
    client = ParallelSSHClient(self.hosts, host_config=self.hosts_config,
                               num_retries=1, timeout=3)
    print("Connected")
    print("Launching command...")
    outputs = client.run_command(
        'ls ' + self.passwords_directory + ' | egrep "*.kdbx"',
        stop_on_errors=False, timeout=3)
    hosts_databases_files = {
        host_output.host: host_output for host_output in outputs
    }
    # Filtering unjoinable hosts
    # (fix: "is None" identity check instead of "== None";
    #  dict comprehensions instead of dict(filter(lambda ...))).
    hosts_databases_files = {
        host: out for host, out in hosts_databases_files.items()
        if out.exception is None
    }
    new_hosts_config = {
        host: cfg for host, cfg in self.hosts_config.items()
        if host in hosts_databases_files
    }
    new_hosts = new_hosts_config.keys()
    if len(new_hosts_config) < len(self.hosts_config):
        print("Reconnected client without unjoinable hosts")
        if self.debug:
            print(new_hosts_config)
        if self.debug:
            print(new_hosts)
        joinableClient = ParallelSSHClient(new_hosts,
                                           host_config=new_hosts_config,
                                           num_retries=1, timeout=3)
    else:
        joinableClient = client
    return (joinableClient, hosts_databases_files, new_hosts_config, new_hosts)
def get_client(config: HostsConfig, pconfig: Optional[ProxyConfig] = None, **kwargs) -> ParallelSSHClient:
    """Builds and returns an ssh client object for given configuration.

    Client is fetched directly from cache if identical arguments were
    used recently.
    """
    proxy = pconfig if pconfig is not None else {}
    return ParallelSSHClient(
        hosts=config.keys(),
        host_config=config,
        pkey=SSH.KEY_FILE,
        proxy_host=proxy.get('proxy_host'),
        proxy_user=proxy.get('proxy_user'),
        proxy_port=proxy.get('proxy_port'),
        num_retries=0,
        **kwargs)
def test(client):
    """Run the probe script once and dump each host's stdout."""
    output = run2(client, '/tmp/outland.py')
    print(output)
    for host in hosts:
        stdout = ''
        try:
            stdout = list(output[host].stdout)
        except Timeout:
            client.reset_output_generators(output[host])
            print(host, r.get(host))
        print(host, stdout)


if __name__ == "__main__":
    args = get_options()
    client = ParallelSSHClient(hosts)
    r = redis.StrictRedis(host=config['CMD_BROKER'], port=6379, db=0)
    if args.test:
        test(client)
        exit()
    output = update(client)
    while True:
        for host in hosts:
            stdout = ''
            try:
                stdout = list(output[host].stdout)
            except Timeout:
                client.reset_output_generators(output[host])
            print(host, r.get(host))
# NOTE(review): `xrange` is Python 2 only — this script appears to target
# Python 2; it would raise NameError on Python 3.
parser.add_argument("--ExecutionTimes", type=int, default=1, choices=xrange(1, 10),
                    help="Experiment Execution times")
parser.add_argument("--RunningLocal", type=int, default=0, choices=xrange(0, 2),
                    help="is running local")
args = parser.parse_args()

# pssh-client
hosts = rasp_hosts
client = ParallelSSHClient(hosts, user='******')


def restart_flink():
    # Stop the Flink cluster, wait briefly, then start it again.
    # Output is discarded via FNULL — presumably an already-open devnull
    # handle defined elsewhere in this script; confirm.
    cmd = stop_flink
    process = subprocess.Popen(cmd.split(), stdout=FNULL, stderr=subprocess.STDOUT)
    output, error = process.communicate()
    time.sleep(2)
    cmd = start_flink
    process = subprocess.Popen(cmd.split(), stdout=FNULL, stderr=subprocess.STDOUT)
class MimicExp:
    """Replay ("mimic") distributed-training experiments inside Docker
    containers across a set of nodes, driven by per-experiment config
    folders under autorun/chaokun_logs.
    """

    def __init__(self, config, debug=0):
        # Root (relative to host_user_dir) of the experiment log folders.
        self.base_logdir = "autorun/chaokun_logs"
        self._parse_config(config)
        self.debug = debug

    def _parse_config(self, config):
        # Copy the fields this class needs out of the config dict.
        self.host_user_dir = config["host_user_dir"]
        self.docker_user_dir = config["docker_user_dir"]
        self.docker_user = config["docker_user"]
        self.docker_ssh_port = config["docker_ssh_port"]
        self.nodes = config['nodes']
        self.nGPU = config['nGPU']  # for each machine
        self.eth = config['eth']  # name if NIC
        self.host_key = config["host_ssh_key"]
        self.docker_key = config["docker_ssh_key"]

    def _init_host_ssh(self):
        """connect to all nodes"""
        # NOTE(review): "pClinet" is a typo for "pClient" but is read by
        # _p_exe below, so the name must stay consistent.
        self.pClinet = ParallelSSHClient(self.nodes, pkey=self.host_key)

    def _init_docker_ssh(self):
        # Single SSH client into the container exposed on localhost.
        self.docker0 = SSHClient("localhost", user=self.docker_user,
                                 port=self.docker_ssh_port,
                                 pkey=self.docker_key)

    def _ini_host_env(self):
        """download log folder from aws-s3"""
        sentinel_cmd = "mkdir ~/autorun; "
        self._p_exe(sentinel_cmd)
        check_logs_cmd = "cd ~/autorun; mkdir tmp/; cd tmp/; rm mimic_env_setup.sh; "\
            "wget https://gist.githubusercontent.com/zarzen/012c2aa2a1c833e5bf1aeb379bbb9e93/raw/71dec6db3138dcc7e5318598bc770c6ce296b9a4/mimic_env_setup.sh; "\
            "/bin/bash mimic_env_setup.sh"
        self._p_exe(check_logs_cmd)

    def _p_exe(self, cmd):
        # Run cmd on every node via the parallel client; echo all output.
        output = self.pClinet.run_command(cmd)
        for host, host_output in output.items():
            for line in host_output.stdout:
                print("Host [%s] - %s" % (host, line))
            for line in host_output.stderr:
                print("Host [%s] - %s" % (host, line))

    def _docker_exe(self, cmd):
        # Run cmd inside the local container via the single-host client.
        _channel, _host, _stdout, _stderr, _ = self.docker0.run_command(cmd)
        for line in _stdout:
            print("[{}] {}".format(_host, line))
        for line in _stderr:
            print("[{}] err {}".format(_host, line))

    def _start_containers(self):
        # Kill anything running, pull the image, then start the container
        # with the log folder bind-mounted into it.
        stop_cmd = "docker kill $(docker ps -q)"
        pull_cmd = "docker pull zarzen/horovod-mod:1.0"
        start_cmd = "sudo docker run --gpus all --network=host --detach --ipc=host "\
            "-v {}/autorun/chaokun_logs:{}/chaokun_logs "\
            "zarzen/horovod-mod:1.0".format(self.host_user_dir,
                                            self.docker_user_dir)
        self._p_exe(stop_cmd)
        self._p_exe(pull_cmd)
        self._p_exe(start_cmd)

    def run(self):
        # Full pipeline: connect, prepare env, start containers, then
        # replay each experiment folder.
        self._init_host_ssh()
        self._ini_host_env()
        self._start_containers()
        self._init_docker_ssh()
        exp_folders = os.listdir(join(self.host_user_dir, self.base_logdir))
        # first 10 folder for debugging for now
        if self.debug:
            print("debug mode, experimentally run 3 configurations")
            exp_folders = exp_folders[:3]
        else:
            print("no debug flag, try to experiments on all configurations")
        for idx, _folder in enumerate(exp_folders):
            self._run_once(_folder)
            print("*" * 10,
                  "Completed {}/{}".format(idx + 1, len(exp_folders)),
                  "*" * 10)

    def _run_once(self, folder_name):
        """ run with """
        # read the orginal experiment config to get the bw limit
        folder_path = join(self.host_user_dir, self.base_logdir, folder_name)
        if not os.path.isdir(folder_path) or \
                not os.path.exists(join(folder_path, "config.json")):
            return
        with open(join(folder_path, "config.json")) as ifile:
            config = json.load(ifile)
        bw_limit = config['bw_limit']
        print("mimic training with folder {} at bw {}".format(
            folder_name, bw_limit))
        self._bw_ctl(bw_limit)
        # Monitors run for the duration of the mimic training command.
        cpu_p, net_p = self._exe_res_monitor(folder_path)
        print(">" * 10, 'launched CPU & Network monitoring')
        mt_cmd = self._build_mpirun_cmd(config, folder_name)
        print("executing mimic training command:\n", mt_cmd)
        # import time
        # time.sleep(2)
        self._docker_exe(mt_cmd)
        cpu_p.terminate()
        net_p.terminate()

    def _build_mpirun_cmd(self, config, folder_name):
        """Assemble the mpirun command line for one experiment folder."""
        folder_path = join(self.docker_user_dir, "chaokun_logs", folder_name)
        nNodes = len(config['nodes'])
        nGPU = config['nGPU']
        if self.debug:
            # because test env only has two nodes with 1 GPU on each
            nNodes = 2
            nGPU = 1
        IPs = self.nodes[:nNodes]
        hostsStr = ",".join(["{}:{}".format(ip, nGPU) for ip in IPs])
        cmd = [
            "mpirun", "-np", str(nNodes * nGPU), "-H", hostsStr,
            "-bind-to", "none", "-map-by", "slot",
            "-x", "LD_LIBRARY_PATH=/usr/local/cuda/lib64",
            "-x", "NCCL_DEBUG=INFO",
            "-x", "NCCL_SOCKET_IFNAME=^lo,docker,ens4",
            "-mca", "btl_tcp_if_exclude lo,docker,ens4",
            self.docker_user_dir + "/mimic_dt/build/mdt_allreduce_perf",
            "-b 500M -e 500M -f 2 -g 1 -c 0 -w 0",
            "-l", join(folder_path, "log_for_dt_mimic.txt"),
            "|& grep -v \"Read -1\""
        ]
        return " ".join(cmd)

    def _bw_ctl(self, bw_limit):
        # Reset any tc rate limit on the NIC, then apply the new one.
        del_cmd = "sudo tc qdisc del dev {} root tbf rate 40Gbit latency 400ms burst 3000kbit".format(
            self.eth)
        # if self.bw_limit = "" then we don't execute the add_cmd
        add_cmd = "sudo tc qdisc add dev {} root tbf rate {} latency 400ms burst 3000kbit".format(
            self.eth, bw_limit)
        print('deleting old bw limit')
        self._p_exe(del_cmd)
        print(
            'confirm the bw limit deleted (should see error when redoing del)')
        self._p_exe(del_cmd)
        if bw_limit != "":
            self._p_exe(add_cmd)

    def _exe_res_monitor(self, tg_folder):
        """ execute cpu and network bandwidth monitor """
        # record existing logs
        cpu_monitor_script = expanduser("~/autorun/monitor_cpu.py")
        net_monitor_script = expanduser("~/autorun/monitor_net.py")
        cpu_p = subprocess.Popen(
            ["python3", cpu_monitor_script, join(tg_folder, "mt_cpu.log")],
            stdout=subprocess.DEVNULL)
        net_p = subprocess.Popen(
            ["python3", net_monitor_script, join(tg_folder, "mt_net.log")],
            stdout=subprocess.DEVNULL)
        return cpu_p, net_p

    def __del__(self):
        # Best-effort container cleanup when the object is collected.
        stop_cmd = "docker kill $(docker ps -q)"
        self._p_exe(stop_cmd)
def _init_host_ssh(self):
    """connect to all nodes"""
    # NOTE(review): "pClinet" is a typo for "pClient", but sibling methods
    # read self.pClinet, so the attribute name must stay as-is.
    self.pClinet = ParallelSSHClient(self.nodes, pkey=self.host_key)
import os from sftp import copy_remote_to_local from post_process_perfstat import post_process_perfstat from collections import defaultdict exp_name = sys.argv[1] total_duration = int(sys.argv[2]) #given in seconds output_dir = sys.argv[3] total_duration_in_ms = total_duration * 1000 #scale to ms #print ("debug>>", sys.argv) hosts = ['kubenode-1', 'kubenode-2', 'kubenode-3', 'kubenode-4'] #hosts = ['kubenode-1'] user = "******" client = ParallelSSHClient(hosts, user) try: output = client.run_command('sh ./perfstat_node/perfstat.sh {} {}'.format( exp_name, total_duration_in_ms)) #print ("debug>> executed") except Exception as e: print e time.sleep(total_duration) #print ("debug>> wakeup") ''' for host, host_output in output.items(): for line in host_output.stdout: print(line)
# Add colored output
# Sanitize inputs
# Handle different usernames
from docopt import docopt
from prompt_toolkit import prompt
from pssh.clients.native import ParallelSSHClient


def runCommand(command, client):
    """Run *command* on every host and print each stdout line."""
    print(" Running command %s" % (command))
    results = client.run_command(command)
    for server, serverOutput in results.items():
        for line in serverOutput.stdout:
            print("%s: %s" % (server, line))


if __name__ == '__main__':
    args = docopt(__doc__, version='1')
    # print("Got args: ", args)
    client = ParallelSSHClient(args['SERVER'], user=args['--user'])
    runCommand('hostname', client)
    # Simple REPL: run whatever the user types until they enter "exit".
    command = prompt('$ ')
    while command != 'exit':
        runCommand(command, client)
        command = prompt('$ ')
import os from lib.sftp import copy_multiple_remote_to_local from perfstat_processor import post_process_perfstat from collections import defaultdict exp_name = sys.argv[1] total_duration = int(sys.argv[2]) #given in seconds output_dir = sys.argv[3] total_duration_in_ms = total_duration * 1000 #scale to ms #print ("debug>>", sys.argv) hosts = ['node-1', 'node-2', 'node-3', 'node-4'] list_of_kvm = hosts.copy() user = "******" client = ParallelSSHClient(hosts, user) try: output = client.run_command( 'sudo sh ./perfstat_node/perfstat.sh {} {}'.format( exp_name, total_duration_in_ms)) #print ("debug>> executed") except Exception as e: print(e) time.sleep(total_duration) #print ("debug>> wakeup") ''' for host, host_output in output.items(): for line in host_output.stdout:
def test(client):
    """One-shot probe run: print each host's collected stdout."""
    output = run2(client, '/tmp/outland.py')
    print(output)
    for host in hosts:
        lines = ''
        try:
            lines = list(output[host].stdout)
        except Timeout:
            client.reset_output_generators(output[host])
            print(host, r.get(host))
        print(host, lines)


if __name__ == "__main__":
    args = get_options()
    client = ParallelSSHClient(hosts)
    r = redis.StrictRedis(host=config['CMD_BROKER'], port=6379, db=0)
    if args.test:
        test(client)
        exit()
    output = update(client)
    while True:
        for host in hosts:
            stdout = ''
            try:
                stdout = list(output[host].stdout)
            except Timeout:
                client.reset_output_generators(output[host])
            print(host, r.get(host))
import getpass
import time

# Worker pool for the distributed scraper.
hosts = [
    'brki164-lnx-5.bucknell.edu', 'brki164-lnx-6.bucknell.edu',
    'brki164-lnx-7.bucknell.edu', 'brki164-lnx-8.bucknell.edu',
    'brki164-lnx-9.bucknell.edu', 'brki164-lnx-10.bucknell.edu',
    'brki164-lnx-11.bucknell.edu', 'brki164-lnx-12.bucknell.edu',
    'brki164-lnx-13.bucknell.edu', 'brki164-lnx-14.bucknell.edu',
    'brki164-lnx-15.bucknell.edu', 'brki164-lnx-16.bucknell.edu',
    'brki164-lnx-17.bucknell.edu', 'brki164-lnx-18.bucknell.edu'
]

passw = getpass.getpass(prompt="George Password")
client = ParallelSSHClient(hosts, user='******', password=passw)
# print(client.hosts[0])
# client.run_command('cd Downloads/')

# Each scraper instance receives the host count so it can partition work.
# Bug fix: use_pty was the misspelled string 'Fasle', which is truthy and
# therefore *enabled* a PTY; pass the intended boolean False.
output = client.run_command(
    'cd Code/NLP/Soup/Not_Slim/ && python distributive_scraper.py '
    + str(len(hosts)),
    use_pty=False)

for host in hosts:
    for line in output[host].stdout:
        print(line)

for host in output:
    print(output[host].exit_code)
def install(self, user_pem=None, quiet_mode=False):
    """Build the local package and pip-install it on every active node
    of the matching EMR cluster.

    :param user_pem: Optional path to the .pem key; falls back to
        config["PemFilePath"].
    :param quiet_mode: When True print each host's install stderr
        instead of stdout.
    :raises ValueError: when the cluster is not active, or is not
        tagged as a local/dev cluster.
    """
    pem_path = user_pem if user_pem is not None else self.config[
        "PemFilePath"]
    cluster_id = self.find_job_flow_id()
    desc_cluster = self.emr.describe_cluster(ClusterId=cluster_id)
    cluster_state = desc_cluster['Cluster']['Status']['State']
    if cluster_state not in ['WAITING', 'RUNNING']:
        raise ValueError("Cluster is not active")
    # Guard: only clusters tagged as "local" may receive a local build.
    # fail_check starts True and is cleared when a valid tag is found.
    tags_list = desc_cluster['Cluster']['Tags']
    fail_check = True
    valid_description = ["env=local"]
    valid_names = ['local']
    for tag in tags_list:
        if 'Description' in tag['Key'] and any(
                value in tag['Value'] for value in valid_description):
            fail_check = False
            break
        if 'Name' in tag['Key'] and any(name in tag['Value']
                                        for name in valid_names):
            fail_check = False
            break
    # Bug fix: the condition was inverted ("if not fail_check"), which
    # rejected correctly-tagged clusters and accepted untagged ones.
    if fail_check:
        print("Cluster tags should contain Key=Name, Value='local']")
        print(
            "Cluster tags should contain Key=Description, Value='env=local']"
        )
        raise ValueError(
            "Error: Local build can not deployed on this cluster {0}".
            format(cluster_id))
    tar_file_nl = HandleEMRCommands.build()
    tar_file_name = tar_file_nl[0]
    tar_file_location = tar_file_nl[1]
    if pem_path is not None:
        response = self.emr.list_instances(ClusterId=cluster_id, )
        response_code = response['ResponseMetadata']['HTTPStatusCode']
        if response_code == 200:
            hosts = self.active_instances(response)
            print(hosts)
            client = ParallelSSHClient(hosts, user='******', pkey=pem_path)
            # Ship the built tarball to every node, then install it.
            copy_files = client.copy_file(tar_file_location,
                                          '/home/hadoop/' + tar_file_name)
            joinall(copy_files, raise_error=True)
            output = client.run_command(
                "python3 -m pip install --upgrade --no-deps --force-reinstall /home/hadoop/"
                + tar_file_name,
                sudo=True)
            for host, host_output in output.items():
                if quiet_mode:
                    for line in host_output.stderr:
                        print(line)
                else:
                    for line in host_output.stdout:
                        print(line)
            print("Deployed to all nodes")
    return
def script_runner(self, user_pem=None, user_script_name=None, quiet_mode=False):
    """Copy a shell script to every active cluster node and execute it.

    :param user_pem: Optional .pem path; falls back to
        config["PemFilePath"].
    :param user_script_name: Optional script path (absolute or relative);
        falls back to config["ScriptToRun"]["File"].
    :param quiet_mode: When True print each host's stderr instead of
        stdout.
    :return: the list of hosts the script ran on.
    :raises ValueError: when the script or pem path is unspecified, or
        instances cannot be listed.
    """
    script_name = user_script_name if user_script_name is not None else self.config[
        "ScriptToRun"]["File"]
    pem_path = user_pem if user_pem is not None else self.config[
        "PemFilePath"]
    if script_name is not None:
        if pem_path is not None:
            job_flow_id = self.find_job_flow_id()
            response = self.emr.list_instances(ClusterId=job_flow_id, )
            response_code = response['ResponseMetadata']['HTTPStatusCode']
            if response_code == 200:
                hosts = self.active_instances(response)
                print(hosts)
                client = ParallelSSHClient(hosts, user='******', pkey=pem_path)
                if script_name.startswith("/"):
                    # handle absolute path
                    to_script_name = "/home/hadoop/{}".format(
                        os.path.basename(script_name))
                    from_script_name = script_name
                else:
                    # handle relative path
                    to_script_name = "/home/hadoop/{}".format(script_name)
                    from_script_name = os.path.join(
                        os.getcwd(), script_name)
                logger.info("Copying script {} to {}".format(
                    from_script_name, to_script_name))
                copy_files = client.copy_file(from_script_name, to_script_name)
                joinall(copy_files, raise_error=True)
                logger.info("Finished copying script {} to {}".format(
                    from_script_name, to_script_name))
                logger.info("Running script {}".format(to_script_name))
                # Make executable and run in one shell invocation, as root.
                output = client.run_command("chmod +x {} && {}".format(
                    to_script_name, to_script_name), sudo=True)
                for host, host_output in output.items():
                    if quiet_mode:
                        for line in host_output.stderr:
                            print(line)
                    else:
                        for line in host_output.stdout:
                            print(line)
                logger.info("Finished script {}".format(to_script_name))
                return hosts
            else:
                raise ValueError(
                    "Could not list instances (status code {})".format(
                        response))
        else:
            raise ValueError(
                'pem_file_path is not specified in emrcliconfig_inst_fleets.yaml "pem_file_path:%s"'
                % pem_path)
    else:
        raise ValueError("script runner shell script not specified")