Example #1
 def test_single_tunnel_multi_hosts(self):
     remote_host = '127.0.0.8'
     remote_server = ThreadedOpenSSHServer(listen_ip=remote_host,
                                           port=self.port)
     remote_server.start()
     remote_server.wait_for_port()
     hosts = [remote_host, remote_host, remote_host]
     try:
         client = ParallelSSHClient(hosts,
                                    port=self.port,
                                    pkey=self.user_key,
                                    proxy_host=self.proxy_host,
                                    proxy_port=self.port,
                                    num_retries=1,
                                    proxy_pkey=self.user_key)
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         for host, host_out in output.items():
             _stdout = list(host_out.stdout)
             self.assertListEqual(_stdout, [self.resp])
         self.assertEqual(len(hosts), len(list(output.keys())))
         del client
     finally:
         remote_server.stop()
         remote_server.join()
Example #2
 def new_parallel_ssh_client(cls,
                             config,
                             key_path=None) -> ParallelSSHClient:
     hostnames = config.keys()
     try:
         if SSH.PROXY:
             client = ParallelSSHClient(
                 hosts=hostnames,
                 host_config=config,
                 pkey=key_path,
                 proxy_host=SSH.PROXY['proxy_host'],
                 proxy_user=SSH.PROXY['proxy_user'],
                 proxy_port=SSH.PROXY['proxy_port']
                 # Ignore timeout and num_retries for proxy
             )
         else:
             client = ParallelSSHClient(hosts=hostnames,
                                        host_config=config,
                                        timeout=SSH.TIMEOUT,
                                        pkey=key_path,
                                        num_retries=SSH.NUM_RETRIES)
     except PKeyFileError as e:
         log.error('[✘] {}'.format(str(e)))
         return None
     else:
         return client
Example #3
 def test_tunnel_remote_host_timeout(self):
     remote_host = '127.0.0.18'
     proxy_host = '127.0.0.19'
     server = ThreadedOpenSSHServer(listen_ip=proxy_host, port=self.port)
     remote_server = ThreadedOpenSSHServer(listen_ip=remote_host,
                                           port=self.port)
     for _server in (server, remote_server):
         _server.start()
         _server.wait_for_port()
     try:
         client = ParallelSSHClient([remote_host],
                                    port=self.port,
                                    pkey=self.user_key,
                                    proxy_host=proxy_host,
                                    proxy_port=self.port,
                                    num_retries=1,
                                    proxy_pkey=self.user_key)
         output = client.run_command(self.cmd)
         client.join(output)
         client._tunnel.cleanup()
         for _server in (server, remote_server):
             _server.stop()
             _server.join()
         # Gevent timeout cannot be caught by stop_on_errors
         self.assertRaises(GTimeout,
                           client.run_command,
                           self.cmd,
                           greenlet_timeout=1,
                           stop_on_errors=False)
     finally:
         for _server in (server, remote_server):
             _server.stop()
Example #4
 def test_tunnel_parallel_client_part_failure(self):
     hosts = ['127.0.0.11', '127.0.0.12', '127.0.0.13']
     servers = [
         OpenSSHServer(listen_ip=_host, port=self.port) for _host in hosts
     ]
     servers[0].start_server()
     try:
         client = ParallelSSHClient(
             hosts,
             port=self.port,
             pkey=self.user_key,
             proxy_host=self.proxy_host,
             proxy_pkey=self.user_key,
             proxy_port=self.proxy_port,
             num_retries=1,
             retry_delay=.1,
         )
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         self.assertEqual(len(hosts), len(output))
         self.assertTrue(output[1].exception is not None)
         self.assertTrue(output[2].exception is not None)
         self.assertListEqual(list(output[0].stdout), [self.resp])
     finally:
         for server in servers:
             server.stop()
Example #5
 def test_tunnel_remote_host_timeout(self):
     remote_host = '127.0.0.18'
     proxy_host = '127.0.0.19'
     server = ThreadedOpenSSHServer(listen_ip=proxy_host, port=self.port)
     remote_server = ThreadedOpenSSHServer(listen_ip=remote_host, port=self.port)
     for _server in (server, remote_server):
         _server.start()
         _server.wait_for_port()
     try:
         client = ParallelSSHClient(
             [remote_host], port=self.port, pkey=self.user_key,
             proxy_host=proxy_host, proxy_port=self.port, num_retries=1,
             proxy_pkey=self.user_key)
         output = client.run_command(self.cmd)
         client.join(output)
         client._tunnel.cleanup()
         for _server in (server, remote_server):
             _server.stop()
             _server.join()
         # Gevent timeout cannot be caught by stop_on_errors
         self.assertRaises(GTimeout, client.run_command, self.cmd,
                           greenlet_timeout=1, stop_on_errors=False)
     finally:
         for _server in (server, remote_server):
             _server.stop()
Example #6
 def test_proxy_error(self):
     client = ParallelSSHClient([self.proxy_host],
                                self.port,
                                pkey=self.user_key,
                                proxy_host='127.0.0.155',
                                proxy_port=123,
                                num_retries=1)
     output = client.run_command(self.cmd, stop_on_errors=False)
     client.join(output)
     self.assertIsInstance(output[0].exception, ProxyError)
Example #7
 def test_tunnel_init_failure(self):
     proxy_host = '127.0.0.20'
     client = ParallelSSHClient(
         [self.host], port=self.port, pkey=self.user_key,
         proxy_host=proxy_host, proxy_port=self.port, num_retries=1,
         proxy_pkey=self.user_key)
     output = client.run_command(self.cmd, stop_on_errors=False)
     client.join(output)
     exc = output[self.host].exception
     self.assertIsInstance(exc, ProxyError)
     self.assertIsInstance(exc.args[1], ConnectionErrorException)
Example #8
def push_docker_image(hosts: Iterable[RemoteHost], local_image,
                      local_file: str, remote_file: str) -> None:
    """Push a local Docker image to a set of remote hosts.

    Does not verify host keys.

    :param hosts: Set of remote hosts to upload the image to.
    :param local_image: Instance of a Docker SDK image object corresponding to the image to
                        distribute.
    :param local_file: Path to a temporary file the image is dumped to.
    :param remote_file: Path to use for a temporary image file on the remote host.
    """
    # Save local image to file.
    # Running "docker save %s | gzip > %s" %(local_image.id, local_file) on the host might be
    # faster, because the Python API seems to write a temporary file and then return it instead
    # of streaming the data.
    log.info("Writing image %s to file '%s'.", local_image.short_id,
             local_file)
    with gzip.open(local_file, 'wb') as file:
        for chunk in local_image.save(named=True):
            file.write(chunk)

    host_ips = [str(host.ssh_host) for host in hosts]
    host_config = {
        str(host.ssh_host): host.get_host_config()
        for host in hosts
    }

    log.info("Copying image %s to %s.", local_image.short_id,
             ", ".join("%s (%s)" % (h.name, h.ssh_host) for h in hosts))

    try:
        ssh_client = ParallelSSHClient(host_ips, host_config=host_config)

        # Copy image file to remote hosts.
        greenlets = ssh_client.scp_send(local_file, remote_file)
        # Not ideal: Waits until all hosts have the image before proceeding.
        gevent.joinall(greenlets, raise_error=True)

        # Load image from file.
        output = ssh_client.run_command("gunzip -c %s | docker image load" %
                                        remote_file)
        ssh_client.join(output)
        for host, host_output in output.items():
            buffer = io.StringIO()
            for line in host_output.stdout:
                print(line, file=buffer)
            level = logging.INFO if host_output.exit_code == 0 else logging.WARNING
            log.log(level, "%s responds:\n%s", host, buffer.getvalue())

        # Delete the image file.
        output = ssh_client.run_command("rm %s" % remote_file)
        ssh_client.join(output)

    except Exception:
        log.error("Pushing docker image to remote hosts failed.")
        raise
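
The comment near the top of this function suggests that shelling out to "docker save | gzip" may be faster than streaming chunks through the Docker SDK. A minimal sketch of that alternative, assuming the docker CLI is available on the local machine (a hypothetical helper, not part of the original function):

import subprocess

def save_image_via_cli(image_id: str, local_file: str) -> None:
    # Hypothetical alternative: let docker and gzip stream straight to disk
    # instead of buffering each chunk through Python as local_image.save() does.
    subprocess.run("docker save %s | gzip > %s" % (image_id, local_file),
                   shell=True, check=True)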
Example #9
def ssh_host(host='', directory=''):
    if host and directory:
        with tempfile.NamedTemporaryFile() as local_test_file:
            local_test_file.write(b'Testing CephFS_Sync')
            local_test_file.flush()  # ensure bytes are on disk before scp_send
            ssh_client = ParallelSSHClient([host], 'root')
            target_file = (directory + '/cephfs_sync_' + log_timefy())
            ssh_copy = ssh_client.scp_send(local_test_file.name, target_file)
            #enable_host_logger()
            try:
                joinall(ssh_copy, raise_error=True)
            except Exception:
                return False
    return True
Example #10
 def test_tunnel_init_failure(self):
     proxy_host = '127.0.0.20'
     client = ParallelSSHClient([self.host],
                                port=self.port,
                                pkey=self.user_key,
                                proxy_host=proxy_host,
                                proxy_port=self.port,
                                num_retries=1,
                                proxy_pkey=self.user_key)
     output = client.run_command(self.cmd, stop_on_errors=False)
     client.join(output)
     exc = output[self.host].exception
     self.assertIsInstance(exc, ProxyError)
     self.assertIsInstance(exc.args[1], ConnectionErrorException)
Example #11
 def test_tunnel_parallel_client_running_fail(self):
     hosts = ['127.0.0.11', '127.0.0.12', '127.0.0.13', '127.0.0.14']
     servers = [
         OpenSSHServer(listen_ip=_host, port=self.port) for _host in hosts
     ]
     for server in servers:
         server.start_server()
     try:
         client = ParallelSSHClient(
             hosts,
             port=self.port,
             pkey=self.user_key,
             proxy_host=self.proxy_host,
             proxy_pkey=self.user_key,
             proxy_port=self.proxy_port,
             num_retries=1,
         )
         output = client.run_command(self.cmd)
         client.join(output)
         for server in (servers[2], servers[3]):
             server.stop()
             server.server_proc.communicate()
         client._host_clients[(2, hosts[2])].disconnect()
         client._host_clients[(3, hosts[3])].disconnect()
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         self.assertEqual(len(hosts), len(output))
         self.assertTrue(output[2].exception is not None)
         self.assertTrue(output[3].exception is not None)
         self.assertListEqual(list(output[0].stdout), [self.resp])
         self.assertListEqual(list(output[1].stdout), [self.resp])
     finally:
         for server in servers:
             server.stop()
Example #12
 def test_tunnel(self):
     remote_host = '127.0.0.8'
     remote_server = OpenSSHServer(listen_ip=remote_host, port=self.port)
     remote_server.start_server()
     try:
         client = ParallelSSHClient(
             [remote_host], port=self.port, pkey=self.user_key,
             proxy_host=self.proxy_host, proxy_port=self.port, num_retries=1,
             proxy_pkey=self.user_key)
         output = client.run_command(self.cmd)
         client.join(output)
         for host, host_out in output.items():
             _stdout = list(host_out.stdout)
             self.assertListEqual(_stdout, [self.resp])
         self.assertEqual(remote_host, list(output.keys())[0])
         del client
     finally:
         remote_server.stop()
Example #13
def main():
    exp_name = sys.argv[1]
    total_duration = int(sys.argv[2])
    output_dir = sys.argv[3]
    start_position = int(sys.argv[4])
    end_position = int(sys.argv[5])
    interval = 1
    cpu_frequency = total_duration // interval  # number of vmstat samples
    user = "******"

    hosts = []
    for i in range(1, 5):
        for j in range(1, 5):
            hosts.append("kb-w{}{}".format(i, j))

    client = ParallelSSHClient(hosts, user)

    try:
        output = client.run_command('vmstat {} {} > {}_vmfile.tmp'.format(
            interval, cpu_frequency, exp_name))
    except Exception as e:
        print(e)


    time.sleep(total_duration)

    # create the output dir if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for vm in hosts:
        src_file_name = "{}_vmfile.tmp".format(exp_name)
        dst_file_name = "{}_vmfile.tmp".format(vm)
        print("trying to copy: {} {} {}".format(vm,src_file_name,dst_file_name,output_dir))
        copy_single_remote_to_local(vm,src_file_name, dst_file_name, output_dir)
        #post_process_outfile
        output_file = os.path.join(output_dir,"{}_vmfile.csv".format(vm))
        input_file = os.path.join(output_dir,"{}_vmfile.tmp".format(vm))
        print("debug: process_vmstat {} {} {} {}".format(input_file,start_position,end_position,output_file))
        process_vmstat(input_file,start_position,end_position, output_file)
Example #14
def is_snapshot_enabled(src_path='', host=None):
    if src_path and host is None:
        snap_dir = src_path + '/.snap'
        if does_dir_exist(snap_dir):
            return True

    if host is not None and src_path:
        read_file_check = 'cannot read file system information'
        snap_dir = src_path + '/.snap'
        cmd = 'stat -f {}'.format(snap_dir)
        ssh_client = ParallelSSHClient([host], 'root')
        #enable_host_logger()
        ssh_client_output = ssh_client.run_command(cmd)
        for host_name, host_output in ssh_client_output.items():
            for output_line in host_output.stdout:
                if read_file_check in output_line:
                    return False
        return True
    return False
Example #15
 def test_single_tunnel_multi_hosts_timeout(self):
     remote_host = '127.0.0.8'
     remote_server = ThreadedOpenSSHServer(
         listen_ip=remote_host, port=self.port)
     remote_server.start()
     remote_server.wait_for_port()
     hosts = [remote_host, remote_host, remote_host]
     try:
         client = ParallelSSHClient(
             hosts, port=self.port, pkey=self.user_key,
             proxy_host=self.proxy_host, proxy_port=self.port, num_retries=1,
             proxy_pkey=self.user_key,
             timeout=.001)
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         for host, host_out in output.items():
             self.assertIsInstance(output[host].exception, Timeout)
     finally:
         remote_server.stop()
         remote_server.join()
Example #16
 def test_tunnel(self):
     remote_host = '127.0.0.8'
     remote_server = OpenSSHServer(listen_ip=remote_host, port=self.port)
     remote_server.start_server()
     try:
         client = ParallelSSHClient([remote_host],
                                    port=self.port,
                                    pkey=self.user_key,
                                    proxy_host=self.proxy_host,
                                    proxy_port=self.port,
                                    num_retries=1,
                                    proxy_pkey=self.user_key)
         output = client.run_command(self.cmd)
         client.join(output)
         for host, host_out in output.items():
             _stdout = list(host_out.stdout)
             self.assertListEqual(_stdout, [self.resp])
         self.assertEqual(remote_host, list(output.keys())[0])
         del client
     finally:
         remote_server.stop()
Example #17
def is_it_cephfs(src_path='', host=None):
    """
    Instead of matching a $src_path from a list of mounting points, then
    check if that is a 'cephfs' file system, we just test it straight with:

    $ stat -f /mnt/cephfs/data/isos/openSUSE-Tumbleweed-DVD-x86_64-Snapshot20190202-Media.iso

    Part of the output we expect *if we are running it on a 'cephfs'*, is:
        ...
        ID: 9892f072c60473e3 Namelen: 255     Type: ceph
        ...

        OR, *if we are not running it on a 'cephfs'*:

        ...
        ID: b333aa8538b776cf Namelen: 255     Type: btrfs
        ...
    """

    """cmd = "stat -f {} | egrep -i 'type: ceph' | wc -l".format(src_path)"""
    cephfs_check = 'Type: ceph'
    cmd = 'stat -f {}'.format(src_path)

    if host is not None:
        ssh_client = ParallelSSHClient([host], 'root')
        #enable_host_logger()
        ssh_client_output = ssh_client.run_command(cmd)
        for host_name, host_output in ssh_client_output.items():
            for output_line in host_output.stdout:
                if cephfs_check in output_line:
                    return True

    else:
        rc, stdout, stderr = cmd_launcher(cmd=cmd)
        if rc != 0:
            raise Exception('Error while executing the command {}. '
                            'Error message: {}'.format(cmd, stderr))

        return (cephfs_check in stdout)
    return False
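
A hypothetical usage sketch of the check above; the host name and mount path are placeholders, not values from the original code:

# Hypothetical usage; 'node-1' and the path are illustrative only.
if is_it_cephfs('/mnt/cephfs/data', host='node-1'):
    print('cephfs detected')
else:
    print('not a cephfs mount')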
Example #18
 def test_single_tunnel_multi_hosts(self):
     remote_host = '127.0.0.8'
     remote_server = ThreadedOpenSSHServer(
         listen_ip=remote_host, port=self.port)
     remote_server.start()
     remote_server.wait_for_port()
     hosts = [remote_host, remote_host, remote_host]
     try:
         client = ParallelSSHClient(
             hosts, port=self.port, pkey=self.user_key,
             proxy_host=self.proxy_host, proxy_port=self.port, num_retries=1,
             proxy_pkey=self.user_key)
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         for host, host_out in output.items():
             _stdout = list(host_out.stdout)
             self.assertListEqual(_stdout, [self.resp])
         self.assertEqual(len(hosts), len(list(output.keys())))
         del client
     finally:
         remote_server.stop()
         remote_server.join()
Example #19
 def test_tunnel_host_config(self):
     hosts = ['127.0.0.11', '127.0.0.12']
     servers = [
         OpenSSHServer(listen_ip=_host, port=self.port) for _host in hosts
     ]
     for server in servers:
         server.start_server()
     host_config = [
         HostConfig(proxy_host=self.proxy_host,
                    proxy_port=self.proxy_port,
                    proxy_pkey=self.user_key),
         HostConfig(proxy_host='127.0.0.155', proxy_port=123),
     ]
     client = ParallelSSHClient(hosts,
                                port=self.port,
                                pkey=self.user_key,
                                host_config=host_config,
                                num_retries=1)
     output = client.run_command(self.cmd, stop_on_errors=False)
     client.join(output)
     self.assertIsInstance(output[1].exception, ProxyError)
     stdout = list(output[0].stdout)
     self.assertListEqual(stdout, [self.resp])
Example #20
 def test_single_tunnel_multi_hosts_timeout(self):
     remote_host = '127.0.0.8'
     remote_server = ThreadedOpenSSHServer(listen_ip=remote_host,
                                           port=self.port)
     remote_server.start()
     remote_server.wait_for_port()
     hosts = [remote_host, remote_host, remote_host]
     try:
         client = ParallelSSHClient(hosts,
                                    port=self.port,
                                    pkey=self.user_key,
                                    proxy_host=self.proxy_host,
                                    proxy_port=self.port,
                                    num_retries=1,
                                    proxy_pkey=self.user_key,
                                    timeout=.001)
         output = client.run_command(self.cmd, stop_on_errors=False)
         client.join(output)
         for host, host_out in output.items():
             self.assertIsInstance(output[host].exception, Timeout)
     finally:
         remote_server.stop()
         remote_server.join()
Example #21
 def test_tunnel_parallel_client(self):
     hosts = ['127.0.0.1%s' % (d, ) for d in range(10)]
     servers = [
         OpenSSHServer(listen_ip=_host, port=self.port) for _host in hosts
     ]
     for server in servers:
         server.start_server()
     hosts_5 = [hosts[0], hosts[1], hosts[2], hosts[3], hosts[4]]
     try:
         client = ParallelSSHClient(
             hosts_5,
             port=self.port,
             pkey=self.user_key,
             proxy_host=self.proxy_host,
             proxy_pkey=self.user_key,
             proxy_port=self.proxy_port,
             num_retries=1,
         )
         start = datetime.now()
         output = client.run_command(self.cmd)
         end = datetime.now()
         dt_5 = end - start
         client = ParallelSSHClient(
             hosts,
             port=self.port,
             pkey=self.user_key,
             proxy_host=self.proxy_host,
             proxy_pkey=self.user_key,
             proxy_port=self.proxy_port,
             num_retries=1,
         )
         start = datetime.now()
         output = client.run_command(self.cmd)
         end = datetime.now()
         dt_10 = end - start
         dt = dt_10.total_seconds() / dt_5.total_seconds()
         # self.assertTrue(dt < 2)
         client.join(output)
         self.assertEqual(len(hosts), len(output))
         for i, host_out in enumerate(output):
             _stdout = list(host_out.stdout)
             self.assertListEqual(_stdout, [self.resp])
             self.assertEqual(hosts[i], host_out.host)
     finally:
         for server in servers:
             server.stop()
Example #22
 def test_tunnel_remote_host_timeout(self):
     remote_host = '127.0.0.18'
     proxy_host = '127.0.0.19'
     server = OpenSSHServer(listen_ip=proxy_host, port=self.port)
     server.start_server()
     try:
         client = ParallelSSHClient([remote_host],
                                    port=self.port,
                                    pkey=self.user_key,
                                    proxy_host=proxy_host,
                                    proxy_port=self.port,
                                    num_retries=1,
                                    proxy_pkey=self.user_key,
                                    timeout=2)
         try:
             client.run_command(self.cmd)
         except (GTimeout, Exception):
             pass
         else:
             raise Exception("Command neither failed nor timeout raised")
         client._tunnel.cleanup()
         server.stop()
     finally:
         server.stop()
Example #23
def run_command(client: ParallelSSHClient, command: str) -> CommandResult:
    """Executes identical command on all hosts attached to client.

    Will wait until all hosts complete the command execution or timeout is reached.
    Re-raises pssh exceptions.
    # TODO Handle more specific exceptions
    """
    # stop_on_errors=False lets the remaining hosts keep executing when one
    # fails; per-host exceptions are collected on the output objects.
    # output items look like: (hostname, host_output)
    try:
        result = client.run_command(command, stop_on_errors=False)
        client.join(result)
    except pssh.exceptions.Timeout:
        log.warning('Command `{}` reached time limit'.format(command))
        raise
    except pssh.exceptions.ProxyError as e:
        log.error('Could not connect to proxy server, reason: {}'.format(e))
        raise
    except Exception as e:
        log.critical(e)
        raise  # FIXME Find out what throws this exception
    else:
        log.debug('Command `{}` finished'.format(command))
        return result
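
A hypothetical usage sketch, assuming a pssh version whose run_command returns an iterable of HostOutput objects (2.x style); the hosts and key path are placeholders:

# Hypothetical usage; hosts and key path are illustrative only.
client = ParallelSSHClient(['10.0.0.1', '10.0.0.2'], pkey='~/.ssh/id_rsa')
result = run_command(client, 'uptime')
for host_out in result:
    print(host_out.host, list(host_out.stdout))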
Example #24
    def connectClientToJoinableHosts(self):
        client = ParallelSSHClient(self.hosts,
                                   host_config=self.hosts_config,
                                   num_retries=1,
                                   timeout=3)
        print("Connected")
        print("Launching command...")
        outputs = client.run_command('ls ' + self.passwords_directory +
                                     ' | egrep "*.kdbx"',
                                     stop_on_errors=False,
                                     timeout=3)
        hosts_databases_files = dict([(host_output.host, host_output)
                                      for host_output in outputs])

        # Filtering unjoinable hosts
        hosts_databases_files = dict(
            filter(lambda host: host[1].exception is None,
                   hosts_databases_files.items()))
        new_hosts_config = dict(
            filter(lambda host: host[0] in hosts_databases_files.keys(),
                   self.hosts_config.items()))
        new_hosts = new_hosts_config.keys()
        joinableClient = None

        if len(new_hosts_config) < len(self.hosts_config):
            print("Reconnected client without unjoinable hosts")
            if self.debug:
                print(new_hosts_config)
            if self.debug:
                print(new_hosts)
            joinableClient = ParallelSSHClient(new_hosts,
                                               host_config=new_hosts_config,
                                               num_retries=1,
                                               timeout=3)
        else:
            joinableClient = client
        return (joinableClient, hosts_databases_files, new_hosts_config,
                new_hosts)
Example #25
def get_client(config: HostsConfig,
               pconfig: Optional[ProxyConfig] = None,
               **kwargs) -> ParallelSSHClient:
    """Builds and returns an ssh client object for given configuration.

    Client is fetched directly from cache if identical arguments were used recently.
    """
    if pconfig is None:
        pconfig = {}

    return ParallelSSHClient(hosts=config.keys(),
                             host_config=config,
                             pkey=SSH.KEY_FILE,
                             proxy_host=pconfig.get('proxy_host'),
                             proxy_user=pconfig.get('proxy_user'),
                             proxy_port=pconfig.get('proxy_port'),
                             num_retries=0,
                             **kwargs)
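
The docstring mentions a cache, but no caching is visible in this snippet; a minimal sketch of how it might be layered on with functools.lru_cache, assuming hashable arguments (hypothetical, not the original mechanism):

from functools import lru_cache

# Hypothetical caching layer; lru_cache needs hashable arguments,
# so the host list is passed as a tuple.
@lru_cache(maxsize=16)
def _cached_client(hosts: tuple, key_file: str) -> ParallelSSHClient:
    return ParallelSSHClient(hosts=list(hosts), pkey=key_file, num_retries=0)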
Example #26
def test(client):
    output = run2(client, '/tmp/outland.py')
    print(output)
    for host in hosts:
        stdout = ''
        try:
            stdout = list(output[host].stdout)
        except Timeout:
            client.reset_output_generators(output[host])
        print(host, r.get(host))
        print(host, stdout)

if __name__ == "__main__":
    args = get_options()
    client = ParallelSSHClient(hosts)
    r = redis.StrictRedis(host=config['CMD_BROKER'], port=6379, db=0)
    
    if args.test:
        test(client)
        exit()
        
    output = update(client)
    while True:
        for host in hosts:
            stdout = ''
            try:
                stdout = list(output[host].stdout)
            except Timeout:
                client.reset_output_generators(output[host])
            print(host, r.get(host))
Example #27
parser.add_argument("--ExecutionTimes",
                    type=int,
                    default=1,
                    choices=xrange(1, 10),
                    help="Experiment Execution times")
parser.add_argument("--RunningLocal",
                    type=int,
                    default=0,
                    choices=xrange(0, 2),
                    help="is running local")

args = parser.parse_args()

# pssh-client
hosts = rasp_hosts
client = ParallelSSHClient(hosts, user='******')


def restart_flink():
    cmd = stop_flink
    process = subprocess.Popen(cmd.split(),
                               stdout=FNULL,
                               stderr=subprocess.STDOUT)
    output, error = process.communicate()

    time.sleep(2)

    cmd = start_flink
    process = subprocess.Popen(cmd.split(),
                               stdout=FNULL,
                               stderr=subprocess.STDOUT)
Example #28
class MimicExp:
    def __init__(self, config, debug=0):
        self.base_logdir = "autorun/chaokun_logs"
        self._parse_config(config)
        self.debug = debug

    def _parse_config(self, config):
        self.host_user_dir = config["host_user_dir"]
        self.docker_user_dir = config["docker_user_dir"]
        self.docker_user = config["docker_user"]
        self.docker_ssh_port = config["docker_ssh_port"]
        self.nodes = config['nodes']
        self.nGPU = config['nGPU']  # for each machine
        self.eth = config['eth']  # name of NIC
        self.host_key = config["host_ssh_key"]
        self.docker_key = config["docker_ssh_key"]

    def _init_host_ssh(self):
        """connect to all nodes"""
        self.pClient = ParallelSSHClient(self.nodes, pkey=self.host_key)

    def _init_docker_ssh(self):
        self.docker0 = SSHClient("localhost",
                                 user=self.docker_user,
                                 port=self.docker_ssh_port,
                                 pkey=self.docker_key)

    def _ini_host_env(self):
        """download log folder from aws-s3"""
        sentinel_cmd = "mkdir ~/autorun; "
        self._p_exe(sentinel_cmd)

        check_logs_cmd = "cd ~/autorun; mkdir tmp/; cd tmp/; rm mimic_env_setup.sh; "\
            "wget https://gist.githubusercontent.com/zarzen/012c2aa2a1c833e5bf1aeb379bbb9e93/raw/71dec6db3138dcc7e5318598bc770c6ce296b9a4/mimic_env_setup.sh; "\
            "/bin/bash mimic_env_setup.sh"
        self._p_exe(check_logs_cmd)

    def _p_exe(self, cmd):
        output = self.pClient.run_command(cmd)
        for host, host_output in output.items():
            for line in host_output.stdout:
                print("Host [%s] - %s" % (host, line))
            for line in host_output.stderr:
                print("Host [%s] - %s" % (host, line))

    def _docker_exe(self, cmd):
        _channel, _host, _stdout, _stderr, _ = self.docker0.run_command(cmd)
        for line in _stdout:
            print("[{}] {}".format(_host, line))
        for line in _stderr:
            print("[{}] err {}".format(_host, line))

    def _start_containers(self):
        stop_cmd = "docker kill $(docker ps -q)"
        pull_cmd = "docker pull zarzen/horovod-mod:1.0"
        start_cmd = "sudo docker run --gpus all --network=host --detach --ipc=host "\
            "-v {}/autorun/chaokun_logs:{}/chaokun_logs "\
            "zarzen/horovod-mod:1.0".format(self.host_user_dir, self.docker_user_dir
                                            )
        self._p_exe(stop_cmd)
        self._p_exe(pull_cmd)
        self._p_exe(start_cmd)

    def run(self):
        self._init_host_ssh()
        self._ini_host_env()
        self._start_containers()
        self._init_docker_ssh()

        exp_folders = os.listdir(join(self.host_user_dir, self.base_logdir))
        # use only the first few folders when debugging
        if self.debug:
            print("debug mode, experimentally run 3 configurations")
            exp_folders = exp_folders[:3]
        else:
            print("no debug flag, try to experiments on all configurations")

        for idx, _folder in enumerate(exp_folders):
            self._run_once(_folder)
            print("*" * 10, "Completed {}/{}".format(idx + 1,
                                                     len(exp_folders)),
                  "*" * 10)

    def _run_once(self, folder_name):
        """Run mimic training once for a single experiment folder."""
        # read the original experiment config to get the bw limit
        folder_path = join(self.host_user_dir, self.base_logdir, folder_name)
        if not os.path.isdir(folder_path) or \
            not os.path.exists(join(folder_path, "config.json")):
            return

        with open(join(folder_path, "config.json")) as ifile:
            config = json.load(ifile)
            bw_limit = config['bw_limit']
            print("mimic training with folder {} at bw {}".format(
                folder_name, bw_limit))
            self._bw_ctl(bw_limit)
            cpu_p, net_p = self._exe_res_monitor(folder_path)
            print(">" * 10, 'launched CPU & Network monitoring')
            mt_cmd = self._build_mpirun_cmd(config, folder_name)
            print("executing mimic training command:\n", mt_cmd)

            # import time
            # time.sleep(2)
            self._docker_exe(mt_cmd)
            cpu_p.terminate()
            net_p.terminate()

    def _build_mpirun_cmd(self, config, folder_name):
        """"""
        folder_path = join(self.docker_user_dir, "chaokun_logs", folder_name)
        nNodes = len(config['nodes'])
        nGPU = config['nGPU']
        if self.debug:
            # because test env only has two nodes with 1 GPU on each
            nNodes = 2
            nGPU = 1
        IPs = self.nodes[:nNodes]
        hostsStr = ",".join(["{}:{}".format(ip, nGPU) for ip in IPs])
        cmd = [
            "mpirun", "-np",
            str(nNodes * nGPU), "-H", hostsStr, "-bind-to", "none", "-map-by",
            "slot", "-x", "LD_LIBRARY_PATH=/usr/local/cuda/lib64", "-x",
            "NCCL_DEBUG=INFO", "-x", "NCCL_SOCKET_IFNAME=^lo,docker,ens4",
            "-mca", "btl_tcp_if_exclude lo,docker,ens4",
            self.docker_user_dir + "/mimic_dt/build/mdt_allreduce_perf",
            "-b 500M -e 500M -f 2 -g 1 -c 0 -w 0", "-l",
            join(folder_path, "log_for_dt_mimic.txt"), "|& grep -v \"Read -1\""
        ]
        return " ".join(cmd)

    def _bw_ctl(self, bw_limit):
        del_cmd = "sudo tc qdisc del dev {} root tbf rate 40Gbit latency 400ms burst 3000kbit".format(
            self.eth)
        # if bw_limit == "" then we don't execute the add_cmd
        add_cmd = "sudo tc qdisc add dev {} root tbf rate {} latency 400ms burst 3000kbit".format(
            self.eth, bw_limit)
        print('deleting old bw limit')
        self._p_exe(del_cmd)
        print(
            'confirm the bw limit deleted (should see error when redoing del)')
        self._p_exe(del_cmd)
        if bw_limit != "":
            self._p_exe(add_cmd)

    def _exe_res_monitor(self, tg_folder):
        """ execute cpu and network bandwidth monitor
        """
        # record existing logs
        cpu_monitor_script = expanduser("~/autorun/monitor_cpu.py")
        net_monitor_script = expanduser("~/autorun/monitor_net.py")
        cpu_p = subprocess.Popen(
            ["python3", cpu_monitor_script,
             join(tg_folder, "mt_cpu.log")],
            stdout=subprocess.DEVNULL)
        net_p = subprocess.Popen(
            ["python3", net_monitor_script,
             join(tg_folder, "mt_net.log")],
            stdout=subprocess.DEVNULL)
        return cpu_p, net_p

    def __del__(self):
        stop_cmd = "docker kill $(docker ps -q)"
        self._p_exe(stop_cmd)
Example #29
 def _init_host_ssh(self):
     """connect to all nodes"""
     self.pClient = ParallelSSHClient(self.nodes, pkey=self.host_key)
Example #30
import os
import sys
import time
from pssh.clients.native import ParallelSSHClient
from sftp import copy_remote_to_local
from post_process_perfstat import post_process_perfstat
from collections import defaultdict
exp_name = sys.argv[1]
total_duration = int(sys.argv[2])  #given in seconds
output_dir = sys.argv[3]

total_duration_in_ms = total_duration * 1000  #scale to ms

#print ("debug>>", sys.argv)

hosts = ['kubenode-1', 'kubenode-2', 'kubenode-3', 'kubenode-4']
#hosts = ['kubenode-1']
user = "******"
client = ParallelSSHClient(hosts, user)

try:
    output = client.run_command('sh ./perfstat_node/perfstat.sh {} {}'.format(
        exp_name, total_duration_in_ms))
    #print ("debug>> executed")
except Exception as e:
    print(e)

time.sleep(total_duration)

#print ("debug>> wakeup")
'''
for host, host_output in output.items():
    for line in host_output.stdout:
        print(line)
Example #31
# Add colored output
# Sanitize inputs
# Handle different usernames

from docopt import docopt
from prompt_toolkit import prompt
from pssh.clients.native import ParallelSSHClient


def runCommand(command, client):
    print(" Running command %s" % (command))
    output = client.run_command(command)

    for server, serverOutput in output.items():
        for line in serverOutput.stdout:
            print("%s:    %s" % (server, line))


if __name__ == '__main__':
    args = docopt(__doc__, version='1')

    #  print("Got args: ", args)

    client = ParallelSSHClient(args['SERVER'], user=args['--user'])

    runCommand('hostname', client)
    a = prompt('$ ')
    while a != 'exit':
        runCommand(a, client)
        a = prompt('$ ')
Example #32
import os
import sys
import time
from pssh.clients.native import ParallelSSHClient
from lib.sftp import copy_multiple_remote_to_local
from perfstat_processor import post_process_perfstat
from collections import defaultdict
exp_name = sys.argv[1]
total_duration = int(sys.argv[2])  #given in seconds
output_dir = sys.argv[3]

total_duration_in_ms = total_duration * 1000  #scale to ms

#print ("debug>>", sys.argv)

hosts = ['node-1', 'node-2', 'node-3', 'node-4']
list_of_kvm = hosts.copy()
user = "******"
client = ParallelSSHClient(hosts, user)

try:
    output = client.run_command(
        'sudo sh ./perfstat_node/perfstat.sh {} {}'.format(
            exp_name, total_duration_in_ms))
    #print ("debug>> executed")
except Exception as e:
    print(e)

time.sleep(total_duration)

#print ("debug>> wakeup")
'''
for host, host_output in output.items():
    for line in host_output.stdout:
Example #33
def test(client):
    output = run2(client, '/tmp/outland.py')
    print(output)
    for host in hosts:
        stdout = ''
        try:
            stdout = list(output[host].stdout)
        except Timeout:
            client.reset_output_generators(output[host])
        print(host, r.get(host))
        print(host, stdout)


if __name__ == "__main__":
    args = get_options()
    client = ParallelSSHClient(hosts)
    r = redis.StrictRedis(host=config['CMD_BROKER'], port=6379, db=0)

    if args.test:
        test(client)
        exit()

    output = update(client)
    while True:
        for host in hosts:
            stdout = ''
            try:
                stdout = list(output[host].stdout)
            except Timeout:
                client.reset_output_generators(output[host])
            print(host, r.get(host))
Example #34
import getpass
import time
from pssh.clients.native import ParallelSSHClient

hosts = [
    'brki164-lnx-5.bucknell.edu', 'brki164-lnx-6.bucknell.edu',
    'brki164-lnx-7.bucknell.edu', 'brki164-lnx-8.bucknell.edu',
    'brki164-lnx-9.bucknell.edu', 'brki164-lnx-10.bucknell.edu',
    'brki164-lnx-11.bucknell.edu', 'brki164-lnx-12.bucknell.edu',
    'brki164-lnx-13.bucknell.edu', 'brki164-lnx-14.bucknell.edu',
    'brki164-lnx-15.bucknell.edu', 'brki164-lnx-16.bucknell.edu',
    'brki164-lnx-17.bucknell.edu', 'brki164-lnx-18.bucknell.edu'
]

passw = getpass.getpass(prompt="George Password")

client = ParallelSSHClient(hosts, user='******', password=passw)
#print(client.hosts[0])

#client.run_command('cd Downloads/')
output = client.run_command(
    'cd Code/NLP/Soup/Not_Slim/ && python distributive_scraper.py ' +
    str(len(hosts)),
    use_pty=False)

for host in hosts:
    for line in output[host].stdout:
        print(line)

for host in output:
    print(output[host].exit_code)
Example #35
    def install(self, user_pem=None, quiet_mode=False):
        pem_path = user_pem if user_pem is not None else self.config[
            "PemFilePath"]
        cluster_id = self.find_job_flow_id()
        desc_cluster = self.emr.describe_cluster(ClusterId=cluster_id)
        cluster_state = desc_cluster['Cluster']['Status']['State']
        if cluster_state not in ['WAITING', 'RUNNING']:
            raise ValueError("Cluster is not active")
        tags_list = desc_cluster['Cluster']['Tags']

        fail_check = True
        valid_description = ["env=local"]
        valid_names = ['local']

        for tag in tags_list:
            if 'Description' in tag['Key'] and any(
                    value in tag['Value'] for value in valid_description):
                fail_check = False
                break
            if 'Name' in tag['Key'] and any(name in tag['Value']
                                            for name in valid_names):
                fail_check = False
                break

        if fail_check:
            print("Cluster tags should contain Key=Name, Value='local'")
            print(
                "Cluster tags should contain Key=Description, Value='env=local'"
            )
            raise ValueError(
                "Error: Local build can not be deployed on this cluster {0}".
                format(cluster_id))

        tar_file_nl = HandleEMRCommands.build()
        tar_file_name = tar_file_nl[0]
        tar_file_location = tar_file_nl[1]

        if pem_path is not None:
            response = self.emr.list_instances(ClusterId=cluster_id)
            response_code = response['ResponseMetadata']['HTTPStatusCode']
            if response_code == 200:

                hosts = self.active_instances(response)

                print(hosts)

                client = ParallelSSHClient(hosts, user='******', pkey=pem_path)
                copy_files = client.copy_file(tar_file_location,
                                              '/home/hadoop/' + tar_file_name)
                joinall(copy_files, raise_error=True)

                output = client.run_command(
                    "python3 -m pip install --upgrade --no-deps --force-reinstall /home/hadoop/"
                    + tar_file_name,
                    sudo=True)
                for host, host_output in output.items():
                    if quiet_mode:
                        for line in host_output.stderr:
                            print(line)
                    else:
                        for line in host_output.stdout:
                            print(line)
                print("Deployed to all nodes")

        return
Example #36
    def script_runner(self,
                      user_pem=None,
                      user_script_name=None,
                      quiet_mode=False):
        """
        :return:
        """
        script_name = user_script_name if user_script_name is not None else self.config[
            "ScriptToRun"]["File"]
        pem_path = user_pem if user_pem is not None else self.config[
            "PemFilePath"]

        if script_name is not None:
            if pem_path is not None:
                job_flow_id = self.find_job_flow_id()
                response = self.emr.list_instances(ClusterId=job_flow_id)
                response_code = response['ResponseMetadata']['HTTPStatusCode']
                if response_code == 200:

                    hosts = self.active_instances(response)

                    print(hosts)

                    client = ParallelSSHClient(hosts,
                                               user='******',
                                               pkey=pem_path)

                    if script_name.startswith("/"):
                        # handle absolute path
                        to_script_name = "/home/hadoop/{}".format(
                            os.path.basename(script_name))
                        from_script_name = script_name
                    else:
                        # handle relative path
                        to_script_name = "/home/hadoop/{}".format(script_name)
                        from_script_name = os.path.join(
                            os.getcwd(), script_name)

                    logger.info("Copying script {} to {}".format(
                        from_script_name, to_script_name))

                    copy_files = client.copy_file(from_script_name,
                                                  to_script_name)
                    joinall(copy_files, raise_error=True)

                    logger.info("Finished copying script {} to {}".format(
                        from_script_name, to_script_name))

                    logger.info("Running script {}".format(to_script_name))

                    output = client.run_command("chmod +x {} && {}".format(
                        to_script_name, to_script_name),
                                                sudo=True)

                    for host, host_output in output.items():
                        if quiet_mode:
                            for line in host_output.stderr:
                                print(line)
                        else:
                            for line in host_output.stdout:
                                print(line)

                    logger.info("Finished script {}".format(to_script_name))

                    return hosts

                else:
                    raise ValueError(
                        "Could not list instances (status code {})".format(
                            response_code))
            else:
                raise ValueError(
                    'pem_file_path is not specified in emrcliconfig_inst_fleets.yaml "pem_file_path:%s"'
                    % pem_path)
        else:
            raise ValueError("script runner shell script not specified")