Exemplo n.º 1
0
def run_multi_host_workload(prm):
    """Run the workload on every host in ``prm.host_set`` and print results.

    Steps, in order:

    1. create the top-level directories and write ``prm`` to
       ``param.pickle`` inside ``prm.master_invoke.network_dir``;
    2. build and start one thread per host (``launcher_thread`` on Windows
       or when ``prm.launch_by_daemon`` is set, ``ssh_thread`` otherwise),
       each running ``smallfile_remote.py``;
    3. poll for every host's "ready" file until all exist, the abort file
       appears, a launch thread dies, or no further host shows up within
       ``prm.host_startup_timeout`` seconds (+20 on Windows);
    4. write the starting-gate file to ask all hosts to start the test;
    5. join the threads, read each host's result pickle and pass the
       combined list to ``output_results.output_results``.

    :param prm: test parameters; this function reads ``host_set``,
        ``permute_host_dirs``, ``master_invoke``, ``remote_pgm_dir``,
        ``launch_by_daemon`` and ``host_startup_timeout`` from it.
    :raises Exception: if the running Python version is not recognized, or
        if the hosts did not all reach the starting gate and no other
        exception was seen while waiting.

    When it does not raise, the function ends in ``sys.exit`` with ``OK``
    if results were output and ``NOTOK`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the parameters into the network sync directory, the same
    # directory that is passed to every remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # $PYPY, when set, names the interpreter used in the remote command;
    # otherwise choose by the major version of the running interpreter

    pypy_prog = os.getenv('PYPY')
    if pypy_prog:
        python_prog = pypy_prog
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of threads to invoke in parallel, one per host

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j, remote_host in enumerate(prm_host_set):
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, master_invoke.network_dir)

        # with permute_host_dirs, host j is told to act as host j+1
        # (wrapping around at the end of the list)

        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd += ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass resumes the scan from the last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    h = None  # host whose ready file was checked most recently
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of the launch threads has died, no reason to continue;
            # report the host that thread was launched for

            dead_thread = next(
                (t for t in remote_thread_list if not t.is_alive()), None)
            if dead_thread is not None:
                print('thread %s on host %s has died'
                      % (dead_thread, dead_thread.remote_host))
                break

            # be patient for large tests: the polling interval grows by
            # one second on every pass

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # never open the starting gate after the user asked to stop
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if h is not None:
            print('ERROR: host %s did not reach starting gate' % h)
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        # use the saved exception: the name bound by "except ... as e"
        # no longer exists here on Python 3
        print('saw exception %s, aborting test' % str(exception_seen))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE(review): pickle.load can execute arbitrary code;
                # this assumes the shared directory holds trusted files
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped;
                # any other I/O error ends the aggregation
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Exemplo n.º 2
0
def run_multi_host_workload(prm):
    """Run the workload on every host in ``prm.host_set`` and print results.

    Steps, in order:

    1. create the top-level directories and write ``prm`` to
       ``param.pickle`` inside ``prm.master_invoke.network_dir``;
    2. build and start one thread per host (``launcher_thread`` on Windows
       or when ``prm.launch_by_daemon`` is set, ``ssh_thread`` otherwise),
       each running ``smallfile_remote.py``;
    3. poll for every host's "ready" file until all exist, the abort file
       appears, a launch thread dies, or no further host shows up within
       ``prm.host_startup_timeout`` seconds (+20 on Windows);
    4. write the starting-gate file to ask all hosts to start the test;
    5. join the threads, read each host's result pickle and pass the
       combined list to ``output_results.output_results``.

    :param prm: test parameters; this function reads ``host_set``,
        ``permute_host_dirs``, ``master_invoke``, ``remote_pgm_dir``,
        ``launch_by_daemon`` and ``host_startup_timeout`` from it.
    :raises Exception: if the running Python version is not recognized, or
        if the hosts did not all reach the starting gate and no other
        exception was seen while waiting.

    When it does not raise, the function ends in ``sys.exit`` with ``OK``
    if results were output and ``NOTOK`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the parameters into the network sync directory, the same
    # directory that is passed to every remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # $PYPY, when set, names the interpreter used in the remote command;
    # otherwise choose by the major version of the running interpreter

    pypy_prog = os.getenv('PYPY')
    if pypy_prog:
        python_prog = pypy_prog
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of threads to invoke in parallel, one per host

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j, remote_host in enumerate(prm_host_set):
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, master_invoke.network_dir)

        # with permute_host_dirs, host j is told to act as host j+1
        # (wrapping around at the end of the list)

        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd += ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass resumes the scan from the last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    h = None  # host whose ready file was checked most recently
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of the launch threads has died, no reason to continue;
            # report the host that thread was launched for

            dead_thread = next(
                (t for t in remote_thread_list if not t.is_alive()), None)
            if dead_thread is not None:
                print('thread %s on host %s has died'
                      % (dead_thread, dead_thread.remote_host))
                break

            # be patient for large tests: the polling interval grows by
            # one second on every pass

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # never open the starting gate after the user asked to stop
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if h is not None:
            print('ERROR: host %s did not reach starting gate' % h)
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        # use the saved exception: the name bound by "except ... as e"
        # no longer exists here on Python 3
        print('saw exception %s, aborting test' % str(exception_seen))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE(review): pickle.load can execute arbitrary code;
                # this assumes the shared directory holds trusted files
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped;
                # any other I/O error ends the aggregation
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Exemplo n.º 3
0
def run_multi_host_workload(prm, log):
    """Run the workload on every host in ``prm.host_set`` and output results.

    One thread per host is started (``launcher_thread`` when
    ``prm.launch_as_daemon`` is set, ``ssh_thread`` otherwise), each running
    ``fs-drift-remote.py``.  The function then polls for every host's
    "ready" file, writes the starting-gun file once all are present, joins
    the threads, reads each host's result pickle and passes the combined
    list to ``output_results.output_results``.

    :param prm: test parameters; this function reads ``host_set``,
        ``fsd_remote_dir``, ``python_prog``, ``network_shared_path``,
        ``launch_as_daemon``, ``abort_path`` and ``starting_gun_path``.
    :param log: logger used for all debug/info/error output.
    :returns: ``OK`` if results were output; ``NOTOK`` if the hosts did not
        all become ready in time or if aggregating the results failed.
    :raises Exception: if the running Python version is not recognized;
        any exception seen while waiting for the hosts (including
        ``KeyboardInterrupt`` and the ``FsDriftException`` raised when the
        abort file appears) is re-raised after the test is aborted.
    :raises IOError: if the starting-gun file cannot be written.
    """

    # NOTE(review): the interpreter chosen here is only logged; the remote
    # command line below is built from prm.python_prog instead

    pypy_prog = os.getenv('PYPY')
    if pypy_prog:
        python_prog = pypy_prog
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    log.debug('python_prog = %s' % python_prog)

    # construct list of threads to invoke in parallel, one per host

    remote_thread_list = []
    for remote_host in prm.host_set:
        fsd_remote_pgm = os.path.join(prm.fsd_remote_dir, 'fs-drift-remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (prm.python_prog, fsd_remote_pgm, prm.network_shared_path)

        this_remote_cmd += ' --as-host %s' % remote_host
        log.debug(this_remote_cmd)
        if prm.launch_as_daemon:
            remote_thread = launcher_thread.launcher_thread(
                prm, log, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(
                log, remote_host, this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them, pacing starts so that we don't get ssh errors;
    # the pause is only needed when ssh is what launches the remote program

    for t in remote_thread_list:
        if not prm.launch_as_daemon:
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass resumes the scan from the last host file not found

    exception_seen = None
    abortfn = prm.abort_path
    sec_delta = 0.5
    # timeout if no host replies in next per_host_timeout seconds,
    # or if the whole wait exceeds all_host_timeout seconds
    per_host_timeout = 10.0
    all_host_timeout = 5.0 + len(prm.host_set) / 3
    if all_host_timeout < per_host_timeout:
        per_host_timeout = all_host_timeout / 2

    hosts_ready = False  # set scope outside while loop
    last_host_seen = -1
    sec = 0.0
    start_loop_start = time.time()
    try:
        while sec < per_host_timeout:
            # HACK to force directory entry coherency for Gluster
            #ndirlist = os.listdir(prm.network_shared_path)
            #log.debug('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise FsDriftException('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm.host_set)):
                h = prm.host_set[j]
                fn = multi_thread_workload.gen_host_ready_fname(prm, h.strip())
                log.debug('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    log.info('did not see host filename %s after %f sec' %
                             (fn, sec))
                    hosts_ready = False
                    break
                log.debug('saw host filename ' + fn)
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in per_host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of the launch threads has died, no reason to continue

            dead_thread = next(
                (t for t in remote_thread_list if not t.is_alive()), None)
            if dead_thread is not None:
                log.error('thread %s has died' % dead_thread)
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            time_since_loop_start = time.time() - start_loop_start
            log.debug('last_host_seen=%d sec=%d' % (last_host_seen, sec))
            if time_since_loop_start > all_host_timeout:
                break
    except KeyboardInterrupt as e:
        log.error('saw SIGINT signal, aborting test')
        exception_seen = e
        # never fire the starting gun after the user asked to stop
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        log.exception(e)
        hosts_ready = False

    if not hosts_ready:
        multi_thread_workload.abort_test(prm.abort_path, remote_thread_list)
        if exception_seen is None:
            log.info(
                'no additional hosts reached starting gate within %5.1f seconds'
                % per_host_timeout)
            return NOTOK
        raise exception_seen

    # ask all hosts to start the test
    # this is like firing the gun at the track meet

    try:
        sync_files.write_sync_file(prm.starting_gun_path, 'hi')
        log.debug('starting all threads by creating starting gun file %s' %
                  prm.starting_gun_path)
    except IOError as e:
        log.error('error writing starting gun file: %s' %
                  os.strerror(e.errno))
        multi_thread_workload.abort_test(prm.abort_path,
                                         remote_thread_list)
        raise

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            log.error('ssh thread for host %s completed with status %d' %
                      (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # with counters and times that we need

    # bound up front so the IOError handler below can always format them,
    # even when the host set is empty
    h = None
    pickle_fn = None
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm.host_set:  # for each host in test

            # read results for each thread run in that host
            # from the python pickle written for that host

            pickle_fn = multi_thread_workload.host_result_filename(prm, h)
            log.debug('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                host_invoke_list = read_pickle(pickle_fn)
                log.debug(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is logged and skipped;
                # any other I/O error ends the aggregation
                if e.errno != errno.ENOENT:
                    raise
                log.error('  pickle file %s not found' % pickle_fn)

        output_results.output_results(prm, invoke_list)
    except IOError as e:
        log.exception(e)
        log.error('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        return NOTOK
    except KeyboardInterrupt:
        log.error('control-C signal seen (SIGINT)')
        return NOTOK
    except FsDriftException as e:
        log.exception(e)
        return NOTOK
    return OK
Exemplo n.º 4
0
def run_multi_host_workload(prm):
    """Run the workload on every host in ``prm.host_set`` and print results.

    Steps, in order:

    1. create the top-level directories and write ``prm`` to
       ``param.pickle`` inside ``prm.master_invoke.network_dir``;
    2. build and start one thread per host (``launcher_thread`` on Windows,
       ``ssh_thread`` otherwise), each running ``smallfile_remote.py``;
    3. poll for every host's "ready" file until all exist, the abort file
       appears, or ``prm.host_startup_timeout`` seconds (+20 on Windows)
       of polling have elapsed;
    4. write the starting-gate file to ask all hosts to start the test;
    5. join the threads, read each host's result pickle and pass the
       combined list to ``output_results.output_results``.

    :param prm: test parameters; this function reads ``host_set``,
        ``permute_host_dirs``, ``master_invoke``, ``remote_pgm_dir``,
        ``host_startup_timeout`` and ``thread_count`` from it.
    :raises Exception: if the running Python version is not recognized, or
        if the hosts did not all reach the starting gate and no other
        exception was seen while waiting.

    When it does not raise, the function ends in ``sys.exit`` with ``OK``
    if every thread reported ``OK`` and results were aggregated, and
    ``NOTOK`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the parameters into the network sync directory, the same
    # directory that is passed to every remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # $PYPY, when set, names the interpreter used in the remote command;
    # otherwise choose by the major version of the running interpreter

    pypy_prog = os.getenv('PYPY')
    if pypy_prog:
        python_prog = pypy_prog
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of threads to invoke in parallel, one per host

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j, remote_host in enumerate(prm_host_set):
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, master_invoke.network_dir)

        # with permute_host_dirs, host j is told to act as host j+1
        # (wrapping around at the end of the list)

        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd += ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them (all at once; no pause is inserted between starts)

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass resumes the scan from the last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        # FIXME: make timeout criteria be that no new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests: the polling interval grows by
            # one second on every pass

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # never open the starting gate after the user asked to stop
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        # use the saved exception: the name bound by "except ... as e"
        # no longer exists here on Python 3
        print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    h = None  # bound up front so the IOError handler can always format it
    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                # give a not-yet-visible result file a moment to appear
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                # NOTE(review): pickle.load can execute arbitrary code;
                # this assumes the shared directory holds trusted files
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped;
                # any other I/O error ends the aggregation
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        # NOTE(review): pct_files_min is not assigned in this function;
        # it has to be provided at module level
        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt:
        print('control-C signal seen (SIGINT)')
        all_ok = False

    sys.exit(OK if all_ok else NOTOK)
Exemplo n.º 5
0
def run_multi_host_workload(prm):
    """Run the workload on every host in ``prm.host_set`` and print results.

    Steps, in order:

    1. create the top-level directories and write ``prm`` to
       ``param.pickle`` inside ``prm.master_invoke.network_dir``;
    2. build and start one thread per host (``launcher_thread`` on Windows,
       ``ssh_thread`` otherwise), each running ``smallfile_remote.py``;
    3. poll for every host's "ready" file until all exist, the abort file
       appears, or ``prm.host_startup_timeout`` seconds (+20 on Windows)
       of polling have elapsed;
    4. write the starting-gate file to ask all hosts to start the test;
    5. join the threads, read each host's result pickle and pass the
       combined list to ``output_results.output_results``.

    :param prm: test parameters; this function reads ``host_set``,
        ``permute_host_dirs``, ``master_invoke``, ``remote_pgm_dir``,
        ``host_startup_timeout`` and ``thread_count`` from it.
    :raises Exception: if the running Python version is not recognized, or
        if the hosts did not all reach the starting gate and no other
        exception was seen while waiting.

    When it does not raise, the function ends in ``sys.exit`` with ``OK``
    if every thread reported ``OK`` and results were aggregated, and
    ``NOTOK`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the parameters into the network sync directory, the same
    # directory that is passed to every remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # $PYPY, when set, names the interpreter used in the remote command;
    # otherwise choose by the major version of the running interpreter

    pypy_prog = os.getenv('PYPY')
    if pypy_prog:
        python_prog = pypy_prog
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of threads to invoke in parallel, one per host

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j, remote_host in enumerate(prm_host_set):
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, master_invoke.network_dir)

        # with permute_host_dirs, host j is told to act as host j+1
        # (wrapping around at the end of the list)

        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd += ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them (all at once; no pause is inserted between starts)

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass resumes the scan from the last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        # FIXME: make timeout criteria be that no new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests: the polling interval grows by
            # one second on every pass

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # never open the starting gate after the user asked to stop
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        # use the saved exception: the name bound by "except ... as e"
        # no longer exists here on Python 3
        print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    h = None  # bound up front so the IOError handler can always format it
    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                # give a not-yet-visible result file a moment to appear
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                # NOTE(review): pickle.load can execute arbitrary code;
                # this assumes the shared directory holds trusted files
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped;
                # any other I/O error ends the aggregation
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        # NOTE(review): pct_files_min is not assigned in this function;
        # it has to be provided at module level
        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt:
        print('control-C signal seen (SIGINT)')
        all_ok = False

    sys.exit(OK if all_ok else NOTOK)
Exemplo n.º 6
0
# Multi-host launch path: taken only when a host set was given and this
# process is not itself a slave.  (Python 2 code: uses the print statement.)
if prm_host_set and not prm_slave:

  # construct list of ssh threads to invoke in parallel

  ssh_thread_list = []
  # remove any starting gate file before the threads are created
  smallfile.ensure_deleted(starting_gate)
  host_ct = len(prm_host_set)
  for j in range(0, len(prm_host_set)):
        n = prm_host_set[j]
        # NOTE(review): remote_cmd is extended in place on every iteration,
        # so the command built for a later host still carries the flags
        # appended for the earlier hosts -- confirm this is intended
        if prm_permute_host_dirs:
          # host j is told to act as host j+1, wrapping around at the end
          remote_cmd += ' --as-host %s'%prm_host_set[(j+1)%host_ct]
        remote_cmd += ' --slave Y '
        if verbose: print remote_cmd
        # delete this host's result pickle file before launching
        pickle_fn = gen_host_result_filename(top_dir, short_hostname(n))
        smallfile.ensure_deleted(pickle_fn)
        ssh_thread_list.append(ssh_thread.ssh_thread(n, remote_cmd))
  time.sleep(2) # give other clients time to see changes

  # start them, pacing starts so that we don't get ssh errors

  for t in ssh_thread_list:
        t.start()
        time.sleep(0.1)

  # wait for them to finish

  # all_ok becomes False as soon as any thread reports a status other than OK
  all_ok = True
  for t in ssh_thread_list:
        t.join()
        if t.status != OK: 
          all_ok = False