def __init__(self, redis_host, redis_port, redis_db, config, themis_binary,
                 log_directory, keepalive_refresh,
                 keepalive_timeout, profiler, profiler_options, ld_preload,
                 interfaces):
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_db = redis_db
        self.config_file = config
        with open(config, 'r') as fp:
            self.config = yaml.load(fp, Loader=yaml.SafeLoader)
        self.themis_binary = themis_binary
        self.log_directory = log_directory
        self.keepalive_refresh = keepalive_refresh
        self.keepalive_timeout = keepalive_timeout
        self.profiler = profiler
        self.profiler_options = profiler_options
        self.ld_preload = ld_preload
        self.batch_nonce = random.randint(0, 1000000000)
        self.batch_phase_info = {}
        self.interfaces = interfaces

        self.node_coordinator_log_dir = os.path.join(
            log_directory, "node_coordinators")

        self.coordinator_db = redis_utils.CoordinatorDB(
            redis_host, redis_port, redis_db)

        self.ssh_command = utils.ssh_command()
def parallel_ssh(hosts,
                 script_path,
                 script_options,
                 verbose,
                 print_stdout=False):
    # script_options is either a single string, or a list of strings, one per host
    if not isinstance(script_options, str):
        # Must be a list of same length as hosts
        if not isinstance(script_options, list):
            sys.exit(
                "Script options must either be a string or a list of options "
                "one per host. Got %s" % script_options)
        if len(script_options) != len(hosts):
            sys.exit(
                "Script options list must be the same length as hosts (%d) but "
                "got %s" % (len(hosts), script_options))

    # Launch all ssh commands in parallel.
    print "Running %s in parallel on %d hosts..." % (script_path, len(hosts))

    pending_commands = []
    for host_ID, host in enumerate(hosts):
        command_template = ('%(ssh)s %(host)s "%(script_path)s '
                            '%(script_options)s"')

        options_string = script_options
        if isinstance(options_string, list):
            # Pick the option for this host
            options_string = options_string[host_ID]

        command = command_template % {
            "ssh": utils.ssh_command(),
            "host": host,
            "script_path": script_path,
            "script_options": options_string,
        }

        command_object = subprocess.Popen(command,
                                          shell=True,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT)

        pending_commands.append((command_object, command, host))

    # Wait for each one to complete.
    failed_hosts = []
    for (command_object, command_string, host) in pending_commands:
        (stdout, stderr) = command_object.communicate()

        if verbose:
            print "%s:%s" % (host, command_string)
        if command_object.returncode != 0:
            print "  FAILURE - returned %d:" % command_object.returncode
            print "  stdout:"
            if stdout is not None:
                for line in stdout.split('\n'):
                    print "    %s" % line
            print "  stderr:"
            if stderr is not None:
                for line in stderr.split('\n'):
                    print "    %s" % line
            failed_hosts.append(host)
        else:
            if verbose:
                print "  SUCCESS!"
            if print_stdout:
                print "  stdout:"
                if stdout is not None:
                    for line in stdout.split('\n'):
                        print "    %s" % line

    if len(failed_hosts) > 0:
        print ""
        print "Parallel %s failed on hosts:" % script_path
        print "  %s" % failed_hosts
        return False

    return True
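
# Usage sketch (hypothetical hosts, script paths, and options, shown only to
# illustrate the calling convention of parallel_ssh above): run one script on
# every host with a shared options string, or pass a list with one options
# string per host.
def _example_parallel_ssh():
    example_hosts = ["10.0.0.1", "10.0.0.2"]
    # Same options string for every host.
    parallel_ssh(example_hosts, "/opt/themis/scripts/cleanup.sh", "", True)
    # One options string per host, printing stdout on success.
    parallel_ssh(
        example_hosts, "/opt/themis/scripts/format_disk.sh",
        ["/dev/sdb1", "/dev/sdc1"], True, print_stdout=True)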
def run_benchmark(
    binary,
    config,
    batch_directory,
    phase_directory,
    profiler,
    profiler_options,
    peer_list,
    node_list,
    per_peer_config,
    dump_core_directory,
    solo_mode,
    vnstat_interface,
    params,
):

    # Get ssh username.
    username = read_conf_file("cluster.conf", "cluster", "username")

    # Add command line parameters to binary
    binary = "%s -LOG_DIR %s" % (binary, phase_directory)

    if dump_core_directory is not None:
        binary = "cd %s; ulimit -c unlimited; %s" % (dump_core_directory, binary)

    processes = []
    start_time = time.time()
    for index, ip in enumerate(node_list):
        # Now start themis binaries
        if solo_mode:
            # Act as if you are the only peer in the cluster.
            peer_binary = "%s -PEER_LIST %s" % (binary, ip)
        else:
            # Use the entire set of peers and designate yourself as
            # one of them.
            peer_binary = "%s -PEER_LIST %s -MYPEERID %d" % (binary, peer_list, index)

        if per_peer_config:
            # Append the IP address to the config file name
            peer_binary = "%s -CONFIG %s.%s" % (peer_binary, config, ip)
        else:
            peer_binary = "%s -CONFIG %s" % (peer_binary, config)

        # Override config file with specified parameters
        if params:
            peer_binary = "%s %s" % (peer_binary, params)

        if profiler == "operf":
            # Create a per-node operf session directory under the batch
            # directory.
            session_dir = os.path.join(batch_directory, "oprofile_data.%s" % ip)
            parallel_ssh(None, "mkdir -p %s" % session_dir, username, node_list, False, True, False)
            peer_binary = "%s %s --session-dir=%s %s" % (profiler, profiler_options, session_dir, peer_binary)
        elif profiler is not None:
            # Some other profiler, just prepend it to the binary
            peer_binary = "%s %s %s" % (profiler, profiler_options, peer_binary)

        # Run the node-local benchmark script.
        vnstat_param_string = ""
        if vnstat_interface is not None:
            vnstat_param_string = "--vnstat_interface %s" % vnstat_interface
        command = "%s %s \"%s/run_benchmark_local.py %s %s '%s'\"" % (
            ssh_command(),
            ip,
            THEMIS_SCRIPTS_DIR,
            vnstat_param_string,
            phase_directory,
            peer_binary,
        )

        processes.append((subprocess.Popen(command, shell=True), ip))

    print "%d tasks launched on %s\n" % (len(processes), time.asctime())

    elapsed_times = []
    completed_ips = []

    num_nodes = len(processes)

    while len(processes) > 0:
        for process, ip in processes:
            process.poll()
            if process.returncode is not None:
                elapsed_time = time.time() - start_time
                process.communicate()
                processes.remove((process, ip))
                elapsed_times.append(elapsed_time)
                completed_ips.append(ip)
                print "Node %s completed in %.2f seconds (%d / %d)" % (ip, elapsed_time, len(elapsed_times), num_nodes)

                # Restart the scan, since the list was modified while
                # iterating over it.
                break

    stop_time = time.time()

    return (stop_time - start_time, elapsed_times, completed_ips)
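
# Usage sketch (every path, host, and flag below is a hypothetical
# placeholder, and PEER_LIST is assumed to be a comma-separated list of node
# IPs): launch the benchmark binary on two nodes with no profiler, no core
# dumps, no vnstat interface, and no extra config overrides.
def _example_run_benchmark():
    nodes = ["10.0.0.1", "10.0.0.2"]
    return run_benchmark(
        "/opt/themis/bin/benchmark", "/opt/themis/config.yaml",
        "/var/log/themis/batch_0", "/var/log/themis/batch_0/phase_0",
        None, None, ",".join(nodes), nodes, False, None, False, None, "")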
def gather_local_file_paths(
    coordinator_db, input_dir, max_input_files_per_disk):
    ssh_command = utils.ssh_command()

    hosts = coordinator_db.live_nodes

    ssh_command_template = string.Template(
        "%(ssh_command)s ${host} '%(script_path)s/list_local_files.py ${disks}'"
        % {
            "ssh_command" : ssh_command,
            "script_path" : os.path.abspath(os.path.dirname(__file__))
            })

    pending_commands = []

    for host in hosts:
        disks = list(coordinator_db.io_disks(host))
        disks.sort()

        local_dirs = map(lambda x: "%s/%s" % (x, input_dir), disks)

        cmd = ssh_command_template.substitute(
            host=host,
            disks = ' '.join(local_dirs))

        pending_commands.append(
            (host, cmd,
             subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)))

    worker_inputs = {}
    total_input_size = 0

    for hostname, cmd, command in pending_commands:
        worker_inputs[hostname] = {}

        stdout, stderr = command.communicate()

        if command.returncode != 0:
            log.error("Command '%s' failed with error %d" % (
                    cmd, command.returncode))
            return None

        file_paths = json.loads(stdout)

        if file_paths is None:
            log.error(("Input directory '%s' doesn't exist on all of host %s's "
                       "input disks") % (input_dir, hostname))
            return None

        for i, file_list in enumerate(file_paths):
            worker_inputs[hostname][i] = []

            num_files = 0
            for filename, file_length in file_list:
                # Allow the user to cap the number of input files for testing.
                if max_input_files_per_disk is None or \
                        num_files < max_input_files_per_disk:
                    file_url = "local://%s%s" % (hostname, filename)
                    worker_inputs[hostname][i].append((file_url, file_length))
                    total_input_size += file_length
                    num_files += 1

    return (worker_inputs, total_input_size)
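
# Usage sketch (the Redis location and input directory name are hypothetical):
# enumerate input files across the live nodes recorded in the coordinator
# database, without capping the number of files per disk.
def _example_gather_local_file_paths():
    coordinator_db = redis_utils.CoordinatorDB("localhost", 6379, 0)
    result = gather_local_file_paths(coordinator_db, "input_data", None)
    if result is not None:
        worker_inputs, total_input_size = result
        return worker_inputs, total_input_size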