def disk_benchmark(
    input_directory, output_directory, benchmark_size_per_disk, **kwargs):

    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "disk_speeds")


    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    utils.force_single_partition(config)

    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    config_params = {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes}

    if "params" not in config:
        config["params"] = {}

    for key, value in config_params.items():
        config["params"][key] = value

    return config
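
A minimal usage sketch (hypothetical paths and size string; it assumes the surrounding project's utils and uc helpers are importable and behave as they are used above):

config = disk_benchmark(
    input_directory="/mnt/disks/disk_0/benchmark_input",
    output_directory=None,           # defaults to a sibling "disk_speeds" directory
    benchmark_size_per_disk="1 GB")  # parsed to bytes by uc.parse_and_convert(..., "B")
print config["params"]["DISK_BENCHMARK_DATA_SIZE"]
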
def generate_graysort_inputs(
    redis_host, redis_port, redis_db, total_data_size, method, debug,
    gensort_command, username, no_sudo, skew, graysort_compatibility_mode,
    num_files_per_disk, multiple, replica_number, parallelism,
    intermediate_disks):

    disk_list_key = "node_io_disks"
    if intermediate_disks:
        # Write to intermediate disks instead.
        disk_list_key = "node_local_disks"

    # Connect to Redis.
    redis_client = redis.StrictRedis(
        host=redis_host, port=redis_port, db=redis_db)

    # Get list of hosts from redis.
    hosts = list(redis_client.smembers("nodes"))
    hosts.sort()

    # Generate a mapping of hosts -> io disks
    io_disk_map = {}
    first_partition_map = {}
    num_partitions = 0
    for host in hosts:
        disks = list(redis_client.smembers("%s:%s" % (disk_list_key, host)))
        disks.sort()
        disk_list = []
        for disk in disks:
            for f in xrange(num_files_per_disk):
                disk_list.append(disk)
        io_disk_map[host] = disk_list
        first_partition_map[host] = num_partitions
        num_partitions += len(disk_list)
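
    # first_partition_map[host] is the global index of the first partition
    # file owned by that host; e.g. with two hosts exposing two disks each and
    # num_files_per_disk = 2, the second host's partitions start at index 4.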

    local_fqdn = socket.getfqdn()
    if replica_number > 1:
        # Act as if we are a different host for the purposes of generating
        # replica files. Sort the host names so every node walks the same
        # host order when picking its replica target.
        hosts = sorted(io_disk_map.keys())
        host_index = hosts.index(local_fqdn)
        host_index = (host_index + (replica_number - 1)) % len(io_disk_map)
        local_fqdn = hosts[host_index]

    # Get list of input disks for this local machine.
    input_disks = io_disk_map[local_fqdn]
    first_partition = first_partition_map[local_fqdn]

    job_name = "Graysort"
    if not graysort_compatibility_mode:
        job_name = "Graysort-MapReduceHeaders"
    if replica_number > 1:
        job_name += "_replica"
    input_directory = os.path.join(username, "inputs", job_name)

    # Compute input file names.
    input_file_relative_paths = map(
        lambda x: os.path.join(
            input_directory, "%08d.partition" % (first_partition + x)),
        xrange(len(input_disks)))
    input_files = [os.path.abspath(os.path.join(disk, relative_path))
                   for disk, relative_path
                   in zip(input_disks, input_file_relative_paths)]

    # Find out if data already exists.
    existing_files = filter(os.path.exists, input_files)
    if len(existing_files) == len(input_files):
        print >>sys.stderr, "Data already exists"
        sys.exit(0)

    # Data needs to be generated. Delete any existing files.
    for input_file in existing_files:
        os.remove(input_file)

    # Convert the human-readable data size to records.
    unit = "R"
    data_size = unitconversion.parse_and_convert(total_data_size, unit)

    # Compute record assignments to disks in the cluster.
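    # generate_data_assignment() is defined elsewhere; judging from how its
    # result is indexed below, it returns a dict mapping each host to a list
    # of (record offset, record count) pairs, one per entry in
    # io_disk_map[host].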
    assignments = generate_data_assignment(io_disk_map, data_size, multiple)
    local_assignments = assignments[local_fqdn]

    # Finally we're ready to create input files on each disk.
    gensort_commands = []
    for disk_index, input_file in enumerate(input_files):
        # Manually create the input directory on this disk.
        directory = os.path.abspath(os.path.join(input_file, os.pardir))
        if no_sudo:
            cmd = mkdir["-p", directory]
        else:
            cmd = sudo[mkdir["-p", directory]]

        if debug:
            print cmd
        else:
            cmd()

        # Change ownership of input directory from root to USER
        if not no_sudo:
            cmd = sudo[chown["-R", username, os.path.dirname(directory)]]

            if debug:
                print cmd
            else:
                cmd()

        # Prepare gensort command.
        (disk_data_offset, disk_data_size) = local_assignments[disk_index]
        command_options = {}
        command_args = []

        destination_filename = input_file

        # Set skew and MapReduce mode options
        if skew:
            command_args.append("-s")

        if not graysort_compatibility_mode:
            command_args.append("-m")

        command_args.extend([
                "-b%d" % (disk_data_offset), str(disk_data_size),
                destination_filename])

        options_str = ' '.join(('%s %s' % (k, v)
                                for k, v in command_options.items()))

        args_str = ' '.join(command_args)

        command = "%s %s %s" % (gensort_command, options_str, args_str)

        # Finally start the gensort process for this disk.
        if debug:
            print command
        else:
            command = shlex.split(command)
            gensort_commands.append(command)

    # Generate files in random order to spread the load more evenly across
    # disks.
    random.shuffle(gensort_commands)
    status = 0
    while len(gensort_commands) > 0:
        running_commands = []
        gensort_processes = []
        for command in gensort_commands:
            print "Running '%s'" % (command)
            gensort_processes.append(subprocess.Popen(
                    command, stderr=subprocess.PIPE,
                    stdout=subprocess.PIPE))

            running_commands.append(command)
            if parallelism > 0 and len(running_commands) >= parallelism:
                # Don't launch more parallel gensort processes
                break

        for command in running_commands:
            gensort_commands.remove(command)

        # Wait for all gensort processes to finish.
        for process in gensort_processes:
            (stdout_data, stderr_data) = process.communicate()
            print "gensort instance completed with return code %d" % (
                process.returncode)

            if process.returncode != 0:
                print stderr_data
                status = 1

    return status
def generate_graysort_inputs(redis_host, redis_port, redis_db, total_data_size,
                             method, debug, pareto_a, pareto_b, max_key_len,
                             max_val_len, min_key_len, min_val_len,
                             large_tuples, hdfs_namenode, hdfs_replication,
                             gensort_command, username, no_sudo, transfer_size,
                             skew, graysort_compatibility_mode,
                             num_files_per_disk, multiple, replica_number,
                             parallelism, intermediate_disks):

    disk_list_key = "node_io_disks"
    if intermediate_disks:
        # Write to intermediate disks instead.
        disk_list_key = "node_local_disks"

    # Connect to Redis.
    redis_client = redis.StrictRedis(host=redis_host,
                                     port=redis_port,
                                     db=redis_db)

    # Get list of hosts from redis.
    hosts = list(redis_client.smembers("nodes"))
    hosts.sort()

    # Generate a mapping of hosts -> io disks
    io_disk_map = {}
    first_partition_map = {}
    num_partitions = 0
    for host in hosts:
        disks = list(redis_client.smembers("%s:%s" % (disk_list_key, host)))
        disks.sort()
        disk_list = []
        for disk in disks:
            for f in xrange(num_files_per_disk):
                disk_list.append(disk)
        io_disk_map[host] = disk_list
        first_partition_map[host] = num_partitions
        num_partitions += len(disk_list)

    local_fqdn = socket.getfqdn()
    if replica_number > 1:
        # Act as if we are a different host for the purposes of generating
        # replica files. Sort the host names so every node walks the same
        # host order when picking its replica target.
        hosts = sorted(io_disk_map.keys())
        host_index = hosts.index(local_fqdn)
        host_index = (host_index + (replica_number - 1)) % len(io_disk_map)
        local_fqdn = hosts[host_index]

    # Get list of input disks for this local machine.
    input_disks = io_disk_map[local_fqdn]
    first_partition = first_partition_map[local_fqdn]

    job_name = "Graysort"
    if not graysort_compatibility_mode:
        job_name = "Graysort-MapReduceHeaders"
    if replica_number > 1:
        job_name += "_replica"
    input_directory = os.path.join(username, "inputs", job_name)

    # Compute input file names.
    input_file_relative_paths = map(
        lambda x: os.path.join(input_directory, "%08d.partition" %
                               (first_partition + x)),
        xrange(len(input_disks)))
    input_files = [
        os.path.abspath(os.path.join(disk, relative_path))
        for disk, relative_path in zip(input_disks, input_file_relative_paths)
    ]

    # Find out if data already exists.
    existing_files = filter(os.path.exists, input_files)
    if len(existing_files) == len(input_files):
        print >> sys.stderr, "Data already exists"
        sys.exit(0)

    # Data needs to be generated. Delete any existing files.
    for input_file in existing_files:
        os.remove(input_file)

    # Convert human-readable data size to records or bytes.
    unit = "R"
    if method == "pareto":
        # Pareto uses bytes since records are variably sized.
        unit = "B"
    data_size = unitconversion.parse_and_convert(total_data_size, unit)

    # Compute record assignments to disks in the cluster.
    assignments = generate_data_assignment(io_disk_map, data_size, multiple)
    local_assignments = assignments[local_fqdn]

    # If large tuples were specified, assign them round robin across disks.
    if large_tuples is not None:
        large_tuple_assignments = {}
        # Convert comma-delimited list into a list of triples.
        large_tuples = large_tuples.split(",")
        large_tuples = zip(large_tuples[0::3], large_tuples[1::3],
                           large_tuples[2::3])
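        # e.g. "1,2,3,4,5,6" becomes [("1", "2", "3"), ("4", "5", "6")]; any
        # trailing values that do not form a complete triple are silently
        # dropped by zip().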
        # Round robin the triples across disks.
        for index, large_tuple in enumerate(large_tuples):
            disk_index = index % len(input_disks)
            if disk_index not in large_tuple_assignments:
                large_tuple_assignments[disk_index] = list(large_tuple)
            else:
                large_tuple_assignments[disk_index].extend(list(large_tuple))

    # Finally we're ready to create input files on each disk.
    gensort_commands = []
    for disk_index, input_file in enumerate(input_files):
        # Unless we're using HDFS, we need to manually create directories.
        if method != "gensort_hdfs":
            # Create input directory
            directory = os.path.abspath(os.path.join(input_file, os.pardir))
            if no_sudo:
                cmd = mkdir["-p", directory]
            else:
                cmd = sudo[mkdir["-p", directory]]

            if debug:
                print cmd
            else:
                cmd()

            # Change ownership of input directory from root to USER
            if not no_sudo:
                cmd = sudo[chown["-R", username, os.path.dirname(directory)]]

                if debug:
                    print cmd
                else:
                    cmd()

        # Prepare gensort command.
        (disk_data_offset, disk_data_size) = local_assignments[disk_index]
        command_options = {}
        command_args = []

        if method == "pareto":
            # Set all pareto distribution options in an options string.
            command_options["-pareto_a"] = "%f" % (pareto_a)
            command_options["-pareto_b"] = "%f" % (pareto_b)
            command_options["-maxKeyLen"] = "%d" % (max_key_len)
            command_options["-maxValLen"] = "%d" % (max_val_len)
            command_options["-minKeyLen"] = "%d" % (min_key_len)
            command_options["-minValLen"] = "%d" % (min_val_len)

            # If large tuples were specified, add them to the options string.
            if (large_tuples is not None
                    and disk_index in large_tuple_assignments
                    and len(large_tuple_assignments[disk_index]) > 0):

                command_options["-largeTuples"] = ','.join(
                    map(str, large_tuple_assignments[disk_index]))

            command_args = [input_file, "pareto", str(int(disk_data_size))]
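            # In pareto mode the final command therefore has the form
            # "<gensort_command> <pareto options> <input_file> pareto <num_bytes>".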

        else:
            # The other methods use some form of gensort, but the filename
            # depends on whether we're writing to HDFS or not.

            if method == "gensort_hdfs":
                # Get local IP address from redis so we can pass it to HDFS.
                ip_address = redis_client.hget("ipv4_address", local_fqdn)

                destination_filename = "http://%s/webhdfs/v1/%s/%d/%s" % (
                    hdfs_namenode, ip_address, disk_index,
                    input_file_relative_paths[disk_index])
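                # e.g. "http://<namenode>/webhdfs/v1/<ip address>/<disk index>"
                #      "/<username>/inputs/Graysort/00000007.partition"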

                command_args.append("-r%d" % (hdfs_replication))
            else:
                destination_filename = input_file

                if method == "gensort_2013":
                    if transfer_size is not None:
                        destination_filename += ",trans=%s" % transfer_size

                    if skew:
                        command_args.append("-s")

                    if not graysort_compatibility_mode:
                        command_args.append("-m")

            # Not including offset or replication as options because of
            # gensort's ridiculously bad command parsing
            command_args.extend([
                "-b%d" % (disk_data_offset),
                str(disk_data_size), destination_filename
            ])

        options_str = ' '.join(
            ('%s %s' % (k, v) for k, v in command_options.items()))

        args_str = ' '.join(command_args)

        command = "%s %s %s" % (gensort_command, options_str, args_str)

        # Finally start the gensort process for this disk.
        if debug:
            print command
        else:
            command = shlex.split(command)
            gensort_commands.append(command)

    # Generate files in random order to spread the load more evenly across
    # disks.
    random.shuffle(gensort_commands)
    status = 0
    while len(gensort_commands) > 0:
        running_commands = []
        gensort_processes = []
        for command in gensort_commands:
            print "Running '%s'" % (command)
            gensort_processes.append(
                subprocess.Popen(command,
                                 stderr=subprocess.PIPE,
                                 stdout=subprocess.PIPE))

            running_commands.append(command)
            if parallelism > 0 and len(running_commands) >= parallelism:
                # Don't launch more parallel gensort processes
                break

        for command in running_commands:
            gensort_commands.remove(command)

        # Wait for all gensort processes to finish.
        for process in gensort_processes:
            (stdout_data, stderr_data) = process.communicate()
            print "gensort instance completed with return code %d" % (
                process.returncode)

            if process.returncode != 0:
                print stderr_data
                status = 1

    return status