Example #1
# Assumed imports for this snippet; num_machines, cores_per_machine, and
# items_per_partition_values are defined earlier in the full script.
import subprocess
import time

import utils

# Start timing the whole sweep (used at the end of the snippet).
start = time.time()

base_num_map_tasks = num_machines * cores_per_machine
base_num_reduce_tasks = num_machines * cores_per_machine
num_tasks_multiplier_values = [1, 2, 4, 8, 16, 32]
longs_per_value = 6
num_shuffles = 6
sortByKey = False
cacheRdd = False
slaves = [slave_line.strip("\n") for slave_line in open("/root/spark/conf/slaves").readlines()]
print "Running experiment assuming slaves %s" % slaves

for items_per_partition in items_per_partition_values:
  for num_tasks_multiplier in num_tasks_multiplier_values:
    num_reduce_tasks = num_tasks_multiplier * base_num_reduce_tasks
    num_map_tasks = num_tasks_multiplier * base_num_map_tasks
    print "*************Running experiment with {} shuffle values".format(
      items_per_partition)
    parameters = [num_map_tasks, num_reduce_tasks, items_per_partition, longs_per_value, num_shuffles, sortByKey, cacheRdd]
    stringified_parameters = ["{}".format(p) for p in parameters]
    command = "/root/spark/bin/run-example monotasks.ShuffleJob {}".format(
      " ".join(stringified_parameters))
    print command
    subprocess.check_call(command, shell=True)

    utils.copy_and_zip_all_logs(stringified_parameters, slaves)

end = time.time()

completion = end - start

print "Shuffle script took %s seconds" % completion
Example #2
# Assumed imports; total_rows_values, num_row_blocks_values, MASTER_HOSTNAME,
# and workers are defined earlier in the full script.
import subprocess

import utils

cols_per_block = 4096
# This is the number of times the shuffle stage will happen.
# To use less memory, reduce this number, and instead increase
# num_repeats (there's one RDD stored in memory for each shuffle
# block).
num_col_blocks = 5
# The number of times to repeat the whole computation.
num_repeats = 1
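# Following the comment above, a lower-memory sweep (illustrative values, not
# from the original script) could use num_col_blocks = 1 with a larger num_repeats.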

for total_rows in total_rows_values:
    for num_row_blocks in num_row_blocks_values:
        rows_per_block = int(total_rows / num_row_blocks)
        parameters = [
            rows_per_block, num_row_blocks, cols_per_block, num_col_blocks,
            num_repeats
        ]
        stringified_parameters = [str(p) for p in parameters]
        master_url = "spark://{}:7077".format(MASTER_HOSTNAME)
        command = (
            "/root/spark/bin/spark-submit "
            "--class edu.berkeley.cs.amplab.mlmatrix.BlockCoordinateDescent "
            "--driver-memory 20G "
            "--driver-class-path /root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar "
            "/root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar "
            "{} {}".format(master_url, " ".join(stringified_parameters)))
        print "Running job with command: " + command
        subprocess.check_call(command, shell=True)

        # Copy the logs back.
        utils.copy_and_zip_all_logs(stringified_parameters, workers)
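These snippets build each command as a single string and run it with shell=True. An equivalent pattern (a sketch, not taken from the original scripts, reusing the master_url and stringified_parameters values from the loop above) passes an argument list to subprocess.check_call and avoids shell quoting entirely:

import subprocess

# Sketch: the same spark-submit invocation as an argument list (no shell involved).
spark_submit_args = [
    "/root/spark/bin/spark-submit",
    "--class", "edu.berkeley.cs.amplab.mlmatrix.BlockCoordinateDescent",
    "--driver-memory", "20G",
    "--driver-class-path",
    "/root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar",
    "/root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar",
    master_url,
] + stringified_parameters
subprocess.check_call(spark_submit_args)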
Example #3
# Assumed imports; the command strings (start_all_command, stop_all_command,
# clear_slave_cache_command, run_example_command), spark_defaults_filepath,
# workers, and the experiment parameters are defined earlier in the full script.
import subprocess

import utils

for num_threads_per_disk in num_threads_per_disk_values:
    # Change the number of threads per disk by resetting the Spark config.
    change_num_threads_command = (
        "sed -i \"s/spark\.monotasks\.threadsPerDisk .*/" +
        "spark.monotasks.threadsPerDisk {}/\" {}".format(
            num_threads_per_disk, spark_defaults_filepath))
    print "Changing the number of threads per disk using command: {}".format(
        change_num_threads_command)
    subprocess.check_call(change_num_threads_command, shell=True)

    # For consistency, clear the buffer cache before each experiment.
    print "Clearing the OS buffer cache using command: {}".format(
        clear_slave_cache_command)
    subprocess.check_call(clear_slave_cache_command, shell=True)

    subprocess.check_call(start_all_command, shell=True)
    parameters = [
        num_partitions, items_per_partition, longs_per_item, num_iterations
    ]
    stringified_parameters = [str(p) for p in parameters]
    experiment_command = (
        "{} monotasks.disk.DiskThroughputExperiment {}".format(
            run_example_command, " ".join(stringified_parameters)))
    print "Running experiment using command: {}".format(experiment_command)
    subprocess.check_call(experiment_command, shell=True)

    # Stop Spark in order to finalize the logs.
    subprocess.check_call(stop_all_command, shell=True)
    utils.copy_and_zip_all_logs(
        stringified_parameters + [str(num_threads_per_disk)], workers)
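The sed command above edits the spark.monotasks.threadsPerDisk line of spark-defaults.conf in place. A pure-Python equivalent (a sketch, not part of the original script) would be:

import re


def set_threads_per_disk(spark_defaults_filepath, num_threads_per_disk):
    # Rewrite the "spark.monotasks.threadsPerDisk <n>" line, mirroring the sed call.
    with open(spark_defaults_filepath) as conf_file:
        conf = conf_file.read()
    conf = re.sub(r"spark\.monotasks\.threadsPerDisk .*",
                  "spark.monotasks.threadsPerDisk {}".format(num_threads_per_disk),
                  conf)
    with open(spark_defaults_filepath, "w") as conf_file:
        conf_file.write(conf)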
Example #4
# Assumed imports; max_concurrent_tasks_values, target_total_data_gb, values_per_key,
# num_tasks, num_shuffles, and workers are defined earlier in the full script.
import subprocess

import utils

spark_defaults_filepath = utils.get_full_path(
    relative_path="spark/conf/spark-defaults.conf")
run_example_command = utils.get_full_path(
    relative_path="spark/bin/run-example")
run_on_slaves_command = utils.get_full_path(
    relative_path="ephemeral-hdfs/sbin/slaves.sh")

for max_concurrent_tasks in max_concurrent_tasks_values:
    # Change the maximum number of concurrent tasks by resetting the Spark config.
    change_max_concurrent_tasks_command = (
        "sed -i \"s/maxConcurrentTasks .*/" +
        "maxConcurrentTasks {}/\" {}".format(max_concurrent_tasks,
                                             spark_defaults_filepath))
    print "Changing the maximum number of concurrent tasks using command: {}".format(
        change_max_concurrent_tasks_command)
    subprocess.check_call(change_max_concurrent_tasks_command, shell=True)

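    # Scale the number of items so that the total data size is roughly
    # target_total_data_gb; the constants come from the original script.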
    total_num_items = target_total_data_gb / (4.9 + values_per_key * 1.92) * (
        64 * 4000000)
    items_per_task = int(total_num_items / num_tasks)
    parameters = [
        num_tasks, num_tasks, items_per_task, values_per_key, num_shuffles
    ]
    stringified_parameters = [str(p) for p in parameters]
    experiment_command = ("{} monotasks.MemorySortJob {}".format(
        run_example_command, " ".join(stringified_parameters)))
    print "Running experiment using command: %s" % experiment_command
    subprocess.check_call(experiment_command, shell=True)

    utils.copy_and_zip_all_logs(
        stringified_parameters + [str(max_concurrent_tasks)], workers)
Example #5
# Assumed imports; slaves, num_concurrent_tasks_values, and the experiment
# parameters are defined earlier in the full script.
import subprocess

import utils

for num_concurrent_tasks in num_concurrent_tasks_values:
    # Change the number of cores that each Spark worker uses by rewriting spark-env.sh.
    change_cores_command = (
        "sed -i s/SPARK_WORKER_CORES=.*/SPARK_WORKER_CORES=" +
        "{}/ spark/conf/spark-env.sh".format(num_concurrent_tasks))
    print "Changing the number of Spark cores using command ", change_cores_command
    subprocess.check_call(change_cores_command, shell=True)

    copy_config_command = "/root/spark-ec2/copy-dir --delete /root/spark/conf/"
    print "Copying the new configuration to the cluster with command ", copy_config_command
    subprocess.check_call(copy_config_command, shell=True)

    # Need to stop and re-start Spark, so that the new number of cores per worker takes effect.
    subprocess.check_call("/root/spark/sbin/stop-all.sh", shell=True)
    subprocess.check_call("/root/spark/sbin/start-all.sh", shell=True)

    parameters = [
        num_disk_tasks, items_per_partition, values_per_item, target_seconds,
        available_cores, num_compute_tasks
    ]

    stringified_parameters = ["{}".format(p) for p in parameters]
    command = ("/root/spark/bin/run-example DiskAndComputeJobs " +
               " ".join(stringified_parameters))
    print command
    subprocess.check_call(command, shell=True)

    utils.copy_and_zip_all_logs(parameters, slaves)

    # Clear the buffer cache to sidestep an issue with machines dying.
    subprocess.check_call(
        "/root/ephemeral-hdfs/bin/slaves.sh /root/spark-ec2/clear-cache.sh",
        shell=True)
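Taken together, the examples all follow the same shape: sweep one parameter, run a Spark job via a shell command, then call utils.copy_and_zip_all_logs to collect the logs. A distilled sketch of that pattern (the names here are illustrative, not from the original scripts):

import subprocess

import utils


def run_sweep(parameter_values, build_command, workers):
    # Run one experiment per parameter value and archive the logs after each run.
    for value in parameter_values:
        command = build_command(value)
        print("Running experiment using command: {}".format(command))
        subprocess.check_call(command, shell=True)
        utils.copy_and_zip_all_logs([str(value)], workers)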