def disk_benchmark(input_directory, output_directory, benchmark_size_per_disk,
                   **kwargs):
    """Build a job config that benchmarks disk throughput.

    Args:
        input_directory: directory whose disks are to be benchmarked.
        output_directory: destination for results; when None, a sibling
            directory of input_directory named "disk_speeds" is used.
        benchmark_size_per_disk: size of data written per disk, in any unit
            understood by uc.parse_and_convert (converted to bytes here).
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict, with DISK_BENCHMARK_DATA_SIZE set in "params".
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(input_directory,
                                                   "disk_speeds")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    # Benchmark output is meaningful only as a single consolidated partition.
    utils.force_single_partition(config)

    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    # setdefault + update replaces the original's manual membership check and
    # key-by-key copy loop; behavior is identical.
    config.setdefault("params", {}).update(
        {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes})

    return config
def disk_benchmark(input_directory, output_directory, benchmark_size_per_disk,
                   **kwargs):
    """Build a job config that benchmarks disk throughput.

    Args:
        input_directory: directory whose disks are to be benchmarked.
        output_directory: destination for results; when None, a sibling
            directory of input_directory named "disk_speeds" is used.
        benchmark_size_per_disk: size of data written per disk, in any unit
            understood by uc.parse_and_convert (converted to bytes here).
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict, with DISK_BENCHMARK_DATA_SIZE set in "params".
    """
    if output_directory is None:
        output_directory = utils.sibling_directory(input_directory,
                                                   "disk_speeds")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")

    # Benchmark output is meaningful only as a single consolidated partition.
    utils.force_single_partition(config)

    data_size_bytes = int(uc.parse_and_convert(benchmark_size_per_disk, "B"))

    # setdefault + update replaces the original's manual membership check and
    # key-by-key copy loop; behavior is identical.
    config.setdefault("params", {}).update(
        {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes})

    return config
def merge_files(input_directory, output_directory, **kwargs):
    """Build a job config that merges a directory's files into one partition.

    Args:
        input_directory: directory whose files are merged.
        output_directory: destination directory; when None, a sibling of
            input_directory named with the "%(dirname)s_merged" template is
            used.
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict produced by utils.mapreduce_job.
    """
    destination = output_directory
    if destination is None:
        destination = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    input_url, output_url = utils.generate_urls(input_directory, destination)

    job_config = utils.mapreduce_job(input_dir=input_url,
                                     output_dir=output_url)
    # Merging requires everything to land in a single partition.
    utils.force_single_partition(job_config)
    return job_config
def merge_files(input_directory, output_directory, hdfs, **kwargs):
    """Build a job config that merges a directory's files into one partition.

    Args:
        input_directory: directory whose files are merged.
        output_directory: destination directory; when None, a sibling of
            input_directory named with the "%(dirname)s_merged" template is
            used.
        hdfs: forwarded to utils.generate_urls (presumably selects HDFS vs.
            local URLs — confirm against that helper).
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict produced by utils.mapreduce_job.
    """
    destination = output_directory
    if destination is None:
        destination = utils.sibling_directory(
            input_directory, "%(dirname)s_merged")

    input_url, output_url = utils.generate_urls(
        input_directory, destination, hdfs)

    job_config = utils.mapreduce_job(input_dir=input_url,
                                     output_dir=output_url)
    # Merging requires everything to land in a single partition.
    utils.force_single_partition(job_config)
    return job_config
def sum_values(input_directory, output_directory, **kwargs):
    """Build a job config that sums all values under a single key.

    Args:
        input_directory: directory containing the records to sum.
        output_directory: destination directory; when None, a sibling of
            input_directory named with the "%(dirname)s_sumcounts" template
            is used.
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict produced by utils.mapreduce_job.
    """
    destination = output_directory
    if destination is None:
        destination = utils.sibling_directory(
            input_directory, "%(dirname)s_sumcounts")

    input_url, output_url = utils.generate_urls(input_directory, destination)

    # ZeroKeyMapFunction maps everything to one key so the reducer sees all
    # values together; a single partition keeps the total in one output.
    job_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="ZeroKeyMapFunction",
        reduce_function="SumValuesReduceFunction")
    utils.force_single_partition(job_config)
    return job_config
def sum_values(input_directory, output_directory, **kwargs):
    """Build a job config that sums all values under a single key.

    Args:
        input_directory: directory containing the records to sum.
        output_directory: destination directory; when None, a sibling of
            input_directory named with the "%(dirname)s_sumcounts" template
            is used.
        **kwargs: ignored; accepted so all job builders share one signature.

    Returns:
        The job config dict produced by utils.mapreduce_job.
    """
    destination = output_directory
    if destination is None:
        destination = utils.sibling_directory(
            input_directory, "%(dirname)s_sumcounts")

    input_url, output_url = utils.generate_urls(input_directory, destination)

    # ZeroKeyMapFunction maps everything to one key so the reducer sees all
    # values together; a single partition keeps the total in one output.
    job_config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="ZeroKeyMapFunction",
        reduce_function="SumValuesReduceFunction")
    utils.force_single_partition(job_config)
    return job_config