def run_network_avro(logger, avro_in='v', avro_out=None):
    try:
        program_name = AVRO_APPS[(avro_in, avro_out)]
    except KeyError:
        raise ValueError(
            "not supported: avro_in=%s, avro_out=%s" % (avro_in, avro_out)
        )
    else:
        # Copy the application and its base module into the working dir and
        # make the entry point executable.
        program = os.path.join(WD, program_name)
        for name in program_name, "avro_base.py":
            shutil.copy(os.path.join(AVRO_PY_DIR, name), WD)
        os.chmod(program, os.stat(program).st_mode | stat.S_IEXEC)
        file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
        # Output schemas are only needed for the parts ('k', 'v' or both)
        # that are emitted as Avro.
        schema_k_out = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
        schema_v_out = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
        simulator = HadoopSimulatorNetwork(
            program, logger, logging.INFO,
            context_cls=AvroContext,
            avro_input=avro_in,
            avro_output=avro_out,
            avro_output_key_schema=schema_k_out,
            avro_output_value_schema=schema_v_out
        )
        with open(file_in, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
            simulator.run(fin, fout, {}, num_reducers=1)
        dump_counters(simulator, logger)
        if avro_out:
            # Dump the Avro container to a plain-text form before checking.
            data_out_des = DATA_OUT + '-des'
            avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
            avro_check_results.main(USERS_CSV_FN, data_out_des)
        else:
            avro_check_results.main(USERS_CSV_FN, DATA_OUT)
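# A minimal driver sketch (not part of the original source): it assumes the
# module-level constants referenced above are defined, and simply wires up a
# logger before exercising the key/value Avro case.
def _demo_run_network_avro():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("avro_network_sim")  # hypothetical logger name
    run_network_avro(logger, avro_in='kv', avro_out='kv')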
def main():
    program_name = './foobar'
    dump_to_disk('data.in', DATA)
    dump_to_disk(program_name, FOOBAR_PY)
    # 0777 is Python 2 syntax; the octal literal must be written 0o777.
    os.chmod(program_name, 0o777)
    hsn = HadoopSimulatorNetwork(program=program_name, loglevel=logging.INFO)
    hsn.run(open('data.in'), open('data.out', 'w'), {'a.useless.key': 'we'})
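# dump_to_disk is not defined in this excerpt; assuming DATA and FOOBAR_PY
# are plain strings, a plausible helper would be:
def dump_to_disk(filename, text):
    # Create (or truncate) filename and write text to it.
    with open(filename, 'w') as f:
        f.write(text)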
def run_network_minimal(logger):
    program_name = cp_script(os.path.join(WC_DIR, 'wordcount_minimal.py'))
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    hs = HadoopSimulatorNetwork(
        program=program_name, logger=logger, loglevel=logger.level
    )
    hs.run(open(data_in), open(data_out, 'wb'), conf)
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
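# dump_counters is defined elsewhere; assuming the simulator exposes a
# get_counters() method (as pydoop's simulators do), a sketch could look like
# the following. The nested-dict layout iterated over here is an assumption,
# not the documented return format:
def dump_counters(hs, logger):
    counters = hs.get_counters()
    for group, group_counters in counters.items():
        for name, value in group_counters.items():
            logger.info("%s.%s: %r", group, name, value)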
def run_network_full(logger):
    program_name = cp_script(os.path.join(WC_DIR, 'wordcount_full.py'))
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    hs = HadoopSimulatorNetwork(
        program=program_name, logger=logger, loglevel=logger.level
    )
    # The full application provides its own record reader/writer, so no input
    # or output streams are passed in; output lands in output_dir instead.
    hs.run(None, None, conf, input_split=input_split)
    data_out = os.path.join(
        output_dir, 'part-r-%05d' % int(conf["mapred.task.partition"])
    )
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
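# create_configuration is not shown here. Based on how its return values are
# used above, a hedged sketch might look like the following; the file names
# and the output dir key are illustrative assumptions (only
# "mapred.task.partition" is actually read by the code above):
def create_configuration():
    data_in = 'input.txt'  # hypothetical input file
    data_out = 'output.txt'  # hypothetical output file
    output_dir = os.path.realpath('./output')  # hypothetical output dir
    path = os.path.realpath(data_in)
    input_split = InputSplit.to_string(
        'file://' + path, 0, os.stat(path).st_size
    )
    conf = {
        "mapred.task.partition": "0",
        "mapred.work.output.dir": 'file://%s' % output_dir,  # assumed key
    }
    return data_in, data_out, conf, input_split, output_dir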
def main():
    program_name = './avro_pyrw.py'
    data_in = './users.avro'
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)
def main(argv):
    try:
        data_in = argv[1]
    except IndexError:
        sys.exit("Usage: python %s AVRO_FILE" % argv[0])
    shutil.copy('../schemas/stats.avsc', 'stats.avsc')
    program_name = cp_script('./avro_pyrw.py')
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)
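# cp_script is defined elsewhere; judging from how it is used, it likely
# copies the given script into the current directory and marks it executable,
# along the lines of this sketch (an assumption, not the original helper):
def cp_script(path):
    dest = os.path.join(os.getcwd(), os.path.basename(path))
    shutil.copy(path, dest)
    os.chmod(dest, os.stat(dest).st_mode | stat.S_IEXEC)
    return dest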