def run(self, num_run):
    """Run the configured benchmark *num_run* times on this cluster.

    Resets the ``out_folders`` cfg section, defaults ``delete_hdfs`` to
    'true' (wipe the dataset once), then runs the benchmark in a loop.
    After the first successful iteration ``delete_hdfs`` is flipped to
    'false' so later runs reuse the generated dataset.

    :param num_run: number of benchmark iterations to execute
    """
    with utils.open_cfg(mode='w') as cfg:
        cfg['out_folders'] = {}
        # Default to wiping HDFS on the first run only, unless already set.
        if 'delete_hdfs' not in cfg['main']:
            cfg['main']['delete_hdfs'] = 'true'
        cfg['main']['num_run'] = str(num_run)
    for i in range(num_run):
        if self.cluster_id == c.CLUSTER_MAP['spark']:
            print(bold('Experiment ({}/{})'.format(i + 1, num_run)))
        try:
            self.retrieve_nodes()
            with utils.open_cfg(mode='w') as cfg:
                cfg['main']['iter_num'] = str(i + 1)
            x_run.run_benchmark(self.nodes)
            if i == 0:
                # Dataset now exists: stop deleting HDFS for later runs.
                with utils.open_cfg(mode='w') as cfg:
                    cfg['main']['delete_hdfs'] = 'false'
        except (OSError, IOError) as exc:
            print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                exc, i + 1, num_run))
def submit(args):
    """Submit one benchmark experiment per experiment file.

    For each existing experiment JSON file: read its ``BenchmarkName``,
    rebuild the shared cfg from scratch (preserving only the ``hdfs``
    section) and launch the runs via :func:`run_xspark`.

    :param args: parsed CLI namespace (num_runs, reuse_dataset, exp_file_paths)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        if exp_file.exists():
            # BUG FIX: json.load(open(...)) leaked the file handle.
            with open(exp_filepath) as exp_fp:
                experiment = json.load(exp_fp)
            try:
                benchmark = experiment["BenchmarkName"]
            except KeyError as error:
                print("ERROR: {} in experiment file: {}".format(error, exp_filepath))
                exit(1)
            with utils.open_cfg(mode='w') as cfg:
                # Start from a clean cfg, keeping only the hdfs section.
                for s in cfg.sections():
                    if s != 'hdfs':
                        cfg.remove_section(s)
                cfg['main'] = {}
                cfg['main']['tool_on_master'] = 'false'
                cfg['main']['experiment_file'] = exp_filepath
                cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
                cfg['main']['iter_num'] = str(1)
                cfg['submit'] = {}
                cfg[benchmark] = {}
                if reuse_dataset:
                    cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            print(bold('Submit experiment {} performing {} runs for benchmark {} on cluster {}'.format(exp_filepath, num_run, benchmark, cluster_id,)))
            run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                       cluster_id=cluster_id, run=1, terminate=0, reboot=0)
def run_xspark(current_cluster, num_instance=c.NUM_INSTANCE, num_run=c.NUM_RUN,
               cluster_id=c.CLUSTER_ID, terminate=c.TERMINATE, run=c.RUN,
               reboot=c.REBOOT, assume_yes=False):
    """Main function;
    * Launch NUMINSTANCE virtual machines
    * Run DagSymb Application
    * Download Log
    * Plot data from log
    """
    print(header('run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
                 .format(num_instance, num_run, cluster_id, terminate, run, reboot)))
    # Record the target cluster in the shared cfg before doing anything else.
    with utils.open_cfg(mode='w') as cfg:
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'current_cluster', current_cluster)
        if 'tool_on_master' not in cfg['main']:
            cfg.set('main', 'tool_on_master', 'false')
    bench_instance = BenchInstanceFactory.get_bench_instance(c.PROVIDER, cluster_id)
    # Setup is attempted only when instances were requested; otherwise
    # assume the cluster is already provisioned.
    setup_ok = bench_instance.setup(num_instance, assume_yes) if num_instance > 0 else True
    if reboot:
        bench_instance.reboot()
    if setup_ok and run:
        bench_instance.run(num_run)
    if terminate:
        bench_instance.terminate()
def run_disabled(self, num_run):
    """Legacy benchmark loop kept for reference; same contract as run()."""
    with utils.open_cfg(mode='w') as cfg:
        cfg['out_folders'] = {}
        cfg['main']['delete_hdfs'] = 'true'
    for iteration in range(1, num_run + 1):
        if self.cluster_id == c.CLUSTER_MAP['spark']:
            print(bold('Experiment ({}/{})'.format(iteration, num_run)))
        try:
            self.retrieve_nodes()
            x_run.run_benchmark(self.nodes)
            if iteration == 1:
                # First run produced the dataset: keep it from now on.
                with utils.open_cfg(mode='w') as cfg:
                    cfg['main']['delete_hdfs'] = 'false'
        except (OSError, IOError) as exc:
            print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                exc, iteration, num_run))
def kill_cluster(cluster):
    """Tear down every instance of *cluster* and reset its cfg section."""
    target_id = c.CLUSTER_MAP[cluster]
    print(bold('Terminate {}...'.format(target_id)))
    run_xspark(current_cluster=cluster, num_instance=0, cluster_id=target_id,
               run=0, terminate=1, reboot=0)
    # Wipe the per-cluster section so stale settings do not survive teardown.
    with utils.open_cfg(mode='w') as cfg:
        cfg[cluster] = {}
def setup_application_agnostic(args):
    """Provision cluster(s) without any application-specific cfg entries."""
    cluster = args.cluster
    num_instances = args.num_instances
    assume_yes = args.assume_yes
    # An hdfs (or full) setup invalidates any previously stored cfg state.
    if cluster in ('all', 'hdfs'):
        with utils.open_cfg(mode='w') as cfg:
            for section in cfg.sections():
                cfg.remove_section(section)
    if cluster == 'all':
        setup_cluster('hdfs', num_instances, assume_yes)
        setup_cluster('spark', num_instances, assume_yes)
    else:
        setup_cluster(cluster, num_instances, assume_yes)
def profile(args):
    """Run profiling experiments, one per experiment JSON file.

    For each existing experiment file: read its ``BenchmarkName``, rebuild
    the shared cfg (keeping only the ``hdfs`` section), run the benchmark,
    then average the runs and deploy the resulting profile when processing
    happens locally.

    :param args: parsed CLI namespace (num_runs, reuse_dataset,
        exp_file_paths, spark_seq)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    spark_seq = args.spark_seq if args.spark_seq else False
    index = 0
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            # BUG FIX: json.load(open(...)) leaked the file handle.
            with open(exp_filepath) as exp_fp:
                experiment = json.load(exp_fp)
            try:
                benchmark = experiment["BenchmarkName"]
            except KeyError as error:
                print("ERROR: {} in experiment file: {}".format(error, exp_filepath))
                exit(1)
            with utils.open_cfg(mode='w') as cfg:
                for s in cfg.sections():
                    if s != 'hdfs':
                        cfg.remove_section(s)
                cfg['main'] = {}
                cfg['main']['tool_on_master'] = 'false'
                cfg['main']['experiment_file'] = exp_filepath
                cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
                cfg['main']['iter_num'] = str(1)  # vboxvm
                cfg['main']['num_experiments'] = str(num_experiments)
                cfg['main']['experiment_num'] = str(index)
                cfg['profile'] = {}
                cfg['profile']['spark_seq'] = str(spark_seq)
                cfg[benchmark] = {}
                cfg[benchmark]['profile_name'] = '{}'.format(c.VAR_PAR_MAP[benchmark]['profile_name'])
                if reuse_dataset:
                    cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            print(bold('Profile experiment {} performing {} runs for benchmark {} on cluster {}'.format(exp_filepath, num_run, benchmark, cluster_id,)))
            run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                       cluster_id=cluster_id, run=1, terminate=0, reboot=0)
            if not c.PROCESS_ON_SERVER:
                average_runs.main(profile_name=utils.get_cfg()[benchmark]['profile_name'])
                deploy_profile(benchmark, cluster_id)
def launch_symex(args):
    """Launch a symbolic-execution application on the spark cluster.

    Builds a fresh cfg (keeping only ``hdfs``) describing the application
    jar/class and its argument list, then runs it ``num_runs`` times;
    optional local post-processing (log profiling / time analysis).

    :param args: parsed CLI namespace (app_name, app_jar, app_class,
        app_args, num_runs, reuse_dataset, max_executors, num_partitions,
        profile, time_analysis)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    app_name = args.app_name
    app_jar = args.app_jar
    app_class = args.app_class
    app_args = args.app_args
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    arg_string = ''
    with utils.open_cfg(mode='w') as cfg:
        for s in cfg.sections():
            if s != 'hdfs':
                cfg.remove_section(s)
        cfg['main'] = {}
        cfg['main']['profile'] = 'true' if args.profile else 'false'
        cfg['main']['time_analysis'] = 'true' if args.time_analysis else 'false'
        cfg['main']['tool_on_master'] = 'false'
        cfg['main']['app_name'] = '{}'.format(app_name)
        cfg['main']['app_jar'] = '{}'.format(app_jar)
        cfg['main']['app_class'] = '{}'.format(app_class)
        cfg['app_args'] = {}
        # enumerate() replaces the hand-rolled position counter.
        for app_arg_pos, app_arg in enumerate(app_args):
            cfg['app_args'][str(app_arg_pos)] = '{}'.format(app_arg)
            arg_string += ' {}'.format(app_arg)
        arg_string += ' {}'.format(str(num_partitions))
        cfg['main']['child_args_string'] = '{}'.format(arg_string)
        cfg['main']['num_partitions'] = str(num_partitions)
        if reuse_dataset:
            cfg['main']['delete_hdfs'] = str(not reuse_dataset)
        if max_executors:
            cfg['main']['max_executors'] = str(max_executors)
    print(bold('Launch {} Experiments for application {} on cluster {} with args: {}'.format(num_run, app_name, cluster_id, arg_string)))
    run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
               cluster_id=cluster_id, run=1, terminate=0, reboot=0)
    if not c.PROCESS_ON_SERVER:
        if args.profile:
            run_log_profiling(None)
        if args.time_analysis:
            run_time_analysis(None)
def setup(args):
    """Provision cluster(s) and record the setup state in the shared cfg."""
    cluster = args.cluster
    num_instances = args.num_instances
    assume_yes = args.assume_yes
    with utils.open_cfg(mode='w') as cfg:
        # An hdfs (or full) setup starts from an empty cfg.
        if cluster in ('all', 'hdfs'):
            for section in cfg.sections():
                cfg.remove_section(section)
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'setup', 'true')
        if args.app_dir:
            cfg.set('main', 'appdir', args.app_dir)
    if cluster == 'all':
        setup_cluster('hdfs', num_instances, assume_yes)
        setup_cluster('spark', num_instances, assume_yes)
    else:
        setup_cluster(cluster, num_instances, assume_yes)
def launch_exp(args):
    """Launch benchmark experiments, one batch per varied-parameter value."""
    cluster_id = c.CLUSTER_MAP['spark']
    var_par = args.var_par
    bench = args.benchmark
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    for value in var_par:
        with utils.open_cfg(mode='w') as cfg:
            # Reset everything except the hdfs section.
            for section in cfg.sections():
                if section != 'hdfs':
                    cfg.remove_section(section)
            cfg['main'] = {}
            cfg['main']['profile'] = 'true' if args.profile else 'false'
            cfg['main']['time_analysis'] = 'true' if args.time_analysis else 'false'
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['benchmark'] = bench
            cfg[bench] = {}
            cfg[bench][c.VAR_PAR_MAP[bench]['var_name']] = '{}'.format(value)
            cfg[bench]['num_partitions'] = str(num_partitions)
            if reuse_dataset:
                cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            if max_executors:
                cfg['main']['max_executors'] = str(max_executors)
        print(
            bold(
                'Launch {} Experiments for benchmark {} on cluster {} with {}={}...'
                .format(num_run, bench, cluster_id,
                        c.VAR_PAR_MAP[bench]['var_name'], value)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
        if not c.PROCESS_ON_SERVER:
            if args.profile:
                run_log_profiling(None)
            if args.time_analysis:
                run_time_analysis(None)
def profile_disabled(args):
    """Legacy profiling entry point (superseded by profile()).

    Profiles the benchmark once per varied-parameter value, then averages
    the runs and deploys the resulting profile.

    :param args: parsed CLI namespace (var_par, exp_profile_name,
        benchmark, num_runs, max_executors, num_partitions)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    var_par = args.var_par
    exp_profile_name = args.exp_profile_name if args.exp_profile_name else ""
    benchmark = args.benchmark
    num_run = args.num_runs
    max_executors = args.max_executors
    num_partitions = args.num_partitions
    for v in var_par:
        with utils.open_cfg(mode='w') as cfg:
            cfg['main'] = {}
            cfg['main']['profile'] = 'true'
            cfg['main']['tool_on_master'] = 'false'
            cfg['main']['benchmark'] = benchmark
            cfg['main']['iter_num'] = str(1)  # vboxvm
            cfg[benchmark] = {}
            cfg[benchmark][c.VAR_PAR_MAP[benchmark]['var_name']] = '({}, {})'.format(
                c.VAR_PAR_MAP[benchmark]['default'][0], v)
            cfg[benchmark]['profile_name'] = \
                '{}'.format(c.VAR_PAR_MAP[benchmark]['profile_name']) if not args.exp_profile_name else args.exp_profile_name
            cfg[benchmark]['num_partitions'] = str(num_partitions)
            if max_executors:
                # BUG FIX: configparser values must be strings; storing the
                # raw value raised TypeError (all sibling entry points use str()).
                cfg['main']['max_executors'] = str(max_executors)
        print(
            bold(
                'Profile {} performing {} runs for benchmark {} on cluster {} with {}={}...'
                .format(exp_profile_name, num_run, benchmark, cluster_id,
                        c.VAR_PAR_MAP[benchmark]['var_name'], v)))
        run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                   cluster_id=cluster_id, run=1, terminate=0, reboot=0)
        average_runs.main(
            profile_name=utils.get_cfg()[benchmark]['profile_name'])
        deploy_profile(benchmark, cluster_id)
def run_xspark_disabled(current_cluster, num_instance=c.NUM_INSTANCE, num_run=c.NUM_RUN,
                        cluster_id=c.CLUSTER_ID, terminate=c.TERMINATE, run=c.RUN,
                        reboot=c.REBOOT, assume_yes=False):
    """Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log

    Dry-run variant: setup/reboot/terminate are only echoed, the benchmark
    itself still runs.
    """
    print(
        header(
            'run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
            .format(num_instance, num_run, cluster_id, terminate, run, reboot)))
    # get cfg_file and initialize main settings
    with utils.open_cfg(mode='w') as cfg:
        if 'main' not in cfg:
            cfg['main'] = {}
        cfg.set('main', 'current_cluster', current_cluster)
    bench_instance = BenchInstanceFactory.get_bench_instance(
        c.PROVIDER, cluster_id)
    setup_ok = True
    # BUG FIX: `println` is not defined in Python (NameError); the dry-run
    # traces must use print().
    if num_instance > 0:
        print("setup_ok = bench_instance.setup(num_instance, assume_yes)")
    if reboot:
        print("bench_instance.reboot()")
    if setup_ok and run:
        bench_instance.run(num_run)
    if terminate:
        print("bench_instance.terminate()")
def submit_symex(args):
    """Submit symbolic-execution experiments, one per experiment JSON file.

    Reads the application description (jar, class, guard evaluator,
    arguments) from each file, scales numeric arguments by
    ``DataMultiplier`` (except the white-listed ones), rebuilds the cfg
    and launches the runs.

    :param args: parsed CLI namespace (num_runs, reuse_dataset, exp_file_paths)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    index = 0
    app_name = ''
    app_jar = ''
    app_class = ''
    guard_evaluator_class = ''
    num_partitions = ''
    app_args = {}
    meta_profile_name = ''
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            # BUG FIX: json.load(open(...)) leaked the file handle.
            with open(exp_filepath) as exp_fp:
                experiment = json.load(exp_fp)
            try:
                app_name = experiment["AppName"]
                app_jar = experiment["AppJar"]
                app_class = experiment["AppClass"]
                guard_evaluator_class = experiment["GuardEvaluatorClass"]
                num_partitions = experiment["NumPartitions"]
                app_args = experiment["AppConf"]
                # BUG FIX: `experiment["X"] if experiment["X"] else default`
                # raised KeyError (-> exit) when the optional key was absent,
                # so the default never applied; .get() makes it truly optional.
                data_multiplier = experiment.get("DataMultiplier") or 1
                meta_profile_name = experiment.get("MetaProfileName") or meta_profile_name
            except KeyError as error:
                print("ERROR: {} in experiment file: {}".format(error, exp_filepath))
                exit(1)
            with utils.open_cfg(mode='w') as cfg:
                for s in cfg.sections():
                    if s != 'hdfs':
                        cfg.remove_section(s)
                cfg['main'] = {}
                cfg['main']['app_name'] = app_name
                cfg['main']['app_jar'] = app_jar
                cfg['main']['app_class'] = app_class
                cfg['main']['guard_evaluator_class'] = guard_evaluator_class
                cfg['main']['tool_on_master'] = 'false'
                cfg['main']['experiment_file'] = exp_filepath
                cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
                cfg['experiment'] = {}
                cfg['experiment']['app_name'] = app_name
                cfg['experiment']['profile_name'] = app_name
                cfg['experiment']['meta_profile_name'] = meta_profile_name
                cfg['app_args'] = {}
                arg_string = ''
                # These arguments must never be scaled by the data multiplier.
                not_to_scale_args = ["pastMonths", "inputFile", "outputFile", "delimiter", "parallelism", "minimumCompressionProgress", "progressCounter"]
                for key_app_arg in sorted(app_args.keys(), key=lambda k: int(k)):
                    app_arg_name = '{}'.format(app_args[key_app_arg]["Name"])
                    app_arg_val = '{}'.format(app_args[key_app_arg]["Value"])
                    app_arg_value = app_arg_val if app_arg_name in not_to_scale_args else '{}'.format(int(app_arg_val) * int(data_multiplier))
                    cfg['app_args']['arg' + key_app_arg + ': ' + app_arg_name] = app_arg_value
                    arg_string += ' {}'.format(app_arg_value)
                cfg['main']['child_args_string'] = '{}'.format(arg_string)
                cfg['main']['num_partitions'] = str(num_partitions)
                cfg['main']['iter_num'] = str(1)  # vboxvm
                cfg['main']['num_experiments'] = str(num_experiments)
                cfg['main']['experiment_num'] = str(index)
                if reuse_dataset:
                    cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            print(bold('Submit experiment {} performing {} runs for application {} on cluster {}'.format(exp_filepath, num_run, app_name, cluster_id,)))
            run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                       cluster_id=cluster_id, run=1, terminate=0, reboot=0)
def profile_symex(args):
    """Profile symbolic-execution experiments, one per experiment JSON file.

    Same cfg preparation as submit_symex(), plus a ``profile`` section;
    after the runs it averages results, joins the stage DAGs and deploys
    the meta profile along with every per-application profile.

    :param args: parsed CLI namespace (num_runs, reuse_dataset,
        exp_file_paths, spark_seq)
    """
    cluster_id = c.CLUSTER_MAP['spark']
    num_run = args.num_runs
    reuse_dataset = args.reuse_dataset
    exp_filepaths = args.exp_file_paths if args.exp_file_paths else ["experiment.json"]
    num_experiments = len(exp_filepaths)
    spark_seq = args.spark_seq if args.spark_seq else False
    index = 0
    app_name = ''
    app_jar = ''
    app_class = ''
    guard_evaluator_class = ''
    num_partitions = ''
    app_args = {}
    meta_profile_name = ''
    for exp_filepath in exp_filepaths:
        exp_file = Path(exp_filepath)
        index += 1
        if exp_file.exists():
            # BUG FIX: json.load(open(...)) leaked the file handle.
            with open(exp_filepath) as exp_fp:
                experiment = json.load(exp_fp)
            try:
                app_name = experiment["AppName"]
                app_jar = experiment["AppJar"]
                app_class = experiment["AppClass"]
                guard_evaluator_class = experiment["GuardEvaluatorClass"]
                num_partitions = experiment["NumPartitions"]
                app_args = experiment["AppConf"]
                # BUG FIX: the old `experiment["X"] if experiment["X"] else d`
                # raised KeyError (-> exit) when the optional key was absent.
                data_multiplier = experiment.get("DataMultiplier") or 1
                meta_profile_name = experiment.get("MetaProfileName") or meta_profile_name
            except KeyError as error:
                print("ERROR: {} in experiment file: {}".format(error, exp_filepath))
                exit(1)
            with utils.open_cfg(mode='w') as cfg:
                for s in cfg.sections():
                    if s != 'hdfs':
                        cfg.remove_section(s)
                cfg['main'] = {}
                cfg['main']['app_name'] = app_name
                cfg['main']['app_jar'] = app_jar
                cfg['main']['app_class'] = app_class
                cfg['main']['guard_evaluator_class'] = guard_evaluator_class
                cfg['main']['tool_on_master'] = 'false'
                cfg['main']['experiment_file'] = exp_filepath
                cfg['main']['process_on_server'] = str(c.PROCESS_ON_SERVER)
                cfg['experiment'] = {}
                cfg['experiment']['app_name'] = app_name
                cfg['experiment']['profile_name'] = app_name
                cfg['experiment']['meta_profile_name'] = meta_profile_name
                cfg['app_args'] = {}
                arg_string = ''
                # These arguments must never be scaled by the data multiplier.
                not_to_scale_args = ["pastMonths", "inputFile", "outputFile", "delimiter", "parallelism", "minimumCompressionProgress", "progressCounter"]
                for key_app_arg in sorted(app_args.keys(), key=lambda k: int(k)):
                    app_arg_name = '{}'.format(app_args[key_app_arg]["Name"])
                    app_arg_val = '{}'.format(app_args[key_app_arg]["Value"])
                    app_arg_value = app_arg_val if app_arg_name in not_to_scale_args else '{}'.format(int(app_arg_val) * int(data_multiplier))
                    cfg['app_args']['arg' + key_app_arg + ': ' + app_arg_name] = app_arg_value
                    arg_string += ' {}'.format(app_arg_value)
                cfg['main']['child_args_string'] = '{}'.format(arg_string)
                cfg['main']['num_partitions'] = str(num_partitions)
                cfg['main']['iter_num'] = str(1)  # vboxvm
                cfg['main']['num_experiments'] = str(num_experiments)
                cfg['main']['experiment_num'] = str(index)
                cfg['profile'] = {}
                cfg['profile']['spark_seq'] = str(spark_seq)
                cfg['profile']['profile_name'] = app_name
                cfg['profile']['metaprofile_name'] = meta_profile_name
                if reuse_dataset:
                    cfg['main']['delete_hdfs'] = str(not reuse_dataset)
            print(bold('Profile experiment {} performing {} runs for application {} on cluster {}'.format(exp_filepath, num_run, app_name, cluster_id,)))
            run_xspark(current_cluster='spark', num_instance=0, num_run=num_run,
                       cluster_id=cluster_id, run=1, terminate=0, reboot=0)
            if not c.PROCESS_ON_SERVER:
                average_runs.main(profile_name=utils.get_cfg()['experiment']['profile_name'])
                join_jsons.join_dags(OUTPUT_DIR)
                deploy_meta_profile(meta_profile_name, cluster_id, True)
                # upload all the normal (non-meta) profiles
                for filename in os.listdir(OUTPUT_DIR):
                    profilename = filename.split(os.sep)[-1].split(".")[0]
                    profile_fname = filename.split(os.sep)[-1]
                    if profilename != meta_profile_name and "collection" not in profilename and profile_fname.split(".")[-1] == "json":
                        deploy_meta_profile(profilename, cluster_id)
def main(input_dir=INPUT_DIR, json_out_dir=OUTPUT_DIR, reprocess=False):
    """Transform Spark event logs into per-stage JSON profiles.

    For every ``app-*`` event log in *input_dir*:
      1. locate the matching run file (``*.dat``, falling back to ``*.err``)
         in the per-application data folder;
      2. parse the event log twice — first for submitted/completed stages
         and their accumulator metrics, then for job-level info and stages
         that were referenced but never submitted (skipped);
      3. derive parents, actual durations, nominal rates and weights;
      4. dump the stage dictionary as JSON and move the log into
         ``processed_logs``.

    :param input_dir: directory scanned for Spark event logs
    :param json_out_dir: output directory; when falsy a per-app directory
        under OUTPUT_DIR is used
    :param reprocess: when True, re-run over the already-processed logs
    """
    processed_dir = os.path.join(ROOT_DIR, 'processed_logs')
    if reprocess:
        input_dir = processed_dir
    make_sure_path_exists(input_dir)
    make_sure_path_exists(processed_dir)
    print(
        "Start log profiling: \ninput_dir:\t{}\nprocessed_dir:\t{}\noutput_dir:\t{}"
        .format(input_dir, processed_dir, json_out_dir))
    log_index = 0
    for log in glob.glob(os.path.join(input_dir, 'app-*')):
        app_name = ""
        app_start_time = 0  # from ApplicationStart (currently informational)
        app_end_time = 0    # from ApplicationEnd (currently informational)
        app_act_start_time = 0
        app_act_end_time = 0
        # NOTE(review): path is relative (no leading separator) — confirm intended.
        dat_folder = 'home/ubuntu/dagsymb/num/' + log.split('.')[0].split(
            os.sep)[-1]
        print(ROOT_DIR, os.getcwd(), dat_folder)
        files = os.listdir(dat_folder)
        print("Files: ", files)
        # Prefer the app/run .dat file; fall back to a non-throughput .err file.
        dat_files = [
            x for x in files if x.split('.')[-1] == 'dat' and (
                x.split('.')[-2] == 'app' or x.split('_')[-2] == 'run')
        ]
        dat_file = dat_files[0] if len(dat_files) > 0 else ''
        if dat_file == '':
            err_files = [
                x for x in files if x.split('.')[-1] == 'err'
                and x.split('.')[-2] != 'scheduling-throughput'
            ]
            dat_file = err_files[0] if len(err_files) > 0 else ''
            print("Files .err: ", dat_files)
        dat_filepath = dat_folder + '/' + dat_file
        print(".dat filePath: ", dat_filepath)
        try:
            with utils.open_cfg(rpath=dat_folder) as cfg:
                profile_suffix = cfg['experiment']['profile_name'].split(
                    "-")[-1]
        except Exception:
            # Best effort: fall back to a running index when cfg is unreadable.
            profile_suffix = log_index
        stages = []
        last_stage = 0
        # Build stage dictionary
        stage_dict = OrderedDict()
        # BUG FIX: start_stage used to be assigned only when a .dat file
        # existed, so a missing run file raised NameError during parsing.
        start_stage = 0

        # --- pass 1: application and stage submission/completion events ---
        if ".bz" in log:
            file_open = bz2.BZ2File(log, "r")
        else:
            file_open = open(log)
        with file_open as logfile:
            if dat_file != '':
                with open(dat_filepath) as dat:
                    # A data-generation run prepends one stage: shift all ids.
                    start_stage = 1 if contains_generation(dat) else 0
            for line in logfile:
                if ".bz" in log:
                    line = line.decode("utf-8")
                data = json.loads(line)
                try:
                    if data["Event"] == "SparkListenerApplicationStart":
                        app_name = data["App Name"]
                        app_start_time = data["Timestamp"]
                        stage_dict["jobs"] = {}
                        id_symbols = []
                    elif data["Event"] == "SparkListenerApplicationEnd":
                        app_end_time = data["Timestamp"]
                    elif data["Event"] == "SparkListenerStageSubmitted":
                        stage = data["Stage Info"]
                        stage_id = int(stage["Stage ID"]) - start_stage
                        if stage_id < 0:
                            continue  # generation stage: not profiled
                        stages.append(stage_id)
                        if stage_id > last_stage:
                            last_stage = stage_id
                        if stage_id not in stage_dict.keys():
                            stage_dict[stage_id] = {}
                        if stage_id == 0:
                            # Stage 0 also carries the application-wide totals.
                            stage_dict[0]["monocoretotalduration"] = 0
                            stage_dict[0]["totalduration"] = 0
                            stage_dict[0]["actualtotalduration"] = 0
                        stage_dict[stage_id]["duration"] = 0
                        stage_dict[stage_id]["name"] = stage['Stage Name']
                        stage_dict[stage_id]["genstage"] = False
                        stage_dict[stage_id]["parentsIds"] = list(
                            map(lambda x: x - start_stage,
                                stage["Parent IDs"]))
                        stage_dict[stage_id]["nominalrate"] = 0.0
                        stage_dict[stage_id]["weight"] = 0
                        stage_dict[stage_id]["RDDIds"] = {
                            x["RDD ID"]: {
                                "name": x["Name"],
                                "callsite": x["Callsite"]
                            }
                            for x in stage["RDD Info"]
                        }
                        stage_dict[stage_id]["skipped"] = False
                        stage_dict[stage_id]["cachedRDDs"] = []
                        stage_dict[stage_id]["numtask"] = 0
                        stage_dict[stage_id]["recordsread"] = 0.0
                        stage_dict[stage_id]["shufflerecordsread"] = 0.0
                        stage_dict[stage_id]["recordswrite"] = 0.0
                        stage_dict[stage_id]["shufflerecordswrite"] = 0.0
                        stage_dict[stage_id]["bytesread"] = 0.0
                        stage_dict[stage_id]["shufflebytesread"] = 0.0
                        stage_dict[stage_id]["byteswrite"] = 0.0
                        stage_dict[stage_id]["shufflebyteswrite"] = 0.0
                        for rdd_info in stage["RDD Info"]:
                            storage_level = rdd_info["Storage Level"]
                            if storage_level["Use Disk"] or storage_level["Use Memory"] or \
                                    storage_level["Deserialized"]:
                                stage_dict[stage_id]["cachedRDDs"].append(
                                    rdd_info["RDD ID"])
                    elif data["Event"] == "SparkListenerStageCompleted":
                        stage_id = data["Stage Info"]["Stage ID"] - start_stage
                        if stage_id < 0:
                            continue
                        stage_dict[stage_id]["numtask"] = data["Stage Info"][
                            'Number of Tasks']
                        # Accumulator names follow Spark's internal.metrics.*.
                        for acc in data["Stage Info"]["Accumulables"]:
                            if acc["Name"] == "internal.metrics.executorRunTime":
                                stage_dict[stage_id]["monocoreduration"] = int(
                                    acc["Value"])
                                stage_dict[0]["monocoretotalduration"] += int(
                                    acc["Value"])
                            if acc["Name"] == "internal.metrics.input.recordsRead":
                                stage_dict[stage_id]["recordsread"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.shuffle.read.recordsRead":
                                stage_dict[stage_id]["shufflerecordsread"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.output.recordsWrite":
                                stage_dict[stage_id]["recordswrite"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.shuffle.write.recordsWritten":
                                stage_dict[stage_id]["shufflerecordswrite"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.input.bytesRead":
                                stage_dict[stage_id]["bytesread"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.shuffle.read.localBytesRead":
                                stage_dict[stage_id]["shufflebytesread"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.output.bytesWrite":
                                stage_dict[stage_id]["byteswrite"] = acc["Value"]
                            if acc["Name"] == "internal.metrics.shuffle.write.bytesWritten":
                                stage_dict[stage_id]["shufflebyteswrite"] = acc["Value"]
                except KeyError as e:
                    print(e)

        # --- pass 2: job events and skipped stages ------------------------
        skipped = []
        if ".bz" in log:
            file_open = bz2.BZ2File(log, "r")
        else:
            file_open = open(log)
        with file_open as logfile:
            for line in logfile:
                if ".bz" in log:
                    line = line.decode("utf-8")
                data = json.loads(line)
                try:
                    if data["Event"] == "SparkListenerJobStart":
                        job_id = data["Job ID"]
                        stage_dict["jobs"][job_id] = {}
                        # Symbolic job id: name of the job's last stage plus a
                        # uniquifying sequence number.
                        id_symb_root = sorted(data["Stage Infos"], key=lambda k: k["Stage ID"])[-1]["Stage Name"]\
                            .replace(" at ", "_") + "_"
                        seq = 0
                        while id_symb_root + str(seq) in id_symbols:
                            seq += 1
                        id_symb = id_symb_root + str(seq)
                        id_symbols.append(id_symb)
                        stage_dict["jobs"][job_id]["id-symb"] = id_symb
                        stage_dict["jobs"][job_id]["stages"] = sorted(
                            data["Stage IDs"])
                        for stage in data["Stage Infos"]:
                            stage_id = stage["Stage ID"] - start_stage
                            if stage_id < 0:
                                continue
                            if stage_id not in stage_dict.keys():
                                # Referenced by a job but never submitted:
                                # record it as a skipped stage.
                                stage_dict[stage_id] = {}
                                stage_dict[stage_id]["duration"] = 0
                                stage_dict[stage_id]["name"] = stage['Stage Name']
                                stage_dict[stage_id]["genstage"] = False
                                stage_dict[stage_id]["parentsIds"] = list(
                                    map(lambda x: x - start_stage,
                                        stage["Parent IDs"]))
                                stage_dict[stage_id]["nominalrate"] = 0.0
                                stage_dict[stage_id]["weight"] = 0
                                stage_dict[stage_id]["RDDIds"] = {
                                    x["RDD ID"]: {
                                        "name": x["Name"],
                                        "callsite": x["Callsite"]
                                    }
                                    for x in stage["RDD Info"]
                                }
                                stage_dict[stage_id]["skipped"] = True
                                stage_dict[stage_id]["cachedRDDs"] = []
                                stage_dict[stage_id]["numtask"] = 0
                                stage_dict[stage_id]["recordsread"] = 0.0
                                stage_dict[stage_id]["shufflerecordsread"] = 0.0
                                stage_dict[stage_id]["recordswrite"] = 0.0
                                stage_dict[stage_id]["shufflerecordswrite"] = 0.0
                                stage_dict[stage_id]["bytesread"] = 0.0
                                stage_dict[stage_id]["shufflebytesread"] = 0.0
                                stage_dict[stage_id]["byteswrite"] = 0.0
                                stage_dict[stage_id]["shufflebyteswrite"] = 0.0
                                for rdd_info in stage["RDD Info"]:
                                    storage_level = rdd_info["Storage Level"]
                                    if storage_level["Use Disk"] or storage_level["Use Memory"] or \
                                            storage_level["Deserialized"]:
                                        stage_dict[stage_id]["cachedRDDs"].append(
                                            rdd_info["RDD ID"])
                                skipped.append(stage_id)
                except KeyError:
                    pass  # events without the expected keys are irrelevant here

        stage_dict_key_stages = [k for k in stage_dict.keys() if k != "jobs"]
        # Replace skipped stage id in parents ids based on RDD IDs
        for skipped_id in skipped:
            for stage_id1 in stage_dict_key_stages:
                if stage_id1 != skipped_id and stage_dict[skipped_id]["RDDIds"] == \
                        stage_dict[stage_id1]["RDDIds"]:
                    for stage_id2 in stage_dict_key_stages:
                        if skipped_id in stage_dict[stage_id2]["parentsIds"]:
                            stage_dict[stage_id2]["parentsIds"].remove(
                                skipped_id)
                            stage_dict[stage_id2]["parentsIds"].append(
                                stage_id1)
        # Infer missing parents through shared cached RDDs.
        for stage in stage_dict_key_stages:
            if len(stage_dict[stage]["parentsIds"]) == 0:
                try:
                    cached = list(stage_dict[stage]["cachedRDDs"])
                except KeyError:
                    pass
                for i in range(0, stage):
                    try:
                        for rdd in cached:
                            if rdd in stage_dict[i]["cachedRDDs"]:
                                stage_dict[stage]["parentsIds"].append(i)
                                cached.remove(rdd)
                    except KeyError:
                        pass
        stages = stage_dict_key_stages
        stage_act_start_times = [0] * len(stages)
        stage_act_end_times = [0] * len(stages)

        # --- actual wall-clock times from the run .dat file ---------------
        if dat_file != '':
            with open(dat_filepath) as dat:
                for line in dat:
                    tokens = line.split(' ')
                    if len(tokens) > 6:
                        if tokens[4] == 'Submitting' and (
                                tokens[5] == 'ResultStage'
                                or tokens[5] == 'ShuffleMapStage') and (
                                    start_stage == 0 or tokens[6] != '0'):
                            date = tokens[0]
                            time = tokens[1]
                            stage_act_start_times[
                                int(tokens[6]) -
                                start_stage] = date_time_to_timestamp_ms(
                                    date, time)
                            if tokens[6] == str(start_stage):
                                app_act_start_time = date_time_to_timestamp_ms(
                                    date, time)
                        if (tokens[4] == 'ResultStage'
                                or tokens[4] == 'ShuffleMapStage'
                                ) and tokens[9] == 'finished' and (
                                    start_stage == 0 or tokens[5] != '0'):
                            date = tokens[0]
                            time = tokens[1]
                            stage_act_end_times[
                                int(tokens[5]) -
                                start_stage] = date_time_to_timestamp_ms(
                                    date, time)
                            if tokens[5] == str(last_stage + start_stage):
                                app_act_end_time = date_time_to_timestamp_ms(
                                    date, time)
        else:
            print('_run.dat file not found, no actualdurations calculated')
        sum_of_stages_durations = 0
        for i in stages:
            stage_dict[i][
                "duration"] = stage_act_end_times[i] - stage_act_start_times[i]
            sum_of_stages_durations += stage_dict[i]["duration"]
        stage_dict[0]["totalduration"] = sum_of_stages_durations
        stage_dict[0][
            "actualtotalduration"] = app_act_end_time - app_act_start_time
        if stage_dict:
            gather_records_rw(stage_dict)

        # --- nominal rates and weights ------------------------------------
        stage_to_do = len(stage_dict_key_stages) - len(skipped)
        for stage_id in sorted(stage_dict_key_stages):
            parent_output = 0
            parent_input = 0
            parent_output_bytes = 0
            parent_input_bytes = 0
            if stage_id not in skipped:
                stage_dict[stage_id]["weight"] = stage_to_do
                stage_to_do -= 1
                for parent_id in stage_dict[stage_id]["parentsIds"]:
                    parent_output += stage_dict[parent_id]["recordswrite"]
                    parent_output += stage_dict[parent_id]["shufflerecordswrite"]
                    parent_input += stage_dict[parent_id]["recordsread"]
                    parent_input += stage_dict[parent_id]["shufflerecordsread"]
                    parent_output_bytes += stage_dict[parent_id]["byteswrite"]
                    parent_output_bytes += stage_dict[parent_id]["shufflebyteswrite"]
                    parent_input_bytes += stage_dict[parent_id]["bytesread"]
                    parent_input_bytes += stage_dict[parent_id]["shufflebytesread"]
                if parent_output != 0:
                    stage_dict[stage_id]["nominalrate"] = parent_output / (
                        stage_dict[stage_id]["monocoreduration"] / 1000.0)
                    stage_dict[stage_id]["nominalrate_bytes"] = parent_input_bytes / (
                        stage_dict[stage_id]["monocoreduration"] / 1000.0)
                elif parent_input != 0:
                    stage_dict[stage_id]["nominalrate"] = parent_input / (
                        stage_dict[stage_id]["monocoreduration"] / 1000.0)
                    stage_dict[stage_id]["nominalrate_bytes"] = parent_input_bytes / (
                        stage_dict[stage_id]["monocoreduration"] / 1000.0)
                else:
                    stage_input = stage_dict[stage_id]["recordsread"] + \
                        stage_dict[stage_id]["shufflerecordsread"]
                    stage_input_bytes = stage_dict[stage_id]["bytesread"] + \
                        stage_dict[stage_id]["shufflebytesread"]
                    if stage_input != 0 and stage_input != stage_dict[stage_id]["numtask"]:
                        stage_dict[stage_id]["nominalrate"] = stage_input / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                        stage_dict[stage_id]["nominalrate_bytes"] = stage_input_bytes / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                    else:
                        # NOTE(review): the original computed stage_output/
                        # stage_output_bytes here but still used stage_input
                        # (zero or == numtask) in the rates below — looks like
                        # a latent bug; behavior preserved, flagged for review.
                        stage_dict[stage_id]["nominalrate"] = stage_input / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                        stage_dict[stage_id]["nominalrate_bytes"] = stage_input_bytes / (
                            stage_dict[stage_id]["monocoreduration"] / 1000.0)
                if stage_dict[stage_id]["nominalrate"] == 0.0:
                    stage_dict[stage_id]["genstage"] = True
        totalduration = stage_dict[0]["monocoretotalduration"]
        for key in stage_dict_key_stages:
            if key not in skipped:
                old_weight = stage_dict[key]["weight"]
                stage_dict[key]["weight"] = np.mean([
                    old_weight,
                    totalduration / stage_dict[key]["monocoreduration"]
                ])
                totalduration -= stage_dict[key]["monocoreduration"]

        # --- JSON output --------------------------------------------------
        # Fold job info into stage 0 so the output is keyed by stage only.
        stage_dict[0]["jobs"] = stage_dict["jobs"]
        stage_dict.pop("jobs")
        # create output dir
        log_name = os.path.basename(log)
        output_dir = os.path.join(
            OUTPUT_DIR,
            re.sub("[^a-zA-Z0-9.-]", "_", app_name) + "_" +
            log_name.split("-")[1]) if not json_out_dir else json_out_dir
        make_sure_path_exists(output_dir)
        datagen_strings = ['datagen', 'scheduling-throughput']
        out_filename = 'app_datagen.json' if any(x in app_name.lower() for x in datagen_strings) \
            else re.sub("[^a-zA-Z0-9.-]", "_", app_name)+"-"+str(profile_suffix)+"_"+log_name.split("-")[1]+ ".json"
        print('ROOT_DIR: {}\nAPP_NAME: {}\noutputdir: {}\noutfilename:{}'.
              format(ROOT_DIR, app_name, output_dir, out_filename))
        with open(os.path.join(output_dir, out_filename), "w") as jsonoutput:
            json.dump(stage_dict, jsonoutput, indent=4, sort_keys=True)
        # Archive the processed log so it is not profiled again.
        os.rename(log, os.path.join(processed_dir, log.split(os.path.sep)[-1]))
        log_index += 1
print("MAX_EXECUTOR in process_on_server b4 local assignment: " + str(c.MAX_EXECUTOR)) c.cfg_dict["MaxExecutor"] = c.MAX_EXECUTOR = str(end_index - 1) print("MAX_EXECUTOR in process_on_server after local assignment: " + str(c.MAX_EXECUTOR)) c.CONFIG_DICT["Control"]["MaxExecutor"] = c.MAX_EXECUTOR c.cfg_dict["ConfigDict"] = c.CONFIG_DICT c.update_config_parms(c) ''' #print("process_on_server config_instance.cfg_dict: ") #pp.pprint(c.cfg_dict) #print("passed to log.download: c.CONFIG_DICT: ") #pp.pprint(c.CONFIG_DICT) # DOWNLOAD LOGS output_folder = log.download(logfolder, [i for i in nodes[:end_index]], master_ip, output_folder, c.CONFIG_DICT) with open_cfg() as cfg: profile = True if 'profile' in cfg else False profile_option = cfg.getboolean( 'main', 'profile') if 'main' in cfg and 'profile' in cfg['main'] else False if profile or profile_option: # Profiling processing.main() # Profiling for filename in os.listdir( './spark_log_profiling/output_json/'): # Profiling if output_folder.split("/")[-1].split( "-" )[-1] in filename: # Profiling # Profilimg shutil.copy('./spark_log_profiling/output_json/' + filename, output_folder + "/" + filename) # Profiling run.write_config(output_folder)
# Standalone post-processing entry point: read the experiment deadline from
# the cluster config stored in the folder passed on the command line, push it
# into every config view the downstream tools consult, then run the plotting
# and metrics steps — each best-effort, so one failing does not stop the other.
from drivers.ccglibcloud.ec2spot import set_spot_drivers
from drivers.azurearm.driver import set_azurearm_driver
from util.utils import get_cfg, write_cfg, open_cfg
#import config as c
import pprint

pp = pprint.PrettyPrinter(indent=4)

#from configure import config_instance
import libcloud.common.base

# Retry transient HTTP failures when talking to the cloud providers.
libcloud.common.base.RETRY_FAILED_HTTP_REQUESTS = True

folder = sys.argv[1]
#folder = "home/ubuntu/dagsymb/num/app-20190128162903-0000"
cfg_filename = os.path.join(folder, c.CLUSTERS_CFG_FILENAME)

with open_cfg(r_path=cfg_filename) as cfg:
    deadline = int(cfg["experiment"]["deadline"])
    # Mirror the deadline into all three config holders so that every
    # consumer (module constant, cfg_dict, CONFIG_DICT) agrees on it.
    c.CONFIG_DICT["Deadline"] = deadline
    c.cfg_dict["Deadline"] = deadline
    c.DEADLINE = deadline
    c.cfg_dict["ConfigDict"] = c.CONFIG_DICT
print(cfg_filename, c.CONFIG_DICT["Deadline"])

try:
    print('in plot')
    plot.plot(folder)
except Exception as e:
    print("Plot failed: ", e)

try:
    print('in metrics')
    metrics.compute_metrics(folder)
except Exception as e:
    print("Metrics failed: ", e)