def main(directory_list):
    """Plot summary metrics side by side for several application runs.

    ``directory_list`` is the path of a text file naming one directory per
    line. Every file in those directories whose name contains '.js' is
    loaded with ``appInfo().buildFromJson`` and contributes one entry to
    each plot. The plots are written into a new
    'configuration-comparison-plots-<unix time>/' directory created next to
    the list file:

    * one bar plot per metric (``<metric>.png``),
    * 'mem-efficiency.png' via ``genMemoryUsagePlot``,
    * 'mem-scatter.png' and 'cpu-scatter.png' via ``scatterPlot``.

    Blank lines in the list file are skipped. When no application summary
    is found, a message is printed and nothing is created.
    """
    # Imported locally: the block only relies on ``join`` being imported at
    # file level, so ``dirname`` is brought into scope here.
    from os.path import dirname

    # Read the list up front so the handle is closed before any plotting.
    with open(directory_list, 'r') as fh:
        directories = [line.rstrip('\n\r') for line in fh]

    applications = []
    for directory in directories:
        if not directory.strip():
            # A blank (often trailing) line would make listdir('') fail.
            continue
        # TODO: make this parser more tolerant to unexpected JS files,
        # placement errors, etc
        for f in listdir(directory):
            if '.js' in f:
                print(f)
                applications.append(
                    appInfo().buildFromJson(join(directory, f)))

    if not applications:
        # min()/max() below cannot handle an empty list; bail out before
        # creating an empty output directory.
        print('No application summaries found for ' + directory_list)
        return

    # Build the output directory next to the list file. join() supplies the
    # separator that plain string concatenation would drop.
    plot_dir = join(
        dirname(directory_list),
        'configuration-comparison-plots-' + str(int(time.time()))) + '/'
    mkdir(plot_dir)

    # appInfo attribute name -> y-axis label of its bar plot.
    plots = {
        'running_time': 'running time (MS)',
        'gc_time': 'total time spent in GC (MS)',
        'max_heap': 'max heap usage (MB)',
        'avg_process_cpu_load': 'average cpu load',
        'avg_heap': 'average heap usage (MB)',
        'tasks_per_second': 'tasks per second',
        'avg_process_cpu_variance': 'average cpu variance',
        'gc_to_rt': 'fraction of time spent in GC (MS)',
    }
    for plot, label in plots.items():
        indicators = [(app.conf_id, getattr(app, plot))
                      for app in applications]
        genBarPlot(indicators, 'Configurations', label,
                   plot_dir + plot + '.png')

    memories = []
    mem_efficiencies = []
    cpu_loads = []
    for app in applications:
        conf_id = app.conf_id
        mem = int(conf_id.split('-')[1]) * 1000  # executor heap size in MB
        running_time = int(app.running_time)
        memories.append((conf_id, mem, app.max_heap))
        # float() keeps the ratio fractional even when both operands are
        # ints (Python 2 would otherwise truncate it).
        mem_efficiencies.append(
            (conf_id, app.max_heap / float(mem), running_time))
        cpu_loads.append((conf_id, app.avg_process_cpu_load, running_time))

    # Find the running-time range used for normalization.
    running_times = [rt for (_, _, rt) in cpu_loads]
    min_rt = min(running_times)
    max_rt = max(running_times)

    # Replace each raw running time by its normalized value.
    cpu_loads = [(app_id, cpu_load, normalize(rt, min_rt, max_rt))
                 for (app_id, cpu_load, rt) in cpu_loads]
    mem_efficiencies = [(app_id, mem, normalize(rt, min_rt, max_rt))
                        for (app_id, mem, rt) in mem_efficiencies]

    genMemoryUsagePlot(memories, plot_dir + 'mem-efficiency.png')
    scatterPlot(mem_efficiencies, 'Normalized running time (MS)',
                'Max heap usage over heap size', plot_dir + 'mem-scatter.png')
    scatterPlot(cpu_loads, 'Normalized running time (MS)', 'CPU load',
                plot_dir + 'cpu-scatter.png')
def _create_plots_dir(plots_dir):
    """Create ``plots_dir``; report and return False if it already exists."""
    if isdir(plots_dir):
        print('app already mined')
        return False
    mkdir(plots_dir)
    return True


def _mean(values):
    """Return the arithmetic mean of ``values`` using true division."""
    # float() prevents Python 2 from truncating a mean of integers.
    return sum(values) / float(len(values))


def _plot_executor_mode(app_info, pd):
    """Plot the BTrace time series of the driver and of every executor.

    Driver plots go to '<app_id>-driver-plots/' and executor plots to
    '<app_id>-executor-plots/', both relative to the current working
    directory. Processing stops as soon as one of these directories
    already exists.
    """
    # BtraceLogs and GC logs from all executors (GC logs are not used here).
    btracelogs, _ = findLogs(pd)
    # Second field of the configuration id, scaled by 1000; presumably the
    # executor heap size in MB -- confirm against genPlot's last parameter.
    heap_size = int(app_info.conf_id.split('-')[1]) * 1000

    # The driver log is a '.btrace' file stored directly in the run
    # directory; if several match, the last one listed wins.
    driver_btrace = None
    for f in listdir(pd):
        if '.btrace' in f:
            driver_btrace = BtraceLog(join(pd, f))

    if driver_btrace is None:
        # Without this guard the plotting calls would fail on None.
        print("No driver BTrace log exists.")
    else:
        driver_plots_dir = app_info.app_id + '-driver-plots/'
        if not _create_plots_dir(driver_plots_dir):
            return
        # (plot name, BtraceLog attribute, y-axis label)
        driver_series = (
            ('driver-heap-usage', 'heap', 'JVM Heap usage (MB)'),
            ('driver-non-heap-usage', 'non_heap', 'JVM non Heap usage (MB)'),
            ('driver-memory-usage', 'memory', 'JVM total memory usage (MB)'),
            ('driver-process-cpu-usage', 'process_cpu',
             'JVM CPU usage fraction'),
            ('driver-system-cpu-usage', 'system_cpu',
             'System CPU usage fraction'),
        )
        for name, attr, ylabel in driver_series:
            genPlot(name, driver_btrace.time, getattr(driver_btrace, attr),
                    'Time in MS', ylabel, None, driver_btrace.tasks,
                    driver_plots_dir)

    if not btracelogs:
        print("No BTrace logs exist.")
        return

    plots_dir = app_info.app_id + '-executor-plots/'
    if not _create_plots_dir(plots_dir):
        return
    # (plot name, BtraceLog attribute, y-axis label); the heap plot is
    # issued separately because it also receives heap_size.
    executor_series = (
        ('non-heap-usage', 'non_heap', 'JVM Non Heap usage (MB)'),
        ('memory-usage', 'memory', 'JVM total memory usage (MB)'),
        ('process-cpu-usage', 'process_cpu', 'JVM CPU usage fraction'),
        ('system-cpu-usage', 'system_cpu', 'System CPU usage fraction'),
    )
    for worker in btracelogs:
        for executor in btracelogs[worker]:
            genPlot('heap-usage', executor.time, executor.heap, 'Time in MS',
                    'JVM Heap usage in (MB)', executor.executor_id,
                    executor.tasks, plots_dir, heap_size)
            for name, attr, ylabel in executor_series:
                genPlot(name, executor.time, getattr(executor, attr),
                        'Time in MS', ylabel, executor.executor_id,
                        executor.tasks, plots_dir)


def _plot_application_mode(app_info, pd):
    """Draw one bar plot per metric, with one bar per executor.

    Plots are written to '<app_id>-plots/' relative to the current working
    directory; nothing is done when that directory already exists.
    """
    # BtraceLogs and GC logs from all executors (GC logs are not used here).
    btracelogs, _ = findLogs(pd)
    if not btracelogs:
        print("No BTrace logs exist.")
        return

    app_plots_dir = app_info.app_id + '-plots/'
    if not _create_plots_dir(app_plots_dir):
        return

    # (BtraceLog attribute, y-axis label, output file name)
    bar_plots = (
        ('process_cpu_variance', 'Process cpu variance',
         'process-cpu-variance.png'),
        ('max_heap', 'Max Heap used (MB)', 'max-heap-usage.png'),
        ('avg_heap', 'Average Heap usage (MB)', 'avg-heap-usage.png'),
        ('avg_non_heap', 'Average non Heap usage (MB)',
         'avg-non-heap-usage.png'),
        ('avg_memory', 'Average process memory usage (MB)',
         'avg-memory-usage.png'),
        ('avg_process_cpu_load', 'Average process cpu load',
         'avg-process-cpu-fraction.png'),
        ('avg_system_cpu_load', 'Average system cpu load',
         'avg-system-cpu-fraction.png'),
    )
    for attr, ylabel, filename in bar_plots:
        # One list of (executor id, value) pairs per worker.
        per_worker = [
            [(executor.executor_id, getattr(executor, attr))
             for executor in btracelogs[worker]]
            for worker in btracelogs
        ]
        genBarPlot(flattenList(per_worker), 'Executor id', ylabel,
                   app_plots_dir + filename)


def _summarize_global_mode(app_info, pd, directory):
    """Aggregate executor metrics into ``app_info`` and write the summary.

    The summary is written to '<pd>/<directory>-global-log.js' through
    ``app_info.create_summary_log``; nothing is done when that file is
    already present in ``pd``.
    """
    summary_name = directory + "-global-log.js"
    if summary_name in listdir(pd):
        print("Result already exists.")
        return

    # BtraceLogs and GC logs from all workers and their executor(s)
    # (GC logs are not used here).
    btracelogs, _ = findLogs(pd)
    if not btracelogs:
        print("No BTrace logs exist.")
        return

    worker_process_cpus = []    # per worker: summed executor CPU load
    process_cpu_variances = []  # per worker: list of executor values
    avg_heaps = []
    avg_non_heaps = []
    avg_memories = []
    max_memories = []           # per worker: maximum over its executors
    max_heaps = []
    max_non_heaps = []
    for worker in btracelogs:
        executors = btracelogs[worker]
        worker_process_cpus.append(
            sum([log.avg_process_cpu_load for log in executors]))
        process_cpu_variances.append(
            [log.process_cpu_variance for log in executors])
        avg_heaps.append([log.avg_heap for log in executors])
        avg_non_heaps.append([log.avg_non_heap for log in executors])
        avg_memories.append([log.avg_memory for log in executors])
        max_memories.append(max([log.max_memory for log in executors]))
        max_heaps.append(max([log.max_heap for log in executors]))
        max_non_heaps.append(max([log.max_non_heap for log in executors]))

    # NOTE: the system CPU load is deliberately not aggregated; we probably
    # want only a system view from a single executor on the worker, not all
    # of them.

    # Mean over workers of each worker's summed executor CPU load.
    app_info.avg_process_cpu_load = _mean(worker_process_cpus)
    # Means over all executors of all workers.
    app_info.avg_process_cpu_variance = _mean(
        flattenList(process_cpu_variances))
    app_info.avg_heap = _mean(flattenList(avg_heaps))
    app_info.avg_non_heap = _mean(flattenList(avg_non_heaps))
    app_info.avg_memory = _mean(flattenList(avg_memories))
    # Peaks reached by any executor: memory (heap + non heap), heap, non heap.
    app_info.max_memory = max(max_memories)
    app_info.max_heap = max(max_heaps)
    app_info.max_non_heap = max(max_non_heaps)

    # Create a json file containing results.
    app_info.create_summary_log(pd + "/" + summary_name)


def main(directory, mode):
    """Mine the logs of one application run.

    ``directory`` is the path of a run directory whose last component has
    the form 'app-ts-id-e-m-c-appName-parameterSpace', e.g.
    'app-20151105221648-0044-29-14-5-cc-5m'; one trailing '/' is ignored.
    The name is split into the application id ('app-ts-id'), the
    configuration id ('e-m-c'), the application name and the remaining
    parameters. Run time, GC time and tasks per second are read from the
    event log found inside the directory (a file named 'app-<x>-<y>').

    ``mode`` selects what is produced:

    * 'executor'    -- time-series plots for the driver and each executor,
    * 'application' -- per-executor bar plots of aggregate metrics,
    * 'global'      -- a '<directory>-global-log.js' summary file.

    Any other mode does nothing after the event log has been read. The
    function always returns None; problems are reported on standard output.
    """
    if directory.endswith("/"):
        directory = directory[:-1]
    # Split into the parent path (kept with its trailing '/') and the run
    # directory name itself.
    head, sep, directory = directory.rpartition("/")
    path = head + sep

    fields = directory.split("-")
    if len(fields) < 7:
        print("Invalid directory format.\n")
        print("Valid is of the form app-ts-id-e-m-c-appName-parameterSpace.\n")
        print("e.g. : app-20151105221648-0044-29-14-5-cc-5m")
        return

    # Parse directory name to get app id, name, and its parameters.
    app_info = appInfo()
    app_info.app_id = "-".join(fields[0:3])
    app_info.conf_id = "-".join(fields[3:6])
    app_info.app_name = fields[6]
    app_info.parameters = "-".join(fields[7:])

    pd = path + directory

    # The event log is the entry named 'app-<x>-<y>' (exactly three
    # dash-separated fields); if several match, the last one listed wins.
    eventlog_fname = ""
    for f in listdir(pd):
        split = f.split('-')
        if split[0] == "app" and len(split) == 3:
            eventlog_fname = join(pd, f)
    if not eventlog_fname:
        # Report the problem instead of handing an empty name to EventLog.
        print("No event log found in " + pd)
        return

    eventlog = EventLog(eventlog_fname)
    app_info.running_time = eventlog.app_runtime
    app_info.gc_time = eventlog.gc_time
    app_info.tasks_per_second = eventlog.tasks_per_second

    if mode == 'executor':
        _plot_executor_mode(app_info, pd)
    elif mode == 'application':
        _plot_application_mode(app_info, pd)
    elif mode == 'global':
        _summarize_global_mode(app_info, pd, directory)