def main(directory_list):
  """Build comparison plots across the applications named in a list file.

  directory_list is the path of a text file holding one directory per line.
  Every entry of those directories whose name contains '.js' is loaded with
  appInfo.buildFromJson and appended to the list of applications.

  A 'configuration-comparison-plots-<unix time>/' directory is created next
  to the list file. It receives one bar plot per summary metric, a memory
  usage plot, and two scatter plots (memory efficiency and CPU load, each
  carrying the normalized running time).

  Returns None. If no application summary is found, a message is printed and
  no plot is generated.
  """
  from os.path import dirname  # function-scope import: only needed here

  # Join with a real path separator: plain concatenation used to glue the
  # plot directory name onto the last component of the list file's directory.
  plot_dir = join(dirname(directory_list),
                  'configuration-comparison-plots-' + str(int(time.time()))) + '/'
  mkdir(plot_dir)

  applications = []

  # The context manager closes the list file even when loading fails.
  with open(directory_list, 'r') as fh:
    for line in fh:
      directory = line.rstrip('\n\r')
      if not directory.strip():
        continue  # a blank line would otherwise end up in listdir('')

      #TODO: make this parser more tolerant to unexpected JS files, placement errors, etc
      for f in listdir(directory):
        if '.js' in f:
          print(f)
          app = appInfo()
          applications.append(app.buildFromJson(join(directory, f)))

  if not applications:
    # min()/max() further down raise on an empty sequence.
    print('No application summaries found.')
    return

  # attribute read from each application -> y-axis label of its bar plot
  plots = {'running_time'             : 'running time (MS)',
           'gc_time'                  : 'total time spent in GC (MS)',
           'max_heap'                 : 'max heap usage (MB)',
           'avg_process_cpu_load'     : 'average cpu load',
           'avg_heap'                 : 'average heap usage (MB)',
           'tasks_per_second'         : 'tasks per second',
           'avg_process_cpu_variance' : 'average cpu variance',
           'gc_to_rt'                 : 'fraction of time spent in GC (MS)'}

  for plot, label in plots.items():
    indicators = [(app.conf_id, getattr(app, plot)) for app in applications]
    genBarPlot(indicators, 'Configurations', label, plot_dir + plot + '.png')

  memories = []
  mem_efficiencies = []
  cpu_loads = []
  for app in applications:
    mem = int(app.conf_id.split('-')[1]) * 1000 # executor heap size in MB
    # float() keeps the ratio fractional even with Python 2 integer operands.
    mem_efficiency = app.max_heap / float(mem)
    running_time = int(app.running_time)

    memories.append((app.conf_id, mem, app.max_heap))
    mem_efficiencies.append((app.conf_id, mem_efficiency, running_time))
    cpu_loads.append((app.conf_id, app.avg_process_cpu_load, running_time))

  # Running-time range across all applications, used to normalize both scatter plots.
  running_times = [rt for (_, _, rt) in cpu_loads]
  min_rt = min(running_times)
  max_rt = max(running_times)

  cpu_loads = [(app_id, cpu_load, normalize(rt, min_rt, max_rt))
               for (app_id, cpu_load, rt) in cpu_loads]
  mem_efficiencies = [(app_id, efficiency, normalize(rt, min_rt, max_rt))
                      for (app_id, efficiency, rt) in mem_efficiencies]

  genMemoryUsagePlot(memories, plot_dir + 'mem-efficiency.png')

  scatterPlot(mem_efficiencies,
              'Normalized running time (MS)',
              'Max heap usage over heap size',
              plot_dir + 'mem-scatter.png')

  scatterPlot(cpu_loads,
              'Normalized running time (MS)',
              'CPU load',
              plot_dir + 'cpu-scatter.png')
Пример #2
0
def _claim_plot_dir(plot_dir):
  """Create plot_dir and return True; if it already exists, report it and return False."""
  if isdir(plot_dir):
    print('app already mined')
    return False
  mkdir(plot_dir)
  return True


def _plot_driver(driver_btrace, plot_dir):
  """Draw the driver's heap, non-heap, total memory and CPU series with genPlot."""
  series = (
    ('driver-heap-usage', driver_btrace.heap, 'JVM Heap usage (MB)'),
    ('driver-non-heap-usage', driver_btrace.non_heap, 'JVM non Heap usage (MB)'),
    ('driver-memory-usage', driver_btrace.memory, 'JVM total memory usage (MB)'),
    ('driver-process-cpu-usage', driver_btrace.process_cpu, 'JVM CPU usage fraction'),
    ('driver-system-cpu-usage', driver_btrace.system_cpu, 'System CPU usage fraction'),
  )
  for name, values, label in series:
    genPlot(name, driver_btrace.time, values,
            'Time in MS', label, None, driver_btrace.tasks, plot_dir)


def _plot_executors(btracelogs, plot_dir, heap_size):
  """Draw the same five series for every executor of every worker."""
  for worker in btracelogs:
    for executor in btracelogs[worker]:
      # Only the heap plot is given the configured heap size.
      genPlot('heap-usage', executor.time, executor.heap,
              'Time in MS', 'JVM Heap usage in (MB)',
              executor.executor_id, executor.tasks, plot_dir, heap_size)

      series = (
        ('non-heap-usage', executor.non_heap, 'JVM Non Heap usage (MB)'),
        ('memory-usage', executor.memory, 'JVM total memory usage (MB)'),
        ('process-cpu-usage', executor.process_cpu, 'JVM CPU usage fraction'),
        ('system-cpu-usage', executor.system_cpu, 'System CPU usage fraction'),
      )
      for name, values, label in series:
        genPlot(name, executor.time, values,
                'Time in MS', label, executor.executor_id, executor.tasks, plot_dir)


def _plot_application(btracelogs, plot_dir):
  """Draw one bar plot per executor-level metric, with one bar per executor."""
  # (executor attribute, y-axis label, output file name)
  metrics = (
    ('process_cpu_variance', 'Process cpu variance', 'process-cpu-variance.png'),
    ('max_heap', 'Max Heap used (MB)', 'max-heap-usage.png'),
    ('avg_heap', 'Average Heap usage (MB)', 'avg-heap-usage.png'),
    ('avg_non_heap', 'Average non Heap usage (MB)', 'avg-non-heap-usage.png'),
    ('avg_memory', 'Average process memory usage (MB)', 'avg-memory-usage.png'),
    ('avg_process_cpu_load', 'Average process cpu load', 'avg-process-cpu-fraction.png'),
    ('avg_system_cpu_load', 'Average system cpu load', 'avg-system-cpu-fraction.png'),
  )
  for attribute, label, file_name in metrics:
    per_worker = [[(executor.executor_id, getattr(executor, attribute))
                   for executor in btracelogs[worker]]
                  for worker in btracelogs]
    genBarPlot(flattenList(per_worker), 'Executor id', label, plot_dir + file_name)
  # NOTE: a per-executor 'memory-efficiency.png' plot (genMemoryPlot) used to be
  # sketched here inside a disabled string literal; it was never executed.


def _summarize_global(app_info, btracelogs):
  """Store application-wide metrics on app_info.

  Returns False, leaving app_info untouched, when no executor log is available.
  """
  # Workers without any executor log are ignored: max() of an empty list raises.
  worker_logs = [btracelogs[worker] for worker in btracelogs if btracelogs[worker]]
  executors = [log for logs in worker_logs for log in logs]
  if not executors:
    return False

  def mean(values):
    return sum(values) / len(values)

  # Executor CPU loads are summed per worker, then averaged over the workers.
  app_info.avg_process_cpu_load = mean(
    [sum([log.avg_process_cpu_load for log in logs]) for logs in worker_logs])

  # The remaining averages are taken over all executors of all workers.
  app_info.avg_process_cpu_variance = mean([log.process_cpu_variance for log in executors])
  app_info.avg_heap = mean([log.avg_heap for log in executors])
  app_info.avg_non_heap = mean([log.avg_non_heap for log in executors])
  app_info.avg_memory = mean([log.avg_memory for log in executors])

  # Peaks reached by any executor.
  app_info.max_memory = max([log.max_memory for log in executors])
  app_info.max_heap = max([log.max_heap for log in executors])
  app_info.max_non_heap = max([log.max_non_heap for log in executors])
  return True


def main(directory, mode):
  """Mine the logs of one application run and produce plots or a summary.

  directory -- path of the run directory; its last component must look like
               app-ts-id-e-m-c-appName-parameterSpace
               (e.g. app-20151105221648-0044-29-14-5-cc-5m).
  mode      -- 'executor':    time-series plots for the driver and every executor
               'application': bar plots comparing the executors of the run
               'global':      metrics aggregated over all executors, written to
                              <directory>/<name>-global-log.js

  Plot directories are named after the application id and are created
  relative to the current working directory. Returns None in every case;
  problems are reported with print.
  """
  # Drop one trailing slash so the last path component is the directory name.
  if directory.endswith("/"):
    directory = directory[:-1]
  pd = directory                      # path of the run directory
  directory = pd.split("/")[-1]       # bare directory name

  lst = directory.split("-")
  if len(lst) < 7:
    print("Invalid directory format.\n")
    print("Valid is of the form app-ts-id-e-m-c-appName-parameterSpace.\n")
    print("e.g. : app-20151105221648-0044-29-14-5-cc-5m")
    return

  # parse directory name to get app id, name, and its parameters.
  app_info = appInfo()
  app_info.app_id = "-".join(lst[0:3])
  app_info.conf_id = "-".join(lst[3:6])
  app_info.app_name = lst[6]
  app_info.parameters = "-".join(lst[7:])

  # The event log is the entry named app-<x>-<y>; the last match wins.
  # NOTE(review): if none matches, EventLog receives "" - confirm how it reacts.
  eventlog_fname = ""
  for f in listdir(pd):
    split = f.split('-')
    if split[0] == "app" and len(split) == 3:
      eventlog_fname = join(pd, f)

  # EventLog
  eventlog = EventLog(eventlog_fname)
  app_info.running_time = eventlog.app_runtime
  app_info.gc_time = eventlog.gc_time
  app_info.tasks_per_second = eventlog.tasks_per_second

  if mode == 'executor':
    # BtraceLogs from all executors (the GC logs returned alongside are unused)
    btracelogs, _ = findLogs(pd)

    # second conf_id field times 1000; presumably the executor heap size in MB
    heap_size = int(app_info.conf_id.split('-')[1]) * 1000

    # Driver log: the last entry whose name contains '.btrace'
    driver_btrace = None
    for f in listdir(pd):
      if '.btrace' in f:
        driver_btrace = BtraceLog(join(pd, f))

    if driver_btrace is None:
      # Without this guard the driver plots fail on None.time
      print("No driver BTrace log exists.")
    else:
      driver_plots_dir = app_info.app_id + '-driver-plots/'
      if not _claim_plot_dir(driver_plots_dir):
        return
      _plot_driver(driver_btrace, driver_plots_dir)

    if not btracelogs:
      print("No BTrace logs exist.")
      return

    plots_dir = app_info.app_id + '-executor-plots/'
    if not _claim_plot_dir(plots_dir):
      return
    _plot_executors(btracelogs, plots_dir, heap_size)

  elif mode == 'application':
    btracelogs, _ = findLogs(pd)
    if not btracelogs:
      print("No BTrace logs exist.")
      return

    app_plots_dir = app_info.app_id + '-plots/'
    if not _claim_plot_dir(app_plots_dir):
      return
    _plot_application(btracelogs, app_plots_dir)

  elif mode == 'global':
    summary_name = directory + "-global-log.js"
    if summary_name in listdir(pd):
      print("Result already exists.")
      return

    # BtraceLogs from all workers and their executor(s)
    btracelogs, _ = findLogs(pd)
    if _summarize_global(app_info, btracelogs):
      # Create a json file containing results
      app_info.create_summary_log(pd + "/" + summary_name)
    else:
      print("No BTrace logs exist.")

  else:
    # Previously an unrecognized mode parsed the event log and silently did nothing.
    print("Unknown mode: %s" % mode)