Пример #1
0
    def get_simulated_runtime(self, waterfall_prefix=""):
        """ Returns the simulated runtime for the job.

        Per the original author, this should be approximately the same as the original
        runtime of the job, except that it doesn't include scheduler delay.

        Stages whose id is in self.stages_to_combine are pooled and simulated together as
        one group; every other stage is simulated by itself.  The result is the sum of the
        simulated runtimes of all of those groups.

        If a non-empty waterfall_prefix is passed in, the simulated (start, finish) times
        are handed to self.write_simulated_waterfall under the name
        "<waterfall_prefix>_simulated".
        """
        def by_start_time(task):
            return task.start_time

        elapsed = 0
        pooled_tasks = []
        waterfall_rows = []
        for stage_id, stage in self.stages.iteritems():
            if stage_id in self.stages_to_combine:
                # Combined stages are simulated together after the loop.
                pooled_tasks.extend(stage.tasks)
                continue
            ordered = sorted(stage.tasks, key=by_start_time)
            stage_runtime, spans = simulate.simulate([task.runtime() for task in ordered])
            # Offset this stage's times by the runtime accumulated from earlier stages.
            waterfall_rows.append([(begin + elapsed, end + elapsed) for begin, end in spans])
            elapsed += stage_runtime

        if pooled_tasks:
            ordered = sorted(pooled_tasks, key=by_start_time)
            pooled_runtime, spans = simulate.simulate([task.runtime() for task in ordered])
            # NOTE(review): unlike the per-stage case above, these times are shifted back by
            # the group's own simulated runtime rather than forward by the accumulated
            # runtime -- confirm that this placement is intended.
            waterfall_rows.append(
                [(begin - pooled_runtime, end - pooled_runtime) for begin, end in spans])
            elapsed += pooled_runtime

        if waterfall_prefix:
            self.write_simulated_waterfall(waterfall_rows, "%s_simulated" % waterfall_prefix)
        return elapsed
Пример #2
0
    def replace_95_stragglers_with_median_speedup(self):
        """ Returns the no-straggler simulated runtime as a fraction of the simulated runtime.

        For each stage, every task runtime at or above the value that get_percentile
        returns for 0.95 is replaced with the value get_percentile returns for 0.5.
        Stages in self.stages_to_combine are pooled and simulated together; every other
        stage is simulated on its own and a summary line is printed for it.
        """
        no_straggler_total = 0
        pooled_runtimes = []
        for stage_id, stage in self.stages.iteritems():
            ordered_runtimes = sorted(task.runtime() for task in stage.tasks)
            median = get_percentile(ordered_runtimes, 0.5)
            cutoff = get_percentile(ordered_runtimes, 0.95)
            capped = [median if runtime >= cutoff else runtime for runtime in ordered_runtimes]
            if stage_id in self.stages_to_combine:
                pooled_runtimes.extend(capped)
                continue
            stage_no_straggler = simulate.simulate(capped)[0]
            no_straggler_total += stage_no_straggler
            # The original runtime is simulated from the tasks in their stored order.
            stage_original = simulate.simulate([task.runtime() for task in stage.tasks])[0]
            print("%s: Orig: %s, no stragg: %s" % (stage_id, stage_original, stage_no_straggler))
        if pooled_runtimes:
            no_straggler_total += simulate.simulate(pooled_runtimes)[0]
        return no_straggler_total * 1.0 / self.get_simulated_runtime()
Пример #3
0
  def replace_stragglers_with_median_speedup(self, threshold_fn):
    """ Returns the no-straggler simulated runtime as a fraction of the simulated runtime.

    For each stage, passes the list of task runtimes into threshold_fn, which should
    return a threshold runtime. Every task whose runtime is greater than or equal to that
    threshold is counted as a straggler and has its runtime replaced with the stage's
    median runtime.

    Stages in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; every other stage is simulated on its own using the
    maximum concurrency observed for its tasks.  A summary line is printed for each
    simulated group.

    For example, to replace the tasks with the longest 5% of runtimes with the median:
      self.replace_stragglers_with_median_speedup(lambda runtimes: numpy.percentile(runtimes, 95))
    """
    self.print_heading("Computing speedup from replacing straggler tasks with median")
    total_no_stragglers_runtime = 0
    start_and_runtimes_for_combined_stages = []
    original_start_and_runtimes_for_combined_stages = []
    num_stragglers_combined_stages = 0
    for stage_id, stage in self.stages.iteritems():
      runtimes = [task.runtime() for task in stage.tasks]
      median_runtime = numpy.percentile(runtimes, 50)
      threshold_runtime = threshold_fn(runtimes)
      no_straggler_start_and_runtimes = []
      num_stragglers = 0
      sorted_stage_tasks = sorted(stage.tasks, key = lambda t: t.runtime())
      for task in sorted_stage_tasks:
        if task.runtime() >= threshold_runtime:
          # Sanity check: replacing a straggler must never make the task slower.
          assert(median_runtime <= task.runtime())
          no_straggler_start_and_runtimes.append((task.start_time, median_runtime))
          num_stragglers += 1
        else:
          no_straggler_start_and_runtimes.append((task.start_time, task.runtime()))
      if stage_id in self.stages_to_combine:
        start_and_runtimes_for_combined_stages.extend(no_straggler_start_and_runtimes)
        original_start_and_runtimes_for_combined_stages.extend(
          [(t.start_time, t.runtime()) for t in stage.tasks])
        num_stragglers_combined_stages += num_stragglers
      else:
        max_concurrency = concurrency.get_max_concurrency(stage.tasks)
        no_stragglers_runtime = simulate.simulate(
          [x[1] for x in no_straggler_start_and_runtimes], max_concurrency)[0]
        total_no_stragglers_runtime += no_stragglers_runtime
        original_runtime = simulate.simulate(
          [task.runtime() for task in sorted_stage_tasks], max_concurrency)[0]
        print ("%s: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
          (stage_id, stage.finish_time() - stage.start_time, original_runtime,
           no_stragglers_runtime, num_stragglers))
    if len(start_and_runtimes_for_combined_stages) > 0:
      # The "Original" wall-clock span must come from the unmodified runtimes; using the
      # straggler-replaced list would understate the original finish time.
      original_start_time = min(
        [x[0] for x in original_start_and_runtimes_for_combined_stages])
      original_finish_time = max(
        [x[0] + x[1] for x in original_start_and_runtimes_for_combined_stages])
      # Simulate the pooled tasks in start-time order.
      start_and_runtimes_for_combined_stages.sort()
      runtimes_for_combined_stages = [x[1] for x in start_and_runtimes_for_combined_stages]
      new_runtime = simulate.simulate(
        runtimes_for_combined_stages, self.combined_stages_concurrency)[0]
      original_runtime = simulate.simulate(
        [x[1] for x in sorted(original_start_and_runtimes_for_combined_stages)],
        self.combined_stages_concurrency)[0]
      print ("Combined: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
        (original_finish_time - original_start_time, original_runtime, new_runtime,
         num_stragglers_combined_stages))
      total_no_stragglers_runtime += new_runtime
    return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()
Пример #4
0
  def replace_stragglers_with_median_speedup(self, threshold_fn):
    """ Returns the no-straggler simulated runtime as a fraction of the simulated runtime.

    For each stage, passes the list of task runtimes into threshold_fn, which should
    return a threshold runtime. Every task whose runtime is greater than or equal to that
    threshold is counted as a straggler and has its runtime replaced with the stage's
    median runtime.

    Stages in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; every other stage is simulated on its own using the
    maximum concurrency observed for its tasks.  A summary line is printed for each
    simulated group.

    For example, to replace the tasks with the longest 5% of runtimes with the median:
      self.replace_stragglers_with_median_speedup(lambda runtimes: numpy.percentile(runtimes, 95))
    """
    self.print_heading("Computing speedup from replacing straggler tasks with median")
    total_no_stragglers_runtime = 0
    start_and_runtimes_for_combined_stages = []
    original_start_and_runtimes_for_combined_stages = []
    num_stragglers_combined_stages = 0
    for stage_id, stage in self.stages.iteritems():
      runtimes = [task.runtime() for task in stage.tasks]
      median_runtime = numpy.percentile(runtimes, 50)
      threshold_runtime = threshold_fn(runtimes)
      no_straggler_start_and_runtimes = []
      num_stragglers = 0
      sorted_stage_tasks = sorted(stage.tasks, key = lambda t: t.runtime())
      for task in sorted_stage_tasks:
        if task.runtime() >= threshold_runtime:
          # Sanity check: replacing a straggler must never make the task slower.
          assert(median_runtime <= task.runtime())
          no_straggler_start_and_runtimes.append((task.start_time, median_runtime))
          num_stragglers += 1
        else:
          no_straggler_start_and_runtimes.append((task.start_time, task.runtime()))
      if stage_id in self.stages_to_combine:
        start_and_runtimes_for_combined_stages.extend(no_straggler_start_and_runtimes)
        original_start_and_runtimes_for_combined_stages.extend(
          [(t.start_time, t.runtime()) for t in stage.tasks])
        num_stragglers_combined_stages += num_stragglers
      else:
        max_concurrency = concurrency.get_max_concurrency(stage.tasks)
        no_stragglers_runtime = simulate.simulate(
          [x[1] for x in no_straggler_start_and_runtimes], max_concurrency)[0]
        total_no_stragglers_runtime += no_stragglers_runtime
        original_runtime = simulate.simulate(
          [task.runtime() for task in sorted_stage_tasks], max_concurrency)[0]
        print ("%s: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
          (stage_id, stage.finish_time() - stage.start_time, original_runtime,
           no_stragglers_runtime, num_stragglers))
    if len(start_and_runtimes_for_combined_stages) > 0:
      # The "Original" wall-clock span must come from the unmodified runtimes; using the
      # straggler-replaced list would understate the original finish time.
      original_start_time = min(
        [x[0] for x in original_start_and_runtimes_for_combined_stages])
      original_finish_time = max(
        [x[0] + x[1] for x in original_start_and_runtimes_for_combined_stages])
      # Simulate the pooled tasks in start-time order.
      start_and_runtimes_for_combined_stages.sort()
      runtimes_for_combined_stages = [x[1] for x in start_and_runtimes_for_combined_stages]
      new_runtime = simulate.simulate(
        runtimes_for_combined_stages, self.combined_stages_concurrency)[0]
      original_runtime = simulate.simulate(
        [x[1] for x in sorted(original_start_and_runtimes_for_combined_stages)],
        self.combined_stages_concurrency)[0]
      print ("Combined: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
        (original_finish_time - original_start_time, original_runtime, new_runtime,
         num_stragglers_combined_stages))
      total_no_stragglers_runtime += new_runtime
    return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()
Пример #5
0
 def fraction_time_waiting_on_compute(self):
   """ Returns the share of total task runtime that exceeds the no-compute runtime.

   For every task in every stage, runtime() minus runtime_no_compute() is accumulated and
   the sum is divided by the sum of all task runtimes.  Raises ZeroDivisionError when the
   summed runtime is zero (for example, when there are no tasks).
   """
   waited = 0
   elapsed = 0
   every_task = [task for stage in self.stages.values() for task in stage.tasks]
   for task in every_task:
     waited += (task.runtime() - task.runtime_no_compute())
     elapsed += task.runtime()
   return waited * 1.0 / elapsed
Пример #6
0
 def fraction_time_waiting_on_compute(self):
   """ Returns summed (runtime - runtime_no_compute) divided by summed runtime, over all tasks.

   Iterates every task of every stage in self.stages.  Raises ZeroDivisionError when the
   summed runtime is zero.
   """
   compute_wait = 0
   runtime_sum = 0
   for task in (t for stage in self.stages.values() for t in stage.tasks):
     compute_wait += (task.runtime() - task.runtime_no_compute())
     runtime_sum += task.runtime()
   fraction = compute_wait * 1.0 / runtime_sum
   return fraction
Пример #7
0
 def fraction_time_waiting_on_disk(self):
     """ Returns the share of total task runtime that exceeds the no-shuffle-disk runtime.

     For every task in every stage, runtime() minus runtime_no_disk_for_shuffle() is
     accumulated and the sum is divided by the sum of all task runtimes.  Raises
     ZeroDivisionError when the summed runtime is zero.
     """
     disk_wait = 0
     elapsed = 0
     every_task = [task for stage in self.stages.values() for task in stage.tasks]
     for task in every_task:
         disk_wait += task.runtime() - task.runtime_no_disk_for_shuffle()
         elapsed += task.runtime()
     return disk_wait * 1.0 / elapsed
Пример #8
0
  def get_simulated_runtime(self, waterfall_prefix=""):
    """ Returns the simulated runtime for the job.

    Per the original author, this should be approximately the same as the original
    runtime of the job, except that it doesn't include scheduler delay.

    Stages whose id is in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; every other stage is simulated by itself using the
    value concurrency.get_max_concurrency returns for its tasks.  The result is the sum of
    the simulated runtimes of all of those groups.

    If a non-empty waterfall_prefix is passed in, the simulated (start, finish) times are
    handed to self.write_simulated_waterfall under the name "<waterfall_prefix>_simulated".
    """
    def by_start_time(task):
      return task.start_time

    elapsed = 0
    pooled_tasks = []
    waterfall_rows = []
    for stage_id, stage in self.stages.iteritems():
      if stage_id in self.stages_to_combine:
        # Combined stages are simulated together after the loop.
        pooled_tasks.extend(stage.tasks)
        continue
      ordered = sorted(stage.tasks, key = by_start_time)
      stage_runtimes = [t.runtime() for t in ordered]
      stage_concurrency = concurrency.get_max_concurrency(ordered)
      stage_runtime, spans = simulate.simulate(stage_runtimes, stage_concurrency)
      # Offset this stage's times by the runtime accumulated from earlier stages.
      waterfall_rows.append([(begin + elapsed, end + elapsed) for begin, end in spans])
      elapsed += stage_runtime

    if pooled_tasks:
      ordered = sorted(pooled_tasks, key = by_start_time)
      pooled_runtime, spans = simulate.simulate(
        [task.runtime() for task in ordered], self.combined_stages_concurrency)
      # NOTE(review): unlike the per-stage case above, these times are shifted back by the
      # group's own simulated runtime rather than forward by the accumulated runtime --
      # confirm that this placement is intended.
      waterfall_rows.append(
        [(begin - pooled_runtime, end - pooled_runtime) for begin, end in spans])
      elapsed += pooled_runtime

    if waterfall_prefix:
      self.write_simulated_waterfall(waterfall_rows, "%s_simulated" % waterfall_prefix)
    return elapsed
Пример #9
0
 def fraction_time_computing(self):
   """ Returns summed task compute_time() divided by summed task runtime(), over all stages.

   Raises ZeroDivisionError when the summed runtime is zero (for example, no tasks).
   """
   computing = 0
   elapsed = 0
   every_task = [task for stage in self.stages.values() for task in stage.tasks]
   for task in every_task:
     computing += task.compute_time()
     elapsed += task.runtime()
   return computing * 1.0 / elapsed
Пример #10
0
 def fraction_time_computing(self):
     """ Returns the fraction of total task runtime reported as compute time.

     Sums compute_time() and runtime() over every task of every stage in self.stages and
     divides the former by the latter.  Raises ZeroDivisionError when the summed runtime
     is zero.
     """
     compute_sum = 0
     runtime_sum = 0
     for task in (t for stage in self.stages.values() for t in stage.tasks):
         compute_sum += task.compute_time()
         runtime_sum += task.runtime()
     fraction = compute_sum * 1.0 / runtime_sum
     return fraction
Пример #11
0
 def fraction_time_deserializing(self):
   """ Returns the fraction of time spent deserializing data.

   Sums each task's estimated_deserialization_millis and divides by the sum of all task
   runtimes.  Raises ZeroDivisionError when the summed runtime is zero.
   """
   deserializing = 0
   elapsed = 0
   every_task = [task for stage in self.stages.values() for task in stage.tasks]
   for task in every_task:
     deserializing += task.estimated_deserialization_millis
     elapsed += task.runtime()
   return deserializing * 1.0 / elapsed
Пример #12
0
 def fraction_time_serializing(self):
   """ Returns the fraction of time spent serializing and deserializing data.

   For every task, the estimated serialization and deserialization times are added
   together; a warning is printed whenever that sum exceeds the task's compute time.
   The result is the summed (de)serialization time divided by the summed task runtime.
   Raises ZeroDivisionError when the summed runtime is zero.
   """
   serde_sum = 0
   runtime_sum = 0
   for task in (t for stage in self.stages.values() for t in stage.tasks):
     task_serde = task.estimated_serialization_millis + task.estimated_deserialization_millis
     if task_serde > task.compute_time():
       warning = ("!!!! Warning: For task %s, serialize time (%s) is larger than compute time (%s)" %
         (task, task_serde, task.compute_time()))
       print (warning)
     serde_sum += task_serde
     runtime_sum += task.runtime()
   return serde_sum * 1.0 / runtime_sum
Пример #13
0
 def fraction_time_using_disk(self):
   """ Fraction of task time spent writing shuffle outputs to disk and reading them back.

   Per the original author, this does not include the time to spill data to disk (which
   is fine for now because that feature is turned off by default), nor the time to
   persist result data to disk (if that happens).

   Sums disk_time() and runtime() per stage, logs the per-stage totals at debug level,
   and returns the overall disk time divided by the overall runtime.  Raises
   ZeroDivisionError when the overall runtime is zero.
   """
   job_disk_time = 0
   job_runtime = 0
   for stage_id, stage in self.stages.iteritems():
     # One pass over the tasks; each entry is (disk time, runtime).
     per_task = [(task.disk_time(), task.runtime()) for task in stage.tasks]
     stage_disk_time = sum(entry[0] for entry in per_task)
     stage_runtime = sum(entry[1] for entry in per_task)
     self.logger.debug("Stage %s: Disk write time: %s, total runtime: %s" %
       (stage_id, stage_disk_time, stage_runtime))
     job_disk_time += stage_disk_time
     job_runtime += stage_runtime
   return job_disk_time * 1.0 / job_runtime
Пример #14
0
 def fraction_time_using_disk(self):
   """ Fraction of task time spent writing shuffle outputs to disk and reading them back.

   Per the original author, this does not include the time to spill data to disk (which
   is fine for now because that feature is turned off by default), nor the time to
   persist result data to disk (if that happens).

   Per-stage totals of disk_time() and runtime() are logged at debug level; the return
   value is the overall disk time divided by the overall runtime.  Raises
   ZeroDivisionError when the overall runtime is zero.
   """
   disk_sum = 0
   runtime_sum = 0
   for stage_id, stage in self.stages.iteritems():
     stage_disk = 0
     stage_runtime = 0
     for task in stage.tasks:
       stage_disk = stage_disk + task.disk_time()
       stage_runtime = stage_runtime + task.runtime()
     message = ("Stage %s: Disk write time: %s, total runtime: %s" %
       (stage_id, stage_disk, stage_runtime))
     self.logger.debug(message)
     disk_sum = disk_sum + stage_disk
     runtime_sum = runtime_sum + stage_runtime
   return disk_sum * 1.0 / runtime_sum
Пример #15
0
    def replace_stragglers_with_median_speedup(self):
        """ Returns the all-median simulated runtime as a fraction of the simulated runtime.

        Removes stragglers by replacing all task runtimes with the median runtime for
        tasks in the stage.  Stages in self.stages_to_combine are pooled and simulated
        together; every other stage is simulated on its own.
        """
        total_no_stragglers_runtime = 0
        runtimes_for_combined_stages = []
        for stage_id, stage in self.stages.iteritems():
            runtimes = [task.runtime() for task in stage.tasks]
            # Compute the median once and reuse it for every task in the stage.
            median_runtime = numpy.median(runtimes)
            no_straggler_runtimes = [median_runtime] * len(stage.tasks)
            if stage_id in self.stages_to_combine:
                runtimes_for_combined_stages.extend(no_straggler_runtimes)
            else:
                total_no_stragglers_runtime += simulate.simulate(no_straggler_runtimes)[0]
        if len(runtimes_for_combined_stages) > 0:
            total_no_stragglers_runtime += simulate.simulate(runtimes_for_combined_stages)[0]
        return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()
Пример #16
0
  def replace_all_tasks_with_median_speedup(self):
    """ Returns the all-median simulated runtime as a fraction of the simulated runtime.

    Removes stragglers by replacing all task runtimes with the median runtime for tasks in
    the stage.  Stages in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; every other stage is simulated on its own using the
    value concurrency.get_max_concurrency returns for its tasks.
    """
    total_no_stragglers_runtime = 0
    runtimes_for_combined_stages = []
    for stage_id, stage in self.stages.iteritems():
      runtimes = [task.runtime() for task in stage.tasks]
      # Compute the median once and reuse it for every task in the stage.
      median_runtime = numpy.median(runtimes)
      no_straggler_runtimes = [median_runtime] * len(stage.tasks)
      if stage_id in self.stages_to_combine:
        runtimes_for_combined_stages.extend(no_straggler_runtimes)
      else:
        total_no_stragglers_runtime += simulate.simulate(
          no_straggler_runtimes, concurrency.get_max_concurrency(stage.tasks))[0]
    if len(runtimes_for_combined_stages) > 0:
      total_no_stragglers_runtime += simulate.simulate(
        runtimes_for_combined_stages, self.combined_stages_concurrency)[0]
    return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()