def adaptive_autorange(
    self,
    threshold: float = 0.1,
    *,
    min_run_time: float = 0.01,
    max_run_time: float = 10.0,
    callback: Optional[Callable[[int, float], None]] = None,
) -> common.Measurement:
    """Collect timing replicates until they satisfy a confidence check.

    A block size is estimated once, then blocks of that many executions are
    timed repeatedly via `self._threaded_measurement_loop`. Whenever the loop
    consults the stop hook and more than three replicates are available, a
    provisional `Measurement` is built from the times gathered so far and
    asked whether it `meets_confidence(threshold=threshold)`.

    Args:
        threshold: Forwarded to `Measurement.meets_confidence`.
        min_run_time: Forwarded to the measurement loop (presumably a lower
            bound on total measurement time -- confirm against the loop).
        max_run_time: Forwarded to the measurement loop (presumably an upper
            bound on total measurement time -- confirm against the loop).
        callback: Optional callable taking an int and a float, forwarded to
            the measurement loop. It is annotated as returning `None`;
            `NoReturn` would describe a callable that never returns at all.

    Returns:
        A `Measurement` built from the block size and the times returned by
        the measurement loop.
    """
    # NOTE(review): the block size is estimated with a fixed 0.05 instead of
    # the caller's `min_run_time`. Kept as-is; confirm this is deliberate.
    number = self._estimate_block_size(min_run_time=0.05)

    def time_hook() -> float:
        # Time one block of `number` executions.
        return self._timer.timeit(number)

    def stop_hook(times: List[float]) -> bool:
        # With three or fewer replicates, never report that we can stop.
        if len(times) > 3:
            return common.Measurement(
                number_per_run=number,
                raw_times=times,
                task_spec=self._task_spec
            ).meets_confidence(threshold=threshold)
        return False

    times = self._threaded_measurement_loop(
        number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)

    return common.Measurement(
        number_per_run=number,
        raw_times=times,
        task_spec=self._task_spec
    )
def blocked_autorange(
    self,
    callback: Optional[Callable[[int, float], None]] = None,
    min_run_time: float = 0.2,
) -> common.Measurement:
    """Measure many replicates while keeping timer overhead to a minimum.

    At a high level, blocked_autorange executes the following pseudo-code::

        `setup`

        total_time = 0
        while total_time < min_run_time
            start = timer()
            for _ in range(block_size):
                `stmt`
            total_time += (timer() - start)

    Note the variable `block_size` in the inner loop. The choice of block
    size is important to measurement quality, and must balance two
    competing objectives:

        1) A small block size results in more replicates and generally
           better statistics.

        2) A large block size better amortizes the cost of `timer`
           invocation, and results in a less biased measurement. This is
           important because CUDA synchronization time is non-trivial
           (order single to low double digit microseconds) and would
           otherwise bias the measurement.

    blocked_autorange sets block_size by running a warmup period,
    increasing block size until timer overhead is less than 0.1% of
    the overall computation. This value is then used for the main
    measurement loop.

    Args:
        callback: Optional callable taking an int and a float, forwarded to
            the measurement loop. It is annotated as returning `None`;
            `NoReturn` would describe a callable that never returns at all.
        min_run_time: Passed both to the block size estimate and to the
            measurement loop.

    Returns:
        A `Measurement` object that contains measured runtimes and
        repetition counts, and can be used to compute statistics.
        (mean, median, etc.)
    """
    number = self._estimate_block_size(min_run_time)

    def time_hook() -> float:
        # Time one block of `number` executions.
        return self._timer.timeit(number)

    def stop_hook(times: List[float]) -> bool:
        # No statistical criterion here: this hook always permits stopping,
        # so termination is left to the loop's own run time handling.
        return True

    times = self._threaded_measurement_loop(
        number, time_hook, stop_hook,
        min_run_time=min_run_time,
        callback=callback)

    return common.Measurement(
        number_per_run=number,
        raw_times=times,
        task_spec=self._task_spec
    )
def stop_hook(times: List[float]) -> bool:
    """Report whether the replicates gathered so far pass the confidence check.

    Relies on `number`, `threshold` and `self` from the enclosing scope.
    Returns False outright while there are three or fewer replicates.
    """
    if len(times) <= 3:
        return False

    # Wrap the times collected so far in a provisional Measurement and let it
    # judge whether the requested threshold is met.
    provisional = common.Measurement(
        number_per_run=number,
        raw_times=times,
        task_spec=self._task_spec,
    )
    return provisional.meets_confidence(threshold=threshold)
def timeit(self, number=1000000):
    """Time `number` executions once, after a short warmup, and wrap the result.

    Everything runs inside `common.set_torch_threads` configured with the
    task spec's `num_threads`. The warmup executes one hundredth of `number`
    (floored), but never fewer than one iteration, and its timing is
    discarded.

    Returns:
        A `common.Measurement` holding a single raw time for `number` runs.
    """
    with common.set_torch_threads(self._task_spec.num_threads):
        # Warmup: result intentionally ignored.
        warmup_iterations = max(int(number // 100), 1)
        self._timer.timeit(number=warmup_iterations)

        elapsed = self._timer.timeit(number=number)
        return common.Measurement(
            number_per_run=number,
            raw_times=[elapsed],
            task_spec=self._task_spec,
        )
def timeit(self, number: int = 1000000) -> common.Measurement:
    """Mirrors the semantics of timeit.Timer.timeit().

    Execute the main statement (`stmt`) `number` times.
    https://docs.python.org/3/library/timeit.html#timeit.Timer.timeit

    Before the timed run, a warmup of one hundredth of `number` (floored),
    and at least two iterations, is executed and its timing discarded. Both
    runs happen inside `common.set_torch_threads` configured with the task
    spec's `num_threads`.

    Returns:
        A `common.Measurement` holding a single raw time for `number` runs.
    """
    with common.set_torch_threads(self._task_spec.num_threads):
        # Warmup: result intentionally ignored.
        warmup_iterations = max(int(number // 100), 2)
        self._timeit(number=warmup_iterations)

        elapsed = self._timeit(number=number)
        return common.Measurement(
            number_per_run=number,
            raw_times=[elapsed],
            task_spec=self._task_spec,
        )
def blocked_autorange(self, callback=None, min_run_time=0.2):
    """Time repeated blocks of executions and return them as a Measurement.

    The block size comes from `self._estimate_block_size(min_run_time)`.
    `self._threaded_measurement_loop` then drives the timing, using a hook
    that times one block and a stop hook that always returns True.

    Args:
        callback: Forwarded unchanged to the measurement loop.
        min_run_time: Passed both to the block size estimate and to the
            measurement loop.

    Returns:
        A `common.Measurement` built from the block size and the times
        returned by the measurement loop.
    """
    block_size = self._estimate_block_size(min_run_time)

    def _time_one_block() -> float:
        return self._timer.timeit(block_size)

    def _always_stop(times) -> bool:
        # No statistical criterion: stopping is always permitted.
        return True

    collected = self._threaded_measurement_loop(
        block_size,
        _time_one_block,
        _always_stop,
        min_run_time=min_run_time,
        callback=callback,
    )
    return common.Measurement(
        number_per_run=block_size,
        raw_times=collected,
        task_spec=self._task_spec,
    )