def __call__(self, *args, **kwargs):
    """Call the wrapped function, retrying on retryable exceptions.

    Positional and keyword arguments are forwarded to the wrapped function,
    except for the retry-control keyword arguments listed below, which are
    removed from ``kwargs`` before the call.

    Args:
        retry_timedelta (kwarg): amount of time to keep retrying before
            giving up. Defaults to the instance setting, or 365 days when
            that is None.
        num_retries (kwarg): maximum number of retries. Defaults to the
            instance setting, or 1000000 when that is None. Forced to 0
            when the WANDB_TEST environment variable is set to a
            non-empty value.
        retry_sleep_base (kwarg): number of seconds to sleep upon first
            failure; all other sleeps are derived from this one.
            Defaults to 1.
        check_retry_fn (kwarg): callable given the caught exception; when
            it returns a falsy value the exception is re-raised without
            retrying. Defaults to the instance setting.

    Returns:
        Whatever the wrapped function returns.

    Raises:
        The caught retryable exception, when ``check_retry_fn`` rejects
        it, when ``retry_timedelta`` has elapsed, or when ``num_retries``
        retries have already been made.
    """
    retry_timedelta = kwargs.pop('retry_timedelta', self._retry_timedelta)
    if retry_timedelta is None:
        retry_timedelta = datetime.timedelta(days=365)

    num_retries = kwargs.pop('num_retries', self._num_retries)
    if num_retries is None:
        num_retries = 1000000
    if os.environ.get('WANDB_TEST'):
        num_retries = 0

    sleep = kwargs.pop('retry_sleep_base', 1)
    # An extra function to allow performing more logic on the filtered
    # exceptions.
    check_retry_fn = kwargs.pop('check_retry_fn', self._check_retry_fn)

    start_time = datetime.datetime.now()
    now = start_time
    self._num_iter = 0

    while True:
        # Keep the try body limited to the wrapped call: a retryable
        # exception raised by the success-path logging must not cause an
        # already-successful call to be invoked again.
        try:
            result = self._call_fn(*args, **kwargs)
        except self._retryable_exceptions as e:
            # If the secondary check fails, re-raise.
            if not check_retry_fn(e):
                raise
            if (datetime.datetime.now() - start_time >= retry_timedelta
                    or self._num_iter >= num_retries):
                raise
            # Announce the retry loop once, on the third failure.
            if self._num_iter == 2:
                logger.exception('Retry attempt failed:')
                wandb.termlog(
                    '{} ({}), entering retry loop. See {} for full traceback.'
                    .format(self._error_prefix, e.__class__.__name__,
                            util.get_log_file_path()))
            # Exponential backoff with up to 25% random jitter; the base
            # delay is capped at MAX_SLEEP_SECONDS for subsequent sleeps.
            time.sleep(sleep + random.random() * 0.25 * sleep)
            sleep = min(sleep * 2, self.MAX_SLEEP_SECONDS)
            now = datetime.datetime.now()
            self._num_iter += 1
        else:
            # Only print resolved attempts once every minute.
            if (self._num_iter > 2 and
                    now - self._last_print > datetime.timedelta(minutes=1)):
                resolved_at = datetime.datetime.now()
                self._last_print = resolved_at
                wandb.termlog(
                    '{} resolved after {}, resuming normal operation.'.
                    format(self._error_prefix, resolved_at - start_time))
            return result
def log_fname(self):
    """Return the log file path reported by ``util.get_log_file_path()``.

    TODO: we started work to log to a file in the run dir, but it had
    issues. For now all logs go to the same place.
    """
    log_path = util.get_log_file_path()
    return log_path
def __call__(self, *args, **kwargs):
    """Call the wrapped function, retrying on retryable exceptions.

    Positional and keyword arguments are forwarded to the wrapped function,
    except for the retry-control keyword arguments listed below, which are
    removed from ``kwargs`` before the call.

    Args:
        retry_timedelta (kwarg): amount of time to keep retrying before
            giving up. Defaults to the instance setting, or 1000000 days
            when that is None.
        num_retries (kwarg): maximum number of retries. Defaults to the
            instance setting, or 1000000 when that is None. Forced to 0
            when the WANDB_TEST environment variable is set to a
            non-empty value.
        retry_sleep_base (kwarg): number of seconds to sleep upon first
            failure; all other sleeps are derived from this one.
            Defaults to 1.

    Returns:
        Whatever the wrapped function returns.

    Raises:
        The caught retryable exception, when ``retry_timedelta`` has
        elapsed or ``num_retries`` retries have already been made.
    """
    retry_timedelta = kwargs.pop('retry_timedelta', self._retry_timedelta)
    if retry_timedelta is None:
        retry_timedelta = datetime.timedelta(days=1000000)

    num_retries = kwargs.pop('num_retries', self._num_retries)
    if num_retries is None:
        num_retries = 1000000
    if os.environ.get('WANDB_TEST'):
        num_retries = 0

    sleep = kwargs.pop('retry_sleep_base', 1)

    first = True
    start_time = datetime.datetime.now()
    self._num_iter = 0

    while True:
        # Keep the try body limited to the wrapped call: a retryable
        # exception raised by the success-path logging must not cause an
        # already-successful call to be invoked again.
        try:
            result = self._call_fn(*args, **kwargs)
        except self._retryable_exceptions as e:
            if (datetime.datetime.now() - start_time >= retry_timedelta
                    or self._num_iter >= num_retries):
                raise
            # Announce the retry loop once, on the third failure.
            if self._num_iter == 2:
                logger.exception('Retry attempt failed:')
                wandb.termlog(
                    '%s (%s), entering retry loop. See %s for full traceback.'
                    % (self._error_prefix, e.__class__.__name__,
                       util.get_log_file_path()))
            if os.getenv('WANDB_DEBUG'):
                traceback.print_exc()
            first = False
            # Exponential backoff with up to 25% random jitter; the base
            # delay is capped at MAX_SLEEP_SECONDS for subsequent sleeps.
            time.sleep(sleep + random.random() * 0.25 * sleep)
            sleep = min(sleep * 2, self.MAX_SLEEP_SECONDS)
            self._num_iter += 1
        else:
            # Report recovery only if at least one attempt failed.
            if not first:
                wandb.termlog(
                    '%s resolved after %s, resuming normal operation.' %
                    (self._error_prefix,
                     datetime.datetime.now() - start_time))
            return result