def event_file_present_loop(self, tensor_location: TensorLocation):
    """Block until the event file referenced by *tensor_location* exists.

    Polls ``self._is_event_file_present`` every 2 seconds, up to
    ``self.event_file_retry_limit`` retries.

    :param tensor_location: index entry naming the event file to wait for.
    :raises TensorUnavailableForStep: if the file was skipped, training has
        already ended, or the retry limit is exhausted — i.e. the index said
        the tensor was written but the file never materialized.
    """
    event_file_name = tensor_location.event_file_name
    event_file_present = self._is_event_file_present(event_file_name)
    num_retry = 0
    while not event_file_present and num_retry < self.event_file_retry_limit:
        if self._has_event_file_been_skipped(event_file_name):
            raise TensorUnavailableForStep(
                tname=tensor_location.tensorname,
                mode=tensor_location.mode,
                step=tensor_location.mode_step,
            )
        elif has_training_ended(self.path) is True:
            # Training is over, so the file can never appear: give up now
            # rather than burning through the remaining retries.
            self.logger.warning(
                f"IndexReader: Training Has Ended"
                f"\nIndexReader: {event_file_name} was written but not found."
            )
            raise TensorUnavailableForStep(
                tname=tensor_location.tensorname,
                mode=tensor_location.mode,
                step=tensor_location.mode_step,
            )
        event_file_present = self._is_event_file_present(event_file_name)
        num_retry += 1
        if not event_file_present:
            # Only back off when the file is still missing; the original
            # slept unconditionally, adding 2s of latency on success.
            time.sleep(2)
    if num_retry >= self.event_file_retry_limit:
        self.logger.warning(
            f"IndexReader: {event_file_name} was written but not found. After {num_retry} retries."
        )
        raise TensorUnavailableForStep(
            tname=tensor_location.tensorname,
            mode=tensor_location.mode,
            step=tensor_location.mode_step,
        )
def _step(self, step_num, mode=ModeKeys.GLOBAL, worker=None):
    """Return the Step object for *step_num* in *mode*, refreshing if needed.

    :param step_num: step number to look up.
    :param mode: mode of the job; GLOBAL means *step_num* is a global step.
    :param worker: optional worker name to restrict the lookup to.
    :raises TensorUnavailableForStep: the trial says the step passed but the
        tensor for it cannot be found.
    :raises StepUnavailable: the step will never be available.
    :raises StepNotYetAvailable: the step has not been written yet.
    :raises NoMoreData: training finished before reaching *step_num*.
    """
    s = self._get_step_currently(step_num, mode, worker=worker)
    if s is not None:
        return s
    # Not loaded yet — refresh this tensor's index and ask the trial
    # where the step stands.
    self.trial.maybe_refresh(self.name)
    ss = self.trial.has_passed_step(step_num, mode)
    if ss == StepState.AVAILABLE:
        s = self._get_step_currently(step_num, mode, worker=worker)
        if s is not None:
            return s
        # Step passed but this tensor has no data for it.
        raise TensorUnavailableForStep(self.name, step_num, mode)
    elif ss == StepState.UNAVAILABLE:
        raise StepUnavailable(step_num, mode)
    elif ss == StepState.NOT_YET_AVAILABLE:
        if self.trial.loaded_all_steps is True:
            # All steps are in: the requested step is beyond the end of
            # training, so waiting longer cannot help.
            last_step = -1
            avail_steps = self.trial.steps(mode=mode)
            if len(avail_steps) > 0:
                last_step = avail_steps[-1]
            raise NoMoreData(
                "Looking for step:{} for mode {} and reached end of training. Max step available is {}"
                .format(step_num, mode, last_step))
        raise StepNotYetAvailable(step_num, mode)
    # Was `assert False`, which is stripped under `python -O` and would
    # silently return None; raise explicitly instead.
    raise RuntimeError("Should not happen")
def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None):
    """Return the tensor's value at the given step.

    Uses the cached value when present, otherwise fetches it from the
    index reader via the step's location (caching it if ``self.cache`` is
    set). Raises if only reductions (or nothing) were saved for this step.

    :param step_num: step number to read.
    :param mode: mode of the job; GLOBAL means *step_num* is a global step.
    :param worker: optional worker name.
    :raises TensorUnavailableForStep: no full value exists for this step;
        the exception carries whether reductions are available instead.
    """
    # _step refreshes the index as a side effect.
    step = self._step(step_num=step_num, mode=mode, worker=worker)
    if step.value is not None:
        return step.value
    if step.location is not None:
        fetched = self.trial.index_reader.fetch_tensor_value(step.location)
        if self.cache:
            step.value = fetched
        return fetched
    # No full value anywhere — report whether reductions were saved so the
    # caller can fall back to reduction_value().
    any_reduction_values = len(step.reduction_values()) > 0
    any_reduction_locations = len(step.reduction_locations()) > 0
    raise TensorUnavailableForStep(
        self.name, step_num, mode, any_reduction_locations or any_reduction_values
    )
def reduction_value(self, step_num, reduction_name, mode=ModeKeys.GLOBAL, worker=None, abs=False):
    """ Return the requested reduction of this tensor at a step.

    Prefers a reduction that was saved directly (value or location); if
    none exists, computes the reduction from the full tensor value,
    fetching it first if only its location is known. Reductions themselves
    are never cached. #TODO do we want to?

    :param step_num: step number
    :param reduction_name: name of reduction
    :param mode: mode of job (train, eval, predict, etc). If this is None,
                 assumes step number is global
    :param worker: name of worker
    :param abs: whether the reduction applies to the absolute value of the
                tensor
    :return: reduction value requested as a float
    """
    step = self._step(step_num=step_num, mode=mode, worker=worker)
    saved_value = step.reduction_value(reduction_name, abs)
    saved_location = step.reduction_location(reduction_name, abs)
    if saved_value is not None:
        return saved_value
    if saved_location is not None:
        return self.trial.index_reader.fetch_tensor_value(saved_location)
    # No saved reduction — we must compute it from the full tensor.
    if step.value is None and step.location is None:
        raise TensorUnavailableForStep(tname=reduction_name, step=step_num, mode=mode)
    if step.value is not None:
        tensor_value = step.value
    else:
        tensor_value = self.trial.index_reader.fetch_tensor_value(step.location)
        if self.cache:
            step.value = tensor_value  # save value if cache is set to True
    return get_numpy_reduction(reduction_name, tensor_value, abs)
def workers(self, step_num, mode=ModeKeys.GLOBAL) -> list:
    """Return the names of the workers that wrote this tensor at a step.

    :param step_num: step number to look up.
    :param mode: mode of the job; GLOBAL means *step_num* is a global step.
    :raises TensorUnavailableForStep: no data exists for this step.
    """
    workers_to_steps = self._get_step_dict(step_num, mode)
    if workers_to_steps is None:
        raise TensorUnavailableForStep(self.name, step_num, mode)
    return [worker for worker in workers_to_steps]