def predict(self, datas: Iterable[Any]) -> Any:
    """Create prediction batches from unlabeled data, run them through the
    model and return the mapped results.

    :param datas: the data to predict on, each element becoming a separate
                  data point in a batch

    :return: the output of the prediction mapper's ``map_results`` applied
             to the first result of the prediction

    :raises ModelError: if the model settings have no prediction mapper
                        name configured

    """
    executor: ModelExecutor = self.executor
    settings: ModelSettings = self.model_settings
    mapper_name = settings.prediction_mapper_name
    if mapper_name is None:
        raise ModelError(
            f'The model settings ({settings.name}) is not configured ' +
            "to create prediction batches: no set 'prediction_mapper'")
    mapper: PredictionMapper = self.config_factory.new_instance(
        mapper_name, datas, self.batch_stash)
    self._notify('predict_start')
    try:
        batches: List[Batch] = mapper.batches
        # only load the model when the executor does not already have one
        if not executor.model_exists:
            executor.load()
        logger.info('predicting...')
        with time('predicted'):
            result: ModelResult = executor.predict(batches)
        first: EpochResult = result.results[0]
        return mapper.map_results(first)
    finally:
        # always signal the end and release the mapper, even on failure
        self._notify('predict_end')
        mapper.deallocate()
def model(self) -> BaseNetworkModule:
    """Return the PyTorch module that is used for training and testing.

    :raises ModelError: if no model has been set on this instance

    """
    module = self._model
    if module is not None:
        return module
    raise ModelError("No model, is populated; use 'load'")
def map_split(n: str):
    """Look up the split named ``n`` in the enclosing scope's ``splits``.

    :param n: the name of the split to fetch

    :return: the value mapped to ``n``

    :raises ModelError: if ``splits`` has no (non-``None``) entry for ``n``

    """
    found = splits.get(n)
    if found is not None:
        return found
    raise ModelError(
        f"No split '{n}' in {self.dataset_stash.split_names}, " +
        f'executor splits: {self.dataset_split_names}')
def get_status(self) -> TrainStatus:
    """Read the current status and return it with the epoch to use in the
    training loop of the :class:`.ModelExecutor`.

    :return: the status read from :meth:`_read_status`, updated in place
             according to its action

    :raises ModelError: if the status' action is not one of the handled
                        :class:`UpdateAction` values

    """
    status = self._read_status()
    action = status.action
    if action == UpdateAction.STOP:
        # the max value fails the executor's outer train loop condition,
        # which gives a robust non-error exit
        status.epoch = sys.maxsize
    elif action == UpdateAction.SET_EPOCH:
        self.current_epoch = status.epoch
        if self.pbar is not None:
            # restart the progress bar at the newly set epoch
            self.pbar.reset()
            self.pbar.update(self.current_epoch)
    elif action == UpdateAction.ITERATE_EPOCH:
        self.current_epoch += 1
        status.epoch = self.current_epoch
        stop_reason = self._get_stop_reason()
        if self.pbar is not None:
            self.pbar.update()
        if stop_reason is not None:
            # a stop reason turns this iteration into a stop request
            status.action = UpdateAction.STOP
            status.reason = stop_reason
    else:
        raise ModelError(f'Unknownn status: {status}')
    reason = status.reason
    if reason and self.status_logger.isEnabledFor(logging.INFO):
        self.status_logger.info(reason)
    return status
def __init__(self, net_settings: NetworkSettings,
             sub_logger: logging.Logger = None):
    """Initialize.

    :param net_settings: contains common layers such as dropout and batch
                         normalization

    :param sub_logger: used to log activity in this module so the logged
                       module comes from some parent model

    :raises ModelError: if only one of the batch normalization dimension
                        and features is set

    """
    super().__init__(sub_logger)
    ns = net_settings
    self.net_settings = ns
    self.dropout = (ns.dropout_layer
                    if isinstance(ns, DropoutNetworkSettings) else None)
    batch_norm = None
    if isinstance(ns, BatchNormNetworkSettings) and \
       (ns.batch_norm_d is not None or ns.batch_norm_features is not None):
        # at least one is set, so both have to be
        if ns.batch_norm_d is None or ns.batch_norm_features is None:
            raise ModelError('Both the dimension and features must be ' +
                             f'set if one is set: {ns}')
        batch_norm = ns.batch_norm_layer
    self.batch_norm = batch_norm
    self.activation_function = (
        ns.activation_function
        if isinstance(ns, ActivationNetworkSettings) else None)
def result_manager(self) -> ModelResultManager:
    """Return the executor's result manager.

    :raises ModelError: if the executor has no result manager

    """
    rm: ModelResultManager = self.executor.result_manager
    if rm is None:
        # raise rather than hand the exception instance back as the manager
        raise ModelError('No result manager available')
    return rm
def previous_results(self) -> ModelResult:
    """Return the previous results (see class docs).

    The result is looked up in the executor's result manager under
    ``previous_results_key``.

    :raises ModelError: if the executor has no result manager

    """
    rm: ModelResultManager = self.executor.result_manager
    if rm is None:
        # raise rather than index into an exception instance
        raise ModelError('No result manager available')
    return rm[self.previous_results_key]
def _forward(self, batch: Batch, context: SequenceNetworkContext) -> \
        SequenceNetworkOutput:
    """Run the forward pass for the split given in ``context``.

    Whether the CRF code path is taken depends on the ``use_crf`` network
    setting.  When it is not used every split goes through
    :meth:`_forward_train_no_crf`.

    :param batch: the batch to run through the network

    :param context: carries the data set split type being processed

    :return: the predictions, loss, score and labels bundled as an output
             instance

    :raises ModelError: if a non-training split is used while the module is
                        in training mode, or if the split type is unknown

    """
    split_type: DatasetSplitType = context.split_type
    use_crf = self.net_settings.use_crf
    labels: Optional[Tensor] = batch.get_labels()
    preds: List[List[int]] = None
    loss: Tensor = None
    score: Tensor = None
    if self.logger.isEnabledFor(logging.DEBUG):
        self.logger.debug(f'forward on splt: {context.split_type}')
    if split_type != DatasetSplitType.train and self.training:
        raise ModelError(
            f'Attempting to use split {split_type} while training')
    if split_type == DatasetSplitType.train:
        if not use_crf:
            preds, loss = self._forward_train_no_crf(batch, context)
        else:
            # no predictions are produced on this path
            loss = self._forward_train_with_crf(batch)
    elif split_type == DatasetSplitType.validation:
        if not use_crf:
            preds, loss = self._forward_train_no_crf(batch, context)
        else:
            preds, loss, score = self._decode(batch, True)
    elif split_type == DatasetSplitType.test:
        if not use_crf:
            preds, loss = self._forward_train_no_crf(batch, context)
        else:
            # the decoded loss is discarded and replaced with a zero tensor
            preds, _, score = self._decode(batch, False)
            loss = batch.torch_config.singleton([0], dtype=torch.float32)
    else:
        raise ModelError(f'Unknown data split type: {split_type}')
    # list of lists of the predictions, which are the CRF output when
    # enabled
    if preds is not None:
        preds = self._map_labels(batch, context, preds)
    # padded tensor of shape (batch, data i.e. token length)
    if labels is not None:
        labels = self._map_labels(batch, context, labels)
    self._shape_or_list_debug('output preds', preds)
    self._shape_or_list_debug('output labels', labels)
    out = SequenceNetworkOutput(preds, loss, score, labels)
    have_both = preds is not None and labels is not None
    if have_both and len(labels.size()) > 1:
        out.righsize_labels(preds)
    return out
def get_predictions_factory(self, column_names: List[str] = None,
                            transform: Callable[[DataPoint], tuple] = None,
                            batch_limit: int = sys.maxsize,
                            name: str = None) \
        -> PredictionsDataFrameFactory:
    """Generate a predictions factory from the test data set.

    :param column_names: the list of string column names for each data
                         item the list returned from
                         ``data_point_transform`` to be added to the
                         results for each label/prediction

    :param transform: a function that returns a tuple, each with an
                      element respective of ``column_names`` to be added
                      to the results for each label/prediction; if
                      ``None`` (the default), ``str`` used (see the `Iris
                      Jupyter Notebook
                      <https://github.com/plandes/deeplearn/blob/master/notebook/iris.ipynb>`_
                      example)

    :param batch_limit: the max number of batches of results to output

    :param name: the name/ID (name of the file sans extension in the
                 results directory) of the previously archived saved
                 results to fetch or ``None`` to get the last result

    :raises ModelError: if no result is found or it has no test results

    """
    rm: ModelResultManager = self.result_manager
    use_last = name is None
    res: ModelResult = (self.last_result if use_last
                        else rm.results_stash[name].model_result)
    key: str = rm.get_last_key(False) if use_last else name
    if res is None:
        raise ModelError(f'No test results found: {name}')
    if not res.test.contains_results:
        raise ModelError('No test results found')
    path: Path = rm.key_to_path(key)
    factory_class = self.predictions_datafrmae_factory_class
    return factory_class(
        path, res, self.batch_stash, column_names, transform, batch_limit)
def _prepare_datasets(self, batch_limit: Union[int, float],
                      to_deallocate: List[Batch],
                      ds_src: List[Stash]) -> \
        'Tuple[Union[int, str], List[List[Batch]]]':
    """Load the batches of each data set using the batch iteration method
    configured in the model settings.

    The batches are returned per dataset as given in
    :meth:`_get_dataset_splits`.

    :param batch_limit: the limit passed to :meth:`_calc_batch_limit` for
                        each source data set

    :param to_deallocate: a list this method appends to with the batches
                          the caller should deallocate when finished

    :param ds_src: the source stashes, one per data set

    :return: a tuple of ``(count, datasets)``, where ``count`` is the
             number of batches loaded (or the string ``'?'`` for
             ``buffered`` iteration) and ``datasets`` has the form
             ``[(training batch 1..N), (validation batch 1..N), (test
             batch 1..N)]``; for ``buffered`` iteration ``datasets`` is
             ``ds_src`` itself

    :raises ModelError: if the batch iteration method is unknown

    """
    settings = self.model_settings
    biter = settings.batch_iteration
    cnt = 0
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'preparing datasets using iteration: {biter}')
    self._notify('prepare_datasets_start', biter)
    if biter == 'gpu':
        ds_dst = []
        for src in ds_src:
            vlim = self._calc_batch_limit(src, batch_limit)
            cpu_batches = tuple(it.islice(src.values(), vlim))
            gpu_batches = [b.to() for b in cpu_batches]
            cnt += len(gpu_batches)
            # the `to` call returns the same instance if the tensor is
            # already on the GPU, so only deallocate batches copied over
            for cpu_batch, gpu_batch in zip(cpu_batches, gpu_batches):
                if cpu_batch is not gpu_batch:
                    to_deallocate.append(cpu_batch)
            # batches that are not cached are released by the caller
            if not settings.cache_batches:
                to_deallocate.extend(gpu_batches)
            ds_dst.append(gpu_batches)
    elif biter == 'cpu':
        ds_dst = []
        for src in ds_src:
            vlim = self._calc_batch_limit(src, batch_limit)
            batches = list(it.islice(src.values(), vlim))
            cnt += len(batches)
            if not settings.cache_batches:
                to_deallocate.extend(batches)
            ds_dst.append(batches)
    elif biter == 'buffered':
        # nothing is loaded up front, so the count is not known
        ds_dst = ds_src
        cnt = '?'
    else:
        raise ModelError(f'No such batch iteration method: {biter}')
    self._notify('prepare_datasets_end', biter)
    self._preproces_training(ds_dst[0])
    return cnt, ds_dst
def _calc_batch_limit(self, src: Stash,
                      batch_limit: Union[int, float]) -> int:
    """Compute the number of batches to use from ``src``.

    :param src: the stash the batches come from; its length is used when
                ``batch_limit`` is a float

    :param batch_limit: when a float, the portion of ``len(src)`` to use;
                        otherwise it is returned as is

    :return: the computed limit

    :raises ModelError: if ``batch_limit`` is not positive or is a float
                        greater than 1

    """
    if batch_limit <= 0:
        raise ModelError(f'Batch limit must be positive: {batch_limit}')
    if not isinstance(batch_limit, float):
        vlim = batch_limit
    else:
        if batch_limit > 1.0:
            raise ModelError('Batch limit must be less than 1 ' +
                             f'when a float: {batch_limit}')
        # a float limit is a portion of the source's size
        vlim = round(len(src) * batch_limit)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('batch limit calculated as a percentage: ' +
                         f'{vlim} = {len(src)} * {batch_limit}')
    desc = (f' for {src.split_name}'
            if isinstance(src, SplitStashContainer) else '')
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'using batch limit: {vlim}{desc}')
    return vlim
def last_result(self) -> ModelResult:
    """The last recorded result during an :meth:`.ModelExecutor.train` or
    :meth:`.ModelExecutor.test` invocation is used.

    When the executor has no result, the one loaded by the result manager
    is used instead.

    :raises ModelError: if neither source provides a result

    """
    result = self.executor.model_result
    if result is not None:
        return result
    manager: ModelResultManager = self.result_manager
    result = manager.load()
    if result is None:
        raise ModelError('No results found')
    return result
def _execute(self, model: BaseNetworkModule, optimizer: Optimizer,
             criterion, batch: Batch, split_type: DatasetSplitType) -> \
        Tuple[Tensor]:
    """Run one batch through a sequence model and return the results.

    The model is called with the batch and a
    :class:`SequenceNetworkContext` built from ``split_type`` and
    ``criterion``; the loss and predictions are taken from the model's
    output rather than computed here.

    :param model: the model to call with the batch and context

    :param optimizer: passed on to :meth:`_step`

    :param criterion: the loss function handed to the model in the context

    :param batch: the batch to run; also the fallback source of the labels

    :param split_type: the data set split being processed

    :return: a tuple of the loss, labels, outcomes (the model's
             predictions) and the model's outputs after being passed
             through ``torch_config.to_cpu_deallocate``

    :raises ModelError: if the model gives no predictions on a split other
                        than training

    """
    logger = self.logger
    cctx = SequenceNetworkContext(split_type, criterion)
    seq_out: SequenceNetworkOutput = model(batch, cctx)
    outcomes: Tensor = seq_out.predictions
    loss: Tensor = seq_out.loss
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'{batch.id}: output: {seq_out}')
    # prefer the labels the model returned; otherwise take them from the
    # batch
    # NOTE(review): the nesting of the encode call was reconstructed from
    # whitespace-collapsed source -- confirm it belongs to the else branch
    if seq_out.labels is not None:
        labels = seq_out.labels
    else:
        labels: Tensor = batch.get_labels()
        labels = self._encode_labels(labels)
    if logger.isEnabledFor(logging.DEBUG):
        if labels is not None:
            logger.debug(f'label shape: {labels.shape}')
    self._debug_output('after forward', labels, outcomes)
    # iterate over the error surface
    self._step(loss, split_type, optimizer, model)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'split: {split_type}, loss: {loss}')
    # transform the labels in the same manner as the predictions so tensor
    # shapes match
    if not self.model_settings.nominal_labels:
        labels = self._decode_outcomes(labels)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'label nom decoded: {labels.shape}')
    # only the training split is allowed to come back without predictions
    if outcomes is None and split_type != DatasetSplitType.train:
        raise ModelError('Expecting predictions for all splits except ' +
                         f'{DatasetSplitType.train} on {split_type}')
    if logger.isEnabledFor(logging.DEBUG):
        if outcomes is not None:
            logger.debug(f'outcomes: {outcomes.shape}')
        if labels is not None:
            logger.debug(f'labels: {labels.shape}')
    loss, labels, outcomes, outputs = self.torch_config.to_cpu_deallocate(
        loss, labels, outcomes, seq_out.outputs)
    return loss, labels, outcomes, outputs
def __post_init__(self):
    """Set up the instance state: no model, no result, empty batch cache
    and the lazily persisted work attributes.

    """
    super().__init__()
    # NOTE(review): the trailing ``and False`` means this type check can
    # never raise; it is kept as is since the intent is not clear from here
    if not isinstance(self.dataset_stash, DatasetSplitStash) and False:
        raise ModelError('Expecting type DatasetSplitStash but ' +
                         f'got {self.dataset_stash.__class__}')
    self._model = None
    self._dealloc_model = False
    self.model_result = None
    self.batch_stash.delegate_attr = True
    for work_name in ('_criterion_optimizer_scheduler',
                      '_result_manager',
                      '_train_manager'):
        setattr(self, work_name, PersistedWork(work_name, self))
    self.cached_batches = dict()
    self.debug = False
def test(self, description: str = None) -> ModelResult:
    """Load the model from disk and test it.

    :param description: passed to the executor's ``test`` method and to
                        the start/end notifications

    :return: the result of the executor's test, which is also written to
             ``writer`` when that is set

    :raises ModelError: if this instance is flagged as ``debuged``

    """
    if self.debuged:
        raise ModelError('Testing is not allowed in debug mode')
    executor = self.executor
    executor.load()
    logger.info('testing...')
    self._notify('test_start', description)
    with time('tested'):
        result = executor.test(description)
    writer = self.writer
    if writer is not None:
        result.write(writer=writer)
    self._notify('test_end', description)
    return result
def state_dict(self, destination=None, prefix='', *args, **kwargs):
    """Return the parent's state dictionary with the embedding entry set to
    ``None`` when this layer is not trainable.

    :param destination: passed through to the parent implementation

    :param prefix: passed through to the parent implementation and used to
                   compute the embedding key

    :return: the (possibly modified) state dictionary

    :raises ModelError: if the embedding key is expected but missing from
                        the state

    """
    state = super().state_dict(destination, prefix, *args, **kwargs)
    debugging = logger.isEnabledFor(logging.DEBUG)
    if debugging:
        self._debug(f'state_dict: trainable: {self.trainable}')
    if self.trainable:
        return state
    emb_key = self._get_emb_key(prefix)
    if logger.isEnabledFor(logging.DEBUG):
        self._debug(f'state_dict: embedding key: {emb_key}')
    if emb_key is None:
        return state
    if emb_key not in state:
        raise ModelError(f'No key {emb_key} in {state.keys()}')
    arr = state[emb_key]
    if arr is not None:
        if logger.isEnabledFor(logging.DEBUG):
            self._debug(f'state_dict: emb state: {arr.shape}')
        assert arr.shape == self.embed_model.matrix.shape
        # drop the embedding weights from the returned state
        state[emb_key] = None
    return state
def _execute(self, sets_name: str, description: str,
             func: Callable, ds_src: tuple) -> bool:
    """Either train or test the model based on method ``func``.

    The data sets are taken from the batch cache when present under
    ``sets_name``; otherwise they are loaded with
    :meth:`_prepare_datasets` and cached if the model settings say so.
    Batches collected for deallocation are released when done, whether or
    not ``func`` succeeded.

    :param sets_name: the name of the data sets, which ``train`` or
                      ``test``

    :param description: if not ``None``, used with the result's index to
                        name the model result after a successful run

    :param func: the method to call to do the training or testing

    :param ds_src: a tuple of datasets in a form such as ``(train,
                   validation, test)`` (see :meth:`_get_dataset_splits`)

    :return: ``True`` if training/testing was successful, otherwise
             ``False`` on an early bail

    :raises ModelError: if batch caching is enabled with the ``buffered``
                        batch iteration setting

    """
    to_deallocate: List[Batch] = []
    ds_dst: List[List[Batch]] = None
    batch_limit = self.model_settings.batch_limit
    biter = self.model_settings.batch_iteration
    if self.model_settings.cache_batches and biter == 'buffered':
        raise ModelError('Can not cache batches for batch ' +
                         'iteration setting \'buffered\'')
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'batch iteration: {biter}, limit: {batch_limit}' +
                    f', caching: {self.model_settings.cache_batches}'
                    f', cached: {len(self.cached_batches)}')
    self._notify('execute_start', sets_name)
    self._gc(1)
    # reuse previously cached data sets when available
    ds_dst = self.cached_batches.get(sets_name)
    if ds_dst is None:
        cnt = 0
        # NOTE(review): not an f-string; presumably ``time`` fills in
        # ``{cnt}`` itself when the block exits -- confirm before renaming
        # or removing the ``cnt`` local
        with time('loaded {cnt} batches'):
            cnt, ds_dst = self._prepare_datasets(
                batch_limit, to_deallocate, ds_src)
        if self.model_settings.cache_batches:
            self.cached_batches[sets_name] = ds_dst
    if logger.isEnabledFor(logging.INFO):
        logger.info('train/test sets: ' +
                    f'{" ".join(map(lambda l: str(len(l)), ds_dst))}')
    try:
        with time(f'executed {sets_name}'):
            func(*ds_dst)
        if description is not None:
            res_name = f'{self.model_result.index}: {description}'
            self.model_result.name = res_name
        return True
    except EarlyBailError as e:
        # an early bail is logged and reported as failure, not re-raised
        logger.warning(f'<{e}>')
        self.reset()
        return False
    finally:
        # runs on success, early bail and any other exception
        self._notify('execute_end', sets_name)
        self._train_manager.clear()
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'deallocating {len(to_deallocate)} batches')
        for batch in to_deallocate:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'deallocating: {batch}')
            batch.deallocate()
        self._gc(1)
        self.torch_config.empty_cache()
def __getstate__(self):
    """Refuse to provide pickle state for this instance.

    :raises ModelError: always

    """
    message = f'Layers should not be pickeled: {self}'
    raise ModelError(message)
def _execute(self, model: BaseNetworkModule, optimizer: Optimizer,
             criterion, batch: Batch, split_type: DatasetSplitType) -> \
        Tuple[Tensor]:
    """Execute one batch of training, testing, validation or prediction.

    :param model: the model to exercise

    :param optimizer: the optimization algorithm (i.e. adam) to iterate

    :param criterion: the loss function (i.e. cross entropy loss) used for
                      the backward propagation step

    :param batch: contains the data to test, predict, and optionally the
                  labels for training and validation

    :param split_type: indicates if we're training, validating or testing

    :return: a tuple of the loss, labels, outcomes, and the output
             (i.e. logits); the outcomes are the decoded
             (:meth:`_decode_outcomes`) output and represent some ready to
             use data, like argmax'd classification nominal label
             integers; the loss and labels are ``None`` when the batch has
             no labels

    :raises ModelError: if the model returns ``None``, or if the batch has
                        no labels on a split other than test

    """
    logger = self.logger
    labels: Tensor = batch.get_labels()
    # forward pass, get our output, which are usually the logits
    output: Tensor = model(batch)
    # sanity check
    if output is None:
        raise ModelError('Null model output')
    # check for sane state with labels, and munge if necessary
    if labels is None:
        # sanity check: unlabeled batches are only expected for the test
        # split
        if split_type != DatasetSplitType.test:
            raise ModelError('Expecting no split type on prediction, ' +
                             f'but got: {split_type}')
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('skipping loss calculation on prediction execute')
        loss = None
    else:
        # put labels in a form to be used by the loss function
        labels = self._encode_labels(labels)
        self._debug_output('input', labels, output)
        # calculate the loss with the logps and the labels
        loss = criterion(output, labels)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'split: {split_type}, loss: {loss}')
    # iterate over the error surface
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # confirm this step and the debug call below sit outside the else
    self._step(loss, split_type, optimizer, model)
    self._debug_output('output', labels, output)
    # apply the same decoding on the labels as the output if necessary
    if labels is not None and not self.model_settings.nominal_labels:
        labels = self._decode_outcomes(labels)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'label nom decoded: {labels.shape}')
    outcomes = self._decode_outcomes(output)
    loss, labels, outcomes, output = self.torch_config.to_cpu_deallocate(
        loss, labels, outcomes, output)
    return loss, labels, outcomes, output