Example No. 1
 def __post_init__(self):
     super().__init__()
     if hasattr(self, '_data_points') and self._data_points is not None:
         self.data_point_ids = tuple(map(lambda d: d.id, self.data_points))
     self._decoded_state = PersistedWork('_decoded_state',
                                         self,
                                         transient=True)
     self.state = 'n'
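All of these examples share the same pattern: ``__post_init__`` creates a
``PersistedWork`` that backs a property or method of the same name decorated
with ``@persisted``.  The following is a minimal, self-contained sketch of
that pattern, assuming the ``zensols.persist`` API as used throughout these
examples; the class and property names are illustrative only.

    from dataclasses import dataclass
    from zensols.persist import PersistedWork, persisted

    @dataclass
    class ExpensiveWork(object):
        def __post_init__(self):
            # transient=True: the cached value is dropped when pickled
            self._result = PersistedWork('_result', self, transient=True)

        @property
        @persisted('_result')
        def result(self) -> int:
            # computed on first access; subsequent accesses hit the cache
            return sum(range(10_000))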
Example No. 2
 def __post_init__(self, weighted_split_path: Path):
     super().__post_init__()
     if weighted_split_path is None:
         path = '_label_counts'
     else:
         file_name = f'weighted-labels-{self.weighted_split_name}.dat'
         path = weighted_split_path / file_name
     self._label_counts = PersistedWork(path, self)
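Note the two caching modes above: when given a string, ``PersistedWork``
caches in memory under that attribute name; when given a ``Path``, the
computed value is also pickled to that file and reloaded across runs.  A
hedged sketch of the distinction (the ``owner`` instance and file name are
placeholders):

    from pathlib import Path
    from zensols.persist import PersistedWork

    # in-memory only: cached under the instance attribute '_label_counts'
    pw_mem = PersistedWork('_label_counts', owner)
    # file backed: the value is pickled to (and reloaded from) this path
    pw_file = PersistedWork(Path('target/weighted-labels.dat'), owner)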
Example No. 3
 def __post_init__(self):
     super().__post_init__()
     if self.partition_attr is None:
         raise DatasetError("Missing 'partition_attr' field")
     dfpath = self.split_labels_path
     if dfpath is None:
         dfpath = '_strat_split_labels'
     self._strat_split_labels = PersistedWork(dfpath, self, mkdir=True)
Example No. 4
 def __post_init__(self):
     super().__post_init__()
     PersistableContainer.__init__(self)
     if not isinstance(self.split_container, SplitKeyContainer):
         raise DatasetError('Expecting type SplitKeyContainer but ' +
                            f'got: {type(self.split_container)}')
     self._inst_split_name = None
     self._keys_by_split = PersistedWork('_keys_by_split', self)
     self._splits = PersistedWork('_splits', self)
Example No. 5
 def __post_init__(self, config_factory: ConfigFactory):
     super().__init__()
     self._init_config_factory(config_factory)
     self._config_factory = PersistedWork('_config_factory', self)
     self._executor = PersistedWork('_executor', self)
     self.debuged = False
     if self.progress_bar_cols == 'term':
         try:
             term_width = os.get_terminal_size()[0]
             # make space for embedded validation loss messages
             self.progress_bar_cols = term_width - 5
         except OSError:
             logger.debug('unable to automatically determine ' +
                          'terminal width--skipping')
             self.progress_bar_cols = None
Example No. 6
 def __post_init__(self):
     super().__post_init__()
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug('creating fd vec manager')
     if self.token_feature_ids is None:
         self.token_feature_ids = self.doc_parser.token_feature_ids
     else:
         feat_diff = self.token_feature_ids - self.doc_parser.token_feature_ids
         if len(feat_diff) > 0:
             fdiffs = ', '.join(feat_diff)
             raise VectorizerError(
                 'Parser token features do not exist in vectorizer: ' +
                 f'{self.token_feature_ids} - ' +
                 f'{self.doc_parser.token_feature_ids} = {fdiffs}')
     self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)
Example No. 7
 def __post_init__(self, cache: bool):
     super().__init__()
     if self.cache_dir is not None and not self.cache_dir.exists():
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(f'creating cache directory: {self.cache_dir}')
         self.cache_dir.mkdir(parents=True, exist_ok=True)
     if self.cased is None:
         if self.model_id.find('uncased') >= 0:
             self.cased = False
         else:
             logger.info("'cased' not given--assuming a cased model")
             self.cased = True
     self._tokenizer = PersistedWork('_tokenizer', self, cache)
     self._model = PersistedWork('_model', self, cache)
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'id: {self.model_id}, cased: {self.cased}')
Example No. 8
 def __post_init__(self, decoded_attributes):
     super().__post_init__()
     Deallocatable.__init__(self)
     # TODO: this class conflates key split and delegate stash functionality
     # in the `split_stash_container`.  An instance of this type serves the
     # purpose, but it need not be.  Instead it just needs to be both a
     # SplitKeyContainer and a Stash.  This probably should be split out
     # into two different fields.
     cont = self.split_stash_container
     if not isinstance(cont, SplitStashContainer) \
        and (not isinstance(cont, SplitKeyContainer) or
             not isinstance(cont, Stash)):
         raise DeepLearnError('Expecting SplitStashContainer but got ' +
                              f'{self.split_stash_container.__class__}')
     self.data_point_id_sets_path.parent.mkdir(parents=True, exist_ok=True)
     self._batch_data_point_sets = PersistedWork(
         self.data_point_id_sets_path, self)
     self.priming = False
     self.decoded_attributes = decoded_attributes
     self._update_comp_stash_attribs()
Example No. 9
    def __init__(self, use_gpu: bool = True, data_type: type = torch.float32,
                 cuda_device_index: int = None):
        """Initialize this configuration.

        :param use_gpu: whether or not to use CUDA/GPU

        :param data_type: the default data type to use when creating new
                          tensors in this configuration

        :param cuda_device_index: the CUDA device to use, which defaults to 0
                                  if ``use_gpu`` is ``True``

        """
        super().__init__()
        logger.debug(f'use_gpu: {use_gpu}')
        self.use_gpu = use_gpu
        self.data_type = data_type
        # we can't globally cache this in case there are multiple instances of
        # this class that have different values of `use_gpu`
        self._init_device_pw = PersistedWork('_init_device_pw', self)
        self._cpu_device_pw = PersistedWork(
            '_cpu_device_pw', self, cache_global=True)
        self._cpu_device_pw._mark_deallocated()
        self._cuda_device_index = cuda_device_index
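As the comment above notes, ``cache_global=True`` shares the cached value
process-wide across all instances, so it is only safe for state that does not
depend on per-instance configuration.  A brief sketch of the difference,
assuming the same API (the class name is hypothetical):

    from zensols.persist import PersistedWork

    class DeviceConfig(object):
        def __init__(self, use_gpu: bool):
            self.use_gpu = use_gpu
            # per-instance cache: needed since the value depends on use_gpu
            self._init_device_pw = PersistedWork('_init_device_pw', self)
            # global cache: every instance observes the same shared value
            self._cpu_device_pw = PersistedWork(
                '_cpu_device_pw', self, cache_global=True)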
Example No. 10
    @classmethod
    def from_struct(cls: type, struct: Dict[str, Any],
                    target_dir: Path) -> Distribution:
        """Return a distrbution directly from the data structure created from
        :class:`.Discoverer`.

        :param struct: the data structure given by :meth:`.Discoverer.freeze`
                       using ``flatten=True``

        :param target_dir: where the distribution will be *thawed*

        """
        self = cls(None, None, target_dir, PathTranslator(target_dir))
        self._struct = PersistedWork('_struct', self, initial_value=struct)
        return self
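Here ``initial_value`` seeds the cache so the backing property returns the
given object without ever computing it; the same effect is achieved elsewhere
with ``PersistedWork.set``.  A short sketch of the effect (names are
hypothetical):

    from zensols.persist import PersistedWork, persisted

    class Thawed(object):
        def __init__(self, struct: dict):
            # pre-populate the cache so `struct` below never recomputes
            self._struct = PersistedWork(
                '_struct', self, initial_value=struct)

        @property
        @persisted('_struct')
        def struct(self) -> dict:
            # not reached when the cache was seeded via initial_value
            raise AssertionError('cache should have been seeded')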
Example No. 11
 def __post_init__(self):
     self._previous_results = PersistedWork(
         '_previous_results', self,
         cache_global=self.cache_previous_results)
Example No. 12
class FeatureDocumentVectorizerManager(FeatureVectorizerManager):
    """Creates and manages instances of :class:`.FeatureDocumentVectorizer`
    and parses text in to feature based document.

    This is used to manage the relationship of a given set of parsed features
    keeping in mind that parsing will usually happen as a preprocessing step.
    A second step is the vectorization of those features, which can be any
    proper subset of those features parsed in the previous step.  These
    consistency checks, however, are not necessary if pickling isn't used
    across the parse and vectorization steps.

    Instances can set a hard fixed token length, in which case vectorized
    tensors have a fixed width based on the setting of :obj:`token_length`.
    However, this can also be set to use the longest sentence of the document,
    which is useful when computing vectorized tensors from the document as a
    batch, even if the input data are batched as a group of sentences in a
    document.

    :see: :class:`.FeatureDocumentVectorizer`

    :see: :meth:`parse`

    """
    doc_parser: FeatureDocumentParser = field()
    """Used to :meth:`parse` documents."""

    token_length: int = field()
    """The length of tokens used in fixed length features.  This is used as a
    dimension in decoded tensors.  If this value is ``-1``, use the longest
    sentence of the document as the token length, which usually corresponds to
    the batch.

    :see: :meth:`get_token_length`

    """
    token_feature_ids: Set[str] = field(default=None)
    """Indicates which spaCy parsed features to generate in the vectorizers held in
    this instance.  Examples include ``norm``, ``ent``, ``dep``, ``tag``.

    If this is not set, it defaults to the ``token_feature_ids`` in
    :obj:`doc_parser`.

    :see: :obj:`.SpacyFeatureVectorizer.VECTORIZERS`

    """
    def __post_init__(self):
        super().__post_init__()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating fd vec manager')
        if self.token_feature_ids is None:
            self.token_feature_ids = self.doc_parser.token_feature_ids
        else:
            feat_diff = self.token_feature_ids - self.doc_parser.token_feature_ids
            if len(feat_diff) > 0:
                fdiffs = ', '.join(feat_diff)
                raise VectorizerError(
                    'Parser token features do not exist in vectorizer: ' +
                    f'{self.token_feature_ids} - ' +
                    f'{self.doc_parser.token_feature_ids} = {fdiffs}')
        self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)

    @property
    def is_batch_token_length(self) -> bool:
        """Return whether or not the token length is variable based on the longest
        token length in the batch.

        """
        return self.token_length < 0

    def get_token_length(self, doc: FeatureDocument) -> int:
        """Get the token length for the document.  If :obj:`is_batch_token_length` is
        ``True``, then the token length is computed based on the longest
        sentence in the document ``doc``.  See the class docs.

        :param doc: used to compute the longest sentence if
                    :obj:`is_batch_token_length` is ``True``

        :return: the (global) token length for the document

        """
        if self.is_batch_token_length:
            return doc.max_sentence_len
        else:
            return self.token_length

    def parse(self, text: Union[str, List[str]], *args, **kwargs) -> \
            FeatureDocument:
        """Parse text or a text as a list of sentences.

        **Important**: Parsing documents through this manager instance is
        better since safe checks are made that features are available from
        those used when documents are parsed before pickling.

        :param text: either a string or a list of strings; if the former a
                     document with one sentence will be created, otherwise a
                     document is returned with a sentence for each string in
                     the list

        """
        return self.doc_parser.parse(text, *args, **kwargs)

    @property
    @persisted('_spacy_vectorizers')
    def spacy_vectorizers(self) -> Dict[str, SpacyFeatureVectorizer]:
        """Return vectorizers based on the :obj:`token_feature_ids` configured on this
        instance.  Keys are token level feature ids found in
        :obj:`.SpacyFeatureVectorizer.VECTORIZERS`.

        :return: an :class:`collections.OrderedDict` of vectorizers

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating spacy vectorizers')
        token_feature_ids = set(SpacyFeatureVectorizer.VECTORIZERS.keys())
        token_feature_ids = token_feature_ids & self.token_feature_ids
        token_feature_ids = sorted(token_feature_ids)
        vectorizers = collections.OrderedDict()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'creating token features: {token_feature_ids}')
        for feature_id in token_feature_ids:
            cls = SpacyFeatureVectorizer.VECTORIZERS[feature_id]
            inst = cls(name=f'spacy vectorizer: {feature_id}',
                       config_factory=self.config_factory,
                       feature_id=feature_id,
                       torch_config=self.torch_config,
                       vocab=self.doc_parser.model.vocab)
            vectorizers[feature_id] = inst
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created {len(vectorizers)} vectorizers')
        return vectorizers

    def deallocate(self):
        if self._spacy_vectorizers.is_set():
            vecs = self.spacy_vectorizers
            for vec in vecs.values():
                vec.deallocate()
            vecs.clear()
        super().deallocate()
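A hedged usage sketch of the manager above; the construction of ``mng`` is
elided since instances are normally created from configuration, and the input
text is illustrative:

    # assuming `mng` is a configured FeatureDocumentVectorizerManager
    doc = mng.parse('The quick brown fox jumps over the lazy dog.')
    doc2 = mng.parse(['First sentence.', 'Second sentence.'])
    # the longest sentence length when token_length == -1, else the fixed value
    n_tokens = mng.get_token_length(doc2)
    # dict of token level feature ID -> SpacyFeatureVectorizer
    vecs = mng.spacy_vectorizers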
Example No. 13
 def __post_init__(self):
     super().__post_init__()
     Deallocatable.__init__(self)
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'split stash post init: {self.dataframe_path}')
     self._dataframe = PersistedWork(self.dataframe_path, self, mkdir=True)
Example No. 14
class DataframeStash(ReadOnlyStash,
                     Deallocatable,
                     Writable,
                     PrimeableStash,
                     metaclass=ABCMeta):
    """A factory stash that uses a Pandas data frame from which to load.  It uses
    the data frame index as the keys and :class:`pandas.Series` as values.  The
    dataframe is usually constructed by reading a file (i.e.CSV) and doing some
    transformation before using it in an implementation of this stash.

    The dataframe created by :meth:`_get_dataframe` must have a string or
    integer index since keys for all stashes are of type :class:`str`.  The
    index will be mapped to a string if it is an int automatically.

    """
    dataframe_path: Path = field()
    """The path to store the pickeled version of the generated dataframe
    created with :meth:`_get_dataframe`.

    """
    def __post_init__(self):
        super().__post_init__()
        Deallocatable.__init__(self)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'split stash post init: {self.dataframe_path}')
        self._dataframe = PersistedWork(self.dataframe_path, self, mkdir=True)

    def deallocate(self):
        super().deallocate()
        self._dataframe.deallocate()

    @abstractmethod
    def _get_dataframe(self) -> pd.DataFrame:
        """Get or create the dataframe

        """
        pass

    def _prepare_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        dt = df.index.dtype
        if dt != object:
            if dt != int:
                s = f'Data frame index must be a string or int, but got: {dt}'
                raise DataframeError(s)
            else:
                df.index = df.index.map(str)
        return df

    @property
    @persisted('_dataframe')
    def dataframe(self) -> pd.DataFrame:
        df = self._get_dataframe()
        df = self._prepare_dataframe(df)
        return df

    def prime(self):
        super().prime()
        # touch the property to force creation and caching of the dataframe
        self.dataframe

    def clear(self):
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('clearing dataframe stash')
        self._dataframe.clear()

    def load(self, name: str) -> pd.Series:
        return self.dataframe.loc[name]

    def exists(self, name: str) -> bool:
        return name in self.dataframe.index

    def keys(self) -> Iterable[str]:
        return map(str, self.dataframe.index)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        df = self.dataframe
        self._write_line(f'rows: {df.shape[0]}', depth, writer)
        self._write_line(f'cols: {", ".join(df.columns)}', depth, writer)
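Since :meth:`_get_dataframe` is abstract, the minimal concrete use is a
subclass that builds the frame; a sketch assuming pandas and a hypothetical
CSV file:

    from dataclasses import dataclass
    import pandas as pd

    @dataclass
    class CsvStash(DataframeStash):
        def _get_dataframe(self) -> pd.DataFrame:
            # any cleanup/transformation happens here before caching
            return pd.read_csv('data/rows.csv', index_col=0)

    # stash = CsvStash(dataframe_path=Path('target/df.dat'))
    # row: pd.Series = stash.load('some-key')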
Example No. 15
 def __post_init__(self):
     super().__post_init__()
     self._keys_by_split = PersistedWork(self.key_path, self, mkdir=True)
Example No. 16
class SplitKeyDataframeStash(DataframeStash, SplitKeyContainer):
    """A stash and split key container that reads from a dataframe.

    """
    key_path: Path = field()
    """The path where the key splits (as a ``dict``) is pickled."""

    split_col: str = field()
    """The column name in the dataframe used to indicate the split
    (i.e. ``train`` vs ``test``).

    """
    def __post_init__(self):
        super().__post_init__()
        self._keys_by_split = PersistedWork(self.key_path, self, mkdir=True)

    def deallocate(self):
        super().deallocate()
        self._keys_by_split.deallocate()

    def _create_keys_for_split(self, split_name: str, df: pd.DataFrame) -> \
            Iterable[str]:
        """Generate an iterable of string keys.  It is expected this method to be
        potentially very expensive, so the results are cached to disk.  This
        implementation returns the dataframe index.

        :param split_name: the name of the split (i.e. ``train`` vs ``test``)
        :param df: the data frame for the grouping of keys from CSV of data

        """
        return df.index

    def _get_counts_by_key(self) -> Dict[str, int]:
        sc = self.split_col
        return dict(self.dataframe.groupby([sc])[sc].count().items())

    @persisted('_split_names')
    def _get_split_names(self) -> Set[str]:
        return set(self.dataframe[self.split_col].unique())

    @persisted('_keys_by_split')
    def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
        keys_by_split = OrderedDict()
        split_col = self.split_col
        for split, df in self.dataframe.groupby([split_col]):
            logger.info(f'parsing keys for {split}')
            keys = self._create_keys_for_split(split, df)
            keys_by_split[split] = tuple(keys)
        return keys_by_split

    def clear(self):
        super().clear()
        self.clear_keys()

    def clear_keys(self):
        """Clear only the cache of keys generated from the group by.

        """
        self._keys_by_split.clear()

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        total = self.dataframe.shape[0]
        self._write_line('data frame splits:', depth, writer)
        for split, cnt in self.counts_by_key.items():
            self._write_line(f'{split}: {cnt} ({cnt/total*100:.1f}%)', depth,
                             writer)
        self._write_line(f'total: {total}', depth, writer)
Example No. 17
class BatchStash(TorchMultiProcessStash, SplitKeyContainer, Writeback,
                 Deallocatable, metaclass=ABCMeta):
    """A stash that vectorizes features in to easily consumable tensors for
    training and testing.  This stash produces instances of :class:`.Batch`,
    which is a batch in the machine learning sense, and the first dimension of
    what will become the tensor used in PyTorch.  Each of these batches has a
    logical one to many relationship to that batche's respective set of data
    points, which is encapsulated in the :class:`.DataPoint` class.

    The stash creates subprocesses to vectorize features in to tensors in
    chunks of IDs (data point IDs) from the subordinate stash using
    ``DataPointIDSet`` instances.

    To speed up experiments, all available features configured in
    ``vectorizer_manager_set`` are encoded on disk.  However, only the
    ``decoded_attributes`` (see attribute below) are available to the model
    regardless of what was created during encoding time.

    The lifecycle of the data follows:

    1. Feature data created by the client, which could be language features,
       row data etc.

    2. Vectorize the feature data using the vectorizers in
       ``vectorizer_manager_set``.  This creates the feature contexts
       (``FeatureContext``) specifically meant to be pickled.

    3. Pickle the feature contexts when dumping to disk, which is invoked in
       the child processes of this class.

    4. At train time, load the feature contexts from disk.

    5. Decode the feature contexts in to PyTorch tensors.

    6. The model manager uses the ``to`` method to copy the CPU tensors to the
       GPU (where GPUs are available).

    :see: :meth:`_process` for details on the pickling of the batch instances

    """
    _DICTABLE_WRITE_EXCLUDES = {'batch_feature_mappings'}

    data_point_type: Type[DataPoint] = field()
    """A subclass type of :class:`.DataPoint` implemented for the specific
    feature.

    """
    batch_type: Type[Batch] = field()
    """The batch class to be instantiated when created batchs.

    """
    split_stash_container: SplitStashContainer = field()
    """The source data stash that has both the data and data set keys for each
    split (i.e. ``train`` vs ``test``).

    """
    vectorizer_manager_set: FeatureVectorizerManagerSet = field()
    """Used to vectorize features in to tensors."""

    batch_size: int = field()
    """The number of data points in each batch, except the last (unless the
    data point cardinality divides the batch size).

    """
    model_torch_config: TorchConfig = field()
    """The PyTorch configuration used to (optionally) copy CPU to GPU memory.

    """
    data_point_id_sets_path: Path = field()
    """The path of where to store key data for the splits; note that the
    container might store it's key splits in some other location.

    """
    decoded_attributes: InitVar[Set[str]] = field()
    """The attributes to decode; only these are avilable to the model
    regardless of what was created during encoding time; if None, all are
    available.

    """
    batch_feature_mappings: BatchFeatureMapping = field(default=None)
    """The meta data used to encode and decode each feature in to tensors.

    """
    batch_limit: int = field(default=sys.maxsize)
    """The max number of batches to process, which is useful for debugging."""

    def __post_init__(self, decoded_attributes):
        super().__post_init__()
        Deallocatable.__init__(self)
        # TODO: this class conflates key split and delegate stash functionality
        # in the `split_stash_container`.  An instance of this type serves the
        # purpose, but it need not be.  Instead it just needs to be both a
        # SplitKeyContainer and a Stash.  This probably should be split out
        # into two different fields.
        cont = self.split_stash_container
        if not isinstance(cont, SplitStashContainer) \
           and (not isinstance(cont, SplitKeyContainer) or
                not isinstance(cont, Stash)):
            raise DeepLearnError('Expecting SplitStashContainer but got ' +
                                 f'{self.split_stash_container.__class__}')
        self.data_point_id_sets_path.parent.mkdir(parents=True, exist_ok=True)
        self._batch_data_point_sets = PersistedWork(
            self.data_point_id_sets_path, self)
        self.priming = False
        self.decoded_attributes = decoded_attributes
        self._update_comp_stash_attribs()

    @property
    def decoded_attributes(self) -> Set[str]:
        """The attributes to decode.  Only these are avilable to the model regardless
        of what was created during encoding time; if None, all are available

        """
        return self._decoded_attributes

    @decoded_attributes.setter
    def decoded_attributes(self, attribs: Set[str]):
        """The attributes to decode.  Only these are avilable to the model regardless
        of what was created during encoding time; if None, all are available

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'setting decoded attributes: {attribs}')
        self._decoded_attributes = attribs
        if isinstance(self.delegate, BatchDirectoryCompositeStash):
            self.delegate.load_keys = attribs

    @property
    @persisted('_batch_metadata')
    def batch_metadata(self) -> BatchMetadata:
        mapping: BatchFeatureMapping
        if self.batch_feature_mappings is not None:
            mapping = self.batch_feature_mappings
        else:
            batch: Batch = self.batch_type(None, None, None, None)
            batch.batch_stash = self
            mapping = batch._get_batch_feature_mappings()
            batch.deallocate()
        vec_mng_set: FeatureVectorizerManagerSet = self.vectorizer_manager_set
        attrib_keeps = self.decoded_attributes
        vec_mng_names = set(vec_mng_set.keys())
        by_attrib = {}
        mmng: ManagerFeatureMapping
        for mmng in mapping.manager_mappings:
            vec_mng_name: str = mmng.vectorizer_manager_name
            if vec_mng_name in vec_mng_names:
                vec_mng: FeatureVectorizerManager = vec_mng_set[vec_mng_name]
                field: FieldFeatureMapping
                for field in mmng.fields:
                    if field.attr in attrib_keeps:
                        vec = vec_mng[field.feature_id]
                        by_attrib[field.attr] = BatchFieldMetadata(field, vec)
        return BatchMetadata(self.data_point_type, self.batch_type,
                             mapping, by_attrib)

    def _update_comp_stash_attribs(self):
        """Update the composite stash grouping if we're using one and if this class is
        already configured.

        """
        if isinstance(self.delegate, BatchDirectoryCompositeStash):
            meta: BatchMetadata = self.batch_metadata
            meta_attribs: Set[str] = set(
                map(lambda f: f.attr, meta.mapping.get_attributes()))
            groups: Tuple[Set[str]] = self.delegate.groups
            gattribs = reduce(lambda x, y: x | y, groups)
            to_remove = gattribs - meta_attribs
            new_groups = []
            if len(to_remove) > 0:
                group: Set[str]
                for group in groups:
                    ng: Set[str] = meta_attribs & group
                    if len(ng) > 0:
                        new_groups.append(ng)
                self.delegate.groups = tuple(new_groups)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'meta attribs: {meta_attribs}, groups: {groups}')

    @property
    @persisted('_batch_data_point_sets')
    def batch_data_point_sets(self) -> List[DataPointIDSet]:
        """Create the data point ID sets.  Each instance returned will correlate to a
        batch and each set of keys point to a feature :class:`.DataPoint`.

        """
        psets = []
        batch_id = 0
        cont = self.split_stash_container
        tc_seed = TorchConfig.get_random_seed_context()
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'{self.name}: creating keys with ({type(cont)}) ' +
                        f'using batch size of {self.batch_size}')
        for split, keys in cont.keys_by_split.items():
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'keys for split {split}: {len(keys)}')
            # keys are ordered and needed to be as such for consistency
            # keys = sorted(keys, key=int)
            cslice = it.islice(chunks(keys, self.batch_size), self.batch_limit)
            for chunk in cslice:
                chunk = tuple(chunk)
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'chunked size: {len(chunk)}')
                dp_set = DataPointIDSet(str(batch_id), chunk, split, tc_seed)
                psets.append(dp_set)
                batch_id += 1
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'created {len(psets)} data point ID sets with ' +
                        f'batch_limit={self.batch_limit}')
        return psets

    def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
        by_batch = collections.defaultdict(lambda: [])
        for dps in self.batch_data_point_sets:
            by_batch[dps.split_name].append(dps.batch_id)
        return {k: tuple(by_batch[k]) for k in by_batch.keys()}

    def _create_data(self) -> List[DataPointIDSet]:
        """Data created for the sub proceesses are the first N data point ID sets.

        """
        return self.batch_data_point_sets

    def populate_batch_feature_mapping(self, batch: Batch):
        """Add batch feature mappings to a batch instance."""
        if self.batch_feature_mappings is not None:
            batch.batch_feature_mappings = self.batch_feature_mappings

    def create_batch(self, points: Tuple[DataPoint], split_name: str = None,
                     batch_id: str = None):
        """Create a new batch instance with data points, which happens when primed.

        """
        bcls: Type[Batch] = self.batch_type
        batch: Batch = bcls(self, batch_id, split_name, points)
        self.populate_batch_feature_mapping(batch)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created batch: {batch}')
        return batch

    def _process(self, chunk: List[DataPointIDSet]) -> \
            Iterable[Tuple[str, Any]]:
        """Create the batches by creating the set of data points for each
        :class:`.DataPointIDSet` instance.  When the subordinate stash dumps
        the batch (specifically a subclass of :class:`.Batch`), the overridden
        pickle logic is used to *detach* the batch by encoding all data into
        :class:`.FeatureContext` instances.

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'{self.name}: processing: {len(chunk)} data points')
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'chunk data points: {chunk}')
        tseed = chunk[0].torch_seed_context
        dpcls: Type[DataPoint] = self.data_point_type
        cont = self.split_stash_container
        if tseed is not None:
            TorchConfig.set_random_seed(
                tseed['seed'], tseed['disable_cudnn'], False)
        dset: DataPointIDSet
        for dset in chunk:
            batch_id: str = dset.batch_id
            points: Tuple[DataPoint] = tuple(
                map(lambda dpid: dpcls(dpid, self, cont[dpid]),
                    dset.data_point_ids))
            batch: Batch = self.create_batch(points, dset.split_name, batch_id)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created batch: {batch}')
            yield (batch_id, batch)

    def _get_data_points_for_batch(self, batch: Any) -> Tuple[Any]:
        """Return the data points that were used to create ``batch``.

        """
        dpcls = self.data_point_type
        cont = self.split_stash_container
        return tuple(map(lambda dpid: dpcls(dpid, self, cont[dpid]),
                         batch.data_point_ids))

    def load(self, name: str):
        with time('loaded batch {name} ({obj.split_name})'):
            obj = super().load(name)
        # add back the container of the batch to reconstitute the original
        # features and use the CUDA for tensor device transforms
        if obj is not None:
            if not hasattr(obj, 'batch_stash'):
                obj.batch_stash = self
            if (not hasattr(obj, 'batch_feature_mappings') or
                    obj.batch_feature_mappings is None):
                self.populate_batch_feature_mapping(obj)
        return obj

    def _prime_vectorizers(self):
        vec_mng_set: FeatureVectorizerManagerSet = self.vectorizer_manager_set
        vecs = map(lambda v: v.values(), vec_mng_set.values())
        for vec in chain.from_iterable(vecs):
            if isinstance(vec, Primeable):
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'priming {vec}')
                vec.prime()

    def prime(self):
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'priming {self.__class__}, is child: ' +
                         f'{self.is_child}, currently priming: {self.priming}')
        if self.priming:
            raise DeepLearnError('Already priming')
        self.priming = True
        try:
            self.batch_data_point_sets
            self._prime_vectorizers()
            super().prime()
        finally:
            self.priming = False

    def deallocate(self):
        self._batch_data_point_sets.deallocate()
        if id(self.delegate) != id(self.split_stash_container):
            self._try_deallocate(self.delegate)
        self._try_deallocate(self.split_stash_container)
        self.vectorizer_manager_set.deallocate()
        super().deallocate()

    def _from_dictable(self, *args, **kwargs):
        # avoid long Writable.write output
        dct = super()._from_dictable(*args, **kwargs)
        rms = tuple(filter(lambda k: k.startswith('_'), dct.keys()))
        for k in rms:
            del dct[k]
        return dct

    def clear(self):
        """Clear the batch, batch data point sets."""
        logger.debug('clearing')
        super().clear()
        self._batch_data_point_sets.clear()

    def clear_all(self):
        """Clear the batch, batch data point sets, and the source data
        (:obj:`split_stash_container`).

        """
        self.clear()
        self.split_stash_container.clear()
Example No. 18
class StratifiedStashSplitKeyContainer(StashSplitKeyContainer):
    """Like :class:`.StashSplitKeyContainer` but data is stratified by a label
    (:obj:`partition_attr`) across each split.

    """
    partition_attr: str = field(default=None)
    """The label used to partition the strata across each split"""

    stratified_write: bool = field(default=True)
    """Whether or not to include the stratified counts when writing with
    :meth:`write`.

    """
    split_labels_path: Path = field(default=None)
    """If provided, the path is a pickled cache of
    :obj:`stratified_count_dataframe`.

    """
    def __post_init__(self):
        super().__post_init__()
        if self.partition_attr is None:
            raise DatasetError("Missing 'partition_attr' field")
        dfpath = self.split_labels_path
        if dfpath is None:
            dfpath = '_strat_split_labels'
        self._strat_split_labels = PersistedWork(dfpath, self, mkdir=True)

    def _create_splits(self) -> Dict[str, Tuple[str]]:
        dist_keys: Sequence[str] = self.distribution.keys()
        dist_last: str = next(iter(dist_keys))
        dists: Set[str] = set(dist_keys) - {dist_last}
        rows = []
        for k, v in self.stash.items():
            rows.append((k, getattr(v, self.partition_attr)))
        df = pd.DataFrame(rows, columns=['key', self.partition_attr])
        lab_splits: Dict[str, Set[str]] = collections.defaultdict(set)
        for lab, dfg in df.groupby(self.partition_attr):
            splits = {}
            keys: List[str] = dfg['key'].to_list()
            if self.shuffle:
                random.shuffle(keys)
            count = len(keys)
            for dist in dists:
                prop = self.distribution[dist]
                n_samples = math.ceil(float(count) * prop)
                samp = set(keys[:n_samples])
                splits[dist] = samp
                lab_splits[dist].update(samp)
                keys = keys[n_samples:]
            samp = set(keys)
            splits[dist_last] = samp
            lab_splits[dist_last].update(samp)
        assert sum(map(len, lab_splits.values())) == len(df)
        assert reduce(lambda a, b: a | b, lab_splits.values()) == \
            set(df['key'].tolist())
        shuf_splits = {}
        for lab, keys in lab_splits.items():
            if self.shuffle:
                keys = list(keys)
                random.shuffle(keys)
            shuf_splits[lab] = tuple(keys)
        return shuf_splits

    def _count_proportions_by_split(self) -> Dict[str, Dict[str, str]]:
        lab_counts = {}
        kbs = self.keys_by_split
        for split_name in sorted(kbs.keys()):
            keys = kbs[split_name]
            counts = collections.defaultdict(lambda: 0)
            for k in keys:
                item = self.stash[k]
                lab = getattr(item, self.partition_attr)
                counts[lab] += 1
            lab_counts[split_name] = counts
        return lab_counts

    @property
    @persisted('_strat_split_labels')
    def stratified_split_labels(self) -> pd.DataFrame:
        """A dataframe with all keys, their respective labels and split.

        """
        kbs = self.keys_by_split
        rows = []
        for split_name in sorted(kbs.keys()):
            keys = kbs[split_name]
            for k in keys:
                item = self.stash[k]
                lab = getattr(item, self.partition_attr)
                rows.append((split_name, k, lab))
        return pd.DataFrame(rows, columns='split_name id label'.split())

    def clear(self):
        super().clear()
        self._strat_split_labels.clear()

    @property
    def stratified_count_dataframe(self) -> pd.DataFrame:
        """A count summarization of :obj:`stratified_split_labels`.

        """
        df = self.stratified_split_labels
        df = df.groupby('split_name label'.split()).size().\
            reset_index(name='count')
        df['proportion'] = df['count'] / df['count'].sum()
        df = df.sort_values('split_name label'.split()).reset_index(drop=True)
        return df

    def _fmt_prop_by_split(self) -> Dict[str, Dict[str, str]]:
        df = self.stratified_count_dataframe
        tot = df['count'].sum()
        dsets: Dict[str, Dict[str, str]] = collections.OrderedDict()
        for split_name, dfg in df.groupby('split_name'):
            dfg['fmt'] = df['count'].apply(lambda x: f'{x/tot*100:.2f}%')
            dsets[split_name] = dict(dfg[['label', 'fmt']].values)
        return dsets

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        if self.stratified_write:
            lab_counts: Dict[str, Dict[str, str]] = self._fmt_prop_by_split()
            self._write_dict(lab_counts, depth, writer)
            self._write_line(f'Total: {len(self.stash)}', depth, writer)
        else:
            super().write(depth, writer)
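The core of ``_create_splits`` above is per-label proportional sampling: every
split but the first takes ``ceil(proportion * count)`` keys per label, and the
first split receives the remainder.  A standalone sketch of that idea using
only the standard library (function and argument names are illustrative):

    import collections
    import math
    import random
    from typing import Dict, List, Tuple

    def stratified_split(labels: Dict[str, str], dist: Dict[str, float]) \
            -> Dict[str, Tuple[str, ...]]:
        # the first named split receives whatever keys remain per label
        remainder_split = next(iter(dist))
        sampled = set(dist) - {remainder_split}
        splits: Dict[str, List[str]] = collections.defaultdict(list)
        by_label: Dict[str, List[str]] = collections.defaultdict(list)
        for key, label in labels.items():
            by_label[label].append(key)
        for keys in by_label.values():
            random.shuffle(keys)
            count = len(keys)
            for name in sampled:
                n_samples = math.ceil(count * dist[name])
                splits[name].extend(keys[:n_samples])
                keys = keys[n_samples:]
            splits[remainder_split].extend(keys)
        return {k: tuple(v) for k, v in splits.items()}

    # stratified_split({'k1': 'pos', 'k2': 'neg'},
    #                  {'train': 0.8, 'test': 0.1, 'validation': 0.1})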
Example No. 19
class ResultAnalyzer(object):
    """Load results from a previous run of the :class:`ModelExecutor` and a more
    recent run.  This run is usually a currently running model to compare the
    results during training.  This might provide meaningful information such as
    whether to early stop training.

    """
    executor: ModelExecutor = field()
    """The executor (not the running executor necessary) that will load the
    results if not already loadded.

    """

    previous_results_key: str = field()
    """The key given to retreive the previous results with
    :class:`ModelResultManager`.

    """

    cache_previous_results: bool = field()
    """If ``True``, globally cache the previous results to avoid having to
    reload each time.

    """

    def __post_init__(self):
        self._previous_results = PersistedWork(
            '_previous_results', self,
            cache_global=self.cache_previous_results)

    def clear(self):
        """Clear the previous results, if cached.

        """
        self._previous_results.clear()

    @property
    @persisted('_previous_results')
    def previous_results(self) -> ModelResult:
        """Return the previous results (see class docs).

        """
        rm: ModelResultManager = self.executor.result_manager
        if rm is None:
            raise ModelError('No result manager available')
        return rm[self.previous_results_key]

    @property
    def current_results(self) -> ModelResult:
        """Return the current results (see class docs).

        """
        if self.executor.model_result is None:
            self.executor.load()
        return self.executor.model_result

    @property
    def comparison(self) -> DataComparison:
        """Load the results data and create a comparison instance read to write or
        jsonify.

        """
        prev, cur = self.previous_results, self.current_results
        prev_losses = prev.validation.losses
        cur_losses = cur.validation.losses
        cur_len = len(cur_losses)
        df = pd.DataFrame({'epoch': range(cur_len),
                           'previous': prev_losses[:cur_len],
                           'current': cur_losses})
        df['improvement'] = df['previous'] - df['current']
        return DataComparison(self.previous_results_key, prev, cur, df)
Example No. 20
class ModelFacade(PersistableContainer, Writable):
    """This class provides easy to use client entry points to the model executor,
    which trains, validates, tests, saves and loads the model.

    More common attributes, such as the learning rate and number of epochs, are
    properties that dispatch to :py:obj:`executor`.  For the others, go
    directly to the property.

    :see: :class:`zensols.deeplearn.domain.ModelSettings`

    """
    SINGLETONS = {}

    config: Configurable = field()
    """The configuraiton used to create the facade, and used to create a new
    configuration factory to load models.

    """
    config_factory: InitVar[ConfigFactory] = field(default=None)
    """The configuration factory used to create this facade, or ``None`` if no
    factory was used.

    """
    progress_bar: bool = field(default=True)
    """Create text/ASCII based progress bar if ``True``."""

    progress_bar_cols: Union[str, int] = field(default='term')
    """The number of console columns to use for the text/ASCII based progress
    bar.  If the value is ``term``, then use the terminal width.

    """
    executor_name: str = field(default='executor')
    """The configuration entry name for the executor, which defaults to
    ``executor``.

    """
    writer: TextIOBase = field(default=sys.stdout)
    """The writer to this in methods like :meth:`train`, and :meth:`test` for
    writing performance metrics results and predictions or ``None`` to not
    output them.

    """
    predictions_dataframe_factory_class: Type[PredictionsDataFrameFactory] = \
        field(default=PredictionsDataFrameFactory)
    """The factory class used to create predictions.

    :see: :meth:`get_predictions_factory`

    """
    def __post_init__(self, config_factory: ConfigFactory):
        super().__init__()
        self._init_config_factory(config_factory)
        self._config_factory = PersistedWork('_config_factory', self)
        self._executor = PersistedWork('_executor', self)
        self.debuged = False
        if self.progress_bar_cols == 'term':
            try:
                term_width = os.get_terminal_size()[0]
                # make space for embedded validation loss messages
                self.progress_bar_cols = term_width - 5
            except OSError:
                logger.debug('unable to automatically determine ' +
                             'terminal width--skipping')
                self.progress_bar_cols = None

    @classmethod
    def get_singleton(cls, *args, **kwargs) -> Any:
        key = str(cls)
        inst = cls.SINGLETONS.get(key)
        if inst is None:
            inst = cls(*args, **kwargs)
            cls.SINGLETONS[key] = inst
        return inst

    def _init_config_factory(self, config_factory: ConfigFactory):
        if isinstance(config_factory, ImportConfigFactory):
            params = config_factory.__dict__
            keeps = set('reload shared reload_pattern'.split())
            params = {k: params[k] for k in set(params.keys()) & keeps}
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'import config factory params: {params}')
            self._config_factory_params = params
        else:
            self._config_factory_params = {}

    def _create_executor(self) -> ModelExecutor:
        """Create a new instance of an executor.  Used by :obj:`executor`.

        """
        logger.info('creating new executor')
        executor = self.config_factory(
            self.executor_name,
            progress_bar=self.progress_bar,
            progress_bar_cols=self.progress_bar_cols)
        return executor

    @property
    @persisted('_config_factory')
    def config_factory(self):
        """The configuration factory used to create facades.

        """
        return ImportConfigFactory(self.config, **self._config_factory_params)

    @property
    @persisted('_executor')
    def executor(self) -> ModelExecutor:
        """A cached instance of the executor tied to the instance of this class.

        """
        return self._create_executor()

    @property
    def net_settings(self) -> NetworkSettings:
        """Return the executor's network settings.

        """
        return self.executor.net_settings

    @property
    def model_settings(self) -> ModelSettings:
        """Return the executor's model settings.

        """
        return self.executor.model_settings

    @property
    def result_manager(self) -> ModelResultManager:
        """Return the executor's result manager.

        """
        rm: ModelResultManager = self.executor.result_manager
        if rm is None:
            raise ModelError('No result manager available')
        return rm

    @property
    def feature_stash(self) -> Stash:
        """The stash used to generate the feature, which is not to be confused
        with the batch source stash ``batch_stash``.

        """
        return self.executor.feature_stash

    @property
    def batch_stash(self) -> BatchStash:
        """The stash used to encode and decode batches by the executor.

        """
        return self.executor.batch_stash

    @property
    def dataset_stash(self) -> DatasetSplitStash:
        """The stash used to encode and decode batches split by dataset.

        """
        return self.executor.dataset_stash

    @property
    def vectorizer_manager_set(self) -> FeatureVectorizerManagerSet:
        """Return the vectorizer manager set used for the facade.  This is taken from
        the executor's batch stash.

        """
        return self.batch_stash.vectorizer_manager_set

    @property
    def batch_metadata(self) -> BatchMetadata:
        """Return the batch metadata used on the executor.

        :see: :class:`zensols.deepnlp.model.module.EmbeddingNetworkSettings`

        """
        return self.batch_stash.batch_metadata

    @property
    def label_attribute_name(self):
        """Get the label attribute name.

        """
        bmeta = self.batch_metadata
        if bmeta is not None:
            return bmeta.mapping.label_attribute_name

    def _notify(self, event: str, context: Any = None):
        """Notify observers of events from this class.

        """
        self.model_settings.observer_manager.notify(event, self, context)

    def remove_metadata_mapping_field(self, attr: str) -> bool:
        """Remove a field by attribute if it exists across all metadata mappings.

        This is useful when a very expensive vectorizer slows down tasks, such
        as prediction, on a single run of a program.  For this use case,
        override :meth:`predict` to call this method before calling the super
        ``predict`` method.

        :param attr: the name of the field's attribute to remove

        :return: ``True`` if the field was removed, ``False`` otherwise

        """
        removed = False
        meta: BatchMetadata = self.batch_metadata
        mapping: BatchFeatureMapping
        for mapping in meta.mapping.manager_mappings:
            removed = mapping.remove_field(attr) or removed
        return removed

    @property
    def dropout(self) -> float:
        """The dropout for the entire network.

        """
        return self.net_settings.dropout

    @dropout.setter
    def dropout(self, dropout: float):
        """The dropout for the entire network.

        """
        self.net_settings.dropout = dropout

    @property
    def epochs(self) -> int:
        """The number of epochs for training and validation.

        """
        return self.model_settings.epochs

    @epochs.setter
    def epochs(self, n_epochs: int):
        """The number of epochs for training and validation.

        """
        self.model_settings.epochs = n_epochs

    @property
    def learning_rate(self) -> float:
        """The learning rate to set on the optimizer.

        """
        return self.model_settings.learning_rate

    @learning_rate.setter
    def learning_rate(self, learning_rate: float):
        """The learning rate to set on the optimizer.

        """
        self.executor.model_settings.learning_rate = learning_rate

    @property
    def cache_batches(self) -> bool:
        """The cache_batches for the entire network.

        """
        return self.model_settings.cache_batches

    @cache_batches.setter
    def cache_batches(self, cache_batches: bool):
        """The cache_batches for the entire network.

        """
        # if the caching strategy changed, be safe and deallocate and purge to
        # lazy recreate everything
        if self.model_settings.cache_batches != cache_batches:
            self.clear()
        self.model_settings.cache_batches = cache_batches

    def clear(self):
        """Clear out any cached executor.

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info('clearing')
        executor = self.executor
        config_factory = self.config_factory
        executor.deallocate()
        config_factory.deallocate()
        self._executor.clear()
        self._config_factory.clear()

    def reload(self):
        """Clears all state and reloads the configuration.

        """
        self.clear()
        self.config.reload()

    def deallocate(self):
        super().deallocate()
        self.SINGLETONS.pop(str(self.__class__), None)

    @classmethod
    def load_from_path(cls, path: Path, *args, **kwargs) -> ModelFacade:
        """Construct a new facade from the data saved in a persisted model file.  This
        uses the :py:meth:`.ModelManager.load_from_path` to reconstruct the
        returned facade, which means some attributes take their default values
        if not given in ``*args`` or ``**kwargs``.

        Arguments:
           Passed through to the initializer of invoking class ``cls``.

        :return: a new instance of a :class:`.ModelFacade`

        :see: :meth:`.ModelManager.load_from_path`

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'loading facade from {path}')
        mm = ModelManager.load_from_path(path)
        if 'executor_name' not in kwargs:
            kwargs['executor_name'] = mm.model_executor_name
        executor = mm.load_executor()
        executor.model_settings.path = path
        mm.config_factory.deallocate()
        facade: ModelFacade = cls(executor.config, *args, **kwargs)
        facade._config_factory.set(executor.config_factory)
        facade._executor.set(executor)
        return facade

    def debug(self, debug_value: Union[bool, int] = True):
        """Debug the model by setting the configuration to debug mode and invoking a
        single forward pass.  Logging must be configured properly to get the
        output, which is typically just invoking
        :py:meth:`logging.basicConfig`.

        :param debug_value: ``True`` turns on executor debugging; if an
                            ``int``, the higher the value, the more the logging

        """
        executor = self.executor
        self._configure_debug_logging()
        executor.debug = debug_value
        executor.progress_bar = False
        executor.model_settings.batch_limit = 1
        self.debuged = True
        executor.train()

    def persist_result(self):
        """Save the last recorded result during an :py:meth:`.Executor.train` or
        :py:meth:`.Executor.test` invocation to disk.  Optionally also save a
        plotted graphics file to disk as well when :obj:`persist_plot_result`
        is set to ``True``.

        Note that in Jupyter notebooks, this method has the side effect of
        plotting the results in the cell when ``persist_plot_result`` is
        ``True``.

        """
        executor = self.executor
        rmng: ModelResultManager = self.result_manager
        if executor.result_manager is not None:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'dumping model result: {executor.model_result}')
            rmng.dump(executor.model_result)

    def train(self, description: str = None) -> ModelResult:
        """Train and test or just debug the model depending on the configuration.

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        """
        executor = self.executor
        executor.reset()
        logger.info('training...')
        self._notify('train_start', description)
        with time('trained'):
            res = executor.train(description)
        self._notify('train_end', description)
        return res

    def test(self, description: str = None) -> ModelResult:
        """Load the model from disk and test it.

        """
        if self.debuged:
            raise ModelError('Testing is not allowed in debug mode')
        executor = self.executor
        executor.load()
        logger.info('testing...')
        self._notify('test_start', description)
        with time('tested'):
            res = executor.test(description)
        if self.writer is not None:
            res.write(writer=self.writer)
        self._notify('test_end', description)
        return res

    def train_production(self, description: str = None) -> ModelResult:
        """Train on the training and test data sets, then test

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        """
        executor = self.executor
        executor.reset()
        if self.writer is not None:
            executor.write(writer=self.writer)
        logger.info('training...')
        self._notify('train_production_start', description)
        with time('trained'):
            res = executor.train_production(description)
        self._notify('train_production_end', description)
        return res

    def predict(self, datas: Iterable[Any]) -> Any:
        """Make ad-hoc predictions on batches without labels, and return the results.

        :param datas: the data to predict on, each as a separate element
                      becoming a data point in a batch

        """
        executor: ModelExecutor = self.executor
        ms: ModelSettings = self.model_settings
        if ms.prediction_mapper_name is None:
            raise ModelError(
                f'The model settings ({ms.name}) is not configured to create '
                + "prediction batches: no set 'prediction_mapper'")
        pm: PredictionMapper = self.config_factory.new_instance(
            ms.prediction_mapper_name, datas, self.batch_stash)
        self._notify('predict_start')
        try:
            batches: List[Batch] = pm.batches
            if not executor.model_exists:
                executor.load()
            logger.info('predicting...')
            with time('predicted'):
                res: ModelResult = executor.predict(batches)
            eres: EpochResult = res.results[0]
            ret: Any = pm.map_results(eres)
        finally:
            self._notify('predict_end')
            pm.deallocate()
        return ret

    def stop_training(self):
        """Early stop training if the model is currently training.  This invokes the
        :meth:`.TrainManager.stop`, communicates to the training process to
        stop on the next check.

        :return: ``True`` if the application is configured to early stop and
                 the signal has not already been given

        """
        self._notify('stop_training')
        return self.executor.train_manager.stop()

    @property
    def last_result(self) -> ModelResult:
        """The last recorded result during an :meth:`.ModelExecutor.train` or
        :meth:`.ModelExecutor.test` invocation is used.

        """
        res = self.executor.model_result
        if res is None:
            rm: ModelResultManager = self.result_manager
            res = rm.load()
            if res is None:
                raise ModelError('No results found')
        return res

    def write_result(self,
                     depth: int = 0,
                     writer: TextIOBase = sys.stdout,
                     include_settings: bool = False,
                     include_converged: bool = False,
                     include_config: bool = False):
        """Load the last set of results from the file system and print them out.  The
        result to print is taken from :obj:`last_result`

        :param depth: the number of indentation levels

        :param writer: the data sink

        :param include_settings: whether or not to include model and network
                                 settings in the output

        :param include_converged: whether or not to include the converged
                                  epoch information in the output

        :param include_config: whether or not to include the configuration in
                               the output

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info('load previous results')
        res = self.last_result
        res.write(depth,
                  writer,
                  include_settings=include_settings,
                  include_converged=include_converged,
                  include_config=include_config)

    def plot_result(self,
                    result: ModelResult = None,
                    save: bool = False,
                    show: bool = False) -> ModelResult:
        """Plot results and optionally save and show them.  If this is called in a
        Jupyter notebook, the plot will be rendered in a cell.

        :param result: the result to plot, or if ``None``, use
                       :obj:`last_result`

        :param save: if ``True``, save the plot to the results directory with
                     the same naming as the last data results

        :param show: if ``True``, invoke ``matplotlib``'s ``show`` function to
                     visualize in a non-Jupyter environment

        :return: the result used to graph, which comes from the executor when
                 none is given to the invocation

        """
        result = self.last_result if result is None else result
        grapher = self.executor.result_manager.get_grapher()
        grapher.plot([result])
        if save:
            grapher.save()
        if show:
            grapher.show()
        return result
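
    # For example, in a notebook cell (a sketch; the plot renders inline):
    #
    #   facade.plot_result(save=True)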

    def get_predictions_factory(self, column_names: List[str] = None,
                                transform: Callable[[DataPoint], tuple] = None,
                                batch_limit: int = sys.maxsize,
                                name: str = None) \
            -> PredictionsDataFrameFactory:
        """Generate a predictions factoty from the test data set.

        :param column_names: the list of string column names, one for each
                             data item in the tuple returned from
                             ``transform``, to be added to the results for
                             each label/prediction

        :param transform:

            a function that returns a tuple, each with an element respective of
            ``column_names`` to be added to the results for each
            label/prediction; if ``None`` (the default), ``str`` is used (see
            the
            `Iris Jupyter Notebook
            <https://github.com/plandes/deeplearn/blob/master/notebook/iris.ipynb>`_
            example)

        :param batch_limit: the maximum number of batches of results to output

        :param name: the name/ID (name of the file sans extension in the
                     results directory) of the previously archived results to
                     fetch, or ``None`` to get the last result

        """
        rm: ModelResultManager = self.result_manager
        res: ModelResult
        if name is None:
            res = self.last_result
            key: str = rm.get_last_key(False)
        else:
            res = rm.results_stash[name].model_result
            key: str = name
        if res is None:
            raise ModelError(f'No test results found: {name}')
        if not res.test.contains_results:
            raise ModelError('No test results found')
        path: Path = rm.key_to_path(key)
        return self.predictions_datafrmae_factory_class(
            path, res, self.batch_stash, column_names, transform, batch_limit)
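
    # A sketch with a custom transform (the data point attribute ``doc`` is
    # hypothetical):
    #
    #   fac = facade.get_predictions_factory(
    #       column_names=['text'],
    #       transform=lambda dp: (dp.doc.text,))
    #   df = fac.dataframe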

    def get_predictions(self, *args, **kwargs) -> pd.DataFrame:
        """Generate a Pandas dataframe containing all predictions from the test data
        set.  This method is meant to be overridden by application-specific
        facades to customize prediction output.

        :see: :meth:`get_predictions_factory`

        :param args: arguments passed to :meth:`get_predictions_factory`

        :param kwargs: arguments passed to :meth:`get_predictions_factory`

        """
        df_fac = self.get_predictions_factory(*args, **kwargs)
        return df_fac.dataframe

    def write_predictions(self, lines: int = 10):
        """Print the predictions made during the test phase of the model execution.

        :param lines: the number of lines of the predictions data frame to be
                      printed

        :param writer: the data sink

        """
        preds = self.get_predictions()
        print(preds.head(lines), file=self.writer)

    def get_result_analyzer(self, key: str = None,
                            cache_previous_results: bool = False) \
            -> ResultAnalyzer:
        """Return a results analyzer for comparing in flight training progress.

        """
        rm: ModelResultManager = self.result_manager
        if key is None:
            key = rm.get_last_key()
        return ResultAnalyzer(self.executor, key, cache_previous_results)
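
    # Sketch (hypothetical usage): compare the current training run against
    # previously archived results:
    #
    #   analyzer = facade.get_result_analyzer()
    #   analyzer.write()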

    @property
    def class_explorer(self) -> FacadeClassExplorer:
        return self._create_facade_explorer()

    def _create_facade_explorer(self) -> FacadeClassExplorer:
        """Return a facade explorer used to print the facade's object graph.

        """
        return FacadeClassExplorer()

    def write(self,
              depth: int = 0,
              writer: TextIOBase = None,
              include_executor: bool = True,
              include_metadata: bool = True,
              include_settings: bool = True,
              include_model: bool = True,
              include_config: bool = False,
              include_object_graph: bool = False):
        writer = self.writer if writer is None else writer
        writer = sys.stdout if writer is None else writer
        bmeta = None
        try:
            bmeta = self.batch_metadata
        except AttributeError:
            pass
        if include_executor:
            self._write_line(f'{self.executor.name}:', depth, writer)
            self.executor.write(depth + 1,
                                writer,
                                include_settings=include_settings,
                                include_model=include_model)
        if include_metadata and bmeta is not None:
            self._write_line('metadata:', depth, writer)
            bmeta.write(depth + 1, writer)
        if include_object_graph:
            self._write_line('graph:', depth, writer)
            ce = self._create_facade_explorer()
            ce.write(self, depth=depth + 1, writer=writer)
        if include_config:
            self._write_line('config:', depth, writer)
            self.config.write(depth + 1, writer)

    def _deallocate_config_instance(self, inst: Any):
        if isinstance(self.config_factory, ImportConfigFactory):
            inst = self.config_factory.clear_instance(inst)
        dealloc = isinstance(inst, Deallocatable)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'deallocate {inst}: {type(inst)}: {dealloc}')
        if dealloc:
            inst.deallocate()

    def _configure_debug_logging(self):
        """When debuging the model, configure the logging system for output.  The
        correct loggers need to be set to debug mode to print the model
        debugging information such as matrix shapes.

        """
        for name in ['zensols.deeplearn.model', __name__]:
            logging.getLogger(name).setLevel(logging.DEBUG)

    def _configure_cli_logging(self, info_loggers: List[str],
                               debug_loggers: List[str]):
        info_loggers.extend([
            # multi-process (i.e. batch creation)
            'zensols.multi.stash',
            'zensols.deeplearn.batch.multi',
            # validation/training loss messages
            'zensols.deeplearn.model.executor.status',
            __name__
        ])
        if not self.progress_bar:
            info_loggers.extend([
                # load messages
                'zensols.deeplearn.batch.stash',
                # save results messages
                'zensols.deeplearn.result',
                # validation/training loss messages
                'zensols.deeplearn.model.executor.progress',
                # model save/load
                'zensols.deeplearn.model.manager',
                # early stop messages
                'zensols.deeplearn.model.trainmng',
                # performance metrics formatting
                'zensols.deeplearn.model.format',
                # model save messages
                'zensols.deeplearn.result.manager',
                # observer module API messages
                'zensols.deeplearn.observer.status',
                #'zensols.deeplearn.observer.event',
                # CLI interface
                'zensols.deeplearn.cli.app'
            ])

    @staticmethod
    def configure_default_cli_logging(log_level: int = logging.WARNING):
        """Configure the logging system with the defaults.

        """
        fmt = '%(asctime)s[%(levelname)s]%(name)s: %(message)s'
        logging.basicConfig(format=fmt, level=log_level)

    def configure_cli_logging(self, log_level: int = None):
        """"Configure command line (or Python REPL) debugging.  Each facade can turn on
        name spaces that make sense as useful information output for long
        running training/testing iterations.

        This calls "meth:`_configure_cli_logging` to collect the names of
        loggers at various levels.

        """
        info = []
        debug = []
        if log_level is not None:
            self.configure_default_cli_logging(log_level)
        self._configure_cli_logging(info, debug)
        for name in info:
            logging.getLogger(name).setLevel(logging.INFO)
        for name in debug:
            logging.getLogger(name).setLevel(logging.DEBUG)
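
    # Sketch: turn on informative logging from a REPL before training:
    #
    #   facade.configure_cli_logging(logging.INFO)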

    def configure_jupyter(self,
                          log_level: int = logging.WARNING,
                          progress_bar_cols: int = 120):
        """Configures logging and other configuration related to a Jupyter notebook.
        This is just like :py:meth:`configure_cli_logging`, but adjusts logging
        for what is conducive for reporting in Jupyter cells.

        ;param log_level: the default logging level for the logging system

        :param progress_bar_cols: the number of columns to use for the progress
                                  bar

        """
        self.configure_cli_logging(log_level)
        for name in [
                # turn off loading messages
                'zensols.deeplearn.batch.stash',
                # turn off model save messages
                'zensols.deeplearn.result.manager'
        ]:
            logging.getLogger(name).setLevel(logging.WARNING)
        # number of columns for the progress bar
        self.executor.progress_bar_cols = progress_bar_cols
        # turn off console output (non-logging)
        self.writer = None
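
    # Typically the first cell of a notebook would contain (a sketch, with a
    # hypothetical application-specific facade subclass):
    #
    #   facade = SomeAppModelFacade(config)
    #   facade.configure_jupyter()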

    @staticmethod
    def get_encode_sparse_matrices() -> bool:
        """Return whether or not sparse matricies are encoded.

        :see: :meth:`set_sparse`

        """
        return SparseTensorFeatureContext.USE_SPARSE

    @staticmethod
    def set_encode_sparse_matrices(use_sparse: bool = False):
        """If called before batches are created, encode all tensors the would be
        encoded as dense rather than sparse when ``use_sparse`` is ``False``.
        Oherwise, tensors will be encoded as sparse where it makes sense on a
        per vectorizer basis.

        """
        SparseTensorFeatureContext.USE_SPARSE = use_sparse
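
# A minimal end-to-end sketch tying the facade API together.  The config file
# path and the 'facade' section name are hypothetical, and ``ModelFacade``
# stands in for an application-specific subclass:
if __name__ == '__main__':
    from zensols.config import ImportIniConfig, ImportConfigFactory
    # encode dense tensors before any batches are created (optional)
    ModelFacade.set_encode_sparse_matrices(False)
    config = ImportIniConfig('models/config.conf')
    facade: ModelFacade = ImportConfigFactory(config).instance('facade')
    facade.configure_cli_logging()
    facade.train()
    facade.test()
    facade.write_result()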
Exemplo n.º 21
0
class FacadeApplication(Deallocatable):
    """Base class for applications that use :class:`.ModelFacade`.

    """
    CLI_META = {'mnemonic_excludes': {'get_cached_facade', 'create_facade',
                                      'deallocate', 'clear_cached_facade'},
                'option_overrides': {'model_path': {'long_name': 'model',
                                                    'short_name': None}}}
    """Tell the command line app API to igonore subclass and client specific use
    case methods.

    """
    config: Configurable = field()
    """The config used to create facade instances."""

    facade_name: str = field(default='facade')
    """The client facade."""

    # simply copy this field and documentation to the implementation class to
    # add the model path location (for those subclasses that don't have the
    # ``CLASS_INSPECTOR`` class level attribute set; see
    # :obj:`~zensols.util.introspect.inspect.ClassInspector.INSPECT_META`);
    # this can also be set as a parameter such as with
    # :meth:`.FacadeModelApplication.test`
    model_path: Path = field(default=None)
    """The path to the model or use the last trained model if not provided.

    """
    config_factory_args: Dict[str, Any] = field(default_factory=dict)
    """The arguments given to the :class:`~zensols.config.ImportConfigFactory`,
    which could be useful for reloading all classes while debugingg.

    """
    config_overwrites: Configurable = field(default=None)
    """A configurable that clobbers any configuration in :obj:`config` for those
    sections/options set.

    """
    def __post_init__(self):
        self.dealloc_resources = []
        self._cached_facade = PersistedWork('_cached_facade', self, True)

    def _enable_cli_logging(self, facade: ModelFacade):
        facade.progress_bar = False
        facade.configure_cli_logging()

    def create_facade(self) -> ModelFacade:
        """Create a new instance of the facade."""
        # we must create a new (non-shared) instance of the facade since it
        # will get deallocated when complete
        config = self.config
        model_path = self.model_path
        if self.config_overwrites is not None:
            config = cp.deepcopy(config)
            config.merge(self.config_overwrites)
        if model_path is None:
            cf = ImportConfigFactory(config, **self.config_factory_args)
            facade: ModelFacade = cf.instance(self.facade_name)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created facade: {facade}')
            self.dealloc_resources.extend((cf, facade))
        else:
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'loading model from {model_path}')
            with dealloc(ImportConfigFactory(
                    config, **self.config_factory_args)) as cf:
                cls: Type[ModelFacade] = cf.get_class(self.facade_name)
            facade: ModelFacade = cls.load_from_path(model_path)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created facade: {type(facade)} ' +
                             f'from path: {model_path}')
            self.dealloc_resources.append(facade)
        return facade

    @persisted('_cached_facade')
    def get_cached_facade(self, path: Path = None) -> ModelFacade:
        """Return a created facade that is cached in this application instance.

        """
        return self.create_facade()

    def clear_cached_facade(self):
        """Clear any cached facade this application instance.

        """
        if self._cached_facade.is_set():
            self._cached_facade().deallocate()
        self._cached_facade.clear()

    def deallocate(self):
        super().deallocate()
        self._try_deallocate(self.dealloc_resources, recursive=True)
        self._cached_facade.deallocate()
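
# A sketch of a client application built on this base class (the subclass
# name and its ``train`` action are illustrative, not part of the API):
from dataclasses import dataclass

@dataclass
class ReviewApplication(FacadeApplication):
    """Hypothetical CLI application that defers all work to the facade."""

    def train(self):
        # create a throw-away facade, enable logging, then train the model
        with dealloc(self.create_facade()) as facade:
            self._enable_cli_logging(facade)
            facade.train()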
Exemplo n.º 22
0
 def __post_init__(self):
     self.dealloc_resources = []
     self._cached_facade = PersistedWork('_cached_facade', self, True)