def __post_init__(self):
    super().__post_init__()
    if self.encode_transformed and self.embed_model.trainable:
        # once the transformer last hidden state is dumped during encode
        # the parameters are lost, which are needed to train the model
        # properly
        raise VectorizerError('a trainable model can not encode ' +
                              'transformed vectorized features')
def _validate(self):
    if not self._validated:
        for vec in self.delegates:
            if hasattr(vec, 'feature_type') and \
               vec.feature_type != TextFeatureType.TOKEN:
                raise VectorizerError('Only token level vectorizers are ' +
                                      f'supported, but got {vec}')
        self._validated = True
def _assert_decoded_doc_dim(self, arr: Tensor, expect: int):
    """Check the decoded document dimension and raise an error for those
    that do not match.
    """
    if len(arr.size()) != expect:
        raise VectorizerError(f'Expecting {expect} tensor dimensions, ' +
                              f'but got shape: {arr.shape}')
def _combine_documents(self, docs: Tuple[FeatureDocument]) -> \
        FeatureDocument:
    if self.fold_method == 'raise' and len(docs) > 1:
        raise VectorizerError(
            f'Configured to support single document but got {len(docs)}')
    concat_tokens = self.fold_method == 'concat_tokens'
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'fold method: {self.fold_method}, ' +
                     f'concat_tokens={concat_tokens}')
    return FeatureDocument.combine_documents(
        docs, concat_tokens=concat_tokens)
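# A minimal standalone sketch (plain lists rather than FeatureDocument) of the
# two folding behaviors selected above: 'concat_tokens' flattens every
# sentence of every document into a single token sequence, while sentence
# level folding keeps one entry per sentence.  The documents below are
# hypothetical.
doc1 = [['the', 'cat'], ['sat']]
doc2 = [['it', 'slept']]
docs = (doc1, doc2)

sents = [s for d in docs for s in d]              # sentence folding
concat = [t for d in docs for s in d for t in s]  # 'concat_tokens' folding
assert sents == [['the', 'cat'], ['sat'], ['it', 'slept']]
assert concat == ['the', 'cat', 'sat', 'it', 'slept']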
def _assert_doc(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]):
    """Raise an error if any input is not a :class:`.FeatureDocument`.

    :raises: :class:`.VectorizerError` if any input isn't a document
    """
    if self._is_mult(doc):
        docs = doc
        for doc in docs:
            self._assert_doc(doc)
    elif not isinstance(doc, FeatureDocument):
        raise VectorizerError(
            f'Expecting document, but got type: {type(doc)}')
def __post_init__(self):
    super().__post_init__()
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('creating fd vec manager')
    if self.token_feature_ids is None:
        self.token_feature_ids = self.doc_parser.token_feature_ids
    else:
        feat_diff = self.token_feature_ids - \
            self.doc_parser.token_feature_ids
        if len(feat_diff) > 0:
            fdiffs = ', '.join(feat_diff)
            raise VectorizerError(
                'Parser token features do not exist in vectorizer: ' +
                f'{self.token_feature_ids} - ' +
                f'{self.doc_parser.token_feature_ids} = {fdiffs}')
    self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)
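# A minimal standalone sketch of the set difference check above: any token
# feature id requested by the vectorizer that the parser does not produce
# triggers the error.  The feature id sets below are hypothetical.
parser_feats = {'ent_', 'tag_', 'dep_'}
requested = {'ent_', 'norm'}
feat_diff = requested - parser_feats
assert feat_diff == {'norm'}   # 'norm' would raise the VectorizerError above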
def encode(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]) -> \
        FeatureContext:
    ctx: FeatureContext
    if self.fold_method == 'concat_tokens' or \
       self.fold_method == 'sentence':
        ctx = super().encode(doc)
    elif self.fold_method == 'separate':
        self._assert_doc(doc)
        ctx = self._encode_sentences(doc)
    elif self.fold_method == 'raise':
        if self._is_mult(doc):
            raise VectorizerError(
                f'Expecting single document but got: {len(doc)} documents')
        ctx = super().encode(doc)
    return ctx
def get_flattened_features_shape(self, attribs: Set[str]) -> Tuple[int]:
    """Return the flattened (single dimension) shape of all non-label
    features, optionally restricted to the attributes given in ``attribs``.
    """
    bmapping = self.batch_feature_mapping
    label_feature_id = bmapping.label_feature_id
    n_flat_neurons = 0
    for feature_id, v in self.items():
        _, field_map = bmapping.get_field_map_by_feature_id(feature_id)
        if field_map is None:
            s = f'no feature: {feature_id} in vectorizer {self.name}'
            raise VectorizerError(s)
        attr = field_map.attr
        if feature_id != label_feature_id and \
           (attribs is None or attr in attribs):
            n = reduce(operator.mul, filter(lambda n: n > 0, v.shape))
            n_flat_neurons += n
    return (n_flat_neurons,)
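# A minimal standalone sketch (not part of the library) of the shape
# flattening arithmetic above: variable length dimensions are conventionally
# encoded as -1 and are filtered out before multiplying the remaining sizes.
# The shape below is hypothetical.
from functools import reduce
import operator

def flattened_size(shape):
    """Multiply the positive dimensions of a vectorizer shape."""
    return reduce(operator.mul, filter(lambda n: n > 0, shape))

# a (-1, 30, 5) shape (batch, tokens, features) contributes 150 flat neurons
assert flattened_size((-1, 30, 5)) == 150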
def _slice_by_attributes(self, arr: Tensor) -> Tensor:
    """Create a new tensor from column based slices of the encoded tensor
    for each specified feature id given in :obj:`decoded_feature_ids`.
    """
    keeps = set(self.decoded_feature_ids)
    col_start = 0
    tensors = []
    for fvec in self.manager.spacy_vectorizers.values():
        col_end = col_start + fvec.shape[1]
        fid = fvec.feature_id
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'type={fid}, to keep={keeps}')
        if fid in keeps:
            tensors.append(arr[:, :, col_start:col_end])
            keeps.remove(fid)
        col_start = col_end
    if len(keeps) > 0:
        raise VectorizerError(f'Unknown feature type IDs: {keeps}')
    sarr = torch.cat(tensors, dim=2)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'slice dim: {sarr.shape}')
    return sarr
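# A minimal standalone sketch (not the library API) of the column slicing
# pattern above: each feature vectorizer owns a contiguous range of columns
# in the last dimension, and only the requested ranges are kept and
# re-concatenated.  The widths and feature ids below are hypothetical.
import torch

arr = torch.arange(2 * 4 * 5, dtype=torch.float32).reshape(2, 4, 5)
widths = {'ent': 3, 'tag': 2}   # feature id -> column width
keeps = {'tag'}                 # analogous to decoded_feature_ids

tensors, col_start = [], 0
for fid, width in widths.items():
    col_end = col_start + width
    if fid in keeps:
        tensors.append(arr[:, :, col_start:col_end])
    col_start = col_end
sliced = torch.cat(tensors, dim=2)
assert sliced.shape == (2, 4, 2)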
def _encode(self, doc: FeatureDocument) -> FeatureContext:
    slen = len(doc)
    tlen = self.manager.get_token_length(doc)
    attr = self.feature_attribute
    arr = self.torch_config.zeros((slen, tlen, self.shape[2]))
    doc_val = getattr(doc, attr) if self.level == 'document' else None
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'vectorizing: {attr} for token length: {tlen} ' +
                     f'into {arr.shape}')
    for six, sent in enumerate(doc.sents):
        if self.level == 'document':
            feats = [doc_val] * len(sent)
        elif self.level == 'sentence':
            sent_val = getattr(sent, attr)
            feats = [sent_val] * len(sent)
        elif self.level == 'token':
            feats = tuple(map(lambda s: getattr(s, attr), sent))
        else:
            raise VectorizerError(f'Unknown doc level: {self.level}')
        self._encode_cats(feats, arr[six])
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'vectorized: {len(doc)} sents into {arr.shape}')
    return SparseTensorFeatureContext.instance(
        self.feature_id, arr, self.torch_config)
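# A minimal standalone sketch of filling a preallocated (token x category)
# zero tensor with one hot rows, which is what ``_encode_cats`` is assumed to
# do for the per-token feature values gathered above.  The category set and
# token values below are hypothetical.
import torch

cats = ['NOUN', 'VERB', 'ADJ']
cat_to_idx = {c: i for i, c in enumerate(cats)}
sent_feats = ['NOUN', 'VERB', 'NOUN']      # one nominal value per token
arr = torch.zeros((len(sent_feats), len(cats)))
for tix, feat in enumerate(sent_feats):
    arr[tix, cat_to_idx[feat]] = 1
assert arr[1, cat_to_idx['VERB']] == 1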
def _assert_token_output(self, expected: str = 'last_hidden_state'):
    if self.embed_model.output != expected:
        raise VectorizerError(f"""\
Expanders only work at the token level, so output such as \
`{expected}`, which provides an output for each token in the \
transformer embedding, is required, got: {self.embed_model.output}""")
def __post_init__(self):
    super().__post_init__()
    if self.delegate_feature_id is None:
        raise VectorizerError('Expected attribute: delegate_feature_id')
    self._assert_token_output()
def __post_init__(self):
    super().__post_init__()
    if self.fold_method not in self._FOLD_METHODS:
        raise VectorizerError(f'No such fold method: {self.fold_method}')