def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]:
    """Take a message and convert it to crfsuite format."""
    crf_format = []
    tokens = train_utils.tokens_without_cls(message)

    text_dense_features = self._get_dense_features(message)
    tags = self._get_tags(message)

    for i, token in enumerate(tokens):
        pattern = self._pattern_of_token(message, i)
        entity = self.get_tag_for(tags, ENTITY_ATTRIBUTE_TYPE, i)
        group = self.get_tag_for(tags, ENTITY_ATTRIBUTE_GROUP, i)
        role = self.get_tag_for(tags, ENTITY_ATTRIBUTE_ROLE, i)
        pos_tag = token.get(POS_TAG_KEY)
        dense_features = (
            text_dense_features[i] if text_dense_features is not None else []
        )

        crf_format.append(
            CRFToken(
                text=token.text,
                pos_tag=pos_tag,
                entity_tag=entity,
                entity_group_tag=group,
                entity_role_tag=role,
                pattern=pattern,
                dense_features=dense_features,
            )
        )

    return crf_format
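# Hedged illustration (not from the source): one CRFToken as constructed
# above, for the token "San" inside a hypothetical "city" entity tagged
# with the BILOU scheme. All field values are made up for readability.
illustrative_token = CRFToken(
    text="San",
    pos_tag="PROPN",
    entity_tag="B-city",
    entity_group_tag="O",
    entity_role_tag="O",
    pattern={},
    dense_features=[],
)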
def apply_bilou_schema(training_data: TrainingData, include_cls_token: bool = True) -> None:
    """Get a list of BILOU entity tags and set them on the given messages.

    Args:
        training_data: the training data
        include_cls_token: whether to keep the __CLS__ token in the
            token sequence that gets tagged
    """
    for message in training_data.training_examples:
        entities = message.get(ENTITIES)

        if not entities:
            continue

        tokens = message.get(TOKENS_NAMES[TEXT])
        if not include_cls_token:
            tokens = train_utils.tokens_without_cls(message)

        for attribute, message_key in [
            (ENTITY_ATTRIBUTE_TYPE, BILOU_ENTITIES),
            (ENTITY_ATTRIBUTE_ROLE, BILOU_ENTITIES_ROLE),
            (ENTITY_ATTRIBUTE_GROUP, BILOU_ENTITIES_GROUP),
        ]:
            entities = map_message_entities(message, attribute)
            output = bilou_tags_from_offsets(tokens, entities)
            message.set(message_key, output)
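# Hedged illustration of the BILOU scheme produced by bilou_tags_from_offsets:
# B-/I-/L- mark the beginning, inside, and last token of a multi-token entity,
# U- marks a single-token entity, and O a token outside any entity.
# The sentence and the "city" entity type are made up.
tokens = ["book", "a", "flight", "to", "San", "Francisco"]
bilou_tags = ["O", "O", "O", "O", "B-city", "L-city"]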
def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
    """Extract entities from the given message using the trained model(s)."""
    if self.entity_taggers is None:
        return []

    tokens = train_utils.tokens_without_cls(message)
    crf_tokens = self._convert_to_crf_tokens(message)

    predictions = {}
    for tag_name, entity_tagger in self.entity_taggers.items():
        # use predicted entity tags as features for second level CRFs
        include_tag_features = tag_name != ENTITY_ATTRIBUTE_TYPE
        if include_tag_features:
            self._add_tag_to_crf_token(crf_tokens, predictions)

        features = self._crf_tokens_to_features(crf_tokens, include_tag_features)
        predictions[tag_name] = entity_tagger.predict_marginals_single(features)

    # convert predictions into a list of tags and a list of confidences
    tags, confidences = self._tag_confidences(tokens, predictions)

    return self.convert_predictions_into_entities(
        message.text, tokens, tags, confidences
    )
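# Hedged illustration of what predict_marginals_single (sklearn-crfsuite)
# returns per tagger, as stored in `predictions` above: one dict of
# tag -> marginal probability per token. The tag name "entity" and the
# probabilities below are made up.
predictions = {
    "entity": [
        {"O": 0.91, "U-city": 0.09},  # token 0
        {"O": 0.12, "U-city": 0.88},  # token 1
    ]
}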
def set_gensim_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])

    if not tokens:
        return None

    # If a token is not in the vocabulary, featurize it with an array of zeros
    word_vectors = [
        self.kv[t.text] if t.text in self.kv else np.zeros(self.kv.vector_size)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    # Sum up all the word vectors so that we have one for __CLS__
    text_vector = reduce(lambda a, b: a + b, word_vectors)
    X = np.array(word_vectors + [text_vector])  # remember, we need one for __CLS__

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)
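# Hedged sketch of how self.kv could be initialised for the featurizer above;
# the vector file name is an assumption, any word2vec-format file loadable by
# gensim works.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("wiki.en.vec", binary=False)
dim = kv.vector_size        # dimensionality used for the zero-vector fallback
has_word = "flight" in kv   # the per-token membership test used above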
def _compute_sequence_encodings(
    self, batch_examples: List[Message], module: Any, attribute: Text = TEXT
) -> Tuple[np.ndarray, List[int]]:
    list_of_tokens = [
        train_utils.tokens_without_cls(example, attribute)
        for example in batch_examples
    ]

    number_of_tokens_in_sentence = [
        len(sent_tokens) for sent_tokens in list_of_tokens
    ]

    # join the tokens to get a clean text to ensure the sequence length of
    # the returned embeddings from ConveRT matches the length of the tokens
    # (including sub-tokens)
    tokenized_texts = self._tokens_to_text(list_of_tokens)
    token_features = self._sequence_encoding_of_text(tokenized_texts, module)

    # ConveRT might split up tokens into sub-tokens
    # take the mean of the sub-token vectors and use that as the token vector
    token_features = train_utils.align_token_features(
        list_of_tokens, token_features
    )

    return token_features, number_of_tokens_in_sentence
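# Hedged numpy sketch of the alignment idea mentioned above: when ConveRT
# splits one token into sub-tokens, the token vector is the mean of the
# sub-token vectors. Shapes and values are made up.
import numpy as np

sub_token_vectors = np.array([[1.0, 2.0], [3.0, 4.0]])  # two sub-tokens
token_vector = sub_token_vectors.mean(axis=0)           # array([2.0, 3.0])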
def _prepare_mitie_sample(training_example: Message) -> Any:
    import mitie

    text = training_example.text
    tokens = train_utils.tokens_without_cls(training_example)
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get(ENTITIES, []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
        except ValueError as e:
            raise_warning(
                f"Failed to use example '{text}' to train MITIE "
                f"entity extractor. Example will be skipped. "
                f"Error: {e}"
            )
            continue
        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            raise_warning(
                f"Failed to add entity example "
                f"'{str(e)}' of sentence '{str(text)}'. "
                f"Example will be ignored. Reason: "
                f"{e}"
            )
            continue

    return sample
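# Hedged illustration of the indices handed to MITIE above: for tokens
# ["flight", "to", "San", "Francisco"] and an entity spanning
# "San Francisco", find_entity yields the token indices start=2, end=4,
# so sample.add_entity receives list(range(2, 4)) == [2, 3].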
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
    tokens = train_utils.tokens_without_cls(message)
    features = self.features_for_tokens(tokens, mitie_feature_extractor)

    final_features = Features(
        features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS]
    )
    message.add_features(final_features)
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
    tokens = train_utils.tokens_without_cls(message)
    features = self.features_for_tokens(tokens, mitie_feature_extractor)
    message.set(
        DENSE_FEATURE_NAMES[TEXT],
        self._combine_with_existing_dense_features(
            message, features, DENSE_FEATURE_NAMES[TEXT]
        ),
    )
def process_training_example(
    self, example: Message, attribute: Text, mitie_feature_extractor: Any
):
    tokens = train_utils.tokens_without_cls(example, attribute)

    if tokens is not None:
        features = self.features_for_tokens(tokens, mitie_feature_extractor)
        example.set(
            DENSE_FEATURE_NAMES[attribute],
            self._combine_with_existing_dense_features(
                example, features, DENSE_FEATURE_NAMES[attribute]
            ),
        )
def process_training_example(
    self, example: Message, attribute: Text, mitie_feature_extractor: Any
):
    tokens = train_utils.tokens_without_cls(example, attribute)

    if tokens is not None:
        features = self.features_for_tokens(tokens, mitie_feature_extractor)

        final_features = Features(
            features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS]
        )
        example.add_features(final_features)
def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])

    if not tokens:
        return None

    text_vector = self.model.get_word_vector(message.text)
    word_vectors = [
        self.model.get_word_vector(t.text)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    X = np.array(word_vectors + [text_vector])  # remember, we need one for __CLS__

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)
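# Hedged sketch of how self.model could be initialised for the featurizer
# above; the model file name is an assumption, any fastText .bin model works.
import fasttext

model = fasttext.load_model("cc.en.300.bin")
vec = model.get_word_vector("flight")  # dense numpy array, e.g. 300-dim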
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception(
            "Failed to train 'MitieFeaturizer'. "
            "Missing a proper MITIE feature extractor."
        )

    ents = self.extract_entities(
        message.text,
        train_utils.tokens_without_cls(message),
        mitie_feature_extractor,
    )
    extracted = self.add_extractor_name(ents)
    extracted = self.clean_up_entities(message, extracted)
    message.set(
        ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True
    )
def set_bpemb_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])

    if not tokens:
        return None

    text_vector = self.create_word_vector(document=message.text)
    word_vectors = [
        self.create_word_vector(document=t.text)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    X = np.array(word_vectors + [text_vector])

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)
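# Hedged sketch of a BPEmb setup behind create_word_vector above; language,
# vocabulary size, and dimensionality are assumptions, and averaging the
# byte-pair vectors is one plausible way to get a single word vector.
from bpemb import BPEmb

bpemb = BPEmb(lang="en", vs=50000, dim=100)
subword_vectors = bpemb.embed("Francisco")  # one vector per byte-pair piece
word_vector = subword_vectors.mean(axis=0)  # collapse to a single vector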
def _create_feature_to_idx_dict(
    self, training_data: TrainingData
) -> Dict[Text, Dict[Text, int]]:
    """Create dictionary of all feature values.

    Each feature key, defined in the component configuration, points to
    different feature values and their indices in the overall resulting
    feature vector.
    """
    # get all possible feature values
    all_features = []
    for example in training_data.training_examples:
        tokens_without_cls = train_utils.tokens_without_cls(example)
        all_features.append(self._tokens_to_features(tokens_without_cls))

    # build vocabulary of features
    feature_vocabulary = self._build_feature_vocabulary(all_features)

    # assign a unique index to each feature value
    return self._map_features_to_indices(feature_vocabulary)
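# Hedged sketch of the final mapping step, assuming _build_feature_vocabulary
# returned the observed values per feature key; the real index assignment
# (e.g. globally unique indices across keys) may differ.
feature_vocabulary = {"prefix2": ["bo", "fl"], "suffix2": ["ht", "ok"]}
feature_to_idx = {
    key: {value: idx for idx, value in enumerate(sorted(values))}
    for key, values in feature_vocabulary.items()
}
# -> {"prefix2": {"bo": 0, "fl": 1}, "suffix2": {"ht": 0, "ok": 1}}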
def _get_tags(self, message: Message) -> Dict[Text, List[Text]]:
    """Get assigned entity tags of message."""
    tokens = train_utils.tokens_without_cls(message)
    tags = {}

    for tag_name in self.crf_order:
        if self.component_config[BILOU_FLAG]:
            bilou_key = bilou_utils.get_bilou_key_for_tag(tag_name)
            if message.get(bilou_key):
                _tags = message.get(bilou_key)
            else:
                _tags = [NO_ENTITY_TAG for _ in tokens]
        else:
            _tags = [
                determine_token_labels(
                    token, message.get(ENTITIES), attribute_key=tag_name
                )
                for token in tokens
            ]
        tags[tag_name] = _tags

    return tags
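# Hedged illustration of the returned structure, assuming BILOU tagging,
# crf_order == ["entity", "role", "group"], and a single "city" entity
# over the last two tokens:
# {
#     "entity": ["O", "O", "O", "O", "B-city", "L-city"],
#     "role":   ["O", "O", "O", "O", "O", "O"],
#     "group":  ["O", "O", "O", "O", "O", "O"],
# }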