def _combine_with_existing_features(
    message: Message,
    additional_features: Any,
    feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[
        MESSAGE_TEXT_ATTRIBUTE],
) -> Any:
    """Merge new features with the ones already stored on the message.

    Args:
        message: Object whose ``get(feature_name)`` yields the features
            stored so far, or ``None`` when there are none.
        additional_features: Features to append.
        feature_name: Key under which the existing features are looked up.

    Returns:
        ``additional_features`` unchanged when nothing is stored yet,
        otherwise the stored features and ``additional_features`` joined
        along the last axis.
    """
    existing = message.get(feature_name)

    # Nothing stored so far: the new features are the complete result.
    if existing is None:
        return additional_features

    return np.concatenate((existing, additional_features), axis=-1)
def __additional_ner_features(message: Message) -> List[Any]:
    """Return the message's custom NER features in crfsuite dict form.

    Reads ``"ner_features"`` and ``"tokens"`` from the message (both
    default to an empty list) and requires one feature entry per token.

    Args:
        message: Object providing ``get(key, default)``.

    Returns:
        One dict per feature entry, shaped as
        ``{"custom_ner_features": {"0": v0, "1": v1, ...}}`` where the
        string keys are the positions of the values inside that entry.

    Raises:
        Exception: If the number of feature entries differs from the
            number of tokens.
    """
    per_token_features = message.get("ner_features", [])
    token_list = message.get("tokens", [])

    token_count = len(token_list)
    feature_count = len(per_token_features)
    if token_count != feature_count:
        raise Exception(
            "Number of custom NER features ({}) does not match "
            "number of tokens ({})".format(feature_count, token_count))

    # convert to python-crfsuite feature format
    return [
        {
            "custom_ner_features": {
                str(position): value
                for position, value in enumerate(entry)
            }
        }
        for entry in per_token_features
    ]
def process(self, message: Message, **kwargs: Any) -> None:
    """Extract entities from the message and append them to its entities.

    Runs ``_check_spacy_doc`` on the message first, then extracts entities,
    passes them through ``add_extractor_name`` and stores them after any
    entities already present under ``"entities"``.

    Args:
        message: Message to process; updated in place via ``set``.
        **kwargs: Accepted but not used by this method.
    """
    self._check_spacy_doc(message)

    found = self.extract_entities(message)
    labelled = self.add_extractor_name(found)

    # Keep whatever is already stored and append the new entities.
    already_present = message.get("entities", [])
    message.set("entities", already_present + labelled, add_to_output=True)
def _from_crf_to_json(self, message: Message,
                      entities: List[Any]) -> List[Dict[Text, Any]]:
    """Turn per-token crfsuite output into a list of entity dicts.

    Args:
        message: Message supplying the tokens; ``"spacy_doc"`` is read when
            ``self.pos_features`` is truthy, ``"tokens"`` otherwise.
        entities: One crfsuite prediction per token.

    Returns:
        The result of the BILOU converter when the ``"BILOU_flag"``
        component setting is truthy, otherwise the result of the simple
        tagging converter.

    Raises:
        Exception: If the number of tokens and predictions differ.
    """
    token_key = "spacy_doc" if self.pos_features else "tokens"
    tokens = message.get(token_key)

    if len(tokens) != len(entities):
        raise Exception(
            "Inconsistency in amount of tokens between crfsuite and message"
        )

    if not self.component_config["BILOU_flag"]:
        # not using BILOU tagging scheme, multi-word entities are split.
        return self._convert_simple_tagging_to_entity_result(
            tokens, entities)

    return self._convert_bilou_tagging_to_entity_result(
        message, tokens, entities)
def _from_json_to_crf(
    self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                Optional[Dict[Text, Any]], ]]:
    """Convert json examples to format of underlying crfsuite.

    Per-token entity tags are derived from ``entity_offsets``: through
    spaCy's ``GoldParse`` when ``self.pos_features`` is truthy, otherwise
    through ``self._bilou_tags_from_offsets``. Tokens tagged ``"-"`` are
    treated as misaligned annotations and reported with a warning, one
    warning per consecutive run of such tokens (including a run that ends
    the sentence).

    Args:
        message: Training example supplying ``"spacy_doc"`` or ``"tokens"``.
        entity_offsets: ``(start, end, entity)`` triples for the example.

    Returns:
        The crfsuite rows produced by ``self._from_text_to_crf`` for the
        message and the derived tags.
    """
    if self.pos_features:
        from spacy.gold import GoldParse  # pytype: disable=import-error

        doc_or_tokens = message.get("spacy_doc")
        gold = GoldParse(doc_or_tokens, entities=entity_offsets)
        # Element 5 of each orig_annot entry is used as the entity tag.
        ents = [annotation[5] for annotation in gold.orig_annot]
    else:
        doc_or_tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)

    def warn_misaligned(misaligned_tokens):
        # Report one run of consecutive tokens that were tagged "-".
        collected_text = " ".join([t.text for t in misaligned_tokens])
        logger.warning("Misaligned entity annotation for '{}' "
                       "in sentence '{}' with intent '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces or punctuation)."
                       "".format(collected_text, message.text,
                                 message.get("intent")))

    # collect badly annotated examples
    collected = []
    for t, e in zip(doc_or_tokens, ents):
        if e == "-":
            collected.append(t)
        elif collected:
            warn_misaligned(collected)
            collected = []
    # A misaligned run at the very end of the sentence has no following
    # token to trigger the warning above, so flush it explicitly.
    if collected:
        warn_misaligned(collected)

    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)

    return self._from_text_to_crf(message, ents)
def _from_text_to_crf(
    self, message: Message, entities: List[Text] = None
) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                Optional[Dict[Text, Any]], ]]:
    """Build the crfsuite representation of a message, token by token.

    Args:
        message: Message supplying the tokens; ``"spacy_doc"`` is read when
            ``self.pos_features`` is truthy, ``"tokens"`` otherwise.
        entities: Optional per-token entity labels. When missing or empty,
            every token gets the placeholder label ``"N/A"``.

    Returns:
        One ``(text, tag, entity, pattern, custom_ner_features)`` tuple per
        token. ``tag`` is ``None`` unless ``self.pos_features`` is truthy
        and ``custom_ner_features`` is ``None`` unless
        ``self.use_ner_features`` is truthy.
    """
    token_key = "spacy_doc" if self.pos_features else "tokens"
    tokens = message.get(token_key)

    if self.use_ner_features:
        ner_features = self.__additional_ner_features(message)
    else:
        ner_features = None

    rows = []
    for position, token in enumerate(tokens):
        pattern = self.__pattern_of_token(message, position)

        if entities:
            entity = entities[position]
        else:
            entity = "N/A"

        if self.pos_features:
            tag = self.__tag_of_token(token)
        else:
            tag = None

        if self.use_ner_features:
            custom_ner_features = ner_features[position]
        else:
            custom_ner_features = None

        rows.append((token.text, tag, entity, pattern, custom_ner_features))

    return rows
def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
    """Return the example's entities as ``(start, end, entity)`` tuples.

    Args:
        example: Object providing ``get("entities", [])``; each entity must
            be subscriptable by ``"start"``, ``"end"`` and ``"entity"``.

    Returns:
        One tuple per entity, in the stored order; an empty list when the
        example has no entities.
    """
    return [
        (annotation["start"], annotation["end"], annotation["entity"])
        for annotation in example.get("entities", [])
    ]