Example #1
    def extract_features_from_doc(self, doc: Document, start_token_idx: int,
                                  end_token_idx: int):
        ret = {"seq_len": end_token_idx - start_token_idx}

        # Word-embedding features: optionally preprocess each token before
        # encoding it with the matching converter.
        for name, (converter,
                   preprocessor) in self.we_converters_preprocessors.items():
            tokens = doc.tokens[start_token_idx:end_token_idx]
            tokens = [
                preprocessor(token) if preprocessor is not None else token
                for token in tokens
            ]
            ret[name] = encode_sequence(tokens, converter)

        # Precomputed vector features are sliced as-is, without encoding.
        for key in self.vectors_keys:
            ret[key] = doc.token_features[key][start_token_idx:end_token_idx]

        # Word-level categorical features are sliced and encoded.
        for name, converter in self.word_level_features.items():
            ret[name] = encode_sequence(
                doc.token_features[name][start_token_idx:end_token_idx],
                converter)

        # Gazetteer features are delegated to their dedicated extractors.
        for name, fe in self.gazetteer_feature_extractors.items():
            ret[name] = fe.extract_features(doc, start_token_idx,
                                            end_token_idx)

        # Character-level features: pad each token's characters, then encode.
        if 'chars' in self.char_level_features:
            chars = TokenFeatureExtractor._get_chars_features(
                doc, start_token_idx, end_token_idx,
                self.char_level_features['chars']['padding_size'])
            ret['chars'] = encode_sequence3d(
                chars, self.char_level_features['chars']['converter'])

        return ret
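Every snippet here leans on encode_sequence / encode_sequence3d, which are not shown. A minimal sketch, assuming a converter is simply a mapping from feature values to integer ids (the real implementation may differ):

def encode_sequence(values, converter):
    # Encode a flat sequence of values (tokens, tags, flags, ...).
    return [converter[value] for value in values]

def encode_sequence3d(value_lists, converter):
    # Same idea one level deeper, e.g. the characters of each token.
    return [[converter[value] for value in values] for values in value_lists]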
Example #2
    def extract_features_from_doc(self, doc: Document, start_token, end_token):
        features = {}
        # Encode named-entity types for the token span, if configured.
        if "ne" in self.converters:
            features["ne"] = encode_sequence(
                self._get_ne_types(doc, start_token, end_token),
                self.converters["ne"])
        return features
Example #3
    def extract_features(self, doc: Document, start_token_idx: int,
                         end_token_idx: int):
        # Look each token (or its lemma) up in the gazetteer and encode
        # the resulting membership flags.
        tokens = doc.token_features["lemmas"] if self._lemmatize else doc.tokens
        token_slice = map(self._processor,
                          tokens[start_token_idx:end_token_idx])
        return encode_sequence(map(self._gazetteer.__contains__, token_slice),
                               self._converter)
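The gazetteer extractor above boils down to a per-token membership test. A self-contained toy version (the function name and defaults are illustrative, not the project's API):

def gazetteer_flags(tokens, gazetteer, processor=str.lower):
    # 1 if the normalized token occurs in the gazetteer, else 0.
    return [int(processor(token) in gazetteer) for token in tokens]

print(gazetteer_flags(["London", "is", "big"], {"london"}))  # [1, 0, 0]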
Example #4
    def _get_entities_encoder_features(self, doc, start_token: int,
                                       end_token: int) -> dict:
        features = {}

        # Currently only entity types and depths can appear here, so the
        # common slice-and-encode pattern covers all of them.
        converters = self.entities_encoder_features_converters
        for feature, feature_converter in converters.items():
            features[feature] = encode_sequence(
                doc.token_features[feature][start_token:end_token],
                feature_converter)

        return features
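In all of these examples a converter is presumably just a vocabulary of feature values. A hypothetical one for an entity-type feature, used with the encode_sequence sketch from the note under Example #1 (values invented for illustration):

entity_type_converter = {"O": 0, "PER": 1, "LOC": 2, "ORG": 3}
print(encode_sequence(["O", "PER", "O"], entity_type_converter))  # [0, 1, 0]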
Example #5
    def extract_features_from_doc(self, doc: Document, include_labels=False):
        samples = []

        # Build one sample per sentence from token- and NE-level features.
        for sent in doc.sentences:
            sample = self.token_feature_extractor.extract_features_from_doc(
                doc, sent.start_token, sent.end_token)
            sample.update(self.ne_feature_extractor.extract_features_from_doc(
                doc, sent.start_token, sent.end_token))
            if include_labels:
                labels = self.labelling_strategy.encode_labels(
                    sent, doc.entities.contained_in(sent))
                sample["labels"] = encode_sequence(labels, self.labels_converter)

            samples.append(sample)

        return samples
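encode_labels is delegated to a labelling strategy that is not shown. A toy BIO-style strategy over (start, end, type) spans, purely as an assumption of how it might look:

def bio_encode(num_tokens, spans):
    # spans: iterable of (start, end, entity_type), end exclusive.
    labels = ["O"] * num_tokens
    for start, end, etype in spans:
        labels[start] = f"B-{etype}"
        for i in range(start + 1, end):
            labels[i] = f"I-{etype}"
    return labels

print(bio_encode(5, [(1, 3, "PER")]))  # ['O', 'B-PER', 'I-PER', 'O', 'O']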
Example #6
    def extract_features_from_doc(self, doc: Document, start_token, end_token,
                                  wrt_span):
        features = {}

        # Every positional feature is computed relative to wrt_span.
        factories = [("token_position", _get_token_positions_to_span),
                     ("token_log_position", _get_token_positions_to_span),
                     ("sent_position", _get_sentence_positions_to_span),
                     ("at_root_dt_path",
                      _get_tokens_on_path_from_root_to_span),
                     ("root_dt_path_position", lambda d, f, l, s:
                      _get_tokens_on_path_from_root_to_span(d, f, l, s, True))]

        for feature_name, feature_factory in factories:
            converter = self.converters.get(feature_name)
            if converter is not None:
                features[feature_name] = encode_sequence(
                    feature_factory(doc, start_token, end_token, wrt_span),
                    converter)
        return features
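None of the _get_* factories are shown. A self-contained toy version of the token-position idea, assuming wrt_span is a (start, end) token range (the real factories also take the document):

def token_positions_to_span(num_tokens, span_start, span_end):
    # Negative before the span, zero inside it, positive after it.
    positions = []
    for i in range(num_tokens):
        if i < span_start:
            positions.append(i - span_start)
        elif i >= span_end:
            positions.append(i - span_end + 1)
        else:
            positions.append(0)
    return positions

print(token_positions_to_span(6, 2, 4))  # [-2, -1, 0, 0, 1, 2]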
Example #7
    def encoded_labels_to_entities(self, sent: Sentence,
                                   encoded_labels: List[int]) -> List[Entity]:
        # Map label ids back to label strings via the reversed converter,
        # then let the labelling strategy decode them into entity spans.
        return self.labelling_strategy.decode_labels(
            sent,
            encode_sequence(encoded_labels, self.reversed_labels_converter))
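reversed_labels_converter is presumably the id-to-label inverse of the label vocabulary, so "encoding" through it decodes. Illustrated with the encode_sequence sketch from the note under Example #1 (values invented):

labels_converter = {"O": 0, "B-PER": 1, "I-PER": 2}
reversed_labels_converter = {v: k for k, v in labels_converter.items()}
print(encode_sequence([0, 1, 2], reversed_labels_converter))
# ['O', 'B-PER', 'I-PER']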