Exemplo n.º 1
0
 def __init__(self, config, multi_label):
     """Initialise the pipeline and fit the association-type encoder.

     :param config: configuration object forwarded to the base class; its
         ``association_types`` are the classes the association encoder
         is fit on.
     :param multi_label: flag stored on the instance as ``multi_label``.
     """
     super(AssociationPipeline, self).__init__(config)
     self.multi_label = multi_label

     # The pad token is fit together with the association types so that
     # it receives an encoded index of its own.
     encoder = SequenceLabelingEncoder()
     self.association_encoder = encoder
     encoder.fit(config.association_types + [self.config.pad_token])
     self.association_pad_idx = encoder.transform([self.config.pad_token])
Exemplo n.º 2
0
 def _target_encoder(self):
     """Return a new target encoder chosen by ``self.multi_label``.

     A ``SequenceMultiLabelingEncoder`` is built when ``self.multi_label``
     is truthy, otherwise a ``SequenceLabelingEncoder``.
     """
     encoder_cls = (SequenceMultiLabelingEncoder
                    if self.multi_label else SequenceLabelingEncoder)
     return encoder_cls()
Exemplo n.º 3
0
 def _target_encoder(self):
     """Return a freshly constructed ``SequenceLabelingEncoder``."""
     encoder = SequenceLabelingEncoder()
     return encoder
Exemplo n.º 4
0
class AssociationPipeline(BasePipeline):
    """Pipeline yielding per-token labels plus a token-by-token association matrix.

    In addition to the label encoder managed by the base class, this
    pipeline keeps a separate ``association_encoder`` fit on
    ``config.association_types`` plus the configured pad token.
    """

    def __init__(self, config, multi_label):
        """Initialise the pipeline and fit the association-type encoder.

        :param config: configuration object forwarded to the base class; its
            ``association_types`` are the classes the association encoder
            is fit on.
        :param multi_label: flag stored as ``self.multi_label``; it selects
            the pad-token form and the label target shape used below.
        """
        super().__init__(config)
        self.multi_label = multi_label
        self.association_encoder = SequenceLabelingEncoder()
        # The pad token is fit together with the association types so that
        # it receives an encoded index of its own.
        self.association_encoder.fit(config.association_types +
                                     [self.config.pad_token])
        self.association_pad_idx = self.association_encoder.transform(
            [self.config.pad_token])

    def _post_data_initialization(self, Y):
        """Flatten the first element of every target and defer to the base class.

        :param Y: iterable of targets; ``y[0]`` of each entry must itself be
            iterable (presumably the label sequence -- confirm with caller).
        """
        flat_labels = list(itertools.chain.from_iterable(y[0] for y in Y))
        super()._post_data_initialization(flat_labels)

    def text_to_tokens_mask(self, X, Y=None):
        """Yield encoded features (and targets, when ``Y`` is given).

        :param X: input passed through to ``self._text_to_ids``.
        :param Y: optional targets; transposed with ``zip(*Y)`` before being
            passed to ``self._text_to_ids``.
        :returns: a generator. Without ``Y`` it yields
            ``{"tokens", "mask"}`` dicts; with ``Y`` it yields
            ``(features, {"labels", "associations"})`` pairs.
        """
        pad_token = ([self.config.pad_token]
                     if self.multi_label else self.config.pad_token)
        if Y is not None:
            Y = list(zip(*Y))
        out_gen = self._text_to_ids(X,
                                    Y=Y,
                                    pad_token=(pad_token, pad_token, -1, -2))
        class_list = self.association_encoder.classes_.tolist()
        # The association encoder was fit on the bare pad token, so look the
        # bare token up here. Using the list-wrapped multi-label form would
        # never match an entry of ``class_list`` and raise ValueError.
        assoc_pad_id = class_list.index(self.config.pad_token)
        for out in out_gen:
            feats = {"tokens": out.token_ids, "mask": out.mask}
            if Y is None:
                yield feats
                continue

            # Each entry of ``out.labels`` is unpacked as a 4-tuple:
            # (label, association type, association index, own index).
            # NOTE(review): field meanings are inferred from the unpacking
            # names and the pad tuple above -- confirm against _text_to_ids.
            n_tokens = len(out.labels)
            labels = [label for label, _, _, _ in out.labels]
            assoc_mat = [[assoc_pad_id] * n_tokens for _ in range(n_tokens)]
            for i, (_, _, _, idx) in enumerate(out.labels):
                for j, (_, a_t, a_i, _) in enumerate(out.labels):
                    # Cell (i, j) gets entry j's association type when
                    # entry j's association index equals entry i's own index.
                    if a_t != pad_token and idx == a_i:
                        assoc_mat[i][j] = class_list.index(a_t)

            yield feats, {
                "labels": self.label_encoder.transform(labels),
                "associations": np.array(assoc_mat, dtype=np.int32)
            }

    def _compute_class_counts(self, encoded_dataset):
        """Count decoded labels over the unmasked positions of a dataset.

        :param encoded_dataset: iterable of ``(doc, target_arr)`` pairs where
            ``doc['mask']`` and ``target_arr['labels']`` are arrays.
        :returns: a ``Counter`` keyed by the inverse-transformed labels.
        """
        counter = Counter()
        for doc, target_arr in encoded_dataset:
            # Builtin ``bool`` is used because the ``np.bool`` alias was
            # removed from NumPy in 1.24.
            targets = target_arr['labels'][doc['mask'].astype(bool)]
            counter.update(self.label_encoder.inverse_transform(targets))
        return counter

    def _format_for_encoding(self, X):
        """Wrap ``X`` in a single-element list."""
        return [X]

    def _format_for_inference(self, X):
        """Wrap every element of ``X`` in its own single-element list."""
        return [[x] for x in X]

    def feed_shape_type_def(self):
        """Return the ``(types, shapes)`` description of the yielded data.

        Both members are ``(features, targets)`` pairs of dicts keyed like
        the dicts yielded by ``text_to_tokens_mask``. The label shape gains a
        trailing ``self.label_encoder.target_dim`` axis when
        ``self.multi_label`` is truthy.
        """
        TS = tf.TensorShape
        max_length = self.config.max_length
        if self.multi_label:
            target_shape = [max_length, self.label_encoder.target_dim]
        else:
            target_shape = [max_length]

        feature_types = {"tokens": tf.int32, "mask": tf.float32}
        target_types = {"labels": tf.int32, "associations": tf.int32}
        feature_shapes = {
            "tokens": TS([max_length, 2]),
            "mask": TS([max_length]),
        }
        target_shapes = {
            "labels": TS(target_shape),
            "associations": TS([max_length, max_length]),
        }
        return ((feature_types, target_types),
                (feature_shapes, target_shapes))

    def _target_encoder(self):
        """Return a freshly constructed ``SequenceLabelingEncoder``."""
        return SequenceLabelingEncoder()