def __init__(self, config, multi_label):
    """Initialise the pipeline and fit an encoder over the association types.

    Args:
        config: Settings object handed to the parent initialiser;
            ``association_types`` is read from it directly.
        multi_label: Flag stored unchanged on ``self.multi_label``.
    """
    super(AssociationPipeline, self).__init__(config)
    self.multi_label = multi_label

    encoder = SequenceLabelingEncoder()
    self.association_encoder = encoder

    # The pad token is fitted alongside the configured association types so
    # that it can be transformed below.
    # NOTE(review): ``self.config`` is presumably set by the parent
    # initialiser -- confirm against BasePipeline.
    association_vocab = config.association_types + [self.config.pad_token]
    encoder.fit(association_vocab)

    # Stored exactly as returned by ``transform`` on a one-element list.
    self.association_pad_idx = encoder.transform([self.config.pad_token])
def _target_encoder(self):
    """Return a newly constructed encoder for the targets.

    A ``SequenceMultiLabelingEncoder`` is built when ``self.multi_label`` is
    truthy; otherwise a ``SequenceLabelingEncoder`` is built.
    """
    encoder_cls = (
        SequenceMultiLabelingEncoder
        if self.multi_label
        else SequenceLabelingEncoder
    )
    return encoder_cls()
def _target_encoder(self):
    """Return a newly constructed ``SequenceLabelingEncoder``."""
    encoder = SequenceLabelingEncoder()
    return encoder
class AssociationPipeline(BasePipeline):
    """Pipeline yielding per-token labels plus a pairwise association matrix."""

    def __init__(self, config, multi_label):
        """Initialise the pipeline and fit the association-type encoder.

        Args:
            config: Settings object handed to the parent initialiser;
                ``association_types`` is read from it directly.
            multi_label: Flag stored on ``self.multi_label``; it selects the
                pad-token form and the label target shape used elsewhere in
                this class.
        """
        super().__init__(config)
        self.multi_label = multi_label
        self.association_encoder = SequenceLabelingEncoder()
        # The pad token is fitted alongside the association types so that it
        # appears in ``classes_`` and can be transformed below.
        self.association_encoder.fit(
            config.association_types + [self.config.pad_token]
        )
        self.association_pad_idx = self.association_encoder.transform(
            [self.config.pad_token]
        )

    def _post_data_initialization(self, Y):
        """Flatten the first element of every item in ``Y`` and delegate.

        Args:
            Y: Iterable of indexable items; ``item[0]`` must itself be
                iterable.
        """
        Y_ = list(itertools.chain.from_iterable(y[0] for y in Y))
        super()._post_data_initialization(Y_)

    def text_to_tokens_mask(self, X, Y=None):
        """Yield encoded features, and targets when ``Y`` is given.

        Args:
            X: Input forwarded unchanged to ``self._text_to_ids``.
            Y: Optional targets. When provided they are transposed with
                ``zip(*Y)`` before being forwarded.

        Yields:
            ``{"tokens", "mask"}`` when ``Y`` is ``None``; otherwise a pair
            of that dict and ``{"labels", "associations"}``, where
            ``associations`` is a square ``int32`` array with one row and
            column per entry of ``out.labels``.
        """
        # NOTE(review): with ``multi_label`` set the pad token is a list and
        # is looked up in the encoder classes as-is below -- confirm that the
        # encoder's ``classes_`` can contain that value.
        pad_token = (
            [self.config.pad_token]
            if self.multi_label
            else self.config.pad_token
        )
        if Y is not None:
            Y = list(zip(*Y))
        out_gen = self._text_to_ids(
            X, Y=Y, pad_token=(pad_token, pad_token, -1, -2)
        )
        class_list = self.association_encoder.classes_.tolist()
        assoc_pad_id = class_list.index(pad_token)

        for out in out_gen:
            feats = {"tokens": out.token_ids, "mask": out.mask}
            if Y is None:
                yield feats
                continue

            size = len(out.labels)
            labels = []
            # Every cell starts as "pad"; only matching pairs are overwritten.
            assoc_mat = [[assoc_pad_id] * size for _ in range(size)]
            for i, (label, _, _, idx) in enumerate(out.labels):
                labels.append(label)
                for j, (_, a_t, a_i, _) in enumerate(out.labels):
                    # Entry j carries a non-pad association type and its
                    # association index equals entry i's own index.
                    if a_t != pad_token and idx == a_i:
                        assoc_mat[i][j] = class_list.index(a_t)
            yield feats, {
                "labels": self.label_encoder.transform(labels),
                "associations": np.array(assoc_mat, dtype=np.int32),
            }

    def _compute_class_counts(self, encoded_dataset):
        """Count decoded labels at the positions selected by each mask.

        Args:
            encoded_dataset: Iterable of ``(doc, target_arr)`` pairs, where
                ``doc["mask"]`` and ``target_arr["labels"]`` are arrays.

        Returns:
            A ``Counter`` over the values returned by
            ``self.label_encoder.inverse_transform``.
        """
        counter = Counter()
        for doc, target_arr in encoded_dataset:
            # The builtin ``bool`` is used because the ``np.bool`` alias was
            # removed from NumPy; the resulting dtype is the same.
            keep = doc["mask"].astype(bool)
            targets = target_arr["labels"][keep]
            counter.update(self.label_encoder.inverse_transform(targets))
        return counter

    def _format_for_encoding(self, X):
        """Wrap ``X`` in a single-element list."""
        return [X]

    def _format_for_inference(self, X):
        """Wrap every element of ``X`` in its own single-element list."""
        return [[x] for x in X]

    def feed_shape_type_def(self):
        """Return the ``(types, shapes)`` description of features and targets.

        Returns:
            A 2-tuple. The first item pairs the feature and target dtype
            dicts; the second pairs the matching ``tf.TensorShape`` dicts.
            The label shape gains a trailing ``label_encoder.target_dim``
            axis when ``self.multi_label`` is truthy.
        """
        TS = tf.TensorShape
        max_length = self.config.max_length
        target_shape = (
            [max_length, self.label_encoder.target_dim]
            if self.multi_label
            else [max_length]
        )
        types = (
            {"tokens": tf.int32, "mask": tf.float32},
            {"labels": tf.int32, "associations": tf.int32},
        )
        shapes = (
            {
                "tokens": TS([max_length, 2]),
                "mask": TS([max_length]),
            },
            {
                "labels": TS(target_shape),
                "associations": TS([max_length, max_length]),
            },
        )
        return types, shapes

    def _target_encoder(self):
        """Return a newly constructed ``SequenceLabelingEncoder``."""
        return SequenceLabelingEncoder()