def __init__(self, path: str = None, vocab: VocabTF = None, normalize: bool = False, load_all=True, mask_zero=True,
             trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
    super().__init__(trainable, name, dtype, dynamic, **kwargs)
    if load_all and vocab and vocab.locked:
        vocab.unlock()
    self.vocab, self.array_np = self._load(path, vocab, normalize)
    self.vocab.lock()
    self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim, trainable=trainable,
                                              embeddings_initializer=tf.keras.initializers.Constant(self.array_np),
                                              mask_zero=mask_zero)
    self.mask_zero = mask_zero
    self.supports_masking = mask_zero
@staticmethod
def _load(path, vocab, normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
    if not vocab:
        vocab = VocabTF()
    if not path:
        return vocab, None
    assert vocab.unk_idx is not None
    word2vec, dim = load_word2vec(path)
    for word in word2vec:
        vocab.get_idx(word)
    pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    state = np.random.get_state()
    np.random.seed(0)
    bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
    scale = np.sqrt(3.0 / dim)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        if vec is None:
            vec = word2vec.get(word.lower(), None)
            # if vec is not None:
            #     vec += bias
        if vec is None:
            # vec = np.random.uniform(-scale, scale, [dim])
            vec = np.zeros([dim], dtype=np.float32)
        pret_embs[idx] = vec
    # noinspection PyTypeChecker
    np.random.set_state(state)
    return vocab, pret_embs
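
# Usage sketch for the embedding layer defined by the two methods above. Assumptions: the
# enclosing Keras-layer subclass is called `PretrainedWordEmbedding` here purely for
# illustration, and 'word2vec.txt' is a hypothetical text-format word2vec file accepted by
# load_word2vec. _load() expands the vocab with every pretrained word and copies each vector
# into a frozen Constant-initialized tf.keras.layers.Embedding; words missing from the
# pretrained file (and from their lowercased variants) fall back to zero vectors.
vocab = VocabTF()
vocab.update(['HanLP', 'loves', 'NLP'])  # task-specific tokens, e.g. collected by fit()
embed = PretrainedWordEmbedding(path='word2vec.txt', vocab=vocab, trainable=False)
ids = tf.constant([[vocab.get_idx('HanLP'), vocab.get_idx('NLP')]], dtype=tf.int64)
vectors = embed.array_ks(ids)  # shape [1, 2, dim], masked where ids == 0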
def fit(self, trn_path: str, **kwargs):
    self.word_vocab = VocabTF()
    self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
    for ngrams, tags in self.file_to_samples(trn_path):
        for words in ngrams:
            self.word_vocab.update(words)
        self.tag_vocab.update(tags)
def fit(self, trn_path: str, **kwargs) -> int:
    self.tag_vocab = VocabTF(unk_token=None)
    num_samples = 0
    for words, tags in self.file_to_inputs(trn_path, gold=True):
        num_samples += 1
        self.tag_vocab.update(tags)
    return num_samples
def fit(self, trn_path: str, **kwargs) -> int:
    self.vocab = VocabTF()
    num_samples = 0
    for x, y in self.file_to_inputs(trn_path):
        self.vocab.update(x)
        num_samples += 1
    return num_samples
def load_vocabs(self, save_dir, filename='vocabs.json'):
    save_dir = get_resource(save_dir)
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    for key, value in vocabs.items():
        vocab = VocabTF()
        vocab.copy_from(value)
        setattr(self.transform, key, vocab)
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.token_vocab = VocabTF()
    self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
    self.ner_vocab = VocabTF(pad_token=None)
    self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
    self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=False, input_dim=None,
             output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, **kwargs):
    if vocab is None:
        vocab = VocabTF()
    self.vocab = vocab
    super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim, unk, normalize,
                     embeddings_initializer, embeddings_regularizer, activity_regularizer, embeddings_constraint,
                     mask_zero, input_length, name, **kwargs)
class WindowTokenTransform(TSVTaggingTransform):
    def fit(self, trn_path: str, **kwargs):
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        for ngrams, tags in self.file_to_samples(trn_path):
            for words in ngrams:
                self.word_vocab.update(words)
            self.tag_vocab.update(tags)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        window_radius = self.config.window_radius
        window_size = 2 * window_radius + 1
        types = tf.string, tf.string
        shapes = [None, window_size], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        window_radius = self.config.window_radius
        for t in inputs:
            if gold:
                words, tags = t
            else:
                words, tags = t, [self.padding_values[-1]] * len(t)
            ngrams = []
            for i, word in enumerate(words):
                features = []
                for t in range(-window_radius, window_radius + 1):
                    index = i + t
                    if index < 0:
                        feature = 'bos{}'.format(index)
                    elif index >= len(words):
                        feature = 'eos+{}'.format(index - len(words) + 1)
                    else:
                        feature = words[index]
                    features.append(feature)
                ngrams.append(features)
            yield ngrams, tags

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(self.word_vocab.idx_to_token[int(x[len(x) // 2])])
            yield words
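
# A minimal standalone sketch of the windowing scheme used by inputs_to_samples above: each
# token becomes a (2 * window_radius + 1)-gram of its neighbours, with synthetic 'bos{-k}'
# and 'eos+{k}' features past the sentence boundaries.
def window_features(words, window_radius=1):
    ngrams = []
    for i, _ in enumerate(words):
        features = []
        for offset in range(-window_radius, window_radius + 1):
            index = i + offset
            if index < 0:
                features.append('bos{}'.format(index))
            elif index >= len(words):
                features.append('eos+{}'.format(index - len(words) + 1))
            else:
                features.append(words[index])
        ngrams.append(features)
    return ngrams

# window_features(['I', 'love', 'NLP']) ->
# [['bos-1', 'I', 'love'], ['I', 'love', 'NLP'], ['love', 'NLP', 'eos+1']]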
def fit(self, trn_path: str, **kwargs) -> int:
    if self.config.get('joint_pos', None):
        self.config.use_pos = True
    if self.graph:
        # noinspection PyCallByClass
        num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
    else:
        num = super().fit(trn_path, **kwargs)
    if self.config.get('topk', None):
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            for idx, cell in enumerate(sent):
                form, head, deprel = cell
                counter[form] += 1
        self.topk_vocab = VocabTF()
        for k, v in counter.most_common(self.config.topk):
            self.topk_vocab.add(k)
    return num
def fit(self, trn_path: str, **kwargs) -> int:
    self.form_vocab = VocabTF()
    self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
    if self.use_pos:
        self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
    self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
    num_samples = 0
    counter = Counter()
    for sent in self.file_to_samples(trn_path, gold=True):
        num_samples += 1
        for idx, cell in enumerate(sent):
            if len(cell) == 4:
                form, cpos, head, deprel = cell
            elif len(cell) == 3:
                if self.use_pos:
                    form, cpos = cell[0]
                else:
                    form = cell[0]
                head, deprel = cell[1:]
            else:
                raise ValueError('Unknown data arrangement')
            if idx == 0:
                root = form
            else:
                counter[form] += 1
            if self.use_pos:
                self.cpos_vocab.add(cpos)
            self.rel_vocab.update(deprel)
    for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
        self.form_vocab.add(token)
    return num_samples
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    char_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
    for X, Y in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        char_vocab.update(X[0])
        for ngram in X[1:]:
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    return char_vocab, ngram_vocab, tag_vocab
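
# Usage sketch (assumption: 'train.txt' is a hypothetical segmentation corpus readable by
# generate_ngram_bmes). All three vocabularies are built in one pass; the tag vocab carries
# no pad/unk tokens because the BMES tag set is closed.
char_vocab, ngram_vocab, tag_vocab = vocab_from_txt('train.txt', bigram_only=False, window_size=4)
print(len(char_vocab), len(ngram_vocab), len(tag_vocab))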
def fit(self, trn_path: str, **kwargs) -> int:
    self.word_vocab = VocabTF()
    self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
    num_samples = 0
    for words, tags in self.file_to_inputs(trn_path, True):
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
        num_samples += 1
    if self.char_vocab:
        self.char_vocab = VocabTF()
        for word in self.word_vocab.token_to_idx.keys():
            if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                continue
            self.char_vocab.update(list(word))
    return num_samples
def fit(self, trn_path: str, **kwargs):
    word_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
    num_samples = 0
    for X, Y in self.file_to_samples(trn_path, gold=True):
        num_samples += 1
        word_vocab.update(X[0])
        for ngram in X[1:]:
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    if self.config.window_size:
        vocabs = word_vocab, ngram_vocab, tag_vocab
    else:
        vocabs = word_vocab, None, tag_vocab
    self.word_vocab, self.ngram_vocab, self.tag_vocab = vocabs
    return num_samples
class TACREDTransform(Transform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.token_vocab = VocabTF()
        self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.ner_vocab = VocabTF(pad_token=None)
        self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)

    def fit(self, trn_path: str, **kwargs) -> int:
        count = 0
        for (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation \
                in self.file_to_samples(trn_path, gold=True):
            count += 1
            self.token_vocab.update(tokens)
            self.pos_vocab.update(pos)
            self.ner_vocab.update(ner)
            self.deprel_vocab.update(deprel)
            self.rel_vocab.add(relation)
        return count

    def file_to_inputs(self, filepath: str, gold=True):
        data = load_json(filepath)
        for d in data:
            tokens = list(d['token'])
            ss, se = d['subj_start'], d['subj_end']
            os, oe = d['obj_start'], d['obj_end']
            pos = d['stanford_pos']
            ner = d['stanford_ner']
            deprel = d['stanford_deprel']
            head = [int(x) for x in d['stanford_head']]
            assert any([x == 0 for x in head])
            relation = d['relation']
            yield (tokens, pos, ner, head, deprel, ss, se, os, oe), relation

    def inputs_to_samples(self, inputs, gold=False):
        for input in inputs:
            if gold:
                (tokens, pos, ner, head, deprel, ss, se, os, oe), relation = input
            else:
                tokens, pos, ner, head, deprel, ss, se, os, oe = input
                relation = self.rel_vocab.safe_pad_token
            l = len(tokens)
            subj_positions = get_positions(ss, se, l)
            obj_positions = get_positions(os, oe, l)
            subj_type = ner[ss]
            obj_type = ner[os]
            # anonymize tokens
            tokens[ss:se + 1] = ['SUBJ-' + subj_type] * (se - ss + 1)
            tokens[os:oe + 1] = ['OBJ-' + obj_type] * (oe - os + 1)
            # min head is 0, but root is not included in tokens, so take 1 off from each head
            head = [h - 1 for h in head]
            yield (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        # (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
        types = (tf.string, tf.string, tf.string, tf.int32, tf.string, tf.int32, tf.int32, tf.string,
                 tf.string), tf.string
        shapes = ([None], [None], [None], [None], [None], [None], [None], [], []), []
        pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token, self.ner_vocab.safe_pad_token, 0,
                self.deprel_vocab.safe_pad_token, 0, 0, self.ner_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
        return types, shapes, pads

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
        tokens = self.token_vocab.lookup(tokens)
        pos = self.pos_vocab.lookup(pos)
        ner = self.ner_vocab.lookup(ner)
        deprel = self.deprel_vocab.lookup(deprel)
        subj_type = self.ner_vocab.lookup(subj_type)
        obj_type = self.ner_vocab.lookup(obj_type)
        return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type

    def y_to_idx(self, y) -> tf.Tensor:
        return self.rel_vocab.lookup(y)
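
# A standalone sketch of the entity anonymization performed in inputs_to_samples above
# (tokens, span indices, and NER labels are made up for illustration): subject and object
# spans are replaced, token by token, with their NER types so the model cannot memorize
# surface forms.
tokens = ['Bill', 'Gates', 'founded', 'Microsoft', '.']
ss, se, subj_type = 0, 1, 'PERSON'         # subject span [0, 1]
os_, oe, obj_type = 3, 3, 'ORGANIZATION'   # object span [3, 3]; os_ avoids shadowing the os module
tokens[ss:se + 1] = ['SUBJ-' + subj_type] * (se - ss + 1)
tokens[os_:oe + 1] = ['OBJ-' + obj_type] * (oe - os_ + 1)
# tokens == ['SUBJ-PERSON', 'SUBJ-PERSON', 'founded', 'OBJ-ORGANIZATION', '.']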
def build_transform(self, embeddings, **kwargs):
    if embeddings_require_string_input(embeddings):
        self.transform.map_x = False
    if embeddings_require_char_input(embeddings):
        self.transform.char_vocab = VocabTF()
    return super().build_transform(**kwargs)
class TransformerTransform(TsvTaggingFormat, Transform):
    def __init__(self, tokenizer=None, config: SerializableDict = None, map_x=False, map_y=False, **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        self._tokenizer = tokenizer
        self.tag_vocab: VocabTF = None
        self.special_token_ids = None
        self.pad = '[PAD]'
        self.unk = '[UNK]'

    @property
    def max_seq_length(self):
        # -2 for special tokens [CLS] and [SEP]
        return self.config.get('max_seq_length', 128) - 2

    @property
    def tokenizer(self):
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        self._tokenizer = tokenizer
        vocab = tokenizer._vocab if hasattr(tokenizer, '_vocab') else tokenizer.vocab
        if self.pad not in vocab:  # English ALBERT uses <pad> instead of [PAD]
            self.pad = '<pad>'
        if self.unk not in vocab:
            self.unk = '<unk>'
        self.special_token_ids = tf.constant([vocab[token] for token in [self.pad, '[CLS]', '[SEP]']],
                                             dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.tag_vocab = VocabTF(unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, gold=True):
            num_samples += 1
            self.tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        max_seq_length = self.config.get('max_seq_length', 128)
        # (input_ids, input_mask, segment_ids), label_ids
        types = (tf.int32, tf.int32, tf.int32), tf.int32
        shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
        values = (0, 0, 0), self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        xlnet = False
        roberta = False
        pad_token = self.pad
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = self.unk
        pad_label_idx = self.tag_vocab.pad_idx
        pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.idx_to_token[1]] * len(sample)
            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
                words, max_seq_length, tokenizer, tags, self.tag_vocab.token_to_idx,
                cls_token_at_end=xlnet,  # xlnet has a cls token at the end
                cls_token=cls_token,
                cls_token_segment_id=2 if xlnet else 0,
                sep_token=sep_token,
                # roberta uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=roberta,
                pad_on_left=xlnet,  # pad on the left for xlnet
                pad_token_id=pad_token,
                pad_token_segment_id=4 if xlnet else 0,
                pad_token_label_id=pad_label_idx,
                unk_token=unk_token)
            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        raise NotImplementedError('transformers has its own tagger, no need to convert idx for x')

    def y_to_idx(self, y) -> tf.Tensor:
        raise NotImplementedError('transformers has its own tagger, no need to convert idx for y')

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None, batch=None,
                     **kwargs) -> Iterable:
        assert batch is not None, 'Need the batch to know the actual length of Y'
        label_mask = batch[1]
        Y = tf.argmax(Y, axis=-1)
        Y = Y[label_mask > 0]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)
class CoNLL_SDP_Transform(CoNLLTransform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
                 use_pos=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, use_pos, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        # heuristic to find the orphan relation
        self._find_orphan_relation()

    def _find_orphan_relation(self):
        for rel in self.rel_vocab.idx_to_token:
            if 'root' in rel.lower():
                self.orphan_relation = rel
                break

    def file_to_inputs(self, filepath: str, gold=True):
        assert gold, 'only support gold file for now'
        use_pos = self.use_pos
        conllu = filepath.endswith('.conllu')
        enhanced_only = self.config.get('enhanced_only', None)
        for i, sent in enumerate(read_conll(filepath)):
            parsed_sent = []
            if conllu:
                for cell in sent:
                    ID = cell[0]
                    form = cell[1]
                    cpos = cell[3]
                    head = cell[6]
                    deprel = cell[7]
                    deps = cell[8]
                    deps = [x.split(':', 1) for x in deps.split('|')]
                    heads = [int(x[0]) for x in deps if x[0].isdigit()]
                    rels = [x[1] for x in deps if x[0].isdigit()]
                    if enhanced_only:
                        if head in heads:
                            offset = heads.index(head)
                            heads.pop(offset)
                            rels.pop(offset)
                    else:
                        if head not in heads:
                            heads.append(head)
                            rels.append(deprel)
                    parsed_sent.append([form, cpos, heads, rels] if use_pos else [form, heads, rels])
            else:
                prev_cells = None
                heads = []
                rels = []
                for j, cell in enumerate(sent):
                    ID = cell[0]
                    form = cell[1]
                    cpos = cell[3]
                    head = cell[6]
                    deprel = cell[7]
                    if prev_cells and ID != prev_cells[0]:  # found end of token
                        parsed_sent.append(
                            [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
                        heads = []
                        rels = []
                    heads.append(head)
                    rels.append(deprel)
                    prev_cells = [ID, form, cpos, head, deprel] if use_pos else [ID, form, head, deprel]
                parsed_sent.append(
                    [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, cell in enumerate(sent):
                if len(cell) == 4:
                    form, cpos, head, deprel = cell
                elif len(cell) == 3:
                    if self.use_pos:
                        form, cpos = cell[0]
                    else:
                        form = cell[0]
                    head, deprel = cell[1:]
                else:
                    raise ValueError('Unknown data arrangement')
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                if self.use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)
        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        use_pos = self.use_pos
        for sent in inputs:
            sample = []
            for i, cell in enumerate(sent):
                if isinstance(cell, tuple):
                    cell = list(cell)
                elif isinstance(cell, str):
                    cell = [cell]
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [[0], [self.rel_vocab.safe_pad_token]]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            if use_pos:
                form, cpos, head, deprel = sample[0]
                sample.insert(0, [self.bos, self.bos, [0], deprel])
            else:
                form, head, deprel = sample[0]
                sample.insert(0, [self.bos, [0], deprel])
            yield sample

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        use_pos = self.use_pos
        raw_batch = [[], [], [], []] if use_pos else [[], [], []]
        max_len = len(max([corpus[i] for i in indices], key=len))
        for idx in indices:
            arc = np.zeros((max_len, max_len), dtype=np.bool)
            rel = np.zeros((max_len, max_len), dtype=np.int64)
            for b in raw_batch[:2]:
                b.append([])
            for m, cells in enumerate(corpus[idx]):
                if use_pos:
                    for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                else:
                    for b, c, v in zip(raw_batch, cells, [self.form_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                for n, r in zip(cells[-2], cells[-1]):
                    arc[m, n] = True
                    rid = self.rel_vocab.get_idx_without_add(r)
                    if rid is None:
                        logger.warning(f'Relation OOV: {r} does not exist in train')
                        continue
                    rel[m, n] = rid
            raw_batch[-2].append(arc)
            raw_batch[-1].append(rel)
        batch = []
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post', value=v.safe_pad_token_idx,
                                                              dtype='int64')
            batch.append(b)
        batch += raw_batch[2:]
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.bool, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            False, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []
        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, tf.math.count_nonzero(mask, axis=-1)):
            sent = []
            for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
                ar = []
                for idx, (a, r) in enumerate(zip(arc, rel)):
                    if a:
                        ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
                if not ar:
                    # orphan
                    ar.append((0, self.orphan_relation))
                sent.append(ar)
            sents.append(sent)
        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        xs = inputs
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for x, y in zip(xs, ys):
            sent = CoNLLSentence()
            for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
                head = [p[0] for p in pred]
                deprel = [p[1] for p in pred]
                if conll:
                    sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True,
                   lock_tag_vocab=True) -> Tuple[VocabTF, VocabTF, VocabTF]:
    word_vocab = VocabTF()
    char_vocab = VocabTF()
    tag_vocab = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
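
# Usage sketch: vocab_from_tsv expects one "word<TAB>tag" pair per line, with blank lines
# separating sentences (blank lines produce no cells and are skipped). 'train.tsv' is a
# hypothetical path; the example corpus would look like:
#
#   Apple   NNP
#   is      VBZ
#   red     JJ
#
word_vocab, char_vocab, tag_vocab = vocab_from_tsv('train.tsv', lower=False)
assert tag_vocab.locked  # locked by default (lock_tag_vocab=True)
print(word_vocab.get_idx('Apple'))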
class CoNLL_DEP_Transform(CoNLLTransform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
                 **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, **kwargs)

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Convert batched inputs to batches of samples.

        Args:
            corpus (list): A list of inputs.
            indices (list): A list of indices, each list belongs to a batch.
            shuffle: Whether the batch is being built for shuffled (training) iteration.

        Returns:
            A generator of batches in the form ((form, cpos), (head, deprel)).
        """
        raw_batch = [[], [], [], []]
        for idx in indices:
            for b in raw_batch:
                b.append([])
            for cells in corpus[idx]:
                for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
                    b[-1].append(v.get_idx_without_add(c) if v else c)
        batch = []
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                              value=v.safe_pad_token_idx if v else 0,
                                                              dtype='int64')
            batch.append(b)
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.int64, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None], [None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            0, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        token_mapping: dict = self.config.get('token_mapping', None)
        use_pos = self.config.get('use_pos', True)
        for sent in inputs:
            sample = []
            for i, cell in enumerate(sent):
                if isinstance(cell, tuple):
                    cell = list(cell)
                elif isinstance(cell, str):
                    cell = [cell]
                if token_mapping:
                    cell[0] = token_mapping.get(cell[0], cell[0])
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [0, self.rel_vocab.safe_pad_token]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            # form, cpos, head, deprel = sample[0]
            sample.insert(0, [self.bos, self.bos, 0, self.bos] if use_pos else [self.bos, 0, self.bos])
            yield sample

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        if inputs is None:
            inputs = self.X_to_inputs(X)
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
        sents = []
        for x, y in zip(inputs, ys):
            sent = CoNLLSentence()
            for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
                if self.use_pos and not self.config.get('joint_pos', None):
                    form, cpos = cell
                else:
                    form, cpos = cell, None
                if conll:
                    sent.append(
                        CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel)
                        if conll == '.conll' else
                        CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents

    def fit(self, trn_path: str, **kwargs) -> int:
        use_pos = self.config.use_pos
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, cell in enumerate(sent):
                if use_pos:
                    form, cpos, head, deprel = cell
                else:
                    form, head, deprel = cell
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                if use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.add(deprel)
        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    @property
    def root_rel_idx(self):
        root_rel_idx = self.config.get('root_rel_idx', None)
        if root_rel_idx is None:
            for idx, rel in enumerate(self.rel_vocab.idx_to_token):
                if 'root' in rel.lower() and rel != self.bos:
                    self.config['root_rel_idx'] = root_rel_idx = idx
                    break
        return root_rel_idx

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []
        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, tf.math.count_nonzero(mask, axis=-1)):
            arcs = tolist(arc_sent)[1:length + 1]
            rels = tolist(rel_sent)[1:length + 1]
            sents.append([(a, self.rel_vocab.idx_to_token[r]) for a, r in zip(arcs, rels)])
        return sents
class TextTransform(Transform):
    def __init__(self, forward=True, seq_len=10, tokenizer='char', config: SerializableDict = None, map_x=True,
                 map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: VocabTF = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = VocabTF()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
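
# A standalone sketch of the sliding-window language-model samples produced by
# file_to_inputs above: with the 'char' tokenizer every position yields a window of
# seq_len characters as x and the same window shifted right by one character as y.
def char_lm_samples(text, seq_len=4):
    buffer = []
    for ch in text:
        buffer.append(ch)
        while len(buffer) > seq_len:
            yield buffer[:seq_len], buffer[1:1 + seq_len]
            buffer.pop(0)

# list(char_lm_samples('hello', seq_len=4)) ->
# [(['h', 'e', 'l', 'l'], ['e', 'l', 'l', 'o'])]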
class CoNLL_Transformer_Transform(CoNLL_DEP_Transform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=0,
                 max_seq_length=256, use_pos=False, mask_p=None, graph=False, topk=None, **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.tokenizer: PreTrainedTokenizer = None
        self.transformer_config: PretrainedConfig = None
        if graph:
            self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        if self.graph:
            CoNLL_SDP_Transform._find_orphan_relation(self)

    def fit(self, trn_path: str, **kwargs) -> int:
        if self.config.get('joint_pos', None):
            self.config.use_pos = True
        if self.graph:
            # noinspection PyCallByClass
            num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
        else:
            num = super().fit(trn_path, **kwargs)
        if self.config.get('topk', None):
            counter = Counter()
            for sent in self.file_to_samples(trn_path, gold=True):
                for idx, cell in enumerate(sent):
                    form, head, deprel = cell
                    counter[form] += 1
            self.topk_vocab = VocabTF()
            for k, v in counter.most_common(self.config.topk):
                self.topk_vocab.add(k)
        return num

    def inputs_to_samples(self, inputs, gold=False):
        if self.graph:
            yield from CoNLL_SDP_Transform.inputs_to_samples(self, inputs, gold)
        else:
            yield from super().inputs_to_samples(inputs, gold)

    def file_to_inputs(self, filepath: str, gold=True):
        if self.graph:
            yield from CoNLL_SDP_Transform.file_to_inputs(self, filepath, gold)
        else:
            yield from super().file_to_inputs(filepath, gold)

    @property
    def mask_p(self) -> float:
        return self.config.get('mask_p', None)

    @property
    def graph(self):
        return self.config.get('graph', None)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        mask_p = self.mask_p
        types = (tf.int64, (tf.int64, tf.int64, tf.int64)), \
                (tf.bool if self.graph else tf.int64, tf.int64, tf.int64) if mask_p else (
                    tf.bool if self.graph else tf.int64, tf.int64)
        if self.graph:
            shapes = ([None, None], ([None, None], [None, None], [None, None])), (
                [None, None, None], [None, None, None], [None, None]) if mask_p else (
                [None, None, None], [None, None, None])
        else:
            shapes = ([None, None], ([None, None], [None, None], [None, None])), (
                [None, None], [None, None], [None, None]) if mask_p else ([None, None], [None, None])
        values = (self.form_vocab.safe_pad_token_idx, (0, 0, 0)), \
                 (0, self.rel_vocab.safe_pad_token_idx, 0) if mask_p else (0, self.rel_vocab.safe_pad_token_idx)
        types_shapes_values = types, shapes, values
        if self.use_pos:
            types_shapes_values = [((shapes[0][0], shapes[0][1] + (shapes[0][0],)), shapes[1])
                                   for shapes in types_shapes_values]
        return types_shapes_values

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        form_batch, feat, prefix_mask = X
        sents = []
        for form_sent, length in zip(form_batch, tf.math.count_nonzero(prefix_mask, axis=-1)):
            forms = tolist(form_sent)[1:length + 1]
            sents.append([self.form_vocab.idx_to_token[f] for f in forms])
        return sents

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        use_pos = self.use_pos
        if use_pos:
            raw_batch = [[], [], [], []]
        else:
            raw_batch = [[], [], []]
        if self.graph:
            max_len = len(max([corpus[i] for i in indices], key=len))
            for idx in indices:
                arc = np.zeros((max_len, max_len), dtype=np.bool)
                rel = np.zeros((max_len, max_len), dtype=np.int64)
                for b in raw_batch[:2 if use_pos else 1]:
                    b.append([])
                for m, cells in enumerate(corpus[idx]):
                    if use_pos:
                        for b, c, v in zip(raw_batch, cells, [None, self.cpos_vocab]):
                            b[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for b, c, v in zip(raw_batch, cells, [None]):
                            b[-1].append(c)
                    for n, r in zip(cells[-2], cells[-1]):
                        arc[m, n] = True
                        rid = self.rel_vocab.get_idx_without_add(r)
                        if rid is None:
                            logger.warning(f'Relation OOV: {r} does not exist in train')
                            continue
                        rel[m, n] = rid
                raw_batch[-2].append(arc)
                raw_batch[-1].append(rel)
        else:
            for idx in indices:
                for s in raw_batch:
                    s.append([])
                for cells in corpus[idx]:
                    if use_pos:
                        for s, c, v in zip(raw_batch, cells, [None, self.cpos_vocab, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for s, c, v in zip(raw_batch, cells, [None, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)
        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        xlnet = config_is(config, 'xlnet')
        roberta = config_is(config, 'roberta')
        pad_token = tokenizer.pad_token
        pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        batch_forms = []
        batch_input_ids = []
        batch_input_mask = []
        batch_prefix_offset = []
        mask_p = self.mask_p
        if mask_p:
            batch_masked_offsets = []
            mask_token_id = tokenizer.mask_token_id
        for sent_idx, sent in enumerate(raw_batch[0]):
            batch_forms.append([self.form_vocab.get_idx_without_add(token) for token in sent])
            sent = adjust_tokens_for_transformers(sent)
            sent = sent[1:]  # remove <root>, use [CLS] instead
            pad_label_idx = self.form_vocab.pad_idx
            input_ids, input_mask, segment_ids, prefix_mask = \
                convert_examples_to_features(sent, max_seq_length, tokenizer,
                                             cls_token_at_end=xlnet,  # xlnet has a cls token at the end
                                             cls_token=cls_token,
                                             cls_token_segment_id=2 if xlnet else 0,
                                             sep_token=sep_token,
                                             # roberta uses an extra separator b/w pairs of sentences,
                                             # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                             sep_token_extra=roberta,
                                             pad_on_left=xlnet,  # pad on the left for xlnet
                                             pad_token_id=pad_token_id,
                                             pad_token_segment_id=4 if xlnet else 0,
                                             pad_token_label_id=pad_label_idx,
                                             do_padding=False)
            num_masks = sum(prefix_mask)
            # assert len(sent) == num_masks  # each token has a True subtoken
            if num_masks < len(sent):  # long sent gets truncated, +1 for root
                batch_forms[-1] = batch_forms[-1][:num_masks + 1]  # form
                raw_batch[-1][sent_idx] = raw_batch[-1][sent_idx][:num_masks + 1]  # head
                raw_batch[-2][sent_idx] = raw_batch[-2][sent_idx][:num_masks + 1]  # rel
                raw_batch[-3][sent_idx] = raw_batch[-3][sent_idx][:num_masks + 1]  # pos
            prefix_mask[0] = True  # <root> is now [CLS]
            prefix_offset = [idx for idx, m in enumerate(prefix_mask) if m]
            batch_input_ids.append(input_ids)
            batch_input_mask.append(input_mask)
            batch_prefix_offset.append(prefix_offset)
            if mask_p:
                if shuffle:
                    size = int(np.ceil(mask_p * len(prefix_offset[1:])))  # never mask [CLS]
                    mask_offsets = np.random.choice(np.arange(1, len(prefix_offset)), size, replace=False)
                    for offset in sorted(mask_offsets):
                        assert 0 < offset < len(input_ids)
                        # mask_word = raw_batch[0][sent_idx][offset]
                        # mask_prefix = tokenizer.convert_ids_to_tokens([input_ids[prefix_offset[offset]]])[0]
                        # assert mask_word.startswith(mask_prefix) or mask_prefix.startswith(
                        #     mask_word) or mask_prefix == "'", \
                        #     f'word {mask_word} prefix {mask_prefix} not match'  # could vs couldn
                        # mask_offsets.append(input_ids[offset])  # subword token
                        # mask_offsets.append(offset)  # form token
                        input_ids[prefix_offset[offset]] = mask_token_id  # mask prefix
                        # whole word masking, mask the rest of the word
                        for i in range(prefix_offset[offset] + 1, len(input_ids) - 1):
                            if prefix_mask[i]:
                                break
                            input_ids[i] = mask_token_id
                    batch_masked_offsets.append(sorted(mask_offsets))
                else:
                    batch_masked_offsets.append([0])  # No masking in prediction
        batch_forms = tf.keras.preprocessing.sequence.pad_sequences(batch_forms, padding='post',
                                                                    value=self.form_vocab.safe_pad_token_idx,
                                                                    dtype='int64')
        batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, padding='post',
                                                                        value=pad_token_id, dtype='int64')
        batch_input_mask = tf.keras.preprocessing.sequence.pad_sequences(batch_input_mask, padding='post',
                                                                         value=0, dtype='int64')
        batch_prefix_offset = tf.keras.preprocessing.sequence.pad_sequences(batch_prefix_offset, padding='post',
                                                                            value=0, dtype='int64')
        batch_heads = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-2], padding='post', value=0,
                                                                    dtype='int64')
        batch_rels = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-1], padding='post',
                                                                   value=self.rel_vocab.safe_pad_token_idx,
                                                                   dtype='int64')
        if mask_p:
            batch_masked_offsets = tf.keras.preprocessing.sequence.pad_sequences(batch_masked_offsets,
                                                                                 padding='post',
                                                                                 value=pad_token_id, dtype='int64')
        feats = (tf.constant(batch_input_ids, dtype='int64'), tf.constant(batch_input_mask, dtype='int64'),
                 tf.constant(batch_prefix_offset))
        if use_pos:
            batch_pos = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[1], padding='post',
                                                                      value=self.cpos_vocab.safe_pad_token_idx,
                                                                      dtype='int64')
            feats += (batch_pos,)
        yield (batch_forms, feats), \
              (batch_heads, batch_rels, batch_masked_offsets) if mask_p else (batch_heads, batch_rels)

    def len_of_sent(self, sent):
        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        xlnet = config_is(config, 'xlnet')
        roberta = config_is(config, 'roberta')
        pad_token = tokenizer.pad_token
        pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        sent = sent[1:]  # remove <root>, use [CLS] instead
        pad_label_idx = self.form_vocab.pad_idx
        sent = [x[0] for x in sent]
        sent = adjust_tokens_for_transformers(sent)
        input_ids, input_mask, segment_ids, prefix_mask = \
            convert_examples_to_features(sent, max_seq_length, tokenizer,
                                         cls_token_at_end=xlnet,  # xlnet has a cls token at the end
                                         cls_token=cls_token,
                                         cls_token_segment_id=2 if xlnet else 0,
                                         sep_token=sep_token,
                                         # roberta uses an extra separator b/w pairs of sentences,
                                         # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                         sep_token_extra=roberta,
                                         pad_on_left=xlnet,  # pad on the left for xlnet
                                         pad_token_id=pad_token_id,
                                         pad_token_segment_id=4 if xlnet else 0,
                                         pad_token_label_id=pad_label_idx,
                                         do_padding=False)
        return len(input_ids)

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None,
                           repeat=None, drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        if shuffle:
            return CoNLL_DEP_Transform.samples_to_dataset(self, samples, map_x, map_y, batch_size, shuffle, repeat,
                                                          drop_remainder, prefetch, cache)

        def generator():
            # custom bucketing, load corpus into memory
            corpus = list(x for x in (samples() if callable(samples) else samples))
            n_tokens = 0
            batch = []
            for idx, sent in enumerate(corpus):
                sent_len = self.len_of_sent(sent)
                if n_tokens + sent_len > batch_size and batch:
                    yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
                    n_tokens = 0
                    batch = []
                n_tokens += sent_len
                batch.append(idx)
            if batch:
                yield from self.batched_inputs_to_batches(corpus, batch, shuffle)

        # debug for transformer
        # next(generator())
        return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder,
                                            prefetch, cache)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        if self.graph:
            ys = CoNLL_SDP_Transform.Y_to_outputs(self, Y, gold, inputs, X)
            ys = [[([t[0] for t in l], [t[1] for t in l]) for l in y] for y in ys]
            return ys
        return super().Y_to_outputs(Y, gold, inputs, X)
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=True, input_dim=None,
             output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, cpu=True, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # if the `unk` token exists in the pretrained,
    # then replace it with a self-defined one, usually the one in word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = VocabTF()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0') if cpu else DummyContext():
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # insert to pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # Retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
class TSVTaggingTransform(TsvTaggingFormat, Transform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False, **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None
        self.char_vocab: Optional[VocabTF] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = VocabTF()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'
        text += '\n'
        return text