Example #1
 def __init__(self,
              path: str = None,
              vocab: VocabTF = None,
              normalize: bool = False,
              load_all=True,
              mask_zero=True,
              trainable=False,
              name=None,
              dtype=None,
              dynamic=False,
              **kwargs):
     super().__init__(trainable, name, dtype, dynamic, **kwargs)
     if load_all and vocab and vocab.locked:
         vocab.unlock()
     self.vocab, self.array_np = self._load(path, vocab, normalize)
     self.vocab.lock()
     self.array_ks = tf.keras.layers.Embedding(
         input_dim=len(self.vocab),
         output_dim=self.dim,
         trainable=trainable,
         embeddings_initializer=tf.keras.initializers.Constant(
             self.array_np),
         mask_zero=mask_zero)
     self.mask_zero = mask_zero
     self.supports_masking = mask_zero
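The constructor above wires a pretrained matrix (self.array_np) into a frozen tf.keras.layers.Embedding with mask_zero enabled. Below is a minimal, self-contained sketch of that pattern; MiniVocab is a hypothetical stand-in for VocabTF, not HanLP's actual class.

import numpy as np
import tensorflow as tf

class MiniVocab:
    """Hypothetical stand-in for VocabTF: index 0 is pad, index 1 is unk."""
    def __init__(self, tokens=('<pad>', '<unk>')):
        self.token_to_idx = {t: i for i, t in enumerate(tokens)}

    def __len__(self):
        return len(self.token_to_idx)

vocab = MiniVocab(('<pad>', '<unk>', 'hello', 'world'))
dim = 8
pret_embs = np.random.uniform(-0.1, 0.1, (len(vocab), dim)).astype(np.float32)
pret_embs[0] = 0.  # keep the pad row at zero so mask_zero behaves as expected

layer = tf.keras.layers.Embedding(
    input_dim=len(vocab),
    output_dim=dim,
    embeddings_initializer=tf.keras.initializers.Constant(pret_embs),
    mask_zero=True,      # index 0 is treated as padding and masked downstream
    trainable=False)     # freeze the pretrained vectors
print(layer(tf.constant([[2, 3, 0]])).shape)  # (1, 3, 8)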
Example #2
    @staticmethod
    def _load(path,
              vocab,
              normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
        if not vocab:
            vocab = VocabTF()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        state = np.random.get_state()
        np.random.seed(0)
        bias = np.random.uniform(low=-0.001, high=0.001,
                                 size=dim).astype(dtype=np.float32)
        scale = np.sqrt(3.0 / dim)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                vec = word2vec.get(word.lower(), None)
                # if vec is not None:
                #     vec += bias
            if vec is None:
                # vec = np.random.uniform(-scale, scale, [dim])
                vec = np.zeros([dim], dtype=np.float32)
            pret_embs[idx] = vec
        # noinspection PyTypeChecker
        np.random.set_state(state)
        return vocab, pret_embs
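_load seeds NumPy for a reproducible bias vector but restores the caller's RNG state afterwards. A tiny sketch of that save/seed/restore pattern, assuming reproducibility of the initialization is the only goal:

import numpy as np

def reproducible_vector(dim, seed=0):
    state = np.random.get_state()   # remember the caller's RNG state
    np.random.seed(seed)            # deterministic init regardless of caller
    vec = np.random.uniform(-0.001, 0.001, size=dim).astype(np.float32)
    np.random.set_state(state)      # leave the global RNG untouched
    return vec

print(reproducible_vector(4))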
Example #3
 def fit(self, trn_path: str, **kwargs):
     self.word_vocab = VocabTF()
     self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
     for ngrams, tags in self.file_to_samples(trn_path):
         for words in ngrams:
             self.word_vocab.update(words)
         self.tag_vocab.update(tags)

 def fit(self, trn_path: str, **kwargs) -> int:
     self.tag_vocab = VocabTF(unk_token=None)
     num_samples = 0
     for words, tags in self.file_to_inputs(trn_path, gold=True):
         num_samples += 1
         self.tag_vocab.update(tags)
     return num_samples
Example #5
File: text.py  Project: lei1993/HanLP
 def fit(self, trn_path: str, **kwargs) -> int:
     self.vocab = VocabTF()
     num_samples = 0
     for x, y in self.file_to_inputs(trn_path):
         self.vocab.update(x)
         num_samples += 1
     return num_samples
Example #6
 def load_vocabs(self, save_dir, filename='vocabs.json'):
     save_dir = get_resource(save_dir)
     vocabs = SerializableDict()
     vocabs.load_json(os.path.join(save_dir, filename))
     for key, value in vocabs.items():
         vocab = VocabTF()
         vocab.copy_from(value)
         setattr(self.transform, key, vocab)
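load_vocabs restores every vocabulary attribute from a single vocabs.json via SerializableDict and copy_from. Below is a minimal sketch of the JSON round trip it depends on; the per-vocab layout shown here is an assumption for illustration, not HanLP's exact on-disk format.

import json
import os
import tempfile

# assumed layout: one entry per vocabulary attribute
vocabs = {'tag_vocab': {'idx_to_token': ['<pad>', 'B', 'I', 'O'], 'unk_token': None}}

save_dir = tempfile.mkdtemp()
path = os.path.join(save_dir, 'vocabs.json')
with open(path, 'w', encoding='utf-8') as f:
    json.dump(vocabs, f, ensure_ascii=False, indent=2)

with open(path, encoding='utf-8') as f:
    restored = json.load(f)
print(restored['tag_vocab']['idx_to_token'])  # ['<pad>', 'B', 'I', 'O']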
Example #7
File: tacred.py  Project: lei1993/HanLP
 def __init__(self,
              config: SerializableDict = None,
              map_x=True,
              map_y=True,
              lower=False,
              **kwargs) -> None:
     super().__init__(**merge_locals_kwargs(locals(), kwargs))
     self.token_vocab = VocabTF()
     self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
     self.ner_vocab = VocabTF(pad_token=None)
     self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
     self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
Example #8
 def __init__(self,
              filepath: str = None,
              vocab: VocabTF = None,
              expand_vocab=True,
              lowercase=False,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              **kwargs):
     if vocab is None:
         vocab = VocabTF()
     self.vocab = vocab
     super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim,
                      output_dim, unk, normalize, embeddings_initializer,
                      embeddings_regularizer, activity_regularizer,
                      embeddings_constraint, mask_zero, input_length, name,
                      **kwargs)
Example #9
class WindowTokenTransform(TSVTaggingTransform):

    def fit(self, trn_path: str, **kwargs):
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        for ngrams, tags in self.file_to_samples(trn_path):
            for words in ngrams:
                self.word_vocab.update(words)
            self.tag_vocab.update(tags)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        window_radius = self.config.window_radius
        window_size = 2 * window_radius + 1
        types = tf.string, tf.string
        shapes = [None, window_size], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        window_radius = self.config.window_radius
        for t in inputs:
            if gold:
                words, tags = t
            else:
                words, tags = t, [self.padding_values[-1]] * len(t)
            ngrams = []
            for i, word in enumerate(words):
                features = []
                for t in range(-window_radius, window_radius + 1):
                    index = i + t
                    if index < 0:
                        feature = 'bos{}'.format(index)
                    elif index >= len(words):
                        feature = 'eos+{}'.format(index - len(words) + 1)
                    else:
                        feature = words[index]
                    features.append(feature)
                ngrams.append(features)
            yield ngrams, tags

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(self.word_vocab.idx_to_token[int(x[len(x) // 2])])
            yield words
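inputs_to_samples above turns each token into a window of the surrounding tokens, padding past sentence boundaries with bos/eos markers. A standalone sketch of that feature extraction:

def window_features(words, radius=2):
    ngrams = []
    for i in range(len(words)):
        feats = []
        for offset in range(-radius, radius + 1):
            j = i + offset
            if j < 0:
                feats.append('bos{}'.format(j))          # e.g. 'bos-1'
            elif j >= len(words):
                feats.append('eos+{}'.format(j - len(words) + 1))  # e.g. 'eos+1'
            else:
                feats.append(words[j])
        ngrams.append(feats)
    return ngrams

print(window_features(['I', 'love', 'NLP'], radius=1))
# [['bos-1', 'I', 'love'], ['I', 'love', 'NLP'], ['love', 'NLP', 'eos+1']]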
Example #10
File: conll_tf.py  Project: lei1993/HanLP
 def fit(self, trn_path: str, **kwargs) -> int:
     if self.config.get('joint_pos', None):
         self.config.use_pos = True
     if self.graph:
         # noinspection PyCallByClass
         num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
     else:
         num = super().fit(trn_path, **kwargs)
     if self.config.get('topk', None):
         counter = Counter()
         for sent in self.file_to_samples(trn_path, gold=True):
             for idx, cell in enumerate(sent):
                 form, head, deprel = cell
                 counter[form] += 1
         self.topk_vocab = VocabTF()
         for k, v in counter.most_common(self.config.topk):
             self.topk_vocab.add(k)
     return num
Example #11
File: conll_tf.py  Project: lei1993/HanLP
    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element, while 0th is pad and 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, cell in enumerate(sent):
                if len(cell) == 4:
                    form, cpos, head, deprel = cell
                elif len(cell) == 3:
                    if self.use_pos:
                        form, cpos = cell[0]
                    else:
                        form = cell[0]
                    head, deprel = cell[1:]
                else:
                    raise ValueError('Unknown data arrangement')
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                if self.use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples
Example #12
def vocab_from_txt(txt_file_path,
                   bigram_only=False,
                   window_size=4,
                   **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    char_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(
        pad_token=None, unk_token=None)
    for X, Y in generate_ngram_bmes(txt_file_path,
                                    bigram_only,
                                    window_size,
                                    gold=True):
        char_vocab.update(X[0])
        for ngram in X[1:]:
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    return char_vocab, ngram_vocab, tag_vocab
Example #13
 def fit(self, trn_path: str, **kwargs) -> int:
     self.word_vocab = VocabTF()
     self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
     num_samples = 0
     for words, tags in self.file_to_inputs(trn_path, True):
         self.word_vocab.update(words)
         self.tag_vocab.update(tags)
         num_samples += 1
     if self.char_vocab:
         self.char_vocab = VocabTF()
         for word in self.word_vocab.token_to_idx.keys():
             if word in (self.word_vocab.pad_token,
                         self.word_vocab.unk_token):
                 continue
             self.char_vocab.update(list(word))
     return num_samples
Example #14
 def fit(self, trn_path: str, **kwargs):
     word_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(
         pad_token=None, unk_token=None)
     num_samples = 0
     for X, Y in self.file_to_samples(trn_path, gold=True):
         num_samples += 1
         word_vocab.update(X[0])
         for ngram in X[1:]:
             ngram_vocab.update(filter(lambda x: x, ngram))
         tag_vocab.update(Y)
     self.word_vocab, self.ngram_vocab, self.tag_vocab = word_vocab, ngram_vocab, tag_vocab
     if self.config.window_size:
         vocabs = word_vocab, ngram_vocab, tag_vocab
     else:
         vocabs = word_vocab, None, tag_vocab
     self.word_vocab, self.ngram_vocab, self.tag_vocab = vocabs
     return num_samples
Example #15
File: tacred.py  Project: lei1993/HanLP
class TACREDTransform(Transform):
    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 lower=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.token_vocab = VocabTF()
        self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.ner_vocab = VocabTF(pad_token=None)
        self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)

    def fit(self, trn_path: str, **kwargs) -> int:
        count = 0
        for (tokens, pos, ner, head, deprel, subj_positions, obj_positions,
             subj_type, obj_type), relation in self.file_to_samples(trn_path,
                                                                    gold=True):
            count += 1
            self.token_vocab.update(tokens)
            self.pos_vocab.update(pos)
            self.ner_vocab.update(ner)
            self.deprel_vocab.update(deprel)
            self.rel_vocab.add(relation)
        return count

    def file_to_inputs(self, filepath: str, gold=True):
        data = load_json(filepath)
        for d in data:
            tokens = list(d['token'])
            ss, se = d['subj_start'], d['subj_end']
            os, oe = d['obj_start'], d['obj_end']
            pos = d['stanford_pos']
            ner = d['stanford_ner']
            deprel = d['stanford_deprel']
            head = [int(x) for x in d['stanford_head']]
            assert any([x == 0 for x in head])
            relation = d['relation']
            yield (tokens, pos, ner, head, deprel, ss, se, os, oe), relation

    def inputs_to_samples(self, inputs, gold=False):
        for input in inputs:
            if gold:
                (tokens, pos, ner, head, deprel, ss, se, os,
                 oe), relation = input
            else:
                tokens, pos, ner, head, deprel, ss, se, os, oe = input
                relation = self.rel_vocab.safe_pad_token
            l = len(tokens)
            subj_positions = get_positions(ss, se, l)
            obj_positions = get_positions(os, oe, l)
            subj_type = ner[ss]
            obj_type = ner[os]
            # anonymize tokens
            tokens[ss:se + 1] = ['SUBJ-' + subj_type] * (se - ss + 1)
            tokens[os:oe + 1] = ['OBJ-' + obj_type] * (oe - os + 1)
            # min head is 0, but root is not included in tokens, so take 1 off from each head
            head = [h - 1 for h in head]
            yield (tokens, pos, ner, head, deprel, subj_positions,
                   obj_positions, subj_type, obj_type), relation

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        # (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
        types = (tf.string, tf.string, tf.string, tf.int32, tf.string,
                 tf.int32, tf.int32, tf.string, tf.string), tf.string
        shapes = ([None], [None], [None], [None], [None], [None], [None], [],
                  []), []
        pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token, 0,
                self.deprel_vocab.safe_pad_token, 0, 0,
                self.ner_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
        return types, shapes, pads

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
        tokens = self.token_vocab.lookup(tokens)
        pos = self.pos_vocab.lookup(pos)
        ner = self.ner_vocab.lookup(ner)
        deprel = self.deprel_vocab.lookup(deprel)
        subj_type = self.ner_vocab.lookup(subj_type)
        obj_type = self.ner_vocab.lookup(obj_type)
        return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type

    def y_to_idx(self, y) -> tf.Tensor:
        return self.rel_vocab.lookup(y)
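inputs_to_samples above relies on an external get_positions helper that is not shown. The sketch below is a hypothetical version implementing the usual TACRED relative-position encoding (0 inside the entity span, negative offsets before it, positive offsets after it); it is an assumption for illustration, not HanLP's verified implementation.

def get_positions(start, end, length):
    """Relative positions of each token w.r.t. the span [start, end]."""
    return list(range(-start, 0)) + [0] * (end - start + 1) + \
           list(range(1, length - end))

print(get_positions(2, 3, 6))  # [-2, -1, 0, 0, 1, 2]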
Example #16
 def build_transform(self, embeddings, **kwargs):
     if embeddings_require_string_input(embeddings):
         self.transform.map_x = False
         if embeddings_require_char_input(embeddings):
             self.transform.char_vocab = VocabTF()
     return super().build_transform(**kwargs)

class TransformerTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 tokenizer=None,
                 config: SerializableDict = None,
                 map_x=False, map_y=False, **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        self._tokenizer = tokenizer
        self.tag_vocab: VocabTF = None
        self.special_token_ids = None
        self.pad = '[PAD]'
        self.unk = '[UNK]'

    @property
    def max_seq_length(self):
        # -2 for special tokens [CLS] and [SEP]
        return self.config.get('max_seq_length', 128) - 2

    @property
    def tokenizer(self):
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        self._tokenizer = tokenizer
        vocab = tokenizer._vocab if hasattr(tokenizer, '_vocab') else tokenizer.vocab
        if self.pad not in vocab:
            # English ALBERT uses <pad> instead of [PAD]
            self.pad = '<pad>'
        if self.unk not in vocab:
            self.unk = '<unk>'
        self.special_token_ids = tf.constant([vocab[token] for token in [self.pad, '[CLS]', '[SEP]']],
                                             dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.tag_vocab = VocabTF(unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, gold=True):
            num_samples += 1
            self.tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        max_seq_length = self.config.get('max_seq_length', 128)
        types = (tf.int32, tf.int32, tf.int32), tf.int32
        # (input_ids, input_mask, segment_ids), label_ids
        shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
        values = (0, 0, 0), self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        xlnet = False
        roberta = False
        pad_token = self.pad
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = self.unk

        pad_label_idx = self.tag_vocab.pad_idx
        pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.idx_to_token[1]] * len(sample)

            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(words,
                                                                                         max_seq_length, tokenizer,
                                                                                         tags,
                                                                                         self.tag_vocab.token_to_idx,
                                                                                         cls_token_at_end=xlnet,
                                                                                         # xlnet has a cls token at the end
                                                                                         cls_token=cls_token,
                                                                                         cls_token_segment_id=2 if xlnet else 0,
                                                                                         sep_token=sep_token,
                                                                                         sep_token_extra=roberta,
                                                                                         # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                                                                         pad_on_left=xlnet,
                                                                                         # pad on the left for xlnet
                                                                                         pad_token_id=pad_token,
                                                                                         pad_token_segment_id=4 if xlnet else 0,
                                                                                         pad_token_label_id=pad_label_idx,
                                                                                         unk_token=unk_token)

            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        raise NotImplementedError('transformers has its own tagger, no need to convert idx for x')

    def y_to_idx(self, y) -> tf.Tensor:
        raise NotImplementedError('transformers has its own tagger, no need to convert idx for y')

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None, batch=None,
                     **kwargs) -> Iterable:
        assert batch is not None, 'Need the batch to know actual length of Y'
        label_mask = batch[1]

        Y = tf.argmax(Y, axis=-1)
        Y = Y[label_mask > 0]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)
Example #18
File: conll_tf.py  Project: lei1993/HanLP
class CoNLL_SDP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
                 use_pos=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, use_pos, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        # heuristic to find the orphan relation
        self._find_orphan_relation()

    def _find_orphan_relation(self):
        for rel in self.rel_vocab.idx_to_token:
            if 'root' in rel.lower():
                self.orphan_relation = rel
                break

    def file_to_inputs(self, filepath: str, gold=True):
        assert gold, 'only support gold file for now'
        use_pos = self.use_pos
        conllu = filepath.endswith('.conllu')
        enhanced_only = self.config.get('enhanced_only', None)
        for i, sent in enumerate(read_conll(filepath)):
            parsed_sent = []
            if conllu:
                for cell in sent:
                    ID = cell[0]
                    form = cell[1]
                    cpos = cell[3]
                    head = cell[6]
                    deprel = cell[7]
                    deps = cell[8]
                    deps = [x.split(':', 1) for x in deps.split('|')]
                    heads = [int(x[0]) for x in deps if x[0].isdigit()]
                    rels = [x[1] for x in deps if x[0].isdigit()]
                    if enhanced_only:
                        if head in heads:
                            offset = heads.index(head)
                            heads.pop(offset)
                            rels.pop(offset)
                    else:
                        if head not in heads:
                            heads.append(head)
                            rels.append(deprel)
                    parsed_sent.append([form, cpos, heads, rels] if use_pos else [form, heads, rels])
            else:
                prev_cells = None
                heads = []
                rels = []
                for j, cell in enumerate(sent):
                    ID = cell[0]
                    form = cell[1]
                    cpos = cell[3]
                    head = cell[6]
                    deprel = cell[7]
                    if prev_cells and ID != prev_cells[0]:  # found end of token
                        parsed_sent.append(
                            [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
                        heads = []
                        rels = []
                    heads.append(head)
                    rels.append(deprel)
                    prev_cells = [ID, form, cpos, head, deprel] if use_pos else [ID, form, head, deprel]
                parsed_sent.append(
                    [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element, while 0th is pad and 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, cell in enumerate(sent):
                if len(cell) == 4:
                    form, cpos, head, deprel = cell
                elif len(cell) == 3:
                    if self.use_pos:
                        form, cpos = cell[0]
                    else:
                        form = cell[0]
                    head, deprel = cell[1:]
                else:
                    raise ValueError('Unknown data arrangement')
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                if self.use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        use_pos = self.use_pos
        for sent in inputs:
            sample = []
            for i, cell in enumerate(sent):
                if isinstance(cell, tuple):
                    cell = list(cell)
                elif isinstance(cell, str):
                    cell = [cell]
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [[0], [self.rel_vocab.safe_pad_token]]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            if use_pos:
                form, cpos, head, deprel = sample[0]
                sample.insert(0, [self.bos, self.bos, [0], deprel])
            else:
                form, head, deprel = sample[0]
                sample.insert(0, [self.bos, [0], deprel])
            yield sample

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        use_pos = self.use_pos
        raw_batch = [[], [], [], []] if use_pos else [[], [], []]
        max_len = len(max([corpus[i] for i in indices], key=len))
        for idx in indices:
            arc = np.zeros((max_len, max_len), dtype=bool)
            rel = np.zeros((max_len, max_len), dtype=np.int64)
            for b in raw_batch[:2]:
                b.append([])
            for m, cells in enumerate(corpus[idx]):
                if use_pos:
                    for b, c, v in zip(raw_batch, cells,
                                       [self.form_vocab, self.cpos_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                else:
                    for b, c, v in zip(raw_batch, cells,
                                       [self.form_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                for n, r in zip(cells[-2], cells[-1]):
                    arc[m, n] = True
                    rid = self.rel_vocab.get_idx_without_add(r)
                    if rid is None:
                        logger.warning(f'Relation OOV: {r} does not exist in the training set')
                        continue
                    rel[m, n] = rid
            raw_batch[-2].append(arc)
            raw_batch[-1].append(rel)
        batch = []
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                              value=v.safe_pad_token_idx,
                                                              dtype='int64')
            batch.append(b)
        batch += raw_batch[2:]
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.bool, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            False, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []

        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
                                              tf.math.count_nonzero(mask, axis=-1)):
            sent = []
            for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
                ar = []
                for idx, (a, r) in enumerate(zip(arc, rel)):
                    if a:
                        ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
                if not ar:
                    # orphan
                    ar.append((0, self.orphan_relation))
                sent.append(ar)
            sents.append(sent)

        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        xs = inputs
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for x, y in zip(xs, ys):
            sent = CoNLLSentence()
            for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
                head = [p[0] for p in pred]
                deprel = [p[1] for p in pred]
                if conll:
                    sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents
Example #19
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[VocabTF, VocabTF, VocabTF]:
    word_vocab = VocabTF()
    char_vocab = VocabTF()
    tag_vocab = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
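A hypothetical usage of vocab_from_tsv on a tiny two-column file (one token and tag per line, blank line between sentences); it assumes the function above and VocabTF are importable in the current scope.

import tempfile

tsv = tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False, encoding='utf-8')
tsv.write('Hello\tNNP\nworld\tNN\n\n')
tsv.close()

word_vocab, char_vocab, tag_vocab = vocab_from_tsv(tsv.name, lower=True)
print(len(word_vocab), len(char_vocab), len(tag_vocab))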
Example #20
File: conll_tf.py  Project: lei1993/HanLP
class CoNLL_DEP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
                 min_freq=2, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, **kwargs)

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Convert batched inputs to batches of samples

        Args:
          corpus(list): A list of inputs
          indices(list): A list of indices, each list belongs to a batch
          shuffle:

        Returns:


        """
        raw_batch = [[], [], [], []]
        for idx in indices:
            for b in raw_batch:
                b.append([])
            for cells in corpus[idx]:
                for b, c, v in zip(raw_batch, cells,
                                   [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
                    b[-1].append(v.get_idx_without_add(c) if v else c)
        batch = []
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                              value=v.safe_pad_token_idx if v else 0,
                                                              dtype='int64')
            batch.append(b)
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.int64, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None], [None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            0, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        token_mapping: dict = self.config.get('token_mapping', None)
        use_pos = self.config.get('use_pos', True)
        for sent in inputs:
            sample = []
            for i, cell in enumerate(sent):
                if isinstance(cell, tuple):
                    cell = list(cell)
                elif isinstance(cell, str):
                    cell = [cell]
                if token_mapping:
                    cell[0] = token_mapping.get(cell[0], cell[0])
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [0, self.rel_vocab.safe_pad_token]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            # form, cpos, head, deprel = sample[0]
            sample.insert(0, [self.bos, self.bos, 0, self.bos] if use_pos else [self.bos, 0, self.bos])
            yield sample

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        if inputs is None:
            inputs = self.X_to_inputs(X)
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
        sents = []
        for x, y in zip(inputs, ys):
            sent = CoNLLSentence()
            for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
                if self.use_pos and not self.config.get('joint_pos', None):
                    form, cpos = cell
                else:
                    form, cpos = cell, None
                if conll:
                    sent.append(
                        CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel) if conll == '.conll'
                        else CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents

    def fit(self, trn_path: str, **kwargs) -> int:
        use_pos = self.config.use_pos
        self.form_vocab = VocabTF()
         self.form_vocab.add(ROOT)  # make root the 2nd element, while 0th is pad and 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, cell in enumerate(sent):
                if use_pos:
                    form, cpos, head, deprel = cell
                else:
                    form, head, deprel = cell
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                if use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.add(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    @property
    def root_rel_idx(self):
        root_rel_idx = self.config.get('root_rel_idx', None)
        if root_rel_idx is None:
            for idx, rel in enumerate(self.rel_vocab.idx_to_token):
                if 'root' in rel.lower() and rel != self.bos:
                    self.config['root_rel_idx'] = root_rel_idx = idx
                    break
        return root_rel_idx

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []

        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
                                              tf.math.count_nonzero(mask, axis=-1)):
            arcs = tolist(arc_sent)[1:length + 1]
            rels = tolist(rel_sent)[1:length + 1]
            sents.append([(a, self.rel_vocab.idx_to_token[r]) for a, r in zip(arcs, rels)])

        return sents
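batched_inputs_to_batches pads every field to a rectangular batch with post padding. A quick standalone demonstration of the pad_sequences call it uses:

import tensorflow as tf

batch = [[3, 1, 4], [1, 5], [9]]
padded = tf.keras.preprocessing.sequence.pad_sequences(
    batch, padding='post', value=0, dtype='int64')
print(padded)
# [[3 1 4]
#  [1 5 0]
#  [9 0 0]]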
Example #21
File: text.py  Project: lei1993/HanLP
class TextTransform(Transform):

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: VocabTF = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = VocabTF()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
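file_to_inputs above builds language-model samples with a sliding buffer: the target sequence is the input shifted by one token, and the window advances one token at a time. A standalone sketch of that windowing:

def lm_windows(tokens, seq_len=4):
    buffer = []
    for tok in tokens:
        buffer.append(tok)
        while len(buffer) > seq_len:
            yield buffer[:seq_len], buffer[1:1 + seq_len]  # y is x shifted by one
            buffer.pop(0)

for x, y in lm_windows(list('abcdef'), seq_len=3):
    print(x, y)
# ['a', 'b', 'c'] ['b', 'c', 'd']
# ['b', 'c', 'd'] ['c', 'd', 'e']
# ['c', 'd', 'e'] ['d', 'e', 'f']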
Example #22
File: conll_tf.py  Project: lei1993/HanLP
class CoNLL_Transformer_Transform(CoNLL_DEP_Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True,
                 lower=True, n_buckets=32, min_freq=0, max_seq_length=256, use_pos=False,
                 mask_p=None, graph=False, topk=None,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.tokenizer: PreTrainedTokenizer = None
        self.transformer_config: PretrainedConfig = None
        if graph:
            self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        if self.graph:
            CoNLL_SDP_Transform._find_orphan_relation(self)

    def fit(self, trn_path: str, **kwargs) -> int:
        if self.config.get('joint_pos', None):
            self.config.use_pos = True
        if self.graph:
            # noinspection PyCallByClass
            num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
        else:
            num = super().fit(trn_path, **kwargs)
        if self.config.get('topk', None):
            counter = Counter()
            for sent in self.file_to_samples(trn_path, gold=True):
                for idx, cell in enumerate(sent):
                    form, head, deprel = cell
                    counter[form] += 1
            self.topk_vocab = VocabTF()
            for k, v in counter.most_common(self.config.topk):
                self.topk_vocab.add(k)
        return num

    def inputs_to_samples(self, inputs, gold=False):
        if self.graph:
            yield from CoNLL_SDP_Transform.inputs_to_samples(self, inputs, gold)
        else:
            yield from super().inputs_to_samples(inputs, gold)

    def file_to_inputs(self, filepath: str, gold=True):
        if self.graph:
            yield from CoNLL_SDP_Transform.file_to_inputs(self, filepath, gold)
        else:
            yield from super().file_to_inputs(filepath, gold)

    @property
    def mask_p(self) -> float:
        return self.config.get('mask_p', None)

    @property
    def graph(self):
        return self.config.get('graph', None)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        mask_p = self.mask_p
        types = (tf.int64, (tf.int64, tf.int64, tf.int64)), (tf.bool if self.graph else tf.int64, tf.int64, tf.int64) if mask_p else (
            tf.bool if self.graph else tf.int64, tf.int64)
        if self.graph:
            shapes = ([None, None], ([None, None], [None, None], [None, None])), (
                [None, None, None], [None, None, None], [None, None]) if mask_p else (
                [None, None, None], [None, None, None])
        else:
            shapes = ([None, None], ([None, None], [None, None], [None, None])), (
                [None, None], [None, None], [None, None]) if mask_p else ([None, None], [None, None])

        values = (self.form_vocab.safe_pad_token_idx, (0, 0, 0)), \
                 (0, self.rel_vocab.safe_pad_token_idx, 0) if mask_p else (0, self.rel_vocab.safe_pad_token_idx)
        types_shapes_values = types, shapes, values
        if self.use_pos:
            types_shapes_values = [((shapes[0][0], shapes[0][1] + (shapes[0][0],)), shapes[1]) for shapes in
                                   types_shapes_values]
        return types_shapes_values

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        form_batch, feat, prefix_mask = X
        sents = []

        for form_sent, length in zip(form_batch, tf.math.count_nonzero(prefix_mask, axis=-1)):
            forms = tolist(form_sent)[1:length + 1]
            sents.append([self.form_vocab.idx_to_token[f] for f in forms])

        return sents

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        use_pos = self.use_pos
        if use_pos:
            raw_batch = [[], [], [], []]
        else:
            raw_batch = [[], [], []]
        if self.graph:
            max_len = len(max([corpus[i] for i in indices], key=len))
            for idx in indices:
                arc = np.zeros((max_len, max_len), dtype=bool)
                rel = np.zeros((max_len, max_len), dtype=np.int64)
                for b in raw_batch[:2 if use_pos else 1]:
                    b.append([])
                for m, cells in enumerate(corpus[idx]):
                    if use_pos:
                        for b, c, v in zip(raw_batch, cells, [None, self.cpos_vocab]):
                            b[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for b, c, v in zip(raw_batch, cells, [None]):
                            b[-1].append(c)
                    for n, r in zip(cells[-2], cells[-1]):
                        arc[m, n] = True
                        rid = self.rel_vocab.get_idx_without_add(r)
                        if rid is None:
                            logger.warning(f'Relation OOV: {r} does not exist in the training set')
                            continue
                        rel[m, n] = rid
                raw_batch[-2].append(arc)
                raw_batch[-1].append(rel)
        else:
            for idx in indices:
                for s in raw_batch:
                    s.append([])
                for cells in corpus[idx]:
                    if use_pos:
                        for s, c, v in zip(raw_batch, cells, [None, self.cpos_vocab, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for s, c, v in zip(raw_batch, cells, [None, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)

        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        xlnet = config_is(config, 'xlnet')
        roberta = config_is(config, 'roberta')
        pad_token = tokenizer.pad_token
        pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        batch_forms = []
        batch_input_ids = []
        batch_input_mask = []
        batch_prefix_offset = []
        mask_p = self.mask_p
        if mask_p:
            batch_masked_offsets = []
            mask_token_id = tokenizer.mask_token_id
        for sent_idx, sent in enumerate(raw_batch[0]):
            batch_forms.append([self.form_vocab.get_idx_without_add(token) for token in sent])
            sent = adjust_tokens_for_transformers(sent)
            sent = sent[1:]  # remove <root> use [CLS] instead
            pad_label_idx = self.form_vocab.pad_idx
            input_ids, input_mask, segment_ids, prefix_mask = \
                convert_examples_to_features(sent,
                                             max_seq_length,
                                             tokenizer,
                                             cls_token_at_end=xlnet,
                                             # xlnet has a cls token at the end
                                             cls_token=cls_token,
                                             cls_token_segment_id=2 if xlnet else 0,
                                             sep_token=sep_token,
                                             sep_token_extra=roberta,
                                             # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                             pad_on_left=xlnet,
                                             # pad on the left for xlnet
                                             pad_token_id=pad_token_id,
                                             pad_token_segment_id=4 if xlnet else 0,
                                             pad_token_label_id=pad_label_idx,
                                             do_padding=False)
            num_masks = sum(prefix_mask)
            # assert len(sent) == num_masks  # each token has a True subtoken
            if num_masks < len(sent):  # long sent gets truncated, +1 for root
                batch_forms[-1] = batch_forms[-1][:num_masks + 1]  # form
                raw_batch[-1][sent_idx] = raw_batch[-1][sent_idx][:num_masks + 1]  # head
                raw_batch[-2][sent_idx] = raw_batch[-2][sent_idx][:num_masks + 1]  # rel
                raw_batch[-3][sent_idx] = raw_batch[-3][sent_idx][:num_masks + 1]  # pos
            prefix_mask[0] = True  # <root> is now [CLS]
            prefix_offset = [idx for idx, m in enumerate(prefix_mask) if m]
            batch_input_ids.append(input_ids)
            batch_input_mask.append(input_mask)
            batch_prefix_offset.append(prefix_offset)
            if mask_p:
                if shuffle:
                    size = int(np.ceil(mask_p * len(prefix_offset[1:])))  # never mask [CLS]
                    mask_offsets = np.random.choice(np.arange(1, len(prefix_offset)), size, replace=False)
                    for offset in sorted(mask_offsets):
                        assert 0 < offset < len(input_ids)
                        # mask_word = raw_batch[0][sent_idx][offset]
                        # mask_prefix = tokenizer.convert_ids_to_tokens([input_ids[prefix_offset[offset]]])[0]
                        # assert mask_word.startswith(mask_prefix) or mask_prefix.startswith(
                        #     mask_word) or mask_prefix == "'", \
                        #     f'word {mask_word} prefix {mask_prefix} not match'  # could vs couldn
                        # mask_offsets.append(input_ids[offset]) # subword token
                        # mask_offsets.append(offset)  # form token
                        input_ids[prefix_offset[offset]] = mask_token_id  # mask prefix
                        # whole word masking, mask the rest of the word
                        for i in range(prefix_offset[offset] + 1, len(input_ids) - 1):
                            if prefix_mask[i]:
                                break
                            input_ids[i] = mask_token_id

                    batch_masked_offsets.append(sorted(mask_offsets))
                else:
                    batch_masked_offsets.append([0])  # No masking in prediction

        batch_forms = tf.keras.preprocessing.sequence.pad_sequences(batch_forms, padding='post',
                                                                    value=self.form_vocab.safe_pad_token_idx,
                                                                    dtype='int64')
        batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, padding='post',
                                                                        value=pad_token_id,
                                                                        dtype='int64')
        batch_input_mask = tf.keras.preprocessing.sequence.pad_sequences(batch_input_mask, padding='post',
                                                                         value=0,
                                                                         dtype='int64')
        batch_prefix_offset = tf.keras.preprocessing.sequence.pad_sequences(batch_prefix_offset, padding='post',
                                                                            value=0,
                                                                            dtype='int64')
        batch_heads = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-2], padding='post',
                                                                    value=0,
                                                                    dtype='int64')
        batch_rels = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-1], padding='post',
                                                                   value=self.rel_vocab.safe_pad_token_idx,
                                                                   dtype='int64')
        if mask_p:
            batch_masked_offsets = tf.keras.preprocessing.sequence.pad_sequences(batch_masked_offsets, padding='post',
                                                                                 value=pad_token_id,
                                                                                 dtype='int64')
        feats = (tf.constant(batch_input_ids, dtype='int64'), tf.constant(batch_input_mask, dtype='int64'),
                 tf.constant(batch_prefix_offset))
        if use_pos:
            batch_pos = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[1], padding='post',
                                                                      value=self.cpos_vocab.safe_pad_token_idx,
                                                                      dtype='int64')
            feats += (batch_pos,)
        yield (batch_forms, feats), \
              (batch_heads, batch_rels, batch_masked_offsets) if mask_p else (batch_heads, batch_rels)

    def len_of_sent(self, sent):
        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        xlnet = config_is(config, 'xlnet')
        roberta = config_is(config, 'roberta')
        pad_token = tokenizer.pad_token
        pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        sent = sent[1:]  # remove <root> use [CLS] instead
        pad_label_idx = self.form_vocab.pad_idx
        sent = [x[0] for x in sent]
        sent = adjust_tokens_for_transformers(sent)
        input_ids, input_mask, segment_ids, prefix_mask = \
            convert_examples_to_features(sent,
                                         max_seq_length,
                                         tokenizer,
                                         cls_token_at_end=xlnet,
                                         # xlnet has a cls token at the end
                                         cls_token=cls_token,
                                         cls_token_segment_id=2 if xlnet else 0,
                                         sep_token=sep_token,
                                         sep_token_extra=roberta,
                                         # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                         pad_on_left=xlnet,
                                         # pad on the left for xlnet
                                         pad_token_id=pad_token_id,
                                         pad_token_segment_id=4 if xlnet else 0,
                                         pad_token_label_id=pad_label_idx,
                                         do_padding=False)
        return len(input_ids)

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
                           drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        if shuffle:
            return CoNLL_DEP_Transform.samples_to_dataset(self, samples, map_x, map_y, batch_size, shuffle, repeat,
                                                          drop_remainder, prefetch, cache)

        def generator():
            # custom bucketing, load corpus into memory
            corpus = list(x for x in (samples() if callable(samples) else samples))
            n_tokens = 0
            batch = []
            for idx, sent in enumerate(corpus):
                sent_len = self.len_of_sent(sent)
                if n_tokens + sent_len > batch_size and batch:
                    yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
                    n_tokens = 0
                    batch = []
                n_tokens += sent_len
                batch.append(idx)
            if batch:
                yield from self.batched_inputs_to_batches(corpus, batch, shuffle)

        # debug for transformer
        # next(generator())
        return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch,
                                            cache)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        if self.graph:
            ys = CoNLL_SDP_Transform.Y_to_outputs(self, Y, gold, inputs, X)
            ys = [[([t[0] for t in l], [t[1] for t in l]) for l in y] for y in ys]
            return ys
        return super().Y_to_outputs(Y, gold, inputs, X)
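The mask_p branch in batched_inputs_to_batches above implements whole-word masking: it picks word positions (never [CLS]), replaces the first subword with the mask id, then keeps masking following subwords until the next word boundary. Below is a standalone sketch of that loop over a toy prefix mask; the token ids and mask id are made up for illustration.

import numpy as np

def whole_word_mask(input_ids, prefix_mask, mask_token_id, mask_p=0.3, seed=0):
    """prefix_mask[i] is True when input_ids[i] starts a new word."""
    rng = np.random.RandomState(seed)
    word_starts = [i for i, m in enumerate(prefix_mask) if m]
    size = int(np.ceil(mask_p * len(word_starts[1:])))       # never mask position 0 ([CLS])
    chosen = rng.choice(np.arange(1, len(word_starts)), size, replace=False)
    ids = list(input_ids)
    for offset in sorted(chosen):
        start = word_starts[offset]
        ids[start] = mask_token_id                           # mask the word's first subword
        for i in range(start + 1, len(ids) - 1):             # mask the rest of the word
            if prefix_mask[i]:
                break
            ids[i] = mask_token_id
    return ids

print(whole_word_mask([101, 7592, 2088, 23, 24, 102],
                      [True, True, True, False, False, False],
                      mask_token_id=103))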
Example #23
 def __init__(self,
              filepath: str = None,
              vocab: VocabTF = None,
              expand_vocab=True,
              lowercase=True,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              cpu=True,
              **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     # if the `unk` token exists in the pretrained,
     # then replace it with a self-defined one, usually the one in word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab is None:
         vocab = VocabTF()
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(
             vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(
         embeddings_initializer)
     with tf.device('cpu:0') if cpu else DummyContext():
         pret_embs = embeddings_initializer(
             shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim,
                      output_dim,
                      tf.keras.initializers.Constant(pret_embs),
                      embeddings_regularizer,
                      activity_regularizer,
                      embeddings_constraint,
                      mask_zero,
                      input_length,
                      name=name,
                      **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase
Example #24
class TSVTaggingTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 use_char=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None
        self.char_vocab: Optional[VocabTF] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = VocabTF()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token,
                            self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]
                                                  ] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(
                    str_tensor_to_str(x) if self.char_vocab else self.
                    word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     inputs=None,
                     X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(
            self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str],
                                  output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'

        text += '\n'
        return text