def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
    # CHAR_INPUT="chars", which will be converted to word indices
    self._vocabFile = vocabFile
    self._addTarget2Vocab = addTarget2Vocab
    self._CONST_CHAR = Const.CHAR_INPUT
    self._CONST_WORDS = Const.INPUT
    self._CONST_TARGET = Const.TARGET
    self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
    self._word_counter, self._word_vocab, self._target_counter, \
        self._target_vocab, self._target = self._get_vocabs()
    self._vocab4word = Vocabulary()
    self._update_word()
    if self._addTarget2Vocab:
        self._vocab4target = Vocabulary(unknown=None, padding=None)
        self._input_fields.append(self._CONST_TARGET)
        self._update_target()
    self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))
Example #2
    def test_bert_3(self):

        vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                              include_cls_sep=False)
        model = BertForTokenClassification(embed, 7)

        input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

        pred = model(input_ids)
        self.assertTrue(isinstance(pred, dict))
        self.assertTrue(Const.OUTPUT in pred)
        self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3, 7))
Example #3
    def test_bert_1_w(self):
        vocab = Vocabulary().add_word_lst("this is a test .".split())
        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                              include_cls_sep=False)

        with self.assertWarns(Warning):
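            # include_cls_sep=False on the embedding is expected to make BertForSequenceClassification
            # emit a warning, which is what assertWarns checks here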
            model = BertForSequenceClassification(embed, 2)

            input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]])

            pred = model.predict(input_ids)
            self.assertTrue(isinstance(pred, dict))
            self.assertTrue(Const.OUTPUT in pred)
            self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,))
Example #4
    def test_bert_4(self):
        vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                              include_cls_sep=False)
        model = BertForQuestionAnswering(embed)

        input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

        pred = model(input_ids)
        self.assertTrue(isinstance(pred, dict))
        self.assertTrue('pred_start' in pred)
        self.assertTrue('pred_end' in pred)
        self.assertEqual(tuple(pred['pred_start'].shape), (2, 3))
        self.assertEqual(tuple(pred['pred_end'].shape), (2, 3))
Example #5
    def test_bert_2(self):

        vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                              include_cls_sep=True)

        model = BertForMultipleChoice(embed, 2)

        input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]])
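        # multiple-choice input is 3-D: (batch_size=1, num_choices=2, seq_len=3)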
        print(input_ids.size())

        pred = model(input_ids)
        self.assertTrue(isinstance(pred, dict))
        self.assertTrue(Const.OUTPUT in pred)
        self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1, 2))
Example #6
    def load(cls, folder):
        """
        从folder中读取数据初始化RobertaEmbedding

        :param folder:
        :return:
        """
        for name in [VOCAB_NAME, ROBERTA_EMBED_HYPER, ROBERTA_EMBED_FOLDER]:
            assert os.path.exists(os.path.join(
                folder, name)), f"{name} not found in {folder}."

        vocab = Vocabulary.load(os.path.join(folder, VOCAB_NAME))
        with open(os.path.join(folder, ROBERTA_EMBED_HYPER),
                  'r',
                  encoding='utf-8') as f:
            hyper = json.load(f)
        model_name_or_path = os.path.join(folder, ROBERTA_EMBED_FOLDER)

        roberta = cls(vocab=vocab,
                      model_dir_or_name=model_name_or_path,
                      **hyper)
        return roberta
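
A minimal usage sketch for the classmethod above, assuming the folder was produced by a matching
RobertaEmbedding.save() call (the folder path below is hypothetical):

import torch
from fastNLP.embeddings import RobertaEmbedding

embed = RobertaEmbedding.load('./saved_roberta_embed')  # restores vocab, hyper-parameters and weights
words = torch.LongTensor([[2, 3, 4]])                   # a batch of word indices from the restored vocab
print(embed(words).shape)                               # (1, 3, embed_size)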
Example #7
    def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True,
                 init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
        """

        :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。
        :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个
            以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。
            如果输入为None则使用embedding_dim的维度随机初始化一个embedding。
        :param int embedding_dim: 随机初始化的embedding的维度,当该值为大于0的值时,将忽略model_dir_or_name。
        :param bool requires_grad: 是否需要gradient. 默认为True
        :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法, 传入的方法应该接受一个tensor,并
            inplace地修改其值。
        :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独
            为大写的词语开辟一个vector表示,则将lower设置为False。
        :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。
        :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。
        :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。
        :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。
        :param dict kwarngs: only_train_min_freq, 仅对train中的词语使用min_freq筛选; only_norm_found_vector是否仅对在预训练中找到的词语使用normalize。
        """
        super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
        if embedding_dim > 0:
            model_dir_or_name = None

        # resolve the model path / cache path
        if model_dir_or_name is None:
            assert embedding_dim >= 1, "The dimension of embedding should be at least 1."
            embedding_dim = int(embedding_dim)
            model_path = None
        elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
            model_url = _get_embedding_url('static', model_dir_or_name.lower())
            model_path = cached_path(model_url, name='embedding')
            # check whether it exists
        elif os.path.isfile(os.path.abspath(os.path.expanduser(model_dir_or_name))):
            model_path = os.path.abspath(os.path.expanduser(model_dir_or_name))
        elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))):
            model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt')
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        # shrink the vocab according to min_freq
        truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq)
        if truncate_vocab:
            truncated_vocab = deepcopy(vocab)
            truncated_vocab.min_freq = min_freq
            truncated_vocab.word2idx = None
            if lower:  # when lower is set, the frequencies of upper- and lower-case forms must be counted together
                lowered_word_count = defaultdict(int)
                for word, count in truncated_vocab.word_count.items():
                    lowered_word_count[word.lower()] += count
                for word in truncated_vocab.word_count.keys():
                    word_count = truncated_vocab.word_count[word]
                    if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq:
                        truncated_vocab.add_word_lst([word] * (min_freq - word_count),
                                                     no_create_entry=truncated_vocab._is_word_no_create_entry(word))

            # apply the min_freq filter only to words that appear in train
            if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None:
                for word in truncated_vocab.word_count.keys():
                    if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq:
                        truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]),
                                                     no_create_entry=True)
            truncated_vocab.build_vocab()
            truncated_words_to_words = torch.arange(len(vocab)).long()
            for word, index in vocab:
                truncated_words_to_words[index] = truncated_vocab.to_index(word)
            logger.info(
                f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.")
            vocab = truncated_vocab

        self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False)
        # load the embedding
        if lower:
            lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown)
            for word, index in vocab:
                if vocab._is_word_no_create_entry(word):
                    lowered_vocab.add_word(word.lower(), no_create_entry=True)
                else:
                    lowered_vocab.add_word(word.lower())  # first add the words that need their own entry
            logger.info(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} "
                        f"unique lowered words.")
            if model_path:
                embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method)
            else:
                embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
                self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
            if lowered_vocab.unknown:
                unknown_idx = lowered_vocab.unknown_idx
            else:
                unknown_idx = embedding.size(0) - 1  # otherwise the last row serves as unknown
                self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
            words_to_words = torch.full((len(vocab),), fill_value=unknown_idx).long()
            for word, index in vocab:
                if word not in lowered_vocab:
                    word = word.lower()
                    if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word):
                        continue  # no entry needs to be created; it already defaults to unknown
                words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)]
            self.register_buffer('words_to_words', words_to_words)
            self._word_unk_index = lowered_vocab.unknown_idx  # update the unknown index accordingly
        else:
            if model_path:
                embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method)
            else:
                embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
                self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
        if not self.only_norm_found_vector and normalize:
            embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)

        if truncate_vocab:
            for i in range(len(truncated_words_to_words)):
                index_in_truncated_vocab = truncated_words_to_words[i]
                truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab]
            del self.words_to_words
            self.register_buffer('words_to_words', truncated_words_to_words)
        self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                      padding_idx=vocab.padding_idx,
                                      max_norm=None, norm_type=2, scale_grad_by_freq=False,
                                      sparse=False, _weight=embedding)
        self._embed_size = self.embedding.weight.size(1)
        self.requires_grad = requires_grad
        self.dropout = MyDropout(dropout)
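
The docstring above describes both the pre-trained and the randomly-initialised code paths. A minimal
sketch of the random-initialisation path, using the stock fastNLP StaticEmbedding (the class shown in
this example is a customised copy that wraps MyDropout):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a test .".split())
# model_dir_or_name=None together with embedding_dim>0 takes the random-initialisation branch
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a test".split()]])
print(embed(words).shape)  # torch.Size([1, 4, 5])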
Example #8
# preprocess the file
import sys
from fastNLP.io.loader import CSVLoader
bundle = CSVLoader(headers=['raw_words1', 'raw_words2', 'target'],
                   sep='\t').load(sys.argv[1])

#####test
import jieba
from fastNLP.core import Vocabulary
bundle.apply(lambda line: jieba.lcut(line['raw_words1']) + ['[SEP]'] + jieba.lcut(line['raw_words2']),
             new_field_name='words')
bundle.apply(lambda line: len(line['words']), new_field_name='seq_len')
bundle.apply(lambda line: 1, new_field_name='target')
vocab = Vocabulary()
vocab.from_dataset(bundle.get_dataset("train"),
                   field_name='words',
                   no_create_entry_dataset=[
                       bundle.get_dataset("test"),
                       bundle.get_dataset("dev")
                   ])
vocab.index_dataset(bundle.get_dataset("train"), field_name='words')
vocab.index_dataset(bundle.get_dataset("test"), field_name='words')
vocab.index_dataset(bundle.get_dataset("dev"), field_name='words')

# establish the model
from fastNLP import Const
import torch
from fastNLP.models import BertForSentenceMatching
from fastNLP.embeddings.bert_embedding import BertEmbedding
embed = BertEmbedding(vocab, model_dir_or_name='cn-base', requires_grad=False)
Example #9
def equip_chinese_ner_with_lexicon(datasets,
                                   vocabs,
                                   embeddings,
                                   w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]  # replace a leading digit with '0'
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only loading lexicon words that appear in train is supported')

    def get_skip_path(chars, w_trie):
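        # run the word trie over the whole character sequence; get_lexicon returns the dictionary
        # matches as (start_index, end_index, word) triples, which later become extra lattice tokens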
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet
    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
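        # build the flat-lattice token sequence: the original characters first, then every matched
        # lexicon word (x[2] of each (start, end, word) triple); get_pos_s/get_pos_e below give each
        # lattice token its start/end character position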
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_l2r_word',new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
    #
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_r2l_word',new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')
    vocabs['span_label'].index_dataset(*(datasets.values()),
                                       field_name='span_label',
                                       new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
Example #10
def equip_chinese_ner_with_lexicon(
        datasets,
        vocabs,
        embeddings,
        w_list,
        word_embedding_path=None,
        only_lexicon_in_train=False,
        word_char_mix_embedding_path=None,  # embedding that mixes characters and words
        number_normalized=False,
        lattice_min_freq=1,
        only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]  # replace a leading digit with '0'
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only loading lexicon words that appear in train is supported')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet
    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab
    """
    1.word_embedding_path 这个参数到底是用做什么的?
    我将其设置成了 None。但是如果为None,那么embedding['word']没有了还可以吗?
    
    2.StaticEmbedding:
    给定预训练embedding的名称或路径,根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来, 如果没有找到,则会随机初始化一个值(但如果该word是被标记为no_create_entry的话,则不会单独创建一个值,而是会被指向unk的index))
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')

    return datasets, vocabs, embeddings
Example #11
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    from fastNLP.core import DataSet
    from utils import get_bigrams
    bigrams = get_bigrams(chars)
    seq_len = len(chars)
    target = ['O'] * seq_len
    dataset = DataSet({
        'chars': [chars],
        'bigrams': [bigrams],
        'seq_len': [seq_len],
        'target': [target]
    })
    datasets = {'train': dataset}

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]  # replace a leading digit with '0'
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP.embeddings import StaticEmbedding
    from fastNLP import DataSet
    a = DataSet()
    a.apply
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_l2r_word',new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
    #
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_r2l_word',new_field_name='skips_r2l_word')

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')

    return datasets, vocabs
Example #12
class CustomizedNER(object):
    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        # CHAR_INPUT="chars", which will be converted to word indices
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        self._word_counter, self._word_vocab, self._target_counter, \
        self._target_vocab, self._target = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
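        # word_token holds the predicted label indices per character and cont the raw characters;
        # consecutive characters whose labels are entity labels (keys of self._target) are grouped,
        # and each group is stored under the string of its concatenated label indices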
        ret = dict()
        sign = True
        lastIdx = len(word_token) - 1
        for num, token in zip(enumerate(word_token), cont):
            if num[1] in self._target:
                if sign:
                    number += str(num[1])
                    word += token
                    if num[0] < lastIdx and not word_token[num[0] + 1]:
                        sign = False
                else:
                    ret.setdefault(number, set())
                    ret[number].add(word)
                    number = ""
                    word = token
                    sign = True
        if number:
            ret.setdefault(number, set())
            ret[number].add(word)
        return ret

    def _extract_ner(self, tokenNum, token, weighted=False):
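        # map a group to a coarse entity type by taking its most frequent label index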
        if not weighted:
            cls = self._target.get(int(max(tokenNum, key=tokenNum.count)), "")
            if cls.endswith("LOC"):
                return {"LOC": [x for x in token]}
            elif cls.endswith("PER"):
                return {"PER": [x for x in token]}
            elif cls.endswith("ORG"):
                return {"ORG": [x for x in token]}

    def _get_ner(self, tokenNumber, tokenWord):
        nerDict = self._target_token(tokenNumber, tokenWord)
        ret = dict()
        for num, token in nerDict.items():
            if len(num) == 1:
                continue
            for k, v in self._extract_ner(num, token).items():
                ret.setdefault(k, list())
                ret[k].extend(v)
        return ret

    def _read_vocab(self):
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            return eval(vocabIn.read())

    def _reverse_dict(self, dic):
        ret = dict()
        for key, value in dic.items():
            ret.setdefault(value, key)
        return ret

    def _target_label(self, dic):
        ret = self._reverse_dict(dic)
        del ret[0]
        return ret

    def _get_vocabs(self):
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._target_label(targetVocab)
        return Counter(word_count), wordsVocab, Counter(
            target_count), targetVocab, reverseTargetVocab

    def _update_word(self):
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        if not self._model:
            raise ValueError("model has not been loaded")
        return self._model

    def formatRowString(self, msg):
        msg = msg.strip()
        tokenized_char = [x for x in msg]
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char,
                           raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        # if the input were given in dict format:
        # data = DataSet({"raw_chars":[msg], "words":[[x for x in msg]], "seq_len":[len(word_list)]})
        # build the vocabulary from the chars field of this dataset
        self._vocab4word.from_dataset(self._dataset,
                                      field_name=self._CONST_CHAR)
        # use the vocabulary to convert the chars field to indices
        self._vocab4word.index_dataset(self._dataset,
                                       field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset,
                                            field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset,
                                             field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        # run prediction on the dataset and yield the extracted entities
        ret = self.model.predict(dataset)["pred"]
        for line, cont in zip(ret, self._content()):
            yield self._get_ner(line[0].tolist(), cont)
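
A hedged usage sketch for the class above (the model/vocab paths and the input text are hypothetical;
the vocab file must contain the dict literal that _read_vocab expects):

ner = CustomizedNER('path/to/model.pkl', 'path/to/vocab.txt')
ner.formatRowString("some raw text to tag")
for entities in ner.result(ner.dataset):
    print(entities)  # e.g. {'PER': [...], 'LOC': [...], 'ORG': [...]}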
Example #13
    def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = True, min_freq=1,
                 only_use_pretrain_bpe=False, truncate_embed=True):
        super().__init__()

        self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = CamembertModel.from_pretrained(model_dir_or_name)

        self.encoder.resize_token_embeddings(len(self.tokenizer))
        self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
        encoder_layer_number = len(self.encoder.encoder.layer)
        if isinstance(layers, list):
            self.layers = [int(l) for l in layers]
        elif isinstance(layers, str):
            self.layers = list(map(int, layers.split(',')))
        else:
            raise TypeError("`layers` only supports str or list[int]")
        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                       f"a bert model with {encoder_layer_number} layers."
            else:
                assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a bert model with {encoder_layer_number} layers."

        assert pool_method in ('avg', 'max', 'first', 'last')
        self.pool_method = pool_method
        self.include_cls_sep = include_cls_sep
        self.pooled_cls = pooled_cls
        self.auto_truncate = auto_truncate

        logger.info("Start to generate word pieces for word.")
        word_piece_dict = {'<s>': 1, '</s>': 1}
        found_count = 0
        new_add_to_bpe_vocab = 0
        unsegment_count = 0
        if "<s>" in vocab:
            warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
                          "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
                          " and end.")

        unique = []
        for word, index in vocab:

            word_pieces = []
            word_pieces.extend(self.tokenizer.tokenize(word))  # , add_prefix_space=True))

            word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
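            # id 3 is assumed to be the tokenizer's <unk> piece id here (consistent with the '<unk>'
            # check below); hitting it means the word could not be segmented into known pieces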
            if 3 in word_token_ids:
                if word_pieces[word_token_ids.index(3)] not in unique:
                    unique.append(word_pieces[word_token_ids.index(3)])
                    unsegment_count += 1
            if not vocab._is_word_no_create_entry(word):
                if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
                    if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
                            word) and not only_use_pretrain_bpe:
                        word_piece_dict[word] = 1
                        new_add_to_bpe_vocab += 1
                    unsegment_count += 1
                    continue
            found_count += 1
            for word_piece in word_pieces:
                word_piece_dict[word_piece] = 1

        if unsegment_count > 0:
            logger.info(f"{unsegment_count} words are unsegmented.")

        word_to_wordpieces = []
        word_pieces_lengths = []
        for word, index in vocab:
            if index == vocab.padding_idx:
                word = '<pad>'
            elif index == vocab.unknown_idx:
                word = '<unk>'
            word_pieces = self.tokenizer.tokenize(word)
            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
            word_to_wordpieces.append(word_pieces)
            word_pieces_lengths.append(len(word_pieces))
        self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
        self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
        self._word_pad_index = vocab.padding_idx
        self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids(
            '<pad>')
        self.word_to_wordpieces = np.array(word_to_wordpieces)
        self.register_buffer('word_pieces_lengths',
                             torch.LongTensor(word_pieces_lengths))
        self.encoder.resize_token_embeddings(len(self.tokenizer))
        logger.debug("Successfully generate word pieces.")