Example #1
import sys

from fastNLP.io.loader import CSVLoader

# load a tab-separated file whose path is given on the command line
bundle = CSVLoader(headers=['raw_words1', 'raw_words2', 'target'],
                   sep='\t').load(sys.argv[1])

# test: tokenize with jieba and build/apply the vocabulary
import jieba
from fastNLP.core import Vocabulary
bundle.apply(lambda line: jieba.lcut(line['raw_words1']) + ['[SEP]'] +
             jieba.lcut(line['raw_words2']),
             new_field_name='words')
bundle.apply(lambda line: len(line['words']), new_field_name='seq_len')
# placeholder: this test snippet hard-codes every label to 1
bundle.apply(lambda line: 1, new_field_name='target')
vocab = Vocabulary()
# build the vocabulary from train only; words seen only in dev/test are marked
# no_create_entry so they are handled like unknown words at embedding time
vocab.from_dataset(bundle.get_dataset("train"),
                   field_name='words',
                   no_create_entry_dataset=[
                       bundle.get_dataset("test"),
                       bundle.get_dataset("dev")
                   ])
vocab.index_dataset(bundle.get_dataset("train"), field_name='words')
vocab.index_dataset(bundle.get_dataset("test"), field_name='words')
vocab.index_dataset(bundle.get_dataset("dev"), field_name='words')

# build the model
from fastNLP import Const
import torch
from fastNLP.models import BertForSentenceMatching
from fastNLP.embeddings.bert_embedding import BertEmbedding

# requires_grad=False freezes the BERT weights
embed = BertEmbedding(vocab, model_dir_or_name='cn-base', requires_grad=False)

# use index 0 as the padding value for the 'words' field
bundle.set_pad_val("words", 0)
Example #2
def equip_chinese_ner_with_lexicon(
        datasets,
        vocabs,
        embeddings,
        w_list,
        word_embedding_path=None,
        only_lexicon_in_train=False,
        word_char_mix_embedding_path=None,  # embedding covering both characters and words
        number_normalized=False,  # 0/False: none; 1: chars; 2: chars+bigrams; 3: normalize before lexicon matching
        lattice_min_freq=1,
        only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    # number_normalized == 3: normalize digits in chars and bigrams before lexicon matching
    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('loading only the lexicon words that appear in the train set')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP_module import StaticEmbedding

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for sent_chars in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(sent_chars)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('e.g.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        # each entry of 'lexicons' is a (start, end, word) triple matched by the trie
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        # start and end character offsets of each matched word
        v.apply_field(lambda x: [p[0] for p in x], 'lexicons', 'lex_s')
        v.apply_field(lambda x: [p[1] for p in x], 'lexicons', 'lex_e')

    # number_normalized == 1: normalize digits in chars only
    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    # number_normalized == 2: normalize digits in chars and bigrams
    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        # lattice = the original character sequence followed by the matched words
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + [x[2] for x in lexicons]
        return result

    def get_pos_s(ins):
        # start positions: i for the i-th character, then the word start offsets
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        # end positions: i for the i-th character, then the word end offsets
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab
    """
    1.word_embedding_path 这个参数到底是用做什么的?
    我将其设置成了 None。但是如果为None,那么embedding['word']没有了还可以吗?
    
    2.StaticEmbedding:
    给定预训练embedding的名称或路径,根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来, 如果没有找到,则会随机初始化一个值(但如果该word是被标记为no_create_entry的话,则不会单独创建一个值,而是会被指向unk的index))
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    # the 'label' vocabulary is expected to be supplied by the caller
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')

    return datasets, vocabs, embeddings
Example #3
def equip_chinese_ner_with_lexicon(datasets,
                                   vocabs,
                                   embeddings,
                                   w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,  # 0/False: none; 1: chars; 2: chars+bigrams; 3: normalize before lexicon matching
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    # number_normalized == 3: normalize digits in chars and bigrams before lexicon matching
    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('loading only the lexicon words that appear in the train set')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    # the project-local StaticEmbedding (fastNLP_module) is used in place of
    # fastNLP.embeddings.StaticEmbedding
    from fastNLP_module import StaticEmbedding

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for sent_chars in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(sent_chars)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('e.g.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        # each entry of 'lexicons' is a (start, end, word) triple matched by the trie
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        # start and end character offsets of each matched word
        v.apply_field(lambda x: [p[0] for p in x], 'lexicons', 'lex_s')
        v.apply_field(lambda x: [p[1] for p in x], 'lexicons', 'lex_e')

    # number_normalized == 1: normalize digits in chars only
    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    # number_normalized == 2: normalize digits in chars and bigrams
    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        # lattice = the original character sequence followed by the matched words
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + [x[2] for x in lexicons]
        return result

    def get_pos_s(ins):
        # start positions: i for the i-th character, then the word start offsets
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        # end positions: i for the i-th character, then the word end offsets
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')
    # the 'span_label' and 'attr_label' vocabularies must be supplied by the caller
    vocabs['span_label'].index_dataset(*(datasets.values()),
                                       field_name='span_label',
                                       new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
Example #4
import ast
from collections import Counter

from fastNLP import Const, DataSet, Instance, Vocabulary
from fastNLP.core.predictor import Predictor
from fastNLP.io import ModelLoader


class CustomizedNER(object):
    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        # Const.CHAR_INPUT == "chars"; this field is converted to word indices later
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        self._word_counter, self._word_vocab, self._target_counter, \
        self._target_vocab, self._target = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
        # group consecutive characters whose predicted tag id belongs to a known
        # target label; keys of the result are the concatenated tag-id strings
        ret = dict()
        sign = True
        lastIdx = len(word_token) - 1
        for (idx, tag), token in zip(enumerate(word_token), cont):
            if tag in self._target:
                if sign:
                    number += str(tag)
                    word += token
                    # a zero tag right after this one closes the current span
                    if idx < lastIdx and not word_token[idx + 1]:
                        sign = False
                else:
                    ret.setdefault(number, set())
                    ret[number].add(word)
                    number = ""
                    word = token
                    sign = True
        if number:
            ret.setdefault(number, set())
            ret[number].add(word)
        return ret

    def _extract_ner(self, tokenNum, token, weighted=False):
        if not weighted:
            # majority vote over the accumulated tag ids decides the entity type
            cls = self._target.get(int(max(tokenNum, key=tokenNum.count)), "")
            if cls.endswith("LOC"):
                return {"LOC": list(token)}
            elif cls.endswith("PER"):
                return {"PER": list(token)}
            elif cls.endswith("ORG"):
                return {"ORG": list(token)}
        # no recognizable entity type: return an empty mapping instead of None
        return {}

    def _get_ner(self, tokenNumber, tokenWord):
        nerDict = self._target_token(tokenNumber, tokenWord)
        ret = dict()
        for num, token in nerDict.items():
            if len(num) == 1:  # skip spans made of a single tag
                continue
            for k, v in self._extract_ner(num, token).items():
                ret.setdefault(k, list())
                ret[k].extend(v)
        return ret

    def _read_vocab(self):
        # the vocab file stores a Python dict literal; literal_eval is safer than eval
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            return ast.literal_eval(vocabIn.read())

    def _reverse_dict(self, dic):
        ret = dict()
        for key, value in dic.items():
            ret.setdefault(value, key)
        return ret

    def _target_label(self, dic):
        ret = self._reverse_dict(dic)
        del ret[0]  # drop the padding index
        return ret

    def _get_vocabs(self):
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._target_label(targetVocab)
        return Counter(word_count), wordsVocab, Counter(
            target_count), targetVocab, reverseTargetVocab

    def _update_word(self):
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        if not self._model:
            raise RuntimeError("model has not been loaded")
        return self._model

    def formatRowString(self, msg):
        msg = msg.strip()
        tokenized_char = list(msg)  # split the message into single characters
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char,
                           raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        # if the input arrived as a dict, it could be built as:
        # data = DataSet({"raw_chars": [msg], "words": [[x for x in msg]], "seq_len": [len(word_list)]})
        # build the vocabulary from the chars field of this dataset
        self._vocab4word.from_dataset(self._dataset,
                                      field_name=self._CONST_CHAR)
        # use the vocabulary to convert the chars field into indices
        self._vocab4word.index_dataset(self._dataset,
                                       field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset,
                                            field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset,
                                             field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        # yield the NER result for each sentence in the dataset
        ret = self.model.predict(dataset)["pred"]
        for line, cont in zip(ret, self._content()):
            yield self._get_ner(line[0].tolist(), cont)