import sys

import jieba
import torch
from fastNLP import Const
from fastNLP.core import Vocabulary
from fastNLP.io.loader import CSVLoader
from fastNLP.models import BertForSentenceMatching
from fastNLP.embeddings.bert_embedding import BertEmbedding

# Load a tab-separated file with two raw sentences and a label per line.
bundle = CSVLoader(headers=['raw_words1', 'raw_words2', 'target'],
                   sep='\t').load(sys.argv[1])

# Tokenize both sentences with jieba and join them with a [SEP] token.
bundle.apply(lambda line: jieba.lcut(line['raw_words1']) + ['[SEP]'] + jieba.lcut(line['raw_words2']),
             new_field_name='words')
bundle.apply(lambda line: len(line['words']), new_field_name='seq_len')
# Placeholder label used only for this test run.
bundle.apply(lambda line: 1, new_field_name='target')

# Build the vocabulary from the train split; dev/test words are registered as
# no_create_entry so that words unseen in train map to <unk>.
vocab = Vocabulary()
vocab.from_dataset(bundle.get_dataset("train"),
                   field_name='words',
                   no_create_entry_dataset=[
                       bundle.get_dataset("test"),
                       bundle.get_dataset("dev")
                   ])
vocab.index_dataset(bundle.get_dataset("train"), field_name='words')
vocab.index_dataset(bundle.get_dataset("test"), field_name='words')
vocab.index_dataset(bundle.get_dataset("dev"), field_name='words')

# Establish the model: a Chinese BERT embedding feeding a sentence-matching head.
embed = BertEmbedding(vocab, model_dir_or_name='cn-base', requires_grad=False)

# Pad the 'words' input field with index 0.
bundle.set_pad_val("words", 0)
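The snippet above stops after building the embedding. Below is a minimal sketch of how the pieces might be wired into training with fastNLP's Trainer; num_labels, batch_size and n_epochs are illustrative assumptions, and it reuses the constant placeholder target created above.

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

# Mark which fields are fed to the model and which field is the label.
for name in ('train', 'dev', 'test'):
    ds = bundle.get_dataset(name)
    ds.set_input('words', 'seq_len')
    ds.set_target('target')

# BertForSentenceMatching takes the BertEmbedding and the number of classes.
model = BertForSentenceMatching(embed, num_labels=2)

trainer = Trainer(train_data=bundle.get_dataset('train'),
                  model=model,
                  loss=CrossEntropyLoss(),
                  metrics=AccuracyMetric(),
                  dev_data=bundle.get_dataset('dev'),
                  batch_size=8,      # illustrative value
                  n_epochs=2)        # illustrative value
trainer.train()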
def equip_chinese_ner_with_lexicon(
        datasets, vocabs, embeddings, w_list,
        word_embedding_path=None,
        only_lexicon_in_train=False,
        word_char_mix_embedding_path=None,  # embedding file covering both characters and words
        number_normalized=False,
        lattice_min_freq=1,
        only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        # Map every digit to '0' so all numbers share one representation.
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        # Same digit normalization, applied to both characters of a bigram.
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Loading only the lexicon words that appear in the training set is supported.')

    def get_skip_path(chars, w_trie):
        # Find every lexicon word that matches a span of the sentence.
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP_module import StaticEmbedding

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        # Rebuild the trie from the words that actually occur in the training sentences.
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        # 'lexicons' holds (start, end, word) triples for every matched lexicon word.
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        # The lattice sequence is the characters followed by all matched words.
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        return result

    def get_pos_s(ins):
        # Start positions: each character starts at its own index, words at lex_s.
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        # End positions: each character ends at its own index, words at lex_e.
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items() if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab

    """
    1. What exactly is the word_embedding_path argument for? I set it to None, but if it is
       None there will be no embeddings['word'] -- is that still fine?
    2. StaticEmbedding: given the name or path of a pretrained embedding, it extracts from
       the file only the vectors for words that appear in the vocab. A word that is not
       found is randomly initialized, except that a word marked as no_create_entry does not
       get its own entry and is mapped to the unk index instead.
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
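Below is a minimal, hypothetical usage sketch of the helper above. The toy sentences, labels, and lexicon are illustrative only; it assumes the repository's V0.utils_ (Trie) and fastNLP_module (StaticEmbedding) modules are importable, and both embedding paths are left as None so no pretrained files are needed.

from fastNLP import DataSet
from fastNLP.core import Vocabulary

def make_ds(sentences, labels):
    # Build a toy DataSet with the character-level fields the helper expects.
    ds = DataSet({'chars': [list(s) for s in sentences], 'target': labels})
    ds.apply_field(lambda chars: [a + b for a, b in zip(chars, chars[1:] + ['<eos>'])],
                   'chars', 'bigrams')
    ds.add_seq_len('chars')
    return ds

datasets = {'train': make_ds(['重庆人民', '和平饭店'],
                             [['B-LOC', 'I-LOC', 'O', 'O'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']]),
            'dev':   make_ds(['重庆饭店'], [['B-LOC', 'I-LOC', 'O', 'O']]),
            'test':  make_ds(['和平人民'], [['B-LOC', 'I-LOC', 'O', 'O']])}

# Char/bigram/label vocabs would normally come from the project's own loader.
char_vocab = Vocabulary()
char_vocab.from_dataset(datasets['train'], field_name='chars',
                        no_create_entry_dataset=[datasets['dev'], datasets['test']])
bigram_vocab = Vocabulary()
bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                          no_create_entry_dataset=[datasets['dev'], datasets['test']])
label_vocab = Vocabulary(padding=None, unknown=None)
label_vocab.from_dataset(datasets['train'], field_name='target')

vocabs = {'char': char_vocab, 'bigram': bigram_vocab, 'label': label_vocab}
embeddings = {}
w_list = ['重庆', '重庆人', '人民', '和平', '饭店']  # toy lexicon

datasets, vocabs, embeddings = equip_chinese_ner_with_lexicon(
    datasets, vocabs, embeddings, w_list,
    word_embedding_path=None,            # no separate word embedding file
    word_char_mix_embedding_path=None,   # no mixed char/word embedding file
    number_normalized=0)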
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        # Map every digit to '0' so all numbers share one representation.
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        # Same digit normalization, applied to both characters of a bigram.
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Loading only the lexicon words that appear in the training set is supported.')

    def get_skip_path(chars, w_trie):
        # Find every lexicon word that matches a span of the sentence.
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        # Rebuild the trie from the words that actually occur in the training sentences.
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        # 'lexicons' holds (start, end, word) triples for every matched lexicon word.
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        # The lattice sequence is the characters followed by all matched words.
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        return result

    def get_pos_s(ins):
        # Start positions: each character starts at its own index, words at lex_s.
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        # End positions: each character ends at its own index, words at lex_e.
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items() if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice', new_field_name='lattice')
    vocabs['span_label'].index_dataset(*(datasets.values()),
                                       field_name='span_label',
                                       new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
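Both versions of the helper import Trie from V0.utils_ and only rely on insert plus a get_lexicon that returns (start, end, word) triples with inclusive end indices. A minimal sketch matching that assumed interface (not the repository's actual implementation):

class Trie:
    """Character trie; get_lexicon returns (start, end, word) for every match."""

    def __init__(self):
        self.children = {}
        self.is_word = False

    def insert(self, word):
        node = self
        for ch in word:
            node = node.children.setdefault(ch, Trie())
        node.is_word = True

    def get_lexicon(self, sentence):
        result = []
        for start in range(len(sentence)):
            node = self
            for end in range(start, len(sentence)):
                node = node.children.get(sentence[end])
                if node is None:
                    break
                if node.is_word:
                    result.append((start, end, sentence[start:end + 1]))
        return result

t = Trie()
for w in ['重庆', '重庆人', '人民']:
    t.insert(w)
print(t.get_lexicon('重庆人民'))  # [(0, 1, '重庆'), (0, 2, '重庆人'), (2, 3, '人民')]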
from collections import Counter

from fastNLP import Const, DataSet, Instance, Vocabulary
from fastNLP.core.predictor import Predictor
from fastNLP.io import ModelLoader


class CustomizedNER(object):
    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        # Const.CHAR_INPUT == "chars"; the chars field is later converted to word indices.
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        self._word_counter, self._word_vocab, self._target_counter, \
            self._target_vocab, self._target = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
        # Group consecutive characters whose predicted tag ids belong to an entity.
        ret = dict()
        sign = True
        lastIdx = len(word_token) - 1
        for num, token in zip(enumerate(word_token), cont):
            if num[1] in self._target:
                if sign:
                    number += str(num[1])
                    word += token
                    if num[0] < lastIdx and not word_token[num[0] + 1]:
                        sign = False
                else:
                    ret.setdefault(number, set())
                    ret[number].add(word)
                    number = ""
                    word = token
                    sign = True
        if number:
            ret.setdefault(number, set())
            ret[number].add(word)
        return ret

    def _extract_ner(self, tokenNum, token, weighted=False):
        if not weighted:
            # Use the most frequent tag id in the span to decide the entity type.
            cls = self._target.get(int(max(tokenNum, key=tokenNum.count)), "")
            if cls.endswith("LOC"):
                return {"LOC": [x for x in token]}
            elif cls.endswith("PER"):
                return {"PER": [x for x in token]}
            elif cls.endswith("ORG"):
                return {"ORG": [x for x in token]}

    def _get_ner(self, tokenNumber, tokenWord):
        nerDict = self._target_token(tokenNumber, tokenWord)
        ret = dict()
        for num, token in nerDict.items():
            if len(num) == 1:
                continue
            for k, v in self._extract_ner(num, token).items():
                ret.setdefault(k, list())
                ret[k].extend(v)
        return ret

    def _read_vocab(self):
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            return eval(vocabIn.read())

    def _reverse_dict(self, dic):
        ret = dict()
        for key, value in dic.items():
            ret.setdefault(value, key)
        return ret

    def _target_label(self, dic):
        # Map tag index -> tag string, dropping the padding index 0.
        ret = self._reverse_dict(dic)
        del ret[0]
        return ret

    def _get_vocabs(self):
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._target_label(targetVocab)
        return Counter(word_count), wordsVocab, Counter(target_count), \
            targetVocab, reverseTargetVocab

    def _update_word(self):
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        if not self._model:
            raise ValueError("model has not been loaded")
        return self._model

    def formatRowString(self, msg):
        msg = msg.strip()
        tokenized_char = [x for x in msg]
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char,
                           raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        # Input could also be given in dict form, e.g.:
        # data = DataSet({"raw_chars": [msg], "words": [[x for x in msg]], "seq_len": [len(word_list)]})
        # Build the word vocabulary from the chars column of this dataset.
        self._vocab4word.from_dataset(self._dataset, field_name=self._CONST_CHAR)
        # Use the vocabulary to convert the chars column into indices.
        self._vocab4word.index_dataset(self._dataset,
                                       field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset, field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset, field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        # Yield the recognized entities for each sentence in the dataset.
        ret = self.model.predict(dataset)["pred"]
        for line, cont in zip(ret, self._content()):
            yield self._get_ner(line[0].tolist(), cont)
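A minimal usage sketch for CustomizedNER, assuming a trained model file and a vocabulary dump (a Python-literal dict with wordsWc, wordsVocab, targetWc and targetVocab keys, as read by _get_vocabs); the file names and the example sentence are placeholders.

# Placeholder file names; a real run needs a trained model and the matching vocab dump.
ner = CustomizedNER(modelFile="best_model.pkl", vocabFile="vocab.txt")

for sentence in ["王小明在北京的清华大学读书。"]:
    ner.formatRowString(sentence)       # wrap the raw sentence into a one-row DataSet
    ds = ner.dataset                    # index chars and set the input fields
    for entities in ner.result(ds):     # e.g. {'PER': [...], 'LOC': [...], 'ORG': [...]}
        print(sentence, entities)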