def load_seqtag(path, files, indexs):
    """Load CoNLL-style sequence-tagging files and index them with
    vocabularies built from the first file (assumed to be the train split)."""
    word_h, tag_h = 'words', 'tags'
    loader = ConllLoader(headers=[word_h, tag_h], indexes=indexs)
    ds_list = []
    for fn in files:
        ds_list.append(loader.load(os.path.join(path, fn)))

    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None)
    update_v(word_v, ds_list[0], word_h)
    update_v(tag_v, ds_list[0], tag_h)

    def process_data(ds):
        to_index(word_v, ds, word_h, C.INPUT)
        to_index(tag_v, ds, tag_h, C.TARGET)
        # truncate over-long sequences, then record the truncated length
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        ds.apply(lambda x: x[C.TARGET][:MAX_LEN], new_field_name=C.TARGET)
        ds.apply(lambda x: len(x[C.INPUT]), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        ds.set_target(C.TARGET, C.INPUT_LEN)

    for ds in ds_list:
        process_data(ds)
    return ds_list, word_v, tag_v
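
# Minimal usage sketch for load_seqtag. The directory, file names and column
# indexes below are hypothetical placeholders, not fixed by this repo.
def _demo_load_seqtag():
    datasets, word_vocab, tag_vocab = load_seqtag(
        'data/pos', ['train.txt', 'dev.txt', 'test.txt'], indexs=[0, 1])
    train_set = datasets[0]
    print(len(train_set), 'train instances,',
          len(word_vocab), 'words,', len(tag_vocab), 'tags')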
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.data_loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print('Warning: the NER labels read by load_conllized_ontonote are plain '
              'conll-format tags, not BIOES, so they are wrong as-is!')

    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)
    bundle = loader.load(path)

    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    # one label vocabulary per non-word column, built from the train split only
    label_vocab_dict = {}
    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1, padding=None, unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])

    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])

    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: more than one task is loaded, so the target field has to '
              'be set manually for each task!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:', len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
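
# Minimal usage sketch for load_conllized_ontonote_POS. 'data/ontonotes' is a
# hypothetical directory with train/dev/test splits that ConllLoader can discover.
def _demo_load_ontonote_pos():
    (train_set, dev_set, test_set), (vocab, label_vocabs) = \
        load_conllized_ontonote_POS('data/ontonotes')
    print('word vocab size:', len(vocab))
    print('POS tagset size:', len(label_vocabs['POS']))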
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    # each bundle holds a single split under the key 'train'
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print('train:{} dev:{} test:{}'.format(
        len(datasets['train']), len(datasets['dev']), len(datasets['test'])))

    # dev/test tokens are registered as no_create_entry, so a StaticEmbedding
    # built on the vocabulary keeps only tokens found in the pretrained file
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path,
                                         word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path,
                                           word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
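
# Minimal usage sketch for load_resume_ner. The corpus directory and embedding
# file names below are hypothetical placeholders.
def _demo_load_resume_ner():
    datasets, vocabs, embeddings = load_resume_ner(
        'data/ResumeNER',
        char_embedding_path='data/char_embeddings.vec')
    print(len(vocabs['label']), 'NER labels:', vocabs['label'].idx2word)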
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # drop the word-segmentation suffix of each token, keeping only the character
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the chars/bigrams/target fields in place
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target', new_field_name='target')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
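
# Minimal usage sketch for load_weibo_ner. 'data/WeiboNER' is a hypothetical
# directory whose train/dev/test splits fastNLP's ConllLoader can discover.
def _demo_load_weibo_ner():
    datasets, vocabs, embeddings = load_weibo_ner('data/WeiboNER')
    sample = datasets['train'][0]
    print(sample['chars'], sample['target'])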
class CTBxJointLoader(DataSetLoader):
    """
    The target folder should have the following file structure:

        -train.conllx
        -dev.conllx
        -test.conllx

    Each file looks like this (blank lines separate sentences):

        1  费孝通    _  NR  NR  _  3  nsubjpass  _  _
        2  被        _  SB  SB  _  3  pass       _  _
        3  授予      _  VV  VV  _  0  root       _  _
        4  麦格赛赛  _  NR  NR  _  5  nn         _  _
        5  奖        _  NN  NN  _  3  dobj       _  _

        1  新华社    _  NR  NR  _  7  dep        _  _
        2  马尼拉    _  NR  NR  _  7  dep        _  _
        3  8月       _  NT  NT  _  7  dep        _  _
        4  31日      _  NT  NT  _  7  dep        _  _
        ...
    """
    def __init__(self):
        self._loader = ConllLoader(headers=['words', 'pos_tags', 'heads', 'labels'],
                                   indexes=[1, 3, 6, 7])

    def load(self, path: str):
        """
        Given a file path, read the data into a DataSet with the following fields:

            words: list[str]
            pos_tags: list[str]
            heads: list[int]
            labels: list[str]

        :param path:
        :return:
        """
        dataset = self._loader._load(path)  # _load reads one file into a single DataSet
        dataset.heads.int()  # cast the head indices from str to int in place
        return dataset

    def process(self, paths):
        """
        :param paths:
        :return: DataBundle whose datasets contain the following fields

            chars:
            bigrams:
            trigrams:
            pre_chars:
            pre_bigrams:
            pre_trigrams:
            seg_targets:
            seg_masks:
            seq_lens:
            char_labels:
            char_heads:
            pun_masks:
            gold_word_pairs:
            gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            # build the char label vocabulary once, from the first dataset processed
            if len(char_label_vocab.word_count) == 0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            global add_label_word_pairs
            add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs, new_field_name='gold_label_word_pairs',
                              ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            # the 'pre_*' vocabularies register every token as no_create_entry, so a
            # StaticEmbedding built on them keeps only tokens found in the pretrained file
            vocab = Vocabulary().from_dataset(field_name=name,
                                              no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name,
                                new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                               [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels',
                              'char_heads', 'pre_chars', 'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks',
                               'char_labels', 'char_heads', 'pun_masks', 'gold_label_word_pairs')

        return data
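
# Minimal usage sketch for CTBxJointLoader. 'data/ctb9' is a hypothetical
# directory containing train.conllx/dev.conllx/test.conllx.
def _demo_ctbx_joint_loader():
    loader = CTBxJointLoader()
    data = loader.process('data/ctb9')
    print(data.datasets['train'][0]['chars'])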
def load_weibo_ner_old(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                       normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    # each bundle holds a single split under the key 'train'
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # drop the word-segmentation suffix of each token, keeping only the character
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the chars/bigrams/target fields in place
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target', new_field_name='target')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
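
# Minimal usage sketch for load_weibo_ner_old. 'data/WeiboNER' is a hypothetical
# directory containing the *.all.bmes splits the function expects.
def _demo_load_weibo_ner_old():
    datasets, vocabs, embeddings = load_weibo_ner_old('data/WeiboNER')
    print('chars:', len(vocabs['char']), 'bigrams:', len(vocabs['bigram']))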