def process(self, data_bundle: DataBundle) -> DataBundle:
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)

    _add_words_field(data_bundle, lower=self.lower)

    if self.word_shape:
        data_bundle.apply_field(word_shape, field_name='raw_words', new_field_name='word_shapes')
        data_bundle.set_input('word_shapes')

    data_bundle.apply_field(
        lambda chars: [''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
        field_name=Const.INPUT, new_field_name=Const.INPUT)

    _indexize(data_bundle, target_field_names=['target'], vocabulary=self.vocabulary)

    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    return data_bundle
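# A standalone sketch of the digit-normalization lambda used in process() above:
# every digit character inside every token is replaced by '0', so numbers of the
# same length collapse to one form. The sample tokens are made up.
def normalize_digits(tokens):
    return [''.join('0' if c.isdigit() else c for c in token) for token in tokens]

print(normalize_digits(['July', '15', ',', '1999']))  # ['July', '00', ',', '0000']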
def load(self, folder):
    fns = {
        'dev': '{}_dev.csv'.format(self.lg1_lg2),
        'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'.format(self.lg1_lg2)
    }
    target_lg = self.lg1_lg2.split('_')[0]
    data_bundle = DataBundle()
    for name, fn in fns.items():
        path = os.path.join(folder, fn)
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split('\t')
                    if self.lower:
                        ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                    else:
                        ins = Instance(word=parts[1], definition=parts[-1])
                    ds.append(ins)
        data_bundle.set_dataset(ds, name=name)
    target_words = {}
    with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if self.lower:
                    line = line.lower()
                target_words[line] = 1
    target_words = list(target_words.keys())
    setattr(data_bundle, 'target_words', target_words)
    return data_bundle
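# A sketch of the tab-separated row layout the loader above expects, inferred
# from its parsing (column 1 holds the word, the last column the definition;
# any columns in between are ignored). The sample row is made up.
sample = "0\tbread\ta staple food baked from flour"
parts = sample.split('\t')
print(parts[1], '->', parts[-1])  # bread -> a staple food baked from flour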
def process(self, data_bundle: DataBundle):
    _add_chars_field(data_bundle, lower=False)
    data_bundle.apply_field(self.encoding_func, field_name=Const.TARGET, new_field_name=Const.TARGET)

    # replace every digit with 0
    data_bundle.apply_field(
        lambda chars: [''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
        field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT)

    input_field_names = [Const.CHAR_INPUT]
    if self.bigrams:
        data_bundle.apply_field(
            lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
            field_name=Const.CHAR_INPUT, new_field_name='bigrams')
        input_field_names.append('bigrams')

    # index
    _indexize(data_bundle, input_field_names=input_field_names, target_field_names=Const.TARGET)

    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    return data_bundle
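# A standalone sketch of the bigram construction in process() above: each
# character is paired with its right neighbour, and the final character is
# paired with the '<eos>' padding token. The sample characters are made up.
chars = ['上', '海', '浦', '东']
bigrams = [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]
print(bigrams)  # ['上海', '海浦', '浦东', '东<eos>']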
def process(self, data_bundle: DataBundle) -> DataBundle:
    '''
    :param data_bundle: each dataset in the data_bundle has the columns raw_words|index|target|comment.
        From 'raw_words' we add words_bert_ids|e1b|e1e|e2b|e2e, and the is_input/is_target
        attributes of these columns also need to be set.
    :return: the data_bundle with its datasets extended in place
    '''
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field_more(func=self.raw_words2words_func, field_name='raw_words', modify_fields=True)
    data_bundle.set_input('words_bert_ids', 'e1b', 'e1e', 'e2b', 'e2e')
    data_bundle.set_target('target')
    return data_bundle
def load(self, folder):
    fns = {
        'dev': '{}_dev.csv',
        # 'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'
    }
    data_bundle = DataBundle()
    words = {}
    for lg in ['en', 'es', 'fr']:
        for name, fn in fns.items():
            path = os.path.join(folder, fn.format(lg))
            ds = read_dataset(path, self.lower, 0)
            data_bundle.set_dataset(ds, name=f'{lg}_{name}')
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())
        words[lg] = target_words
    setattr(data_bundle, 'target_words_dict', words)
    for bi in ['en_fr', 'fr_en', 'en_es', 'es_en']:
        path = os.path.join(folder, '{}_test500.csv'.format(bi))
        ds = read_dataset(path, self.lower, 1)
        data_bundle.set_dataset(ds, '{}_test'.format(bi))
    return data_bundle
def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 .. csv-table:: :header: "raw_words", "target" "[Nadim, Ladki]", "[B-PER, I-PER]" "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" "[...]", "[...]" :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。 :return DataBundle: """ # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) _add_words_field(data_bundle, lower=self.lower) if self.word_shape: data_bundle.apply_field(word_shape, field_name='raw_words', new_field_name='word_shapes') data_bundle.set_input('word_shapes') # 将所有digit转为0 data_bundle.apply_field(lambda chars: [ ''.join(['0' if c.isdigit() else c for c in char]) for char in chars ], field_name=Const.INPUT, new_field_name=Const.INPUT) # index _indexize(data_bundle) input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle
def load(self, paths, is_lower):
    if paths is None:
        raise NotImplementedError("I am not ready for downloading!")
    paths = check_loader_paths(paths)
    datasets = {
        name: self._load(path, is_lower)
        for name, path in paths.items()
    }
    data_bundle = DataBundle(datasets=datasets)
    return data_bundle
def load(self, paths): """ 输出的DataSet包含以下的field tokens pos dep aspects ["The", "bread", ...] ["DET", "NOUN",...] [["dep", 2, 1], ["nsubj", 4, 2], ...] [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}] 其中dep中["dep", 2, 1]指当前这个word的head是2(0是root,这里2就是bread),"dep"是依赖关系为dep :param paths: :return: """ data_bundle = DataBundle() folder_name = os.path.basename(paths) fns = [ f'{folder_name}_Test_biaffine_depparsed.json', f'{folder_name}_Train_biaffine_depparsed.json' ] if not os.path.exists(os.path.join(paths, fns[0])): fns = [ f'Test_biaffine_depparsed.json', f'Train_biaffine_depparsed.json' ] for split, name in zip(['test', 'train'], fns): fp = os.path.join(paths, name) with open(fp, 'r', encoding='utf-8') as f: data = json.load(f) ds = DataSet() for ins in data: tokens = ins['token'] pos = ins['pos'] dep = ins['dependencies'] aspects = ins['aspects'] ins = Instance(tokens=tokens, pos=pos, dep=dep, aspects=aspects) ds.append(ins) data_bundle.set_dataset(ds, name=split) return data_bundle # c = ConllUDataset('./data/EWT/en_ewt-ud-test.conllu') # print('done')
def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
    if paths is None:
        paths = self.download()
    paths = check_loader_paths(paths)
    datasets = {}
    for name, path in paths.items():
        datasets[name] = self._load(path)
    data_bundle = DataBundle(datasets=datasets)
    return data_bundle
def load(self, folder):
    data_bundle = DataBundle()
    for name in ['desc.json', 'dev.json', 'seen.json', 'train.json', 'unseen.json']:
        path = os.path.join(folder, name)
        dataset = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for d in data:
                word = d['word'].lower()
                definition = d['definitions'].lower()
                ins = Instance(word=word, definition=definition)
                dataset.append(ins)
        data_bundle.set_dataset(dataset, name=name.split('.')[0])
    words = []
    with open(os.path.join(folder, 'target_words.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                words.append(line)
    setattr(data_bundle, 'target_words', words)
    return data_bundle
def load(self, paths: Union[str, Dict[str, str]] = None,
         ratio_tr_d_te: tuple = ratio_tr_d_te) -> DataBundle:
    '''
    :param paths: when a str, load every training sample and split the training set
        by the 8:1:1 ratio; when a Dict[str, str], pick the train/test/val sets via the
        keys 'train', 'test' and 'val' ('train' is required). If 'val' or 'test' is
        missing, a portion of train is split off to serve as the validation/test set.
    :return:
    '''
    paths = self.check_loader_paths(paths)  # paths is now a dict like {'train': XXX, ...}
    datasets = {name: self._load(path=path) for name, path in paths.items()}

    # shuffle every dataset
    for name, ds in datasets.items():
        shuffled_ds = DataSet()
        indices = list(range(len(ds)))
        random.shuffle(indices)
        for i in indices:
            shuffled_ds.append(ds[i])
        datasets[name] = shuffled_ds
    # shuffling done

    if len(datasets) == 1:
        print('Only a train dataset was loaded; splitting it 8:1:1 into train/val/test by default')
        ds = datasets['train']
        train_count = int(len(ds) * (ratio_tr_d_te[0] / sum(ratio_tr_d_te)))
        test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
        return DataBundle(datasets={'train': ds[:train_count],
                                    'val': ds[train_count:-test_count],
                                    'test': ds[-test_count:]})
    elif len(datasets) == 3:
        print('Found train, test and val; no split from train is needed')
        return DataBundle(datasets=datasets)
    elif 'val' not in datasets:
        print('Found train and test; splitting val off from train')
        ds = datasets['train']
        val_count = int(len(ds) * (ratio_tr_d_te[1] / sum(ratio_tr_d_te)))
        return DataBundle(datasets={'train': ds[:-val_count],
                                    'val': ds[-val_count:],
                                    'test': datasets['test']})
    elif 'test' not in datasets:
        print('Found train and val; splitting test off from train')
        ds = datasets['train']
        test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
        return DataBundle(datasets={'train': ds[:-test_count],
                                    'val': datasets['val'],
                                    'test': ds[-test_count:]})
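# Worked example of the split arithmetic in load() above, for a 1000-instance
# train-only dataset and a ratio of (8, 1, 1).
ratio = (8, 1, 1)
n = 1000
train_count = int(n * (ratio[0] / sum(ratio)))  # 800
test_count = int(n * (ratio[2] / sum(ratio)))   # 100
# train = ds[:800], val = ds[800:-100] (100 instances), test = ds[-100:] (100)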
def load(self, paths): """ 输出的DataSet包含以下的field tokens pos dep aspects ["The", "bread", ...] ["DET", "NOUN",...] [["dep", 2, 1], ["nsubj", 4, 2], ...] [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}] 其中dep中["dep", 2, 1]指当前这个word的head是2(0是root,这里2就是bread),"dep"是依赖关系为dep :param paths: :return: """ data_bundle = DataBundle() folder_name = os.path.basename(paths) fns = [ f"{folder_name}_Test.json", f"{folder_name}_Train.json", ] if not os.path.exists(os.path.join(paths, fns[0])): fns = [f"Test.json", f"Train.json"] for split, name in zip(["test", "train"], fns): fp = os.path.join(paths, name) with open(fp, "r", encoding="utf-8") as f: data = json.load(f) ds = DataSet() for ins in data: tokens = ins["token"] pos = ins["pos"] dep = ins["dependencies"] aspects = ins["aspects"] ins = Instance(tokens=tokens, pos=pos, dep=dep, aspects=aspects) ds.append(ins) data_bundle.set_dataset(ds, name=split) return data_bundle
def test_demo(self):
    # related to issue https://github.com/fastnlp/fastNLP/issues/324#issue-705081091
    from fastNLP import DataSet, Instance
    from fastNLP.io import DataBundle
    data_bundle = DataBundle()
    ds = DataSet()
    ds.append(Instance(raw_words="截流 进入 最后 冲刺 ( 附 图片 1 张 )"))
    data_bundle.set_dataset(ds, name='train')
    data_bundle = CWSPipe().process(data_bundle)
    self.assertFalse('<' in data_bundle.get_vocab('chars'))
def process(self, data_bundle: DataBundle) -> DataBundle:
    data_bundle.copy_field(field_name=C.RAW_WORD, new_field_name=C.INPUT, ignore_miss_dataset=True)
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(self.copy_func, field_name=C.RAW_WORD, new_field_name=C.INPUT)
        dataset.add_seq_len(C.INPUT)
    # note: raw_words is used here rather than Const.INPUT ('words')
    data_bundle.set_input(C.INPUT, C.INPUT_LEN)
    data_bundle.set_target(C.TARGET)  # Const.TARGET, i.e. 'target'
    return data_bundle
def load(
    self,
    paths: Union[str, Dict[str, str]] = None,
    ratio_train_dev_test: tuple = (8, 1, 1)
) -> tuple:
    '''
    Call _load and split its return value into train/dev/test datasets.

    :param paths:
    :return: (DataBundle, id2country_dict)
    '''
    datasets, id2country_dict = self._load(paths)
    train_data = DataSet()
    dev_data = DataSet()
    test_data = DataSet()
    indices = list(range(len(datasets)))
    random.shuffle(indices)
    train_count = int(
        len(datasets) * (ratio_train_dev_test[0] / sum(ratio_train_dev_test)))
    dev_count = int(
        len(datasets) * (ratio_train_dev_test[1] / sum(ratio_train_dev_test)))
    test_count = int(
        len(datasets) * (ratio_train_dev_test[2] / sum(ratio_train_dev_test)))
    train_indices = indices[:train_count]
    dev_indices = indices[train_count:train_count + dev_count]
    test_indices = indices[train_count + dev_count:]
    for idx in train_indices:
        train_data.append(datasets[idx])
    for idx in dev_indices:
        dev_data.append(datasets[idx])
    for idx in test_indices:
        test_data.append(datasets[idx])
    warnings.warn('split into train/dev/test sets, counts: {}/{}/{}'.format(
        len(train_data), len(dev_data), len(test_data)))
    data_set = {'train': train_data, 'dev': dev_data, 'test': test_data}
    return DataBundle(datasets=data_set), id2country_dict
def load(self, folder):
    # first read the two monolingual files
    lg1, lg2 = self.lg1_lg2.split('_')
    fns = {
        'dev': '{}_dev.csv',
        # 'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'
    }
    data_bundle = DataBundle()
    words = {}
    for lg in [lg1, lg2]:
        for name, fn in fns.items():
            path = os.path.join(folder, fn.format(lg))
            ds = read_dataset(path, self.lower, 0)
            data_bundle.set_dataset(ds, name=f'{lg}_{name}')
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())
        words[lg] = target_words
    setattr(data_bundle, 'target_words_dict', words)

    # read the bilingual test data
    bi1 = f'{lg1}_{lg2}'
    bi2 = f'{lg2}_{lg1}'
    for bi in [bi1, bi2]:
        path = os.path.join(folder, '{}_test500.csv'.format(bi))
        ds = read_dataset(path, self.lower, 1)
        data_bundle.set_dataset(ds, '{}_test'.format(bi))
    return data_bundle
def process(self, data_bundle: DataBundle) -> DataBundle:
    new_bundle = DataBundle()
    aspect_dict = {}
    mask_id = self.tokenizer.convert_tokens_to_ids([self.mask])[0]
    if isinstance(self.tokenizer, BertTokenizer):
        cls = "[CLS]"
        sep = "[SEP]"
    else:
        cls = self.tokenizer.cls_token
        sep = self.tokenizer.sep_token
    for name, ds in data_bundle.iter_datasets():
        new_ds = DataSet()
        for ins in ds:
            tokens = ins["tokens"]
            if not isinstance(self.tokenizer, XLNetTokenizer):
                tokens.insert(0, cls)
                tokens.append(sep)
                shift = 1
            else:
                tokens.append(sep)
                tokens.append(cls)
                shift = 0
            starts = []
            ends = []
            for aspect in ins["aspects"]:
                starts.append(aspect["from"] + shift)
                ends.append(aspect["to"] + shift)
            for aspect in ins["aspects"]:
                target = aspect["polarity"]
                start = aspect["from"] + shift
                end = aspect["to"] + shift
                aspect_mask = [0] * len(tokens)
                for i in range(start, end):
                    aspect_mask[i] = 1
                pieces = []
                piece_masks = []
                raw_words = tokens[shift:-1]
                raw_words.insert(start - 1, "[[")
                raw_words.insert(end, "]]")
                for mask, token in zip(aspect_mask, tokens):
                    bpes = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(token))
                    pieces.extend(bpes)
                    piece_masks.extend([mask] * len(bpes))
                new_ins = Instance(
                    tokens=pieces,
                    target=target,
                    aspect_mask=piece_masks,
                    raw_words=" ".join(raw_words),
                )
                new_ds.append(new_ins)
        new_bundle.set_dataset(new_ds, name)

    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab.add_word_lst(["neutral", "positive", "negative", "smooth"])
    target_vocab.index_dataset(*new_bundle.datasets.values(), field_name="target")

    new_bundle.set_target("target")
    new_bundle.set_input("tokens", "aspect_mask", "raw_words")
    new_bundle.apply_field(lambda x: len(x), field_name="tokens", new_field_name="seq_len")
    # new_bundle.set_vocab(vocab, 'tokens')
    if hasattr(self.tokenizer, "pad_token_id"):
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_token_id)
    else:
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_index)
    new_bundle.set_vocab(target_vocab, "target")
    return new_bundle
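# A sketch of how process() above expands the token-level aspect mask to the
# wordpiece level: each token's mask value is repeated once per subword piece.
# The tokens and their BPE segmentation here are assumed for illustration.
tokens = ["[CLS]", "great", "bread", "[SEP]"]
aspect_mask = [0, 0, 1, 0]                                   # "bread" is the aspect term
pieces_per_token = [["[CLS]"], ["gre", "##at"], ["bread"], ["[SEP]"]]
piece_masks = []
for mask, bpes in zip(aspect_mask, pieces_per_token):
    piece_masks.extend([mask] * len(bpes))
print(piece_masks)  # [0, 0, 0, 1, 0]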
def process(self, paths): """ :param paths: :return: Dataset包含以下的field chars: bigrams: trigrams: pre_chars: pre_bigrams: pre_trigrams: seg_targets: seg_masks: seq_lens: char_labels: char_heads: gold_word_pairs: seg_targets: seg_masks: char_labels: char_heads: pun_masks: gold_label_word_pairs: """ paths = check_dataloader_paths(paths) data = DataBundle() for name, path in paths.items(): print(name,path) dataset = self.load(path) data.datasets[name] = dataset char_labels_vocab = Vocabulary(padding=None, unknown=None) def process(dataset, char_label_vocab): dataset.apply(add_word_lst, new_field_name='word_lst') dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars') dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams') dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams') dataset.apply(add_char_heads, new_field_name='char_heads') dataset.apply(add_char_labels, new_field_name='char_labels') dataset.apply(add_segs, new_field_name='seg_targets') dataset.apply(add_mask, new_field_name='seg_masks') dataset.add_seq_len('chars', new_field_name='seq_lens') dataset.apply(add_pun_masks, new_field_name='pun_masks') if len(char_label_vocab.word_count)==0: char_label_vocab.from_dataset(dataset, field_name='char_labels') char_label_vocab.index_dataset(dataset, field_name='char_labels') new_dataset = add_root(dataset) new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True) global add_label_word_pairs add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab) new_dataset.apply(add_label_word_pairs, new_field_name='gold_label_word_pairs', ignore_type=True) new_dataset.set_pad_val('char_labels', -1) new_dataset.set_pad_val('char_heads', -1) return new_dataset for name in list(paths.keys()): dataset = data.datasets[name] dataset = process(dataset, char_labels_vocab) data.datasets[name] = dataset data.vocabs['char_labels'] = char_labels_vocab char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars') bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams') trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams') for name in ['chars', 'bigrams', 'trigrams']: vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values())) vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name) data.vocabs['pre_{}'.format(name)] = vocab for name, vocab in zip(['chars', 'bigrams', 'trigrams'], [char_vocab, bigram_vocab, trigram_vocab]): vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name) data.vocabs[name] = vocab for name, dataset in data.datasets.items(): dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars', 'pre_bigrams', 'pre_trigrams') dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels', 'char_heads', 'pun_masks', 'gold_label_word_pairs') return data
train_dataset.set_target('target')
test_dataset.set_target('target')

'''build vocabulary'''
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train": train_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased',
                      include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device,
                  batch_size=8, dev_data=data_bundle.get_dataset('train'),
                  metrics=AccuracyMetric(), n_epochs=10, print_every=1)
trainer.train()
def process(self, data_bundle: DataBundle) -> DataBundle: """ 可以处理的DataSet需要包含raw_words列 .. csv-table:: :header: "raw_words" "上海 浦东 开发 与 法制 建设 同步" "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" "..." :param data_bundle: :return: """ data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) if self.replace_num_alpha: data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) self._tokenize(data_bundle) input_field_names = [Const.CHAR_INPUT] target_field_names = [] for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: _word_lens_to_relay(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.TARGET) dataset.apply_field( lambda chars: _word_lens_to_start_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='start_seg_mask') dataset.apply_field( lambda chars: _word_lens_to_end_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='end_seg_mask') dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) target_field_names.append('start_seg_mask') input_field_names.append('end_seg_mask') if self.bigrams: for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') _indexize(data_bundle, ['chars', 'bigrams'], []) func = partial(_clip_target, L=self.L) for name, dataset in data_bundle.datasets.items(): res = dataset.apply_field(func, field_name='target') relay_target = [res_i[0] for res_i in res] relay_mask = [res_i[1] for res_i in res] dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) input_field_names.append('relay_target') input_field_names.append('relay_mask') input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle
                    dev_dataset, test_dataset, field_name='words')
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target',
                          no_create_entry_dataset=[dev_dataset, test_dataset])
target_vocab.index_dataset(train_dataset, dev_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased',
                      include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(target='target'), device=device,