def test_bert_embedding_1(self):
    vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split())
    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          word_dropout=0.1)
    requires_grad = embed.requires_grad
    embed.requires_grad = not requires_grad
    embed.train()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          word_dropout=0.1)
    embed.eval()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    # overlong inputs are auto-truncated instead of raising an error
    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          word_dropout=0.1, auto_truncate=True)
    words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
    result = embed(words)
    self.assertEqual(result.size(), (2, 40, 16))
def test_download(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='en')
    words = torch.LongTensor([[2, 3, 4, 0]])
    print(embed(words).size())

    for pool_method in ['first', 'last', 'max', 'avg']:
        for include_cls_sep in [True, False]:
            embed = BertEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method,
                                  include_cls_sep=include_cls_sep)
            print(embed(words).size())
def test_bert_embed_eq_bert_piece_encoder(self):
    ds = DataSet({'words': ["this is a texta model vocab".split(), 'this is'.split()]})
    encoder = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert')
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          pool_method='first', include_cls_sep=True, pooled_cls=False, min_freq=1)
    embed.eval()
    words_res = embed(words)

    # check that the word-piece alignment works as expected
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = os.path.join("data/{}/data.pth".format(dataset))
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)
    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                            model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
                            min_freq=1, only_norm_found_vector=normalize_embed,
                            word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                               model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
                               word_dropout=0.02, dropout=0.3, min_freq=min_freq,
                               only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    tencent_embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                                    model_dir_or_name='data/tencent_unigram.txt',
                                    min_freq=min_freq, only_norm_found_vector=normalize_embed,
                                    word_dropout=0.01, dropout=0.3)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name=args.bert_model,
                               layers='-1', pool_method=args.pool_method, word_dropout=0, dropout=0.5,
                               include_cls_sep=False, pooled_cls=True, requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, tencent_embed, bert_embed], dropout=0, word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, \
        context_word2id, context_id2word
def test_bert_embedding_1(self): vocab = Vocabulary().add_word_lst("this is a test . [SEP]".split()) embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1) requires_grad = embed.requires_grad embed.requires_grad = not requires_grad embed.train() words = torch.LongTensor([[2, 3, 4, 0]]) result = embed(words) self.assertEqual(result.size(), (1, 4, 16))
def test_word_drop(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2)
    for i in range(10):
        words = torch.LongTensor([[2, 3, 4, 0]])
        print(embed(words).size())
def get_data():
    data = CTBxJointPipe().process_from_file(data_folder)
    data.delete_field('bigrams')
    data.delete_field('trigrams')
    data.delete_field('chars')
    data.rename_field('pre_chars', 'chars')
    data.delete_field('pre_bigrams')
    data.delete_field('pre_trigrams')
    bert_embed = BertEmbedding(data.get_vocab('chars'), model_dir_or_name='cn', requires_grad=True)
    return data, bert_embed
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = \
        generate_knowledge_api(os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                            model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
                            min_freq=1, only_norm_found_vector=normalize_embed,
                            word_dropout=0.01, dropout=0.3)
    tencent_embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                                    model_dir_or_name='data/tencent_unigram.txt',
                                    min_freq=1, only_norm_found_vector=normalize_embed,
                                    word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                               model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
                               word_dropout=0.02, dropout=0.3, min_freq=min_freq,
                               only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name=args.bert_model,
                               layers='-1', pool_method=args.pool_method, word_dropout=0, dropout=0.5,
                               include_cls_sep=False, pooled_cls=True, requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, tencent_embed, bert_embed], dropout=0, word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, \
        feature2count, feature2id, id2feature
def __init__(self, vocab, num_class, bert_model_dir_or_name, fine_tune=False):
    super(BertTC, self).__init__()
    self.embed = BertEmbedding(vocab, requires_grad=fine_tune,
                               model_dir_or_name=bert_model_dir_or_name, include_cls_sep=True)
    self.classifier = nn.Linear(self.embed.embedding_dim, num_class)
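# Not shown in the original snippet: a minimal sketch of a matching forward pass. It assumes
# the [CLS] position (kept because include_cls_sep=True) feeds the linear classifier and that
# the prediction is returned under the 'pred' key; both choices are assumptions, not the
# original author's code.
def forward(self, words):
    reps = self.embed(words)           # (batch, seq_len + 2, embed_dim); [CLS]/[SEP] included
    cls_rep = reps[:, 0]               # classify from the [CLS] vector
    logits = self.classifier(cls_rep)  # (batch, num_class)
    return {'pred': logits}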
def load_ner_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(bigrams=True, encoding_type=encoding_type).process_from_file(paths)
    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                            model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
                            min_freq=1, only_norm_found_vector=normalize_embed,
                            word_dropout=0.01, dropout=0.3)
    bi_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                               model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
                               word_dropout=0.02, dropout=0.3, min_freq=2,
                               only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    tencent_embed = StaticEmbedding(data_bundle.get_vocab('chars'),
                                    model_dir_or_name='data/tencent_unigram.txt',
                                    min_freq=1, only_norm_found_vector=normalize_embed,
                                    word_dropout=0.01, dropout=0.3)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name=args.bert_model,
                               layers='-1', pool_method=args.pool_method, word_dropout=0, dropout=0.5,
                               include_cls_sep=False, pooled_cls=True, requires_grad=False,
                               auto_truncate=True)
    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed], dropout=0, word_dropout=0.02)

    return data_bundle, embed, bi_embed
def trainer(data_folder, write2model, write2vocab):
    # read the data from {data_dir} into a DataBundle
    data_bundle = PeopleDailyNERLoader().load(data_folder)
    data_bundle = PeopleDailyPipe().process(data_bundle)
    data_bundle.rename_field('chars', 'words')

    # save the vocabularies
    targetVocab = dict(data_bundle.vocabs["target"])
    wordsVocab = dict(data_bundle.vocabs["words"])
    targetWc = dict(data_bundle.vocabs['target'].word_count)
    wordsWc = dict(data_bundle.vocabs['words'].word_count)
    with open(write2vocab, "w", encoding="utf-8") as VocabOut:
        VocabOut.write(json.dumps({
            "targetVocab": targetVocab,
            "wordsVocab": wordsVocab,
            "targetWc": targetWc,
            "wordsWc": wordsWc
        }, ensure_ascii=False))

    embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn',
                          requires_grad=False, auto_truncate=True)
    model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')),
                      num_layers=1, hidden_size=100, dropout=0.5,
                      target_vocab=data_bundle.get_vocab('target'))
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    optimizer = Adam(model.parameters(), lr=2e-5)
    loss = LossInForward()
    device = 0 if torch.cuda.is_available() else 'cpu'
    # device = "cpu"
    trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
                      batch_size=8, dev_data=data_bundle.get_dataset('dev'), metrics=metric,
                      device=device, n_epochs=1)
    trainer.train()

    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    tester.test()

    saver = ModelSaver(write2model)
    saver.save_pytorch(model, param_only=False)
def test_save_load(self):
    bert_save_test = 'bert_save_test'
    try:
        os.makedirs(bert_save_test, exist_ok=True)
        vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split())
        embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                              word_dropout=0.1, auto_truncate=True)
        embed.save(bert_save_test)
        load_embed = BertEmbedding.load(bert_save_test)
        words = torch.randint(len(vocab), size=(2, 20))
        embed.eval(), load_embed.eval()
        self.assertEqual((embed(words) - load_embed(words)).sum(), 0)
    finally:
        import shutil
        shutil.rmtree(bert_save_test)
def load_data(): paths = { "train": "../data/{}/train.txt".format(dataset), "test": "../data/{}/test.txt".format(dataset), "dev": "../data/{}/dev.txt".format(dataset) } data = WNUT_17NERPipe(encoding_type=encoding_type).process_from_file(paths) dict_save_path = os.path.join("../data/{}/data.pth".format(dataset)) context_dict, context_word2id, context_id2word = get_neighbor_for_vocab( data.get_vocab('words').word2idx, glove_path, dict_save_path) train_feature_data, dev_feature_data, test_feature_data = build_instances( "../data/{}".format(dataset), context_num, context_dict) data.rename_field('words', 'chars') embed = ElmoEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=elmo_model, layers='mix', requires_grad=False, word_dropout=0.0, dropout=0.5, cache_word_reprs=False) embed.set_mix_weights_requires_grad() bert_embed = BertEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=args.bert_model, layers='-1', pool_method=args.pool_method, word_dropout=0, dropout=0.5, include_cls_sep=False, pooled_cls=True, requires_grad=False, auto_truncate=False) embed = StackEmbedding([embed, bert_embed], dropout=0, word_dropout=0.02) return data, embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
@cache_results('caches/conll2003.pkl', _refresh=False)
def load_data():
    # replace with your own data path
    paths = 'data/conll2003'
    data = Conll2003NERPipe(encoding_type=encoding_type).process_from_file(paths)
    return data

data = load_data()
print(data)

embed = BertEmbedding(data.get_vocab(Const.INPUT), model_dir_or_name='en-base-cased',
                      pool_method='max', requires_grad=True, layers='11',
                      include_cls_sep=False, dropout=0.5, word_dropout=0.01)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear'),
    EvaluateCallback(data.get_dataset('test'))
]

model = BertCRF(embed, tag_vocab=data.get_vocab('target'), encoding_type=encoding_type)
optimizer = AdamW(model.parameters(), lr=2e-5)
from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.qa import CMRC2018Loader
from fastNLP.io import CNXNLILoader
from fastNLP.io import WeiboNERLoader
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

if __name__ == "__main__":
    # download the sentiment-classification data
    data_bundle = WeiboSenti100kPipe().process_from_file()
    data_bundle.rename_field('chars', 'words')
    # download BERT
    embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm',
                          include_cls_sep=True)
    # question answering data
    data_bundle = CMRC2018Loader().load()
    # text matching data
    data_bundle = CNXNLILoader().load()
    # NER data
    data_bundle = WeiboNERLoader().load()
    # static embedding
    vocab = Vocabulary()
    vocab.add_word_lst("你 好 .".split())
    embed = StaticEmbedding(vocab, model_dir_or_name='cn-sgns-literature-word')
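    # A quick sanity check of the static embedding built above (a sketch; the torch import
    # and the index lookup via Vocabulary.to_index are additions, not part of the original).
    import torch
    words = torch.LongTensor([[vocab.to_index('你'), vocab.to_index('好')]])
    print(embed(words).size())  # expected: (1, 2, embedding_dim)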
train_data, dev_data = data_set.split(0.015)

# training
device = 0 if torch.cuda.is_available() else 'cpu'

'''
EMBED_DIM = 100
model = CNNText((len(vocab), EMBED_DIM), num_classes=len(vocab_target), dropout=0.1)
metrics = AccuracyMetric()
loss = CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-08,
                          weight_decay=0, momentum=0, centered=False)
N_EPOCHS = 10
BATCH_SIZE = 16
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=loss,
                  metrics=metrics, optimizer=optimizer, n_epochs=N_EPOCHS,
                  batch_size=BATCH_SIZE, device=device)
trainer.train()
'''

embed = BertEmbedding(vocab, model_dir_or_name='en', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(vocab_target))

trainer = Trainer(train_data, model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device, batch_size=8,
                  dev_data=dev_data, metrics=AccuracyMetric(), n_epochs=2, print_every=1)
trainer.train()

saver = ModelSaver("save_model/bert2021.1.19.pkl")
saver.save_pytorch(model)
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train": train_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased',
                      include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device, batch_size=8,
                  dev_data=data_bundle.get_dataset('train'), metrics=AccuracyMetric(),
                  n_epochs=10, print_every=1)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())
tester.test()
char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')

logger.warn('loading dataset')
data_bundle = load_serialize_obj(train_data_bundle_pkl_file)

logger.warn('fetching vocabularies')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('vocabulary serialized to:{}'.format(char_vocab_pkl_file))

logger.warn('selecting pretrained embeddings')
# model_dir_or_name = 'cn-wwm'
model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch'
bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name=model_dir_or_name, requires_grad=False)

logger.warn('building the model')
model = BiLSTMCRF(embed=bert_embed, num_classes=len(target_vocab), num_layers=1,
                  hidden_size=200, dropout=0.5, target_vocab=target_vocab)
logger.info(model)

logger.warn('setting training hyperparameters')
loss = LossInForward()
optimizer = Adam([param for param in model.parameters() if param.requires_grad])
# metric = AccuracyMetric()
# with only_gross=False, per-label metric statistics are also returned
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET), only_gross=False)
# run on GPU if available; training is much faster
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logger.info('device:{}'.format(device))
batch_size = 32
n_epochs = 10
early_stopping = 10
trainer = Trainer(
def load_data():
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = \
            generate_knowledge_api(os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = \
            None, None, None, None, None, None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=embed_size,
                                      char_emb_size=embed_size, word_dropout=0, dropout=0.3,
                                      pool_method='max', activation='relu', min_char_freq=2,
                                      requires_grad=True, include_word_start_end=False,
                                      char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
                                      char_scale=char_type == 'naive', char_dropout=0.15,
                                      char_after_norm=True)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'), model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=elmo_model, layers='mix',
                          requires_grad=False, word_dropout=0.0, dropout=0.5, cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name=args.bert_model,
                               layers='-1', pool_method="first", word_dropout=0, dropout=0.5,
                               include_cls_sep=False, pooled_cls=True, requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed], dropout=0, word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, \
        feature2count, feature2id, id2feature
elif arg.task == 'qnli':
    data_bundle = QNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'mnli':
    data_bundle = MNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'quora':
    data_bundle = QuoraBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file()
else:
    raise RuntimeError(f'NOT support {arg.task} task yet!')

print(data_bundle)  # print details in data_bundle

# load embedding
embed = BertEmbedding(data_bundle.vocabs[Const.INPUT], model_dir_or_name=arg.bert_model_dir_or_name)

# define model
model = BertForSentenceMatching(embed, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = AdamW(lr=arg.lr, params=model.parameters())
callbacks = [
    WarmupCallback(warmup=arg.warm_up_rate, schedule='linear'),
]

if arg.task in ['snli']:
    callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name]))
logger.warn('loading dataset')
data_bundle = load_serialize_obj(train_data_bundle_pkl_file)

logger.warn('fetching vocabularies')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('vocabulary serialized to:{}'.format(char_vocab_pkl_file))

logger.warn('selecting pretrained embeddings')
bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name='cn-wwm', pool_method='max',
                           requires_grad=True, layers='11', include_cls_sep=False,
                           dropout=0.5, word_dropout=0.01, auto_truncate=True)

logger.warn('building the model')
model = BertCRF(bert_embed, tag_vocab=target_vocab, encoding_type='bio')
logger.info(model)

logger.warn('setting training hyperparameters')
loss = LossInForward()
optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=2e-5)
# metric = AccuracyMetric()
metric = SpanFPreRecMetric(
    tag_vocab=data_bundle.get_vocab(Const.TARGET),
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW

data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')
print(data_bundle)

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True,
                      include_cls_sep=False, auto_truncate=True, dropout=0.5, word_dropout=0.01)
model = BertForQuestionAnswering(embed)
loss = CMRC2018Loss()
metric = CMRC2018Metric()

wm_callback = WarmupCallback(schedule='linear')
gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')
callbacks = [wm_callback, gc_callback]

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
                  sampler=BucketSampler(seq_len_field_name='context_len'),
                  dev_data=data_bundle.get_dataset('dev'), metrics=metric,
                  callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2,
                  print_every=1, test_use_tqdm=False, update_every=10)
@cache_results('imdb.pkl')
def get_data():
    data_bundle = IMDBLoader().process('imdb/')
    return data_bundle

data_bundle = get_data()
print(data_bundle)

# drop sequences that would exceed BERT's 512-token limit; since English words get split into
# word pieces, truncate at 400 words to leave some headroom
data_bundle.datasets['train'].drop(lambda x: len(x['words']) > 400)
data_bundle.datasets['dev'].drop(lambda x: len(x['words']) > 400)
data_bundle.datasets['test'].drop(lambda x: len(x['words']) > 400)

bert_embed = BertEmbedding(data_bundle.vocabs['words'], requires_grad=False,
                           model_dir_or_name="en-base-uncased")
model = BiLSTMSentiment(bert_embed, len(data_bundle.vocabs['target']))

Trainer(data_bundle.datasets['train'], model, optimizer=None, loss=CrossEntropyLoss(), device=0,
        batch_size=10, dev_data=data_bundle.datasets['dev'], metrics=AccuracyMetric()).train()

# evaluate on the test set
Tester(data_bundle.datasets['test'], model,
def forward(self, chars):
    # chars: (batch_size, max_len)
    chars = self.embedding(chars)
    outputs = self.mlp(chars)
    return {Const.OUTPUT: outputs}

def predict(self, chars):
    # chars: (batch_size, max_len)
    chars = self.embedding(chars)
    outputs = self.mlp(chars)
    return {Const.OUTPUT: outputs}

embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext',
                      pool_method='first', requires_grad=True, layers='11',
                      include_cls_sep=False, dropout=0.5)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear')
]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=3e-5)

for name, dataset in data.datasets.items():
    original_len = len(dataset)
    dataset.drop(lambda x: x['seq_len'] > 256, inplace=True)
    clipped_len = len(dataset)
    print("Deleted {} instances in {}.".format(original_len - clipped_len, name))
    bi_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                               model_dir_or_name='cpt/gigaword/bi.ite50.vec',
                               word_dropout=0.02, dropout=0.3, min_freq=min_freq,
                               only_norm_found_vector=normalize_embed, only_train_min_freq=True)
    return data_bundle, embed, bi_embed

data_bundle, embed, bi_embed = load_data()
bert_embed = BertEmbedding(data_bundle.get_vocab('chars'), model_dir_or_name='transformer_cpt/bert',
                           requires_grad=False)
print(data_bundle)

model = TENER(tag_vocab=data_bundle.get_vocab('target'), embed=embed, num_layers=num_layers,
              d_model=d_model, n_head=n_heads, feedforward_dim=dim_feedforward, dropout=dropout,
              after_norm=after_norm, attn_type=attn_type, bi_embed=bi_embed, bert_embed=bert_embed,
              fc_dropout=fc_dropout,
        self.embedding = Embedding(embed, dropout=0.1)
        self.tag_size = tag_size
        self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size])

    def forward(self, chars):
        # chars: (batch_size, max_len)
        chars = self.embedding(chars)
        outputs = self.mlp(chars)
        return {Const.OUTPUT: outputs}

embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base',
                      pool_method='max', requires_grad=True, layers='11')

for name, dataset in data.datasets.items():
    dataset.set_pad_val(Const.TARGET, -100)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear')
]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=1e-4)

for name, dataset in data.datasets.items():