def train_and_save_model(data_train, data_test, vocab, max_sentence_length, save_dir):
    # Confirm that the installed torch version is compatible with fastNLP
    print(torch.__version__)

    # Build the network
    model = DPCNN(max_features=len(vocab),
                  word_embedding_dimension=word_embedding_dimension,
                  max_sentence_length=max_sentence_length,
                  num_classes=num_classes)

    # Define loss and metric
    loss = CrossEntropyLoss(pred="output", target="label_seq")
    metric = AccuracyMetric(pred="predict", target="label_seq")

    # Train the model on train_data and validate it on test_data.
    # embedding=300, Gaussian init, weight_decay=0.0001, lr=0.001, epochs=5
    trainer = Trainer(model=model, train_data=data_train, dev_data=data_test,
                      loss=loss, metrics=metric, save_path='CD',
                      batch_size=64, n_epochs=5,
                      optimizer=Adam(lr=0.001, weight_decay=0.0001))
    trainer.train()

    # Save the model
    _save_model(model, model_name='new_model.pkl', save_dir=save_dir)
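# `_save_model` above is a project-local helper that is not shown in this
# file; a minimal sketch of what it might do, assuming it only persists the
# parameters (hypothetical, not the project's actual implementation):
import os
import torch

def _save_model(model, model_name, save_dir):
    # Save only the state dict; the architecture must be rebuilt before
    # these weights can be loaded again.
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(save_dir, model_name))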
def test_trainer_suggestion6(self):
    # Check that the error message correctly alerts the user.
    # An extra (duplicate) input field is passed in on purpose.
    dataset = prepare_fake_dataset2('x1', 'x_unused')
    dataset.rename_field('x_unused', 'x2')
    dataset.set_input('x1', 'x2')
    dataset.set_target('y', 'x1')

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(5, 4)

        def forward(self, x1, x2):
            x1 = self.fc(x1)
            x2 = self.fc(x2)
            x = x1 + x2
            time.sleep(0.1)
            # loss = F.cross_entropy(x, y)
            return {'preds': x}

    model = Model()
    with self.assertRaises(NameError):
        trainer = Trainer(train_data=dataset, model=model, dev_data=dataset,
                          loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                          use_tqdm=False, print_every=2)
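# `prepare_fake_dataset2` comes from the test utilities and is not shown
# here; a hypothetical sketch consistent with how it is used above (each
# requested field holds 5-dim float vectors, matching the nn.Linear(5, 4)
# model, plus an integer target 'y'):
import numpy as np
from fastNLP import DataSet

def prepare_fake_dataset2(*fields, num_samples=32):
    # Hypothetical helper: random 5-dim inputs for every requested field
    # name, plus an integer class target 'y' in [0, 4).
    data = {name: [np.random.randn(5).astype('float32') for _ in range(num_samples)]
            for name in fields}
    data['y'] = [int(np.random.randint(0, 4)) for _ in range(num_samples)]
    return DataSet(data)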
def test_fastnlp_1min_tutorial(self):
    # tutorials/fastnlp_1min_tutorial.ipynb
    data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print(ds[1])

    # Lowercase the raw sentences
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # Convert labels to int
    ds.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    ds.apply(split_sent, new_field_name='words', is_input=True)

    # Split into train / dev sets
    train_data, dev_data = ds.split(0.3)
    print("Train size: ", len(train_data))
    print("Test size: ", len(dev_data))

    from fastNLP import Vocabulary
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

    # Index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words', is_input=True)
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words', is_input=True)

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
    trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                      loss=CrossEntropyLoss(),
                      optimizer=Adam(),
                      metrics=AccuracyMetric(target='target'))
    trainer.train()
    print('Train finished!')
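# The 1-minute tutorial stops after training; evaluating the trained model
# on the held-out split is a short follow-up with fastNLP's Tester, using
# the same metric as above (a sketch, not part of the original tutorial):
from fastNLP import Tester

tester = Tester(data=dev_data, model=model,
                metrics=AccuracyMetric(target='target'), batch_size=32)
tester.test()  # prints and returns the evaluation result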
def train(epochs=10, lr=0.001):
    global model
    for i in range(epochs):
        print('----------------- ', str(i + 1), ' ------------------')
        trainer = Trainer(model=model, train_data=train_set, dev_data=test_set,
                          loss=CrossEntropyLoss(pred='output', target='target'),
                          metrics=AccuracyMetric(pred='pred', target='target'),
                          optimizer=Adam(lr=lr), save_path=None,
                          batch_size=1, n_epochs=1)
        trainer.train()
        model.load_state_dict(copy.deepcopy(trainer.model.state_dict()))
        # save('../model/cnn-' + str(kernel_sizes) + '-' + str(keep_proba) + '-' + str(i+1))
        save('../model/lstm-' + str(input_dim) + '-' + str(hidden_dim) + '-' + str(i + 1))
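# `save` is an external helper not defined in this snippet; a plausible
# sketch, assuming it simply writes the current weights of the global
# model to the given path (hypothetical implementation):
import torch

def save(path):
    # Persist the global model's parameters at `path`.
    torch.save(model.state_dict(), path)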
def test_fastnlp_10min_tutorial(self):
    # Read data from csv into a DataSet
    sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    dataset = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')._load(sample_path)
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))

    # Lowercase the raw sentences
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # Convert labels to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')

    # Add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # Filter out samples with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
    print(len(dataset))

    # Mark which fields in the DataSet should be converted to tensors:
    # targets are the gold labels used when computing the loss and evaluating
    dataset.set_target("label")
    # inputs are fed to the model's forward
    dataset.set_input("words", "seq_len")

    # Split into test and train sets
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # Build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # Index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='words')
    print(test_data[0])

    # These preprocessing tools can also be used for projects such as
    # reinforcement learning or GANs.
    from fastNLP.core.batch import DataSetIter
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # Rename the corresponding fields in the DataSet; the names must match
    # the parameter names of the model's forward
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(target="label_seq")
    metric = AccuracyMetric(target="label_seq")

    # Instantiate a Trainer with the model and data, then train.
    # First overfit on test_data to make sure the model implementation is correct.
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss,
                              batch_size=32, n_epochs=5, dev_data=test_data,
                              metrics=metric, save_path=None)
    overfit_trainer.train()

    # Train on train_data and validate on test_data
    trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                      loss=CrossEntropyLoss(target="label_seq"),
                      metrics=AccuracyMetric(target="label_seq"),
                      save_path=None, batch_size=32, n_epochs=5)
    trainer.train()
    print('Train finished!')

    # Use the Tester to evaluate on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(target="label_seq"), batch_size=4)
    acc = tester.test()
    print(acc)
from copy import deepcopy
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import Adam
# from fastNLP.core.utils import save_pickle
from fastNLP.io.model_io import ModelSaver

# Load the model
model = DPCNN(max_features=len(vocab) + 1,
              word_embedding_dimension=word_embedding_dimension,
              max_sentence_length=max_sentence_length,
              num_classes=num_classes, weight=weight)

# Define loss and metric
loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")

# Train the model on train_data and validate it on test_data.
# embedding=300, Gaussian init, weight_decay=0.0001, lr=0.001, epochs=5
trainer = Trainer(model=model, train_data=dataset_train, dev_data=dataset_test,
                  loss=loss, metrics=metric, save_path='new_model.pkl',
                  batch_size=64, n_epochs=5,
                  optimizer=Adam(lr=0.001, weight_decay=0.0001))
trainer.train()
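# ModelSaver is imported above but never called; a short sketch of saving
# the trained weights with it after training (the path is illustrative,
# and the call follows fastNLP's documented ModelSaver usage):
saver = ModelSaver("./save/new_model.pkl")
saver.save_pytorch(model)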
def testENAS(self):
    # Read data from csv into a DataSet
    sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
                               sep='\t')
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))

    # Lowercase the raw sentences
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # Convert labels to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')

    # Add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # Filter out samples with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # Mark which fields in the DataSet should be converted to tensors:
    # targets are the gold labels used when computing the loss and evaluating
    dataset.set_target("label")
    # inputs are fed to the model's forward
    dataset.set_input("words", "seq_len")

    # Split into test and train sets
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # Build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # Index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='words')
    print(test_data[0])

    # These preprocessing tools can also be used for projects such as
    # reinforcement learning or GANs.
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models.enas_model import ENASModel
    from fastNLP.models.enas_controller import Controller
    model = ENASModel(embed_num=len(vocab), num_classes=5)
    controller = Controller()

    from fastNLP.models.enas_trainer import ENASTrainer
    from copy import deepcopy

    # Rename the corresponding fields in the DataSet; the names must match
    # the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches forward's parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(pred="output", target="label_seq")
    metric = AccuracyMetric(pred="predict", target="label_seq")

    trainer = ENASTrainer(model=model, controller=controller,
                          train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(pred="output", target="label_seq"),
                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
                          check_code_level=-1, save_path=None, batch_size=32,
                          print_every=1, n_epochs=3, final_epochs=1)
    trainer.train()
    print('Train finished!')

    # Use the Tester to evaluate on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_tutorial(self):
    # Read data from csv into a DataSet
    sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
                               sep='\t')
    print(len(dataset))
    print(dataset[0])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # Convert labels to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')

    # Add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    # print(len(dataset))
    # print(dataset[0])

    # Filter out samples with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # Mark which fields in the DataSet should be converted to tensors:
    # targets are the gold labels used when computing the loss and evaluating
    dataset.set_target("label")
    # inputs are fed to the model's forward
    dataset.set_input("words")

    # Split into test and train sets
    test_data, train_data = dataset.split(0.5)
    # print(len(test_data))
    # print(len(train_data))

    # Build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # Index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='words')
    print(test_data[0])

    model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5,
                    padding=2, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # Rename the corresponding fields in the DataSet; the names must match
    # the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches forward's parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    # Instantiate a Trainer with the model and data, then train
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model,
                              loss=CrossEntropyLoss(pred="output", target="label_seq"),
                              metrics=AccuracyMetric(pred="predict", target="label_seq"),
                              n_epochs=10, batch_size=4, dev_data=test_data,
                              save_path="./save")
    overfit_trainer.train()

    trainer = Trainer(train_data=train_data, model=model,
                      loss=CrossEntropyLoss(pred="output", target="label_seq"),
                      metrics=AccuracyMetric(pred="predict", target="label_seq"),
                      n_epochs=10, batch_size=4, dev_data=test_data,
                      save_path="./save")
    trainer.train()
    print('Train finished!')

    # Evaluate with fastNLP's Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
callbacks = [
    GradientClipCallback(clip_value=10),  # equivalent to torch.nn.utils.clip_grad_norm_(10)
    LRScheduler(scheduler),
]
if arg.task in ['snli']:
    # Evaluate the test set every epoch when the task is snli
    callbacks.append(
        EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name]))

# Define trainer
trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name],
                  model=model,
                  optimizer=optimizer,
                  loss=CrossEntropyLoss(),
                  batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
                  n_epochs=arg.n_epochs,
                  print_every=-1,
                  dev_data=data_bundle.datasets[arg.dev_dataset_name],
                  metrics=AccuracyMetric(),
                  metric_key='acc',
                  device=[i for i in range(torch.cuda.device_count())],
                  check_code_level=-1,
                  save_path=arg.save_path,
                  callbacks=callbacks)

# Train model
trainer.train(load_best_model=True)
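# `scheduler` is assumed to be constructed earlier; with fastNLP's
# LRScheduler callback it is typically a standard torch scheduler built on
# the same optimizer. A sketch, assuming `optimizer` is a plain
# torch.optim optimizer over `model`:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # halve the LR every 10 epochs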
from fastNLP import Const
from fastNLP import AccuracyMetric
from fastNLP import CrossEntropyLoss
from fastNLP import BucketSampler
from fastNLP import Batch
import torch
import time
import fitlog
from fastNLP.core.callback import FitlogCallback
from fastNLP import Tester
from fastNLP import Callback

fitlog.commit(__file__)             # auto-commit your code
fitlog.add_hyper_in_file(__file__)  # record your hyperparameters

loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

target_len = 20

def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')
    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
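        # --- hypothetical continuation: the source is truncated at the
        # loop above. One plausible body, tokenizing each document and
        # keeping its integer label (`Instance` assumed imported from
        # fastNLP alongside DataSet):
        train_data.append(Instance(words=data[i].split(),
                                   target=int(target[i])))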
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    val_data = text_data.val_set
    test_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    val_data.set_input('words', 'seq_len')
    val_data.set_target('target')
    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model will be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds, num_classes=class_num,
                        padding=2, dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds, num_cls=class_num,
                         hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds, num_classes=class_num,
                          padding=2, dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds, output_dim=class_num,
                         hidden_dim=args.hidden_size, num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
        # NOTE: the source never assigns `model` in this branch; reaching it
        # raises a NameError at print(model) below.
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size), num_classes=class_num,
                        padding=2, dropout=0.1)
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; val_size:{1} ; test_size:{2}".format(
        train_data.get_length(), val_data.get_length(), test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001, weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)
    fitlog_back = FitlogCallback({"val": val_data, "train": train_data})

    trainer = Trainer(train_data=train_data, model=model,
                      save_path=model_save_path, device=device,
                      n_epochs=args.epochs, optimizer=optimizer,
                      dev_data=val_data, loss=criterion,
                      batch_size=args.batch_size, metrics=metric,
                      callbacks=[fitlog_back, earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=val_data, model=model, metrics=metric,
                    batch_size=args.batch_size, device=device)
    tester.test()
    print("Test Done.")

    print("Predict the answer with best model...")
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        # print(pred.shape)
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done. {} records".format(len(output)))

    result_save_path = os.path.join(args.result_dir,
                                    args.model + "_" + args.model_suffix)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)

    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("true sample count:{}".format(count))

    add_count = 0
    for i in range(len(projectid) - len(output)):
        output.append([0.13])
        add_count += 1
    print("Add {} default result in predict.".format(add_count))

    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))

    fitlog.finish()
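# Note: torch.autograd.Variable is deprecated in modern PyTorch; a minimal
# up-to-date sketch of the same prediction loop (same field and constant
# names as above) runs under torch.no_grad() instead:
model.eval()
output = []
with torch.no_grad():
    for batch_x, _ in Batch(test_data, batch_size=args.batch_size):
        logits = model(batch_x['words'].cuda())[C.OUTPUT]
        output.append(torch.sigmoid(logits).cpu())
output = torch.cat(output, 0).numpy()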
def test_fastnlp_advanced_tutorial(self):
    import os
    os.chdir("tutorials/fastnlp_advanced_tutorial")

    from fastNLP import DataSet
    from fastNLP import Instance
    from fastNLP import Vocabulary
    from fastNLP import Trainer
    from fastNLP import Tester

    # ### Instance
    # An Instance represents one sample, made up of one or more fields
    # (attributes/features); each field has its own name and value.
    # Fields can be defined at initialization with "field_name=field_value".

    # In[2]:
    # Build an Instance with three fields: premise, hypothesis, label
    instance = Instance(premise='an premise example .',
                        hypothesis='an hypothesis example.', label=1)
    instance

    # In[3]:
    data_set = DataSet([instance] * 5)
    data_set.append(instance)
    data_set[-2:]

    # In[4]:
    # An instance whose field type differs from the dataset's field type
    # can still be added to the dataset
    instance2 = Instance(premise='the second premise example .',
                         hypothesis='the second hypothesis example.', label='1')
    try:
        data_set.append(instance2)
    except:
        pass
    data_set[-2:]

    # In[5]:
    # If a field name is wrong, the instance cannot be appended to the dataset
    instance3 = Instance(premises='the third premise example .',
                         hypothesis='the third hypothesis example.', label=1)
    try:
        data_set.append(instance3)
    except:
        print('cannot append instance')
        pass
    data_set[-2:]

    # In[6]:
    # Besides text, a tensor can also be the value of a field
    import torch
    tensor_ins = Instance(image=torch.randn(5, 5), label=0)
    ds = DataSet()
    ds.append(tensor_ins)
    ds

    from fastNLP import DataSet
    from fastNLP import Instance

    # Read data from csv into a DataSet.
    # Any csv-like file (one example per line) can be read this way.
    dataset = DataSet.read_csv('tutorial_sample_dataset.csv',
                               headers=('raw_sentence', 'label'), sep='\t')

    # Check the size of the DataSet
    len(dataset)

    # In[8]:
    # Index with [k] to get the k-th sample
    dataset[0]

    # In[9]:
    # The retrieved sample is an Instance
    type(dataset[0])

    # In[10]:
    # Index with [a: b] to get samples a through b
    dataset[0:3]

    # In[11]:
    # Indices can also be negative
    dataset[-1]

    data_path = ['premise', 'hypothesis', 'label']

    # Read the files
    with open(data_path[0]) as f:
        premise = f.readlines()
    with open(data_path[1]) as f:
        hypothesis = f.readlines()
    with open(data_path[2]) as f:
        label = f.readlines()
    assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

    # Assemble the DataSet
    data_set = DataSet()
    for p, h, l in zip(premise, hypothesis, label):
        p = p.strip()  # strip trailing whitespace
        h = h.strip()  # strip trailing whitespace
        data_set.append(Instance(premise=p, hypothesis=h, truth=l))
    data_set[0]

    # ### Other DataSet operations
    # A DataSet's contents can still be modified after construction,
    # via the DataSet.apply() interface

    # In[13]:
    # Lowercase all text in the premise field
    data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
    data_set[-2:]

    # In[14]:
    # Convert labels to int
    data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
    data_set[-2:]

    # In[15]:
    # Split sentences on whitespace
    def split_sent(ins):
        return ins['premise'].split()

    data_set.apply(split_sent, new_field_name='premise')
    data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
    data_set[-2:]

    # In[16]:
    # Filter data
    origin_data_set_len = len(data_set)
    data_set.drop(lambda x: len(x['premise']) <= 6)
    origin_data_set_len, len(data_set)

    # In[17]:
    # Add length information
    data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
    data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
    data_set[-1]

    # In[18]:
    # Declare input and target fields
    data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
    data_set.set_target("truth")

    # In[19]:
    # Rename a field
    data_set.rename_field('truth', 'label')
    data_set[-1]

    # In[20]:
    # Split into train, dev and test sets
    train_data, vad_data = data_set.split(0.5)
    dev_data, test_data = vad_data.split(0.4)
    len(train_data), len(dev_data), len(test_data)

    # In[21]:
    # Deep-copy a dataset
    import copy
    train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
    del copy

    # Initialize the vocabulary: max vocab_size is 10000, minimum word
    # frequency is 2, '<unk>' marks unknown words, '<pad>' marks padding.
    # Vocabulary's defaults are max_size=None, min_freq=None,
    # unknown='<unk>', padding='<pad>'
    vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

    # Build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
    train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()

    # In[23]:
    # Index sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                   new_field_name='premise')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                   new_field_name='hypothesis')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                    new_field_name='premise')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                    new_field_name='hypothesis')
    train_data[-1], dev_data[-1], test_data[-1]

    # Read a vocab file
    with open('vocab.txt') as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate a Vocabulary
    vocab_bert = Vocabulary(unknown=None, padding=None)
    # Add the vocabs list to the Vocabulary
    vocab_bert.add_word_lst(vocabs)
    # Build the vocabulary
    vocab_bert.build_vocab()
    # Update the unknown and padding token texts
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    # In[25]:
    # Index sentences with the vocabulary
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                       new_field_name='premise')
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                       new_field_name='hypothesis')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    train_data_2[-1], dev_data_2[-1]

    # Step 1: load model hyperparameters (optional)
    from fastNLP.io.config_io import ConfigSection, ConfigLoader
    args = ConfigSection()
    ConfigLoader().load_config("./data/config", {"esim_model": args})
    args["vocab_size"] = len(vocab)
    args.data

    # In[27]:
    # Step 2: load the ESIM model
    from fastNLP.models import ESIM
    model = ESIM(**args.data)
    model

    # In[28]:
    # Another example: load the CNN text classification model
    from fastNLP.models import CNNText
    cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5,
                             padding=2, dropout=0.1)
    cnn_text_model

    from fastNLP import CrossEntropyLoss
    from fastNLP import Adam
    from fastNLP import AccuracyMetric

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred='pred', target='label'),
        metrics=AccuracyMetric(),
        n_epochs=3,
        batch_size=16,
        print_every=-1,
        validate_every=-1,
        dev_data=dev_data,
        use_cuda=False,
        optimizer=Adam(lr=1e-3, weight_decay=0),
        check_code_level=-1,
        metric_key='acc',
        use_tqdm=False,
    )
    trainer.train()

    tester = Tester(
        data=test_data,
        model=model,
        metrics=AccuracyMetric(),
        batch_size=args["batch_size"],
    )
    tester.test()

    os.chdir("../..")
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    test_dev_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    test_dev_data.set_input('words', 'seq_len')
    test_dev_data.set_target('target')
    test_data, dev_data = test_dev_data.split(0.2)
    test_data = test_dev_data

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model will be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds, num_classes=class_num,
                        padding=2, dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds, num_cls=class_num,
                         hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds, num_classes=class_num,
                          padding=2, dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds, output_dim=class_num,
                         hidden_dim=args.hidden_size, num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
        # NOTE: the source never assigns `model` in this branch; reaching it
        # raises a NameError at print(model) below.
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size), num_classes=class_num,
                        padding=2, dropout=0.1)
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; dev_size:{1} ; test_size:{2}".format(
        train_data.get_length(), dev_data.get_length(), test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001, weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)

    trainer = Trainer(train_data=train_data, model=model,
                      save_path=model_save_path, device=device,
                      n_epochs=args.epochs, optimizer=optimizer,
                      dev_data=test_data, loss=criterion,
                      batch_size=args.batch_size, metrics=metric,
                      callbacks=[FitlogCallback(test_data), earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=test_data, model=model, metrics=metric,
                    batch_size=args.batch_size, device=device)
    tester.test()
    print("Test Done.")

    fitlog.finish()