def predict():
    """Evaluate the saved TextCNN checkpoint on the test set and print accuracy.

    Reads './dataset/test.txt' via `load_corpus` (sentences padded/truncated
    to 50 tokens), restores the model from `config.model_path`, and prints
    the overall classification accuracy.
    """
    # Load the test corpus as (contents, labels) numpy arrays.
    test_contents, test_labels = load_corpus('./dataset/test.txt',
                                             word2id,
                                             max_sen_len=50)
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # Restore the trained model.
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()
    model.to(device)
    # Count correct predictions over the whole test set.
    count, correct = 0, 0
    # FIX: run inference under no_grad — the original built (and kept)
    # autograd graphs for every evaluation batch.
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            correct += (output.argmax(1) == batch_y).sum().item()
            count += len(batch_x)
    # Print accuracy as a percentage.
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
def test():
    """Load the trained TextCNN and report test-set metrics.

    Prints a sklearn classification report (precision/recall/F1) and the
    confusion matrix for './data/cnews_final_test.txt'.
    """
    # Configuration file.
    cf = Config('./config.yaml')
    # Prefer GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test data.
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)
    # Pre-trained word-embedding matrix.
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model.
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # FIX: map_location keeps a GPU-saved checkpoint loadable on a
    # CPU-only machine (the original hard-crashed without CUDA).
    model.load_state_dict(torch.load("./output/model.bin",
                                     map_location=device))
    # Move the model to the chosen device.
    model.to(device)
    # Parallelise across GPUs when more than one is present.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    start_time = time.time()
    data_len = len(test_dataloader)
    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for step, batch in enumerate(
            tqdm(test_dataloader, "batch", total=len(test_dataloader))):
        label_id = batch['label_id'].squeeze(1).to(device)
        segment_ids = batch['segment_ids'].to(device)
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        # Collect predictions and gold labels for the final report.
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))
    # Evaluation report.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred,
                                        target_names=get_labels('./data/label')))
    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
def evaluate():
    """Restore the best checkpoint and report test-set loss, accuracy and
    the confusion matrix."""
    # Load the saved checkpoint, then rebuild the model around it.
    checkpoint = torch.load(config.save_model)
    model = TextCNN(config)
    model.cuda()
    model.load_state_dict(checkpoint["state_dict"])
    # Show which training step this checkpoint came from.
    print("epoch:%s steps:%s best_valid_acc:%s" %
          (checkpoint["epoch"], checkpoint["steps"],
           checkpoint["valid_acc"]))
    # Run the test pass and report its metrics.
    test_loss, test_acc, cm = test(config.test)
    print(f"\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)")
    print_confusion_matrix(cm, list(id2label.values()))
def train():
    """Train TextCNN on the merged train+validation corpora and save weights.

    Loads both corpora, stacks them into one training set, optionally
    warm-starts from `config.model_path`, trains for `config.epochs`
    epochs with Adam + cross-entropy, and writes the final state dict to
    './models/model.pth'.
    """
    train_contents, train_labels = load_corpus('./dataset/train.txt',
                                               word2id, max_sen_len=50)
    val_contents, val_labels = load_corpus('./dataset/validation.txt',
                                           word2id, max_sen_len=50)
    # Merge training and validation sets for the final training run.
    contents = np.vstack([train_contents, val_contents])
    labels = np.concatenate([train_labels, val_labels])
    # Build the training dataloader.
    train_dataset = TensorDataset(
        torch.from_numpy(contents).type(torch.float),
        torch.from_numpy(labels).type(torch.long))
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=2)
    model = TextCNN(config)
    # Optionally warm-start from an existing checkpoint.
    if config.model_path:
        model.load_state_dict(torch.load(config.model_path))
    model.to(device)
    # Optimizer and loss.
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss()
    # Training loop.
    for epoch in range(config.epochs):
        for batch_idx, (batch_x, batch_y) in enumerate(train_dataloader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            # BUG FIX: the original used `== 0 & config.verbose`; since `&`
            # binds tighter than `==` that reduced to `batch_idx % 200 == 0`
            # and the verbose flag was silently ignored.
            if batch_idx % 200 == 0 and config.verbose:
                print("Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}".format(
                    epoch + 1, batch_idx * len(batch_x),
                    len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # Persist the trained weights.
    torch.save(model.state_dict(), './models/model.pth')
# NOTE(review): this inference snippet is truncated — the body of the
# machine-learning branch at the bottom lies outside this view.
net = TextCNN(args)
output = None
dl_output = None
ml_output = None
# One CLI text field per feature column; each is fed through the model.
FEATURE_LABEL = [
    "PROJECT_NAME", "BUSINESS_UNIT", "REGION_ID", "REP_OFFICE_ID",
    "CUSTOMER_ID", "PROJECT_LEVEL_NAME", "BUSINESS_GROUP_NAME",
    "DELIVERY_TYPE", "PROJECT_LABEL"
]
# Deep Learning
if args.snapshot is not None:
    # Restore trained weights and switch to inference mode.
    net.load_state_dict(torch.load(args.snapshot))
    net.eval()
    feature = []
    for label in FEATURE_LABEL:
        text = getattr(args, label)
        text = text_fields.preprocess(text)
        # Map tokens to vocabulary indices, wrapped as a batch of one.
        text = [[text_fields.vocab.stoi[x] for x in text]]
        x = text_fields.tensor_type(text)
        # NOTE(review): `volatile=True` is the removed legacy-autograd flag;
        # modern torch would wrap inference in `torch.no_grad()` — confirm
        # the torch version this targets.
        x = autograd.Variable(x, volatile=True)
        feature.append(x)
    dl_output = net(feature).int().squeeze(0).tolist()
# Machine Learning
if args.machine_learning_model is not None:
from text_processor import TextProcessor
from torch_config import EMBEDDINGS_DIR

# Sanic web service wrapping a trained TextCNN for text classification.
app = Sanic('PyTorch API')

# Pre-trained word vectors used to build the model's embedding layer.
embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')
model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)
# Serve on CPU; map_location keeps GPU-saved weights loadable here.
device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

# Pre-processing pipeline: word-to-index map, basic English tokenizer,
# standardisation, minimum token length of 3.
text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)


@app.post('/game')
async def game(request: Request):
    # NOTE(review): handler is truncated in this view — the `q is None`
    # branch body lies outside it.
    q = request.form.get('q', None)
    if q is None:
# NOTE(review): fragment — begins mid-training-loop and ends mid-test-loop;
# epoch, i, model, args, W, label_df, test_loader, to_var, to_np are
# defined outside this view.
i += 1
# Report per-iteration losses to stdout and the log file.
print('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f' \
      % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
         domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))
logging.info('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f' \
             % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
                domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))
# Checkpoint this epoch's weights.
dir = 'checkpoint/WithoutImage_' + str(epoch + 1) + '.pkl'
torch.save(model.state_dict(), dir)

# test
# Reload the just-saved checkpoint into a fresh model for evaluation.
model = TextCNN(args, W)
model.load_state_dict(torch.load(dir))
if torch.cuda.is_available():
    model.cuda()
model.eval()
# NOTE(review): `np.float` was removed in NumPy >= 1.24 — on a modern NumPy
# this line needs plain `float`; left byte-identical here.
test_sub = np.zeros((len(label_df['id']), 3), dtype=np.float)
batch = len(label_df['id']) // args.batch_size
for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])
    test_text = test_text.long()
    test_mask = test_mask.float()
    test_outputs, domain_outputs = model(test_text, test_mask)
    # Fill the prediction buffer for every full batch (the possibly
    # ragged final batch is handled elsewhere).
    if i != batch:
        test_sub[i * args.batch_size:(i + 1) * args.batch_size, :] = to_np(test_outputs)
# NOTE(review): fragment — begins mid-batch-loop of a k-fold training run;
# model, data, criterion, optimizer, epoch, i, acc, the iterators and
# early_stopping are defined outside this view.
label = torch.autograd.Variable(label).squeeze()
out = model(data)
# L2 penalty on the second parameter tensor — presumably a specific weight
# matrix of TextCNN; TODO confirm the intended parameter ordering.
l2_loss = config.l2 * torch.sum(
    torch.pow(list(model.parameters())[1], 2))
loss = criterion(out, autograd.Variable(label.long())) + l2_loss
loss_sum += loss.data.item()
count += 1
# Report the running mean loss every 100 batches, then reset counters.
if count % 100 == 0:
    print("epoch", epoch, end=' ')
    print("The loss is: %.5f" % (loss_sum / 100))
    loss_sum = 0
    count = 0
optimizer.zero_grad()
loss.backward()
optimizer.step()
# save the model in every epoch
# One epoch finished — evaluate on the validation split and feed the
# loss into the early-stopping tracker (which also checkpoints).
valid_loss, valid_acc = get_test_result(valid_iter, valid_set)
early_stopping(valid_loss, model)
print("The valid acc is: %.5f" % valid_acc)
if early_stopping.early_stop:
    print("Early stopping")
    break
# Result for fold i: reload the early-stopping checkpoint and test.
model.load_state_dict(torch.load('./checkpoints/checkpoint%d.pt' % i))
test_loss, test_acc = get_test_result(test_iter, test_set)
print("The test acc is: %.5f" % test_acc)
# Accumulate this fold's accuracy into the 10-fold average.
acc += test_acc / 10
# Print the average accuracy over the 10 folds.
print("The test acc is: %.5f" % acc)
def train(config):
    """Train a TextCNN sentiment classifier on IMDB.

    Implements validation-loss early stopping with patience, learning-rate
    decay with restart from the best checkpoint, and periodic checkpointing.

    Args:
        config: dict of hyper-parameters and paths; a missing key prints
            "Input Parameter Error" and exits with status 1.
    """
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError:
        print("Input Parameter Error")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()

    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # Build torchtext fields: spacy-tokenized lowercase text, long labels.
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)
    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        # Shrink the corpus to 10% for quick debug runs.
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    # NOTE: pretrained_vector is only bound when both settings are given;
    # build_vocab below requires it, so both must be configured.
    if (pretrained_model_file is not None) and (pretrained_model_dir is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[
        -1], "incompatiable embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)
    model = TextCNN(embed_num, embed_dim, class_num, filter_num,
                    filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # LABEL vocab order is [<unk>, <pad>, 'pos', 'neg']; the specials get
    # zero weight so they never contribute to the loss.
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))

    if last_model_path is not None:
        # Resume model and optimizer state from a previous run.
        logging.info(f'load model from {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        step = 0
        for batch in iter(train_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            # BUG FIX: accumulate the scalar, not the tensor — summing the
            # tensor retained every batch's autograd graph for the whole
            # epoch, ballooning memory.
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step

        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)
        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy} speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        # Better than every previous epoch?  (First epoch is always better.)
        is_better = len(
            hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")

        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)
            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)
                # Decay lr and restore from the previously best checkpoint.
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)
                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))
                # Apply the decayed lr to every parameter group.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                # Reset patience for the new trial.
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
if __name__ == '__main__':
    # NOTE(review): block appears truncated — `result` is filled but never
    # used, and `target` is computed last with no follow-up in this view.
    # load data and model (pickled corpora plus word/tag index maps)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/train_data.pkl', 'rb') as fp:
        train_data = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/test_data.pkl', 'rb') as fp:
        test_data = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/word2index.pkl', 'rb') as fp:
        word2index = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/tag2index.pkl', 'rb') as fp:
        tag2index = pickle.load(fp)

    # build model
    config = Config()
    model = TextCNN(len(word2index), config.word_embedding_dimension,
                    len(tag2index))
    model.load_state_dict(torch.load('save_model.pth'))
    model.eval()

    # test
    print('start testing……')
    result = []
    # for test in train_data:
    for i, batch in enumerate(getBatch(config.batch_size, test_data)):
        data, label = pad_to_batch(batch, word2index, tag2index)
        with torch.no_grad():
            score = model(data)
        # Top-3 predicted tag indices per example.
        pred = torch.topk(score, 3, dim=1)[1].data.tolist()
        # print('pred:', pred)
        target = torch.topk(label, 3, dim=1)
# NOTE(review): fragment — the top half is the tail of a validate() helper
# (inputs/labels/model/loss_fn and the accumulators are bound outside this
# view) and the bottom half is an early-stopping training loop whose head
# (ep, net, best_model, dataLoader) is also outside it.
if args.gpu:
    inputs = inputs.cuda()
    labels = labels.cuda()
outputs = model(inputs)
loss = loss_fn(outputs, labels).item()
# Weight by batch size so the final average is per-sample, not per-batch.
cum_loss += loss * labels.size(0)
cum_cnt += labels.size(0)
model.train()
return cum_loss / cum_cnt

while True:
    valid_loss = validate(net)
    if args.verbose:
        print('validation loss: %.5f' % (valid_loss))
    # Track the best model so far; reset the stall counter on improvement.
    if ep == 0 or valid_loss < best_loss:
        best_loss = valid_loss
        best_model.load_state_dict(net.state_dict())
        no_improve_cnt = 0
    else:
        no_improve_cnt += 1
    # Stop after 5 epochs without improvement, or past 1000 epochs.
    if no_improve_cnt > 5 or ep > 1000:
        if args.verbose:
            print('final validation: %.5f' % (validate(best_model)))
            print('best validation: %.5f' % (best_loss))
        break
    # Train
    for it, data in enumerate(dataLoader, start=0):
        inputs, labels = data
        if args.gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
# NOTE(review): the nesting of this last line is ambiguous in the collapsed
# source and the statement after it is cut off — verify against the original.
optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
# NOTE(review): fragment — end of a per-epoch loop plus the final test
# evaluation; epoch, valid_acc, best_valid_acc, writer, model, args,
# model_name etc. are defined outside this view.
if valid_acc > best_valid_acc:
    # Checkpoint whenever validation accuracy improves.
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'valid_acc': valid_acc
    }, True)
secs = int(time.time() - start_time)
# NOTE(review): `/` yields fractional minutes here; `//` was likely
# intended — the %d format truncates it anyway, so output is unaffected.
mins = secs / 60
secs = secs % 60
# TensorBoard scalars for the loss/accuracy curves.
writer.add_scalars("Loss", {
    'train': train_loss,
    'valid': valid_loss
}, epoch)
writer.add_scalars("Acc", {
    'train': train_acc,
    'valid': valid_acc
}, epoch)
print("Epoch: %d" % (epoch + 1),
      " | time in %d minutes, %d seconds" % (mins, secs))
print(f"\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)")
print(f"\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)")

# test
# Reload the best checkpoint and evaluate on the test split.
saved_params = torch.load("%s/%s" % (args.save_model, model_name))
print("epoch:%s best_valid_acc:%s" %
      (saved_params['epoch'], saved_params['valid_acc']))
model.load_state_dict(saved_params['state_dict'])
loss, acc = test(args.test)
print("test set loss: %s" % loss)
print("test set acc: %s" % acc)