def predict():
    # Load the test set
    test_contents, test_labels = load_corpus('./dataset/test.txt', word2id, max_sen_len=50)
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # Load the trained model
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()
    model.to(device)
    # Evaluation loop (no gradients needed at inference time)
    count, correct = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            correct += (output.argmax(1) == batch_y).sum().item()
            count += len(batch_x)
    # Report accuracy
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
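# `load_corpus` is called above but not defined in this snippet. A minimal
# sketch of what it might look like, assuming a "label<TAB>text" line format
# and out-of-vocabulary words mapped to index 0 (both assumptions; the real
# helper may differ):
def load_corpus(path, word2id, max_sen_len=50):
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            ids = [word2id.get(w, 0) for w in text.split()][:max_sen_len]
            ids += [0] * (max_sen_len - len(ids))  # pad to a fixed length
            contents.append(ids)
            labels.append(int(label))
    return np.array(contents), np.array(labels)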
def train(**kwargs):
    opt.parse(kwargs)
    device = torch.device(
        "cuda:{}".format(opt.gpu_id) if torch.cuda.is_available() else "cpu")
    opt.device = device

    x_text, y = load_data_and_labels("./data/rt-polarity.pos",
                                     "./data/rt-polarity.neg")
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=opt.test_size)
    train_data = Data(x_train, y_train)
    test_data = Data(x_test, y_test)
    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)
    print("{} train data: {}, test data: {}".format(
        now(), len(train_data), len(test_data)))

    model = TextCNN(opt)
    print("{} init model finished".format(now()))
    if opt.use_gpu:
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr,
                           weight_decay=opt.weight_decay)

    for epoch in range(opt.epochs):
        total_loss = 0.0
        model.train()
        for step, batch_data in enumerate(train_loader):
            x, labels = batch_data
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        acc = test(model, test_loader)
        print("{} {} epoch: loss: {}, acc: {}".format(now(), epoch, total_loss, acc))
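# `test` is called at the end of each epoch above but is not defined in this
# snippet. A plausible sketch matching the call site, assuming batches unpack
# the same way as in the training loop and that `opt.device` has been set:
def test(model, test_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, labels in test_loader:
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(opt.device)
            output = model(x)
            correct += (output.argmax(1) == labels).sum().item()
            total += labels.size(0)
    return correct / total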
def test():
    # Load the configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test data (no shuffling needed during evaluation)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)
    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Build the model and restore the trained weights
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # model.load_state_dict(torch.load("./output/model.bin", map_location='cpu'))  # use when loading a GPU checkpoint on CPU
    model.load_state_dict(torch.load("./output/model.bin"))
    # Move the model to the target device
    model.to(device)
    # Parallelize across GPUs when more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Evaluation
    start_time = time.time()
    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for step, batch in enumerate(tqdm(test_dataloader, "batch", total=len(test_dataloader))):
        label_id = batch['label_id'].squeeze(1).to(device)
        segment_ids = batch['segment_ids'].to(device)
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # Per-class precision, recall and F1
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred,
                                        target_names=get_labels('./data/label')))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
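# `get_labels` is referenced above but not shown. A minimal sketch, assuming
# './data/label' stores one class name per line (a hypothetical file layout):
def get_labels(path):
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]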
def objective(trial):
    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    # Let Optuna choose the optimizer and the learning rate for this trial
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = []
        for batch in train_iter:
            text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()
        # print(f'Epoch[{epoch}] - Loss:{sum(epoch_loss)/len(epoch_loss)}')

        # Validation pass
        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)
        acc = metrics.accuracy_score(labels_all, predict_all)

        # Report intermediate accuracy so Optuna can prune unpromising trials
        trial.report(acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return acc
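# A typical way to run the objective above with standard Optuna calls; the
# study settings (pruner choice, trial count) are illustrative:
study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print("best accuracy:", study.best_value)
print("best params:", study.best_params)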
def train():
    train_contents, train_labels = load_corpus('./dataset/train.txt', word2id, max_sen_len=50)
    val_contents, val_labels = load_corpus('./dataset/validation.txt', word2id, max_sen_len=50)
    # Merge the training and validation sets
    contents = np.vstack([train_contents, val_contents])
    labels = np.concatenate([train_labels, val_labels])
    # Build the training DataLoader
    train_dataset = TensorDataset(
        torch.from_numpy(contents).type(torch.float),
        torch.from_numpy(labels).type(torch.long))
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=2)

    model = TextCNN(config)
    if config.model_path:
        model.load_state_dict(torch.load(config.model_path))
    model.to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(config.epochs):
        for batch_idx, (batch_x, batch_y) in enumerate(train_dataloader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            if batch_idx % 200 == 0 and config.verbose:
                print("Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}".format(
                    epoch + 1, batch_idx * len(batch_x),
                    len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Save the trained weights
    torch.save(model.state_dict(), './models/model.pth')
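# The global `config` object used by train() and predict() is not defined in
# these snippets. A hypothetical minimal stand-in carrying only the fields
# they touch (field values are examples, not the project's defaults):
from types import SimpleNamespace

config = SimpleNamespace(batch_size=64, learning_rate=1e-3, epochs=10,
                         verbose=True, model_path='')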
PAD = 0
model_name = 'GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=True)
vocab_file = make_vocab(data_file, vocab_output_file)
vocab2idx = convert_vocab_to_idx(vocab_output_file)
word_embedding = load_word_embedding(vocab2idx, word2vec)

# Train/test split
X, Y = load_data(data_file, vocab2idx)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1)
train_dataset = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
test_dataset = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# Model setup
model = TextCNN(word_embedding)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_func = nn.CrossEntropyLoss()

# Model training
train(model, device, optimizer, loss_func)
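# The `train` function invoked above is not defined in this snippet. A minimal
# sketch matching the call signature, assuming it reads the global
# `train_loader` and a fixed epoch count (both assumptions):
def train(model, device, optimizer, loss_func, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.long().to(device)
            optimizer.zero_grad()
            loss = loss_func(model(x), y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"epoch {epoch}: loss {total_loss / len(train_loader):.4f}")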
def train():
    # Load the configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # Test data, used here as the validation set
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)
    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer over the trainable parameters only
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))
    # Move the model to the target device
    model.to(device)
    # Parallelize across GPUs when more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Training
    start_time = time.time()
    total_batch = 0             # total number of batches seen
    best_acc_val = 0.0          # best validation accuracy so far
    last_improved = 0           # batch index of the last improvement
    require_improvement = 1000  # stop early after 1000 batches without improvement
    flag = False
    model.train()
    for epoch_id in trange(cf.epoch, desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, "batch", total=len(train_dataloader))):
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss = model(segment_ids, label_id)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1
            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = model.get_loss_acc(segment_ids, label_id)
                    loss_val, acc_val = evaluate(model, test_dataloader, device)
                if acc_val > best_acc_val:
                    # Save the best model seen so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val,
                                 acc_val, time_dif, improved_str))
                model.train()

            if total_batch - last_improved > require_improvement:
                print("No improvement for too long; stopping early.")
                flag = True
                break
        if flag:
            break
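# `evaluate` is referenced in the loop above but not defined in this snippet.
# A plausible sketch that mirrors model.get_loss_acc over the whole loader;
# the per-batch averaging scheme is an assumption:
def evaluate(model, dataloader, device):
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss, acc = model.get_loss_acc(segment_ids, label_id)
            total_loss += loss
            total_acc += acc
            n_batches += 1
    return total_loss / n_batches, total_acc / n_batches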
def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError:
        print("Input Parameter Error")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()

    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # Build torchtext fields
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)
    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:  # in debug mode, keep only a 10% slice before the real split
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    if (pretrained_model_file is not None) and (pretrained_model_dir is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)
    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[-1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)
    model = TextCNN(embed_num, embed_dim, class_num, filter_num, filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # LABEL vocab is [<unk>, <pad>, 'pos', 'neg']; zero-weight the special tokens
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))

    if last_model_path is not None:
        # Resume from a checkpoint
        logging.info(f'load model from {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0.0
        step = 0
        for batch in iter(train_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss.item()  # accumulate as a float, not a graph-holding tensor
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step

        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)
        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t '
            f'val_accuracy:{accuracy} speed:{time.time()-train_time:.2f}s/epoch\t '
            f'time elapsed {time.time()-begin_time:.2f}s')
        train_time = time.time()

        is_better = len(hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")

        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizer's state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)
            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)
                # decay lr, and restore from the previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}')
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)
                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
                # set the new learning rate
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)