def predict():
    # Load the test set
    test_contents, test_labels = load_corpus('./dataset/test.txt', word2id, max_sen_len=50)
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)

    # Load the trained model
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()
    model.to(device)

    # Testing loop
    count, correct = 0, 0
    with torch.no_grad():  # no gradients needed for inference
        for batch_x, batch_y in test_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            correct += (output.argmax(1) == batch_y).sum().item()
            count += len(batch_x)

    # Report accuracy
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
def test():
    # Configuration
    cf = Config('./config.yaml')
    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test data (no need to shuffle at test time)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)
    # Pre-trained embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Use map_location='cpu' when loading on a CPU-only machine:
    # model.load_state_dict(torch.load("./output/model.bin", map_location='cpu'))
    model.load_state_dict(torch.load("./output/model.bin"))
    # Move the model to the target device
    model.to(device)
    # Parallelize across GPUs if more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Testing
    start_time = time.time()
    data_len = len(test_dataloader)

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for step, batch in enumerate(tqdm(test_dataloader, "batch", total=len(test_dataloader))):
        label_id = batch['label_id'].squeeze(1).to(device)
        segment_ids = batch['segment_ids'].to(device)
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=get_labels('./data/label')))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()  # call train() before training the model
    else:
        model.eval()   # call eval() before testing: it freezes BatchNorm and Dropout
                       # so they use the learned values instead of batch averages
    # train() and eval() exist because some layers behave differently during
    # training and evaluation, e.g. Batch Normalization and Dropout.
    # BN normalizes each intermediate layer of the network and uses a learned
    # transform to reconstruct the activations, so the extracted feature
    # distribution is not destroyed; since all parameters are fixed after
    # training, BN behaves differently at training and test time.
    # Dropout combats overfitting: by ignoring half of the feature detectors
    # in each training batch, it noticeably reduces overfitting, and is
    # disabled at test time.
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
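A minimal, self-contained sketch (independent of the TextCNN code above) demonstrating the train()/eval() difference described in the comments: in training mode Dropout zeroes random elements and rescales the rest, while in eval mode it is the identity.

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()    # training mode: roughly half the elements are zeroed and the
print(drop(x))  # survivors are scaled by 1/(1-p) = 2 to preserve the expectation

drop.eval()     # evaluation mode: Dropout is the identity function
print(drop(x))  # prints all ones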
def objective(trial):
    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = []
        for batch in train_iter:
            text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()
        # print(f'Epoch[{epoch}] - Loss: {sum(epoch_loss) / len(epoch_loss)}')

        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)

        acc = metrics.accuracy_score(labels_all, predict_all)
        trial.report(acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc
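For context, a typical way to drive this objective with Optuna; the pruner choice and trial count here are illustrative assumptions, not taken from the original code.

import optuna

# Maximize validation accuracy; MedianPruner cooperates with the
# trial.report()/trial.should_prune() calls inside objective().
study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)  # n_trials is an illustrative choice

print("Best accuracy:", study.best_value)
print("Best hyperparameters:", study.best_params)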
output = None
dl_output = None
ml_output = None

FEATURE_LABEL = [
    "PROJECT_NAME", "BUSINESS_UNIT", "REGION_ID", "REP_OFFICE_ID",
    "CUSTOMER_ID", "PROJECT_LEVEL_NAME", "BUSINESS_GROUP_NAME",
    "DELIVERY_TYPE", "PROJECT_LABEL"
]

# Deep Learning
if args.snapshot is not None:
    net.load_state_dict(torch.load(args.snapshot))
    net.eval()
    feature = []
    for label in FEATURE_LABEL:
        text = getattr(args, label)
        text = text_fields.preprocess(text)
        text = [[text_fields.vocab.stoi[x] for x in text]]
        x = text_fields.tensor_type(text)
        feature.append(x)
    # torch.no_grad() replaces the deprecated Variable(..., volatile=True)
    with torch.no_grad():
        dl_output = net(feature).int().squeeze(0).tolist()

# Machine Learning
if args.machine_learning_model is not None:
    classifiers = np.load(args.machine_learning_model)
        loss = criterion(target, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # .item() replaces the deprecated loss.data[0]
        current_count += sample_batched['data'].size()[0]
        sys.stdout.write('epoch {0} / {1}: {2} / {3}\r'.format(
            epoch, nb_epoch, current_count, len(dataset_train)))
    sys.stdout.write('epoch {0} / {1}: {2} / {3}\n'.format(
        epoch, nb_epoch, current_count, len(dataset_train)))

    # Compute the dev-set loss
    text_cnn.eval()
    for i_batch, sample_batched in enumerate(data_loader_dev):
        data = Variable(sample_batched['data'])
        label = Variable(sample_batched['label'])
        if use_cuda:
            data = data.cuda()
            label = label.cuda()
        pred = text_cnn(data)
        loss = criterion(pred, label)
        dev_loss += loss.item()

    total_loss /= float(len(data_loader_train))
    dev_loss /= float(len(data_loader_dev))
    print('\ttrain loss: {:.4f}, dev loss: {:.4f}'.format(total_loss, dev_loss))
from torch_config import EMBEDDINGS_DIR

app = Sanic('PyTorch API')

embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')
model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)
device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)

@app.post('/game')
async def game(request: Request):
    q = request.form.get('q', None)
    if q is None:
        return HTTPResponse(status=400)
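The handler is cut off after the 400 guard. A plausible continuation under the same conventions; the response shape and the 0.5 decision threshold are assumptions, not from the original.

    # Hypothetical continuation of the handler above: preprocess the query,
    # score it without gradients, and return JSON.
    # Assumes `from sanic.response import json` among the elided imports.
    x = text_processing.process(q)
    x = torch.tensor(x).unsqueeze(dim=0)
    with torch.no_grad():
        score = model(x).squeeze().item()
    return json({'q': q, 'score': score, 'match': score > 0.5})  # threshold is an assumption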
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set; computing the second size as the
    # remainder guarantees the sizes sum to len(dataset) for any corpus size,
    # which fixes the splitting sometimes failing when the corpus changes
    train_len = int(len(dataset) * DATA_SPLIT)
    train_set, test_set = torch.utils.data.random_split(
        dataset, [train_len, len(dataset) - train_len])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional layer
    N_FILTERS = 64
    # sizes and number of convolutional layers
    FILTER_SIZES = [2, 3]
    # dropout between conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12
    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]  # confusion matrix [[TN, FP], [FN, TP]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0
                    m[y][y_pred] += 1
                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

        acc = correct / (correct + wrong)
        if acc > best_acc:
            best_acc = acc
            # the glob must match the filenames saved below, otherwise stale
            # checkpoints are never removed (the original globbed model_*.pth)
            for file in glob.glob('models/state_*.pth'):
                os.remove(file)
            torch.save(model.state_dict(), f'models/state_{epoch}.pth')

        print()
        print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
        print('[[TN, FP], [FN, TP]]')
        print(m)
        print()

    # put into evaluation mode
    model.eval()
    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
    print('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f'
          % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
             domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))
    logging.info('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f'
                 % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
                    domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))

    # avoid shadowing the builtin dir()
    checkpoint_path = 'checkpoint/WithoutImage_' + str(epoch + 1) + '.pkl'
    torch.save(model.state_dict(), checkpoint_path)

    # test
    model = TextCNN(args, W)
    model.load_state_dict(torch.load(checkpoint_path))
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    # np.float was removed in recent NumPy; use the builtin float
    test_sub = np.zeros((len(label_df['id']), 3), dtype=float)
    batch = len(label_df['id']) // args.batch_size
    for i, (test_data, event_labels) in enumerate(test_loader):
        test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])
        test_text = test_text.long()
        test_mask = test_mask.float()
        test_outputs, domain_outputs = model(test_text, test_mask)
        if i != batch:
            test_sub[i * args.batch_size:(i + 1) * args.batch_size, :] = to_np(test_outputs)
        else:
            test_sub[i * args.batch_size:len(test_df['id']), :] = to_np(test_outputs)
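This snippet (and the similar one further below) relies on to_var and to_np helpers that are not shown. A minimal sketch of what such helpers typically look like in this style of codebase; these are assumptions, not the original definitions.

import torch

def to_var(x):
    # move a tensor to the GPU when one is available; the old Variable
    # wrapper is a no-op in modern PyTorch, so plain tensors suffice
    if torch.cuda.is_available():
        x = x.cuda()
    return x

def to_np(x):
    # detach from the autograd graph and copy back to a NumPy array on the CPU
    return x.detach().cpu().numpy()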
def train():
    # Configuration
    cf = Config('./config.yaml')
    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # Test data (used here as the validation set)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)
    # Pre-trained embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer over the trainable parameters
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))
    # Move the model to the target device
    model.to(device)
    # Parallelize across GPUs if more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Training
    start_time = time.time()
    total_batch = 0              # total number of batches seen
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch index of the last improvement
    require_improvement = 1000   # stop early after 1000 batches without improvement
    flag = False

    model.train()
    for epoch_id in trange(cf.epoch, desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, "batch", total=len(train_dataloader))):
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss = model(segment_ids, label_id)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_batch += 1

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = model.get_loss_acc(segment_ids, label_id)
                    loss_val, acc_val = evaluate(model, test_dataloader, device)
                if acc_val > best_acc_val:
                    # save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val,
                                 acc_val, time_dif, improved_str))
                model.train()

            if total_batch - last_improved > require_improvement:
                print("No improvement for too long; stopping early")
                flag = True
                break
        if flag:
            break
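evaluate() is referenced above but not shown. A plausible sketch consistent with how the model is used in the training loop; it assumes the model exposes the same get_loss_acc(segment_ids, label_id) method and the same batch dict keys, which are assumptions rather than the original implementation.

def evaluate(model, dataloader, device):
    # average loss and accuracy over one full pass of the dataloader
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss, acc = model.get_loss_acc(segment_ids, label_id)
            total_loss += loss
            total_acc += acc
            n_batches += 1
    return total_loss / n_batches, total_acc / n_batches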
# apply() is not in-place, so assign the cleaned text back to the column
label_df['comment_all'] = label_df.comment_all.apply(
    lambda x: ' '.join(jieba.cut(cut_sub(x).lower())))

text_train = pd.DataFrame(train_df)
text_test = pd.DataFrame(test_df)
train_data, test_data, label_data, W = load_weight(args, text_train, text_test, label_df)

# Build the test dataset (text only, no images)
test_data = DatasetWithoutImg(label_data, mode='test_images')
test_loader = DataLoader(dataset=test_data, batch_size=args.batch_size, shuffle=False)

# test
# NOTE: the original called eval() on the TextCNN class itself; this must be a
# trained model instance (e.g. loaded as in the previous snippet)
model.eval()
test_sub = np.zeros((len(label_df['id']), 3), dtype=float)  # np.float is removed in recent NumPy
batch = len(label_df['id']) // args.batch_size
for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])
    test_text = test_text.long()
    test_mask = test_mask.float()
    test_outputs, domain_outputs = model(test_text, test_mask)
    if i != batch:
        test_sub[i * args.batch_size:(i + 1) * args.batch_size, :] = to_np(test_outputs)
    else:
        test_sub[i * args.batch_size:len(test_df['id']), :] = to_np(test_outputs)
class Trainer:
    def __init__(self, config):
        self.config = config
        self.train_data_loader = None
        self.eval_data_loader = None

        # Load the datasets
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = self.train_data_loader.gen_data()

        self.vocab_size = self.train_data_loader.vocab_size
        self.word_vectors = self.train_data_loader.word_vectors
        print(f"train data size: {len(self.train_labels)}")
        print(f"vocab size: {self.vocab_size}")

        self.label_list = [value for key, value in label_to_idx.items()]

        self.eval_inputs, self.eval_labels = self.eval_data_loader.gen_data()

        # Initialize the model
        self.model = TextCNN(config=self.config,
                             vocab_size=self.vocab_size,
                             word_vectors=self.word_vectors)

    def load_data(self):
        """Load the train and eval datasets."""
        self.train_data_loader = TrainData(self.config)
        # Use the validation set for testing during training
        self.config.test_data = self.config.eval_data
        self.eval_data_loader = TestData(self.config)

    def train(self):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9, allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())  # initialize the variables
            current_step = 0

            # Create the train/eval summary paths and writers
            train_summary_path = os.path.join(self.config.BASE_DIR,
                                              self.config.summary_path + "/train")
            if not os.path.exists(train_summary_path):
                os.makedirs(train_summary_path)
            train_summary_writer = tf.summary.FileWriter(train_summary_path, sess.graph)

            eval_summary_path = os.path.join(self.config.BASE_DIR,
                                             self.config.summary_path + "/eval")
            if not os.path.exists(eval_summary_path):
                os.makedirs(eval_summary_path)
            eval_summary_writer = tf.summary.FileWriter(eval_summary_path, sess.graph)

            # Train & eval process
            for epoch in range(self.config.epochs):
                print(f"----- Epoch {epoch + 1}/{self.config.epochs} -----")

                for batch in self.train_data_loader.next_batch(self.train_inputs,
                                                               self.train_labels,
                                                               self.config.batch_size):
                    summary, loss, predictions = self.model.train(sess, batch, self.config.keep_prob)
                    train_summary_writer.add_summary(summary)

                    if self.config.num_classes == 1:
                        acc = get_binary_metrics(pred_y=predictions.tolist(), true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(current_step, acc))
                    elif self.config.num_classes > 1:
                        acc = get_multi_metrics(pred_y=predictions.tolist(), true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(current_step, acc))

                    current_step += 1
                    if self.eval_data_loader and current_step % self.config.ckeckpoint_every == 0:
                        eval_losses = []
                        eval_accs = []
                        for eval_batch in self.eval_data_loader.next_batch(self.eval_inputs,
                                                                           self.eval_labels,
                                                                           self.config.batch_size):
                            eval_summary, eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
                            eval_summary_writer.add_summary(eval_summary)
                            eval_losses.append(eval_loss)

                            # compare against the eval batch's labels, not the
                            # last training batch's (bug in the original)
                            if self.config.num_classes == 1:
                                acc = get_binary_metrics(pred_y=eval_predictions.tolist(),
                                                         true_y=eval_batch['y'])
                                eval_accs.append(acc)
                            elif self.config.num_classes > 1:
                                acc = get_multi_metrics(pred_y=eval_predictions.tolist(),
                                                        true_y=eval_batch['y'])
                                eval_accs.append(acc)
                        print(f"Eval \tloss: {list_mean(eval_losses)}, acc: {list_mean(eval_accs)}")

                        if self.config.ckpt_model_path:
                            save_path = os.path.join(self.config.BASE_DIR, self.config.ckpt_model_path)
                            if not os.path.exists(save_path):
                                os.makedirs(save_path)
                            model_save_path = os.path.join(save_path, self.config.model_name)
                            self.model.saver.save(sess, model_save_path, global_step=current_step)
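The trainer depends on get_binary_metrics, get_multi_metrics, and list_mean helpers that are not shown. A minimal sketch of definitions consistent with how they are called above; these are assumptions, not the original implementations, which may compute richer metrics.

from sklearn import metrics

def get_binary_metrics(pred_y, true_y):
    # accuracy for binary classification; the trainer only prints accuracy
    return metrics.accuracy_score(true_y, pred_y)

def get_multi_metrics(pred_y, true_y):
    # accuracy for multi-class classification
    return metrics.accuracy_score(true_y, pred_y)

def list_mean(values):
    # arithmetic mean of a list of floats
    return sum(values) / len(values) if values else 0.0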