class RnnModel:
    def __init__(self):
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab(vocab_file)
        self.model = TextRNN()
        self.model.load_state_dict(torch.load('model_params.pkl'))

    def predict(self, message):
        content = message
        data = [self.word_to_id[x] for x in content if x in self.word_to_id]
        data = kr.preprocessing.sequence.pad_sequences([data], 600)
        data = torch.LongTensor(data)
        y_pred_cls = self.model(data)
        class_index = torch.argmax(y_pred_cls[0]).item()
        return self.categories[class_index]
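# Usage sketch (an illustration, not part of the original snippet): the predictor takes
# a raw text string and returns a category name; read_category(), read_vocab(),
# vocab_file and 'model_params.pkl' are assumed to exist as in the class above.
if __name__ == '__main__':
    rnn_model = RnnModel()
    print(rnn_model.predict('example news text to classify'))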
def train(args):
    train_iter, dev_iter = data_processor.load_data(args)  # split the data into training and validation sets
    print('Data loading finished')
    model = TextRNN(args)
    if args.cuda:
        model.cuda()
    """
    Q5: Please give optimizer here
    """
    optimizer = torch.optim.Adam(model.parameters())

    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epoch + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            # t_() transposes (max_len, batch_size) into (batch_size, max_len)
            with torch.no_grad():
                # feature.t_()
                target.sub_(1)  # shift labels from 1-based to 0-based
            # print(feature.shape)
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            logits = model(feature)
            # print(logits.shape)
            loss = F.cross_entropy(logits, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                # torch.max(logits, 1) returns, for each row, the maximum value and its column index
                corrects = (torch.max(logits, 1)[1] == target).sum()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, loss.item(), train_acc, corrects, batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(
                            args.early_stopping, best_acc))
                        raise KeyboardInterrupt
def train():
    model = TextRNN().to(device)
    # loss function
    Loss = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # keep track of the best validation accuracy, starting from 0
    best_val_acc = 0
    for epoch in range(10):
        # print('epoch=', epoch)
        # train in mini-batches
        accuracy_array0 = np.array([])
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x = x_batch.to(device)
            y = y_batch.to(device)
            out = model(x)
            loss = Loss(out, y)
            # print(out)
            # print('loss=', loss)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            accuracy0 = np.mean(
                (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
            accuracy_array0 = np.append(accuracy_array0, accuracy0)
        accuracy_train = np.mean(accuracy_array0)
        print('accuracy_train:', accuracy_train)
        # validate the model
        if (epoch + 1) % 5 == 0:
            accuracy_array1 = np.array([])
            for step, (x_batch, y_batch) in enumerate(val_loader):
                x = x_batch.to(device)
                y = y_batch.to(device)
                out = model(x)
                # compute batch accuracy
                accuracy1 = np.mean(
                    (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
                accuracy_array1 = np.append(accuracy_array1, accuracy1)
                if accuracy1 > best_val_acc:
                    torch.save(model, 'model.pkl')
                    best_val_acc = accuracy1
                    print('model.pkl saved')
            accuracy_test = np.mean(accuracy_array1)
            print('accuracy_test:', accuracy_test)
def train(args):
    train_iter, dev_iter = data_processor.load_data(args)  # split the data into training and validation sets
    print('Data loading finished')
    model = TextRNN(args)
    Cuda = torch.cuda.is_available()
    if Cuda and args.cuda:
        model.cuda()
    """
    Q5: Please give optimizer here
        Add lr_scheduler to adjust learning rate.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.8)

    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epoch + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            # t_() transposes (max_len, batch_size) into (batch_size, max_len)
            with torch.no_grad():
                feature.t_(), target.sub_(1)  # shift labels from 1-based to 0-based
            if args.cuda and Cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            logits = model(feature)
            loss = F.cross_entropy(logits, target)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                # torch.max(logits, 1) returns, for each row, the maximum value and its column index
                corrects = (torch.max(logits, 1)[1] == target).sum()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, loss.item(), train_acc, corrects, batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
                    scheduler.step()
                    print('lr decayed to {}'.format(
                        optimizer.state_dict()['param_groups'][0]['lr']))
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(
                            args.early_stopping, best_acc))
                        raise KeyboardInterrupt
def main():
    reviews_ints, labels, features, word_int_dict = data_processing(300)
    train_data, test_data, train_label, test_label = split_train_test(features, labels, 0.1)
    textrnn = TextRNN(300 * len(train_data), embed_size, hidden_size, 1)
    criterion = nn.CrossEntropyLoss()
    optimizer = t.optim.Adam(textrnn.parameters(), lr=0.01)
    process_bar = len(train_data) // batch_size + 1
    # print('process_bar:', process_bar)
    for epoch in range(num_epochs):
        # h0 = [num_layers(1) * num_directions(1), batch_size, hidden_size]
        # h0 = h0.to(device)  # 1*200*256
        # print(type(h0))
        for i in range(process_bar):
            x = train_data[batch_size * i:batch_size * (i + 1)]
            y = train_label[batch_size * i:batch_size * (i + 1)]
            # x = [batch_size * seq_length]
            x = t.LongTensor(x)
            y = t.LongTensor(y)
            # In the forward pass below, the input x = [batch_size, seq_length, embed_size],
            # h0 = [batch_size, num_layers(1) * num_directions(1), hidden_size];
            # the outputs are output = [batch_size, seq_length, output_dim(num_directions * hidden_size)]
            # and ht = [batch_size, num_layers * num_directions, hidden_size]
            output = textrnn(x)
            # print(output.size())
            # print(y.size())
            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(str(datetime.datetime.now()) + '||epoch ' + str(epoch + 1) +
                  '||step ' + str(i + 1) + ' | loss is: ' + str(loss.item()))
            if i % 5 == 0:
                # h0 = t.zeros(num_layers, len(test_data), hidden_size)
                test = t.LongTensor(test_data)
                # test = test.transpose(0, 1)
                # test_label = t.LongTensor(test_label)
                output = textrnn(test)
                pre_y = t.max(output, dim=1)[1].data.numpy().squeeze()
                print(len(pre_y))
                acc = sum(pre_y == test_label) / len(test_label)
                print('acc:', acc)
def __init__(self):
    self.config = TRNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words, self.word_to_id = read_vocab(vocab_dir)
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
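# A hedged sketch of a matching predict() method for this TF-session wrapper. It assumes
# (not shown in the snippet) that the TextRNN graph exposes input_x, keep_prob and
# y_pred_cls tensors, that self.config defines seq_length, and that kr refers to
# tensorflow.contrib.keras, as in the PyTorch predictor earlier in this section.
def predict(self, message):
    data = [self.word_to_id[x] for x in message if x in self.word_to_id]
    feed_dict = {
        self.model.input_x: kr.preprocessing.sequence.pad_sequences(
            [data], self.config.seq_length),
        self.model.keep_prob: 1.0  # disable dropout at inference time
    }
    y_pred_cls = self.session.run(self.model.y_pred_cls, feed_dict=feed_dict)
    return self.categories[y_pred_cls[0]]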
def train_TextRNN():
    model = TextRNN(TextRNNConfig)
    loss = CrossEntropyLoss(pred="pred", target="target")
    metrics = AccuracyMetric(pred="pred", target="target")
    trainer = Trainer(model=model,
                      train_data=dataset_train,
                      dev_data=dataset_dev,
                      loss=loss,
                      metrics=metrics,
                      batch_size=16,
                      n_epochs=20)
    trainer.train()
    tester = Tester(dataset_test, model, metrics)
    tester.test()
def train(lr, train_loader, test_dataset):
    model = TextRNN().cuda()
    loss_fn = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_acc = 0
    for epoch in range(train_epochs):
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x, y = x_batch.cuda(), y_batch.cuda()
            # forward pass
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            acc = np.mean(
                (torch.argmax(y_pred, 1) == torch.argmax(y, 1)).cpu().numpy())
        print('Training epoch {:}, loss = {:}, acc = {:}'.format(
            epoch + 1, loss.item(), acc))
        if (epoch + 1) % 5 == 0:
            for step, (x_batch, y_batch) in enumerate(test_loader):
                x, y = x_batch.cuda(), y_batch.cuda()
                # forward pass
                y_pred = model(x)
                acc = np.mean(
                    (torch.argmax(y_pred, 1) == torch.argmax(y, 1)).cpu().numpy())
                # print('Test acc = {:}'.format(acc))
                if acc > best_acc:
                    best_acc = acc
                    torch.save(model.state_dict(), 'model_params.pkl')
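# The state dict saved above can later be restored for inference; a minimal sketch,
# assuming the same TextRNN definition is importable (the function name is illustrative):
def load_trained_model():
    model = TextRNN().cuda()
    model.load_state_dict(torch.load('model_params.pkl'))
    model.eval()  # switch off dropout / batch-norm updates for inference
    return model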
def train(x_train, y_train, vocab_processor, x_dev, y_dev):
    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            if FLAGS.model == "cnn":
                print("Begin to train model with cnn")
                nn = TextCNN(sequence_length=x_train.shape[1],
                             num_classes=y_train.shape[1],
                             vocab_size=len(vocab_processor.vocabulary_),
                             embedding_size=FLAGS.embedding_dim,
                             filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda)
            else:
                print("Begin to train model with rnn")
                nn = TextRNN(sequence_length=x_train.shape[1],
                             num_classes=y_train.shape[1],
                             vocab_size=len(vocab_processor.vocabulary_),
                             lstm_size=FLAGS.lstm_size,
                             embedding_size=FLAGS.embedding_dim,
                             num_layers=FLAGS.num_layers,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             attn_size=FLAGS.attn_size)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(nn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", nn.loss)
            acc_summary = tf.summary.scalar("accuracy", nn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists, so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    nn.input_x: x_batch,
                    nn.input_y: y_batch,
                    nn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, nn.loss, nn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    nn.input_x: x_batch,
                    nn.input_y: y_batch,
                    nn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, nn.loss, nn.accuracy], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.train_batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
def train():
    word_dict = load_vocab(FLAGS.vocab_data)
    glove = load_glove("../glove.6B.{}d.txt".format(FLAGS.embedding_size),
                       FLAGS.embedding_size, word_dict)
    train = Dataset(filepath=FLAGS.train_data,
                    num_class=FLAGS.num_class,
                    sequence_length=FLAGS.sequence_length)
    valid = Dataset(filepath=FLAGS.valid_data,
                    num_class=FLAGS.num_class,
                    sequence_length=FLAGS.sequence_length)

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rnn = TextRNN(vocab_size=len(word_dict),
                          embedding_size=FLAGS.embedding_size,
                          sequence_length=FLAGS.sequence_length,
                          num_class=FLAGS.num_class,
                          cell_type=FLAGS.cell_type,
                          hidden_size=FLAGS.hidden_size,
                          pretrained_embeddings=glove,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.compat.v1.Variable(0, name="global_step", trainable=False)
            train_op = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rnn.loss, global_step=global_step)
            acc, acc_op = tf.compat.v1.metrics.accuracy(labels=rnn.labels,
                                                        predictions=rnn.predictions,
                                                        name="metrics/acc")
            metrics_vars = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.LOCAL_VARIABLES, scope="metrics")
            metrics_init_op = tf.compat.v1.variables_initializer(var_list=metrics_vars)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.compat.v1.summary.scalar("loss", rnn.loss)
            acc_summary = tf.compat.v1.summary.scalar("accuracy", rnn.accuracy)

            # Train summaries
            train_summary_op = tf.compat.v1.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.compat.v1.summary.FileWriter(train_summary_dir, sess.graph)

            # Valid summaries
            valid_step = 0
            valid_summary_op = tf.compat.v1.summary.merge([loss_summary, acc_summary])
            valid_summary_dir = os.path.join(out_dir, "summaries", "valid")
            valid_summary_writer = tf.compat.v1.summary.FileWriter(valid_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists, so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(),
                                             max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables
            best_valid_acc = 0.0
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.local_variables_initializer())

            # Training and validating loop
            for epoch in range(FLAGS.num_epoch):
                print('-' * 100)
                print('\n{}> epoch: {}\n'.format(datetime.datetime.now().isoformat(), epoch))
                sess.run(metrics_init_op)

                # Training process
                for batch in train.bacth_iter(FLAGS.batch_size, desc="Training", shuffle=True):
                    labels, docs = zip(*batch)
                    padded_docs, _, masks = vectorize(docs, FLAGS.sequence_length)
                    feed_dict = {
                        rnn.inputs: padded_docs,
                        rnn.labels: labels,
                        rnn.masks: masks,
                        rnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy, _ = sess.run(
                        [train_op, global_step, train_summary_op, rnn.loss, rnn.accuracy, acc_op],
                        feed_dict)
                    train_summary_writer.add_summary(summaries, step)
                print("\ntraining accuracy = {:.2f}\n".format(sess.run(acc) * 100))

                sess.run(metrics_init_op)
                # Validating process
                for batch in valid.bacth_iter(FLAGS.batch_size, desc="Validating", shuffle=False):
                    valid_step += 1
                    labels, docs = zip(*batch)
                    padded_docs, _, masks = vectorize(docs, FLAGS.sequence_length)
                    feed_dict = {
                        rnn.inputs: padded_docs,
                        rnn.labels: labels,
                        rnn.masks: masks,
                        rnn.dropout_keep_prob: 1.0
                    }
                    summaries, loss, accuracy, _ = sess.run(
                        [valid_summary_op, rnn.loss, rnn.accuracy, acc_op], feed_dict)
                    valid_summary_writer.add_summary(summaries, global_step=valid_step)
                valid_acc = sess.run(acc) * 100
                print("\nvalidating accuracy = {:.2f}\n".format(valid_acc))

                # Model checkpoint
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc
                    print("current best validating accuracy = {:.2f}\n".format(best_valid_acc))
                    path = saver.save(sess, checkpoint_prefix)
                    print("saved model checkpoint to {}\n".format(path))

            print("{} optimization finished!\n".format(datetime.datetime.now()))
            print("best validating accuracy = {:.2f}\n".format(best_valid_acc))
vocab_dir = os.path.join(base_dir, 'vocab.txt')
save_dir = os.path.join(base_dir, train_ratio + '/checkpoints/textrnn')
save_path = os.path.join(save_dir, 'best_validation')  # path for saving the best validation result
window_size = int(window_size)
train_ratio = float(train_ratio)

print('Configuring RNN model...')
print('Building vocab if not exists.')
start_time_vocab = time.time()
config = TRNNConfig()
if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    build_vocab(train_data_dir, vocab_dir)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
config.vocab_size = len(words)
model = TextRNN(config)
time_dif_vocab = get_time_dif(start_time_vocab)
print("Time usage:", time_dif_vocab)

# read the raw data and split it into the three sets
print("Processing and loading training and validation data...")
start_time = time.time()
x_train, x_val, x_test, y_train, y_val, y_test = process_all_file(
    train_data_dir, eval_data_dir, train_ratio, word_to_id, cat_to_id,
    config.seq_length, window_size)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

print('==========Training==========')
start_time_train = time.time()
train()
def train():
    model = TextRNN().to(device)
    # loss function
    Loss = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # keep track of the best validation accuracy, starting from 0
    best_val_acc = 0
    costs = []
    early_stop = 0
    min_loss = float('inf')
    for epoch in range(5):
        # print('epoch=', epoch)
        # train in mini-batches
        losses = []
        accuracy_array0 = np.array([])
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x = x_batch.to(device)
            y = y_batch.to(device)
            out = model(x)
            loss = Loss(out, y)
            losses.append(loss.item())
            # print(out)
            # print('loss=', loss)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            accuracy0 = np.mean(
                (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
            accuracy_array0 = np.append(accuracy_array0, accuracy0)
        meanloss = np.mean(losses)
        costs.append(meanloss)
        # validate the model
        if (epoch + 1) % 5 == 0:
            accuracy_train = np.mean(accuracy_array0)
            print('accuracy_train:', accuracy_train)
            accuracy_array1 = np.array([])
            for step, (x_batch, y_batch) in enumerate(val_loader):
                x = x_batch.to(device)
                y = y_batch.to(device)
                out = model(x)
                # compute batch accuracy
                accuracy1 = np.mean(
                    (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
                accuracy_array1 = np.append(accuracy_array1, accuracy1)
                if accuracy1 > best_val_acc:
                    torch.save(model, 'model.pkl')
                    best_val_acc = accuracy1
                    print('model.pkl saved')
            accuracy_test = np.mean(accuracy_array1)
            print('accuracy_test:', accuracy_test)
        # early stopping
        if meanloss < min_loss:
            min_loss = meanloss
            early_stop = 0
        else:
            early_stop += 1
        if early_stop > 5:
            print(f"Loss has not decreased for {early_stop} consecutive epochs, stopping early")
            break
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_loader.dataset), confusion
    return acc, loss_total / len(data_loader.dataset)


EPOCH = 30
batch_size = 32
best_epoch, best_acc = 0, 0
# filename for saving the best model
file_name = 'cnews_best.pt'
train_data = textData(train=True)
val_data = textData(val=True)
test_data = textData()
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
model = TextRNN()
# loss function: cross entropy
criterion = nn.CrossEntropyLoss()
# optimizer: Adam
optimizer = optim.Adam(model.parameters(), lr=0.001)
# device: GPU or CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# training
for epoch in range(EPOCH):
    start_time = time.time()
    for i, data in enumerate(train_loader):
        model.train()
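# The loop above is cut off after model.train(); a minimal sketch of how the inner loop
# might continue, assuming each batch yields (inputs, labels) and that an
# evaluate(model, data_loader) helper like the fragment at the top of this snippet
# returns (acc, loss). All names introduced here are illustrative only.
#
#         inputs, labels = data[0].to(device), data[1].to(device)
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#     val_acc, val_loss = evaluate(model, val_loader)
#     if val_acc > best_acc:
#         best_acc, best_epoch = val_acc, epoch
#         torch.save(model.state_dict(), file_name)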
def main(args): print "loadding reviews and labels from dataset" data = pd.read_csv('data/labeledTrainData.tsv.zip', compression='zip', delimiter='\t', header=0, quoting=3) reviews = data["review"] labels = list(data['sentiment']) sentences = [] for review in reviews: if len(review) > 0: sentences.append( utils.review_to_wordlist(review.decode('utf8').strip(), remove_stopwords=True)) print "loaded %d reviews from dataset" % len(sentences) word_dict = utils.build_vocab(sentences, max_words=10000) vec_reviews = utils.vectorize(sentences, word_dict, verbose=True) train_x = vec_reviews[0:20000] train_y = labels[0:20000] train_y = utils.one_hot(train_y, args.nb_classes) test_x = vec_reviews[20000:] test_y = labels[20000:] test_y = utils.one_hot(test_y, args.nb_classes) save_dir = args.save_dir log_dir = args.log_dir if not os.path.exists(save_dir): os.makedirs(save_dir) if not os.path.exists(log_dir): os.makedirs(log_dir) with tf.Graph().as_default(): config_proto = utils.get_config_proto() sess = tf.Session(config=config_proto) if args.model_type == "cnn": model = TextCNN(args, "TextCNN") test_batch = utils.get_batches(test_x, test_y, args.max_size) elif args.model_type in ["rnn", "bi_rnn"]: model = TextRNN(args, "TextRNN") test_batch = utils.get_batches(test_x, test_y, args.max_size, type="rnn") sess.run(tf.global_variables_initializer()) summary_writer = tf.summary.FileWriter(log_dir, sess.graph) for epoch in range(1, args.nb_epochs + 1): print "epoch %d start" % epoch print "- " * 50 loss = 0. total_reviews = 0 accuracy = 0. if args.model_type == "cnn": train_batch = utils.get_batches(train_x, train_y, args.batch_size) elif args.model_type in ["rnn", "bi_rnn"]: train_batch = utils.get_batches(train_x, train_y, args.batch_size, type="rnn") epoch_start_time = time.time() step_start_time = epoch_start_time for idx, batch in enumerate(train_batch): reviews, reviews_length, labels = batch _, loss_t, accuracy_t, global_step, batch_size, summaries = model.train( sess, reviews, reviews_length, labels, args.keep_prob) loss += loss_t * batch_size total_reviews += batch_size accuracy += accuracy_t * batch_size summary_writer.add_summary(summaries, global_step) if global_step % 50 == 0: print "epoch %d, step %d, loss %f, accuracy %.4f, time %.2fs" % \ (epoch, global_step, loss_t, accuracy_t, time.time() - step_start_time) step_start_time = time.time() epoch_time = time.time() - epoch_start_time print "%.2f seconds in this epoch" % (epoch_time) print "train loss %f, train accuracy %.4f" % ( loss / total_reviews, accuracy / total_reviews) total_reviews = 0 accuracy = 0. for batch in test_batch: reviews, reviews_length, labels = batch accuracy_t, batch_size = model.test(sess, reviews, reviews_length, labels, 1.0) total_reviews += batch_size accuracy += accuracy_t * batch_size print "accuracy %.4f in %d test reviews" % ( accuracy / total_reviews, total_reviews)
def test():
    # configuration file
    cf = Config('./config.yaml')
    # use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)
    # pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextRNN(cf, torch.tensor(embedding_matrix))
    # model.load_state_dict(torch.load("./output/model.bin", map_location='cpu'))
    model.load_state_dict(torch.load("./output/model.bin"))
    # move the model to the chosen device
    model.to(device)
    # parallelize the model across GPUs if possible
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # evaluation
    start_time = time.time()
    data_len = len(test_dataloader)
    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    # for step, batch in enumerate(tqdm(test_dataloader, "batch", total=len(test_dataloader))):
    for step, batch in enumerate(test_dataloader):
        label_id = batch['label_id'].squeeze(1).to(device)
        seq_len = batch["seq_len"].to(device)
        segment_ids = batch['segment_ids'].to(device)
        # sort the sequences in the batch by length, descending
        seq_len, perm_idx = seq_len.sort(0, descending=True)
        label_id = label_id[perm_idx]
        segment_ids = segment_ids[perm_idx].transpose(0, 1)
        with torch.no_grad():
            pred = model.get_labels(segment_ids, seq_len)
            y_pred = np.hstack((y_pred, pred))
            y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred,
                                        target_names=get_labels('./data/label')))
    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
def train():
    # configuration file
    cf = Config('./config.yaml')
    # use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)
    # pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextRNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))
    # move the model to the chosen device
    model.to(device)
    # parallelize the model across GPUs if possible
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # training
    start_time = time.time()
    total_batch = 0               # total number of batches seen
    best_acc_val = 0.0            # best validation accuracy so far
    last_improved = 0             # batch at which the last improvement happened
    require_improvement = 1000    # stop early after 1000 batches without improvement
    flag = False
    model.train()
    for epoch_id in trange(cf.epoch, desc="Epoch"):
        # for step, batch in enumerate(tqdm(train_dataloader, "batch", total=len(train_dataloader))):
        for step, batch in enumerate(train_dataloader):
            label_id = batch['label_id'].squeeze(1).to(device)
            seq_len = batch["seq_len"].to(device)
            segment_ids = batch['segment_ids'].to(device)
            # sort the sequences in the batch by length, descending
            seq_len, perm_idx = seq_len.sort(0, descending=True)
            label_id = label_id[perm_idx]
            segment_ids = segment_ids[perm_idx].transpose(0, 1)

            loss = model(segment_ids, seq_len, label_id)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1
            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = model.get_loss_acc(segment_ids, seq_len, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)
                if acc_val > best_acc_val:
                    # save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train,
                                 loss_val, acc_val, time_dif, improved_str))
                model.train()
            if total_batch - last_improved > require_improvement:
                print("No improvement for a long time, stopping early")
                flag = True
                break
        if flag:
            break
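# The evaluate() helper called above is not shown; a hedged sketch of what it might look
# like, assuming batches carry the same label_id / seq_len / segment_ids fields and that
# model.get_loss_acc() returns scalar (loss, acc) values as in the training loop:
def evaluate(model, data_loader, device):
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in data_loader:
            label_id = batch['label_id'].squeeze(1).to(device)
            seq_len = batch["seq_len"].to(device)
            segment_ids = batch['segment_ids'].to(device)
            # sort the sequences in the batch by length, descending
            seq_len, perm_idx = seq_len.sort(0, descending=True)
            label_id = label_id[perm_idx]
            segment_ids = segment_ids[perm_idx].transpose(0, 1)
            loss, acc = model.get_loss_acc(segment_ids, seq_len, label_id)
            total_loss += float(loss)
            total_acc += float(acc)
            n_batches += 1
    # return (loss, acc) to match the unpacking order used in train()
    return total_loss / n_batches, total_acc / n_batches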
    # evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    print('Configuring RNN model...')
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, args.VOCAB_SIZE)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    args.VOCAB_SIZE = len(words)
    model = TextRNN(args)

    if args.DO_TRAIN:
        train()
    if args.DO_TEST:
        test()
def main(args): print "loadding data and labels from dataset" train = pd.read_csv(args.train_dir) ch_train = pd.read_csv(args.chtrain_dir) x_train = train["comment_text"] x_chtrain = ch_train["comment_text"] target_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' ] x = [] x_ch = [] for line in x_train: if len(line) > 0: x.append(utils.review_to_wordlist(line.strip())) print "loaded %d comments from dataset" % len(x) for line in x_chtrain: if len(line) > 0: x_ch.append(utils.review_to_wordlist_char(line.strip())) print "loaded %d comments from dataset" % len(x) y = train[target_cols].values index2word, word2index = utils.load_vocab(args.vocab_dir) index2char, char2index = utils.load_char(args.char_dir) x_vector = utils.vectorize(x, word2index, verbose=False) x_vector = np.array(x_vector) char_vector = utils.vectorize_char(x_ch, char2index, verbose=False) char_vector = np.array(char_vector) print char_vector[0] save_dir = os.path.join(args.save_dir, args.model_type) if not os.path.exists(save_dir): os.makedirs(save_dir) if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]: max_step = args.max_step_cnn max_size = args.max_size_cnn nb_epochs = args.nb_epochs_cnn elif args.model_type in [ "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn" ]: max_step = args.max_step_rnn max_size = args.max_size_rnn nb_epochs = args.nb_epochs_rnn ex_features = add_features("../data/train.csv") nfolds = args.nfolds skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018) test_prob = [] stack_logits = np.zeros((len(x_vector), len(target_cols))) for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)): x_train, x_eval = x_vector[train_index], x_vector[test_index] char_train, char_eval = char_vector[train_index], char_vector[ test_index] y_train, y_eval = y[train_index], y[test_index] with tf.Graph().as_default(): config_proto = utils.get_config_proto() sess = tf.Session(config=config_proto) if args.model_type == "cnn": model = TextCNN(args, "TextCNN") elif args.model_type == "cnnfe": model = TextCNNFE(args, "TextCNNFE") elif args.model_type == "rnn": model = TextRNN(args, "TextRNN") elif args.model_type == "rnnfe": model = TextRNNFE(args, "TextRNNFE") elif args.model_type == "rcnn": model = TextRCNN(args, "TextRCNN") elif args.model_type == "attention": model = RNNWithAttention(args, "Attention") elif args.model_type == "chrnn": model = TextRNNChar(args, "TextRNNChar") elif args.model_type == "chcnn": model = TextCNNChar(args, "TextCNNChar") elif args.model_type == "chcnn2": model = TextCNNChar(args, "TextCNNChar2") elif args.model_type == "rnnfe2": model = TextRNNFE2(args, "TextCNNCharFE2") elif args.model_type == "chrnnfe": model = TextRNNCharFE(args, "TextCNNCharFE") else: raise ValueError("Unknown model_type %s" % args.model_type) sess.run(tf.global_variables_initializer()) if args.use_ft: pretrain_dir = args.ft_dir print "use FastText word vector" embedding = utils.load_fasttext(pretrain_dir, index2word) if not args.use_ft: pretrain_dir = args.glove_dir print "use Glove word vector" embedding = utils.load_glove(pretrain_dir, index2word) sess.run(model.embedding_init, {model.embedding_placeholder: embedding}) for line in model.tvars: print line print "training %s model for toxic comments classification" % ( args.model_type) print "%d fold start training" % f for epoch in range(1, nb_epochs + 1): print "epoch %d start with lr %f" % ( epoch, model.learning_rate.eval(session=sess)), "\n", "- " * 50 loss, total_comments = 0.0, 0 if args.model_type in ["cnn", 
"rnn", "rcnn"]: train_batch = utils.get_batches(x_train, y_train, args.batch_size, args.max_len) valid_batch = utils.get_batches(x_eval, y_eval, max_size, args.max_len, False) elif args.model_type in ["chrnn", "chcnn", "chcnn2"]: train_batch = utils.get_batches_with_char( x_train, char_train, y_train, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_char( x_eval, char_eval, y_eval, max_size, args.max_len, False) elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]: train_batch = utils.get_batches_with_fe( x_train, y_train, ex_features, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_fe( x_eval, y_eval, ex_features, max_size, args.max_len, False) elif args.model_type in ["chrnnfe"]: train_batch = utils.get_batches_with_charfe( x_train, char_train, y_train, ex_features, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_charfe( x_eval, char_eval, y_eval, ex_features, max_size, args.max_len, False) epoch_start_time = time.time() step_start_time = epoch_start_time for idx, batch in enumerate(train_batch): if args.model_type in ["cnn", "rnn", "rcnn"]: comments, comments_length, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, labels) elif args.model_type in ["chrnn", "chcnn", "chcnn2"]: comments, comments_length, chs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, chs, labels) elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]: comments, comments_length, exs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, labels, exs) elif args.model_type in ["chrnnfe"]: comments, comments_length, chs, exs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, chs, labels, exs) loss += loss_t * batch_size total_comments += batch_size if global_step % 200 == 0: print "epoch %d step %d loss %f time %.2fs" % ( epoch, global_step, loss_t, time.time() - step_start_time) if global_step % 200 == 0: _ = run_valid(valid_batch, model, sess, args.model_type) # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step) step_start_time = time.time() epoch_time = time.time() - epoch_start_time sess.run(model.learning_rate_decay_op) print "%.2f seconds in this epoch with train loss %f" % ( epoch_time, loss / total_comments) test_prob.append(run_test(args, model, sess)) stack_logits[test_index] = run_valid(valid_batch, model, sess, args.model_type) preds = np.zeros((test_prob[0].shape[0], len(target_cols))) for prob in test_prob: preds += prob print prob[0] preds /= len(test_prob) print len(test_prob) write_predict(stack_logits, args.model_type) write_results(preds, args.model_type)
def __init__(self):
    self.categories, self.cat_to_id = read_category()
    self.words, self.word_to_id = read_vocab(vocab_file)
    self.model = TextRNN()
    self.model.load_state_dict(torch.load('model_params.pkl'))
# cross-validation
f = StratifiedKFold(n_splits=n_splits, random_state=seed)
for i, (tr, va) in enumerate(f.split(x_pad, y)):
    x_train_age = x_pad[tr]
    x_va_age = x_pad[va]
    y_train_age = y[tr]
    y_va_age = y[va]
    # convert integer labels to one-hot vectors
    y_train_age = to_categorical(y_train_age)
    y_va_age = to_categorical(y_va_age)

    print('Start building the LSTM model...')
    max_features = len(word2index) + 1  # vocabulary size
    model = TextRNN(maxlen, max_features, embedding_dims, 7, 'softmax').get_model()
    # specify the optimizer, loss and evaluation metric
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

    print('Training...')
    my_callbacks = [
        ModelCheckpoint(model_path + 'lstm_model_age.h5', verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
    ]
    # fit the model
    history = model.fit(x_train_age, y_train_age,
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=my_callbacks,
                        validation_data=(x_va_age, y_va_age))
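# A minimal inference sketch for the checkpoint written by ModelCheckpoint above; it
# assumes the standalone Keras API (load_model) matching the callbacks used in the loop,
# and that x_va_age and model_path are still in scope.
from keras.models import load_model

best_model = load_model(model_path + 'lstm_model_age.h5')
va_pred = np.argmax(best_model.predict(x_va_age), axis=1)  # predicted age class per sample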