def train(config):
    set_manual_seed(10)
    """ 1: Clean and tokenize the text, build the vocabulary """
    print("Preparing the batch data ... \n")
    corpus_x, corpus_y, vocab = build_dataset(config)
    """ 2: Compute class weights to mitigate class imbalance """
    class_weights = calcu_class_weights(corpus_y, config)
    config.class_weights = class_weights
    """ 3: Load the pretrained word embeddings """
    embed_matrix = load_embed_matrix(vocab, config)
    config.embed_matrix = embed_matrix
    """ 4: Split the dataset and build the batch iterators """
    train_iter, valid_iter, test_iter = batch_generator(
        corpus_x, corpus_y, 0.15, config)
    """ 5: Initialize the model """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')
    model.to(config.device)
    """ 6: Start training """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
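# A minimal sketch of what calcu_class_weights() above could look like, assuming
# corpus_y is a flat sequence of integer label ids; the real helper may differ.
import torch
from collections import Counter

def calcu_class_weights(corpus_y, config):
    counts = Counter(corpus_y)
    num_classes = len(counts)
    total = len(corpus_y)
    # Inverse-frequency weighting: rare classes receive larger weights.
    weights = [total / (num_classes * counts[c]) for c in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float, device=config.device)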
class Textcnn_pred(object):
    def __init__(self, vocab_dir):
        self.input_x = tf.placeholder(tf.int32, [None, config.seq_length],
                                      name='input_x')
        self.words = tools.read_file(vocab_dir)
        self.vocab_size = len(self.words)
        self.textcnn = TextCNN(config, self.vocab_size, keep_prob=1.0)
        self.logits = self.textcnn.cnn(self.input_x)
        self.textcnn_pred = tf.argmax(tf.nn.softmax(self.logits), 1)
        saver = tf.train.Saver()
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.8
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)
        model_path = 'checkpoints/model/TextCNNnet_2019-10-17-14-35-50.ckpt-9000'
        saver.restore(sess=self.sess, save_path=model_path)
        print("################ TextCNN model loaded! ##########################")

    def _close(self):
        self.sess.close()

    def text(self, input):
        logit, pred = self.sess.run([self.logits, self.textcnn_pred],
                                    feed_dict={self.input_x: input})
        return pred
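# Hypothetical usage sketch for Textcnn_pred; the vocab path below is a placeholder,
# not a path from the original project. The dummy batch only illustrates the expected
# input shape of [batch_size, config.seq_length].
import numpy as np

predictor = Textcnn_pred(vocab_dir='data/vocab.txt')
padded_ids = np.zeros((1, config.seq_length), dtype=np.int32)
pred = predictor.text(padded_ids)
predictor._close()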
def Textcnn_test():
    if not os.path.exists(FLAGS.vocab_dir):
        words = tools.build_vocab(train_data=FLAGS.train_data,
                                  vocab_dir=FLAGS.vocab_dir)  # build the vocabulary
    else:
        words = tools.read_file(FLAGS.vocab_dir)
    vocab_size = len(words)
    print("Test words : ", vocab_size)
    test_X, test_Y = tools.create_voabulary(train_data=FLAGS.test_data,
                                            vocab_data=FLAGS.vocab_dir,
                                            max_length=config.seq_length)
    input_x = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x')
    input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
    model_path = 'checkpoints/TextCNNnet_2019-11-01-15-31-50.ckpt-4000'
    save_path = model_path
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    textcnn = TextCNN(config, vocab_size, keep_prob=1.0)
    logits = textcnn.cnn(input_x)  # (?, 10)
    loss = textcnn_loss(logits=logits, label=input_y)
    acc = textcnn_acc(logits=logits, labels=input_y)
    saver = tf.train.Saver()
    saver.restore(sess=sess, save_path=save_path)
    batch_test = tools.batch_iter(test_X, test_Y, config.batch_size)  # generate batches
    i = 0
    all_acc = 0
    for x_batch, y_batch in batch_test:
        test_loss, test_acc = sess.run([loss, acc],
                                       feed_dict={
                                           input_x: x_batch,
                                           input_y: y_batch
                                       })
        all_acc = all_acc + test_acc
        i += 1
    print("Average acc : ", (all_acc / i))
def train(config):
    set_manual_seed(10)
    """ 1: Split the dataset and save the splits """
    print("Preparing the batch data ... \n")
    build_dataset(config)
    """ 2: Compute class weights to mitigate class imbalance """
    class_weights = calcu_class_weights(config)
    config.class_weights = class_weights
    """ 3: Build the batch iterators """
    train_iter, valid_iter, test_iter = batch_generator(config)
    """ 4: Initialize the model """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')
    model.to(config.device)
    """ 5: Start training """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
def __init__(self, pretrained_model_path, vocabulary_size, filter_sizes,
             filter_num, data=None, train=False, cuda=1):
    super(OLTR_For_Textcnn, self).__init__()
    self.device = torch.device(
        'cuda:%d' % cuda if torch.cuda.is_available() else 'cpu')
    self.textcnn = TextCNN(vocabulary_size=vocabulary_size,
                           class_num=183,
                           filter_num=filter_num,
                           filter_sizes=filter_sizes,
                           embedding_dim=128)
    checkpoint = torch.load(pretrained_model_path, map_location=self.device)
    self.textcnn.load_state_dict(checkpoint)
    self.textcnn = self.textcnn.to(self.device)
    # fix all param in textcnn when training OLTR
    for param_name, param in self.textcnn.named_parameters():
        param.requires_grad = False
    self.textcnn.eval()
    self.classes_num = 183
    self.feature_dim = len(filter_sizes.split(",")) * filter_num
    self.classifier = OLTR_classifier(self.feature_dim, self.classes_num)
    self.centroids = nn.Parameter(
        torch.randn(self.classes_num, self.feature_dim))
    if train and data is not None:
        print("update centroid with data")
        self.centroids.data = self.centroids_cal(data)
    elif train and data is None:
        raise ValueError("Train mode should update centroid with data")
    else:
        print("Test mode should load pretrained centroid")
def predict():
    # Requires: from sklearn.metrics import classification_report
    word2index, label2index, trainX, trainY, vaildX, validY, testX, testY = load_data(
        FLAGS.all_data_h5py, FLAGS.id_index_pkl)
    vocab_size = len(word2index)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        text_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.label_size,
                           FLAGS.learning_rate, FLAGS.decay_steps,
                           FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                           FLAGS.embed_size)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Model Failed')
            return
        print("test_X.shape:", testX.shape)
        print("test_Y.shape:", testY.shape)
        raw_labels = []
        predicted_labels = []
        number_examples = len(testX)
        batch_size = FLAGS.batch_size
        # Note: this zip() drops the final partial batch.
        for start, end in zip(range(0, number_examples, batch_size),
                              range(batch_size, number_examples, batch_size)):
            predictions = sess.run(text_cnn.predictions,
                                   feed_dict={
                                       text_cnn.input_x: testX[start:end],
                                       text_cnn.input_y: testY[start:end],
                                       text_cnn.dropout_keep_prob: 1.0,
                                       text_cnn.is_training_flag: False
                                   })
            if len(predictions) == len(testY[start:end]):
                raw_labels.extend(testY[start:end])
                predicted_labels.extend(predictions)
        print(classification_report(raw_labels, predicted_labels))
def main(_):
    time_start = time.time()

    def predict_label(logits, tmp_gate=0.01):
        # Predict labels from logits: every label whose probability exceeds the threshold is kept.
        probs = softmax(logits)
        labels = []
        for prob in probs:
            con = np.greater_equal(prob, [tmp_gate] * 16)  # check whether each probability exceeds the threshold (0.01 here)
            tmp = list(np.argwhere(con == True))
            label = [x[0] for x in tmp]
            if len(label) < 1:  # if no label exceeds the threshold, fall back to the argmax label
                label = np.argmax(prob)
            labels.append(label)
            # in_prob = prob[index]
        return labels

    def predict_label_top_k(sess, eval_return):
        '''Old approach: predict the top-k labels; k can be fixed, or determined per comment by modifying model.py.'''
        y_predict = []
        top_number = eval_return[0]  # number of labels per comment
        probs_squeezed = eval_return[1]  # logits
        for i, curr_prob in enumerate(probs_squeezed):
            index = tf.nn.top_k(curr_prob, top_number[i])  # take the top-k entries of the logits
            index = set(index.indices.eval())
            y_predict.append(
                tf.constant([
                    1 if i in index else 0 for i in range(FLAGS.num_classes)
                ]))
        y_predict = tf.stack(y_predict)
        return y_predict.eval()

    def pad_y(Y, label_num=16):
        # Convert integer label indices into multi-hot vectors.
        Y2 = np.zeros((len(Y), label_num))
        for i, y in enumerate(Y):
            tmp_y = [0] * label_num
            for it_y in y:
                tmp_y[it_y] = 1
            Y2[i, :] = tmp_y
        return Y2

    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        file=FLAGS.word2vec_model_path,
        cache_path="./cache_pickle/ft_%s_voabulary.pickle" % _LANG,
        from_word2vec=1)  # load the vocabulary
    vocab_size = len(vocabulary_word2index)  # vocabulary size
    trainX, trainY, testX, testY = None, None, None, None
    # Load the train/validation sets prepared by the tc_utils.py module.
    cache_train_data_path = "./cache_pickle/train_data_%s.pickle" % _LANG
    train_data = os.path.exists(cache_train_data_path)
    cache_eval_data_path = "./cache_pickle/eval_data_%s.pickle" % _LANG
    eval_data = os.path.exists(cache_eval_data_path)
    if (train_data and eval_data):
        with open(cache_train_data_path, 'rb') as f:
            trainX, trainY = pickle.load(f)
        with open(cache_eval_data_path, 'rb') as f:
            testX, testY = pickle.load(f)
    else:
        return "data NOT found!"
    trainY = pad_y(trainY)
    testY = pad_y(testY)
    trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # truncate or pad each comment to a fixed length
    testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)
    config = tf.ConfigProto()
    best_f1 = 0.0  # best score on the validation set so far
    last_improved = 0  # batch at which the last improvement happened
    require_improvement = 500  # stop early if there is no improvement for 500 batches
    print_per_batch = 10  # how often to report performance on the train/validation sets
    save_per_batch = 50  # how often to write training summaries to the TensorBoard scalars
    save_path = FLAGS.ckpt_dir + "model.ckpt"
    total_batch = 1
    flag = False
    with tf.Session(config=config) as sess:
        print("initialize model")
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)  # initialize the model; could be swapped for textrcnn / textrnn
        tf.summary.scalar("loss", textCNN.loss_val)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter("./tf_board/")
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            # Optionally restore a previously trained model and continue training.
            print("Restore Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Initialize variable")
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # use pretrained word embeddings
                assign_pretrained_word_embedding(
                    sess, vocabulary_index2word, vocab_size, textCNN,
                    cache_path="./cache_pickle/embedding_%s.pickle" % _LANG,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        writer.add_graph(sess.graph)
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        print("Start epoch")  # start training the model
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            batch_train = batch_iter(trainX, trainY, batch_size)
            for curr_trainX, curr_trainY in batch_train:
                feed_dict = {
                    textCNN.input_x: curr_trainX,
                    textCNN.dropout_keep_prob: 0.5
                }
                feed_dict[textCNN.input_y] = curr_trainY
                if total_batch == 1:
                    print('testX\n', testX)
                    print('testY\n', testY)
                if total_batch % save_per_batch == 0:
                    s = sess.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)
                if total_batch % print_per_batch == 0:
                    feed_dict[textCNN.dropout_keep_prob] = 1.0
                    # train_loss, train_logits = sess.run([textCNN.loss_val, textCNN.logits], feed_dict=feed_dict)
                    feed_dict1 = {
                        textCNN.input_x: testX,
                        textCNN.input_y: testY,
                        textCNN.dropout_keep_prob: 1.0
                    }
                    test_loss, logits = sess.run(
                        [textCNN.loss_val, textCNN.logits], feed_dict=feed_dict1)
                    predict_y = predict_label(logits)
                    # predict_y = predict_label(test_logits)
                    test_acc, precision, recall, f1 = evaluate(predict_y, testY)
                    if test_acc > best_f1:
                        best_f1 = test_acc
                        last_improved = total_batch
                        saver.save(sess, save_path, global_step=total_batch)
                        improved_str = '*'
                    else:
                        improved_str = ''
                    print("epoch:%d total_batch:%d test_loss:%f test_acc:%f precision:%f recall:%f f1:%f %s" %
                          (epoch, total_batch, test_loss, test_acc, precision,
                           recall, f1, improved_str))
                sess.run([textCNN.train_op], feed_dict)
                total_batch += 1
                if total_batch - last_improved > require_improvement:
                    print("auto stopping")
                    flag = True
                    break
            if flag:
                break
    time_end = time.time()
    print('using time:', time_end - time_start)
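# A minimal sketch of the softmax() helper used by predict_label() above, assuming
# logits is a 2-D numpy array of shape [batch_size, num_classes]; the real helper
# may be implemented differently.
import numpy as np

def softmax(logits):
    # Subtract the row-wise max for numerical stability.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)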
def main(_):
    time_start = time.time()

    def save_predict(predict_y, voc, file):
        '''Save the model's predicted labels in a fixed format.'''
        print(predict_y)
        with open(file, 'w') as f:
            for i in range(len(predict_y)):
                if isinstance(predict_y[i], list):
                    labels = []
                    for j in predict_y[i]:
                        label = voc[j]
                        labels.append(label)
                    line = ["__label__" + x for x in labels]
                    if len(predict_y[i]) > 1 and "__label__NULL" in line:
                        line.remove("__label__NULL")
                    line = " ".join(line)
                else:
                    label = voc[predict_y[i]]
                    line = "__label__" + label
                f.write(line + "\n")

    def predict_label(logits, tmp_gate=0.01):
        '''Predict labels from logits; every label above the (configurable) threshold is kept.'''
        probs = softmax(logits)
        labels = []
        for prob in probs:
            con = np.greater_equal(prob, [tmp_gate] * 16)
            tmp = list(np.argwhere(con == True))
            label = [x[0] for x in tmp]
            if len(label) < 1:  # no label above the threshold: fall back to the argmax label
                label = np.argmax(prob)
            labels.append(label)
        return labels

    def predict_label_top_k(sess, eval_return, batch_size=1):
        '''Old approach: predict the top-k labels; k can be fixed, or determined dynamically by modifying the model module.'''
        top_number = eval_return[0]  # k
        probs = eval_return[1]  # probabilities (already converted from logits)
        ones = tf.ones(shape=top_number.shape, dtype=tf.float32)
        top_number = tf.cast(tf.where(tf.greater(top_number, ones), top_number, ones),
                             dtype=tf.int32)
        probs_split = tf.split(probs, batch_size)
        probs_squeezed = [tf.squeeze(x) for x in probs_split]
        y_predict = []
        for i, curr_prob in enumerate(probs_squeezed):
            index = tf.nn.top_k(curr_prob, top_number[i])
            index = set(index.indices.eval())
            y_predict.append(
                tf.constant([
                    1 if i in index else 0 for i in range(FLAGS.num_classes)
                ]))
        y_predict = tf.stack(y_predict)
        return y_predict.eval()

    def pad_y(Y):
        Y2 = np.zeros((len(Y), 16))
        for i, y in enumerate(Y):
            tmp_y = [0] * 16
            for it_y in y:
                tmp_y[it_y] = 1
            Y2[i, :] = tmp_y
        return Y2

    testX, testY = None, None
    cache_test_data_path = "./cache_pickle/eval_data_%s.pickle" % _LANG
    test_data = os.path.exists(cache_test_data_path)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voab_label()
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        file=FLAGS.word2vec_model_path,
        cache_path="./cache_pickle/ft_%s_voabulary.pickle" % _LANG,
        from_word2vec=1)
    if not test_data:
        print("test data NOT exist")
        vocab_size = len(vocabulary_word2index)
        print("cnn_model_vocab_size:", vocab_size)
        testX, testY = load_data(vocabulary_word2index,
                                 vocabulary_word2index_label,
                                 training_data_path=FLAGS.traning_data_path,
                                 cache_path='')  # load the test set
        print("testX:", len(testX), "testY:", len(testY))
        testY = pad_y(testY)
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)
        with open(cache_test_data_path, 'ab') as data_f:  # cache the data so repeated tests are faster
            pickle.dump((np.array(testX), np.array(testY)), data_f)
        print("dump data end!")
    else:
        vocab_size = len(vocabulary_word2index)
        print("cnn_model_vocab_size:", vocab_size)
        with open(cache_test_data_path, 'rb') as data_f:
            testX, testY = pickle.load(data_f)
        testY = pad_y(testY)
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)
    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        print("initialize model")
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):  # restore the trained model
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint, going to stop")
going to stop") return feed_dict = { textCNN.input_x: testX, textCNN.input_y: testY, textCNN.dropout_keep_prob: 1.0 } test_loss, logits = sess.run([textCNN.loss_val, textCNN.logits], feed_dict=feed_dict) # 这里是验证,所以有loss,如果只预测,就不用loss,feed_dict也不用textCNN.input_y # feed_dict = {textCNN.input_x:testX,textCNN.dropout_keep_prob:1.0} # logits = sess.run([textCNN.logits],feed_dict=feed_dict) predict_y = predict_label(logits) save_predict(predict_y, file='./result_%s.txt' % _LANG, voc=vocabulary_index2word_label) test_acc, precision, recall, f1 = evaluate(predict_y, testY) print("test_loss:%f test_acc:%f precision:%f recall:%f f1:%f" % (test_loss, test_acc, precision, recall, f1))
class OLTR_For_Textcnn(nn.Module):
    def __init__(self, pretrained_model_path, vocabulary_size, filter_sizes,
                 filter_num, data=None, train=False, cuda=1):
        super(OLTR_For_Textcnn, self).__init__()
        self.device = torch.device(
            'cuda:%d' % cuda if torch.cuda.is_available() else 'cpu')
        self.textcnn = TextCNN(vocabulary_size=vocabulary_size,
                               class_num=183,
                               filter_num=filter_num,
                               filter_sizes=filter_sizes,
                               embedding_dim=128)
        checkpoint = torch.load(pretrained_model_path, map_location=self.device)
        self.textcnn.load_state_dict(checkpoint)
        self.textcnn = self.textcnn.to(self.device)
        # fix all param in textcnn when training OLTR
        for param_name, param in self.textcnn.named_parameters():
            param.requires_grad = False
        self.textcnn.eval()
        self.classes_num = 183
        self.feature_dim = len(filter_sizes.split(",")) * filter_num
        self.classifier = OLTR_classifier(self.feature_dim, self.classes_num)
        self.centroids = nn.Parameter(
            torch.randn(self.classes_num, self.feature_dim))
        if train and data is not None:
            print("update centroid with data")
            self.centroids.data = self.centroids_cal(data)
        elif train and data is None:
            raise ValueError("Train mode should update centroid with data")
        else:
            print("Test mode should load pretrained centroid")

    def forward(self, x, *args):
        feature = self.textcnn.extract_feature(x)
        logits, _ = self.classifier(feature, self.centroids)
        return logits, feature

    def class_count(self, data):
        labels = np.array([int(ex.label) for ex in data.dataset])
        class_data_num = []
        for l in range(self.classes_num):
            class_data_num.append(len(labels[labels == l]))
            if class_data_num[-1] == 0:
                class_data_num[-1] = 1
        return class_data_num

    def centroids_cal(self, data):
        centroids = torch.zeros(self.classes_num, self.feature_dim).to(self.device)
        print('Calculating centroids.')
        # for model in self.networks.values():
        #     model.eval()
        self.textcnn.eval()
        # Calculate initial centroids only on training data.
        with torch.set_grad_enabled(False):
            for batch in data:
                inputs, labels = batch.text, batch.label
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                # Calculate features of each training example.
                features = self.textcnn.extract_feature(inputs)
                # Add all calculated features to the centroid tensor.
                for i in range(len(labels)):
                    label = labels[i]
                    centroids[label] += features[i]
        # Average the summed features with the class counts.
        centroids /= torch.Tensor(
            self.class_count(data)).float().unsqueeze(1).to(self.device)
        return centroids
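# Hypothetical usage sketch for the class above; every value below (checkpoint path,
# vocabulary size, filter settings, iterator, input tensor) is a placeholder, not taken
# from the original project.
oltr = OLTR_For_Textcnn(pretrained_model_path='checkpoints/textcnn_pretrained.pt',
                        vocabulary_size=30000,
                        filter_sizes="3,4,5",   # comma-separated string, as split(",") in __init__ expects
                        filter_num=100,
                        data=train_iter,        # torchtext iterator used to initialize the centroids
                        train=True,
                        cuda=0)
logits, feature = oltr(batch_of_token_ids)      # LongTensor of shape [batch_size, seq_len]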
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
class_num = 183
feature_dim = 300
bs = 96
text_field = data.Field(lower=True, batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
train_iter, dev_iter, test_iter = process_data(text_field=text_field,
                                               label_field=label_field,
                                               data_dir=input_dir,
                                               batch_size=bs)
vocabulary_size = len(text_field.vocab)
textcnn = TextCNN(vocabulary_size=vocabulary_size,
                  class_num=183,
                  filter_num=filter_num,
                  filter_sizes=filter_size,
                  embedding_dim=128)
checkpoint = torch.load(load_model_path, map_location=device)
textcnn.load_state_dict(checkpoint)
textcnn = textcnn.to(device)
textcnn.eval()

def centroids_cal(data):
    centroids = torch.zeros(class_num, feature_dim).to(device)
    print('Calculating centroids.')
    # for model in self.networks.values():
    #     model.eval()
def main(_):
    # 1. Load data
    word2index, label2index, train_x, train_y, valid_x, valid_y, test_x, test_y = \
        load_data(FLAGS.cache_file_h5py, FLAGS.cache_file_pickle)
    vocab_size = len(word2index)
    num_classes = len(label2index)
    print(train_y[0:3])
    num_examples, FLAGS.sentence_len = train_x.shape
    # 2. Create the session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          multi_label_flag=FLAGS.multi_label_flag)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:
                index2word = {v: k for k, v in word2index.items()}
                # assign_pretrained_word_embedding(sess, index2word, vocab_size, textCNN, FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3. Feed data and train
        number_of_training_data = len(train_x)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, counter = 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                feed_dict = {
                    textCNN.input_x: train_x[start:end],
                    textCNN.dropout_keep_prob: 0.8,
                    textCNN.is_training_flag: FLAGS.is_training_flag
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = train_y[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = train_y[start:end]
                curr_loss, lr, _ = sess.run(
                    [textCNN.loss_val, textCNN.learning_rate, textCNN.train_op],
                    feed_dict)
                loss, counter = loss + curr_loss, counter + 1
                # Report the loss every 50 batches.
                if counter % 50 == 0:
                    # do_eval(sess, textCNN, test_x, test_y, num_classes)
                    print("Epoch %d\tBatch %d\tTrain loss:%.3f\tLearning rate:%.5f" %
                          (epoch, counter, loss / float(counter), lr))
            # Validate once per epoch.
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                # eval_loss, f1_score, f1_micro, f1_macro = do_eval(sess, textCNN, valid_x, valid_y, num_classes)
                # do_eval(sess, textCNN, valid_x, valid_y, num_classes)
                # Save the model to a checkpoint.
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
def Textcnn_train():
    # ---------- load data ----------
    if not os.path.exists(FLAGS.vocab_dir):
        words = tools.build_vocab(train_data=FLAGS.train_data,
                                  vocab_dir=FLAGS.vocab_dir)  # build the vocabulary
    else:
        words = tools.read_file(FLAGS.vocab_dir)
    vocab_size = len(words)
    train_X, train_Y = tools.create_voabulary(train_data=FLAGS.train_data,
                                              vocab_data=FLAGS.vocab_dir,
                                              max_length=config.seq_length)
    val_X, val_Y = tools.create_voabulary(train_data=FLAGS.val_data,
                                          vocab_data=FLAGS.vocab_dir,
                                          max_length=config.seq_length)
    # trainX = pad_sequences(train_X, maxlen=200, value=0.)  # padding to max length
    # test_X = pad_sequences(test_X, maxlen=200, value=0.)   # padding to max length
    print("Data preparation done!")
    # -------------------------------
    input_x = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x')
    input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
    textcnn = TextCNN(config, vocab_size, keep_prob=config.dropout_keep_prob)
    logits = textcnn.cnn(input_x)  # (?, 10)
    loss = textcnn_loss(logits=logits, label=input_y)
    # Compute the accuracy.
    acc = textcnn_acc(logits=logits, labels=input_y)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    learning_rate = tf.train.exponential_decay(
        learning_rate=FLAGS.learning_rate,
        global_step=global_step,
        decay_steps=2000,
        decay_rate=0.1,
        staircase=True)
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        loss=loss, global_step=global_step)
    tensorboard_dir = 'tensorboard/textcnn'
    tf.summary.scalar("loss", loss)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    saver = tf.train.Saver(max_to_keep=3)  # save the model
    model_save_dir = 'checkpoints/'
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'TextCNNnet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = os.path.join(model_save_dir, model_name)
    model_restore_path = './checkpoints/TextCNNnet_2019-11-01-15-31-50.ckpt-4000'
    # Create the log file.
    logging.basicConfig(
        filename='./checkpoints/' + model_name + '.log',
        format='%(asctime)s - %(pathname)s - %(levelname)s: %(message)s',
        level=logging.DEBUG,
        filemode='a',
        datefmt='%Y-%m-%d %I:%M:%S %p')
    logging.info('###### Next Is Training Information ###################')
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    with sess.as_default():
        if not FLAGS.model_store:
            step = 0
            init = tf.global_variables_initializer()
            sess.run(init)
            writer.add_graph(sess.graph)
        else:
            saver.restore(sess=sess, save_path=model_restore_path)
            step = sess.run(tf.train.get_global_step())
            writer.add_graph(sess.graph)
        print('First step is:', step)
        num_batch = int((len(train_X) - 1) / config.batch_size) + 1  # total number of batches
        acc_begain = 0
        for epoch in range(config.epochs):
            batch_train = tools.batch_iter(train_X, train_Y, config.batch_size)  # generate batches
            Begain_learn_rate = FLAGS.learning_rate
            for x_batch, y_batch in batch_train:
                step += 1
                _, learn_rate, train_loss_value, train_pred, train_acc, merge_summary_value = sess.run(
                    [optim, learning_rate, loss, logits, acc, merged_summary],
                    feed_dict={
                        input_x: x_batch,
                        input_y: y_batch
                    })
                if Begain_learn_rate != learn_rate:
                    information = '############ New Learning_Rate {:6f} in step {:d} ###########'.format(
                        learn_rate, step)
                    logging.info(information)
                    print(information)
                    Begain_learn_rate = learn_rate
                if step % 10 == 0:
                    information = '## Epoch {:d} Step_Train / Total_Batch: {:d} / {:d} train_loss= {:5f} train_acc={:5f}'.\
                        format(int(step / num_batch), step, num_batch, train_loss_value, train_acc)
                    logging.info(information)
                    print(information)
                if step % 500 == 0:  # validate every 500 steps and keep the best model
                    val_acc_all = 0
                    val_loss_all = 0
                    val_step = 0
                    batch_val = tools.batch_iter(val_X, val_Y, config.batch_size)  # generate batches
                    for x_val, y_val in batch_val:
                        if x_val.shape[0] < config.batch_size:
                            pass
                        else:
                            # Evaluation only: do not run the optimizer on validation batches.
                            val_loss_value, val_pred, val_acc, merge_summary_value = sess.run(
                                [loss, logits, acc, merged_summary],
                                feed_dict={
                                    input_x: x_val,
                                    input_y: y_val
                                })
                            writer.add_summary(merge_summary_value, step)
                            val_acc_all = val_acc_all + val_acc
                            val_loss_all = val_loss_all + val_loss_value
                            val_step += 1
                    ave_acc = val_acc_all / val_step
                    ave_loss = val_loss_all / val_step
                    if (ave_acc - acc_begain) > 0.001:
                        acc_begain = ave_acc
                        saver.save(sess, model_save_path, global_step=step)
                        tf.train.write_graph(sess.graph_def, '',
                                             './checkpoints/textcnn_graph.pb')
                    information = '############ Val_loss = {:5f} Val_acc = {:5f} ##################'.format(
                        ave_loss, ave_acc)
                    logging.info(information)
                    print(information)
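# A minimal sketch of the tools.batch_iter() helper used above, assuming it shuffles
# the data once per call and yields (x_batch, y_batch) numpy slices, including a final
# partial batch; the real helper may behave differently.
import numpy as np

def batch_iter(x, y, batch_size):
    x, y = np.asarray(x), np.asarray(y)
    indices = np.random.permutation(len(x))
    for start in range(0, len(x), batch_size):
        batch_idx = indices[start:start + batch_size]
        yield x[batch_idx], y[batch_idx]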
pic_dir = "./t-SNE/%s" % load_model_name if not os.path.isdir(pic_dir): os.makedirs(pic_dir) def y_tokenize(y): return int(y) text_field = data.Field(lower=True,batch_first=True) label_field = data.Field(sequential=False, tokenize=y_tokenize, use_vocab = False, batch_first=True) train_iter, dev_iter, test_iter = process_data(text_field=text_field, label_field=label_field, data_dir=input_dir,batch_size=bs, mode=mode) vocabulary_size = len(text_field.vocab) # class_num = len(label_field.vocab) class_num = 183 textcnn = TextCNN(vocabulary_size=vocabulary_size, class_num=class_num, filter_num=filter_num, filter_sizes=filter_size, embedding_dim=embedding_dim, dropout=dropout) checkpoint = torch.load(load_model_path, map_location=device) textcnn.load_state_dict(checkpoint) textcnn = textcnn.to(device) textcnn.eval() samples=[] for i in range(class_num): samples.append([]) for batch in train_iter: for i,label in enumerate(batch.label.numpy().tolist()): samples[label].append(batch.text) max_points=100
batch_size = 32
embedding_dims = 100
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = TextCNN(maxlen, vocab_size, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# You can plug in your own callbacks here; in competitions, custom callbacks are the norm.
early_stopping = EarlyStopping(monitor='val_acc', patience=2, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
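# A minimal sketch of what the TextCNN class used above could look like, assuming the
# classic Kim-style architecture (parallel convolution branches over an embedding);
# the kernel sizes and filter count are assumptions, and the imports assume standalone
# Keras (switch to tensorflow.keras if that is what the project uses).
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dense
from keras.models import Model

class TextCNN:
    def __init__(self, maxlen, vocab_size, embedding_dims,
                 kernel_sizes=(3, 4, 5), filters=128):
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.kernel_sizes = kernel_sizes
        self.filters = filters

    def get_model(self):
        inputs = Input(shape=(self.maxlen,))
        x = Embedding(self.vocab_size, self.embedding_dims)(inputs)
        # One convolution + global max-pooling branch per kernel size.
        convs = []
        for k in self.kernel_sizes:
            c = Conv1D(self.filters, k, activation='relu')(x)
            convs.append(GlobalMaxPooling1D()(c))
        x = Concatenate()(convs)
        outputs = Dense(1, activation='sigmoid')(x)  # binary sentiment output
        return Model(inputs, outputs)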
def main(_):
    word2index, label2index, trainX, trainY, vaildX, validY, testX, testY = load_data(
        FLAGS.all_data_h5py, FLAGS.id_index_pkl)
    vocab_size = len(word2index)
    text_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.label_size,
                       FLAGS.learning_rate, FLAGS.decay_steps, FLAGS.decay_rate,
                       FLAGS.sentence_len, vocab_size, FLAGS.embed_size)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(FLAGS.summary_dir, tf.get_default_graph())
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(0, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [text_cnn.loss_val, text_cnn.accuracy, text_cnn.train_op],
                    feed_dict={
                        text_cnn.input_x: trainX[start:end],
                        text_cnn.input_y: trainY[start:end],
                        text_cnn.dropout_keep_prob: 0.8,
                        text_cnn.is_training_flag: True
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %
                          (epoch, counter, loss / float(counter), acc / float(counter)))
            print("going to increment epoch counter....")
            sess.run(text_cnn.epoch_increment)
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, text_cnn, vaildX, validY, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" %
                      (epoch, eval_loss, eval_acc))
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=text_cnn.epoch_step)
        test_loss, test_acc = do_eval(sess, text_cnn, testX, testY, batch_size)
        print("Test Loss:%.3f\tTest Accuracy:%.3f" % (test_loss, test_acc))
    writer.close()
if __name__ == '__main__':
    text_field = data.Field(lower=True, batch_first=True)
    label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
    train_iter, dev_iter, test_iter = process_data(text_field=text_field,
                                                   label_field=label_field,
                                                   data_dir=input_dir,
                                                   batch_size=bs)
    vocabulary_size = len(text_field.vocab)
    class_num = 183
    textcnn = TextCNN(vocabulary_size=vocabulary_size,
                      class_num=class_num,
                      filter_num=filter_num,
                      filter_sizes=filter_size,
                      embedding_dim=embedding_dim,
                      dropout=dropout)
    textcnn = textcnn.to(device)
    optimizer = torch.optim.Adam(textcnn.parameters(), lr=lr)
    textcnn.train()
    steps = 0
    best_acc = min_acc
    for epoch in range(1, epoch_num + 1):
        for batch in train_iter:
            textcnn.train()
            feature, target = batch.text, batch.label
            feature, target = feature.to(device), target.to(device)
            optimizer.zero_grad()
            logits = textcnn(feature)
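            # --- Hedged sketch, not part of the original snippet: the code above is cut
            # --- off right after computing the logits. A typical continuation of such a
            # --- PyTorch training step, assuming plain cross-entropy training, might be:
            loss = F.cross_entropy(logits, target)   # requires: import torch.nn.functional as F
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % 100 == 0:                     # evaluation interval is illustrative
                dev_acc = evaluate(textcnn, dev_iter, device)   # hypothetical helper
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    torch.save(textcnn.state_dict(), 'best_textcnn.pt')  # path is illustrative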
def class_count(df):
    df_label = np.argmax(df, 1)
    class_freqs = 1 / np.bincount(df_label)
    class_freqs = class_freqs / (max(class_freqs) - min(class_freqs))
    return tf.convert_to_tensor(class_freqs, dtype=tf.float32)

F1_score = 0
for tag in tag_columns:
    train_Y = train_tags[tag]
    valid_Y = valid_tags[tag]
    weights = class_count(train_Y)
    print("Building the model for category: {}".format(tag))
    model = TextCNN(params, embed_matrix)
    print("Starting the training for category: {}".format(tag))
    save_path = os.path.join(textcnn_dir, date, mode, tag)
    if os.path.exists(save_path):
        print('dir exists')
    else:
        print('dir not exists, create dir.')
        os.makedirs(save_path)
    train_data, train_steps = batch_generator(train_X, train_Y, params["batch_size"])
    valid_data, valid_steps = batch_generator(valid_X, valid_Y, params["batch_size"])
    train_model(model, train_data, valid_data, train_steps, valid_steps,