def backward_propagation():
    word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        FLAGS.file_location, FLAGS.w2v_model_location, FLAGS.words_location,
        FLAGS.psegs_location, False, FLAGS.sentence_length,
        FLAGS.vector_size, FLAGS.psegs_size)
    # Split into training and test sets
    print('begin building train/test split')
    X_train = sen_index[:-30]
    X_test = sen_index[-30:]
    y_train = one_hot_label[:-30]
    y_test = one_hot_label[-30:]
    # Embedding layer: look up the word vectors first
    with tf.name_scope("embedding"):
        input_x = tf.nn.embedding_lookup(word_vecs, x)
        # Note: tf.expand_dims() must be used here rather than np.expand_dims(),
        # because at graph-construction time no value has been fed in yet
        # input_x = tf.expand_dims(input_x, -1)
    # Build the model
    res = transformer_encoder.Encoder().build(input_x)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # Fetch the data in batches
        for epoch in range(FLAGS.num_epochs):
            batch_train = tcnn_data_helper.batch_iter2(X_train, y_train, FLAGS.batch_size)
            total_batch = 0
            # A single batch of data cannot take part in training on its own
            for x_batch, y_batch in batch_train:
                total_batch += 1
                feed_dict = {x: x_batch, y: y_batch, keep_prob: FLAGS.keep_prob}
                aa = sess.run(res, feed_dict=feed_dict)
                print(aa.shape)
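# backward_propagation() feeds the module-level placeholders x, y and keep_prob,
# which are defined elsewhere in the script. A minimal sketch of how they might be
# declared, assuming FLAGS also carries a num_classes option (the names and shapes
# below are illustrative, not the author's actual definitions):
x = tf.placeholder(tf.int32, [None, FLAGS.sentence_length], name='input_x')    # word-id sequences
y = tf.placeholder(tf.float32, [None, FLAGS.num_classes], name='input_y')      # one-hot labels
keep_prob = tf.placeholder(tf.float32, name='keep_prob')                       # dropout keep probability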
def train_tes_model(model, transformer_config):
    # Configure TensorBoard. Delete the tensorboard directory before retraining,
    # otherwise the new graph will be written on top of the old one.
    tensorboard_dir = 'tensorboard/transformer'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    else:
        # Remove any tensorboard files left over from a previous run
        file_list = os.listdir(tensorboard_dir)
        if len(file_list) > 0:
            for file in file_list:
                os.remove(os.path.join(tensorboard_dir, file))
    tf.summary.scalar("loss", model.losses)
    tf.summary.scalar("accuracy", model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver used to persist the model
    saver = tf.train.Saver()
    if not os.path.exists(model_save_location):
        os.makedirs(model_save_location)

    # Load the training and test data
    start_time = time.time()
    _, sen_index, _, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        transformer_config.file_location, transformer_config.w2v_model_location,
        transformer_config.words_location, transformer_config.psegs_location, False,
        transformer_config.sentence_length, transformer_config.vector_size,
        transformer_config.pseg_size)
    X_train, X_test, X_pseg_train, X_pseg_test, y_train, y_test = train_test_split(
        sen_index, sen_pseg_index, one_hot_label, test_size=0.1)
    time_dif = get_time_dif(start_time)
    print("load data usage:", time_dif)

    print('Training and Testing...')
    start_time = time.time()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        writer.add_graph(sess.graph)
        # Fetch the data in batches
        for epoch in range(transformer_config.num_epochs):
            batch_train = tcnn_data_helper.batch_iter(
                X_train, X_pseg_train, y_train, transformer_config.batch_size)
            total_batch = 0
            for x_batch, x_pseg_batch, y_batch in batch_train:
                total_batch += 1
                feed_dict = {
                    model.input_x: x_batch,
                    model.input_x_pseg: x_pseg_batch,
                    model.input_y: y_batch,
                    model.keep_prob: transformer_config.keep_prob
                }
                if total_batch % transformer_config.save_per_batch == 0:
                    summary_str = sess.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(summary_str, total_batch)  # write the summary to the log file
                if total_batch % transformer_config.print_per_batch == 0:
                    train_accuracy = model.accuracy.eval(feed_dict=feed_dict)
                    print("Epoch %d: Step %d accuracy is %f" %
                          (epoch + 1, total_batch, train_accuracy))
                sess.run(model.optim, feed_dict=feed_dict)
        saver.save(sess, save_path)

        # After training, evaluate the model on the test set
        batch_train = tcnn_data_helper.batch_iter(
            X_test, X_pseg_test, y_test, transformer_config.batch_size)
        all_test_pred = []
        for x_batch, x_pseg_batch, y_batch in batch_train:
            test_pred = model.pred_label.eval(
                feed_dict={
                    model.input_x: x_batch,
                    model.input_x_pseg: x_pseg_batch,
                    model.input_y: y_batch,
                    model.keep_prob: 1.0
                })
            all_test_pred.extend(test_pred)
        test_label = np.argmax(y_test, 1)
        # Must match the order in which the ids map to the class labels
        categories = ['教育', '时尚', '家居', '娱乐', '财经', '体育', '房产', '游戏', '时政', '科技']
        # Evaluation
        print("Precision, Recall and F1-Score...")
        print(classification_report(test_label, all_test_pred, target_names=categories))
        # Confusion matrix
        print("Confusion Matrix...")
        cm = confusion_matrix(test_label, all_test_pred)
        print(cm)
    time_dif = get_time_dif(start_time)
    print("train_and_test usage:", time_dif)
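# train_tes_model() relies on a get_time_dif helper that is not shown in this file.
# A minimal sketch of what it presumably does (return the elapsed wall-clock time as
# a rounded timedelta); the real implementation may differ:
import time
from datetime import timedelta

def get_time_dif(start_time):
    """Return the time elapsed since start_time as a rounded timedelta."""
    end_time = time.time()
    return timedelta(seconds=int(round(end_time - start_time)))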
if __name__ == '__main__':
    tcnn_config = Config()
    model = TcnnModel(tcnn_config)
    # Load the training and test data
    start_time = time.time()
    _, sen_index, _, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        tcnn_config.file_location, tcnn_config.w2v_model_location,
        tcnn_config.words_location, tcnn_config.psegs_location, False,
        tcnn_config.sentence_length, tcnn_config.vector_size, tcnn_config.pseg_size)
    X_train, X_test, X_pseg_train, X_pseg_test, y_train, y_test = train_test_split(
        sen_index, sen_pseg_index, one_hot_label, test_size=0.1)
    time_dif = get_time_dif(start_time)
    print("load data usage:", time_dif)
    train_val_test_model((X_train, X_pseg_train, y_train),
                         (X_test, X_pseg_test, y_test),
                         (X_test, X_pseg_test, y_test),
                         model, tcnn_config)
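# The entry point builds a Config object whose fields are read throughout these
# scripts but whose definition is not shown here. A hedged sketch of such a class:
# the attribute names are exactly the ones the code above references, while every
# concrete value is a placeholder, not the author's actual setting.
class Config(object):
    # Data and resource paths (placeholders)
    file_location = 'data/train.txt'
    w2v_model_location = 'data/w2v.model'
    words_location = 'data/words.txt'
    psegs_location = 'data/psegs.txt'
    # Input dimensions
    sentence_length = 600      # maximum sentence length after padding
    vector_size = 100          # word-embedding dimension
    pseg_size = 20             # POS-tag embedding dimension
    num_classes = 10           # one per category label
    # Model hyper-parameters
    filter_hs = [3, 4, 5]      # convolution kernel heights
    num_filters = 128
    n_hidden = 128             # LSTM hidden size (used by the RCNN model)
    keep_prob = 0.5
    l2_reg_lambda = 0.1
    init_learning_rate = 1e-3
    # Training schedule
    batch_size = 64
    num_epochs = 10
    print_per_batch = 100
    save_per_batch = 10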
def rcnn(self):
    # Word-vector lookup
    word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        self.config.file_location, self.config.w2v_model_location,
        self.config.words_location, self.config.psegs_location, False,
        self.config.sentence_length, self.config.vector_size, self.config.pseg_size)
    with tf.name_scope("embedding_lookup"):
        input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
        input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
        _input_x = tf.concat((input_x_content, input_x_pseg), axis=-1)

    print('begin rnn')
    with tf.name_scope("rnn"):
        # Forward and backward LSTM cells
        lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.n_hidden, forget_bias=1.0)
        lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.n_hidden, forget_bias=1.0)
        outputs, outputs_states = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell, lstm_bw_cell, _input_x, dtype=tf.float32)
        # Concatenate each word's left and right context representations with the
        # word itself and use the result as the input of the convolutional network
        outputs = tf.concat([outputs[0], _input_x, outputs[1]], 2)
        x = tf.expand_dims(outputs, -1)

    # Convolution and pooling
    print('begin conve_pool')
    pool_outputs = []
    for filter_h in self.config.filter_hs:
        with tf.variable_scope('conv_pool_{}'.format(filter_h)):
            conv = tf.layers.conv2d(
                x,
                filters=self.config.num_filters,
                kernel_size=(filter_h,
                             self.config.vector_size + self.config.pseg_size +
                             self.config.n_hidden * 2),
                activation=tf.nn.relu,
                use_bias=False,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name='conve')
            pooled = tf.nn.max_pool(
                conv,
                ksize=[1, self.config.sentence_length - filter_h + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='pool')
            pool_outputs.append(pooled)

    # Fully connected layer
    print('begin full_connection')
    with tf.name_scope("full_connection"):
        # Concatenate the pooled outputs of the three kernel sizes
        h_pool = tf.concat(pool_outputs, 3)
        num_filters_total = self.config.num_filters * len(self.config.filter_hs)
        # The following fully connected layer maps to one output per class and
        # expects a 2-D input, so flatten first
        h_pool_flaten = tf.reshape(h_pool, [-1, num_filters_total])
        h_drop = tf.nn.dropout(h_pool_flaten, self.keep_prob)
        # Classifier
        W = tf.Variable(tf.truncated_normal([num_filters_total, self.config.num_classes]))
        self.l2_loss = tf.nn.l2_loss(W)
        b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name="b")
        self.y_pred = tf.nn.xw_plus_b(h_drop, W, b, name="scores")  # wx + b
        # Predicted class
        self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)

    with tf.name_scope('optimize'):
        # The loss op applies softmax internally, so it must receive the raw logits;
        # otherwise softmax would effectively be applied twice
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.input_y, logits=self.y_pred)
        # self.losses = tf.reduce_mean(cross_entropy)
        # Add L2 regularisation
        self.losses = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
        # Optimizer
        self.optim = tf.train.AdamOptimizer(self.config.init_learning_rate).minimize(self.losses)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(self.pred_label, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
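# The model methods above and below read self.input_x, self.input_x_pseg, self.input_y
# and self.keep_prob, which are created in a constructor that is not part of this
# section. A minimal sketch of what such a constructor might look like; the class
# name RcnnModel and the placeholder names are assumptions based on how the
# attributes are used, not the author's actual code.
class RcnnModel(object):
    def __init__(self, config):
        self.config = config
        # Word-id sequence per sentence
        self.input_x = tf.placeholder(tf.int32, [None, config.sentence_length], name='input_x')
        # POS-tag-id sequence aligned with input_x
        self.input_x_pseg = tf.placeholder(tf.int32, [None, config.sentence_length], name='input_x_pseg')
        # One-hot label per sentence
        self.input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
        # Dropout keep probability
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.rcnn()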
def transformer(self):
    # Vector lookup
    word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        self.config.file_location, self.config.w2v_model_location,
        self.config.words_location, self.config.psegs_location, False,
        self.config.sentence_length, self.config.vector_size, self.config.pseg_size)
    with tf.name_scope('embedding'):
        input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
        # print(tf.shape(self.input_x)[0])
        # Positional encoding
        with tf.variable_scope("positional_encoding"):
            # Returns a tensor of shape [self.config.sentence_length, self.config.vector_size]
            positional_embedding = positional_encoding(
                self.config.vector_size, self.config.sentence_length)
            # The batch dimension must not be hard-coded to self.config.batch_size:
            # the last batch is usually smaller than batch_size unless the data size
            # happens to be divisible by it. It has to match dimension 0 of the fed
            # self.input_x, obtained with tf.shape(self.input_x)[0] rather than
            # self.input_x.shape[0].
            positional_inputs = tf.tile(
                tf.range(0, self.config.sentence_length),
                [tf.shape(self.input_x)[0]])
            # Gives batch_size copies of [0, 1, 2, ..., self.config.sentence_length - 1]
            positional_inputs = tf.reshape(
                positional_inputs,
                [tf.shape(self.input_x)[0], self.config.sentence_length])
        # Add the positional information to the word embeddings
        input_x_content_add_positional = tf.add(
            input_x_content,
            tf.nn.embedding_lookup(positional_embedding, positional_inputs))
        # print("shape test:", input_x_content.shape)
        input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
        _input_x = tf.concat((input_x_content_add_positional, input_x_pseg), axis=-1)

    # Extract features with the transformer encoder
    outputs = transformer_encoder.Encoder().build(_input_x)

    # Fully connected layer
    print('begin full_connection')
    with tf.name_scope("full_connection"):
        # Transpose so that the output of the last time step can be taken
        outputs = tf.transpose(outputs, [1, 0, 2])
        f_input = outputs[-1]
        # Classifier
        W = tf.Variable(
            tf.truncated_normal([
                self.config.vector_size + self.config.pseg_size,
                self.config.num_classes
            ]))
        self.l2_loss = tf.nn.l2_loss(W)
        b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name="b")
        self.y_pred = tf.nn.xw_plus_b(f_input, W, b, name="scores")  # wx + b
        # Predicted class
        self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)

    with tf.name_scope('optimize'):
        # The loss op applies softmax internally, so it must receive the raw logits;
        # otherwise softmax would effectively be applied twice
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.input_y, logits=self.y_pred)
        self.losses = tf.reduce_mean(cross_entropy)
        # With L2 regularisation:
        # self.losses = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
        # Optimizer
        self.optim = tf.train.AdamOptimizer(self.config.init_learning_rate).minimize(self.losses)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(self.pred_label, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
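# The positional_encoding() function called above is defined elsewhere in the project.
# A sketch of a standard sinusoidal implementation that matches the call signature and
# the documented [sentence_length, vector_size] output shape; the author's own version
# may differ in detail.
import numpy as np

def positional_encoding(vector_size, sentence_length):
    """Sinusoidal position embeddings, shape [sentence_length, vector_size]."""
    pos = np.arange(sentence_length)[:, np.newaxis]        # (L, 1) positions
    i = np.arange(vector_size)[np.newaxis, :]              # (1, D) dimension indices
    angle = pos / np.power(10000.0, (2 * (i // 2)) / np.float32(vector_size))
    pe = np.zeros((sentence_length, vector_size), dtype=np.float32)
    pe[:, 0::2] = np.sin(angle[:, 0::2])                   # sine on even dimensions
    pe[:, 1::2] = np.cos(angle[:, 1::2])                   # cosine on odd dimensions
    return tf.convert_to_tensor(pe)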
def tcnn(self):
    # Word-vector lookup
    word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        self.config.file_location, self.config.w2v_model_location,
        self.config.words_location, self.config.psegs_location, False,
        self.config.sentence_length, self.config.vector_size, self.config.pseg_size)
    with tf.device('/cpu:0'):
        input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
        input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
        _input_x = tf.concat((input_x_content, input_x_pseg), axis=-1)
        x = tf.expand_dims(_input_x, -1)

    # Convolution
    print('begin conve')
    with tf.name_scope("conve"):
        W_conv = []
        b_conv = []
        for filter_h in self.config.filter_hs:
            W_conv1 = W_generate([
                filter_h, self.config.vector_size + self.config.pseg_size, 1,
                self.config.num_filters
            ])
            b_conv1 = bias_generate(self.config.num_filters)
            W_conv.append(W_conv1)
            b_conv.append(b_conv1)
        con_outputs = []
        # print(np.shape(input_x))
        for W, b in zip(W_conv, b_conv):
            con_output = tf.nn.relu(
                tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='VALID'))
            # Each con_output has shape
            # (batch size, max sentence length - kernel height + 1, 1, number of filters)
            # print('con_output', con_output.shape)
            con_outputs.append(con_output)

    # Max pooling
    print('begin pool')
    with tf.name_scope("pool"):
        pool_outputs = []
        for con, filter_h in zip(con_outputs, self.config.filter_hs):
            pool_output = tf.nn.max_pool(
                con,
                ksize=[1, self.config.sentence_length - filter_h + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            # Each pool_output has shape (batch size, 1, 1, number of filters);
            # the max pooling acts as a kind of keyword extraction
            # print('pool_output', pool_output.shape)
            pool_outputs.append(pool_output)

    # Fully connected layer
    print('begin full_connection')
    with tf.name_scope("full_connection"):
        # Concatenate the pooled outputs of the three kernel sizes
        h_pool = tf.concat(pool_outputs, 3)
        num_filters_total = self.config.num_filters * len(self.config.filter_hs)
        # The following fully connected layer maps to one output per class and
        # expects a 2-D input, so flatten first
        h_pool_flaten = tf.reshape(h_pool, [-1, num_filters_total])
        h_drop = tf.nn.dropout(h_pool_flaten, self.keep_prob)
        # Classifier
        W = tf.Variable(tf.truncated_normal([num_filters_total, self.config.num_classes]))
        self.l2_loss = tf.nn.l2_loss(W)
        b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name="b")
        self.y_pred = tf.nn.xw_plus_b(h_drop, W, b, name="scores")  # wx + b
        # Probability output, kept for when class probabilities are needed
        self.softmax_y = tf.nn.softmax(self.y_pred, name="softmaxy")
        # Predicted class
        self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)

    with tf.name_scope('optimize'):
        # The loss op applies softmax internally, so it must receive the raw logits;
        # otherwise softmax would effectively be applied twice
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.input_y, logits=self.y_pred)
        # self.losses = tf.reduce_mean(cross_entropy)
        # Add L2 regularisation
        self.losses = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
        # Optimizer
        self.optim = tf.train.AdamOptimizer(self.config.init_learning_rate).minimize(self.losses)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(self.pred_label, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
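# tcnn() calls W_generate() and bias_generate(), which are project helpers not shown
# in this section. A minimal sketch of conventional implementations; treat them as
# illustrative assumptions rather than the author's exact code.
def W_generate(shape):
    """Convolution kernel initialised from a truncated normal distribution."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_generate(num_filters):
    """Small constant bias, one value per filter."""
    return tf.Variable(tf.constant(0.1, shape=[num_filters]))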