Example #1
def backward_propagation():
    word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        FLAGS.file_location, FLAGS.w2v_model_location, FLAGS.words_location,
        FLAGS.psegs_location, False, FLAGS.sentence_length, FLAGS.vector_size,
        FLAGS.psegs_size)
    # Split into training and test sets
    print('begin building train/test sets')
    X_train = sen_index[:-30]
    X_test = sen_index[-30:]
    y_train = one_hot_label[:-30]
    y_test = one_hot_label[-30:]
    # First, the embedding layer maps word indices to word vectors
    with tf.name_scope("embedding"):
        input_x = tf.nn.embedding_lookup(word_vecs, x)
        # Note: only tf.expand_dims() can be used here, not np.expand_dims(), because no values have been fed yet
        # input_x = tf.expand_dims(input_x,-1)
    # Initialize the model
    res = transformer_encoder.Encoder().build(input_x)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # Fetch the data in batches
        for epoch in range(FLAGS.num_epochs):
            batch_train = tcnn_data_helper.batch_iter2(X_train, y_train, FLAGS.batch_size)
            total_batch = 0
            # Data that does not fill a complete batch is not used for training
            for x_batch, y_batch in batch_train:
                total_batch += 1
                feed_dict = {x: x_batch, y: y_batch, keep_prob: FLAGS.keep_prob}
                aa = sess.run(res, feed_dict=feed_dict)
                print(aa.shape)
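The function above feeds placeholders named x, y and keep_prob that are not defined anywhere in this listing. A minimal sketch of how they might be declared at module level is shown below; the shapes follow the FLAGS used in the function, and FLAGS.num_classes is an assumed entry that the original listing does not show.

import tensorflow as tf

# Hypothetical module-level graph inputs assumed by backward_propagation().
x = tf.placeholder(tf.int32, [None, FLAGS.sentence_length], name='input_x')
y = tf.placeholder(tf.float32, [None, FLAGS.num_classes], name='input_y')  # num_classes is an assumption
keep_prob = tf.placeholder(tf.float32, name='keep_prob')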
Example #2
def train_tes_model(model, transformer_config):
    # Configure TensorBoard; delete the tensorboard folder before retraining, otherwise the graphs will overwrite each other
    tensorboard_dir = 'tensorboard/transformer'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    # Delete any TensorBoard files that already exist
    else:
        file_list = os.listdir(tensorboard_dir)
        if len(file_list) > 0:
            for file in file_list:
                os.remove(os.path.join(tensorboard_dir, file))
    tf.summary.scalar("loss", model.losses)
    tf.summary.scalar("accuracy", model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    # Configure the Saver used to save the model
    saver = tf.train.Saver()
    if not os.path.exists(model_save_location):
        os.makedirs(model_save_location)
    # Load the training and test data
    start_time = time.time()
    _, sen_index, _, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        transformer_config.file_location,
        transformer_config.w2v_model_location,
        transformer_config.words_location, transformer_config.psegs_location,
        False, transformer_config.sentence_length,
        transformer_config.vector_size, transformer_config.pseg_size)
    X_train, X_test, X_pseg_train, X_pseg_test, y_train, y_test = train_test_split(
        sen_index, sen_pseg_index, one_hot_label, test_size=0.1)
    time_dif = get_time_dif(start_time)
    print("load data usage:", time_dif)
    print('Training and Testing...')
    start_time = time.time()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        writer.add_graph(sess.graph)
        # Fetch the data in batches
        for epoch in range(transformer_config.num_epochs):
            batch_train = tcnn_data_helper.batch_iter(
                X_train, X_pseg_train, y_train, transformer_config.batch_size)
            total_batch = 0
            for x_batch, x_pseg_batch, y_batch in batch_train:
                total_batch += 1
                feed_dict = {
                    model.input_x: x_batch,
                    model.input_x_pseg: x_pseg_batch,
                    model.input_y: y_batch,
                    model.keep_prob: transformer_config.keep_prob
                }
                if total_batch % transformer_config.save_per_batch == 0:
                    summary_str = sess.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(summary_str,
                                       total_batch)  # write the summary to file
                if total_batch % transformer_config.print_per_batch == 0:
                    train_accuracy = model.accuracy.eval(feed_dict=feed_dict)
                    print("Epoch %d:Step %d accuracy is %f" %
                          (epoch + 1, total_batch, train_accuracy))
                sess.run(model.optim, feed_dict=feed_dict)
        saver.save(sess, save_path)
        # After training, evaluate the model on the test set
        batch_train = tcnn_data_helper.batch_iter(
            X_test, X_pseg_test, y_test, transformer_config.batch_size)
        all_test_pred = []
        for x_batch, x_pseg_batch, y_batch in batch_train:
            test_pred = model.pred_label.eval(
                feed_dict={
                    model.input_x: x_batch,
                    model.input_x_pseg: x_pseg_batch,
                    model.input_y: y_batch,
                    model.keep_prob: 1.0
                })
            all_test_pred.extend(test_pred)
        test_label = np.argmax(y_test, 1)
        # Must be in the same order as the class labels the ids represent
        categories = [
            '教育', '时尚', '家居', '娱乐', '财经', '体育', '房产', '游戏', '时政', '科技'
        ]
        # Evaluation
        print("Precision, Recall and F1-Score...")
        print(
            classification_report(test_label,
                                  all_test_pred,
                                  target_names=categories))
        # Confusion matrix
        print("Confusion Matrix...")
        cm = confusion_matrix(test_label, all_test_pred)
        print(cm)
        time_dif = get_time_dif(start_time)
        print("train_and_test usage:", time_dif)
Example #3
    categories = ['教育', '时尚', '家居', '娱乐', '财经', '体育', '房产', '游戏', '时政', '科技']
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(
        classification_report(test_label,
                              all_test_pred,
                              target_names=categories))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = confusion_matrix(test_label, all_test_pred)
    print(cm)


if __name__ == '__main__':
    tcnn_config = Config()
    model = TcnnModel(tcnn_config)
    # Load the training and test data
    start_time = time.time()
    _, sen_index, _, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
        tcnn_config.file_location, tcnn_config.w2v_model_location,
        tcnn_config.words_location, tcnn_config.psegs_location, False,
        tcnn_config.sentence_length, tcnn_config.vector_size,
        tcnn_config.pseg_size)
    X_train, X_test, X_pseg_train, X_pseg_test, y_train, y_test = train_test_split(
        sen_index, sen_pseg_index, one_hot_label, test_size=0.1)
    time_dif = get_time_dif(start_time)
    print("load data usage:", time_dif)
    train_val_test_model((X_train, X_pseg_train, y_train),
                         (X_test, X_pseg_test, y_test),
                         (X_test, X_pseg_test, y_test), model, tcnn_config)
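get_time_dif() is called in several of these examples but never defined in them. A plausible helper is sketched below, assuming it simply reports elapsed wall-clock time as a timedelta; the project's actual version may differ.

import time
from datetime import timedelta

def get_time_dif(start_time):
    # Elapsed wall-clock time since start_time, rounded to whole seconds.
    end_time = time.time()
    return timedelta(seconds=int(round(end_time - start_time)))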
Example #4
 def rcnn(self):
     # Word-vector lookup
     word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
         self.config.file_location, self.config.w2v_model_location,
         self.config.words_location, self.config.psegs_location, False,
         self.config.sentence_length, self.config.vector_size,
         self.config.pseg_size)
     with tf.name_scope("embedding_lookup"):
         input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
         input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
         _input_x = tf.concat((input_x_content, input_x_pseg), axis=-1)
     print('begin rnn')
     with tf.name_scope("rnn"):
         lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(
             self.config.n_hidden, forget_bias=1.0)  # forward cell
         lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(
             self.config.n_hidden, forget_bias=1.0)  # backward cell
         outputs, outputs_states = tf.nn.bidirectional_dynamic_rnn(
             lstm_fw_cell, lstm_bw_cell, _input_x, dtype=tf.float32)
         # Concatenate each word's left/right context representations with the word itself as the CNN input
         outputs = tf.concat([outputs[0], _input_x, outputs[1]], 2)
         x = tf.expand_dims(outputs, -1)
     # Then apply convolution and pooling
     print('begin conve_pool')
     pool_outputs = []
     for filter_h in self.config.filter_hs:
         with tf.variable_scope('conv_pool_{}'.format(filter_h)):
             conv = tf.layers.conv2d(
                 x,
                 filters=self.config.num_filters,
                 kernel_size=(filter_h, self.config.vector_size +
                              self.config.pseg_size +
                              self.config.n_hidden * 2),
                 activation=tf.nn.relu,
                 use_bias=False,
                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                 name='conve')
             pooled = tf.nn.max_pool(conv,
                                     ksize=[
                                         1, self.config.sentence_length -
                                         filter_h + 1, 1, 1
                                     ],
                                     strides=[1, 1, 1, 1],
                                     padding='VALID',
                                     name='pool')
             pool_outputs.append(pooled)
     # Fully connected layer
     print('begin full_connection')
     with tf.name_scope("full_connection"):
         h_pool = tf.concat(pool_outputs, 3)  # concatenate the pooled outputs of the three kernel sizes
         num_filters_total = self.config.num_filters * len(
             self.config.filter_hs)
         # The following fully connected layer maps to one output per class and expects 2-D input, so flatten here
         h_pool_flaten = tf.reshape(h_pool, [-1, num_filters_total])
         h_drop = tf.nn.dropout(h_pool_flaten, self.keep_prob)
         # Classifier
         W = tf.Variable(
             tf.truncated_normal(
                 [num_filters_total, self.config.num_classes]))
         self.l2_loss = tf.nn.l2_loss(W)
         b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]),
                         name="b")
         self.y_pred = tf.nn.xw_plus_b(h_drop, W, b, name="scores")  # wx+b
         # Predicted class
         self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)
     with tf.name_scope('optimize'):
         # Compute the loss; softmax is applied internally, so pass the raw logits, otherwise softmax would effectively be applied twice
         cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
             labels=self.input_y, logits=self.y_pred)
         #self.losses = tf.reduce_mean(cross_entropy)
         # Add L2 regularization
         self.losses = tf.reduce_mean(
             cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
         # Optimizer
         self.optim = tf.train.AdamOptimizer(
             self.config.init_learning_rate).minimize(self.losses)
     with tf.name_scope('accuracy'):
         correct_prediction = tf.equal(self.pred_label,
                                       tf.argmax(self.input_y, 1))
         self.accuracy = tf.reduce_mean(
             tf.cast(correct_prediction, tf.float32))
 def transformer(self):
     # Embedding lookup
     word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
         self.config.file_location, self.config.w2v_model_location,
         self.config.words_location, self.config.psegs_location, False,
         self.config.sentence_length, self.config.vector_size,
         self.config.pseg_size)
     with tf.name_scope('embedding'):
         input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
         # print(tf.shape(self.input_x)[0])
         # Positional-encoding part
         with tf.variable_scope("positional_encoding"):
             # Returns a tensor of shape [self.config.sentence_length, self.config.vector_size]
             positional_embedding = positional_encoding(
                 self.config.vector_size, self.config.sentence_length)
             # Note: this dimension must not be hard-coded to self.config.batch_size, because the last
             # batch is usually smaller than batch_size unless the data size is exactly divisible by it.
             # It should equal dimension 0 of the fed data self.input_x (the actual batch size),
             # obtained with tf.shape(self.input_x)[0], not self.input_x.shape[0].
         positional_inputs = tf.tile(
             tf.range(0, self.config.sentence_length),
             [tf.shape(self.input_x)[0]])
         # Produces batch_size rows, each equal to [0, 1, 2, ..., self.config.sentence_length - 1]
         positional_inputs = tf.reshape(
             positional_inputs,
             [tf.shape(self.input_x)[0], self.config.sentence_length])
         # Add the positional information
         input_x_content_add_positional = tf.add(
             input_x_content,
             tf.nn.embedding_lookup(positional_embedding,
                                    positional_inputs))
         # print("shape test:",input_x_content.shape)
         input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
         _input_x = tf.concat(
             (input_x_content_add_positional, input_x_pseg), axis=-1)
     # Then extract features with the transformer encoder
     outputs = transformer_encoder.Encoder().build(_input_x)
     # Fully connected layer
     print('begin full_connection')
     with tf.name_scope("full_connection"):
         # Transpose, then take the output at the last time step
         outputs = tf.transpose(outputs, [1, 0, 2])
         f_input = outputs[-1]
         # Classifier
         W = tf.Variable(
             tf.truncated_normal([
                 self.config.vector_size + self.config.pseg_size,
                 self.config.num_classes
             ]))
         self.l2_loss = tf.nn.l2_loss(W)
         b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]),
                         name="b")
         self.y_pred = tf.nn.xw_plus_b(f_input, W, b, name="scores")  # wx+b
         # Predicted class
         self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)
     with tf.name_scope('optimize'):
         # Compute the loss; softmax is applied internally, so pass the raw logits, otherwise softmax would effectively be applied twice
         cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
             labels=self.input_y, logits=self.y_pred)
         self.losses = tf.reduce_mean(cross_entropy)
         # Add L2 regularization
         # self.losses = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda *self.l2_loss
         # Optimizer
         self.optim = tf.train.AdamOptimizer(
             self.config.init_learning_rate).minimize(self.losses)
     with tf.name_scope('accuracy'):
         correct_prediction = tf.equal(self.pred_label,
                                       tf.argmax(self.input_y, 1))
         self.accuracy = tf.reduce_mean(
             tf.cast(correct_prediction, tf.float32))
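The transformer() method above calls positional_encoding(vector_size, sentence_length), which is not part of this listing; its inline comment only says that it returns a tensor of shape [sentence_length, vector_size]. The sketch below uses the standard sinusoidal encoding from "Attention Is All You Need" to satisfy that contract; the project's actual helper may be implemented differently.

import numpy as np
import tensorflow as tf

def positional_encoding(vector_size, sentence_length):
    # Sinusoidal position embeddings of shape [sentence_length, vector_size].
    positions = np.arange(sentence_length)[:, np.newaxis]          # [T, 1]
    dims = np.arange(vector_size)[np.newaxis, :]                   # [1, D]
    angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / np.float32(vector_size))
    angles = positions * angle_rates                               # [T, D]
    encoding = np.zeros((sentence_length, vector_size), dtype=np.float32)
    encoding[:, 0::2] = np.sin(angles[:, 0::2])                    # even dimensions
    encoding[:, 1::2] = np.cos(angles[:, 1::2])                    # odd dimensions
    return tf.convert_to_tensor(encoding)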
Example #6
 def tcnn(self):
     # Word-vector lookup
     word_vecs, sen_index, psegs_vec, sen_pseg_index, one_hot_label = tcnn_data_helper.process_file(
         self.config.file_location, self.config.w2v_model_location,
         self.config.words_location, self.config.psegs_location, False,
         self.config.sentence_length, self.config.vector_size,
         self.config.pseg_size)
     with tf.device('/cpu:0'):
         input_x_content = tf.nn.embedding_lookup(word_vecs, self.input_x)
         input_x_pseg = tf.nn.embedding_lookup(psegs_vec, self.input_x_pseg)
         _input_x = tf.concat((input_x_content, input_x_pseg), axis=-1)
         x = tf.expand_dims(_input_x, -1)
     # Then apply the convolutions
     print('begin conve')
     with tf.name_scope("conve"):
         W_conv = []
         b_conv = []
         for filter_h in self.config.filter_hs:
             W_conv1 = W_generate([
                 filter_h, self.config.vector_size + self.config.pseg_size,
                 1, self.config.num_filters
             ])
             b_conv1 = bias_generate(self.config.num_filters)
             W_conv.append(W_conv1)
             b_conv.append(b_conv1)
         con_outputs = []
         # print(np.shape(input_x))
         for W, b in zip(W_conv, b_conv):
             con_output = tf.nn.relu(
                 tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='VALID'))
             # Each con_output has shape (batch size, sentence length - filter height + 1, 1, number of filters)
             # print('con_output',con_output.shape)
             con_outputs.append(con_output)
     # Then apply max pooling
     print('begin pool')
     with tf.name_scope("pool"):
         pool_outputs = []
         for con, filter_h in zip(con_outputs, self.config.filter_hs):
             pool_output = tf.nn.max_pool(
                 con,
                 ksize=[
                     1, self.config.sentence_length - filter_h + 1, 1, 1
                 ],
                 strides=[1, 1, 1, 1],
                 padding='VALID')
             # Each pool_output has shape (batch size, 1, 1, number of filters)
             # This max pooling acts like keyword extraction
             # print('pool_output',pool_output.shape)
             pool_outputs.append(pool_output)
     # Fully connected layer
     print('begin full_connection')
     with tf.name_scope("full_connection"):
         h_pool = tf.concat(pool_outputs, 3)  # concatenate the pooled outputs of the three kernel sizes
         num_filters_total = self.config.num_filters * len(
             self.config.filter_hs)
         # The following fully connected layer maps to one output per class and expects 2-D input, so flatten here
         h_pool_flaten = tf.reshape(h_pool, [-1, num_filters_total])
         h_drop = tf.nn.dropout(h_pool_flaten, self.keep_prob)
         # Classifier
         W = tf.Variable(
             tf.truncated_normal(
                 [num_filters_total, self.config.num_classes]))
         self.l2_loss = tf.nn.l2_loss(W)
         b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]),
                         name="b")
         self.y_pred = tf.nn.xw_plus_b(h_drop, W, b, name="scores")  # wx+b
         # Kept so probability outputs can be retrieved later
         self.softmax_y = tf.nn.softmax(self.y_pred,
                                        name="softmaxy")  # softmax(wx + b)
         # Predicted class
         self.pred_label = tf.argmax(tf.nn.softmax(self.y_pred), 1)
     with tf.name_scope('optimize'):
         # Compute the loss; softmax is applied internally, so pass the raw logits, otherwise softmax would effectively be applied twice
         cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
             labels=self.input_y, logits=self.y_pred)
         #self.losses = tf.reduce_mean(cross_entropy)
         # Add L2 regularization
         self.losses = tf.reduce_mean(
             cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
         # Optimizer
         self.optim = tf.train.AdamOptimizer(
             self.config.init_learning_rate).minimize(self.losses)
     with tf.name_scope('accuracy'):
         correct_prediction = tf.equal(self.pred_label,
                                       tf.argmax(self.input_y, 1))
         self.accuracy = tf.reduce_mean(
             tf.cast(correct_prediction, tf.float32))
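tcnn() depends on the helpers W_generate() and bias_generate(), which this listing does not include. A minimal sketch of how such initializer helpers are commonly written (truncated-normal weights, small constant bias) follows; the project's actual definitions may differ.

import tensorflow as tf

def W_generate(shape):
    # Weight tensor drawn from a truncated normal distribution.
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name='W')

def bias_generate(num_filters):
    # Bias vector initialized to a small positive constant.
    return tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')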