예제 #1
0
파일: RTHN.py 프로젝트: Renren-yu/RTHN
def run():
    """Build the graph and run 10-fold cross-validation with assist pre-training.

    The arrays returned by ``func.load_data()`` are split with
    ``KFold(n_splits=10)``. For every fold all variables are re-initialised,
    each of the ``FLAGS.n_layers - 1`` assist predictions is trained with its
    own optimizer, and then the main loss is trained for
    ``FLAGS.training_iter`` epochs, evaluating on the held-out fold after
    every epoch.

    Side effects:
        Redirects ``sys.stdout`` to ``FLAGS.log_file_name`` when that flag is
        set, resets the default TensorFlow graph and prints progress.

    Returns:
        tuple: ``(p, r, f1)`` -- precision, recall and F1 taken at each fold's
        best-F1 test epoch and averaged over the folds.
    """
    if FLAGS.log_file_name:
        # NOTE(review): stdout stays redirected and the file is never closed;
        # left unchanged because code running after run() may rely on it.
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()
    localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("***********localtime: ", localtime)
    (x_data, y_data, sen_len_data, doc_len_data, word_distance,
     word_embedding, pos_embedding) = func.load_data()

    word_embedding = tf.constant(word_embedding,
                                 dtype=tf.float32,
                                 name='word_embedding')
    pos_embedding = tf.constant(pos_embedding,
                                dtype=tf.float32,
                                name='pos_embedding')
    print('build model...')

    start_time = time.time()
    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class])
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    doc_len = tf.placeholder(tf.int32, [None])
    word_dis = tf.placeholder(tf.int32,
                              [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    keep_prob1 = tf.placeholder(tf.float32)
    keep_prob2 = tf.placeholder(tf.float32)
    # Feed lists below are zipped against this list, so the order matters.
    placeholders = [x, y, sen_len, doc_len, word_dis, keep_prob1, keep_prob2]

    pred, reg, pred_assist_list, reg_assist_list = build_model(
        x, sen_len, doc_len, word_dis, word_embedding, pos_embedding,
        keep_prob1, keep_prob2)

    n_assist = FLAGS.n_layers - 1

    with tf.name_scope('loss'):
        # The summed cross-entropy is divided by the sum of the fed doc_len.
        valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
        # NOTE(review): tf.log(pred) is not clipped, so an exact zero in pred
        # would produce inf/NaN in the loss -- confirm build_model avoids it.
        loss_op = -tf.reduce_sum(
            y * tf.log(pred)) / valid_num + reg * FLAGS.l2_reg
        loss_assist_list = []
        for i in range(n_assist):
            loss_assist = -tf.reduce_sum(
                y * tf.log(pred_assist_list[i])
            ) / valid_num + reg_assist_list[i] * FLAGS.l2_reg
            loss_assist_list.append(loss_assist)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=FLAGS.lr_main).minimize(loss_op)
        optimizer_assist_list = []
        for i in range(n_assist):
            # Only the first assist prediction uses FLAGS.lr_assist; the
            # remaining ones share the main learning rate.
            assist_lr = FLAGS.lr_assist if i == 0 else FLAGS.lr_main
            optimizer_assist_list.append(
                tf.train.AdamOptimizer(learning_rate=assist_lr).minimize(
                    loss_assist_list[i]))

    true_y_op = tf.argmax(y, 2)
    pred_y_op = tf.argmax(pred, 2)
    pred_y_assist_op_list = [
        tf.argmax(pred_assist_list[i], 2) for i in range(n_assist)
    ]

    print('build model done!\n')
    prob_list_pr, y_label = [], []
    # Training Code Block
    print_training_info()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        kf, fold = KFold(n_splits=10), 1
        p_list, r_list, f1_list = [], [], []
        for train, test in kf.split(x_data):
            fold_arrays = [
                x_data, y_data, sen_len_data, doc_len_data, word_distance
            ]
            tr_x, tr_y, tr_sen_len, tr_doc_len, tr_word_dis = [
                arr[train] for arr in fold_arrays
            ]
            te_x, te_y, te_sen_len, te_doc_len, te_word_dis = [
                arr[test] for arr in fold_arrays
            ]

            precision_list, recall_list, FF1_list = [], [], []
            pre_list, true_list, pre_list_prob = [], [], []

            sess.run(tf.global_variables_initializer())
            print('############# fold {} ###############'.format(fold))
            fold += 1
            # Reset every best-epoch metric for this fold. Previously only
            # max_f1 was reset, so a fold whose F1 never exceeded 0 raised
            # NameError (first fold) or reused the previous fold's p/r.
            max_acc, max_p, max_r, max_f1 = 0.0, 0.0, 0.0, 0.0
            print('train docs: {}    test docs: {}'.format(
                len(tr_y), len(te_y)))

            # ********* GP: train each assist prediction in turn *********
            for layer in range(n_assist):
                if layer == 0:
                    training_iter = FLAGS.training_iter
                else:
                    training_iter = FLAGS.training_iter - 5
                for i in range(training_iter):
                    step = 1
                    for batch, _ in get_batch_data(tr_x, tr_y, tr_sen_len,
                                                   tr_doc_len, tr_word_dis,
                                                   FLAGS.keep_prob1,
                                                   FLAGS.keep_prob2,
                                                   FLAGS.batch_size):
                        _, loss, pred_y, true_y, pred_prob, doc_len_batch = sess.run(
                            [
                                optimizer_assist_list[layer],
                                loss_assist_list[layer],
                                pred_y_assist_op_list[layer], true_y_op,
                                pred_assist_list[layer], doc_len
                            ],
                            feed_dict=dict(zip(placeholders, batch)))
                        acc_assist, p_assist, r_assist, f1_assist = func.acc_prf(
                            pred_y, true_y, doc_len_batch)
                        if step % 10 == 0:
                            print(
                                'GL{}: epoch {}: step {}: loss {:.4f} acc {:.4f}'
                                .format(layer + 1, i + 1, step, loss,
                                        acc_assist))
                        step = step + 1

            # ********* Train the main loss *********
            for epoch in range(FLAGS.training_iter):
                step = 1
                for batch, _ in get_batch_data(tr_x, tr_y, tr_sen_len,
                                               tr_doc_len, tr_word_dis,
                                               FLAGS.keep_prob1,
                                               FLAGS.keep_prob2,
                                               FLAGS.batch_size):
                    _, loss, pred_y, true_y, pred_prob, doc_len_batch = sess.run(
                        [
                            optimizer, loss_op, pred_y_op, true_y_op, pred,
                            doc_len
                        ],
                        feed_dict=dict(zip(placeholders, batch)))
                    acc, p, r, f1 = func.acc_prf(pred_y, true_y, doc_len_batch)
                    if step % 5 == 0:
                        print(
                            'epoch {}: step {}: loss {:.4f} acc {:.4f}'.format(
                                epoch + 1, step, loss, acc))
                    step = step + 1

                # ********* Test on the held-out fold (dropout off) *********
                test_feed = [
                    te_x, te_y, te_sen_len, te_doc_len, te_word_dis, 1., 1.
                ]
                loss, pred_y, true_y, pred_prob = sess.run(
                    [loss_op, pred_y_op, true_y_op, pred],
                    feed_dict=dict(zip(placeholders, test_feed)))

                end_time = time.time()

                true_list.append(true_y)
                pre_list.append(pred_y)
                pre_list_prob.append(pred_prob)

                acc, p, r, f1 = func.acc_prf(pred_y, true_y, te_doc_len)
                precision_list.append(p)
                recall_list.append(r)
                FF1_list.append(f1)
                if f1 > max_f1:
                    max_acc, max_p, max_r, max_f1 = acc, p, r, f1
                print(
                    '\ntest: epoch {}: loss {:.4f} acc {:.4f}\np: {:.4f} r: {:.4f} f1: {:.4f} max_f1 {:.4f}\n'
                    .format(epoch + 1, loss, acc, p, r, f1, max_f1))

            _, maxIndex = func.maxS(FF1_list)
            print("maxIndex:", maxIndex)
            print('Optimization Finished!\n')
            # Probabilities of the epoch selected by func.maxS; true_y comes
            # from the last test run, which fed the same te_y every epoch.
            pred_prob = pre_list_prob[maxIndex]

            for doc_idx in range(pred_y.shape[0]):
                for clause_idx in range(te_doc_len[doc_idx]):
                    prob_list_pr.append(pred_prob[doc_idx][clause_idx][1])
                    y_label.append(true_y[doc_idx][clause_idx])

            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)
        print("running time: ", str((end_time - start_time) / 60.))
        print_training_info()
        p, r, f1 = map(lambda x: np.array(x).mean(), [p_list, r_list, f1_list])

        print("f1_score in 10 fold: {}\naverage : {} {} {}\n".format(
            np.array(f1_list).reshape(-1, 1), round(p, 4), round(r, 4),
            round(f1, 4)))
        return p, r, f1
예제 #2
0
def run():
    """Build the graph and evaluate the model with 10-fold cross-validation.

    The arrays returned by ``func.load_data()`` are split with
    ``KFold(n_splits=10)``. For every fold all variables are re-initialised
    and the model is trained for ``FLAGS.training_iter`` epochs, evaluating on
    the held-out fold after every epoch.

    Side effects:
        Redirects ``sys.stdout`` to ``FLAGS.log_file_name`` when that flag is
        set, resets the default TensorFlow graph and prints progress.

    Returns:
        tuple: ``(p, r, f1)`` -- precision, recall and F1 taken at each fold's
        best-F1 test epoch and averaged over the folds.
    """
    if FLAGS.log_file_name:
        # NOTE(review): stdout stays redirected and the file is never closed;
        # left unchanged because code running after run() may rely on it.
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()
    localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("***********localtime: ", localtime)
    (x_data, y_data, sen_len_data, doc_len_data, word_distance,
     word_embedding, pos_embedding) = func.load_data()

    word_embedding = tf.constant(word_embedding,
                                 dtype=tf.float32,
                                 name='word_embedding')
    pos_embedding = tf.constant(pos_embedding,
                                dtype=tf.float32,
                                name='pos_embedding')
    print('build model...')

    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    word_dis = tf.placeholder(tf.int32,
                              [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    doc_len = tf.placeholder(tf.int32, [None])
    keep_prob1 = tf.placeholder(tf.float32)
    keep_prob2 = tf.placeholder(tf.float32)
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class])
    # Feed lists below are zipped against this list, so the order matters.
    placeholders = [x, word_dis, sen_len, doc_len, keep_prob1, keep_prob2, y]

    with tf.name_scope('loss'):
        pred, reg = build_model(word_embedding, pos_embedding, x, word_dis,
                                sen_len, doc_len, keep_prob1, keep_prob2)
        # The summed cross-entropy is divided by the sum of the fed doc_len.
        valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
        # NOTE(review): tf.log(pred) is not clipped, so an exact zero in pred
        # would produce inf/NaN in the loss -- confirm build_model avoids it.
        loss_op = -tf.reduce_sum(
            y * tf.log(pred)) / valid_num + reg * FLAGS.l2_reg

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=FLAGS.learning_rate).minimize(loss_op)

    true_y_op = tf.argmax(y, 2)
    pred_y_op = tf.argmax(pred, 2)
    print('build model done!\n')

    # Training Code Block
    print_training_info()

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        kf, fold = KFold(n_splits=10), 1
        p_list, r_list, f1_list = [], [], []
        start_time = time.time()
        for train, test in kf.split(x_data):
            fold_arrays = [
                x_data, y_data, sen_len_data, doc_len_data, word_distance
            ]
            tr_x, tr_y, tr_sen_len, tr_doc_len, tr_word_dis = [
                arr[train] for arr in fold_arrays
            ]
            te_x, te_y, te_sen_len, te_doc_len, te_word_dis = [
                arr[test] for arr in fold_arrays
            ]

            precision_list, recall_list, FF1_list = [], [], []
            pre_list, true_list, pre_list_prob = [], [], []
            sess.run(tf.global_variables_initializer())
            print('############# fold {} ###############'.format(fold))
            fold += 1
            # Reset every best-epoch metric for this fold. Previously only
            # max_f1 was reset, so a fold whose F1 never exceeded 0 raised
            # NameError (first fold) or reused the previous fold's p/r.
            max_acc, max_p, max_r, max_f1 = 0.0, 0.0, 0.0, 0.0
            print('train docs: {}    test docs: {}'.format(
                len(tr_y), len(te_y)))
            for epoch in range(FLAGS.training_iter):
                step = 1
                # ************train************
                for batch, _ in get_batch_data(tr_x, tr_word_dis, tr_sen_len,
                                               tr_doc_len, FLAGS.keep_prob1,
                                               FLAGS.keep_prob2, tr_y,
                                               FLAGS.batch_size):
                    _, loss, pred_y, true_y, pred_prob, doc_len_batch = sess.run(
                        [
                            optimizer, loss_op, pred_y_op, true_y_op, pred,
                            doc_len
                        ],
                        feed_dict=dict(zip(placeholders, batch)))
                    acc, p, r, f1 = func.acc_prf(pred_y, true_y, doc_len_batch)
                    if step % 5 == 0:
                        print(
                            'epoch {}: step {}: loss {:.4f} acc {:.4f}'.format(
                                epoch + 1, step, loss, acc))
                    step = step + 1

                # ************test (dropout off)************
                test_feed = [
                    te_x, te_word_dis, te_sen_len, te_doc_len, 1., 1., te_y
                ]
                loss, pred_y, true_y, pred_prob = sess.run(
                    [loss_op, pred_y_op, true_y_op, pred],
                    feed_dict=dict(zip(placeholders, test_feed)))
                true_list.append(true_y)
                pre_list.append(pred_y)
                pre_list_prob.append(pred_prob)

                acc, p, r, f1 = func.acc_prf(pred_y, true_y, te_doc_len)
                precision_list.append(p)
                recall_list.append(r)
                FF1_list.append(f1)
                if f1 > max_f1:
                    max_acc, max_p, max_r, max_f1 = acc, p, r, f1
                print(
                    '\nepoch {}: loss {:.4f} acc {:.4f}\n\nnorectify: p {:.4f} r {:.4f} f1 {:.4f} max_f1 {:.4f}'
                    .format(epoch + 1, loss, acc, p, r, f1, max_f1))

            # The result is not used below; the call is kept in case
            # func.maxS has side effects that are not visible here.
            _, maxIndex = func.maxS(FF1_list)
            print('Optimization Finished!\n')
            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)

        end_time = time.time()
        print("running time: ", str((end_time - start_time) / 60.))

        print_training_info()
        p, r, f1 = map(lambda x: np.array(x).mean(), [p_list, r_list, f1_list])
        print("f1_score in 10 fold: {}\naverage : {} {} {}\n".format(
            np.array(f1_list).reshape(-1, 1), round(p, 4), round(r, 4),
            round(f1, 4)))

        return p, r, f1
예제 #3
0
    def New(self):
        """Pick a random document, run the restored model on it and show the result.

        A random id is drawn, the third comma-separated field of that line of
        ``./data/datacsv_2105.csv`` is shown in ``self.clause_edt``, and the
        checkpoint under ``run_final_ee/`` is restored to predict, per clause
        position, the ``pred_pos_op`` and ``pred_y_op`` outputs. Clauses of
        the document whose prediction equals 1 are looked up in
        ``./data/clause_keywords.csv`` and written to ``self.cause_edt``
        (from ``pred_y_op``) and ``self.emotion_edt`` (from ``pred_pos_op``).
        """
        self.cause_edt.clear()
        # NOTE(review): senID is used as a 1-based line number for linecache,
        # as an index into the arrays from func.load_data() and as the
        # document id in clause_keywords.csv -- confirm these numberings
        # agree and that index 2105 exists in the arrays.
        senID = random.randint(1, 2105)
        the_line = linecache.getline('./data/datacsv_2105.csv', senID)
        the_line = the_line.strip().split(',')
        the_line = the_line[2]

        self.clause_edt.setPlainText(the_line)

        # Build the single-document test input.
        (x_data, y_position_data, y_data, sen_len_data, doc_len_data,
         word_distance, word_distance_a, word_distance_e, word_embedding,
         pos_embedding, pos_embedding_a, pos_embedding_e) = func.load_data()

        print("senID:{}\n".format(senID))

        # np.newaxis adds the leading batch axis (batch of one document).
        te_x = x_data[senID]
        te_x = te_x[np.newaxis, :]
        te_sen_len = sen_len_data[senID]
        te_sen_len = te_sen_len[np.newaxis, :]
        te_doc_len = np.array([doc_len_data[senID]])
        te_word_dis = word_distance_e[senID]
        te_word_dis = te_word_dis[np.newaxis, :]

        print("te_x.shape:{}\n".format(te_x.shape))
        print("te_sen_len.shape:{}\n".format(te_sen_len.shape))
        print("te_doc_len:{}\n".format(te_doc_len))
        print("te_doc_len.shape:{}\n".format(te_doc_len.shape))
        print("te_word_dis.shape:{}\n".format(te_word_dis.shape))

        # Order must match the `placeholders` list built below; both keep
        # probabilities are fed as 1.
        test = [
            te_x, te_sen_len, te_doc_len, te_word_dis, 1., 1., word_embedding
        ]
        tf.reset_default_graph()

        with tf.Session() as sess:
            # Reload the trained model.
            saver = tf.train.import_meta_graph(
                'run_final_ee/model.ckpt-13.meta')
            model_file = tf.train.latest_checkpoint('run_final_ee/')
            saver.restore(sess, model_file)

            tenboard_dir = './tensorboard/RTHN_EE'
            graph = tf.get_default_graph()
            writer = tf.summary.FileWriter(tenboard_dir, graph)
            writer.add_graph(sess.graph)
            # Flush and release the event file; it was previously left open.
            writer.close()

            x = graph.get_tensor_by_name("x:0")
            word_dis = graph.get_tensor_by_name("word_dis:0")
            sen_len = graph.get_tensor_by_name("sen_len:0")
            doc_len = graph.get_tensor_by_name("doc_len:0")
            keep_prob1 = graph.get_tensor_by_name("keep_prob1:0")
            keep_prob2 = graph.get_tensor_by_name("keep_prob2:0")
            pred_y_op = graph.get_tensor_by_name("pred_y_op:0")
            pred_pos_op = graph.get_tensor_by_name("pred_pos_op:0")
            word_embedding_t = graph.get_tensor_by_name("word_embedding:0")

            placeholders = [
                x, sen_len, doc_len, word_dis, keep_prob1, keep_prob2,
                word_embedding_t
            ]

            # Feed the test input to the restored model.
            pred_pos, pred_y = sess.run([pred_pos_op, pred_y_op],
                                        feed_dict=dict(zip(placeholders,
                                                           test)))
            # Flatten the single-document batch; reshape(-1) gives the same
            # result as the former hard-coded reshape(75, ) without pinning
            # the document length.
            pred_pos = pred_pos.reshape(-1)
            pred_y = pred_y.reshape(-1)
            print("pred_pos:{}".format(pred_pos))
            print("pred_y:{}".format(pred_y))
            emo_ind = np.argwhere(pred_pos == 1)[:, 0]
            cla_ind = np.argwhere(pred_y == 1)[:, 0]
            print("emo_ind:{}".format(emo_ind))
            print("cla_ind:{}".format(cla_ind))
            emo_num = len(emo_ind)
            cau_num = len(cla_ind)
            print("emo_num:{}".format(emo_num))
            print("cau_num:{}".format(cau_num))

            if cau_num == 0:
                self.cause_edt.setPlainText("该句子中不存在原因子句")
            else:
                # Use the prediction to look up the cause clauses.
                self.cause_edt.clear()
                clause_all = []
                with codecs.open('./data/clause_keywords.csv', 'r',
                                 'utf-8') as clause_file:
                    for line in clause_file.readlines():
                        line = line.strip().split(',')
                        sen_id, cla_id, clause = int(line[0]), int(
                            line[1]), line[-1]
                        if sen_id == senID:
                            clause_all.append(clause.replace(' ', ''))
                for clause_pos in cla_ind:
                    # Skip predicted positions beyond the clauses found for
                    # this document instead of raising IndexError.
                    if clause_pos < len(clause_all):
                        self.cause_edt.append(clause_all[clause_pos])

            if emo_num == 0:
                self.emotion_edt.setPlainText("该句子中不存在情感子句")
            else:
                # Use the prediction to look up the emotion clauses.
                self.emotion_edt.clear()
                # cla_id in the file is compared against predicted index + 1.
                # The old loop iterated j but indexed emo_ind[i] with a stale
                # i, so it matched the wrong entry or raised IndexError.
                emo_clause_ids = {int(pos) + 1 for pos in emo_ind}
                with codecs.open('./data/clause_keywords.csv', 'r',
                                 'utf-8') as clause_file:
                    for line in clause_file.readlines():
                        line = line.strip().split(',')
                        sen_id, cla_id, clause = int(line[0]), int(
                            line[1]), line[-1]
                        if sen_id == senID and cla_id in emo_clause_ids:
                            self.emotion_edt.append(clause.replace(' ', ''))
예제 #4
0
def run():
    """Build the graph and evaluate the model with 10-fold cross-validation.

    The arrays returned by ``func.load_data()`` are split with
    ``KFold(n_splits=10)``. For every fold all variables are re-initialised
    and the model is trained for ``FLAGS.training_iter`` epochs, evaluating on
    the held-out fold after every epoch. Placeholders and the two argmax ops
    are given explicit names ("x", "y", "sen_len", "doc_len", "word_dis",
    "keep_prob1", "keep_prob2", "true_y_op", "pred_y_op"). Checkpoint saving
    and TensorBoard graph writing are not performed here.

    Side effects:
        Redirects ``sys.stdout`` to ``FLAGS.log_file_name`` when that flag is
        set, resets the default TensorFlow graph and prints progress.

    Returns:
        tuple: ``(p, r, f1)`` -- precision, recall and F1 taken at each fold's
        best-F1 test epoch and averaged over the folds.
    """
    if FLAGS.log_file_name:
        # NOTE(review): stdout stays redirected and the file is never closed;
        # left unchanged because code running after run() may rely on it.
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()
    localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("***********localtime: ", localtime)
    # TODO: word_distance should be replaced with our own computed result.
    (x_data, y_position_data, y_data, sen_len_data, doc_len_data,
     word_distance, word_distance_a, word_distance_e, word_embedding,
     pos_embedding, pos_embedding_a, pos_embedding_ap) = func.load_data()

    word_embedding = tf.constant(word_embedding,
                                 dtype=tf.float32,
                                 name='word_embedding')
    pos_embedding = tf.constant(pos_embedding,
                                dtype=tf.float32,
                                name='pos_embedding')
    print('build model...')

    start_time = time.time()

    # Define the placeholders.
    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len],
                       name="x")
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class],
                       name="y")
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len],
                             name="sen_len")
    doc_len = tf.placeholder(tf.int32, [None], name="doc_len")
    word_dis = tf.placeholder(tf.int32,
                              [None, FLAGS.max_doc_len, FLAGS.max_sen_len],
                              name="word_dis")
    keep_prob1 = tf.placeholder(tf.float32, name="keep_prob1")
    keep_prob2 = tf.placeholder(tf.float32, name="keep_prob2")
    # Feed lists below are zipped against this list, so the order matters.
    placeholders = [x, y, sen_len, doc_len, word_dis, keep_prob1, keep_prob2]

    pred, reg = build_model(x, sen_len, doc_len, word_dis, word_embedding,
                            pos_embedding, keep_prob1, keep_prob2)

    with tf.name_scope('loss'):
        # The summed cross-entropy is divided by the sum of the fed doc_len.
        valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
        # NOTE(review): tf.log(pred) is not clipped, so an exact zero in pred
        # would produce inf/NaN in the loss -- confirm build_model avoids it.
        loss_op = -tf.reduce_sum(
            y * tf.log(pred)) / valid_num + reg * FLAGS.l2_reg

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=FLAGS.lr_main).minimize(loss_op)

    true_y_op = tf.argmax(y, 2, name="true_y_op")
    pred_y_op = tf.argmax(pred, 2, name="pred_y_op")

    print('build model done!\n')

    # ---- training and validation ----
    prob_list_pr, y_label = [], []
    # Training Code Block
    print_training_info()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        kf, fold = KFold(n_splits=10), 1  # 10-fold cross-validation
        p_list, r_list, f1_list = [], [], []
        for train, test in kf.split(x_data):
            fold_arrays = [
                x_data, y_data, sen_len_data, doc_len_data, word_distance
            ]
            tr_x, tr_y, tr_sen_len, tr_doc_len, tr_word_dis = [
                arr[train] for arr in fold_arrays
            ]
            te_x, te_y, te_sen_len, te_doc_len, te_word_dis = [
                arr[test] for arr in fold_arrays
            ]
            precision_list, recall_list, FF1_list = [], [], []
            pre_list, true_list, pre_list_prob = [], [], []

            sess.run(tf.global_variables_initializer())
            print('############# fold {} ###############'.format(fold))
            fold += 1
            # Reset every best-epoch metric for this fold. Previously only
            # max_f1 was reset, so a fold whose F1 never exceeded 0 raised
            # NameError (first fold) or reused the previous fold's p/r.
            max_acc, max_p, max_r, max_f1 = 0.0, 0.0, 0.0, 0.0
            print('train docs: {}    test docs: {}'.format(
                len(tr_y), len(te_y)))

            # ********* Train *********
            for epoch in range(FLAGS.training_iter):
                step = 1
                for batch, _ in get_batch_data(tr_x, tr_y, tr_sen_len,
                                               tr_doc_len, tr_word_dis,
                                               FLAGS.keep_prob1,
                                               FLAGS.keep_prob2,
                                               FLAGS.batch_size):
                    _, loss, pred_y, true_y, pred_prob, doc_len_batch = sess.run(
                        [
                            optimizer, loss_op, pred_y_op, true_y_op, pred,
                            doc_len
                        ],
                        feed_dict=dict(zip(placeholders, batch)))
                    acc, p, r, f1 = func.acc_prf(pred_y, true_y, doc_len_batch)
                    step = step + 1

                # ********* Test on the held-out fold (dropout off) *********
                test_feed = [
                    te_x, te_y, te_sen_len, te_doc_len, te_word_dis, 1., 1.
                ]
                loss, pred_y, true_y, pred_prob = sess.run(
                    [loss_op, pred_y_op, true_y_op, pred],
                    feed_dict=dict(zip(placeholders, test_feed)))

                end_time = time.time()

                true_list.append(true_y)
                pre_list.append(pred_y)
                pre_list_prob.append(pred_prob)

                # Compute accuracy, precision, recall and F1.
                acc, p, r, f1 = func.acc_prf(pred_y, true_y, te_doc_len)
                precision_list.append(p)
                recall_list.append(r)
                FF1_list.append(f1)
                if f1 > max_f1:
                    max_acc, max_p, max_r, max_f1 = acc, p, r, f1

            _, maxIndex = func.maxS(FF1_list)
            # Probabilities of the epoch selected by func.maxS; true_y comes
            # from the last test run, which fed the same te_y every epoch.
            pred_prob = pre_list_prob[maxIndex]

            for doc_idx in range(pred_y.shape[0]):
                for clause_idx in range(te_doc_len[doc_idx]):
                    prob_list_pr.append(pred_prob[doc_idx][clause_idx][1])
                    y_label.append(true_y[doc_idx][clause_idx])

            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)
        print("running time: ", str((end_time - start_time) / 60.))
        print_training_info()
        p, r, f1 = map(lambda x: np.array(x).mean(), [p_list, r_list, f1_list])

        return p, r, f1
예제 #5
0
파일: ecjc.py 프로젝트: LeMei/ecjd
def run():
    """Build the graph, then train and evaluate it with 10-fold cross-validation.

    For every fold the variables are re-initialised, the auxiliary emotion
    and cause heads are pre-trained layer by layer, and then the main head
    is trained for ``FLAGS.training_iter`` epochs. The test split of the
    fold is evaluated after every epoch and the precision / recall / F1 of
    the epoch with the highest F1 are recorded for that fold.

    Returns:
        tuple: ``(p, r, f1)`` -- the means over all folds of the recorded
        per-fold precision, recall and F1.
    """
    if FLAGS.log_file_name:
        # The handle is intentionally not closed here: it replaces
        # sys.stdout so that every later print() lands in the log file.
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()
    localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("***********localtime: ", localtime)
    (x_data, y_data, y_emotion_data, y_cause_data, sen_len_data, doc_len_data,
     word_embedding, doc_id_data, idx_word_dict, clause_position_data,
     embedding_pos, _) = func.load_data()

    word_embedding = tf.constant(word_embedding,
                                 dtype=tf.float32,
                                 name='word_embedding')
    embedding_pos = tf.constant(embedding_pos,
                                dtype=tf.float32,
                                name='embedding_pos')

    print('build model...')

    start_time = time.time()
    # Fallback so the final "running time" report cannot hit an unbound
    # name when no evaluation epoch is executed.
    end_time = start_time

    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class])
    y_emotion = tf.placeholder(
        tf.float32, [None, FLAGS.max_doc_len, FLAGS.emotion_n_class])
    y_cause = tf.placeholder(tf.float32,
                             [None, FLAGS.max_doc_len, FLAGS.cause_n_class])
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    doc_len = tf.placeholder(tf.int32, [None])
    clause_position = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    keep_prob1 = tf.placeholder(tf.float32)
    keep_prob2 = tf.placeholder(tf.float32)

    # The three feed layouts differ only in which label tensor is fed.
    placeholders_emotion_assist = [
        x, y_emotion, sen_len, doc_len, clause_position, keep_prob1, keep_prob2
    ]
    placeholders_cause_assist = [
        x, y_cause, sen_len, doc_len, clause_position, keep_prob1, keep_prob2
    ]
    placeholders_main = [
        x, y, sen_len, doc_len, clause_position, keep_prob1, keep_prob2
    ]

    (pred, reg, pred_emotion_assist_list, reg_emotion_assist_list,
     pred_cause_assist_list, reg_cause_assist_list) = build_model(
         x, sen_len, doc_len, word_embedding, clause_position, embedding_pos,
         keep_prob1, keep_prob2)

    n_assist = FLAGS.assist_n_layers - 1

    with tf.name_scope('loss'):
        # Summed cross-entropy divided by the total of doc_len in the batch,
        # plus the L2 term returned by build_model.
        # NOTE(review): tf.log(pred) yields -inf/NaN if a probability
        # reaches exactly 0 -- confirm build_model guards against that.
        valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
        loss_op = -tf.reduce_sum(
            y * tf.log(pred)) / valid_num + reg * FLAGS.l2_reg
        loss_emotion_assist_list, loss_cause_assist_list = [], []
        for i in range(n_assist):
            loss_emotion_assist = -tf.reduce_sum(
                y_emotion * tf.log(pred_emotion_assist_list[i])
            ) / valid_num + reg_emotion_assist_list[i] * FLAGS.l2_reg
            loss_emotion_assist_list.append(loss_emotion_assist)

            loss_cause_assist = -tf.reduce_sum(
                y_cause * tf.log(pred_cause_assist_list[i])
            ) / valid_num + reg_cause_assist_list[i] * FLAGS.l2_reg
            loss_cause_assist_list.append(loss_cause_assist)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=FLAGS.lr_main).minimize(loss_op)
        optimizer_emotion_assist_list, optimizer_cause_assist_list = [], []
        for i in range(n_assist):
            # Emotion head: lr_assist for the first layer, lr_main afterwards.
            emotion_lr = FLAGS.lr_assist if i == 0 else FLAGS.lr_main
            optimizer_emotion_assist_list.append(
                tf.train.AdamOptimizer(learning_rate=emotion_lr).minimize(
                    loss_emotion_assist_list[i]))
            # NOTE(review): the cause head uses lr_assist for every layer,
            # unlike the emotion head above. The original if/else branches
            # were identical for this head; behaviour is kept as-is, but
            # confirm whether lr_main was intended for layers after the first.
            optimizer_cause_assist_list.append(
                tf.train.AdamOptimizer(
                    learning_rate=FLAGS.lr_assist).minimize(
                        loss_cause_assist_list[i]))

    true_y_op = tf.argmax(y, 2)
    pred_y_op = tf.argmax(pred, 2)
    emotion_true_y_op = tf.argmax(y_emotion, 2)
    cause_true_y_op = tf.argmax(y_cause, 2)
    pred_y_emotion_assist_op_list, pred_y_cause_assist_op_list = [], []
    for i in range(n_assist):
        pred_y_emotion_assist_op_list.append(
            tf.argmax(pred_emotion_assist_list[i], 2))
        pred_y_cause_assist_op_list.append(
            tf.argmax(pred_cause_assist_list[i], 2))

    # Built once; re-run at the start of every fold to reset all variables.
    init_op = tf.global_variables_initializer()

    print('build model done!\n')

    prob_list_pr, y_label = [], []
    # Training Code Block
    print_training_info()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # Arrays that are split along their first axis for every fold.
    fold_sources = [
        x_data, y_data, y_emotion_data, y_cause_data, sen_len_data,
        doc_len_data, clause_position_data
    ]
    with tf.Session(config=tf_config) as sess:
        kf, fold = KFold(n_splits=10), 1
        p_list, r_list, f1_list = [], [], []
        for train_idx, test_idx in kf.split(x_data):
            (tr_x, tr_y, tr_y_emotion, tr_y_cause, tr_sen_len, tr_doc_len,
             tr_clause_pos) = [data[train_idx] for data in fold_sources]
            (te_x, te_y, te_y_emotion, te_y_cause, te_sen_len, te_doc_len,
             te_clause_pos) = [data[test_idx] for data in fold_sources]

            FF1_list, pre_list_prob = [], []
            sess.run(init_op)
            print('############# fold {} ###############'.format(fold))
            fold += 1
            # Reset the best-epoch metrics for every fold. Without this a
            # fold whose F1 never exceeds 0 would report the previous fold's
            # precision/recall (or raise NameError on the first fold).
            max_p, max_r, max_f1 = 0.0, 0.0, 0.0
            emo_max_f1 = 0.0
            cause_max_f1 = 0.0
            print('train docs: {}    test docs: {}'.format(
                len(tr_y), len(te_y)))

            # Pre-train the auxiliary heads layer by layer. Layers after
            # the first get five fewer epochs than FLAGS.training_iter.
            for layer in range(n_assist):
                if layer == 0:
                    training_iter = FLAGS.training_iter
                else:
                    training_iter = FLAGS.training_iter - 5
                for i in range(training_iter):
                    # Emotion head: train one epoch, then evaluate.
                    emo_max_f1 = _pretrain_assist_epoch(
                        sess, 'Emotion', layer, i,
                        get_emotion_batch_data(
                            tr_x, tr_y_emotion, tr_sen_len, tr_doc_len,
                            tr_clause_pos, FLAGS.keep_prob1, FLAGS.keep_prob2,
                            FLAGS.batch_size),
                        [
                            te_x, te_y_emotion, te_sen_len, te_doc_len,
                            te_clause_pos, 1., 1.
                        ],
                        placeholders_emotion_assist,
                        optimizer_emotion_assist_list[layer],
                        loss_emotion_assist_list[layer],
                        pred_y_emotion_assist_op_list[layer],
                        emotion_true_y_op, doc_len, emo_max_f1)
                    # Cause head: train one epoch, then evaluate.
                    cause_max_f1 = _pretrain_assist_epoch(
                        sess, 'Cause', layer, i,
                        get_cause_batch_data(
                            tr_x, tr_y_cause, tr_sen_len, tr_doc_len,
                            tr_clause_pos, FLAGS.keep_prob1, FLAGS.keep_prob2,
                            FLAGS.batch_size),
                        [
                            te_x, te_y_cause, te_sen_len, te_doc_len,
                            te_clause_pos, 1., 1.
                        ],
                        placeholders_cause_assist,
                        optimizer_cause_assist_list[layer],
                        loss_cause_assist_list[layer],
                        pred_y_cause_assist_op_list[layer],
                        cause_true_y_op, doc_len, cause_max_f1)

            # Main task. Both keep_prob placeholders are fed 1.0 at test time.
            test_feed = dict(
                zip(placeholders_main, [
                    te_x, te_y, te_sen_len, te_doc_len, te_clause_pos, 1., 1.
                ]))
            for epoch in range(FLAGS.training_iter):
                step = 1
                for batch, _ in get_batch_data(tr_x, tr_y, tr_sen_len,
                                               tr_doc_len, tr_clause_pos,
                                               FLAGS.keep_prob1,
                                               FLAGS.keep_prob2,
                                               FLAGS.batch_size):
                    _, loss, pred_y, true_y, doc_len_batch = sess.run(
                        [optimizer, loss_op, pred_y_op, true_y_op, doc_len],
                        feed_dict=dict(zip(placeholders_main, batch)))
                    acc, p, r, f1 = func.acc_prf_multiclass(
                        pred_y, true_y, doc_len_batch)
                    if step % 5 == 0:
                        print(
                            'epoch {}: step {}: loss {:.4f} acc {:.4f} p {:.4}'
                            .format(epoch + 1, step, loss, acc, p))
                    step = step + 1

                # Evaluate on the held-out split after every epoch.
                loss, pred_y, true_y, pred_prob = sess.run(
                    [loss_op, pred_y_op, true_y_op, pred],
                    feed_dict=test_feed)

                end_time = time.time()

                pre_list_prob.append(pred_prob)

                acc, p, r, f1 = func.acc_prf_multiclass(
                    pred_y, true_y, te_doc_len)
                FF1_list.append(f1)
                if f1 > max_f1:
                    max_p, max_r, max_f1 = p, r, f1
                print(
                    '\ntest: epoch {}: loss {:.4f} acc {:.4f}\np: {:.4f} r: {:.4f} f1: {:.4f} max_f1 {:.4f}\n'
                    .format(epoch + 1, loss, acc, p, r, f1, max_f1))

            _, maxIndex = func.maxS(FF1_list)
            print("maxIndex:", maxIndex)
            print('Optimization Finished!\n')
            # Probabilities of the epoch selected by func.maxS.
            pred_prob = pre_list_prob[maxIndex]

            # Collect the class-1 probability and the gold label of every
            # clause position below each test document's doc_len. true_y is
            # the argmax of the same fed labels at every epoch, so using the
            # last epoch's copy is equivalent.
            for i in range(pred_y.shape[0]):
                for j in range(te_doc_len[i]):
                    prob_list_pr.append(pred_prob[i][j][1])
                    y_label.append(true_y[i][j])

            print("*********prob_list_pr", len(prob_list_pr))
            print("*********y_label", len(y_label))

            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)

        print("running time: ", str((end_time - start_time) / 60.))
        print_training_info()
        p, r, f1 = (np.array(scores).mean()
                    for scores in (p_list, r_list, f1_list))

        print("f1_score in 10 fold: {}\naverage : {} {} {}\n".format(
            np.array(f1_list).reshape(-1, 1), round(p, 4), round(r, 4),
            round(f1, 4)))
        return p, r, f1


def _pretrain_assist_epoch(sess, task, layer, epoch, batches, test_inputs,
                           placeholders, train_op, loss_t, pred_y_t, true_y_t,
                           doc_len_t, best_f1):
    """Run one pre-training epoch of an auxiliary head, then evaluate it.

    Args:
        sess: Session used for every ``run`` call.
        task: Label used in log lines, e.g. ``'Emotion'`` or ``'Cause'``; the
            evaluation line uses its lower-cased form.
        layer: Zero-based auxiliary layer index (logged as ``layer + 1``).
        epoch: Zero-based epoch index (logged as ``epoch + 1``).
        batches: Iterable yielding ``(feed_values, _)`` pairs; ``feed_values``
            is zipped with ``placeholders``.
        test_inputs: Feed values for the held-out split, in the same order
            as ``placeholders``.
        placeholders: Placeholders matching the feed value order.
        train_op: Optimizer op for this head and layer.
        loss_t: Loss tensor for this head and layer.
        pred_y_t: Predicted-label tensor for this head and layer.
        true_y_t: Gold-label tensor for this head.
        doc_len_t: Document-length placeholder, fetched back and handed to
            the metric function.
        best_f1: Best test F1 seen so far for this head.

    Returns:
        ``best_f1``, replaced by this epoch's test F1 when that is larger.
    """
    step = 1
    for batch, _ in batches:
        _, loss, pred_y, true_y, doc_len_batch = sess.run(
            [train_op, loss_t, pred_y_t, true_y_t, doc_len_t],
            feed_dict=dict(zip(placeholders, batch)))
        acc, p, _, _ = func.acc_prf_binary(pred_y, true_y, doc_len_batch)
        if step % 10 == 0:
            print(
                '{} {}: epoch {}: step {}: loss {:.4f} acc {:.4f} p {:.4f}'
                .format(task, layer + 1, epoch + 1, step, loss, acc, p))
        step += 1

    # Evaluation pass: same tensors, no optimizer op.
    loss, pred_y, true_y, doc_len_batch = sess.run(
        [loss_t, pred_y_t, true_y_t, doc_len_t],
        feed_dict=dict(zip(placeholders, test_inputs)))
    acc, p, r, f1 = func.acc_prf_binary(pred_y, true_y, doc_len_batch)
    if best_f1 < f1:
        best_f1 = f1
    print(
        '\n{}-test: epoch {}: loss {:.4f} acc {:.4f}\np: {:.4f} r: {:.4f} f1: {:.4f} max_f1 {:.4f}\n'
        .format(task.lower(), epoch + 1, loss, acc, p, r, f1, best_f1))
    return best_f1