Example #1
def predict():
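    """Load the pickled vocab/label maps and the saved checkpoint, then classify
    sentences typed at the console in a loop (map_path, save_path and the model
    classes are assumed to be defined at module level)."""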
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}
    config = TRNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    model = TextRNN(config)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model
    while True:
        line = input("Enter a test sentence: ")
        data_id = [[
            word_to_id[x] for x in list(native_content(line))
            if x in word_to_id
        ]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        y_pred_cls = session.run(model.y_pred_cls,
                                 feed_dict={
                                     model.input_x: x_pad,
                                     model.keep_prob: 1.0
                                 })
        print('sentence: {}, predicted intent: {}'.format(
            line, id_to_cat[y_pred_cls[0]]))
Example #2
 def __init__(self):
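     """Build a TextRNN from the saved vocabulary and categories and restore the
     trained checkpoint into a fresh TensorFlow session."""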
     self.config = TRNNConfig()
     self.categories, self.cat_to_id = read_category()
     self.words, self.word_to_id = read_vocab(vocab_dir)
     self.config.vocab_size = len(self.words)
     self.model = TextRNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #3
    def __init__(self):
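        """Build a TextRNN using the token dictionaries stored as .npy files and
        restore the trained checkpoint."""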
        self.config = TRNNConfig()
        self.categories, self.cat_to_id = read_category()
        self.words = np.load('./datas/dict_token.npy')
        self.word_to_id = np.load('./datas/token_to_id.npy').tolist()
        self.config.vocab_size = len(self.words)
        self.model = TextRNN(self.config)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #4
    def __init__(self):
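        """Build a TextRNN from module-level vocabulary/category maps and restore
        the checkpoint from rnn_save_path into an existing session (sess)."""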
        self.config = TRNNConfig()
        self.categories = categories
        self.cat_to_id = cat_to_id
        self.words = words
        self.word_to_id = word_to_id
        self.config.vocab_size = len(self.words)
        self.model = TextRNN(self.config)

        self.session = sess
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=rnn_save_path)  # restore the saved model
Example #5
def init():
    """初始化模型"""
    rnn = TextRNN(embedding_dim=FLAGS.embedding_dim,
                  seq_length=FLAGS.seq_length,
                  num_classes=FLAGS.num_classes,
                  vocab_size=FLAGS.vocab_size,
                  num_layers=FLAGS.num_layers,
                  hidden_dim=FLAGS.hidden_dim,
                  rnn=FLAGS.rnn,
                  dropout_keep_prob=FLAGS.dropout_keep_prob,
                  learning_rate=FLAGS.learning_rate,
                  batch_size=FLAGS.batch_size,
                  num_epochs=FLAGS.num_epochs,
                  print_per_batch=FLAGS.print_per_batch,
                  save_per_batch=FLAGS.save_per_batch)
    return rnn
Example #6
 def __init__(self):
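     """Load the pickled maps, build a TextCNN or TextRNN depending on
     model_type, and restore the saved checkpoint."""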
     with open(map_path, "rb") as f:
         self.word_to_id, self.cat_to_id, self.seq_length, self.num_classes = pickle.load(
             f)
     self.id_to_cat = {v: k for k, v in self.cat_to_id.items()}
     if model_type == 'cnn':
         self.config = TCNNConfig()
         self.config.num_classes = self.num_classes
         self.config.vocab_size = len(self.word_to_id)
         self.model = TextCNN(self.config)
     else:
         self.config = TRNNConfig()
         self.config.num_classes = self.num_classes
         self.config.vocab_size = len(self.word_to_id)
         self.model = TextRNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #7
  def load_model(self):
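    """Build a TextRNN, restore the latest checkpoint found in self.model_dir,
    and keep the session plus vocabulary/category maps on the instance."""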
    sess = tf.Session()
    print('Configuring RNN model...')
    config = TRNNConfig()
    cnn_model = TextRNN(config)

    saver = tf.train.Saver()
    params_file = tf.train.latest_checkpoint(self.model_dir)
    saver.restore(sess, params_file)

    categories, cat_to_id = read_category()
    vocab_dir = 'cnews/cnews.vocab.txt'
    words, word_to_id = read_vocab(vocab_dir)

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id

    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
Example #8
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #9
    #                     default=path + '/valid2.csv')
    parser.add_argument('--test_data', type=str, default='data/test.csv')
    # parser.add_argument('--tensorboard_dir', type=str,
    #                     default=path + '/tensorboard')
    parser.add_argument('--save_path', type=str, default=path + '/model.ckpt')
    parser.add_argument('--word_file', type=str, default=path + '/words.csv')
    # parser.add_argument('--label_file', type=str,
    #                     default=path + '/labels.csv')
    parser.add_argument('--result', type=str, default=path + '/result.csv')
    FLAGS, unparser = parser.parse_known_args()

    # contents, labels, _ = read_data(FLAGS.train_data, sep=' ')
    # train_contents, valid_contents, train_labels, valid_labels = train_test_split(contents, labels, test_size=0.1,
    #                                                                               random_state=0)

    # valid_contents, valid_labels, _ = read_data(FLAGS.valid_data)

    # words, word2id, labels, label2id = word_to_id(train_contents, train_labels, FLAGS.vocab_size)
    # save_words(word2id, FLAGS.word_file)
    # save_labels(label2id, FLAGS.label_file)

    pred_contents, review_id, texts = read_data(FLAGS.test_data, sep=' ')
    model = TextRNN(FLAGS.embedding_size, FLAGS.hidden_layers,
                    FLAGS.hidden_units, FLAGS.number_classes,
                    FLAGS.learning_rate, FLAGS.sequence_length,
                    FLAGS.vocab_size)

    # train()
    # test()
    predict()
Example #10
    print("Time usage:", time_dif)


if __name__ == '__main__':

    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    conn = pymysql.connect(host='localhost',
                           user='******',
                           passwd='sasa',
                           db='text_clf',
                           charset='utf8')
    cur = conn.cursor()

    print('Configuring RNN model...')
    config = TRNNConfig()
    #if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #   build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_vec = read_category()
    #words, word_to_id = read_vocab(vocab_dir)
    #config.vocab_size = len(words)
    model = TextRNN(config, get_embedding())

    print('training')

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #11
File: run_rnn.py  Project: DMStudent/Nlp
def train():
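    """Train the TextRNN on queued input files, log loss/MSE to TensorBoard,
    checkpoint whenever the metrics improve, and stop early after a long
    stretch without improvement."""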
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    start_time = time.time()
    # config.batch_size = 10
    total_batch = 0  # total number of batches processed
    best_mse_val = 99999999  # best MSE on the validation set so far
    best_loss_val = 99999999  # best loss on the validation set so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 5000  # stop early if no improvement for more than 5000 batches
    count = 0
    tensorboard_dir = config.tensorboard_dir

    # configure how GPU memory is allocated
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        train_dir_list = os.listdir(config.train_dir_tf)
        train_dir_list = [
            os.path.join(config.train_dir_tf, i) for i in train_dir_list
        ]
        queueTrain = tf.train.string_input_producer(
            train_dir_list, num_epochs=config.num_epochs)
        title_len, title, label, frame_weight = read_example(queueTrain)

        title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [title_len, title, label, frame_weight],
            batch_size=config.batch_size,
            capacity=100000,
            num_threads=1)

        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(
                                   -1 * 1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("mse", model.mse)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

        fetches = [model.loss, model.mse]
        feed_dict = {}
        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        # configure the Saver
        saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1)
        if not config.retraining:
            saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                # Run training steps or whatever
                # titles, labels = sess.run([title_batch, label_batch])
                if total_batch % config.save_per_batch == 0:
                    # write training summaries to TensorBoard every save_per_batch batches
                    s = sess.run(merged_summary, feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # report loss and MSE every print_per_batch batches
                    loss_val, mse_val = sess.run(fetches, feed_dict)

                    if mse_val < best_mse_val or loss_val < best_loss_val:
                        # keep the best result so far
                        best_mse_val = mse_val
                        best_loss_val = loss_val
                        last_improved = total_batch
                        improved_str = '*'
                        # saver.save(sess=sess, save_path=config.save_path)
                        if total_batch % config.save_per_batch == 0:
                            saver.save(sess,
                                       config.save_path + '_%03d' %
                                       (total_batch / config.save_per_batch),
                                       write_meta_graph=False)
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Val Loss: {1:>6.5}, Mse: {2:>6.5}, Time: {3} {4}'
                    print(
                        msg.format(total_batch, loss_val, mse_val, time_dif,
                                   improved_str))
                    # print(embedding_inputs)

                sess.run(model.optim, feed_dict)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # no improvement on the validation metrics for a long time; stop training early
                    print("No optimization for a long time, auto-stopping...")
                    coord.request_stop()
                    break  # exit the loop

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
Example #12
File: run_rnn.py  Project: DMStudent/Nlp
def test():
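    """Run the trained TextRNN over the test queue and write each example's
    text together with its target and predicted scores to a file."""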
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    config.num_epochs = 1
    start_time = time.time()
    config.batch_size = 10
    count = 0

    # configure how GPU memory is allocated
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    fw = open(config.test_dir_output, "w")
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        test_dir_list = os.listdir(config.test_dir_tf)
        test_dir_list = [
            os.path.join(config.test_dir_tf, i) for i in test_dir_list
        ]
        queueTest = tf.train.string_input_producer(
            test_dir_list, num_epochs=config.num_epochs)
        text, title_len, title, label, frame_weight = read_example_test(
            queueTest)

        text_batch, title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [text, title_len, title, label, frame_weight],
            batch_size=config.batch_size,
            capacity=50000,
            num_threads=1)
        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(
                                   -1 * 1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        fetches = [text_batch, model.input_x_len, model.y_pred, model.input_y]
        feed_dict = {}
        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        # configure the Saver
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                texts, x_len, y_pred, y_test = sess.run(fetches,
                                                        feed_dict=feed_dict)
                texts = "".join(texts.values).split("\n")
                for i in range(len(texts) - 1):
                    score = [str(int(j * 100))
                             for j in y_test[i]][:x_len[i][0]]
                    y_test_i = " ".join(score)
                    score = [str(int(j * 100))
                             for j in y_pred[i]][:x_len[i][0]]
                    y_pred_i = " ".join(score)
                    fw.write(texts[i] + "\ttarget:\t" + y_test_i +
                             "\tpredict\t" + y_pred_i + "\n")
                count = count + 1
                if count % 10000 == 0:
                    print(count)

        except tf.errors.OutOfRangeError:
            print('Done testing -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
    fw.close()
Example #13
            -0.0, 0.0, (rnn.config.vocab_size, embedding_dim))
        count = 0
        for i in range(0, rnn.config.vocab_size):
            if rnn.words[i] in word_vector_map:
                sub_embeddings[i] = word_vector_map.get(rnn.words[i])
            else:
                count = count + 1
                missing_words_file.write(rnn.words[i] + '\n')

        print('no embedding: ' + str(1.0 * count / len(rnn.words)))
        print(str(len(sub_embeddings)) + '\t' + str(len(sub_embeddings[0])))
        missing_words_file.close()

        rnn.model = TextRNN(rnn.config)

        rnn.train()
        predict_y = rnn.test()  #predicting results
        print(predict_y)
        print(len(predict_y))
        print(len(test_data_Y))

        tf.reset_default_graph()

        correct_count = 0
        for i in range(len(test_data_Y)):
            if rnn.id_to_cat[predict_y[i]] == test_data_Y[i]:
                correct_count += 1
            doc_node = doc.createElement("doc")
            doc_node.setAttribute("id", test_docs[i].split(',')[0])
Example #14
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')

    print('load data. . .')
    X = pickle.load(open(train_data, 'rb'))
    df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4]

    config = TRNNConfig(sentence_max_len, class_num, word_cab_num)

    word_ids, W_list = process_data.getWordsVect(config, word_vecs)

    model = TextRNN(config, W_list, False)  # word vectors are not trained by default


    if sys.argv[1] == 'train':
        train()
    else:
        test()
def train():
    # Training procedure
    # ======================================================
    # start with minimal GPU memory and let it grow on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        config = RNNConfig()
        rnn = TextRNN(config)
        rnn.prepare_data()
        rnn.setRNN()

        print('Setting Tensorboard and Saver...')
        # set up the Saver and checkpoint paths for saving the model
        # ===================================================
        checkpoint_dir = os.path.join(os.path.abspath("checkpoints"), "textrnn")
        checkpoint_prefix = os.path.join(checkpoint_dir, rnn.train_mode)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        # =====================================================

        # configure TensorBoard; when retraining, delete the tensorboard folder first, otherwise the graphs will overlap
        # ====================================================================
        train_tensorboard_dir = 'tensorboard/textrnn/train/' + config.train_mode
        valid_tensorboard_dir = 'tensorboard/textrnn/valid/' + config.train_mode
        if not os.path.exists(train_tensorboard_dir):
            os.makedirs(train_tensorboard_dir)
        if not os.path.exists(valid_tensorboard_dir):
            os.makedirs(valid_tensorboard_dir)

        # log file for training/validation results
        log_file = open(valid_tensorboard_dir+'/log.txt', mode='w')

        merged_summary = tf.summary.merge([tf.summary.scalar('loss', rnn.loss),
                                            tf.summary.scalar('accuracy', rnn.accuracy)])

        train_summary_writer = tf.summary.FileWriter(train_tensorboard_dir, sess.graph)
        # =========================================================================

        global_step = tf.Variable(0, trainable=False)

        # make sure the batch-normalization update ops are executed
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):  # ensure train_op runs only after update_ops
            train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(rnn.loss, global_step)

        # training step
        def train_step(batch_x, batch_y, keep_prob=config.dropout_keep_prob):
            feed_dict = {
                rnn.input_x: batch_x,
                rnn.labels: batch_y,
                rnn.dropout_keep_prob: keep_prob,
                rnn.training: True
            }
            sess.run(train_op, feed_dict=feed_dict)
            step, loss, accuracy, summary = sess.run(
                [global_step, rnn.loss, rnn.accuracy, merged_summary],
                feed_dict={rnn.input_x: batch_x,
                           rnn.labels: batch_y,
                           rnn.dropout_keep_prob: 1.0,
                           rnn.training: False})
            t = datetime.datetime.now().strftime('%m-%d %H:%M')
            print('%s: epoch: %d, step: %d, loss: %f, accuracy: %f' % (t, epoch, step, loss, accuracy))
            # write the results to TensorBoard
            train_summary_writer.add_summary(summary, step)

        # validation step
        def valid_step(next_valid_element):
            # reset the accumulated validation metrics to 0
            valid_loss = 0.0
            valid_accuracy = 0.0
            valid_precision = 0.0
            valid_recall = 0.0
            valid_f1_score = 0.0
            i = 0
            while True:
                try:
                    lines = sess.run(next_valid_element)
                    batch_x, batch_y = rnn.convert_input(lines)
                    feed_dict = {
                        rnn.input_x: batch_x,
                        rnn.labels: batch_y,
                        rnn.dropout_keep_prob: 1.0,
                        rnn.training: False
                    }
                    loss, accuracy, prediction, y_true = sess.run(
                        [rnn.loss, rnn.accuracy, rnn.prediction, rnn.labels],
                        feed_dict)

                    precision = sk.metrics.precision_score(y_true=y_true, y_pred=prediction, average='weighted')
                    recall = sk.metrics.recall_score(y_true=y_true, y_pred=prediction, average='weighted')
                    f1_score = sk.metrics.f1_score(y_true=y_true, y_pred=prediction, average='weighted')

                    valid_loss += loss
                    valid_accuracy += accuracy
                    valid_precision += precision
                    valid_recall += recall
                    valid_f1_score += f1_score
                    i += 1

                except tf.errors.OutOfRangeError:
                    # finished one pass over the validation set; average the accumulated metrics
                    valid_loss /= i
                    valid_accuracy /= i
                    valid_precision /= i
                    valid_recall /= i
                    valid_f1_score /= i

                    t = datetime.datetime.now().strftime('%m-%d %H:%M')
                    log = '%s: epoch %d, validation loss: %0.6f, accuracy: %0.6f' % (
                        t, epoch, valid_loss, valid_accuracy)
                    log = log + '\n' + ('precision: %0.6f, recall: %0.6f, f1_score: %0.6f' % (
                        valid_precision, valid_recall, valid_f1_score))
                    print(log)
                    log_file.write(log + '\n')
                    time.sleep(3)
                    # write the results to TensorBoard
                    # valid_summary_writer.add_summary(valid_summary, step)
                    return

        print('Start training TextRNN, training mode='+rnn.train_mode)
        sess.run(tf.global_variables_initializer())

        # Training loop
        for epoch in range(config.epoch_num):
            train_init_op, valid_init_op, next_train_element, next_valid_element = rnn.shuffle_datset()
            sess.run(train_init_op)
            while True:
                try:
                    lines = sess.run(next_train_element)
                    batch_x, batch_y = rnn.convert_input(lines)
                    train_step(batch_x, batch_y, config.dropout_keep_prob)
                except tf.errors.OutOfRangeError:
                    # initialize the validation-set iterator
                    sess.run(valid_init_op)
                    valid_step(next_valid_element)
                    break

        train_summary_writer.close()
        log_file.close()
        # save the parameters after training finishes
        path = saver.save(sess, checkpoint_prefix, global_step=global_step)
        print("Saved model checkpoint to {}\n".format(path))