Example #1
def predict():
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}
    config = TRNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    model = TextRNN(config)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model
    while True:
        line = input("Please enter a test sentence: ")
        data_id = [[
            word_to_id[x] for x in list(native_content(line))
            if x in word_to_id
        ]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        y_pred_cls = session.run(model.y_pred_cls,
                                 feed_dict={
                                     model.input_x: x_pad,
                                     model.keep_prob: 1.0
                                 })
        print('sentence: {}, predicted intent: {}'.format(
            line, id_to_cat[y_pred_cls[0]]))
        a = 1
Example #2
 def __init__(self):
     self.config = TRNNConfig()
     self.categories, self.cat_to_id = read_category()
     self.words, self.word_to_id = read_vocab(vocab_dir)
     self.config.vocab_size = len(self.words)
     self.model = TextRNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
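For reference, a minimal sketch of a predict method that would pair with this __init__, following the feed pattern of Example #1; it assumes kr is tensorflow.contrib.keras imported as kr, that the vocabulary is character-level, and that self.config.seq_length holds the padded sequence length:

def predict(self, message):
    # map characters to ids, dropping characters that are out of vocabulary
    data = [self.word_to_id[x] for x in message if x in self.word_to_id]
    # pad/truncate to the fixed sequence length the model expects
    x_pad = kr.preprocessing.sequence.pad_sequences([data], self.config.seq_length)
    y_pred_cls = self.session.run(self.model.y_pred_cls,
                                  feed_dict={self.model.input_x: x_pad,
                                             self.model.keep_prob: 1.0})
    return self.categories[y_pred_cls[0]]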
Example #3
    def __init__(self):
        self.config = TRNNConfig()
        self.categories, self.cat_to_id = read_category()
        self.words = np.load('./datas/dict_token.npy')
        self.word_to_id = np.load('./datas/token_to_id.npy').tolist()
        self.config.vocab_size = len(self.words)
        self.model = TextRNN(self.config)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #4
    def __init__(self):
        self.config = TRNNConfig()
        self.categories = categories
        self.cat_to_id = cat_to_id
        self.words = words
        self.word_to_id = word_to_id
        self.config.vocab_size = len(self.words)
        self.model = TextRNN(self.config)

        self.session = sess
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=rnn_save_path)  # restore the saved model
Example #5
def read_example(filename_queue):
    """Read one example from filename_queue"""
    config = TRNNConfig()
    reader = tf.TFRecordReader()
    key, value = reader.read(filename_queue)
    features = tf.parse_single_example(value, features={"title_len": tf.FixedLenFeature([1], tf.int64),
                                                        "title": tf.FixedLenFeature([config.seq_length], tf.int64),
                                                        "label": tf.FixedLenFeature([config.seq_length], tf.float32),
                                                        "frame_weight": tf.FixedLenFeature([config.seq_length], tf.float32),
                                                        })
    title_len = tf.cast(features["title_len"], tf.int32)
    title = tf.cast(features["title"], tf.int32)
    label = tf.cast(features["label"], tf.float32)
    frame_weight = tf.cast(features["frame_weight"], tf.float32)
    return title_len, title, label, frame_weight
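For context, a rough sketch of how read_example is typically wired into a TF1 queue-based input pipeline, mirroring the train() function in Example #17 below; tfrecord_files and the surrounding session setup are assumptions:

filename_queue = tf.train.string_input_producer(tfrecord_files, num_epochs=config.num_epochs)
title_len, title, label, frame_weight = read_example(filename_queue)
# group single parsed examples into mini-batches for the model
title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
    [title_len, title, label, frame_weight],
    batch_size=config.batch_size, capacity=100000, num_threads=1)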
Example #6
 def __init__(self):
     with open(map_path, "rb") as f:
         self.word_to_id, self.cat_to_id, self.seq_length, self.num_classes = pickle.load(
             f)
     self.id_to_cat = {v: k for k, v in self.cat_to_id.items()}
     if model_type == 'cnn':
         self.config = TCNNConfig()
         self.config.num_classes = self.num_classes
         self.config.vocab_size = len(self.word_to_id)
         self.model = TextCNN(self.config)
     else:
         self.config = TRNNConfig()
         self.config.num_classes = self.num_classes
         self.config.vocab_size = len(self.word_to_id)
         self.model = TextRNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #7
  def load_model(self):
    sess = tf.Session()
    print('Configuring CNN model...')
    config = TRNNConfig()
    cnn_model = TextRNN(config)

    saver = tf.train.Saver()
    params_file = tf.train.latest_checkpoint(self.model_dir)
    saver.restore(sess, params_file)

    categories, cat_to_id = read_category()
    vocab_dir = 'cnews/cnews.vocab.txt'
    words, word_to_id = read_vocab(vocab_dir)

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id

    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
Example #8
    print("Time usage:", time_dif)


if __name__ == '__main__':

    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    conn = pymysql.connect(host='localhost',
                           user='******',
                           passwd='sasa',
                           db='text_clf',
                           charset='utf8')
    cur = conn.cursor()

    print('Configuring RNN model...')
    config = TRNNConfig()
    #if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #   build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_vec = read_category()
    #words, word_to_id = read_vocab(vocab_dir)
    #config.vocab_size = len(words)
    model = TextRNN(config, get_embedding())

    print('training')

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #9
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #10
File: 0.py  Project: jiudian123/first
test_dir = os.path.join(base_dir, 'cnewstest.txt')
val_dir = os.path.join(base_dir, 'cnewsval.txt')
vocab_dir = os.path.join(base_dir, 'cnewsvocab.txt')
vector_word_dir = os.path.join(base_dir, 'vector_word.txt')  # word vectors trained by word2vec
vector_word_npz = os.path.join(base_dir, 'vector_word.npz')  # save the word vectors to a numpy file
# path for saving the best validation result
save_dir = r'HOME\mydata\lstm\checkpoints'
save_path = os.path.join(save_dir, 'best_validation')
# build the vocabulary
'''build_vocab(train_dir,vocab_dir)
_,word_to_id=read_vocab(vocab_dir)
categories,cat_to_id=read_category()

config=TRNNConfig()
model=TextRNN(config)'''
config=TRNNConfig()
build_vocab(train_dir,vocab_dir)
words,word_to_id=read_vocab(vocab_dir)
categories,cat_to_id=read_category()
config.vocab_size = len(words)
if not os.path.exists(vector_word_npz):
    export_word2vec_vectors(word_to_id, vector_word_dir, vector_word_npz)
config.pre_trianing = get_training_word2vec_vectors(vector_word_npz)
model=TextRNN(config)
init=tf.global_variables_initializer()

def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
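Typical usage, matching the pattern used in the other examples on this page:

start_time = time.time()
# ... run training or evaluation ...
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)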
Example #11
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 3 or sys.argv[1] not in [
            'train', 'test'
    ] or sys.argv[2] not in ['char', 'word']:
        raise ValueError(
            """usage: python run_cnn.py [train / test] [char / word]""")

    print('Configuring RNN model...')
    if sys.argv[2] == 'char':
        config = TRNNConfig()
        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
            build_vocab(train_dir, vocab_dir, config.vocab_size)
        categories, cat_to_id = read_category()
        words, word_to_id = read_vocab(vocab_dir)
        config.vocab_size = len(words)
        model = TextRNN(config)

        if sys.argv[1] == 'train':
            train()
        else:
            test()
    else:
        print("train on word embedding...")
        config = TRNNConfig()
        config.seq_length = 400
Example #12
File: run_rnn.py  Project: DerekGrant/KDA
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':

    print('Configuring RNN model...',sys.argv)

    if len(sys.argv) == 6 and sys.argv[1] in ['train', 'test']:
        config = TRNNConfig()
        t_name = sys.argv[3]
        t_th = sys.argv[2]
        data_dir = sys.argv[4]
        base_dir = 'data/' + data_dir + '/' + t_name
        classes = sys.argv[5].split('-')


        train_dir = os.path.join(base_dir, 'train.csv')
        test_dir = os.path.join(base_dir, 'test.csv')
        val_dir = os.path.join(base_dir, 'dev.csv')
        vocab_dir = os.path.join('data/data_orginal/'+t_name, 'vocab.csv')

        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
            print('vocab_dir does not exist: ', vocab_dir)
            build_vocab('data/data_orginal/'+t_name+'/whole.csv', vocab_dir, config.vocab_size)
Example #13
        print(len(test_docs))
        train_file.close()

        test_file = open(rnn.test_dir, 'w')
        for test_doc in test_docs:
            temp = test_doc.split(',')
            test_data_X.append(corpus[int(temp[0]) - 1])
            test_data_Y.append(temp[1])
            string = corpus[int(temp[0]) - 1].replace('\n',
                                                      '').replace('\t', '')

            test_file.write(temp[1] + '\t' + string + '\n')

        print('Configuring RNN model...')
        test_file.close()
        rnn.config = TRNNConfig()
        #if not os.path.exists(rnn.vocab_dir): #if no vocab, build it
        build_vocab_words(rnn.train_dir, rnn.vocab_dir, rnn.config.vocab_size)
        rnn.categories, rnn.cat_to_id, rnn.id_to_cat = read_category()
        rnn.words, rnn.word_to_id = read_vocab(rnn.vocab_dir)
        rnn.config.vocab_size = len(rnn.words)

        #select a subset of word vectors
        rnn.missing_dir = os.path.join(rnn.base_dir,
                                       key + '.' + sub_key + '.missing.txt')
        missing_words_file = open(rnn.missing_dir, 'w')
        sub_embeddings = np.random.uniform(
            -0.0, 0.0, (rnn.config.vocab_size, embedding_dim))
        count = 0
        for i in range(0, rnn.config.vocab_size):
            if (rnn.words[i] in word_vector_map
Example #14
    for f in F1:
        print('\t'.join(['%0.1f'%f[0],str(f[2]),str(f[3]),str(f[4]),str(f[5])]))
    return auc,F1
if __name__ == '__main__':
    tf.reset_default_graph()
    base_dir = sys.argv[1]
    save_dir = sys.argv[2]
    ckpt_dir = sys.argv[3]
    train_dir = os.path.join(base_dir, 'train.txt')
    test_dir = os.path.join(base_dir, 'test.txt')
    val_dir = os.path.join(base_dir, 'val.txt')
    vocab_dir = os.path.join(base_dir, 'vocab.txt')
    predict_dir = os.path.join(base_dir, 'predict.txt')
    save_path = os.path.join(save_dir, 'best_validation')  # path for saving the best validation result
    if len(sys.argv)>4:
        option = sys.argv[4]
    else:
        option = 'train'
    print('Configuring RNN model...')
    config = TRNNConfig()
    tokenizer = Tokenizer(vocab_dir)
    config.vocab_size = len(tokenizer.vocab)
    model = TextRNN(config)
    print('Total number of parameters: %d' % np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]))
    if option == 'train':
        iter = batch_iter(train_dir, tokenizer, epochs=config.num_epochs)
        iter_test = batch_iter_test(val_dir, tokenizer)
        train()
    else:
        test()
Example #15
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
    #     raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()

    # Build the English and Chinese vocabularies (word- and character-level)
    if not os.path.exists(vocab_cn):  # Chinese vocabulary (2,370 entries); rebuild it
        # the Chinese text is already word-segmented; split it into individual characters
        build_vocab(train_cnx, vocab_cn, config.vocab_size)
    if not os.path.exists(vocab_en):  # English vocabulary (5,000 entries); rebuild it
        build_vocab(train_eny, vocab_en, config.vocab_size)

    # map words to ids
    words_cn, word_to_id_cn = read_vocab(vocab_cn)
    words_en, word_to_id_en = read_vocab(vocab_en)

    config.vocab_size_cn = len(words_cn)
    config.vocab_size_en = len(words_en)
Example #16
File: run_rnn.py  Project: DMStudent/Nlp
def test():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    config.num_epochs = 1
    start_time = time.time()
    config.batch_size = 10
    count = 0

    # configure how GPU memory is allocated
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    fw = open(config.test_dir_output, "w")
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        test_dir_list = os.listdir(config.test_dir_tf)
        test_dir_list = [
            os.path.join(config.test_dir_tf, i) for i in test_dir_list
        ]
        queueTest = tf.train.string_input_producer(
            test_dir_list, num_epochs=config.num_epochs)
        text, title_len, title, label, frame_weight = read_example_test(
            queueTest)

        text_batch, title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [text, title_len, title, label, frame_weight],
            batch_size=config.batch_size,
            capacity=50000,
            num_threads=1)
        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(
                                   -1 * 1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        fetches = [text_batch, model.input_x_len, model.y_pred, model.input_y]
        feed_dict = {}
        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        # configure the Saver
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                texts, x_len, y_pred, y_test = sess.run(fetches,
                                                        feed_dict=feed_dict)
                texts = "".join(texts.values).split("\n")
                for i in range(len(texts) - 1):
                    score = [str(int(j * 100))
                             for j in y_test[i]][:x_len[i][0]]
                    y_test_i = " ".join(score)
                    score = [str(int(j * 100))
                             for j in y_pred[i]][:x_len[i][0]]
                    y_pred_i = " ".join(score)
                    fw.write(texts[i] + "\ttarget:\t" + y_test_i +
                             "\tpredict\t" + y_pred_i + "\n")
                count = count + 1
                if count % 10000 == 0:
                    print(count)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
    fw.close()
Example #17
File: run_rnn.py  Project: DMStudent/Nlp
def train():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    start_time = time.time()
    # config.batch_size = 10
    total_batch = 0  # total number of batches processed
    best_mse_val = 99999999  # best validation MSE so far
    best_loss_val = 99999999  # best validation loss so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 5000  # stop training early if no improvement after this many batches
    count = 0
    tensorboard_dir = config.tensorboard_dir

    # configure how GPU memory is allocated
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        train_dir_list = os.listdir(config.train_dir_tf)
        train_dir_list = [
            os.path.join(config.train_dir_tf, i) for i in train_dir_list
        ]
        queueTrain = tf.train.string_input_producer(
            train_dir_list, num_epochs=config.num_epochs)
        title_len, title, label, frame_weight = read_example(queueTrain)

        title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [title_len, title, label, frame_weight],
            batch_size=config.batch_size,
            capacity=100000,
            num_threads=1)

        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(
                                   -1 * 1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("mse", model.mse)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

        fetches = [model.loss, model.mse]
        feed_dict = {}
        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        # configure the Saver
        saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1)
        if not config.retraining:
            saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                # Run training steps or whatever
                # titles, labels = sess.run([title_batch, label_batch])
                if total_batch % config.save_per_batch == 0:
                    # periodically write training summaries to the TensorBoard scalars
                    s = sess.run(merged_summary, feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # periodically report performance on the training and validation sets
                    loss_val, mse_val = sess.run(fetches, feed_dict)

                    if mse_val < best_mse_val or loss_val < best_loss_val:
                        # save the best result so far
                        best_mse_val = mse_val
                        best_loss_val = loss_val
                        last_improved = total_batch
                        improved_str = '*'
                        # saver.save(sess=sess, save_path=config.save_path)
                        if total_batch % config.save_per_batch == 0:
                            saver.save(sess,
                                       config.save_path + '_%03d' %
                                       (total_batch / config.save_per_batch),
                                       write_meta_graph=False)
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Val Loss: {1:>6.5}, Mse: {2:>6.5}, Time: {3} {4}'
                    print(
                        msg.format(total_batch, loss_val, mse_val, time_dif,
                                   improved_str))
                    # print(embedding_inputs)

                sess.run(model.optim, feed_dict)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # the validation metric has not improved for a long time; stop training early
                    print("No optimization for a long time, auto-stopping...")
                    coord.request_stop()
                    break  # exit the loop

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
Example #18
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #19
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')

    print('load data...')
    X = pickle.load(open(train_data, 'rb'))
    df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4]

    config = TRNNConfig(sentence_max_len, class_num, word_cab_num)

    word_ids, W_list = process_data.getWordsVect(config, word_vecs)

    model = TextRNN(config, W_list, False)  # do not fine-tune the word vectors by default


    if sys.argv[1] == 'train':
        train()
    else:
        test()