Example No. 1
    label3 = label3.reshape([5, 1])

    path3 = os.path.join(out_dir, 'data3.npy')
    indexMat3 = np.concatenate([label3, data3], axis=1)
    np.save(path3, indexMat3)

    print('indexMat1:\n{}'.format(indexMat1))
    print('indexMat2:\n{}'.format(indexMat2))
    print('indexMat3:\n{}'.format(indexMat3))


if __name__ == '__main__':
    word2vec_path = "../../word2vec/models/THUCNews_word2Vec/THUCNews_word2Vec_128.model"
    train_out_dir = '../data/train_data'
    labels_file = '../data/THUCNews_labels.txt'
    labels_set = files_processing.read_txt(labels_file)

    files_processing.info_labels_set(labels_set)

    # create_test_data(out_dir)
    file_list = get_file_list(file_dir=train_out_dir, postfix='*.npy')
    create_word2vec.info_npy(file_list)

    num_iter = 5  # run 5 iterations, fetching one batch each time
    labels_nums = len(labels_set)
    batch = get_data_batch(file_list,
                           labels_nums=labels_nums,
                           batch_size=6,
                           shuffle=False,
                           one_hot=False)
    for i in range(num_iter):
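        # Plausible completion of this truncated example: fetch the next batch and
        # report its shape. `get_next_batch` is assumed to be the module-level helper
        # used as create_batch_data.get_next_batch in the other examples on this page.
        batch_data, batch_label = get_next_batch(batch)
        print('batch {}: data shape {}, label shape {}'.format(
            i, np.shape(batch_data), np.shape(batch_label)))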
Example No. 2
def train(train_dir, val_dir, labels_file, word2vec_path, batch_size,
          max_steps, log_step, val_step, snapshot, out_dir):
    '''
    Train the TextCNN text classifier.
    :param train_dir: training data directory
    :param val_dir: validation data directory
    :param labels_file: labels file path
    :param word2vec_path: word2vec model file
    :param batch_size: batch size
    :param max_steps: maximum number of training steps
    :param log_step: logging interval (in steps)
    :param val_step: validation interval (in steps)
    :param snapshot: checkpoint-saving interval (in steps)
    :param out_dir: output directory for model checkpoints and summaries
    :return:
    '''

    max_sentence_length = 300
    embedding_dim = 50
    filter_sizes = [3, 4, 5, 6]
    num_filters = 200  # Number of filters per filter size
    base_lr = 0.001  # learning rate
    dropout_keep_prob = 0.5
    l2_reg_lambda = 0.0  # L2 regularization lambda (default: 0.0)
    allow_soft_placement = True  # allow TF to fall back to another device if the specified one does not exist
    log_device_placement = False  # whether to log device placement

    print("Loading data...")

    w2vModel = create_word2vec.load_wordVectors(word2vec_path)

    labels_set = files_processing.read_txt(labels_file)

    labels_nums = len(labels_set)

    train_file_list = create_batch_data.get_file_list(file_dir=train_dir,
                                                      postfix='*.npy')

    train_batch = create_batch_data.get_data_batch(train_file_list,
                                                   labels_nums=labels_nums,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   one_hot=True)

    val_file_list = create_batch_data.get_file_list(file_dir=val_dir,
                                                    postfix='*.npy')

    val_batch = create_batch_data.get_data_batch(val_file_list,
                                                 labels_nums=labels_nums,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 one_hot=True)

    print("train data info *****************************")

    train_nums = create_word2vec.info_npy(train_file_list)

    print("val data   info *****************************")

    val_nums = create_word2vec.info_npy(val_file_list)

    print("labels_set info *****************************")

    files_processing.info_labels_set(labels_set)

    # Training

    with tf.Graph().as_default():

        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)

        sess = tf.Session(config=session_conf)

        with sess.as_default():

            cnn = TextCNN(sequence_length=max_sentence_length,
                          num_classes=labels_nums,
                          embedding_size=embedding_dim,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          l2_reg_lambda=l2_reg_lambda)

            # Define Training procedure

            global_step = tf.Variable(0, name="global_step", trainable=False)

            optimizer = tf.train.AdamOptimizer(learning_rate=base_lr)

            # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)

            grads_and_vars = optimizer.compute_gradients(cnn.loss)

            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)

            grad_summaries = []

            for g, v in grads_and_vars:

                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)

                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))

                    grad_summaries.append(grad_hist_summary)

                    grad_summaries.append(sparsity_summary)

            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries

            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy

            loss_summary = tf.summary.scalar("loss", cnn.loss)

            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries

            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])

            train_summary_dir = os.path.join(out_dir, "summaries", "train")

            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries

            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])

            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")

            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))

            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)

            # Initialize all variables

            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """

                A single training step

                """

                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dropout_keep_prob
                }

                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)

                if step % log_step == 0:
                    print("training: step {}, loss {:g}, acc {:g}".format(
                        step, loss, accuracy))

                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """

                Evaluates model on a dev set

                """

                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }

                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)

                if writer:
                    writer.add_summary(summaries, step)

                return loss, accuracy

            for i in range(max_steps):

                train_batch_data, train_batch_label = create_batch_data.get_next_batch(
                    train_batch)

                train_batch_data = create_word2vec.indexMat2vector_lookup(
                    w2vModel, train_batch_data)

                train_step(train_batch_data, train_batch_label)

                current_step = tf.train.global_step(sess, global_step)

                if current_step % val_step == 0:

                    val_losses = []

                    val_accs = []

                    # for k in range(int(val_nums/batch_size)):

                    for k in range(100):
                        val_batch_data, val_batch_label = create_batch_data.get_next_batch(
                            val_batch)

                        val_batch_data = create_word2vec.indexMat2vector_lookup(
                            w2vModel, val_batch_data)

                        val_loss, val_acc = dev_step(val_batch_data,
                                                     val_batch_label,
                                                     writer=dev_summary_writer)

                        val_losses.append(val_loss)

                        val_accs.append(val_acc)

                    mean_loss = np.array(val_losses, dtype=np.float32).mean()

                    mean_acc = np.array(val_accs, dtype=np.float32).mean()

                    print("--------Evaluation:step {}, loss {:g}, acc {:g}".
                          format(current_step, mean_loss, mean_acc))

                if current_step % snapshot == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)

                    print("Saved model checkpoint to {}\n".format(path))
Example No. 3
def batch_predict(val_dir, labels_file, models_path, word2vec_path,
                  batch_size):
    '''
    Batch prediction/evaluation on the validation set.
    :param val_dir: validation data directory
    :param labels_file: labels file path
    :param models_path: model checkpoint file
    :param word2vec_path: word2vec model file
    :param batch_size: batch size
    :return:
    '''
    max_sentence_length = 300
    embedding_dim = 128
    filter_sizes = [3, 4, 5, 6]
    num_filters = 200  # Number of filters per filter size
    l2_reg_lambda = 0.0  # L2 regularization lambda (default: 0.0)
    print("Loading data...")
    w2vModel = create_word2vec.load_wordVectors(word2vec_path)

    labels_set = files_processing.read_txt(labels_file)
    labels_nums = len(labels_set)

    val_file_list = create_batch_data.get_file_list(file_dir=val_dir,
                                                    postfix='*.npy')
    val_batch = create_batch_data.get_data_batch(val_file_list,
                                                 labels_nums=labels_nums,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 one_hot=True)

    print("val data   info *****************************")
    val_nums = create_word2vec.info_npy(val_file_list)
    print("labels_set info *****************************")
    files_processing.info_labels_set(labels_set)
    # Evaluation
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            cnn = TextCNN(sequence_length=max_sentence_length,
                          num_classes=labels_nums,
                          embedding_size=embedding_dim,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          l2_reg_lambda=l2_reg_lambda)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, models_path)

            def dev_step(x_batch, y_batch):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                loss, accuracy = sess.run([cnn.loss, cnn.accuracy], feed_dict)
                return loss, accuracy

            val_losses = []
            val_accs = []
            for k in range(int(val_nums / batch_size)):
                # for k in range(int(10)):
                val_batch_data, val_batch_label = create_batch_data.get_next_batch(
                    val_batch)
                val_batch_data = create_word2vec.indexMat2vector_lookup(
                    w2vModel, val_batch_data)
                val_loss, val_acc = dev_step(val_batch_data, val_batch_label)
                val_losses.append(val_loss)
                val_accs.append(val_acc)
                print("--------Evaluation:step {}, loss {:g}, acc {:g}".format(
                    k, val_loss, val_acc))

            mean_loss = np.array(val_losses, dtype=np.float32).mean()
            mean_acc = np.array(val_accs, dtype=np.float32).mean()
            print("--------Evaluation:step {}, mean loss {:g}, mean acc {:g}".
                  format(k, mean_loss, mean_acc))
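

# A minimal usage sketch for batch_predict(); the checkpoint path and data paths
# are illustrative assumptions, not part of the original snippet.
if __name__ == '__main__':
    batch_predict(val_dir='../data/val_data',
                  labels_file='../data/THUCNews_labels.txt',
                  models_path='models/checkpoints/model-10000',
                  word2vec_path='../../word2vec/models/THUCNews_word2Vec/THUCNews_word2Vec_128.model',
                  batch_size=128)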
Example No. 4
def main(dimen):
    '''
    Restore a trained TextCNN checkpoint and evaluate it on the validation data
    with embedding dimension `dimen` zeroed out (per-dimension ablation).
    '''
    # Data preprocessing

    labels_file = '.\\data\\THUCNews_labels.txt'

    word2vec_path = "D:\\pypro\\bs\\model\\size50.model"

    max_steps = 10000  # number of training steps

    batch_size = 128

    out_dir = ".\\models"  # output directory for model checkpoints and summaries

    train_dir = '.\\data\\train_data'

    val_dir = '.\\data\\val_data'
    '''
    train(train_dir=train_dir,
          val_dir=val_dir,
          labels_file=labels_file,
          word2vec_path=word2vec_path,
          batch_size=batch_size,
          max_steps=max_steps,
          log_step=50,
          val_step=500,
          snapshot=1000,
          out_dir=out_dir)
    '''
    model_dir = '.\\models\\checkpoints'
    saver = tf.train.import_meta_graph(model_dir + '\\model-40000.meta')
    graph = tf.get_default_graph()
    tensor_name_list = [tensor.name for tensor in graph.as_graph_def().node]  # (unused) all tensor names, handy for debugging
    x = graph.get_tensor_by_name('input_x:0')
    y = graph.get_tensor_by_name('input_y:0')
    keep_prob = graph.get_tensor_by_name('dropout_keep_prob:0')
    scores = graph.get_tensor_by_name('output/scores:0')
    accuracy = graph.get_tensor_by_name('accuracy/accuracy:0')
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(model_dir))
        print('Model loaded successfully!')
        labels_set = files_processing.read_txt(labels_file)
        labels_nums = len(labels_set)
        val_file_list = create_batch_data.get_file_list(file_dir=val_dir, postfix='*.npy')
        val_batch = create_batch_data.get_data_batch(val_file_list, labels_nums=labels_nums, batch_size=batch_size,
                                                     shuffle=False, one_hot=True)
        w2vModel = create_word2vec.load_wordVectors(word2vec_path)
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
        # Build the cross-entropy loss op once, outside the loop, so the graph
        # does not grow on every evaluation step
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=y)
        loss = tf.reduce_mean(losses)
        val_losses = []
        val_accs = []
        for k in range(100):
            val_batch_data, val_batch_label = create_batch_data.get_next_batch(val_batch)

            val_batch_data = create_word2vec.indexMat2vector_lookup(w2vModel, val_batch_data)

            # Zero out embedding dimension `dimen` for every token (dimension ablation)
            for i in val_batch_data:
                for j in i:
                    j[dimen] = 0
                    '''
                    for d in range(dimen):
                        j[index[d]] = 0
                    '''
            #print('val_batch_data是:', type(val_batch_data),val_batch_data[0].shape,  val_batch_data[0],len(val_batch_data[0]))
            feed_dict = {
                x: val_batch_data,
                y: val_batch_label,
                keep_prob: 1.0
            }
            # Evaluate loss and accuracy in a single session run
            val_loss, val_acc = sess.run([loss, accuracy], feed_dict)
            # print('val_loss:', val_loss, 'val_acc:', val_acc)
            val_losses.append(val_loss)
            val_accs.append(val_acc)

        mean_loss = np.array(val_losses, dtype=np.float32).mean()
        mean_acc = np.array(val_accs, dtype=np.float32).mean()
        nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(dimen, "--------Evaluation: loss {:g}, acc {:g}".format(mean_loss, mean_acc), nowTime)
        return mean_acc, mean_loss
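

# A minimal usage sketch: main(dimen) evaluates the restored model with embedding
# dimension `dimen` zeroed out, so sweeping dimen over all 50 dimensions of the
# size50 word2vec model gives a per-dimension ablation. This driver loop is an
# assumption, not part of the original snippet.
if __name__ == '__main__':
    results = []
    for d in range(50):
        tf.reset_default_graph()  # start from a clean graph before each restore
        acc, loss = main(d)
        results.append((d, acc, loss))
    print(sorted(results, key=lambda r: r[1]))  # dimensions ranked by accuracy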
Example No. 5
def text_predict(files_list, labels_file, models_path, word2vec_path,
                 batch_size):
    '''
    Predict labels for a list of text files.
    :param files_list: list of text files to classify
    :param labels_file: labels file path
    :param models_path: model checkpoint file
    :param word2vec_path: word2vec model file
    :param batch_size: batch size
    :return:
    '''
    max_sentence_length = 300
    embedding_dim = 128
    filter_sizes = [3, 4, 5, 6]
    num_filters = 200  # Number of filters per filter size
    l2_reg_lambda = 0.0  # L2 regularization lambda (default: 0.0)
    print("Loading data...")
    w2vModel = create_word2vec.load_wordVectors(word2vec_path)

    labels_set = files_processing.read_txt(labels_file)
    labels_nums = len(labels_set)
    sample_num = len(files_list)

    labels_list = [-1] * sample_num  # placeholder labels (not used below)

    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            cnn = TextCNN(sequence_length=max_sentence_length,
                          num_classes=labels_nums,
                          embedding_size=embedding_dim,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          l2_reg_lambda=l2_reg_lambda)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, models_path)

            def pred_step(x_batch):
                """
                predictions model on a dev set
                """
                feed_dict = {cnn.input_x: x_batch, cnn.dropout_keep_prob: 1.0}
                pred = sess.run([cnn.predictions], feed_dict)
                return pred

            batchNum = int(math.ceil(1.0 * sample_num / batch_size))
            for i in range(batchNum):
                start = i * batch_size
                end = min((i + 1) * batch_size, sample_num)
                batch_files = files_list[start:end]

                # Read the file contents and segment them into words
                batch_content = files_processing.read_files_list_to_segment(
                    batch_files, max_sentence_length, padding_token='<PAD>')
                # [1] Convert the words to an index matrix, then look up their word vectors
                batch_indexMat = create_word2vec.word2indexMat(
                    w2vModel, batch_content, max_sentence_length)
                val_batch_data = create_word2vec.indexMat2vector_lookup(
                    w2vModel, batch_indexMat)

                # [2] Alternatively, map the words directly to word vectors
                # val_batch_data = create_word2vec.word2vector_lookup(w2vModel,batch_content)

                pred = pred_step(val_batch_data)

                pred = pred[0].tolist()
                pred = files_processing.labels_decoding(pred, labels_set)
                for k, file in enumerate(batch_files):
                    print("{}, pred:{}".format(file, pred[k]))