Example #1
def train(config):

    set_manual_seed(10)
    """ 1: 文本清洗和分词,构建词表 """
    print("Preparing the batch data ... \n")
    corpus_x, corpus_y, vocab = build_dataset(config)
    """ 2:计算类别权重,缓解类别不平衡问题 """
    class_weights = calcu_class_weights(corpus_y, config)
    config.class_weights = class_weights
    """ 3:加载预训练的词向量 """
    embed_matrix = load_embed_matrix(vocab, config)
    config.embed_matrix = embed_matrix
    """ 4: 划分数据集和生成batch迭代器 """
    train_iter, valid_iter, test_iter = batch_generator(
        corpus_x, corpus_y, 0.15, config)
    """ 5:模型初始化 """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')

    model.to(config.device)
    """ 6:开始训练模型 """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
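The helpers count_params and init_network are not part of this listing; a minimal sketch of what they might look like, based only on how they are called above (names and details are assumptions, not the original code):

import torch.nn as nn

def count_params(model):
    # number of trainable parameters, as reported by the print above
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def init_network(model, exclude='embedding'):
    # Xavier-initialize weight matrices and zero the biases, skipping the
    # (possibly pretrained) embedding layer; the exclude name is an assumption
    for name, param in model.named_parameters():
        if exclude in name:
            continue
        if 'weight' in name and param.dim() > 1:
            nn.init.xavier_uniform_(param)
        elif 'bias' in name:
            nn.init.zeros_(param)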
Example #2
class Textcnn_pred(object):
    def __init__(self, vocab_dir):
        self.input_x = tf.placeholder(tf.int32, [None, config.seq_length],
                                      name='input_x')
        self.words = tools.read_file(vocab_dir)
        self.vocab_size = len(self.words)

        self.textcnn = TextCNN(config, self.vocab_size, keep_prob=1.0)
        self.logits = self.textcnn.cnn(self.input_x)
        self.textcnn_pred = tf.argmax(tf.nn.softmax(self.logits), 1)

        saver = tf.train.Saver()
        sess_config = tf.ConfigProto(allow_soft_placement=True)

        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.8
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)
        model_path = 'checkpoints/model/TextCNNnet_2019-10-17-14-35-50.ckpt-9000'
        saver.restore(sess=self.sess, save_path=model_path)
        print(
            "################ load TextCNN model down! ##########################"
        )

    def _close(self):
        self.sess.close()

    def text(self, input):
        logit, pred = self.sess.run([self.logits, self.textcnn_pred],
                                    feed_dict={self.input_x: input})

        return pred
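A hedged usage sketch for Textcnn_pred above; encode_text is a hypothetical helper that maps raw text to padded word ids of length config.seq_length, and the vocab path is a placeholder, neither is part of the original code:

import numpy as np

predictor = Textcnn_pred(vocab_dir='data/vocab.txt')  # placeholder path
ids = encode_text("some review text", predictor.words, config.seq_length)  # hypothetical helper -> list of ints
pred = predictor.text(np.array([ids], dtype=np.int32))  # shape (1, seq_length)
print("predicted class id:", pred[0])
predictor._close()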
Example #3
def Textcnn_test():
    if not os.path.exists(FLAGS.vocab_dir):
        words = tools.build_vocab(train_data=FLAGS.train_data,
                                  vocab_dir=FLAGS.vocab_dir)  ### build the vocabulary
    else:
        words = tools.read_file(FLAGS.vocab_dir)
    vocab_size = len(words)
    print("Test words : ", vocab_size)
    test_X, test_Y = tools.create_voabulary(train_data=FLAGS.test_data,
                                            vocab_data=FLAGS.vocab_dir,
                                            max_length=config.seq_length)

    input_x = tf.placeholder(tf.int32, [None, config.seq_length],
                             name='input_x')
    input_y = tf.placeholder(tf.float32, [None, config.num_classes],
                             name='input_y')

    model_path = 'checkpoints/TextCNNnet_2019-11-01-15-31-50.ckpt-4000'

    save_path = model_path
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    textcnn = TextCNN(config, vocab_size, keep_prob=1.0)
    logits = textcnn.cnn(input_x)  ### (?,10)
    loss = textcnn_loss(logits=logits, label=input_y)
    acc = textcnn_acc(logits=logits, labels=input_y)

    saver = tf.train.Saver()
    saver.restore(sess=sess, save_path=save_path)

    batch_test = tools.batch_iter(test_X, test_Y,
                                  config.batch_size)  ### generate batch data
    i = 0
    all_acc = 0
    for x_batch, y_batch in batch_test:
        test_loss, test_acc = sess.run([loss, acc],
                                       feed_dict={
                                           input_x: x_batch,
                                           input_y: y_batch
                                       })
        all_acc = all_acc + test_acc
        i += 1

    print("Average acc : ", (all_acc / i))
Example #4
def train(config):

    set_manual_seed(10)
    """ 1: 划分数据集并保存 """
    print("Preparing the batch data ... \n")
    build_dataset(config)
    """ 2:计算类别权重,缓解类别不平衡问题 """
    class_weights = calcu_class_weights(config)
    config.class_weights = class_weights
    """ 3: 划分数据集和生成batch迭代器 """
    train_iter, valid_iter, test_iter = batch_generator(config)
    """ 5:模型初始化 """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')

    model.to(config.device)
    """ 6:开始训练模型 """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
Example #5
    def __init__(self,
                 pretrained_model_path,
                 vocabulary_size,
                 filter_sizes,
                 filter_num,
                 data=None,
                 train=False,
                 cuda=1):
        super(OLTR_For_Textcnn, self).__init__()
        self.device = torch.device(
            'cuda:%d' % cuda if torch.cuda.is_available() else 'cpu')
        self.textcnn = TextCNN(vocabulary_size=vocabulary_size,
                               class_num=183,
                               filter_num=filter_num,
                               filter_sizes=filter_sizes,
                               embedding_dim=128)
        checkpoint = torch.load(pretrained_model_path,
                                map_location=self.device)
        self.textcnn.load_state_dict(checkpoint)
        self.textcnn = self.textcnn.to(self.device)
        # freeze all TextCNN parameters when training OLTR
        for param_name, param in self.textcnn.named_parameters():
            param.requires_grad = False
        self.textcnn.eval()
        self.classes_num = 183
        self.feature_dim = len(filter_sizes.split(",")) * filter_num

        self.classifier = OLTR_classifier(self.feature_dim, self.classes_num)

        self.centroids = nn.Parameter(
            torch.randn(self.classes_num, self.feature_dim))
        if train and data is not None:
            print("update centroid with data")
            self.centroids.data = self.centroids_cal(data)
        elif train and data is None:
            raise ValueError("Train mode should update centroid with data")
        else:
            print("Test mode should load pretrained centroid")
Example #6
def predict():
    word2index, label2index, trainX, trainY, vaildX, validY, testX, testY = load_data(
        FLAGS.all_data_h5py, FLAGS.id_index_pkl)
    vocab_size = len(word2index)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        text_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.label_size,
                           FLAGS.learning_rate, FLAGS.decay_steps,
                           FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                           FLAGS.embed_size)

        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Model Failed')
            return

        print("test_X.shape:", testX.shape)
        print("test_Y.shape:", testY.shape)

        raw_labels = []
        predicted_labels = []
        number_examples = len(testX)
        batch_size = FLAGS.batch_size
        for start, end in zip(range(0, number_examples, batch_size),
                              range(batch_size, number_examples, batch_size)):
            predictions = sess.run(text_cnn.predictions,
                                   feed_dict={
                                       text_cnn.input_x: testX[start:end],
                                       text_cnn.input_y: testY[start:end],
                                       text_cnn.dropout_keep_prob: 1.0,
                                       text_cnn.is_training_flag: False
                                   })
            if len(predictions) == len(testY[start:end]):
                raw_labels.extend(testY[start:end])
                predicted_labels.extend(predictions)

        print(classification_report(raw_labels, predicted_labels))
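The zip(range(...), range(...)) pattern above silently drops the last partial batch when the test set size is not a multiple of batch_size; a small sketch of an index generator that covers the tail as well (an illustration, not part of the original code):

def batch_ranges(n, batch_size):
    # yield (start, end) pairs covering all n examples, including the final partial batch
    for start in range(0, n, batch_size):
        yield start, min(start + batch_size, n)

# e.g. replace the zip(...) above with: for start, end in batch_ranges(number_examples, batch_size):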
Example #7
def main(_):
    time_start = time.time()

    def predict_label(logits, tmp_gate=0.01):  # predict labels from logits: keep every label whose probability exceeds the threshold
        probs = softmax(logits)

        labels = []
        for prob in probs:
            con = np.greater_equal(prob, [tmp_gate] * 16)  # check which probabilities exceed the threshold (0.01 here)
            tmp = list(np.argwhere(con == True))
            label = [x[0] for x in tmp]
            if sum(label) < 1:  # if no label exceeds the threshold, fall back to the argmax label
                label = np.argmax(prob)
            labels.append(label)
            #in_prob = prob[index]
        return labels

    def predict_label_top_k(sess, eval_return):
        '''Old approach: predict the top-k labels; k can be fixed, or determined per comment dynamically by modifying model.py'''
        y_predict = []
        top_number = eval_return[0]  # number of labels per comment
        probs_squeezed = eval_return[1]  # logits
        for i, curr_prob in enumerate(probs_squeezed):
            index = tf.nn.top_k(curr_prob, top_number[i])  # take the top-k entries of the logits
            index = set(index.indices.eval())
            y_predict.append(
                tf.constant([
                    1 if i in index else 0 for i in range(FLAGS.num_classes)
                ]))
        y_predict = tf.stack(y_predict)
        return y_predict.eval()

    def pad_y(Y, label_num=16):  # convert integer label indices into multi-hot vectors
        Y2 = np.zeros((len(Y), label_num))
        for i, y in enumerate(Y):
            tmp_y = [0] * label_num
            for it_y in y:
                tmp_y[it_y] = 1
            Y2[i, :] = tmp_y
        return Y2

    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        file=FLAGS.word2vec_model_path,
        cache_path="./cache_pickle/ft_%s_voabulary.pickle" % _LANG,
        from_word2vec=1)  # load the vocabulary
    vocab_size = len(vocabulary_word2index)  # vocabulary size

    trainX, trainY, testX, testY = None, None, None, None

    cache_train_data_path = "./cache_pickle/train_data_%s.pickle" % _LANG  # load the train/validation sets prepared by the tc_utils.py module
    train_data = os.path.exists(cache_train_data_path)
    cache_eval_data_path = "./cache_pickle/eval_data_%s.pickle" % _LANG
    eval_data = os.path.exists(cache_eval_data_path)
    if (train_data and eval_data):
        with open(cache_train_data_path, 'rb') as f:
            trainX, trainY = pickle.load(f)
        with open(cache_eval_data_path, 'rb') as f:
            testX, testY = pickle.load(f)
    else:
        return "data NOT found!"

    trainY = pad_y(trainY)
    testY = pad_y(testY)

    trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                           value=0.)  # truncate or pad each comment to a fixed length
    testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)

    config = tf.ConfigProto()
    best_f1 = 0.0  # best score on the validation set so far
    last_improved = 0  # batch at which the last improvement happened
    require_improvement = 500  # stop training early if no improvement for more than 500 batches
    print_per_batch = 10  # report performance on the training and validation sets every N batches
    save_per_batch = 50  # write training results to TensorBoard scalars every N batches
    save_path = FLAGS.ckpt_dir + "model.ckpt"
    total_batch = 1
    flag = False

    with tf.Session(config=config) as sess:

        print("initialize model")
        textCNN = TextCNN(filter_sizes,
                          FLAGS.num_filters,
                          FLAGS.num_classes,
                          FLAGS.learning_rate,
                          FLAGS.batch_size,
                          FLAGS.decay_steps,
                          FLAGS.decay_rate,
                          FLAGS.sentence_len,
                          vocab_size,
                          FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag
                          )  # initialize the model; textrcnn or textrnn could be used instead

        tf.summary.scalar("loss", textCNN.loss_val)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter("./tf_board/")

        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):  # a previously trained model can be loaded to resume training
            print("Restore Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Initialize variable")
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # use pretrained word embeddings
                assign_pretrained_word_embedding(
                    sess,
                    vocabulary_index2word,
                    vocab_size,
                    textCNN,
                    cache_path="./cache_pickle/embedding_%s.pickle" % _LANG,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)

        writer.add_graph(sess.graph)

        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        print("Start epoch")  #开始训练模型
        for epoch in range(curr_epoch, FLAGS.num_epochs):

            batch_train = batch_iter(trainX, trainY, batch_size)
            for curr_trainX, curr_trainY in batch_train:

                feed_dict = {
                    textCNN.input_x: curr_trainX,
                    textCNN.dropout_keep_prob: 0.5
                }
                feed_dict[textCNN.input_y] = curr_trainY

                if total_batch == 1:
                    print('testX\n', testX)
                    print('testY\n', testY)

                if total_batch % save_per_batch == 0:
                    s = sess.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % print_per_batch == 0:
                    feed_dict[textCNN.dropout_keep_prob] = 1.0
                    #train_loss,train_logits = sess.run([textCNN.loss_val,textCNN.logits],feed_dict=feed_dict)
                    feed_dict1 = {
                        textCNN.input_x: testX,
                        textCNN.input_y: testY,
                        textCNN.dropout_keep_prob: 1.0
                    }
                    test_loss, logits = sess.run(
                        [textCNN.loss_val, textCNN.logits],
                        feed_dict=feed_dict1)
                    predict_y = predict_label(logits)
                    #                    predict_y = predict_label(test_logits)
                    test_acc, precision, recall, f1 = evaluate(
                        predict_y, testY)  # waitting

                    if test_acc > best_f1:
                        best_f1 = test_acc
                        last_improved = total_batch
                        saver.save(sess, save_path, global_step=total_batch)
                        improved_str = '*'
                    else:
                        improved_str = ''
                    print(
                        "epoch:%d total_batch:%d test_loss:%f test_acc:%f precision:%f recall:%f f1:%f %s"
                        % (epoch, total_batch, test_loss, test_acc, precision,
                           recall, f1, improved_str))  #waitting
                sess.run([textCNN.train_op], feed_dict)

                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    print("auto stopping")
                    flag = True
                    break
            if flag:
                break

    time_end = time.time()
    print('using time:', time_end - time_start)
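The softmax helper used by predict_label is not defined in this listing; a numerically stable numpy version consistent with how it is called (an assumption, not the original implementation):

import numpy as np

def softmax(logits):
    # row-wise softmax over a (batch, num_classes) array of logits
    logits = np.asarray(logits, dtype=np.float64)
    logits = logits - logits.max(axis=-1, keepdims=True)
    exp = np.exp(logits)
    return exp / exp.sum(axis=-1, keepdims=True)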
Example #8
def main(_):

    time_start = time.time()

    def save_predict(predict_y, voc, file):
        '''Save the labels predicted by the model in a fixed format'''
        print(predict_y)
        with open(file, 'w') as f:
            for i in range(len(predict_y)):
                if isinstance(predict_y[i], list):
                    labels = []
                    for j in predict_y[i]:
                        label = voc[j]
                        labels.append(label)
                        line = ["__label__" + x for x in labels]
                        if len(predict_y[i]) > 1 and "__label__NULL" in line:
                            line.remove("__label__NULL")
                        line = " ".join(line)
                else:
                    label = voc[predict_y[i]]
                    line = "__label__" + label
                f.write(line + "\n")

    def predict_label(logits, tmp_gate=0.01):
        '''Predict labels from the logits; the threshold can be varied'''
        probs = softmax(logits)
        labels = []
        for prob in probs:
            con = np.greater_equal(prob, [tmp_gate] * 16)
            tmp = list(np.argwhere(con == True))
            label = [x[0] for x in tmp]
            if sum(label) < 1:
                label = np.argmax(prob)
            labels.append(label)
        return labels

    def predict_label_top_k(sess, eval_return, batch_size=1):
        '''Old approach: predict the top-k labels; k can be fixed, or the model module must be modified to determine it dynamically'''
        top_number = eval_return[0]  # k
        probs = eval_return[1]  # probabilities, already converted from logits
        ones = tf.ones(shape=top_number.shape, dtype=tf.float32)
        top_number = tf.cast(tf.where(tf.greater(top_number, ones), top_number,
                                      ones),
                             dtype=tf.int32)

        probs_split = tf.split(probs, batch_size)
        probs_squeezed = [tf.squeeze(x) for x in probs_split]

        y_predict = []
        for i, curr_prob in enumerate(probs_squeezed):
            index = tf.nn.top_k(curr_prob, top_number[i])
            index = set(index.indices.eval())
            y_predict.append(
                tf.constant([
                    1 if i in index else 0 for i in range(FLAGS.num_classes)
                ]))
        y_predict = tf.stack(y_predict)
        return y_predict.eval()

    def pad_y(Y):
        Y2 = np.zeros((len(Y), 16))
        for i, y in enumerate(Y):
            tmp_y = [0] * 16
            for it_y in y:
                tmp_y[it_y] = 1
            Y2[i, :] = tmp_y
        return Y2

    testX, testY = None, None

    cache_test_data_path = "./cache_pickle/eval_data_%s.pickle" % _LANG
    test_data = os.path.exists(cache_test_data_path)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voab_label(
    )
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        file=FLAGS.word2vec_model_path,
        cache_path="./cache_pickle/ft_%s_voabulary.pickle" % _LANG,
        from_word2vec=1)

    if not test_data:
        print("test data NOT exist")
        vocab_size = len(vocabulary_word2index)
        print("cnn_model_vocab_size:", vocab_size)

        testX, testY = load_data(vocabulary_word2index,
                                 vocabulary_word2index_label,
                                 training_data_path=FLAGS.traning_data_path,
                                 cache_path='')  # load the test set
        print("testX:", len(testX), "testY:", len(testY))

        testY = pad_y(testY)
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)

        with open(cache_test_data_path, 'ab') as data_f:  # cache the data so repeated test runs are faster
            pickle.dump((np.array(testX), np.array(testY)), data_f)
        print("dump data end!")

    else:
        vocab_size = len(vocabulary_word2index)
        print("cnn_model_vocab_size:", vocab_size)
        with open(cache_test_data_path, 'rb') as data_f:
            testX, testY = pickle.load(data_f)
    testY = pad_y(testY)
    testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)

    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        print("initialize model")
        textCNN = TextCNN(filter_sizes,
                          FLAGS.num_filters,
                          FLAGS.num_classes,
                          FLAGS.learning_rate,
                          FLAGS.batch_size,
                          FLAGS.decay_steps,
                          FLAGS.decay_rate,
                          FLAGS.sentence_len,
                          vocab_size,
                          FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):  # load the trained model
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. going to stop")
            return
        feed_dict = {
            textCNN.input_x: testX,
            textCNN.input_y: testY,
            textCNN.dropout_keep_prob: 1.0
        }
        test_loss, logits = sess.run([textCNN.loss_val, textCNN.logits],
                                     feed_dict=feed_dict)
        # This is evaluation, so the loss is computed; for pure prediction, drop the loss and omit textCNN.input_y from feed_dict:
        # feed_dict = {textCNN.input_x:testX,textCNN.dropout_keep_prob:1.0}
        # logits = sess.run([textCNN.logits],feed_dict=feed_dict)
        predict_y = predict_label(logits)
        save_predict(predict_y,
                     file='./result_%s.txt' % _LANG,
                     voc=vocabulary_index2word_label)

        test_acc, precision, recall, f1 = evaluate(predict_y, testY)
        print("test_loss:%f test_acc:%f precision:%f recall:%f f1:%f" %
              (test_loss, test_acc, precision, recall, f1))
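The evaluate() helper is also missing from these listings; a hedged sketch consistent with how it is called (multi-hot ground truth, per-example predicted index lists, micro-averaged scores via scikit-learn). predict_label() can return a bare argmax instead of a list, so scalars are wrapped first. This is a guess at the helper, not the original code:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate(predict_y, test_y):
    # convert the predicted index lists into a multi-hot matrix matching test_y
    pred = np.zeros_like(test_y)
    for i, labels in enumerate(predict_y):
        if np.ndim(labels) == 0:  # bare argmax case
            labels = [int(labels)]
        for j in labels:
            pred[i, int(j)] = 1
    acc = accuracy_score(test_y, pred)  # exact-match ratio for multi-label data
    p, r, f1, _ = precision_recall_fscore_support(test_y, pred, average='micro')
    return acc, p, r, f1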
Example #9
class OLTR_For_Textcnn(nn.Module):
    def __init__(self,
                 pretrained_model_path,
                 vocabulary_size,
                 filter_sizes,
                 filter_num,
                 data=None,
                 train=False,
                 cuda=1):
        super(OLTR_For_Textcnn, self).__init__()
        self.device = torch.device(
            'cuda:%d' % cuda if torch.cuda.is_available() else 'cpu')
        self.textcnn = TextCNN(vocabulary_size=vocabulary_size,
                               class_num=183,
                               filter_num=filter_num,
                               filter_sizes=filter_sizes,
                               embedding_dim=128)
        checkpoint = torch.load(pretrained_model_path,
                                map_location=self.device)
        self.textcnn.load_state_dict(checkpoint)
        self.textcnn = self.textcnn.to(self.device)
        # freeze all TextCNN parameters when training OLTR
        for param_name, param in self.textcnn.named_parameters():
            param.requires_grad = False
        self.textcnn.eval()
        self.classes_num = 183
        self.feature_dim = len(filter_sizes.split(",")) * filter_num

        self.classifier = OLTR_classifier(self.feature_dim, self.classes_num)

        self.centroids = nn.Parameter(
            torch.randn(self.classes_num, self.feature_dim))
        if train and data is not None:
            print("update centroid with data")
            self.centroids.data = self.centroids_cal(data)
        elif train and data is None:
            raise ValueError("Train mode should update centroid with data")
        else:
            print("Test mode should load pretrained centroid")

    def forward(self, x, *args):
        feature = self.textcnn.extract_feature(x)
        logits, _ = self.classifier(feature, self.centroids)
        return logits, feature

    def class_count(self, data):
        labels = np.array([int(ex.label) for ex in data.dataset])
        class_data_num = []
        for l in range(self.classes_num):
            class_data_num.append(len(labels[labels == l]))
            if class_data_num[-1] == 0:
                class_data_num[-1] = 1
        return class_data_num

    def centroids_cal(self, data):

        centroids = torch.zeros(self.classes_num,
                                self.feature_dim).to(self.device)

        print('Calculating centroids.')

        # for model in self.networks.values():
        #     model.eval()
        self.textcnn.eval()

        # Calculate initial centroids only on training data.
        with torch.set_grad_enabled(False):

            for batch in data:
                inputs, labels = batch.text, batch.label
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                # Calculate Features of each training data
                features = self.textcnn.extract_feature(inputs)
                # Add all calculated features to center tensor
                for i in range(len(labels)):
                    label = labels[i]
                    centroids[label] += features[i]

        # Average summed features with class count
        centroids /= torch.Tensor(
            self.class_count(data)).float().unsqueeze(1).to(self.device)

        return centroids
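A hedged usage sketch for OLTR_For_Textcnn; the checkpoint path, vocabulary size, filter settings and the train_iter iterator are placeholders, not values taken from the original project:

oltr = OLTR_For_Textcnn(pretrained_model_path='checkpoints/textcnn.pt',  # placeholder path
                        vocabulary_size=50000,  # placeholder
                        filter_sizes="3,4,5",
                        filter_num=100,
                        data=train_iter,
                        train=True,
                        cuda=0)
oltr = oltr.to(oltr.device)
for batch in train_iter:
    logits, feature = oltr(batch.text.to(oltr.device))  # (batch, 183), (batch, feature_dim)
    break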
Example #10
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
class_num = 183
feature_dim = 300
bs = 96

text_field = data.Field(lower=True, batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
train_iter, dev_iter, test_iter = process_data(text_field=text_field,
                                               label_field=label_field,
                                               data_dir=input_dir,
                                               batch_size=bs)
vocabulary_size = len(text_field.vocab)
textcnn = TextCNN(vocabulary_size=vocabulary_size,
                  class_num=183,
                  filter_num=filter_num,
                  filter_sizes=filter_size,
                  embedding_dim=128)
checkpoint = torch.load(load_model_path, map_location=device)
textcnn.load_state_dict(checkpoint)
textcnn = textcnn.to(device)
textcnn.eval()


def centroids_cal(data):
    centroids = torch.zeros(class_num, feature_dim).to(device)

    print('Calculating centroids.')

    # for model in self.networks.values():
    #     model.eval()
Example #11
def main(_):
    #1.load data
    word2index, label2index, train_x, train_y, valid_x, valid_y, test_x, test_y =\
       load_data(FLAGS.cache_file_h5py, FLAGS.cache_file_pickle)
    vocab_size = len(word2index)
    num_classes = len(label2index)
    print(train_y[0:3])

    num_examples, FLAGS.sentence_len = train_x.shape

    #2 create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, num_classes, FLAGS.learning_rate, \
                          FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_len,\
                          vocab_size, FLAGS.embed_size, multi_label_flag = FLAGS.multi_label_flag)

        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            sess.run(tf.global_variables_initializer())

        if FLAGS.use_embedding:
            index2word = {v: k for k, v in word2index.items()}
            #assign_pretrained_word_embedding(sess, index2word, vocab_size, textCNN, FLAG.word2vec_model_path)

        curr_epoch = sess.run(textCNN.epoch_step)

        #3 feed data and training
        number_of_training_data = len(train_x)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, counter = 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), \
                                  range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                feed_dict = { textCNN.input_x: train_x[start: end],
                              textCNN.dropout_keep_prob: 0.8,
                              textCNN.is_training_flag: FLAGS.is_training_flag }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = train_y[start: end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = train_y[start: end]

                curr_loss, lr, _ = sess.run([textCNN.loss_val, textCNN.learning_rate, textCNN.train_op], feed_dict)
                loss, counter = loss + curr_loss, counter + 1
                # print the loss every 50 steps
                if counter % 50 == 0:
                    #do_eval(sess, textCNN, test_x, test_y, num_classes)
                    print("Epoch %d\tBatch %d\tTrain loss:%.3f\tLearning rate:%.5f" % \
                           (epoch, counter, loss/float(counter), lr))

            # validate once per epoch
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                #eval_loss, f1_score, f1_micro, f1_macro = do_eval(sess, textCNN, test_x, test_y, num_classes)
                #do_eval(sess, textCNN, test_x, test_y, num_classes)
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
Example #12
def Textcnn_train():
    ###########  load data  ###################
    if not os.path.exists(FLAGS.vocab_dir):
        words = tools.build_vocab(train_data=FLAGS.train_data,
                                  vocab_dir=FLAGS.vocab_dir)  ### build the vocabulary
    else:
        words = tools.read_file(FLAGS.vocab_dir)
    vocab_size = len(words)
    train_X, train_Y = tools.create_voabulary(train_data=FLAGS.train_data,
                                              vocab_data=FLAGS.vocab_dir,
                                              max_length=config.seq_length)
    val_X, val_Y = tools.create_voabulary(train_data=FLAGS.val_data,
                                          vocab_data=FLAGS.vocab_dir,
                                          max_length=config.seq_length)

    #trainX = pad_sequences(train_X, maxlen=200, value=0.)  # padding to max length
    #test_X = pad_sequences(test_X, maxlen=200, value=0.)  # padding to max length
    print("Data deal down!")
    ###############################################################################

    input_x = tf.placeholder(tf.int32, [None, config.seq_length],
                             name='input_x')
    input_y = tf.placeholder(tf.float32, [None, config.num_classes],
                             name='input_y')

    textcnn = TextCNN(config, vocab_size, keep_prob=config.dropout_keep_prob)
    logits = textcnn.cnn(input_x)  ### (?,10)
    loss = textcnn_loss(logits=logits, label=input_y)

    ############# compute accuracy ######################################
    acc = textcnn_acc(logits=logits, labels=input_y)
    ######################################################

    global_step = tf.Variable(0, name='global_step', trainable=False)
    learning_rate = tf.train.exponential_decay(
        learning_rate=FLAGS.learning_rate,
        global_step=global_step,
        decay_steps=2000,
        decay_rate=0.1,
        staircase=True)

    optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        loss=loss, global_step=global_step)

    tensorboard_dir = 'tensorboard/textcnn'
    tf.summary.scalar("loss", loss)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver(max_to_keep=3)  ### save the model
    model_save_dir = 'checkpoints/'
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'TextCNNnet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = os.path.join(model_save_dir, model_name)

    model_restore_path = './checkpoints/TextCNNnet_2019-11-01-15-31-50.ckpt-4000'

    ##### set up logging
    logging.basicConfig(
        filename='./checkpoints/' + model_name + '.log',
        format='%(asctime)s - %(pathname)s - %(levelname)s: %(message)s',
        level=logging.DEBUG,
        filemode='a',
        datefmt='%Y-%m-%d%I:%M:%S %p')
    logging.info('######  Next Is Training Information   ###################')

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    with sess.as_default():
        if not FLAGS.model_store:
            step = 0
            init = tf.global_variables_initializer()
            sess.run(init)
            writer.add_graph(sess.graph)
        else:
            saver.restore(sess=sess, save_path=model_restore_path)
            step = sess.run(tf.train.get_global_step())
            writer.add_graph(sess.graph)
        print('First step is:', step)
        num_batch = int(
            (len(train_X) - 1) / config.batch_size) + 1  ### total number of batches
        acc_begain = 0
        for epoch in range(config.epochs):
            batch_train = tools.batch_iter(train_X, train_Y,
                                           config.batch_size)  ### generate batch data
            Begain_learn_rate = FLAGS.learning_rate
            for x_batch, y_batch in batch_train:
                step += 1
                _, learn_rate, train_loss_value, train_pred, train_acc, merge_summary_value = sess.run(
                    [optim, learning_rate, loss, logits, acc, merged_summary],
                    feed_dict={
                        input_x: x_batch,
                        input_y: y_batch
                    })
                if Begain_learn_rate != learn_rate:
                    information = '############ New Learning_Rate {:6f} in step {:d}  ###########'.format(
                        learn_rate, step)
                    logging.info(information)
                    print(information)
                    Begain_learn_rate = learn_rate
                if step % 10 == 0:
                    information = '## Epoch {:d} Step_Train / Total_Batch: {:d} / {:d}   train_loss= {:5f}  train_acc={:5f}'.\
                      format(int(step/num_batch),step, num_batch, train_loss_value, train_acc)
                    logging.info(information)
                    print(information)

                if step % 500 == 0:  ### validate every 500 steps and save the best model
                    val_acc_all = 0
                    val_loss_all = 0
                    val_step = 0
                    batch_val = tools.batch_iter(val_X, val_Y,
                                                 config.batch_size)  ### generate batch data
                    for x_val, y_val in batch_val:
                        if x_val.shape[0] < config.batch_size:
                            pass
                        else:
                            # do not run the optimizer here: validation must not update the weights
                            val_loss_value, val_pred, val_acc, merge_summary_value = sess.run(
                                [loss, logits, acc, merged_summary],
                                feed_dict={
                                    input_x: x_val,
                                    input_y: y_val
                                })
                            writer.add_summary(merge_summary_value, step)
                            val_acc_all = val_acc_all + val_acc
                            val_loss_all = val_loss_all + val_loss_value
                            val_step += 1
                    ave_acc = val_acc_all / val_step
                    ave_loss = val_loss_all / val_step
                    if (ave_acc - acc_begain) > 0.001:
                        acc_begain = ave_acc
                        saver.save(sess, model_save_path, global_step=step)
                        tf.train.write_graph(sess.graph_def, '',
                                             './checkpoints/textcnn_graph.pb')
                    information = '############   Val_loss = {:5f}   Val_acc = {:5f}   ##################'.format(
                        ave_loss, ave_acc)
                    logging.info(information)
                    print(information)
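tools.batch_iter is used by several examples above but never shown; a minimal sketch of a shuffled mini-batch generator with the same call signature (a guess at the helper, not the original):

import numpy as np

def batch_iter(x, y, batch_size, shuffle=True):
    # yield (x_batch, y_batch) pairs, reshuffling the data once per call
    x, y = np.asarray(x), np.asarray(y)
    indices = np.random.permutation(len(x)) if shuffle else np.arange(len(x))
    for start in range(0, len(x), batch_size):
        idx = indices[start:start + batch_size]
        yield x[idx], y[idx]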
Example #13
pic_dir = "./t-SNE/%s" % load_model_name
if not os.path.isdir(pic_dir):
    os.makedirs(pic_dir)


def y_tokenize(y):
    return int(y)

text_field = data.Field(lower=True, batch_first=True)
label_field = data.Field(sequential=False, tokenize=y_tokenize, use_vocab=False, batch_first=True)
train_iter, dev_iter, test_iter = process_data(text_field=text_field,
                                               label_field=label_field,
                                               data_dir=input_dir,
                                               batch_size=bs,
                                               mode=mode)
vocabulary_size = len(text_field.vocab)
# class_num = len(label_field.vocab)
class_num = 183
textcnn = TextCNN(vocabulary_size=vocabulary_size,
                  class_num=class_num,
                  filter_num=filter_num,
                  filter_sizes=filter_size,
                  embedding_dim=embedding_dim,
                  dropout=dropout)
checkpoint = torch.load(load_model_path, map_location=device)
textcnn.load_state_dict(checkpoint)
textcnn = textcnn.to(device)
textcnn.eval()

samples = []
for i in range(class_num):
    samples.append([])

for batch in train_iter:
    for i, label in enumerate(batch.label.numpy().tolist()):
        samples[label].append(batch.text)


max_points = 100
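The listing stops after collecting per-class samples; a sketch of the typical next step for the t-SNE picture, assuming TextCNN exposes extract_feature() (as in the OLTR example above) and that torch and os are already imported by this script. Everything else here is illustrative, not the original code:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

feats, labs = [], []
with torch.no_grad():
    for batch in train_iter:
        feats.append(textcnn.extract_feature(batch.text.to(device)).cpu().numpy())
        labs.extend(batch.label.numpy().tolist())
        if len(labs) >= max_points * class_num:  # cap the number of plotted points
            break
emb = TSNE(n_components=2).fit_transform(np.concatenate(feats, axis=0))
plt.scatter(emb[:, 0], emb[:, 1], c=labs[:len(emb)], s=2, cmap='tab20')
plt.savefig(os.path.join(pic_dir, "tsne.png"))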
Example #14
batch_size = 32
embedding_dims = 100
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = TextCNN(maxlen, vocab_size, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# custom callbacks can be implemented here; for competitions they are almost always custom-written
early_stopping = EarlyStopping(monitor='val_acc', patience=2, mode='max')
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
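One caveat if this runs on TF2's built-in Keras: compiling with metrics=['accuracy'] logs the metric as 'accuracy'/'val_accuracy' rather than 'acc'/'val_acc', so the EarlyStopping monitor above would only warn and never trigger; in that case it would need to be:

early_stopping = EarlyStopping(monitor='val_accuracy', patience=2, mode='max')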
Example #15
def main(_):
    word2index, label2index, trainX, trainY, vaildX, validY, testX, testY = load_data(
        FLAGS.all_data_h5py, FLAGS.id_index_pkl)

    vocab_size = len(word2index)

    text_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.label_size,
                       FLAGS.learning_rate, FLAGS.decay_steps,
                       FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                       FLAGS.embed_size)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter(FLAGS.summary_dir,
                                       tf.get_default_graph())

        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(0, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [text_cnn.loss_val, text_cnn.accuracy, text_cnn.train_op],
                    feed_dict={
                        text_cnn.input_x: trainX[start:end],
                        text_cnn.input_y: trainY[start:end],
                        text_cnn.dropout_keep_prob: 0.8,
                        text_cnn.is_training_flag: True
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))

            print("going to increment epoch counter....")
            sess.run(text_cnn.epoch_increment)

            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, text_cnn, vaildX, validY,
                                              batch_size)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))

                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=text_cnn.epoch_step)

        test_loss, test_acc = do_eval(sess, text_cnn, testX, testY, batch_size)
        print("Test Loss:%.3f\tTest Accuracy:%.3f" % (test_loss, test_acc))

        writer.close()
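The do_eval helper called in the loop above is not included here; a plausible sketch based on its call signature, averaging loss and accuracy over full validation batches with dropout disabled (an assumption, not the original helper):

def do_eval(sess, text_cnn, eval_x, eval_y, batch_size):
    total_loss, total_acc, count = 0.0, 0.0, 0
    for start in range(0, len(eval_x) - batch_size + 1, batch_size):
        end = start + batch_size
        loss, acc = sess.run(
            [text_cnn.loss_val, text_cnn.accuracy],
            feed_dict={
                text_cnn.input_x: eval_x[start:end],
                text_cnn.input_y: eval_y[start:end],
                text_cnn.dropout_keep_prob: 1.0,
                text_cnn.is_training_flag: False
            })
        total_loss, total_acc, count = total_loss + loss, total_acc + acc, count + 1
    return total_loss / max(count, 1), total_acc / max(count, 1)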

Example #16
if __name__ == '__main__':
    text_field = data.Field(lower=True, batch_first=True)
    label_field = data.Field(sequential=False,
                             use_vocab=False,
                             batch_first=True)
    train_iter, dev_iter, test_iter = process_data(text_field=text_field,
                                                   label_field=label_field,
                                                   data_dir=input_dir,
                                                   batch_size=bs)
    vocabulary_size = len(text_field.vocab)
    class_num = 183
    textcnn = TextCNN(vocabulary_size=vocabulary_size,
                      class_num=class_num,
                      filter_num=filter_num,
                      filter_sizes=filter_size,
                      embedding_dim=embedding_dim,
                      dropout=dropout)
    textcnn = textcnn.to(device)
    optimizer = torch.optim.Adam(textcnn.parameters(), lr=lr)
    textcnn.train()
    steps = 0
    best_acc = min_acc

    for epoch in range(1, epoch_num + 1):
        for batch in train_iter:
            textcnn.train()
            feature, target = batch.text, batch.label
            feature, target = feature.to(device), target.to(device)
            optimizer.zero_grad()
            logits = textcnn(feature)
Example #17
def class_count(df):
    df_label = np.argmax(df, 1)
    class_freqs = 1 / np.bincount(df_label)
    class_freqs = class_freqs / (max(class_freqs) - min(class_freqs))
    return tf.convert_to_tensor(class_freqs, dtype=tf.float32)


F1_score = 0
for tag in tag_columns:

    train_Y = train_tags[tag]
    valid_Y = valid_tags[tag]
    weights = class_count(train_Y)

    print("Building the model for catetory: {}".format(tag))
    model = TextCNN(params, embed_matrix)

    print("Starting the training model for catetory: {}".format(tag))
    save_path = os.path.join(textcnn_dir, date, mode, tag)
    if os.path.exists(save_path):
        print('dir exists')
    else:
        print('dir does not exist, creating it.')
        os.makedirs(save_path)

    train_data, train_steps = batch_generator(train_X, train_Y,
                                              params["batch_size"])
    valid_data, valid_steps = batch_generator(valid_X, valid_Y,
                                              params["batch_size"])

    train_model(model, train_data, valid_data, train_steps, valid_steps,