Example #1
File: img2svg.py Project: Jephthia/NNs
    def load_dataset(self):
        ds = TextLineDataset(str(pathlib.Path(self.log_dir, 'file_names.txt')))
        ds = ds.take(5)
        ds = ds.map(self.parse_svg_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds = ds.padded_batch(2, drop_remainder=True)

        return ds
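A minimal usage sketch for the pipeline above, assuming TF 2.x eager execution; the owning class's log_dir and parse_svg_img come from the snippet, but the constructor call is a placeholder, not from the source project:

import tensorflow as tf

model = Img2Svg(log_dir='logs')            # placeholder constructor (assumed)
ds = model.load_dataset()                  # 5 parsed examples, padded into batches of 2
for batch in ds:                           # TF 2.x: tf.data datasets are directly iterable
    print(tf.nest.map_structure(lambda t: t.shape, batch))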
Example #2
    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            # Read the vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(
                os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(
                shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(
            os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))

        return
Example #3
    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            # Read the vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(
                os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(
                shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(
            os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)
        # Split the dataset
        # The first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(
            self.valid_batch_size)
        # The rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(
            self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To fetch elements, first sess.run(train_init_op) to initialize the iterator,
        # then sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()

        return train_init_op, valid_init_op, next_train_element, next_valid_element
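The comments above describe the TF 1.x initializable-iterator protocol; here is a minimal training-loop sketch under that assumption. `model` is assumed to be an instance of one of the classes below (e.g. TextCNN) with its graph already built, and the optimizer and epoch count are illustrative, not from the source:

import tensorflow as tf

train_init_op, valid_init_op, next_train, next_valid = model.prepare_data()
train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)   # assumed optimizer

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(10):                        # illustrative epoch count
        sess.run(train_init_op)                    # (re)initialize the training iterator
        while True:
            try:
                lines = sess.run(next_train)       # raw text lines from TextLineDataset
                batch_x, batch_y = model.convert_input(lines)
                sess.run(train_op, feed_dict={model.input_x: batch_x,
                                              model.labels: batch_y,
                                              model.dropout_keep_prob: 0.5,
                                              model.training: True})
            except tf.errors.OutOfRangeError:      # iterator exhausted: epoch done
                break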
Example #4
    def prepare_test_data(self):
        # Read the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            self.vocab = preprocess.read_vocab(os.path.join('data',preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC':
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header line; remember to skip the first line when reading
        dataset = TextLineDataset(os.path.join('data',preprocess.TEST_PATH))
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element
Example #5
    def test_pipeline(self, num_threads):
        real_fname = os.path.join(self.dataset_path, 'test_real.txt')

        # extract directories
        real_dir, inst_dir = self.real_dir, self.inst_dir

        # count lines
        num_real = count_lines(real_fname)

        # dataset creation
        with tf.name_scope('dataset'):
            real = TextLineDataset(real_fname)

            # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
            #synt.apply(shuffle_and_repeat(buffer_size = num_synt)) #, count = 1))
            #real.apply(shuffle_and_repeat(buffer_size = num_real)) #, count = ceil(ratio)))

            real = real.shuffle(num_real) # no repetition! .repeat()

            # real data only
            augment = 0 # self.params.get('augment', 0)
            def name2real(name):
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                if augment:
                    src_dir = self.params.get('augment_src', 'best')
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                    pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                    if isinstance(src_dir, float):
                        pnts *= src_dir
                    self.params['augment_scale'] = 0.
                    real = random_crop(full, pnts, self.params)
                else:
                    real = read_image(os.path.join(real_dir, '160x160', 'gray', name.decode() + '.jpg'))
                return real, inst, name.decode()
            real = real.map(lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32, tf.string])), num_parallel_calls = num_threads)

            #dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
            dataset = Dataset.zip({ 'real': real })
            dataset = dataset.batch(self.batch_size, drop_remainder = True) # we need full batches!
            dataset = dataset.prefetch(self.batch_size * 2)
            return dataset
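One caveat with the tf.py_func mapping above: py_func outputs lose their static shapes, so downstream ops that need them may require the shapes to be restored in a second map. A hedged sketch of that pattern; the shapes below are illustrative assumptions, not values taken from this project:

            def _restore_shapes(real, inst, name):
                # Illustrative shapes only; the true ranks/sizes depend on read_image/read_instr
                real.set_shape([160, 160, 1])
                inst.set_shape([None, None])
                name.set_shape([])
                return real, inst, name

            real = real.map(_restore_shapes, num_parallel_calls=num_threads)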
Example #6
File: data.py Project: quocdat32461997/NER
    def __call__(self):
        # read text data
        texts = TextLineDataset(self.texts)
        targets = TextLineDataset(self.targets)

        # process text data
        dataset = self._process(texts, targets)

        if self.val_texts and self.val_targets:
            # read text data
            val_texts = TextLineDataset(self.val_texts)
            val_targets = TextLineDataset(self.val_targets)

            # process text data
            val_dataset = self._process(val_texts, val_targets)

            return dataset, val_dataset

        else:
            return dataset
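A hedged usage sketch for this loader; the class name NERDataset and the file names are placeholders, and only the attributes used in __call__ come from the example:

loader = NERDataset(texts='train.words.txt', targets='train.tags.txt',
                    val_texts='val.words.txt', val_targets='val.tags.txt')
train_ds, val_ds = loader()     # both validation files set: returns (train, validation)

train_only = NERDataset(texts='train.words.txt', targets='train.tags.txt',
                        val_texts=None, val_targets=None)()   # otherwise a single dataset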
Example #7
    def prepare_data(self):
        self.train_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TRAIN_PATH)).skip(1)
        self.test_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TEST_PATH)).skip(1)

        print('Shuffling dataset...')
        # Shuffle the data
        train_dataset = self.train_dataset.shuffle(5000).batch(self.train_batch_size)
        test_dataset = self.test_dataset.shuffle(2500).batch(self.test_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        test_iterator = test_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        test_init_op = test_iterator.initializer

        # To fetch elements, first sess.run(train_init_op) to initialize the iterator,
        # then sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_test_element = test_iterator.get_next()

        return train_init_op, test_init_op, next_train_element, next_test_element
Example #8
    def draw_confusion_matrix(self):
        # train_init_op, test_init_op, next_train_element, next_test_element = self.cnn.prepare_data()
        test_dataset = TextLineDataset(
            os.path.join('data', preprocess.FILTERED_TEST_PATH)).skip(1).batch(
                self.cnn.test_batch_size)
        # Create a reinitializable iterator
        test_iterator = test_dataset.make_one_shot_iterator()
        next_test_element = test_iterator.get_next()

        y_true = []
        y_pred = []
        test_loss = 0.0
        test_accuracy = 0.0
        test_precision = 0.0
        test_recall = 0.0
        test_f1_score = 0.0
        i = 0
        while True:
            try:
                lines = self.sess.run(next_test_element)
                batch_x, batch_y = self.cnn.convert_input(lines)
                feed_dict = {
                    self.input_x: batch_x,
                    self.labels: batch_y,
                    self.dropout_keep_prob: 1.0,
                    self.training: False
                }
                # loss, pred, true = sess.run([self.cnn.loss, self.cnn.prediction, self.cnn.labels], feed_dict)
                # Run validation multiple times and take the mean of the loss and score
                mean_score = 0
                for _ in range(self.config.multi_test_num):  # do not shadow the batch counter i
                    score = self.sess.run(self.score, feed_dict)
                    mean_score += score
                mean_score /= self.config.multi_test_num
                # np.argmax avoids adding a new tf.argmax op to the graph on every batch
                pred = np.argmax(mean_score, axis=1)
                y_pred.extend(pred)
                y_true.extend(batch_y)
                i += 1
            except tf.errors.OutOfRangeError:
                # Finished iterating over the whole set; compute the evaluation metrics
                test_loss /= i
                test_accuracy = metrics.accuracy_score(y_true=y_true,
                                                       y_pred=y_pred)
                test_precision = metrics.precision_score(y_true=y_true,
                                                         y_pred=y_pred,
                                                         average='weighted')
                test_recall = metrics.recall_score(y_true=y_true,
                                                   y_pred=y_pred,
                                                   average='weighted')
                test_f1_score = metrics.f1_score(y_true=y_true,
                                                 y_pred=y_pred,
                                                 average='weighted')
                log = ('precision: %0.6f, recall: %0.6f, f1_score: %0.6f' %
                       (test_precision, test_recall, test_f1_score))
                print(log)

                cm = confusion_matrix(y_true, y_pred)
                print('Total samples:', np.sum(cm))
                cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize
                print('Confusion matrix:\n', cm)
                # Plot the confusion matrix
                # ==============================================================
                fig, ax = plt.subplots()
                im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
                ax.figure.colorbar(im, ax=ax)
                # We want to show all ticks...
                ax.set(
                    xticks=np.arange(cm.shape[1]),
                    yticks=np.arange(cm.shape[0]),
                    # ... and label them with the respective list entries
                    xticklabels=self.class_name,
                    yticklabels=self.class_name,
                    title="Normalized confusion matrix",
                    ylabel='True label',
                    xlabel='Predicted label')

                # Rotate the tick labels and set their alignment.
                plt.setp(ax.get_xticklabels(),
                         rotation=45,
                         ha="right",
                         rotation_mode="anchor")

                # Loop over data dimensions and create text annotations.
                fmt = '.2f'
                thresh = cm.max() / 2.
                for i in range(cm.shape[0]):
                    for j in range(cm.shape[1]):
                        ax.text(
                            j,
                            i,
                            format(cm[i, j], fmt),
                            ha="center",
                            va="center",
                            color="white" if cm[i, j] > thresh else "black")
                fig.tight_layout()
                plt.savefig('./data/confusion_matrix.jpg')
                plt.show()
                # =====================================================================
                break
Example #9
    def pipeline(self, name, num_threads):
        if not self.params.get('training', 1):
            return None
        synt_fname = os.path.join(self.dataset_path, name + '_synt.txt')
        real_fname = os.path.join(self.dataset_path, name + '_real.txt')
        unsup_fname = os.path.join(self.dataset_path, 'train_unsup.txt')

        num_synt, num_real, num_unsup = [count_lines(fname) for fname in [synt_fname, real_fname, unsup_fname]]
        ratio = num_synt / float(num_real)

        # extract directories
        fake_dirs, real_dir, inst_dir = self.fake_dirs, self.real_dir, self.inst_dir

        # dataset creation
        with tf.name_scope('dataset'):
            synt, real, unsup = [TextLineDataset(name) for name in [synt_fname, real_fname, unsup_fname]]

            # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
            #synt.apply(shuffle_and_repeat(buffer_size = num_synt)) #, count = 1))
            #real.apply(shuffle_and_repeat(buffer_size = num_real)) #, count = ceil(ratio)))

            synt = synt.shuffle(num_synt).repeat()
            real = real.shuffle(num_real).repeat()
            unsup = unsup.shuffle(num_unsup).repeat()

            # map to corresponding files
            # synthetic data
            def name2synt(name):
                fakes = [
                    read_image(os.path.join(path, name.decode() + '.jpg'))
                    for path in fake_dirs.values()
                ]
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                return fakes + [inst]

            synt_types = [tf.float32 for _ in self.fakes] + [tf.int32]
            synt = synt.map(lambda name: tf.py_func(name2synt, [name], synt_types), num_parallel_calls = num_threads)

            # real data
            augment = self.params.get('augment', 1)
            def name2real(name):
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                if augment:
                    src_dir = self.params.get('augment_src', 'best')
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                    pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                    if isinstance(src_dir, float):
                        pnts *= src_dir
                    real = random_crop(full, pnts, self.params)
                    # TODO add mirror augmentation
                else:
                    real = read_image(os.path.join(real_dir, name.decode() + '.jpg'))
                return real, inst
            real = real.map(lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32])), num_parallel_calls = num_threads)

            # unsup data
            def name2unsup(name):
                if augment:
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    img = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'), False)
                    imsz = img.shape # y,x,c
                    # [TL, TR, BR, BL]
                    real = random_crop(img, 
                            np.array([[5,5],[imsz[1]-5,5],[imsz[1]-5,imsz[0]-5],[5,imsz[0]-5]], dtype = np.float32), self.params)
                else:
                    real = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'))
                return real

#             unsup = unsup.map(lambda name: tuple(tf.py_func(name2unsup, [name], [tf.float32])), num_parallel_calls = num_threads)

            # zip all, batch and prefetch
            #dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
            dataset = Dataset.zip({ 'synt': synt, 'real': real }) # , 'unsup': unsup
            dataset = dataset.batch(self.batch_size, drop_remainder = True) # we need full batches!
            dataset = dataset.prefetch(self.batch_size * 2)
            return dataset
Example #10
class BiLSTM(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.unit_num = config.unit_num
        self.vocab_size = config.vocab_size

        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Text length
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Word-embedding dimension
            self.embedding_dim = config.embedding_dim

        elif config.train_mode == 'WORD-NON-STATIC':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode

        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None

    def setBiLSTM(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length], name="input_x")

        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert the numeric labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # The training argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        # Word-embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized word embeddings
                W = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0))
            elif self.train_mode == 'WORD-NON-STATIC':
                # Use the pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
            self.embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)

        with tf.name_scope("batch_norm"):
            self.embedding_inputs = tf.layers.batch_normalization(self.embedding_inputs, training=self.training)

        def basic_lstm_cell():
            bcell = tf.nn.rnn_cell.LSTMCell(self.unit_num)
            return tf.nn.rnn_cell.DropoutWrapper(bcell, output_keep_prob=self.dropout_keep_prob)

        with tf.name_scope("RNN"):
            # Bidirectional LSTM network; each layer has unit_num units
            # ======================================================================================
            # Bidirection LSTM
            lstm_fw_cell = basic_lstm_cell()  # forward direction cell
            lstm_bw_cell = basic_lstm_cell()  # backward direction cell
            # [batch_size, sequence_length, hidden_size] #creates a dynamic bidirectional recurrent neural network
            output, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, self.embedding_inputs,
                                                         dtype=tf.float32)
            # concat output
            output = tf.concat(output, axis=2)  # [batch_size,sequence_length,hidden_size*2]

            rnn_output = output[:, -1, :]  # take the last time step as the output
            # =========================================================================================

        with tf.name_scope("dense"):
            # Fully connected layer
            # ======================================================================================
            h_full = tf.layers.dense(inputs=rnn_output,
                                     units=self.dense_unit_num,
                                     use_bias=True,
                                     kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                     bias_initializer=tf.constant_initializer(0.1)
                                     )
            h_full = tf.layers.dropout(h_full, rate=self.dropout_keep_prob)
            h_full = tf.nn.relu(h_full)
            # ==========================================================================================


        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1)
            )
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert the training-set data to id or word-vector representation
        """
        batch_x = []
        batch_y = []
        title = ""
        # 1.id
        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])    # the part before the comma is the title
            label = ''.join(line_[-1])      # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab, self.train_mode))
            batch_y.append(label)

        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """
        Convert the test-set TSV data to id or word-vector representation
        :param titles:
        :return:
        """
        batch_x = []
        # 1.id
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(preprocess.to_id(valid_title, self.vocab, self.train_mode))

        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            # Read the vocabulary
            self.vocab = preprocess.read_vocab(os.path.join('data',preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)
        # Split the dataset
        # The first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(self.valid_batch_size)
        # The rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To fetch elements, first sess.run(train_init_op) to initialize the iterator,
        # then sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()

        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Read the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            self.vocab = preprocess.read_vocab(os.path.join('data',preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC':
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header line; remember to skip the first line when reading
        dataset = TextLineDataset(os.path.join('data',preprocess.TEST_PATH))
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element
Example #11
class CNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.img_size = config.img_size
        self.crop_size = config.crop_size
        self.train_batch_size = config.train_batch_size
        self.test_batch_size = config.test_batch_size
        self.test_per_batch = config.test_per_batch

        self.batch_x = ''
        self.batch_y = ''
        self.input_x = ''
        self.labels = ''
        self.input_y = ''
        self.dropout_keep_prob = ''
        self.training = ''
        self.embedding_inputs = ''
        self.embedding_inputs_expanded = ''
        self.loss = ''
        self.accuracy = ''
        self.prediction = ''
        self.vocab = ''

    def _set_input(self):
        # Input layer
        self.input_x = tf.placeholder(tf.float32, [None, self.img_size, self.img_size, 1], name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="labels")
        self.input_y = tf.one_hot(self.labels, self.class_num, name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # The training argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        self.input_x_enhanced = tf.map_fn(self.image_enhance, self.input_x)
     
    def _conv(self, input, ksize, stride, filters):
        return tf.layers.conv2d(
            inputs=input,
            filters=filters,
            kernel_size=[ksize, ksize],
            strides=[stride, stride],
            padding='SAME',
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
        )

    def _maxpool_2x2(self, input):
        return tf.layers.max_pooling2d(
            inputs=input,
            pool_size=[2, 2],
            strides=[2, 2],
            padding='SAME',
        )

    def _fc(self, input, units, dropout_keep_prob, name=None):
        fc_output = tf.layers.dense(
            inputs=input,
            units=units,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            bias_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            name=name
            )
        return tf.layers.dropout(fc_output, dropout_keep_prob)
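    # Note on the dropout calls in these examples (an API check, not from the source project):
    # tf.layers.dropout expects the fraction of units to DROP and only applies dropout when
    # training=True, while dropout_keep_prob in this code is a keep probability.
    # A minimal corrected variant of the helper above, shown only as a sketch for comparison:
    def _fc_with_dropout(self, input, units, keep_prob, is_training, name=None):
        fc_output = tf.layers.dense(
            inputs=input,
            units=units,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name=name)
        # rate is the drop probability; training disables dropout at validation/test time
        return tf.layers.dropout(fc_output, rate=1.0 - keep_prob, training=is_training)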

    def setVGG19(self):
        """
        Define the model in this function
        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)

        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)

        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_3 = self._conv(conv3_256_2, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_3, 3, 1, 256)

        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)

        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv4-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)

        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1]*shape[2]*shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)

        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)

        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')

        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert the string lines that were read into an array of shape
        [batch_size, img_size, img_size, 1] to be used as the CNN input

        :param pixels:
        :param labels:
        :return:
        """
        batch_x = []
        batch_y = []
        for line in lines:
            line_ = line.decode('utf-8').strip().split(',')
            pixels = line_[1].split()  # pixel values
            label = line_[0]  # the first field is the label
            batch_x.append([float(x) for x in pixels])
            batch_y.append(int(label))
        batch_x = np.stack(batch_x)
        batch_x = batch_x.reshape([-1, self.img_size, self.img_size, 1])
        batch_y = np.asarray(batch_y)

        return batch_x, batch_y

    def prepare_data(self):
        self.train_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TRAIN_PATH)).skip(1)
        self.test_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TEST_PATH)).skip(1)

        print('Shuffling dataset...')
        # Shuffle the data
        train_dataset = self.train_dataset.shuffle(5000).batch(self.train_batch_size)
        test_dataset = self.test_dataset.shuffle(2500).batch(self.test_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        test_iterator = test_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        test_init_op = test_iterator.initializer

        # To fetch elements, first sess.run(train_init_op) to initialize the iterator,
        # then sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_test_element = test_iterator.get_next()

        return train_init_op, test_init_op, next_train_element, next_test_element
        # ==============================================================

    def image_enhance(self, image):
        """
        Apply data augmentation to an image
        :param image: a numpy array or tensor of shape [height, width, 1]
        :return:
        """
        # Random crop
        images_crop = tf.image.random_crop(image, [self.crop_size, self.crop_size, 1])

        # Random horizontal flip
        images_crop = tf.image.random_flip_left_right(images_crop)
        # Random contrast
        images_crop = tf.image.random_contrast(images_crop, 0.5, 1.5)
        # Random brightness
        images_crop = tf.image.random_brightness(images_crop, max_delta=0.5)

        noise = tf.random_normal(shape=tf.shape(images_crop), mean=0.0, stddev=10.0,
                                 dtype=tf.float32)
        images_crop = tf.add(images_crop, noise)

        return images_crop

    def setVGG16(self):
        """
        Define the model in this function
        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)

        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)

        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_2, 1, 1, 256)

        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)

        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv4-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)

        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1]*shape[2]*shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)

        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)

        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')

        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
Example #12
class TextCNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.filter_sizes = config.filter_sizes
        self.filter_num = config.filter_num
        self.vocab_size = config.vocab_size

        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Text length
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Word-embedding dimension
            self.embedding_dim = config.embedding_dim

        elif config.train_mode == 'WORD-NON-STATIC' or config.train_mode == 'MULTI':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode

        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None
        self.dataset = None

    def setCNN(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length],
                                      name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert the numeric labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # The training argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        # Word-embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized word embeddings
                W = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embedding_dim],
                                      -1.0, 1.0))
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(
                    embedding_inputs, -1)
            elif self.train_mode == 'WORD-NON-STATIC':
                # Use the pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(
                    embedding_inputs, -1)
            elif self.train_mode == 'MULTI':
                W1 = tf.Variable(self.embedding_W)
                W2 = tf.Variable(self.embedding_W, trainable=False)
                embedding_inputs1 = tf.nn.embedding_lookup(W1, self.input_x)
                embedding_inputs2 = tf.nn.embedding_lookup(W2, self.input_x)
                self.embedding_inputs_expanded = tf.stack(
                    [embedding_inputs1, embedding_inputs2], axis=-1)

        # The final pooling output, containing outputs from each filter
        pool_outputs = []
        # Iterate to create convolution layer for each filter
        for filter_size in self.filter_sizes:
            with tf.name_scope("conv-maxpool-%d" % filter_size):
                # Convolution layer 1
                # ==================================================================
                filter_shape = [filter_size, self.embedding_dim]

                conv_1 = tf.layers.conv2d(
                    inputs=self.embedding_inputs_expanded,
                    filters=self.filter_num,
                    kernel_size=filter_shape,
                    strides=[1, 1],
                    padding='VALID',
                    use_bias=True,
                    kernel_initializer=tf.initializers.truncated_normal(
                        stddev=0.1),
                    bias_initializer=tf.initializers.constant(0.1))
                # ===================================================================
                # Do batch normalization
                # =================================================================
                conv_1_output = tf.layers.batch_normalization(
                    conv_1, training=self.training)
                conv_1_output = tf.nn.relu(conv_1_output)
                # ======================================================================
                # Pooling layer 1
                # ====================================================================
                conv_1_output_shape = conv_1_output.shape.as_list()
                pool_1 = tf.layers.max_pooling2d(
                    inputs=conv_1_output,
                    pool_size=[conv_1_output_shape[1] - 1 + 1, 1],
                    strides=[1, 1],
                    padding='VALID')
                # =====================================================================

            pool_outputs.append(pool_1)

        # Combine all the pooling output
        # The total number of filters.
        total_filter_num = self.filter_num * len(self.filter_sizes)
        h_pool = tf.concat(pool_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, total_filter_num])
        # Output shape[batch, total_filter_num]

        # Full-connected layer
        # ========================================================================
        with tf.name_scope('dense-%d' % self.dense_unit_num):
            h_full = tf.layers.dense(
                h_pool_flat,
                units=self.dense_unit_num,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            h_full = tf.layers.dropout(h_full, rate=self.dropout_keep_prob)
            h_full = tf.nn.relu(h_full)
        # =========================================================================

        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert the training-set data to id or word-vector representation
        """
        batch_x = []
        batch_y = []
        title = ""

        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])  # the part before the comma is the title
            label = ''.join(line_[-1])  # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab,
                                            self.train_mode))
            batch_y.append(label)

        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """
        Convert the test-set TSV data to id or word-vector representation
        :param titles:
        :return:
        """
        batch_x = []
        # 1.id
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(
                preprocess.to_id(valid_title, self.vocab, self.train_mode))

        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            # Read the vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(
                os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(
                shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(
            os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)
        # Split the dataset
        # The first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(
            self.valid_batch_size)
        # The rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(
            self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To fetch elements, first sess.run(train_init_op) to initialize the iterator,
        # then sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()

        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Read the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header line; remember to skip the first line when reading
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH))
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(
            self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element