Пример #1
0
class Decoder(Sequential):
    """LSTM decoder that greedily feeds its own prediction into the next step.

    Every scan step runs the LSTM, pushes the resulting state through a tanh
    layer and a softmax layer, takes the arg-max entry and embeds it as the
    input of the following step.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        """Create the four sub-layers and gather their parameters.

        The sizes are stored on the instance and handed to the layer
        constructors (presumably as input/output widths -- confirm against
        the layer classes).
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Construction order is kept fixed: LSTM, tanh projection, softmax,
        # embedding.
        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(
            hidden_size, output_size, activation='tanh')
        self.softmax = TimeDistributed(
            output_size, vocab_size, activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [
            self.lstm,
            self.lstm_output,
            self.softmax,
            self.embedding,
        ]

        # Flatten the parameters of every layer that exposes a `params`
        # attribute into one list.
        gathered = []
        for layer in self.layers:
            if hasattr(layer, 'params'):
                gathered.extend(layer.params)
        self.params = gathered

    def forward(self, ec_H, ec_C, mask):
        """Unroll the decoder over ``mask`` and return the softmax outputs.

        :param ec_H: initial value of the recurrent ``H`` state
        :param ec_C: initial value of the recurrent ``C`` state
        :param mask: 2-D symbolic tensor that is scanned step by step; its
            second dimension sizes the all-zero first input
        :return: the first scan output, i.e. the per-step softmax results
        """
        # Only the second dimension is used; the first one is discarded.
        _, n_batch = T.shape(mask)

        def step(m, prev_Y, prev_H, prev_C):
            """Forward a time step of the decoder."""
            state_h, state_c = self.lstm.step(prev_Y, m, prev_H, prev_C)
            probs = self.softmax.forward(self.lstm_output.forward(state_h))
            # Greedy choice: index of the largest entry along axis 1.
            best = T.argmax(probs, axis=1)
            # The embedded choice becomes the input of the next step.
            next_input = self.embedding.forward(best)
            # FIXME (carried over): how to deal with differing lengths?
            return (probs, next_input, state_h, state_c)

        first_input = T.zeros((n_batch, self.embedding_size))
        recurrence = [
            None,  # probabilities are collected but never fed back
            dict(initial=first_input, taps=[-1]),
            dict(initial=ec_H, taps=[-1]),
            dict(initial=ec_C, taps=[-1]),
        ]
        outputs, _ = theano.scan(fn=step,
                                 sequences=[mask],
                                 outputs_info=recurrence)

        # Index 0 holds the softmax probabilities; no axis swap is applied.
        return outputs[0]
Пример #2
0
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        """Store the size settings, build the sub-layers and collect parameters.

        The four sizes are kept on the instance and forwarded to the layer
        constructors (presumably as input/output widths -- confirm against
        the layer classes).
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Layers are created in a fixed order: LSTM, tanh projection,
        # softmax, embedding.
        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(
            hidden_size, output_size, activation='tanh')
        self.softmax = TimeDistributed(
            output_size, vocab_size, activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [
            self.lstm,
            self.lstm_output,
            self.softmax,
            self.embedding,
        ]

        # One flat list with the parameters of every layer that has a
        # `params` attribute.
        with_params = (
            layer.params for layer in self.layers if hasattr(layer, 'params')
        )
        self.params = list(itertools.chain.from_iterable(with_params))
Пример #3
0
class Decoder(Sequential):
    """Greedy decoder: LSTM, tanh projection, softmax and embedding feedback.

    At each scan step the arg-max of the softmax output is embedded and used
    as the input of the next step.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        """Build the sub-layers from the given sizes and gather their params.

        The sizes are stored on the instance and passed to the layer
        constructors (presumably as input/output widths -- confirm against
        the layer classes).
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Creation order stays LSTM -> tanh projection -> softmax -> embedding.
        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(
            hidden_size,
            output_size,
            activation='tanh',
        )
        self.softmax = TimeDistributed(
            output_size,
            vocab_size,
            activation='softmax',
        )
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [
            self.lstm,
            self.lstm_output,
            self.softmax,
            self.embedding,
        ]

        # Concatenate the `params` of each layer that defines them.
        flat_params = []
        for layer in self.layers:
            if hasattr(layer, 'params'):
                flat_params.extend(layer.params)
        self.params = flat_params

    def forward(self, ec_H, ec_C, mask):
        """Scan the decoder over ``mask`` and return the softmax outputs.

        :param ec_H: initial value of the recurrent ``H`` state
        :param ec_C: initial value of the recurrent ``C`` state
        :param mask: 2-D symbolic tensor scanned step by step; its second
            dimension sizes the all-zero first input
        :return: the first scan output (the per-step softmax results)
        """
        # The first dimension is not needed, only the second one.
        _, n_batch = T.shape(mask)

        def step(m, prev_Y, prev_H, prev_C):
            """Forward a time step of the decoder."""
            state_h, state_c = self.lstm.step(prev_Y, m, prev_H, prev_C)
            projected = self.lstm_output.forward(state_h)
            probs = self.softmax.forward(projected)
            # Pick the index with the highest score along axis 1 ...
            choice = T.argmax(probs, axis=1)
            # ... and embed it so it can be consumed by the next step.
            fed_back = self.embedding.forward(choice)
            # FIXME (carried over): how to deal with differing lengths?
            return (probs, fed_back, state_h, state_c)

        zero_input = T.zeros((n_batch, self.embedding_size))
        scan_outputs, _ = theano.scan(
            fn=step,
            sequences=[mask],
            outputs_info=[
                None,  # softmax output is collected, not fed back
                dict(initial=zero_input, taps=[-1]),
                dict(initial=ec_H, taps=[-1]),
                dict(initial=ec_C, taps=[-1]),
            ],
        )

        # Element 0 is the softmax output; it is returned without swapping axes.
        return scan_outputs[0]
Пример #4
0
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        """Keep the size settings, create the sub-layers and list their params.

        The sizes are forwarded to the layer constructors (presumably as
        input/output widths -- confirm against the layer classes).
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Same creation order as before: LSTM, tanh projection, softmax,
        # embedding.
        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
        self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding]

        # Flatten the parameter lists of all layers that carry `params`.
        all_params = []
        for layer in self.layers:
            if hasattr(layer, 'params'):
                all_params.extend(layer.params)
        self.params = all_params
Пример #5
0
# Build shuffled mini-batch iterators over the training and validation sets.
# Both use the same batch size and have the file cache switched off.
train_data_iter = data_iterator_simple(load_train_func,
                                       len(x_train),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func,
                                       len(x_valid),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)

# Graph inputs: `x` has shape (batch_size, sentence_length), the targets `t`
# carry one extra trailing axis of size 1.
x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
# Embedding -> LSTM returning the whole sequence -> two affine layers applied
# through the TimeDistributed wrapper; the last one has `vocab_size` outputs.
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

# sign(t) summed over the last axis is 0 wherever the target id is 0, so those
# positions drop out of the loss (presumably id 0 is the 'pad' token).
mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
# Number of unmasked positions per sentence.
count = F.sum(mask, axis=1)
# Per-sentence average of the masked entropy, then the mean over the batch.
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver: momentum SGD (learning rate 1e-2, momentum 0.9) over every
# parameter registered so far.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor: a 'perplexity' series written under ./tmp-lstmlm.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')
monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1)
Пример #6
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None, ng_embs=None, pixels=None, con_width=None, filters=None, pooling_size=None):
        """Build the tagging graph: shared layers first, then one sub-graph per bucket.

        When ``trained_model`` is given, the hyper-parameters are pickled to
        that path before any graph construction takes place.

        The code uses pre-1.0 TensorFlow names (``tf.pack`` / ``tf.unpack`` and
        ``tf.concat(dim, values)``).

        :param trained_model: path the hyper-parameter dict is pickled to, or
            None to skip saving
        :param scope: object whose ``reuse_variables()`` is called when the
            second bucket is reached (presumably the enclosing variable
            scope -- confirm against the caller)
        :param emb_dim: second argument of the character and n-gram embedding
            layers
        :param gru: truthy selects GRU cells, otherwise LSTM cells are used
        :param rnn_dim: size handed to every recurrent cell
        :param rnn_num: number of stacked cells; stacking happens only if > 1
        :param drop_out: value stored in ``self.drop_out_v`` and in the saved
            hyper-parameters; the graph itself reads a placeholder
        :param rad_dim: second argument of the radical embedding layer
        :param emb: initial weights for the character embedding layer
        :param ng_embs: optional list of initial weights, one per entry of
            ``self.ngram``
        :param pixels: glyph pixel data, required when ``self.graphic`` is
            set; the image side is ``sqrt(len(pixels[0]))``
        :param con_width: first argument of both convolution layers (saved
            under the key 'filter_size'); required when ``self.graphic`` is set
        :param filters: filter count of the convolution layers; required when
            ``self.graphic`` is set
        :param pooling_size: size passed to both pooling layers; required
            when ``self.graphic`` is set
        """
        if trained_model is not None:
            param_dic = {}
            param_dic['nums_chars'] = self.nums_chars
            param_dic['nums_tags'] = self.nums_tags
            param_dic['tag_scheme'] = self.tag_scheme
            param_dic['graphic'] = self.graphic
            param_dic['pic_size'] = self.pic_size
            param_dic['word_vec'] = self.word_vec
            param_dic['radical'] = self.radical
            param_dic['crf'] = self.crf
            param_dic['emb_dim'] = emb_dim
            param_dic['gru'] = gru
            param_dic['rnn_dim'] = rnn_dim
            param_dic['rnn_num'] = rnn_num
            param_dic['drop_out'] = drop_out
            param_dic['filter_size'] = con_width
            param_dic['filters'] = filters
            param_dic['pooling_size'] = pooling_size
            param_dic['font'] = self.font
            param_dic['buckets_char'] = self.buckets_char
            param_dic['ngram'] = self.ngram
            # The context manager closes the file even if pickling fails.
            with open(trained_model, 'w') as f_model:
                pickle.dump(param_dic, f_model)

        # define shared weights and variables

        # The drop-out rate is fed at run time; the configured value is only
        # remembered in self.drop_out_v.
        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        if self.word_vec:
            self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

        if self.radical:
            self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

        if self.ngram is not None:
            if ng_embs is not None:
                assert len(ng_embs) == len(self.ngram)
            else:
                # No pre-trained n-gram weights: every layer gets None.
                ng_embs = [None] * len(self.ngram)
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i], name=str(i + 2) + 'gram_layer'))

        wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None, None, None, None

        if self.graphic:
            self.input_p = []
            assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

            self.pixels = pixels
            # Each glyph is stored flat; its side length is the square root.
            pixel_dim = int(math.sqrt(len(pixels[0])))

            # Two convolution + max-pooling stages, each wrapped so that it is
            # applied per time step.
            wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
            wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

            p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

            wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
            wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

            p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

            # Dense layer over the flattened second pooling output.
            wrapper_dense = TimeDistributed(HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'), name='wrapper_3')
            wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

        # Linear layer from the bidirectional output (2 * rnn_dim) to
        # self.nums_tags[0] outputs.
        output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'), name='wrapper')

        #define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # From the second bucket on, variables are reused.
                scope.reuse_variables()
            t1 = time()

            input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            # One list of input placeholders per bucket; radical and n-gram
            # placeholders are appended to the same list below.
            self.input_v.append([input_v])

            emb_set = []

            if self.word_vec:
                word_out = self.emb_layer(input_v)
                emb_set.append(word_out)

            if self.radical:
                input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))

                self.input_v[-1].append(input_r)
                radical_out = self.radical_layer(input_r)
                emb_set.append(radical_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if self.graphic:
                input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim*pixel_dim])
                self.input_p.append(input_p)

                # Restore the square image with one channel, then split along
                # the bucket axis.
                pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
                pix_out = tf.unpack(pix_out, axis=1)

                conv_out_1 = wrapper_conv_1(pix_out)
                pooling_out_1 = wrapper_mp_1(conv_out_1)

                conv_out_2 = wrapper_conv_2(pooling_out_1)
                pooling_out_2 = wrapper_mp_2(conv_out_2)

                assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
                pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
                pooling_out = tf.unpack(pooling_out, axis=1)

                graphic_out = wrapper_dense(pooling_out)
                graphic_out = wrapper_dr(graphic_out)

                emb_set.append(graphic_out)

            if len(emb_set) > 1:
                emb_out = tf.concat(2, emb_set)
                emb_out = tf.unpack(emb_out)

            else:
                # NOTE(review): a single embedding is passed on without
                # tf.unpack, unlike the branch above -- confirm this is intended.
                emb_out = emb_set[0]

            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr, name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

            output = output_wrapper(rnn_out)

            output_c = tf.pack(output, axis=1)

            self.output.append([output_c])

            # Placeholder for the gold tags of this bucket.
            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

            # Map the bucket length to its position in the lists above.
            self.bucket_dit[bucket] = idx

            # Parenthesised form prints the same text on Python 2 and 3.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
Пример #7
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None):
        """Build the tagging graph: shared layers first, then one sub-graph per bucket.

        When ``trained_model`` is given, the hyper-parameters are pickled to
        that path before any graph construction takes place.

        The code uses pre-1.0 TensorFlow names (``tf.pack`` / ``tf.unpack`` and
        ``tf.concat(dim, values)``).

        :param trained_model: path the hyper-parameter dict is pickled to, or
            None to skip saving
        :param scope: object whose ``reuse_variables()`` is called when the
            second bucket is reached (presumably the enclosing variable
            scope -- confirm against the caller)
        :param emb_dim: second argument of the character and n-gram embedding
            layers
        :param gru: truthy selects GRU cells, otherwise LSTM cells are used
        :param rnn_dim: size handed to every recurrent cell
        :param rnn_num: number of stacked cells; stacking happens only if > 1
        :param drop_out: value stored in ``self.drop_out_v`` and in the saved
            hyper-parameters; the graph itself reads a placeholder
        :param emb: initial weights for the character embedding layer
        """
        if trained_model is not None:
            param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'crf': self.crf, 'emb_dim': emb_dim,
                         'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num, 'drop_out': drop_out, 'buckets_char': self.buckets_char,
                         'ngram': self.ngram, 'is_space': self.is_space, 'sent_seg': self.sent_seg, 'emb_path': self.emb_path,
                         'tag_scheme': self.tag_scheme}
            # The context manager closes the file even if pickling fails.
            with open(trained_model, 'w') as f_model:
                pickle.dump(param_dic, f_model)

        # define shared weights and variables

        # The drop-out rate is fed at run time; the configured value is only
        # remembered in self.drop_out_v.
        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

        if self.ngram is not None:
            # No pre-trained n-gram weights are available here, so every
            # n-gram layer is created with weights=None.
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=None, name=str(i + 2) + 'gram_layer'))

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

        # Linear layer from the bidirectional output (2 * rnn_dim) to
        # self.nums_tags outputs.
        output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear', name='hidden'), name='wrapper')

        #define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # From the second bucket on, variables are reused.
                scope.reuse_variables()
            t1 = time()

            input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            # One list of input placeholders per bucket; the n-gram
            # placeholders are appended to the same list below.
            self.input_v.append([input_v])

            emb_set = []

            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if len(emb_set) > 1:
                emb_out = tf.concat(2, emb_set)

            else:
                emb_out = emb_set[0]

            # Drop-out on the combined embeddings, then split into a list.
            emb_out = DropoutLayer(dr)(emb_out)
            emb_out = tf.unpack(emb_out)

            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr, name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

            output = output_wrapper(rnn_out)
            output_c = tf.pack(output, axis=1)

            self.output.append([output_c])

            # Placeholder for the gold tags of this bucket.
            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
            # Map the bucket length to its position in the lists above.
            self.bucket_dit[bucket] = idx

            # Parenthesised form prints the same text on Python 2 and 3.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) and len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
Пример #8
0
# The per-character vector length equals the number of entries in `chars`.
EMBEDDING_LENGTH = len(chars)

# Creating the model: two stacked stateful LSTM layers of size 512 followed by
# a Dense layer (wrapped in TimeDistributed) that maps back to
# EMBEDDING_LENGTH outputs with a SparseSoftmax activation.
model = Network(
    LSTM(size=512,
         input_size=EMBEDDING_LENGTH,
         batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH,
         stateful=True),
    LSTM(size=512,
         input_size=512,
         batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH,
         stateful=True),
    TimeDistributed(
        Dense(size=EMBEDDING_LENGTH,
              input_size=512,
              activation=SparseSoftmax())))

# Optionally resume from previously saved model parameters.
if RESTORE_MODEL_PATH:
    model.loadParams(RESTORE_MODEL_PATH)

# The learning rate is supplied as a callable; this one ignores its argument
# and always returns 0.001.
optimizer = RMSprop(learning_rate=lambda n: 0.001)
loss_function = VectorCrossEntropy

model.assignOptimizer(optimizer)

# Optionally restore the optimizer state; this happens after the optimizer
# has been assigned to the model.
if RESTORE_OPTIMIZER_PATH:
    optimizer.load(RESTORE_OPTIMIZER_PATH)

for epoch in range(INITIAL_EPOCH, NR_OF_EPOCHS + INITIAL_EPOCH):
    loss, accuracy = model.train(makeBatches(source, SEQUENCE_LENGTH,
Пример #9
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None,
                   ngram_embedding=None, pixels=None, con_width=None, filters=None, pooling_size=None):
        """Build the tagging graph plus a language-model head for every bucket.

        When ``trained_model`` is given, the hyper-parameters are pickled
        first: to that path, or -- if ``self.metric == 'All'`` -- to one file
        per entry of ``self.all_metrics`` with the metric name prefixed to the
        file name.

        :param trained_model: path the hyper-parameters are stored under, or
            None to skip saving
        :param scope: object whose ``reuse_variables()`` is called when the
            second bucket is reached
        :param emb_dim: second argument of the character and n-gram embedding
            layers
        :param gru: truthy selects GRU cells, otherwise LSTM cells are used
        :param rnn_dim: size handed to every recurrent cell
        :param rnn_num: number of stacked cells; stacking happens only if > 1
        :param drop_out: value stored in ``self.drop_out_v`` and in the saved
            hyper-parameters; the graph itself reads a placeholder
        :param rad_dim: second argument of the radical embedding layer
        :param emb: initial weights for the character embedding layer
        :param ngram_embedding: optional pre-trained n-gram embeddings, one
            entry per element of ``self.ngram``
        :param pixels: glyph pixel data, required when ``self.graphic`` is
            set; the image side is ``sqrt(len(pixels[0]))``
        :param con_width: first argument of both convolution layers (saved
            under the key 'filter_size'); required when ``self.graphic`` is set
        :param filters: filter count of the convolution layers; required when
            ``self.graphic`` is set
        :param pooling_size: size passed to both pooling layers; required
            when ``self.graphic`` is set
        :return: None; the graph pieces are stored on ``self``
        """
        # trained_model: path where the model is stored
        if trained_model is not None:
            param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'tag_scheme': self.tag_scheme,
                         'graphic': self.graphic, 'pic_size': self.pic_size, 'word_vec': self.word_vec,
                         'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim,
                         'rnn_num': rnn_num, 'drop_out': drop_out, 'filter_size': con_width, 'filters': filters,
                         'pooling_size': pooling_size, 'font': self.font, 'buckets_char': self.buckets_char,
                         'ngram': self.ngram}
            # Parenthesised form prints the same text on Python 2 and 3.
            print("RNN dimension is %d" % rnn_dim)
            print("RNN number is %d" % rnn_num)
            print("Character embedding size is %d" % emb_dim)
            print("Ngram embedding dimension is %d" % emb_dim)
            # Store the model hyper-parameters.
            if self.metric == 'All':
                # Split the path right after its last '/'. rfind() returns -1
                # when there is no '/', which makes pindex 0, so a bare file
                # name works too (rindex() would raise ValueError).
                pindex = trained_model.rfind('/') + 1
                for m in self.all_metrics:
                    # The context manager closes the file even if pickling fails.
                    with open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w') as f_model:
                        pickle.dump(param_dic, f_model)
            else:
                with open(trained_model, 'w') as f_model:
                    pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        # Character embedding layer.
        # Open question from the original author: why is 500 added to the
        # number of characters?
        # emb_dim is the feature dimension of each character and weights are
        # the pre-trained character vectors (per the original note both can be
        # set from the command line).
        if self.word_vec:
            self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

        # Radical embeddings.
        # According to the Kangxi Dictionary there are 214 radicals. Only the
        # radicals of common Chinese characters are used; uncommon characters
        # and non-Chinese characters are represented by two extra special
        # symbols, which gives 216 entries in total.
        if self.radical:
            self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

        if self.ngram is not None:
            if ngram_embedding is not None:
                assert len(ngram_embedding) == len(self.ngram)
            else:
                ngram_embedding = [None] * len(self.ngram)
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ngram_embedding[i],
                                                       name=str(i + 2) + 'gram_layer'))

        wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
            None, None, None, None, None, None

        if self.graphic:
            # Glyph image information is used, which requires a CNN.
            self.input_p = []
            assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

            self.pixels = pixels
            # Each glyph is stored flat; its side length is the square root.
            pixel_dim = int(math.sqrt(len(pixels[0])))

            wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
            wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

            p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

            wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
            wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

            p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

            wrapper_dense = TimeDistributed(
                HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'), name='wrapper_3')
            wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

        # Hidden layer: its input is the forward RNN output together with the
        # backward RNN output, hence the input dimension rnn_dim * 2.
        # The output dimension is the number of tags.
        output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
            name='wrapper')

        # define model for each bucket
        # Sentences in different buckets have different lengths, so a separate
        # model is defined for each one.
        # bucket: the sentence length of that bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Per the original note, scope is
                # tf.variable_scope("tagger", reuse=None, initializer=initializer).
                # Reuse only has to be switched on once; afterwards everything
                # is reused.
                scope.reuse_variables()
            t1 = time()

            # Input sentences (the original note calls them one-hot vectors;
            # the placeholder itself holds int32 values).
            # shape = (batch_size, sentence length)
            input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            self.input_v.append([input_sentences])

            emb_set = []

            if self.word_vec:
                # Look up the character vectors for the input.
                # word_out: shape = (batch_size, sentence length, character
                # vector dimension (64 per the original note))
                word_out = self.emb_layer(input_sentences)
                emb_set.append(word_out)

            if self.radical:
                # Radical information, shape = (batch_size, sentence length)
                input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))

                self.input_v[-1].append(input_radicals)
                radical_out = self.radical_layer(input_radicals)
                emb_set.append(radical_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if self.graphic:
                input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
                self.input_p.append(input_p)

                # Restore the square image with a single channel.
                pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])

                conv_out_1 = wrapper_conv_1(pix_out)
                pooling_out_1 = wrapper_mp_1(conv_out_1)

                conv_out_2 = wrapper_conv_2(pooling_out_1)
                pooling_out_2 = wrapper_mp_2(conv_out_2)

                assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
                pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
                pooling_out = tf.unstack(pooling_out, axis=1)

                graphic_out = wrapper_dense(pooling_out)
                graphic_out = wrapper_dr(graphic_out)

                emb_set.append(graphic_out)

            if self.window_size > 1:
                # NOTE(review): this branch reads word_out, which is only
                # assigned when self.word_vec is set -- confirm that the two
                # options are always enabled together.

                # Integer arithmetic keeps the window bounds independent of
                # whether '/' is floor or true division.
                padding_size = self.window_size // 2
                word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]], 'CONSTANT')

                # One weight matrix per window width q, all sharing one bias.
                Ws = []
                for q in range(1, self.window_size + 1):
                    Ws.append(tf.get_variable("W_%d" % q, shape=[q * emb_dim, self.filters_number]))
                b = tf.get_variable("b", shape=[self.filters_number])

                z = [None for _ in range(0, bucket)]

                for q in range(1, self.window_size + 1):
                    for i in range(padding_size, bucket + padding_size):
                        # [low, high) covers exactly q positions around i,
                        # matching the q * emb_dim rows of Ws[q - 1]:
                        # (q - 1) // 2 == floor((q - 1) / 2) and
                        # q // 2 + 1 == ceil((q + 1) / 2) for integer q.
                        low = i - (q - 1) // 2
                        high = i + q // 2 + 1
                        x = word_padded[:, low, :]
                        for j in range(low + 1, high):
                            x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                        z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                        if z[i - padding_size] is None:
                            z[i - padding_size] = z_iq
                        else:
                            z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)

                z = tf.stack(z, axis=1)
                # Keep the emb_dim largest responses per position (unsorted).
                values, indices = tf.nn.top_k(z, sorted=False, k=emb_dim)

                # highway layer
                X = tf.unstack(word_out, axis=1)
                Conv_X = tf.unstack(values, axis=1)
                X_hat = []
                W_t = tf.get_variable("W_t", shape=[emb_dim, emb_dim])
                b_t = tf.get_variable("b_t", shape=[emb_dim])
                for x, conv_x in zip(X, Conv_X):
                    # Gate T_x mixes the convolution features with the raw
                    # character vector.
                    T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))
                    X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
                X_hat = tf.stack(X_hat, axis=1)
                emb_set.append(X_hat)
            if len(emb_set) > 1:
                # All embeddings are simply concatenated (character vectors,
                # radicals, n-grams, image information, ...).
                emb_out = tf.concat(axis=2, values=emb_set)

            else:
                emb_out = emb_set[0]

            # rnn_out is the concatenation of the forward RNN output and the
            # backward RNN output.
            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(self.highway(emb_out, "tag"), input_sentences)

            # Apply the fully connected layer (Wx + b) to get the final output.
            output = output_wrapper(rnn_out)
            # Open question from the original author: why [output] rather
            # than output?
            self.output.append([output])

            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

            self.bucket_dit[bucket] = idx

            # language model
            lm_rnn_dim = rnn_dim
            with tf.variable_scope('LM-BiRNN'):
                if gru:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                else:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)

                if rnn_num > 1:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num, state_is_tuple=True)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num, state_is_tuple=True)
            # The language model only sees the first embedding in emb_set.
            lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell,
                                   bw_cell=lm_bw_rnn_cell, p=dr,
                                   name='LM-BiLSTM' + str(bucket),
                                   scope='LM-BiRNN')(self.highway(emb_set[0]), input_sentences)

            lm_output_wrapper = TimeDistributed(
                HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear', name='lm_hidden'),
                name='lm_wrapper')
            lm_final_output = lm_output_wrapper(lm_rnn_output)
            self.lm_predictions.append([lm_final_output])
            self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket], name='lm_targets' + str(bucket))])

            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert \
            len(self.input_v) == len(self.output) and \
            len(self.output) == len(self.output_) and \
            len(self.lm_predictions) == len(self.lm_groundtruthes) and \
            len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
# Character-level front end: embed the character ids, then move the last axis
# of the 4-D result to position 1 before the convolutions.
h = PF.embed(x, char_vocab_size, char_embedding_dim)
h = F.transpose(h, (0, 3, 1, 2))
output = []
# One convolution per (filter count, filter width) pair, each followed by a
# max-pooling with kernel (1, word_length).
# NOTE(review): `filster_sizes` looks like a misspelling of `filter_sizes`; it
# is defined outside this chunk, so the name is left untouched here.
for f, f_size in zip(filters, filster_sizes):
    _h = PF.convolution(h,
                        f,
                        kernel=(1, f_size),
                        pad=(0, f_size // 2),
                        name='conv_{}'.format(f_size))
    _h = F.max_pooling(_h, kernel=(1, word_length))
    output.append(_h)
# Join the pooled feature maps along axis 1, swap axes 1 and 2 and flatten to
# (batch_size, sentence_length, sum(filters)).
h = F.concatenate(*output, axis=1)
h = F.transpose(h, (0, 2, 1, 3))
h = F.reshape(h, (batch_size, sentence_length, sum(filters)))
# Batch normalization is currently disabled:
# h = PF.batch_normalization(h, axes=[2])
# Two highway layers, two LSTMs returning full sequences, then two affine
# layers applied through the TimeDistributed wrapper.
h = TimeDistributed(Highway)(h, name='highway1')
h = TimeDistributed(Highway)(h, name='highway2')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm1')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm2')
h = TimeDistributed(PF.affine)(h, lstm_size, name='hidden')
y = TimeDistributed(PF.affine)(h, word_vocab_size, name='output')
# Targets have shape (batch_size, sentence_length, 1).
t = nn.Variable((batch_size, sentence_length, 1))

# sign(t) summed over the last axis is 0 wherever the target id is 0, so those
# positions drop out of the loss (presumably id 0 is the 'pad' token).
mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
# Number of unmasked positions per sentence.
count = F.sum(mask, axis=1)
# Per-sentence average of the masked entropy, then the mean over the batch.
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver: momentum SGD (learning rate 1e-2, momentum 0.9) over every
# parameter registered so far.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())
    def main_graph(self,
                   trained_model,
                   scope,
                   emb_dim,
                   gru,
                   rnn_dim,
                   rnn_num,
                   drop_out=0.5,
                   emb=None,
                   ngram_embedding=None):
        """
        Build the tagger graph: the shared layers first, then one sub-graph
        per bucket, and finally the trainable-variable list and the saver.

        When ``trained_model`` is given, the hyper-parameters are pickled
        before any graph node is created. With ``self.metric == 'All'`` one
        file per entry of ``self.all_metrics`` is written, the metric name
        being prefixed to the file name; otherwise a single file is written
        to ``trained_model`` itself.

        :param trained_model: path the hyper-parameter pickle is written to;
            ``None`` skips writing
        :param scope: variable scope object; its ``reuse_variables()`` is
            called once, when the second bucket is reached
        :param emb_dim: embedding size of the character and n-gram layers
        :param gru: forwarded to ``BiRNN`` as its ``gru`` argument
        :param rnn_dim: size passed to ``BiRNN``; the layers fed with the
            concatenated directions take ``rnn_dim * 2`` inputs
        :param rnn_num: stored with the hyper-parameters and printed; it is
            not otherwise used when building the graph in this method
        :param drop_out: value kept in ``self.drop_out_v``; the graph itself
            reads the rate from the ``drop_out_holder`` placeholder
        :param emb: initial weights handed to the character embedding layer
        :param ngram_embedding: optional list of initial weights, one entry
            per element of ``self.ngram``
        :return: None; everything is stored on ``self``
        :raises AssertionError: if ``ngram_embedding`` and ``self.ngram``
            differ in length, or if the per-bucket lists end up with
            different lengths
        """
        # trained_model: path where the model is stored
        if trained_model is not None:
            param_dic = {
                'nums_chars': self.nums_chars,
                'nums_tags': self.nums_tags,
                'tag_scheme': self.tag_scheme,
                'crf': self.crf,
                'emb_dim': emb_dim,
                'gru': gru,
                'rnn_dim': rnn_dim,
                'rnn_num': rnn_num,
                'drop_out': drop_out,
                'buckets_char': self.buckets_char,
                'ngram': self.ngram
            }
            print("RNN dimension is %d" % rnn_dim)
            print("RNN number is %d" % rnn_num)
            print("Character embedding size is %d" % emb_dim)
            # Store the model hyper-parameters
            if self.metric == 'All':
                # Split the path right after its last '/'. rfind() yields -1
                # when there is no '/', so a bare file name simply gets an
                # empty directory prefix instead of raising ValueError.
                pindex = trained_model.rfind('/') + 1
                param_paths = [
                    trained_model[:pindex] + m + '_' + trained_model[pindex:]
                    for m in self.all_metrics
                ]
            else:
                param_paths = [trained_model]
            for param_path in param_paths:
                # `with` closes the handle even if pickling fails.
                # NOTE(review): text mode 'w' is kept on purpose so that the
                # file stays consistent with whatever mode the reader uses.
                with open(param_path, 'w') as f_model:
                    pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        # Character embedding layer.
        # emb_dim is the feature-vector size of each character and `emb`
        # carries the pre-trained character embeddings, if any.
        # NOTE(review): why 500 is added to the number of characters is not
        # visible from this method.
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500,
                                        emb_dim,
                                        weights=emb,
                                        name='emb_layer')

        if self.ngram is not None:
            if ngram_embedding is not None:
                assert len(ngram_embedding) == len(self.ngram)
            else:
                ngram_embedding = [None] * len(self.ngram)
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 1000 * (i + 2),
                                   emb_dim,
                                   weights=ngram_embedding[i],
                                   name=str(i + 2) + 'gram_layer'))

        # Hidden layer: its input is the forward RNN output joined with the
        # backward RNN output, hence rnn_dim * 2 inputs; the output size is
        # the number of tags.
        tag_output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2,
                                                         self.nums_tags[0],
                                                         activation='linear',
                                                         name='tag_hidden'),
                                             name='tag_output_wrapper')

        if self.char_freq_loss:
            freq_output_wrapper = TimeDistributed(HiddenLayer(
                rnn_dim * 2, 1, activation='sigmoid', name='freq_hidden'),
                                                  name='freq_output_wrapper')

        if self.co_train:
            lm_fw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                        self.nums_chars + 2,
                                                        activation='linear',
                                                        name='lm_fw_hidden'),
                                            name='lm_fw_wrapper')
            lm_bw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                        self.nums_chars + 2,
                                                        activation='linear',
                                                        name='lm_bw_hidden'),
                                            name='lm_bw_wrapper')

        def highway_pair(fw, bw, prefix):
            # Send both directions through their own highway stack (scopes
            # '<prefix>_fw' / '<prefix>_bw') when highway layers are enabled;
            # otherwise hand the inputs back unchanged.
            if self.highway_layers > 0:
                fw = highway_network(fw,
                                     self.highway_layers,
                                     True,
                                     is_train=True,
                                     scope=prefix + "_fw")
                bw = highway_network(bw,
                                     self.highway_layers,
                                     True,
                                     is_train=True,
                                     scope=prefix + "_bw")
            return fw, bw

        # define model for each bucket
        # Sentence length differs from bucket to bucket, so a separate
        # sub-graph is defined for each one.
        # bucket: the sentence length of that bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Reuse only has to be switched on once; every later bucket
                # then reuses the variables created for the first one.
                scope.reuse_variables()
            t1 = time()

            # Input sentences as integer ids,
            # shape = (batch_size, sentence length)
            input_sentences = tf.placeholder(tf.int32, [None, bucket],
                                             name='input_' + str(bucket))

            self.input_v.append([input_sentences])

            emb_set = [self.emb_layer(input_sentences)]

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket],
                                             name='input_g' + str(i) +
                                             str(bucket))
                    self.input_v[-1].append(input_g)
                    emb_set.append(self.gram_layers[i](input_g))

            if len(emb_set) > 1:
                # The character and n-gram embeddings are concatenated
                # directly along the feature axis.
                word_embeddings = tf.concat(axis=2, values=emb_set)
            else:
                word_embeddings = emb_set[0]

            # concat_output=False: the forward and backward RNN outputs are
            # returned separately so each head can transform them on its own.
            rnn_out_fw, rnn_out_bw = BiRNN(rnn_dim,
                                           p=dr,
                                           concat_output=False,
                                           gru=gru,
                                           name='BiLSTM' + str(bucket),
                                           scope='Tag-BiRNN')(word_embeddings,
                                                              input_sentences)

            # The tagging head gets its highway layers only when co-training.
            tag_rnn_out_fw, tag_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.co_train:
                tag_rnn_out_fw, tag_rnn_out_bw = highway_pair(
                    rnn_out_fw, rnn_out_bw, "tag")
            tag_rnn_out = tf.concat(values=[tag_rnn_out_fw, tag_rnn_out_bw],
                                    axis=2)

            # Apply the fully connected layer (Wx + b) to get the output.
            output = tag_output_wrapper(tag_rnn_out)
            # NOTE(review): stored as a one-element list; the reason is not
            # visible from this method.
            self.output.append([output])

            self.output_.append([
                tf.placeholder(tf.int32, [None, bucket],
                               name='tags' + str(bucket))
            ])

            self.bucket_dit[bucket] = idx

            if self.co_train:
                # language model
                lm_rnn_out_fw, lm_rnn_out_bw = highway_pair(
                    rnn_out_fw, rnn_out_bw, "lm")

                self.lm_fw_predictions.append([lm_fw_wrapper(lm_rnn_out_fw)])
                self.lm_bw_predictions.append([lm_bw_wrapper(lm_rnn_out_bw)])
                self.lm_fw_groundtruthes.append([
                    tf.placeholder(tf.int32, [None, bucket],
                                   name='lm_fw_targets' + str(bucket))
                ])
                self.lm_bw_groundtruthes.append([
                    tf.placeholder(tf.int32, [None, bucket],
                                   name='lm_bw_targets' + str(bucket))
                ])

            if self.char_freq_loss:
                freq_rnn_out_fw, freq_rnn_out_bw = highway_pair(
                    rnn_out_fw, rnn_out_bw, "freq")
                freq_rnn_out = tf.concat(
                    values=[freq_rnn_out_fw, freq_rnn_out_bw], axis=2)

                self.char_freq_groundtruthes.append([
                    tf.placeholder(tf.float32, [None, bucket],
                                   name='freq_targets_%d' % bucket)
                ])
                self.char_freq_predictions.append(
                    [freq_output_wrapper(freq_rnn_out)])

            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        # Every bucket must have contributed exactly one entry to each list.
        assert \
            len(self.input_v) == len(self.output) and \
            len(self.output) == len(self.output_) and \
            len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()