import itertools

import theano
import theano.tensor as T


class Decoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
        self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding]
        self.params = list(itertools.chain(
            *[layer.params for layer in self.layers if hasattr(layer, 'params')]))

    def forward(self, ec_H, ec_C, mask):
        (sens_size, batch_size) = T.shape(mask)

        def step(m, prev_Y, prev_H, prev_C):
            """Forward one time step of the decoder."""
            # LSTM forward time step
            (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C)
            # LSTM output
            O = self.lstm_output.forward(H)
            # Apply softmax to the LSTM output
            P = self.softmax.forward(O)
            # Greedy prediction: index of the most probable word
            predicted_Y = T.argmax(P, axis=1)
            # Embed the prediction and feed it to the next time step
            Y = self.embedding.forward(predicted_Y)
            # FIXME: deal with differing sequence lengths?
            return (P, Y, H, C)

        results, updates = theano.scan(
            fn=step,
            sequences=[mask],
            outputs_info=[
                None,
                dict(initial=T.zeros((batch_size, self.embedding_size)), taps=[-1]),
                dict(initial=ec_H, taps=[-1]),
                dict(initial=ec_C, taps=[-1]),
            ])
        # Return the softmax probabilities, shape (time, batch, vocab);
        # swap axes 0 and 1 if (batch, time, vocab) is needed downstream.
        return results[0]
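# The scan loop above greedily feeds each step's prediction back in as the
# next input. Below is a minimal NumPy sketch of that feedback loop, useful
# for checking shapes; every name in it (E, W, V, the toy dimensions) is
# hypothetical and stands in for the LSTM/TimeDistributed layers above.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

vocab, emb, hid, steps, batch = 10, 4, 8, 5, 2
E = np.random.randn(vocab, emb)       # embedding table (Embedding)
W = np.random.randn(emb + hid, hid)   # toy recurrent weights (stand-in for the LSTM)
V = np.random.randn(hid, vocab)       # output projection (lstm_output + softmax)

Y = np.zeros((batch, emb))            # initial input, as in forward()
H = np.zeros((batch, hid))            # initial hidden state (ec_H)
probs = []
for _ in range(steps):
    H = np.tanh(np.concatenate([Y, H], axis=1) @ W)
    P = softmax(H @ V)                # per-step distribution over the vocabulary
    probs.append(P)
    Y = E[P.argmax(axis=1)]           # greedy: embed the argmax prediction
probs = np.stack(probs)               # shape (steps, batch, vocab), like results[0]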
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S
from nnabla.utils.data_iterator import data_iterator_simple

# LSTM, TimeDistributed and TimeDistributedSoftmaxCrossEntropy are helpers
# defined elsewhere in this example; x_train/x_valid and the load_*_func
# callbacks are prepared by the surrounding script.

train_data_iter = data_iterator_simple(load_train_func, len(x_train), batch_size,
                                       shuffle=True, with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func, len(x_valid), batch_size,
                                       shuffle=True, with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')
monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1)
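# A sketch of the training loop that would typically follow this graph setup;
# the epoch count, the use of data_iter.next(), and reporting perplexity as
# exp(mean cross entropy) are assumptions, not part of the original snippet.
import numpy as np

max_epoch = 10
iters_per_epoch = len(x_train) // batch_size
for epoch in range(max_epoch):
    losses = []
    for _ in range(iters_per_epoch):
        x.d, t.d = train_data_iter.next()   # feed one minibatch into the graph
        loss.forward()
        solver.zero_grad()
        loss.backward()
        solver.update()
        losses.append(float(loss.d))
    monitor_perplexity.add(epoch, np.exp(np.mean(losses)))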
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5,
               rad_dim=30, emb=None, ng_embs=None, pixels=None, con_width=None,
               filters=None, pooling_size=None):
    if trained_model is not None:
        param_dic = {}
        param_dic['nums_chars'] = self.nums_chars
        param_dic['nums_tags'] = self.nums_tags
        param_dic['tag_scheme'] = self.tag_scheme
        param_dic['graphic'] = self.graphic
        param_dic['pic_size'] = self.pic_size
        param_dic['word_vec'] = self.word_vec
        param_dic['radical'] = self.radical
        param_dic['crf'] = self.crf
        param_dic['emb_dim'] = emb_dim
        param_dic['gru'] = gru
        param_dic['rnn_dim'] = rnn_dim
        param_dic['rnn_num'] = rnn_num
        param_dic['drop_out'] = drop_out
        param_dic['filter_size'] = con_width
        param_dic['filters'] = filters
        param_dic['pooling_size'] = pooling_size
        param_dic['font'] = self.font
        param_dic['buckets_char'] = self.buckets_char
        param_dic['ngram'] = self.ngram
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb,
                                        name='emb_layer')
    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ng_embs is not None:
            assert len(ng_embs) == len(self.ngram)
        else:
            ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        self.input_p = []
        assert pixels is not None and filters is not None \
            and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'),
                                         name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'),
                                       name='wrapper_p1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'),
                                         name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'),
                                       name='wrapper_p2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'),
            name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
        name='wrapper')

    # define one model for each bucket (sentence length)
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])

        emb_set = []
        if self.word_vec:
            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)
        if self.radical:
            input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_r)
            radical_out = self.radical_layer(input_r)
            emb_set.append(radical_out)
        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)
        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            pix_out = tf.unpack(pix_out, axis=1)
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unpack(pooling_out, axis=1)
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)

        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
            emb_out = tf.unpack(emb_out)
        else:
            emb_out = emb_set[0]

        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
        output = output_wrapper(rnn_out)
        output_c = tf.pack(output, axis=1)
        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

    assert len(self.input_v) == len(self.output) == len(self.output_) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
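# Since a separate sub-graph is built per sentence length, input sentences
# must be grouped by length before being fed in. A minimal sketch of that
# bucketing (bucketize is a hypothetical helper, not from the original code):
def bucketize(sentences, buckets, pad_id=0):
    """Assign each sentence to the smallest bucket it fits, padding to size."""
    grouped = {b: [] for b in buckets}
    for s in sentences:
        for b in sorted(buckets):
            if len(s) <= b:
                grouped[b].append(s + [pad_id] * (b - len(s)))
                break
    return grouped

# e.g. bucketize([[3, 1], [5, 2, 9, 4], [8]], buckets=[2, 4])
# -> {2: [[3, 1], [8, 0]], 4: [[5, 2, 9, 4]]}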
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None):
    if trained_model is not None:
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags,
                     'crf': self.crf, 'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim,
                     'rnn_num': rnn_num, 'drop_out': drop_out,
                     'buckets_char': self.buckets_char, 'ngram': self.ngram,
                     'is_space': self.is_space, 'sent_seg': self.sent_seg,
                     'emb_path': self.emb_path, 'tag_scheme': self.tag_scheme}
        f_model = open(trained_model, 'w')
        pickle.dump(param_dic, f_model)
        f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        ng_embs = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=ng_embs[i],
                               name=str(i + 2) + 'gram_layer'))

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear', name='hidden'),
        name='wrapper')

    # define one model for each bucket (sentence length)
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            scope.reuse_variables()
        t1 = time()
        input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_v])

        emb_set = []
        word_out = self.emb_layer(input_v)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if len(emb_set) > 1:
            emb_out = tf.concat(2, emb_set)
        else:
            emb_out = emb_set[0]

        emb_out = DropoutLayer(dr)(emb_out)
        emb_out = tf.unpack(emb_out)

        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)
        output = output_wrapper(rnn_out)
        output_c = tf.pack(output, axis=1)
        self.output.append([output_c])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx
        print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

    assert len(self.input_v) == len(self.output) == len(self.output_) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
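# What EmbeddingLayer amounts to: a lookup table with spare rows (the "+ 20"
# above) reserved for characters unseen at training time. A NumPy sketch with
# made-up sizes:
import numpy as np

num_chars, spare, emb_dim = 4000, 20, 64
table = np.random.randn(num_chars + spare, emb_dim) * 0.01

ids = np.array([[5, 17, 4003]])   # one sentence of character ids; 4003 is in the spare range
vectors = table[ids]              # shape (1, 3, 64): (batch, sentence length, emb_dim)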
EMBEDDING_LENGTH = len(chars)

# Creating the model.
model = Network(
    LSTM(size=512, input_size=EMBEDDING_LENGTH, batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH, stateful=True),
    LSTM(size=512, input_size=512, batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH, stateful=True),
    TimeDistributed(
        Dense(size=EMBEDDING_LENGTH, input_size=512, activation=SparseSoftmax())))

if RESTORE_MODEL_PATH:
    model.loadParams(RESTORE_MODEL_PATH)

optimizer = RMSprop(learning_rate=lambda n: 0.001)
loss_function = VectorCrossEntropy
model.assignOptimizer(optimizer)

if RESTORE_OPTIMIZER_PATH:
    optimizer.load(RESTORE_OPTIMIZER_PATH)

for epoch in range(INITIAL_EPOCH, NR_OF_EPOCHS + INITIAL_EPOCH):
    loss, accuracy = model.train(makeBatches(source, SEQUENCE_LENGTH,
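# EMBEDDING_LENGTH = len(chars) suggests the characters are one-hot encoded.
# A minimal sketch of that encoding (chars/char_to_idx here are illustrative,
# not taken from the original script):
import numpy as np

chars = sorted(set('hello world'))
char_to_idx = {c: i for i, c in enumerate(chars)}

def one_hot(text):
    out = np.zeros((len(text), len(chars)))
    out[np.arange(len(text)), [char_to_idx[c] for c in text]] = 1.0
    return out

x = one_hot('hello')   # shape (5, len(chars)), one row per character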
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5,
               rad_dim=30, emb=None, ngram_embedding=None, pixels=None, con_width=None,
               filters=None, pooling_size=None):
    """
    :param trained_model: path where the model is stored
    :param scope: variable scope
    :param emb_dim: character embedding dimension
    :param gru: use GRU cells instead of LSTM cells
    :param rnn_dim: RNN hidden dimension
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param rad_dim: radical embedding dimension
    :param emb: pre-trained character embeddings
    :param ngram_embedding: pre-trained ngram embedding file
    :param pixels: character images
    :param con_width: convolution filter width
    :param filters: number of convolution filters
    :param pooling_size: max-pooling size
    """
    # trained_model: path where the model is stored
    if trained_model is not None:
        param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags,
                     'tag_scheme': self.tag_scheme, 'graphic': self.graphic,
                     'pic_size': self.pic_size, 'word_vec': self.word_vec,
                     'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim,
                     'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num,
                     'drop_out': drop_out, 'filter_size': con_width, 'filters': filters,
                     'pooling_size': pooling_size, 'font': self.font,
                     'buckets_char': self.buckets_char, 'ngram': self.ngram}
        print('RNN dimension is %d' % rnn_dim)
        print('RNN number is %d' % rnn_num)
        print('Character embedding size is %d' % emb_dim)
        print('Ngram embedding dimension is %d' % emb_dim)

        # store the model hyper-parameters
        if self.metric == 'All':
            # rindex() returns the last occurrence of the substring,
            # used here to split off the model file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # why add 500 to the number of characters?
    # emb_dim is the per-character feature dimension, settable from the command line
    # weights are the pre-trained character embeddings, settable from the command line
    if self.word_vec:
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb,
                                        name='emb_layer')

    # radical embeddings
    # the Kangxi dictionary lists 214 radicals; only the radicals of common
    # Chinese characters are used, and those of rare characters and of
    # non-Chinese characters are replaced by two special symbols,
    # giving 216 radicals in total
    if self.radical:
        self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim,
                               weights=ngram_embedding[i],
                               name=str(i + 2) + 'gram_layer'))

    wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
        None, None, None, None, None, None

    if self.graphic:
        # using graphical (image) features requires a CNN
        self.input_p = []
        assert pixels is not None and filters is not None \
            and pooling_size is not None and con_width is not None
        self.pixels = pixels
        pixel_dim = int(math.sqrt(len(pixels[0])))
        wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'),
                                         name='wrapper_c1')
        wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'),
                                       name='wrapper_p1')
        p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)
        wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'),
                                         name='wrapper_c2')
        wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'),
                                       name='wrapper_p2')
        p_size_2 = toolbox.down_pool(p_size_1, pooling_size)
        wrapper_dense = TimeDistributed(
            HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'),
            name='wrapper_3')
        wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

    with tf.variable_scope('BiRNN'):
        if gru:
            fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
        else:
            fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
        if rnn_num > 1:
            fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
            bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

    # hidden layer on top of the concatenated forward and backward RNN outputs,
    # hence the input dimension rnn_dim * 2; the output dimension is the number of tags
    output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
        name='wrapper')

    # define one model per bucket: sentences in different buckets have
    # different lengths, so each needs its own sub-graph
    # bucket: the sentence length of the bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be set once, after which all variables are reused
            scope.reuse_variables()
        t1 = time()

        # input sentences, one-hot (id) vectors; shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []
        if self.word_vec:
            # look up the character vectors for the one-hot ids
            # word_out: shape = (batch_size, sentence length, emb_dim (64))
            word_out = self.emb_layer(input_sentences)
            emb_set.append(word_out)

        if self.radical:
            # embed radical information, shape = (batch_size, sentence length)
            input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))
            self.input_v[-1].append(input_radicals)
            radical_out = self.radical_layer(input_radicals)
            emb_set.append(radical_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if self.graphic:
            input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
            self.input_p.append(input_p)
            pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
            conv_out_1 = wrapper_conv_1(pix_out)
            pooling_out_1 = wrapper_mp_1(conv_out_1)
            conv_out_2 = wrapper_conv_2(pooling_out_1)
            pooling_out_2 = wrapper_mp_2(conv_out_2)
            assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
            pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
            pooling_out = tf.unstack(pooling_out, axis=1)
            graphic_out = wrapper_dense(pooling_out)
            graphic_out = wrapper_dr(graphic_out)
            emb_set.append(graphic_out)

        if self.window_size > 1:
            padding_size = int(np.floor(self.window_size / 2))
            word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]],
                                 'CONSTANT')
            Ws = []
            for q in range(1, self.window_size + 1):
                Ws.append(tf.get_variable('W_%d' % q, shape=[q * emb_dim, self.filters_number]))
            b = tf.get_variable('b', shape=[self.filters_number])
            z = [None for _ in range(0, bucket)]
            for q in range(1, self.window_size + 1):
                for i in range(padding_size, bucket + padding_size):
                    low = i - int(np.floor((q - 1) / 2))
                    high = i + int(np.ceil((q + 1) / 2))
                    x = word_padded[:, low, :]
                    for j in range(low + 1, high):
                        x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                    z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                    if z[i - padding_size] is None:
                        z[i - padding_size] = z_iq
                    else:
                        z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)
            z = tf.stack(z, axis=1)
            values, indices = tf.nn.top_k(z, sorted=False, k=emb_dim)

            # highway layer
            X = tf.unstack(word_out, axis=1)
            Conv_X = tf.unstack(values, axis=1)
            X_hat = []
            W_t = tf.get_variable('W_t', shape=[emb_dim, emb_dim])
            b_t = tf.get_variable('b_t', shape=[emb_dim])
            for x, conv_x in zip(X, Conv_X):
                T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))
                X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
            X_hat = tf.stack(X_hat, axis=1)
            emb_set.append(X_hat)

        if len(emb_set) > 1:
            # concatenate all the character-level features directly
            # (character vectors, radicals, n-grams, graphical features, etc.)
            emb_out = tf.concat(axis=2, values=emb_set)
        else:
            emb_out = emb_set[0]

        # rnn_out is the concatenation of the forward and backward RNN outputs
        rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                         name='BiLSTM' + str(bucket), scope='BiRNN')(
                             self.highway(emb_out, 'tag'), input_sentences)

        # apply a fully connected layer, Wx + b, to get the final output
        output = output_wrapper(rnn_out)
        # why [output] rather than output?
        self.output.append([output])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx

        # language model
        lm_rnn_dim = rnn_dim
        with tf.variable_scope('LM-BiRNN'):
            if gru:
                lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
            else:
                lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
            if rnn_num > 1:
                lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num,
                                                             state_is_tuple=True)
                lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num,
                                                             state_is_tuple=True)
        lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell, bw_cell=lm_bw_rnn_cell,
                               p=dr, name='LM-BiLSTM' + str(bucket), scope='LM-BiRNN')(
                                   self.highway(emb_set[0]), input_sentences)
        lm_output_wrapper = TimeDistributed(
            HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear',
                        name='lm_hidden'),
            name='lm_wrapper')
        lm_final_output = lm_output_wrapper(lm_rnn_output)
        self.lm_predictions.append([lm_final_output])
        self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket],
                                                     name='lm_targets' + str(bucket))])

        print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.lm_predictions) == len(self.lm_groundtruthes) and \
        len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
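# The highway step above computes output = T(x) * conv_x + (1 - T(x)) * x,
# with T the sigmoid transform gate. A NumPy sketch of one such step
# (weights and inputs are random placeholders):
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

batch, emb_dim = 8, 64
W_t = np.random.randn(emb_dim, emb_dim) * 0.01
b_t = np.full(emb_dim, -1.0)                # negative bias favours carrying x through

x = np.random.randn(batch, emb_dim)         # original embedding at one position
conv_x = np.random.randn(batch, emb_dim)    # convolutional features at the same position
T_x = sigmoid(x @ W_t + b_t)                # transform gate
x_hat = conv_x * T_x + x * (1 - T_x)        # gated mix of new and original features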
h = PF.embed(x, char_vocab_size, char_embedding_dim)
h = F.transpose(h, (0, 3, 1, 2))

output = []
for f, f_size in zip(filters, filster_sizes):
    _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size // 2),
                        name='conv_{}'.format(f_size))
    _h = F.max_pooling(_h, kernel=(1, word_length))
    output.append(_h)
h = F.concatenate(*output, axis=1)

h = F.transpose(h, (0, 2, 1, 3))
h = F.reshape(h, (batch_size, sentence_length, sum(filters)))
# h = PF.batch_normalization(h, axes=[2])
h = TimeDistributed(Highway)(h, name='highway1')
h = TimeDistributed(Highway)(h, name='highway2')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm1')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm2')
h = TimeDistributed(PF.affine)(h, lstm_size, name='hidden')
y = TimeDistributed(PF.affine)(h, word_vocab_size, name='output')

t = nn.Variable((batch_size, sentence_length, 1))

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())
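# The mask/count arithmetic above averages the cross entropy over real tokens
# only, so 'pad' (id 0) never contributes to the loss. A NumPy sketch of the
# same masked loss:
import numpy as np

def masked_ce(logits, targets):
    # logits: (batch, time, vocab); targets: (batch, time) with 0 = pad
    p = np.exp(logits - logits.max(-1, keepdims=True))
    p /= p.sum(-1, keepdims=True)
    ce = -np.log(np.take_along_axis(p, targets[..., None], axis=-1))[..., 0]
    mask = np.sign(targets)              # 1 at real tokens, 0 at pad
    count = mask.sum(axis=1)             # true length of each sentence
    return np.mean((ce * mask).sum(axis=1) / count)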
def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5,
               emb=None, ngram_embedding=None):
    """
    :param trained_model: path where the model is stored
    :param scope: variable scope
    :param emb_dim: character embedding dimension
    :param gru: use GRU cells instead of LSTM cells
    :param rnn_dim: RNN hidden dimension
    :param rnn_num: number of stacked RNN layers
    :param drop_out: dropout rate
    :param emb: pre-trained character embeddings
    :param ngram_embedding: pre-trained ngram embeddings
    """
    # trained_model: path where the model is stored
    if trained_model is not None:
        param_dic = {
            'nums_chars': self.nums_chars,
            'nums_tags': self.nums_tags,
            'tag_scheme': self.tag_scheme,
            'crf': self.crf,
            'emb_dim': emb_dim,
            'gru': gru,
            'rnn_dim': rnn_dim,
            'rnn_num': rnn_num,
            'drop_out': drop_out,
            'buckets_char': self.buckets_char,
            'ngram': self.ngram
        }
        print('RNN dimension is %d' % rnn_dim)
        print('RNN number is %d' % rnn_num)
        print('Character embedding size is %d' % emb_dim)

        # store the model hyper-parameters
        if self.metric == 'All':
            # rindex() returns the last occurrence of the substring,
            # used here to split off the model file name
            pindex = trained_model.rindex('/') + 1
            for m in self.all_metrics:
                f_model = open(trained_model[:pindex] + m + '_' + trained_model[pindex:], 'w')
                pickle.dump(param_dic, f_model)
                f_model.close()
        else:
            f_model = open(trained_model, 'w')
            pickle.dump(param_dic, f_model)
            f_model.close()

    # define shared weights and variables
    dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
    self.drop_out = dr
    self.drop_out_v = drop_out

    # character embedding layer
    # why add 500 to the number of characters?
    # emb_dim is the per-character feature dimension, settable from the command line
    # weights are the pre-trained character embeddings, settable from the command line
    self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

    if self.ngram is not None:
        if ngram_embedding is not None:
            assert len(ngram_embedding) == len(self.ngram)
        else:
            ngram_embedding = [None for _ in range(len(self.ngram))]
        for i, n_gram in enumerate(self.ngram):
            self.gram_layers.append(
                EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim,
                               weights=ngram_embedding[i],
                               name=str(i + 2) + 'gram_layer'))

    # hidden layer on top of the concatenated forward and backward RNN outputs,
    # hence the input dimension rnn_dim * 2; the output dimension is the number of tags
    tag_output_wrapper = TimeDistributed(
        HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='tag_hidden'),
        name='tag_output_wrapper')

    if self.char_freq_loss:
        freq_output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, 1, activation='sigmoid', name='freq_hidden'),
            name='freq_output_wrapper')

    if self.co_train:
        lm_fw_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim, self.nums_chars + 2, activation='linear', name='lm_fw_hidden'),
            name='lm_fw_wrapper')
        lm_bw_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim, self.nums_chars + 2, activation='linear', name='lm_bw_hidden'),
            name='lm_bw_wrapper')

    # define one model per bucket: sentences in different buckets have
    # different lengths, so each needs its own sub-graph
    # bucket: the sentence length of the bucket
    for idx, bucket in enumerate(self.buckets_char):
        if idx == 1:
            # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer);
            # reuse only needs to be set once, after which all variables are reused
            scope.reuse_variables()
        t1 = time()

        # input sentences, one-hot (id) vectors; shape = (batch_size, sentence length)
        input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))
        self.input_v.append([input_sentences])

        emb_set = []
        word_out = self.emb_layer(input_sentences)
        emb_set.append(word_out)

        if self.ngram is not None:
            for i in range(len(self.ngram)):
                input_g = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_g' + str(i) + str(bucket))
                self.input_v[-1].append(input_g)
                gram_out = self.gram_layers[i](input_g)
                emb_set.append(gram_out)

        if len(emb_set) > 1:
            # concatenate all the character-level features directly
            # (character vectors, n-grams, etc.)
            word_embeddings = tf.concat(axis=2, values=emb_set)
        else:
            word_embeddings = emb_set[0]

        # forward and backward RNN outputs, kept separate (concat_output=False)
        rnn_out_fw, rnn_out_bw = BiRNN(rnn_dim, p=dr, concat_output=False, gru=gru,
                                       name='BiLSTM' + str(bucket),
                                       scope='Tag-BiRNN')(word_embeddings, input_sentences)

        tag_rnn_out_fw, tag_rnn_out_bw = rnn_out_fw, rnn_out_bw
        if self.co_train:
            if self.highway_layers > 0:
                tag_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True,
                                                 is_train=True, scope='tag_fw')
                tag_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True,
                                                 is_train=True, scope='tag_bw')
        tag_rnn_out = tf.concat(values=[tag_rnn_out_fw, tag_rnn_out_bw], axis=2)

        # apply a fully connected layer, Wx + b, to get the final output
        output = tag_output_wrapper(tag_rnn_out)
        # why [output] rather than output?
        self.output.append([output])
        self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
        self.bucket_dit[bucket] = idx

        if self.co_train:
            # language model
            lm_rnn_out_fw, lm_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                lm_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True,
                                                is_train=True, scope='lm_fw')
                lm_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True,
                                                is_train=True, scope='lm_bw')
            self.lm_fw_predictions.append([lm_fw_wrapper(lm_rnn_out_fw)])
            self.lm_bw_predictions.append([lm_bw_wrapper(lm_rnn_out_bw)])
            self.lm_fw_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket],
                                                            name='lm_fw_targets' + str(bucket))])
            self.lm_bw_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket],
                                                            name='lm_bw_targets' + str(bucket))])

        if self.char_freq_loss:
            freq_rnn_out_fw, freq_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.highway_layers > 0:
                freq_rnn_out_fw = highway_network(rnn_out_fw, self.highway_layers, True,
                                                  is_train=True, scope='freq_fw')
                freq_rnn_out_bw = highway_network(rnn_out_bw, self.highway_layers, True,
                                                  is_train=True, scope='freq_bw')
            freq_rnn_out = tf.concat(values=[freq_rnn_out_fw, freq_rnn_out_bw], axis=2)
            self.char_freq_groundtruthes.append([tf.placeholder(tf.float32, [None, bucket],
                                                                name='freq_targets_%d' % bucket)])
            self.char_freq_predictions.append([freq_output_wrapper(freq_rnn_out)])

        print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

    assert \
        len(self.input_v) == len(self.output) and \
        len(self.output) == len(self.output_) and \
        len(self.output) == len(self.counts)
    self.params = tf.trainable_variables()
    self.saver = tf.train.Saver()
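# For the forward/backward language models, targets are usually the input
# shifted by one position (next character for the forward LM, previous one
# for the backward LM). The original derives them elsewhere; this sketch is
# an assumption, with a hypothetical boundary id:
import numpy as np

boundary = 1                          # hypothetical sentence-boundary id
sent = np.array([[7, 12, 5, 9]])      # one sentence of character ids
fw_targets = np.concatenate([sent[:, 1:], np.full((1, 1), boundary)], axis=1)
bw_targets = np.concatenate([np.full((1, 1), boundary), sent[:, :-1]], axis=1)
# fw_targets: [[12, 5, 9, 1]]   bw_targets: [[1, 7, 12, 5]]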