def MultiDropoutGRUCell(size=DEFAULT_GRU_INTERNAL_SIZE,
                        pkeep=DEFAULT_GRU_DROPOUT_KEEP_RATE,
                        nlayers=DEFAULT_GRU_LAYERS):
    """Build a multi-layer GRU cell with dropout on every layer and on the output.

    Args:
        size: number of units, forwarded to `DropoutGRUCell` for each layer.
        pkeep: keep probability, forwarded to `DropoutGRUCell` for each layer
            and also used as the output keep probability of the whole stack.
        nlayers: number of layers in the stack.

    Returns:
        A `rnn.DropoutWrapper` around a `rnn.MultiRNNCell` built with
        `state_is_tuple=False`.
    """
    # One independent cell object per layer. The previous `[cell] * nlayers`
    # repeated the SAME object, which makes every layer share one set of
    # weights (or fails with a variable-reuse error, depending on the
    # TensorFlow version).
    layers = [DropoutGRUCell(size=size, pkeep=pkeep) for _ in range(nlayers)]
    stacked = rnn.MultiRNNCell(layers, state_is_tuple=False)
    return rnn.DropoutWrapper(stacked, output_keep_prob=pkeep)
def __init__(self, config, batch, lens_batch, emotion_batch, nrc_batch,
             embed_matrix, phase=Phase.Predict):
    """Build the GRU emotion-classification graph for the given phase.

    Args:
        config: object providing `input_dropout` and `hidden_dropout`, the
            keep probabilities used by the dropout wrapper in training.
        batch: array; `shape[0]` is used as the learning-rate decay step
            count, `shape[1]` as the batch size and `shape[2]` as the
            (maximum) number of timesteps.
        lens_batch: not used by this constructor; kept for the interface.
        emotion_batch: array; `shape[2]` is used as the number of emotions.
        nrc_batch: not used by this constructor; kept for the interface.
        embed_matrix: 2-D array; only its shape is read, to size the
            embedding placeholder.
        phase: a `Phase` member selecting which parts of the graph (loss,
            optimizer, accuracy) get built.
    """
    batch_size = batch.shape[1]
    input_size = batch.shape[2]  # original author's note: 31
    emotion_size = emotion_batch.shape[2]  # original author's note: 6

    # The tweets. input_size is the (maximum) number of timesteps,
    # i.e. maximum tweet length.
    self._x = tf.placeholder(tf.int32, shape=[batch_size, input_size])

    # This tensor provides the actual number of timesteps for each
    # instance (words in a tweet).
    self._lens = tf.placeholder(tf.int32, shape=[batch_size])

    # The emotion distribution (gold labels); absent when only predicting.
    if phase != Phase.Predict:
        self._y = tf.placeholder(tf.int32, shape=[batch_size, emotion_size])

    # Embedding matrix, fed at run time.
    self._embed = tf.placeholder(
        tf.float32, shape=[embed_matrix.shape[0], embed_matrix.shape[1]])
    word_embeddings = tf.nn.embedding_lookup(self._embed, self._x)

    # Lexicon features, concatenated to the word embeddings per timestep.
    self._lexicon = lexicon = tf.placeholder(
        tf.float32, shape=[batch_size, input_size, emotion_size])
    features = tf.concat([word_embeddings, lexicon], axis=2)

    cell = rnn.GRUCell(100)
    if phase == Phase.Train:
        # Dropout is only applied while training.
        cell = rnn.DropoutWrapper(
            cell,
            input_keep_prob=config.input_dropout,
            state_keep_prob=config.hidden_dropout)
    _, hidden = tf.nn.dynamic_rnn(
        cell, features, sequence_length=self._lens, dtype=tf.float32)

    w = tf.get_variable("w", shape=[hidden.shape[1], emotion_size])
    b = tf.get_variable("b", shape=[emotion_size])
    logits = tf.matmul(hidden, w) + b

    # BUG FIX: this used to read `phase == Phase.Train or Phase.Validation`,
    # which is always truthy, so the loss was also built in the Predict
    # phase, where `self._y` does not exist.
    if phase in (Phase.Train, Phase.Validation):
        losses = tf.nn.softmax_cross_entropy_with_logits(
            labels=self._y, logits=logits)
        self._loss = loss = tf.reduce_sum(losses)

    if phase == Phase.Train:
        start_lr = 0.01
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(
            start_lr, global_step, batch.shape[0], 0.90)
        # Minimizing the summed loss gives the same gradients as passing
        # the per-example vector (gradients of a vector are summed).
        self._train_op = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=global_step)

    # NOTE(review): the original nesting of this statement was lost in the
    # flattened source; building it in every phase is a superset of either
    # reading and only adds a graph node.
    self._probs = tf.nn.softmax(logits)

    if phase == Phase.Validation:
        # Emotions of the gold data. `self._y` is read directly because it
        # is the attribute this constructor sets.
        self._gold = gold_emotions = tf.argmax(self._y, axis=1)
        # Predicted emotions
        self._pred = pred_emotions = tf.argmax(logits, axis=1)
        correct = tf.cast(tf.equal(gold_emotions, pred_emotions), tf.float32)
        self._accuracy = tf.reduce_mean(correct)
def get_a_cell(lstm_size, keep_prob):
    """Return a `BasicLSTMCell` of `lstm_size` units wrapped in output dropout.

    Args:
        lstm_size: number of units passed to `rnn.BasicLSTMCell`.
        keep_prob: output keep probability passed to `rnn.DropoutWrapper`.
    """
    return rnn.DropoutWrapper(
        rnn.BasicLSTMCell(lstm_size),
        output_keep_prob=keep_prob,
    )
def __init__(self, use_lstm=False, num_samples=512, forward_only=False):
    """Build the bucketed embedding-attention seq2seq graph.

    Sizes, learning-rate settings, buckets and directories are read from the
    module-level `config` object.

    Args:
        use_lstm: use `BasicLSTMCell` layers instead of `GRUCell` layers.
        num_samples: number of classes sampled by the sampled-softmax loss;
            sampled softmax is only used when
            `0 < num_samples < target vocabulary size`.
        forward_only: when true, build the decoding graph only (previous
            outputs are fed back, no dropout, no gradient ops).
    """
    self.source_vocab_size = config.vocabulary_size
    self.target_vocab_size = config.vocabulary_size
    self.buckets = config.BUCKETS
    self.batch_size = config.FLAGS.batch_size
    self.learning_rate = tf.Variable(
        float(config.FLAGS.learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * config.FLAGS.learning_rate_decay_factor)
    # NOTE: `lsmt_size` (sic) is kept because it is a public attribute name.
    self.lsmt_size = config.FLAGS.lstm_size
    self.num_layers = config.FLAGS.num_layers
    self.dropout = config.FLAGS.dropout
    self.max_gradient_norm = config.FLAGS.max_gradient_norm
    self.global_step = tf.Variable(0, trainable=False)
    self.model_dir = config.model_dir

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
        proj_w = tf.get_variable(
            'proj_w', [self.lsmt_size, self.target_vocab_size])
        proj_w_t = tf.transpose(proj_w)
        proj_b = tf.get_variable('proj_b', [self.target_vocab_size])
        output_projection = (proj_w, proj_b)

        def sampled_loss(labels, logits):
            # Add one dimension (nb of true classes, here 1).
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats
            # to avoid numerical instabilities.
            local_w_t = tf.cast(proj_w_t, tf.float32)
            local_b = tf.cast(proj_b, tf.float32)
            local_inputs = tf.cast(logits, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    local_w_t,  # Should have shape [num_classes, dim]
                    local_b,
                    labels,
                    local_inputs,
                    num_samples,  # Number of classes sampled per batch
                    self.target_vocab_size),  # The number of classes
                tf.float32)

        softmax_loss_function = sampled_loss

    def make_layer():
        # One fresh recurrent layer; dropout is only applied when training.
        if use_lstm:
            layer = rnn.BasicLSTMCell(self.lsmt_size)
        else:
            layer = rnn.GRUCell(self.lsmt_size)
        if not forward_only:
            layer = rnn.DropoutWrapper(
                layer, input_keep_prob=1.0, output_keep_prob=self.dropout)
        return layer

    def make_cell():
        # BUG FIX: the stack used to be `[single_call] * num_layers`, i.e.
        # the same cell object in every layer, so all layers shared one set
        # of weights (copy.deepcopy preserved that aliasing). Each layer is
        # now its own object. NOTE: this changes the variable set of
        # multi-layer models, so older multi-layer checkpoints may not load.
        if self.num_layers > 1:
            return rnn.MultiRNNCell(
                [make_layer() for _ in range(self.num_layers)])
        return make_layer()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        # A fresh cell per call replaces the former deepcopy of a shared one.
        return legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            make_cell(),
            num_encoder_symbols=self.source_vocab_size,
            num_decoder_symbols=self.target_vocab_size,
            embedding_size=self.lsmt_size,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs, sized for the largest (last) bucket.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(self.buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in range(self.buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses.
    self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        self.buckets,
        lambda x, y: seq2seq_f(x, y, forward_only),
        softmax_loss_function=softmax_loss_function)

    # If we use output projection, we need to project outputs for decoding.
    if forward_only and output_projection is not None:
        # The loop variable is deliberately not called `b`: that name used
        # to rebind the projection bias captured by `sampled_loss`.
        for bucket_id in range(len(self.buckets)):
            self.outputs[bucket_id] = [
                tf.matmul(output, output_projection[0]) + output_projection[1]
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model.
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        params = tf.trainable_variables()
        for bucket_id in range(len(self.buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    # `tf.global_variables()` replaces the deprecated `tf.all_variables()`.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
    self.mergedSummaries = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(config.graph_dir)
# Character inputs, one-hot encoded.
X = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='X')  # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, depth=ALPHASIZE, on_value=1.0, off_value=0.0)  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]

# Expected outputs = same sequence shifted by 1, since we are trying to
# predict the next character.
Y_ = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='Y_')  # [ BATCHSIZE, SEQLEN ]
Yo_ = tf.one_hot(Y_, depth=ALPHASIZE, on_value=1.0, off_value=0.0)  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]

# Input state.
Hin = tf.placeholder(
    dtype=tf.float32,
    shape=[None, INTERNALSIZE * NLAYERS],
    name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS ]

# NLAYERS layers of GRU cells (original author's note: NLAYERS=3, unrolled
# SEQLEN=30 times). dynamic_rnn infers SEQLEN from the size of the inputs Xo.
# How to properly apply dropout in RNNs: see README.md
cells = [rnn.GRUCell(INTERNALSIZE) for _layer in range(NLAYERS)]
# "naive dropout" implementation: dropout on the input of every layer ...
dropcells = [
    rnn.DropoutWrapper(gru, input_keep_prob=pkeep) for gru in cells
]
# ... plus dropout on the stack output, i.e. for the softmax layer.
multicell = rnn.DropoutWrapper(
    rnn.MultiRNNCell(dropcells, state_is_tuple=False),
    output_keep_prob=pkeep)

Yr, H = tf.nn.dynamic_rnn(
    multicell, Xo, dtype=tf.float32, initial_state=Hin)
# Yr: [ BATCHSIZE, SEQLEN, INTERNALSIZE ]
# H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ]  # this is the last state in the sequence

H = tf.identity(H, name='H')  # just to give it a name

# Softmax layer implementation:
# Flatten the first two dimension of the output [ BATCHSIZE, SEQLEN, ALPHASIZE ] => [ BATCHSIZE x SEQLEN, ALPHASIZE ]
# then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
# From the readout point of view, a value coming from a sequence time step or a minibatch item is the same thing.
# Create our TensorFlow Graph.

# Run-time hyper-parameters.
batchsize = tf.placeholder(dtype=tf.int32, name='batchsize')
lr = tf.placeholder(dtype=tf.float32, name='lr')
pkeep = tf.placeholder(dtype=tf.float32, name='pkeep')

# Input characters; one-hot vectors of size ALPHA_SIZE, all 0 except the
# position of the character.
X = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='X')
Xo = tf.one_hot(X, depth=ALPHA_SIZE, on_value=1.0, off_value=0.0)

# Expected output characters, one-hot encoded the same way.
Y_ = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='Y_')
Yo_ = tf.one_hot(Y_, depth=ALPHA_SIZE, on_value=1.0, off_value=0.0)

# Recurrent input states.
Hin = tf.placeholder(
    dtype=tf.float32, shape=[None, NUM_OF_GRUS * NUM_LAYERS], name='Hin')

# One GRU cell per layer, each with dropout on its input.
cells = [rnn.GRUCell(NUM_OF_GRUS) for _layer in range(NUM_LAYERS)]
dropcells = [
    rnn.DropoutWrapper(gru, input_keep_prob=pkeep) for gru in cells
]
# Stack the layers and add dropout on the output feeding the softmax layer.
multicell = rnn.DropoutWrapper(
    rnn.MultiRNNCell(dropcells, state_is_tuple=False),
    output_keep_prob=pkeep)

# Unrolling through time happens here.
Yr, H = tf.nn.dynamic_rnn(
    multicell, Xo, dtype=tf.float32, initial_state=Hin)
H = tf.identity(H, name='H')  # Last state of sequence

# Readout: flatten the outputs, apply a linear layer, compare with the
# flattened one-hot targets.
Yflat = tf.reshape(Yr, [-1, NUM_OF_GRUS])
Ylogits = layers.linear(Yflat, ALPHA_SIZE)
Yflat_ = tf.reshape(Yo_, [-1, ALPHA_SIZE])
loss = tf.reshape(
    tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_),
    [batchsize, -1])
Yo = tf.nn.softmax(Ylogits, name='Yo')
Y = tf.argmax(Yo, 1)
def gru_cell(self):
    """Return a GRU cell of `self.hidden_size` units with output dropout.

    The cell is created under the `gru_cell` name scope, inherits the
    `reuse` flag of the current variable scope, and is wrapped with
    `self.keep_prob` as output keep probability.
    """
    with tf.name_scope('gru_cell'):
        reuse_flag = tf.get_variable_scope().reuse
        base = rnn.GRUCell(self.hidden_size, reuse=reuse_flag)
        wrapped = rnn.DropoutWrapper(base, output_keep_prob=self.keep_prob)
    return wrapped
def lstm_cell(hidden_size, keep_prob):
    """Return a `BasicLSTMCell` (forget bias 1.0) wrapped in output dropout.

    Args:
        hidden_size: number of units of the LSTM cell.
        keep_prob: output keep probability of the dropout wrapper.
    """
    return rnn.DropoutWrapper(
        rnn.BasicLSTMCell(num_units=hidden_size, forget_bias=1.0),
        output_keep_prob=keep_prob,
    )
def __init__(self, args, training=True):
    """Build the RNN language-model graph (training or sampling mode).

    Args:
        args: settings object. Fields read here: `model`, `num_layers`,
            `nums_size`, `vocab_size`, `batch_size`, `seq_length`,
            `input_keep_prob`, `output_keep_prob`. When `training` is
            false, `batch_size` and `seq_length` are overwritten with 1.
        training: build the training graph when true; otherwise build the
            sampling graph, where each step's prediction is fed back as the
            next input.

    Raises:
        Exception: if `args.model` is not 'rnn', 'lstm', 'gru' or 'nas'.
    """
    self.args = args
    if not training:
        # Sampling handles one symbol at a time.
        args.batch_size = 1
        args.seq_length = 1

    # BUG FIX: this chain used to be attached to `if not training` with
    # `elif`, so no cell type was ever selected in sampling mode.
    if args.model == 'rnn':
        cell_fn = rnn.BasicRNNCell
    elif args.model == 'lstm':
        cell_fn = rnn.BasicLSTMCell
    elif args.model == 'gru':
        # BUG FIX: was `cell_fn == rnn.GRUCell` (a comparison, not an
        # assignment), which left `cell_fn` undefined.
        cell_fn = rnn.GRUCell
    elif args.model == 'nas':
        cell_fn = rnn.NASCell
    else:
        raise Exception('model type not support')

    # Build the hidden layers.
    use_dropout = training and (args.input_keep_prob < 1.0
                                or args.output_keep_prob < 1.0)
    cells = []
    for _ in range(args.num_layers):
        cell = cell_fn(args.nums_size)
        if use_dropout:
            cell = rnn.DropoutWrapper(
                cell,
                input_keep_prob=args.input_keep_prob,
                output_keep_prob=args.output_keep_prob)
        cells.append(cell)
    self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    # Build the input layer: placeholders.
    self.input_data = tf.placeholder(
        tf.int32, shape=[args.batch_size, args.seq_length])
    self.targets = tf.placeholder(
        tf.int32, shape=[args.batch_size, args.seq_length])
    # BUG FIX: the attribute was stored as `intial_state` but read below as
    # `initial_state`. The misspelled name is kept as an alias for callers
    # that may already rely on it.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)
    self.intial_state = self.initial_state

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            'softmax_w', shape=[args.nums_size, args.vocab_size])
        softmax_b = tf.get_variable('softmax_b', shape=[args.vocab_size])

    # NOTE(review): the original nesting was lost in the flattened source;
    # the embedding is assumed to live outside the 'rnnlm' scope. Confirm
    # against existing checkpoints.
    embeddings = tf.get_variable(
        'embedding', [args.vocab_size, args.nums_size])
    # Shape after lookup: [batch_size, seq_length, nums_size].
    inputs = tf.nn.embedding_lookup(embeddings, self.input_data)

    # Apply dropout to the input layer during training.
    if training and args.output_keep_prob < 1.0:
        # BUG FIX: `tf.nn.dropout` has no `output_keep_prob` keyword; the
        # keep probability is its second positional argument.
        inputs = tf.nn.dropout(inputs, args.output_keep_prob)

    # Split along the seq_length axis into seq_length tensors of shape
    # [batch_size, 1, nums_size] ...
    inputs = tf.split(inputs, args.seq_length, 1)
    # ... and squeeze them into a list of [batch_size, nums_size] tensors.
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # The loop function chains the rnn_cell steps: it projects the previous
    # output h(t-1) and feeds the embedding of the most likely symbol as
    # the input of step t. It is only needed when sampling (letting the
    # model write text by itself); it is not used during training.
    def loop(prev, _):
        prev = tf.matmul(prev, softmax_w) + softmax_b
        # Index (within vocab_size) of the most likely symbol.
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embeddings, prev_symbol)

    # `outputs` is a list with one [batch_size, nums_size] tensor per step.
    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop if not training else None,
        scope='rnnlm')

    # Basically the same as the PTB model: rearrange the outputs into a
    # [batch_size * seq_length, nums_size] matrix, project it to the
    # vocabulary, and score it with sequence_loss_by_example.
    # BUG FIX: this used to concatenate the undefined name `output`.
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.nums_size])
    # Shape: [batch_size * seq_length, vocab_size].
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # sequence_loss_by_example(logits, targets, weights):
    #   logits  - 2-D tensor [a, b];
    #   targets - 1-D int32 tensor of length a whose values are < b;
    #   weights - 1-D float32 tensor of length a (per-position weights).
    # Here a = batch_size * seq_length and all weights are 1.
    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length], dtype=tf.float32)])
    # NOTE(review): in the flattened source it is ambiguous whether this
    # statement was live or commented out; it is kept live because `loss`
    # is otherwise unused and it only adds an attribute.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
Nbofbatch = trainsize // batchsize

# Learning rate 0.001 * 0.1 ** learningi[0]; the exponent is fed at run time.
learningi = tf.placeholder(tf.float32, shape=[1])
learning_rate = 0.001 * (0.1 ** learningi[0])

# Input sequences and their targets.
x = tf.placeholder(tf.float32, shape=[None, time_steps, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_classes])

# Turn the [batch_size, n_steps, n_input] tensor into a list of "time_steps"
# tensors of shape [batch_size, n_input].
input = tf.unstack(x, num=time_steps, axis=1)

# Network definition: one LSTM layer with dropout on input, output and state.
lstm_layer = rnn.DropoutWrapper(
    rnn.BasicLSTMCell(num_units, forget_bias=1),
    input_keep_prob=0.95,
    output_keep_prob=0.95,
    state_keep_prob=0.95,
)
outputs, _ = rnn.static_rnn(lstm_layer, input, dtype=tf.float32)

# Two tanh dense layers on top of the last step's output; the first one is
# L2-regularized.
outputs = tf.contrib.layers.fully_connected(
    outputs[-1],
    16,
    weights_regularizer=tf.contrib.layers.l2_regularizer(0.1),
    activation_fn=tf.nn.tanh,
)
prediction = tf.contrib.layers.fully_connected(
    outputs, 1, activation_fn=tf.nn.tanh)

# Mean-squared-error objective minimized with Adam.
loss = tf.losses.mean_squared_error(y, prediction)
opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
def inference(inputs,
              batch_size,
              num_steps,
              vocab_size,
              embedding_size,
              hidden_size,
              keep_prob,
              num_layers,
              num_classes,
              is_training,
              use_lstm=True,
              use_bidirectional_rnn=True):
    """Build an (optionally bidirectional) multi-layer RNN classifier graph.

    Args:
        inputs: integer id tensor fed to the embedding lookup.
        batch_size: number of sequences per batch (used to build the
            per-sequence length list).
        num_steps: sequence length assigned to every sequence in the batch.
        vocab_size: number of rows of the embedding matrix.
        embedding_size: number of columns of the embedding matrix.
        hidden_size: number of units of each recurrent layer.
        keep_prob: keep probability; dropout is only applied when
            `is_training` is true and `keep_prob < 1`.
        num_layers: number of stacked recurrent layers per direction.
        num_classes: size of the output layer.
        is_training: enables dropout.
        use_lstm: use `LSTMCell` layers when true, `GRUCell` otherwise.
        use_bidirectional_rnn: run a forward and a backward stack.

    Returns:
        `(logits, final_state)`; `logits` has `num_classes` columns.
    """
    # NOTE(review): the extent of this device block was lost in the flattened
    # source; it is assumed to cover the embedding variable and its lookup.
    with tf.device('/cpu:0'):
        embedding = tf.get_variable(
            'embedding', [vocab_size, embedding_size],
            initializer=tf.random_uniform_initializer(),
            dtype=tf.float32)
        inputs_embedding = tf.nn.embedding_lookup(embedding, inputs)

    apply_dropout = is_training and keep_prob < 1
    if apply_dropout:
        inputs_embedding = tf.nn.dropout(inputs_embedding, keep_prob)
    # static_rnn expects a list with one tensor per time step.
    inputs_embedding = tf.unstack(inputs_embedding, axis=1)

    initializer = tf.random_uniform_initializer(-0.1, 0.1)

    def build_layer():
        # One fresh recurrent layer, with output dropout when training.
        if use_lstm:
            layer = rnn.LSTMCell(num_units=hidden_size,
                                 initializer=initializer,
                                 forget_bias=1.0)
        else:
            layer = rnn.GRUCell(num_units=hidden_size)
        if apply_dropout:
            layer = rnn.DropoutWrapper(layer, output_keep_prob=keep_prob)
        return layer

    def build_stack():
        # BUG FIX: the stacks used to repeat ONE cell object `num_layers`
        # times, so all layers shared the same weights (or failed with a
        # variable-reuse error, depending on the TensorFlow version).
        return rnn.MultiRNNCell([build_layer() for _ in range(num_layers)])

    sequence_length = [num_steps] * batch_size
    forward_rnn_cell = build_stack()

    if use_bidirectional_rnn:
        bi_flag = 2
        backward_rnn_cell = build_stack()
        outputs, forward_final_state, backward_final_state = \
            rnn.static_bidirectional_rnn(
                forward_rnn_cell,
                backward_rnn_cell,
                inputs_embedding,
                dtype=tf.float32,
                sequence_length=sequence_length)
        # NOTE(review): indices [0] and [1] select entries of each stack's
        # final state, so this presumably needs num_layers >= 2 and LSTM
        # state tuples for axis=2 to be valid. Left as in the original
        # because callers may depend on the returned structure; confirm.
        final_state = (
            tf.concat([forward_final_state[0], backward_final_state[0]],
                      axis=2),
            tf.concat([forward_final_state[1], backward_final_state[1]],
                      axis=2))
    else:
        bi_flag = 1
        outputs, final_state = rnn.static_rnn(
            forward_rnn_cell,
            inputs_embedding,
            dtype=tf.float32,
            sequence_length=sequence_length)

    # Flatten all time steps into rows of width bi_flag * hidden_size so one
    # output layer is shared across steps.
    output = tf.reshape(tf.concat(outputs, axis=1),
                        shape=[-1, bi_flag * hidden_size])
    weights = tf.get_variable('weights',
                              [bi_flag * hidden_size, num_classes],
                              dtype=tf.float32)
    biases = tf.get_variable('biases', [num_classes], dtype=tf.float32)
    logits = tf.matmul(output, weights) + biases
    return logits, final_state
def get_rnn_cell(self):
    """Return a layer-normalized LSTM cell with input and output dropout.

    The cell has `self.hidden_dim` units; `self.dropout_keep_prob_t` is used
    as keep probability on both the input and the output side.
    """
    keep_prob = self.dropout_keep_prob_t
    base_cell = LayerNormBasicLSTMCell(self.hidden_dim)
    return rnn.DropoutWrapper(
        base_cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
def build_model(self):
    """Build the session-based GRU graph (inputs, network, loss, train op).

    Sets `self.X`, `self.Y`, `self.state`, `self.global_step`,
    `self.final_state`, `self.output`, `self.yhat`, `self.logit` and
    `self.sfw`. When `self.is_training` is true it additionally sets
    `self.cost`, `self.lr` and `self.train_op`; otherwise it returns right
    after the prediction ops are built.
    """
    self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
    self.Y = tf.placeholder(tf.int32, [self.batch_size], name='output')
    # One state placeholder per layer.
    self.state = [
        tf.placeholder(tf.float32, [self.batch_size, self.rnn_size],
                       name='rnn_state') for _ in range(self.layers)
    ]
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('gru_layer'):
        # Fall back to a scale derived from the layer sizes when no sigma
        # is configured.
        sigma = self.sigma if self.sigma != 0 else np.sqrt(
            6.0 / (self.n_risks + self.rnn_size))
        if self.init_as_normal:
            initializer = tf.random_normal_initializer(mean=0, stddev=sigma)
        else:
            initializer = tf.random_uniform_initializer(minval=-sigma,
                                                        maxval=sigma)
        embedding = tf.get_variable('embedding',
                                    [self.n_risks, self.rnn_size],
                                    initializer=initializer)
        # Output (feed-forward) layer parameters.
        softmax_W = tf.get_variable('softmax_w',
                                    [self.n_risks, self.rnn_size],
                                    initializer=initializer)
        softmax_b = tf.get_variable(
            'softmax_b', [self.n_risks],
            initializer=tf.constant_initializer(0.0))

        # Stacked GRU layers with output dropout.
        # BUG FIX: the stack used to be `[drop_cell] * self.layers`, i.e.
        # the same cell object in every layer, so all layers shared one set
        # of weights. Each layer is now its own object. NOTE: this changes
        # the variable set of multi-layer models, so older multi-layer
        # checkpoints may not load.
        layer_cells = [
            rnn_cell.DropoutWrapper(
                rnn_cell.GRUCell(self.rnn_size, activation=self.hidden_act),
                output_keep_prob=self.dropout_p_hidden)
            for _ in range(self.layers)
        ]
        stacked_cell = rnn_cell.MultiRNNCell(layer_cells)

        # Look up the embedding vectors for X and run a single step.
        inputs = tf.nn.embedding_lookup(embedding, self.X)
        output, state = stacked_cell(inputs, tuple(self.state))
        self.final_state = state
        self.output = output

    if self.is_training:
        # Use other examples of the minibatch as negative samples, pushing
        # the outputs for the non-target items down.
        sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
        sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)
        logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
        self.yhat = self.final_activation(logits)
        self.cost = self.loss_function(self.yhat)
    else:
        logits = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
        self.yhat = self.final_activation(logits)

    # NOTE(review): the original nesting of these two statements was lost in
    # the flattened source; setting them in both modes is a superset of
    # either reading.
    self.logit = tf.matmul(output, softmax_W, transpose_b=True)
    self.sfw = softmax_W

    if not self.is_training:
        return

    # Exponentially decayed learning rate, floored at 1e-5.
    self.lr = tf.maximum(
        1e-5,
        tf.train.exponential_decay(self.learning_rate,
                                   self.global_step,
                                   self.decay_steps,
                                   self.decay,
                                   staircase=True))

    # Try different optimizers.
    # optimizer = tf.train.AdagradOptimizer(self.lr)
    optimizer = tf.train.AdamOptimizer(self.lr)
    # optimizer = tf.train.AdadeltaOptimizer(self.lr)
    # optimizer = tf.train.RMSPropOptimizer(self.lr)

    tvars = tf.trainable_variables()
    # Clip each gradient's norm to `grad_cap` to keep gradients from
    # exploding.
    gvs = optimizer.compute_gradients(self.cost, tvars)
    if self.grad_cap > 0:
        # Variables the cost does not depend on get a `None` gradient;
        # those pairs are passed through unclipped (apply_gradients skips
        # them) instead of crashing in clip_by_norm.
        capped_gvs = [
            (tf.clip_by_norm(grad, self.grad_cap), var)
            if grad is not None else (grad, var)
            for grad, var in gvs
        ]
    else:
        capped_gvs = gvs
    self.train_op = optimizer.apply_gradients(capped_gvs,
                                              global_step=self.global_step)
def DropoutGRUCell(size=DEFAULT_GRU_INTERNAL_SIZE,
                   pkeep=DEFAULT_GRU_DROPOUT_KEEP_RATE):
    """Return a GRU cell of `size` units with dropout applied to its input.

    Args:
        size: number of units of the GRU cell.
        pkeep: input keep probability of the dropout wrapper.
    """
    return rnn.DropoutWrapper(rnn.GRUCell(size), input_keep_prob=pkeep)
def lstm_cell():
    """Return an `LSTMCell` wrapped in output dropout.

    `hidden_size` and `keep_prob` are taken from the enclosing scope; the
    cell inherits the `reuse` flag of the current variable scope.
    """
    reuse_flag = tf.get_variable_scope().reuse
    base = rnn.LSTMCell(hidden_size, reuse=reuse_flag)
    wrapped = rnn.DropoutWrapper(base, output_keep_prob=keep_prob)
    return wrapped
# Output projection parameters, randomly initialized.
out_weights = tf.Variable(tf.random_normal(shape=[num_units, n_classes]))
out_bias = tf.Variable(tf.random_normal(shape=[n_classes]))

# Placeholders: input sequences, one-hot labels, dropout keep probability.
x = tf.placeholder(tf.float32, shape=[None, time_steps, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_classes])
keep_prob = tf.placeholder(dtype=tf.float32)

# Turn the [batch_size, n_steps, n_input] tensor into a list of "time_steps"
# tensors of shape [batch_size, n_input].
input = tf.unstack(x, num=time_steps, axis=1)

# Network: a single LSTM layer with dropout on its output.
lstm_layer = rnn.BasicLSTMCell(num_units, forget_bias=1)
dw_cell = rnn.DropoutWrapper(lstm_layer, output_keep_prob=keep_prob)
outputs, _ = rnn.static_rnn(dw_cell, input, dtype=tf.float32)

# Project the last output from [batch_size, num_units] to
# [batch_size, n_classes].
prediction = tf.matmul(outputs[-1], out_weights) + out_bias

# Mean softmax cross-entropy loss, minimized with Adam.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Evaluation: fraction of examples whose arg-max prediction matches the label.
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
def make_cell():
    """Return a `BasicLSTMCell` of `self.num_hidden` units with output dropout.

    `self` is taken from the enclosing scope; `self.keep_prob` is the output
    keep probability.
    """
    return rnn.DropoutWrapper(
        rnn.BasicLSTMCell(self.num_hidden),
        output_keep_prob=self.keep_prob,
    )
def __init__(self,
             sequence_length,
             num_classes,
             vocab_size,
             lstm_hidden_size,
             fc_hidden_size,
             embedding_size,
             embedding_type,
             filter_sizes,
             num_filters,
             l2_reg_lambda=0.0,
             pretrained_embedding=None):
    """Build the Bi-LSTM + CNN graph that classifies a pair of sentences.

    Args:
        sequence_length: number of token ids per input sentence.
        num_classes: number of output classes.
        vocab_size: vocabulary size of the randomly initialized embedding.
        lstm_hidden_size: units of each LSTM direction.
        fc_hidden_size: units of the fully connected layer.
        embedding_size: embedding dimension of the random embedding.
        embedding_type: with a pretrained embedding, 0 keeps it constant and
            1 makes it trainable.
        filter_sizes: iterable of convolution filter heights.
        num_filters: number of filters per filter size.
        l2_reg_lambda: weight of the L2 penalty over all trainable variables.
        pretrained_embedding: optional pretrained embedding matrix; when
            None a uniform random embedding is created.

    Raises:
        ValueError: if a pretrained embedding is given with an
            `embedding_type` other than 0 or 1.
    """
    # Placeholders for input, output, dropout_prob and training_tag
    self.input_x_front = tf.placeholder(tf.int32, [None, sequence_length],
                                        name="input_x_front")
    self.input_x_behind = tf.placeholder(tf.int32, [None, sequence_length],
                                         name="input_x_behind")
    self.input_y = tf.placeholder(tf.float32, [None, num_classes],
                                  name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")
    self.is_training = tf.placeholder(tf.bool, name="is_training")

    self.global_step = tf.Variable(0, trainable=False, name="Global_Step")

    def _linear(input_, output_size, scope="SimpleLinear"):
        """
        Linear map: output = input_ * W + b

        Args:
            input_: a 2D Tensor, batch x n.
            output_size: int, second dimension of W.
            scope: VariableScope for the created subgraph; defaults to
                "SimpleLinear".
        Returns:
            A 2D Tensor with shape [batch x output_size].
        Raises:
            ValueError: if the input is not 2D or its second dimension is
                unknown.
        """
        shape = input_.get_shape().as_list()
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: {0}".format(str(shape)))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: {0}".format(
                    str(shape)))
        input_size = shape[1]

        # Now the computation.
        with tf.variable_scope(scope):
            W = tf.get_variable("W", [input_size, output_size],
                                dtype=input_.dtype)
            b = tf.get_variable("b", [output_size], dtype=input_.dtype)

        return tf.nn.xw_plus_b(input_, W, b)

    def _highway_layer(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu):
        """
        Highway Network (cf. http://arxiv.org/abs/1505.00387).
        t = sigmoid(Wy + b)
        z = t * g(Wy + b) + (1 - t) * y
        where g is nonlinearity, t is transform gate, and (1 - t) is carry
        gate.
        """
        # With num_layers == 0 the input is returned unchanged (previously
        # `output` was unbound in that case).
        output = input_
        for idx in range(num_layers):
            g = f(_linear(input_, size,
                          scope=("highway_lin_{0}".format(idx))))
            t = tf.sigmoid(
                _linear(input_, size,
                        scope=("highway_gate_{0}".format(idx))) + bias)
            output = t * g + (1. - t) * input_
            input_ = output

        return output

    # Embedding Layer
    with tf.device("/cpu:0"), tf.name_scope("embedding"):
        # Use random generated the word vector by default
        # Can also be obtained through our own word vectors trained by our corpus
        if pretrained_embedding is None:
            self.embedding = tf.Variable(tf.random_uniform(
                [vocab_size, embedding_size],
                minval=-1.0,
                maxval=1.0,
                dtype=tf.float32),
                                         trainable=True,
                                         name="embedding")
        elif embedding_type == 0:
            self.embedding = tf.constant(pretrained_embedding,
                                         dtype=tf.float32,
                                         name="embedding")
        elif embedding_type == 1:
            self.embedding = tf.Variable(pretrained_embedding,
                                         trainable=True,
                                         dtype=tf.float32,
                                         name="embedding")
        else:
            # Previously this case silently left `self.embedding` unset and
            # failed later with an AttributeError.
            raise ValueError(
                "embedding_type must be 0 or 1, got {0}".format(
                    embedding_type))
        self.embedded_sentence_front = tf.nn.embedding_lookup(
            self.embedding, self.input_x_front)
        self.embedded_sentence_behind = tf.nn.embedding_lookup(
            self.embedding, self.input_x_behind)

    # Add dropout
    with tf.name_scope("dropout-input"):
        self.embedded_sentence_front_drop = tf.nn.dropout(
            self.embedded_sentence_front, self.dropout_keep_prob)
        self.embedded_sentence_behind_drop = tf.nn.dropout(
            self.embedded_sentence_behind, self.dropout_keep_prob)

    # Bi-LSTM Layer
    with tf.name_scope("Bi-lstm"):
        # `self.dropout_keep_prob` is a placeholder set above, so the former
        # `is not None` guard was always true; the wrappers are applied
        # unconditionally.
        lstm_fw_cell = rnn.DropoutWrapper(
            rnn.BasicLSTMCell(lstm_hidden_size),  # forward direction cell
            output_keep_prob=self.dropout_keep_prob)
        lstm_bw_cell = rnn.DropoutWrapper(
            rnn.BasicLSTMCell(lstm_hidden_size),  # backward direction cell
            output_keep_prob=self.dropout_keep_prob)

        # Creates a dynamic bidirectional recurrent neural network
        # shape of `outputs`: tuple -> (outputs_fw, outputs_bw)
        # shape of `outputs_fw`: [batch_size, sequence_length, lstm_hidden_size]
        # shape of `state`: tuple -> (outputs_state_fw, output_state_bw)
        # shape of `outputs_state_fw`: tuple -> (c, h) c: memory cell; h: hidden state
        outputs_front, _ = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell,
            lstm_bw_cell,
            self.embedded_sentence_front_drop,
            dtype=tf.float32)
        outputs_behind, _ = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell,
            lstm_bw_cell,
            self.embedded_sentence_behind_drop,
            dtype=tf.float32)

        # Concat output
        # shape of `lstm_concat`: [batch_size, sequence_length, lstm_hidden_size * 2]
        self.lstm_concat_front = tf.concat(outputs_front, axis=2)
        self.lstm_concat_behind = tf.concat(outputs_behind, axis=2)

        # shape of `lstm_out`: [batch_size, sequence_length, lstm_hidden_size * 2, 1]
        self.lstm_out_front = tf.expand_dims(self.lstm_concat_front,
                                             axis=-1)
        self.lstm_out_behind = tf.expand_dims(self.lstm_concat_behind,
                                              axis=-1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs_front = []
    pooled_outputs_behind = []

    for filter_size in filter_sizes:
        with tf.name_scope("conv-filter{0}".format(filter_size)):
            # Convolution Layer (the same filter is applied to both sentences)
            filter_shape = [
                filter_size, lstm_hidden_size * 2, 1, num_filters
            ]
            W = tf.Variable(tf.truncated_normal(shape=filter_shape,
                                                stddev=0.1,
                                                dtype=tf.float32),
                            name="W")
            b = tf.Variable(tf.constant(value=0.1,
                                        shape=[num_filters],
                                        dtype=tf.float32),
                            name="b")
            conv_front = tf.nn.conv2d(self.lstm_out_front,
                                      W,
                                      strides=[1, 1, 1, 1],
                                      padding="VALID",
                                      name="conv")
            conv_behind = tf.nn.conv2d(self.lstm_out_behind,
                                       W,
                                       strides=[1, 1, 1, 1],
                                       padding="VALID",
                                       name="conv_behind")

            conv_front = tf.nn.bias_add(conv_front, b)
            conv_behind = tf.nn.bias_add(conv_behind, b)

            # Batch Normalization Layer
            conv_bn_front = batch_norm(conv_front,
                                       is_training=self.is_training,
                                       trainable=True,
                                       updates_collections=None)
            conv_bn_behind = batch_norm(conv_behind,
                                        is_training=self.is_training,
                                        trainable=True,
                                        updates_collections=None)

            # Apply nonlinearity
            conv_out_front = tf.nn.relu(conv_bn_front, name="relu_front")
            conv_out_behind = tf.nn.relu(conv_bn_behind, name="relu_behind")

        with tf.name_scope("pool-filter{0}".format(filter_size)):
            # Average- and max-pooling over the whole convolved sequence
            pool_window = [1, sequence_length - filter_size + 1, 1, 1]
            avg_pooled_front = tf.nn.avg_pool(conv_out_front,
                                              ksize=pool_window,
                                              strides=[1, 1, 1, 1],
                                              padding="VALID",
                                              name="pool")
            max_pooled_front = tf.nn.max_pool(conv_out_front,
                                              ksize=pool_window,
                                              strides=[1, 1, 1, 1],
                                              padding="VALID",
                                              name="pool")
            avg_pooled_behind = tf.nn.avg_pool(conv_out_behind,
                                               ksize=pool_window,
                                               strides=[1, 1, 1, 1],
                                               padding="VALID",
                                               name="pool")
            max_pooled_behind = tf.nn.max_pool(conv_out_behind,
                                               ksize=pool_window,
                                               strides=[1, 1, 1, 1],
                                               padding="VALID",
                                               name="pool")

            # shape of `pooled_combine`: [batch_size, 1, 1, num_filters * 2]
            pooled_combine_front = tf.concat(
                [avg_pooled_front, max_pooled_front], axis=3)
            pooled_combine_behind = tf.concat(
                [avg_pooled_behind, max_pooled_behind], axis=3)

        pooled_outputs_front.append(pooled_combine_front)
        pooled_outputs_behind.append(pooled_combine_behind)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)

    # shape of `pool`: [batch_size, 1, 1, num_filters_total * 2]
    self.pool_front = tf.concat(pooled_outputs_front, axis=3)
    self.pool_behind = tf.concat(pooled_outputs_behind, axis=3)
    self.pool_flat_front = tf.reshape(self.pool_front,
                                      shape=[-1, num_filters_total * 2])
    self.pool_flat_behind = tf.reshape(self.pool_behind,
                                       shape=[-1, num_filters_total * 2])

    # shape of `pool_flat_combine`: [batch_size, num_filters_total * 2 * 2]
    self.pool_flat_combine = tf.concat(
        [self.pool_flat_front, self.pool_flat_behind], axis=1)

    # Fully Connected Layer
    with tf.name_scope("fc"):
        W = tf.Variable(tf.truncated_normal(
            shape=[num_filters_total * 2 * 2, fc_hidden_size],
            stddev=0.1,
            dtype=tf.float32),
                        name="W")
        b = tf.Variable(tf.constant(value=0.1,
                                    shape=[fc_hidden_size],
                                    dtype=tf.float32),
                        name="b")
        self.fc = tf.nn.xw_plus_b(self.pool_flat_combine, W, b)

        # Batch Normalization Layer
        self.fc_bn = batch_norm(self.fc,
                                is_training=self.is_training,
                                trainable=True,
                                updates_collections=None)

        # Apply nonlinearity
        self.fc_out = tf.nn.relu(self.fc_bn, name="relu")

    # Highway Layer
    with tf.name_scope("highway"):
        self.highway = _highway_layer(self.fc_out,
                                      self.fc_out.get_shape()[1],
                                      num_layers=1,
                                      bias=0)

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(self.highway, self.dropout_keep_prob)

    # Final scores and predictions
    with tf.name_scope("output"):
        W = tf.Variable(tf.truncated_normal(
            shape=[fc_hidden_size, num_classes],
            stddev=0.1,
            dtype=tf.float32),
                        name="W")
        b = tf.Variable(tf.constant(value=0.1,
                                    shape=[num_classes],
                                    dtype=tf.float32),
                        name="b")
        self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logits")
        self.softmax_scores = tf.nn.softmax(self.logits,
                                            name="softmax_scores")
        self.predictions = tf.argmax(self.logits, 1, name="predictions")
        self.topKPreds = tf.nn.top_k(self.softmax_scores,
                                     k=1,
                                     sorted=True,
                                     name="topKPreds")

    # Calculate mean cross-entropy loss, L2 loss
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.input_y, logits=self.logits)
        losses = tf.reduce_mean(losses, name="softmax_losses")
        l2_losses = tf.add_n([
            tf.nn.l2_loss(tf.cast(v, tf.float32))
            for v in tf.trainable_variables()
        ],
                             name="l2_losses") * l2_reg_lambda
        self.loss = tf.add(losses, l2_losses, name="loss")

    # Gold class ids, shared by the metrics below.
    gold_labels = tf.argmax(self.input_y, 1)

    # Accuracy
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, gold_labels)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                       name="accuracy")

    # TODO: Reconsider the metrics calculation
    # Number of correct predictions
    with tf.name_scope("num_correct"):
        correct = tf.equal(self.predictions, gold_labels)
        self.num_correct = tf.reduce_sum(tf.cast(correct, "float"),
                                         name="num_correct")

    # NOTE(review): tf.metrics.false_positives / false_negatives return a
    # (value, update_op) pair; casting the pair and summing it adds both
    # entries together. Left unchanged because callers may depend on the
    # current values -- confirm the intended metric before relying on
    # fp / fn / recall / precision / F1.
    # Calculate Fp
    with tf.name_scope("fp"):
        fp = tf.metrics.false_positives(labels=gold_labels,
                                        predictions=self.predictions)
        self.fp = tf.reduce_sum(tf.cast(fp, "float"), name="fp")

    # Calculate Fn
    with tf.name_scope("fn"):
        fn = tf.metrics.false_negatives(labels=gold_labels,
                                        predictions=self.predictions)
        self.fn = tf.reduce_sum(tf.cast(fn, "float"), name="fn")

    # Calculate Recall
    with tf.name_scope("recall"):
        self.recall = self.num_correct / (self.num_correct + self.fn)

    # Calculate Precision
    with tf.name_scope("precision"):
        self.precision = self.num_correct / (self.num_correct + self.fp)

    # Calculate F1
    with tf.name_scope("F1"):
        self.F1 = (2 * self.precision *
                   self.recall) / (self.precision + self.recall)

    # Calculate AUC
    with tf.name_scope("AUC"):
        # BUG FIX: tf.metrics.auc takes (labels, predictions); the scores
        # and the gold labels used to be passed in swapped order.
        self.AUC = tf.metrics.auc(labels=self.input_y,
                                  predictions=self.softmax_scores,
                                  name="AUC")
#'cnnoutscale': tf.Variable(weights['cnnoutscale']), #'featbeta': tf.Variable(tf.zeros([4096])), #'featscale': tf.Variable(tf.ones([4096])), #'gbeta': tf.Variable(tf.zeros([1000])), #'gscale': tf.Variable(tf.ones([1000])) } # question-embedding #embed_ques_W = tf.Variable(tf.random_uniform([vocabulary_size, input_embedding_size], -0.08, 0.08), name='embed_ques_W') # encoder: RNN body lstm_1 = rnn_cell.LSTMCell(rnn_size, input_embedding_size, use_peepholes=True, state_is_tuple=False) lstm_dropout_1 = rnn_cell.DropoutWrapper(lstm_1, output_keep_prob=1 - dropout_rate) lstm_2 = rnn_cell.LSTMCell(rnn_size, rnn_size, use_peepholes=True, state_is_tuple=False) lstm_dropout_2 = rnn_cell.DropoutWrapper(lstm_2, output_keep_prob=1 - dropout_rate) stacked_lstm = rnn_cell.MultiRNNCell([lstm_dropout_1, lstm_dropout_2], state_is_tuple=False) image = tf.placeholder(tf.float32, [batch_size, 2048]) question = tf.placeholder(tf.int32, [batch_size, max_words_q]) answers_true = tf.placeholder(tf.int32, (batch_size, 1000)) noise = tf.placeholder(tf.float32, [batch_size, 4096]) #state = tf.zeros([batch_size, stacked_lstm.state_size])
def _ner_private(self, input_data, config, is_training):
    """Build the bidirectional-LSTM decoder that produces the NER logits.

    Args:
        input_data: tensor fed to the bidirectional RNN with
            ``time_major=False`` (presumably [batch, steps, features];
            TODO confirm against the caller).
        config: object providing ``ner_decoder_size``, ``keep_prob`` and
            ``num_ner_tags``.
        is_training: when truthy and ``config.keep_prob < 1``, output
            dropout is applied to both directions.

    Returns:
        A ``(logits, decoder_states)`` pair. ``logits`` has
        ``config.num_ner_tags`` columns; ``decoder_states`` is the state
        value returned by ``tf.nn.bidirectional_dynamic_rnn``.
    """
    print('Hello before encoder', input_data)
    ner_inputs = input_data

    # Each direction gets half of the decoder width. Floor division keeps
    # the unit count an int under Python 3 (true division would hand the
    # cell a float). NOTE(review): an odd ner_decoder_size would make the
    # concatenated width differ from ner_decoder_size -- confirm it is even.
    units_per_direction = config.ner_decoder_size // 2

    with tf.variable_scope("ner_decoder"):
        lstm_cell_fw = tf.compat.v1.nn.rnn_cell.LSTMCell(
            units_per_direction,
            reuse=tf.get_variable_scope().reuse,
            forget_bias=1.0)
        lstm_cell_bw = tf.compat.v1.nn.rnn_cell.LSTMCell(
            units_per_direction,
            reuse=tf.get_variable_scope().reuse,
            forget_bias=1.0)

        # Dropout is only applied while training.
        if is_training and config.keep_prob < 1:
            lstm_cell_fw = rnn.DropoutWrapper(
                lstm_cell_fw, output_keep_prob=config.keep_prob)
            lstm_cell_bw = rnn.DropoutWrapper(
                lstm_cell_bw, output_keep_prob=config.keep_prob)

        decoder_outputs, decoder_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell_fw,
            cell_bw=lstm_cell_bw,
            dtype=tf.float32,
            inputs=ner_inputs,
            time_major=False,
            scope="ner_rnn")

        # Join the forward and backward outputs along the feature axis.
        decoder_outputs = tf.concat(decoder_outputs, axis=2)

        # Flatten to one row per position before the projection.
        # NOTE(review): decoder_outputs is already a single tensor here, so
        # this second concat looks redundant -- confirm before removing.
        output = tf.reshape(tf.concat(decoder_outputs, 1),
                            [-1, config.ner_decoder_size])

        softmax_w = tf.get_variable(
            "softmax_w", [config.ner_decoder_size, config.num_ner_tags])
        softmax_b = tf.get_variable("softmax_b", [config.num_ner_tags])
        logits = tf.matmul(output, softmax_w) + softmax_b

    return logits, decoder_states
def __init__(self, args, training=True):
    """Build the character-level RNN language-model graph.

    Args:
        args: hyper-parameter namespace. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``input_keep_prob``, ``output_keep_prob`` and ``grad_clip``.
            When ``training`` is false, ``batch_size`` and ``seq_length``
            are overwritten with 1 on this object.
        training: when true, build the training graph (dropout, inputs
            taken from ``input_data``); otherwise each step's input is the
            embedding of the previous step's arg-max prediction.

    Raises:
        Exception: if ``args.model`` is not 'rnn', 'gru', 'lstm' or 'nas'.
    """
    self.args = args
    if not training:
        args.batch_size = 1
        args.seq_length = 1

    # Choose the cell type. 'rnn' must map to the concrete BasicRNNCell:
    # rnn.RNNCell is only the abstract interface and cannot be used as a
    # layer in the stack.
    if args.model == 'rnn':
        cell_fn = rnn.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn.LSTMCell
    elif args.model == 'nas':
        cell_fn = rnn.NASCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # Wrap the multi-layered rnn cell into one cell, with dropout while
    # training if either keep probability is below 1.
    cells = []
    for _ in range(args.num_layers):
        cell = cell_fn(args.rnn_size)
        if training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0):
            cell = rnn.DropoutWrapper(
                cell,
                input_keep_prob=args.input_keep_prob,
                output_keep_prob=args.output_keep_prob)
        cells.append(cell)

    self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    # Input/target data (int32 ids).
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Softmax output layer parameters.
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

    # Transform the input ids to embeddings.
    embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    # Dropout beta testing: double check which keep probability should
    # apply to the embedded inputs (output_keep_prob is used here).
    if training and args.output_keep_prob:
        inputs = tf.nn.dropout(inputs, args.output_keep_prob)

    # Unstack the input into a list with one tensor per time step.
    inputs = tf.split(inputs, args.seq_length, 1)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        """Map the previous cell output to the embedding of its arg-max symbol."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # The loop function is only used when not training.
    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if not training else None,
        scope='rnnlm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

    # Output layer.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Per-position loss, then averaged over batch and sequence length.
    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    with tf.name_scope('cost'):
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()

    # Gradients are clipped by global norm before being applied.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Instrument tensorboard.
    tf.summary.histogram('logits', self.logits)
    tf.summary.histogram('loss', loss)
    tf.summary.scalar('train_loss', self.cost)
def cell():
    """Return a GRU cell with ``self.num_unit`` units inside a DropoutWrapper.

    No keep probability is passed, so the wrapper uses its defaults.
    """
    gru = rnn.GRUCell(num_units=self.num_unit)
    return rnn.DropoutWrapper(gru)
def build_model(self):
    """Assemble the Bi-LSTM text-classification graph.

    Stages: embedding lookup -> bidirectional LSTM -> concatenation of
    both directions -> mean over the time axis -> linear projection ->
    softmax, followed by the loss and accuracy ops.
    """
    # 1. Embedding layer, pinned to the CPU.
    with tf.device('/cpu:0'), tf.name_scope('embedding'):
        xavier = tf.contrib.layers.xavier_initializer(uniform=True)
        self.embedding_matrix = tf.get_variable(
            name='embedding_matrix',
            shape=[self.vocabulary_size, self.embedding_dim],
            initializer=xavier,
            trainable=self.embedding_trainable)
        # Embeddings of the words in the sentence:
        # [None, sequence_length, embedding_dim]
        self.embedded_words = tf.nn.embedding_lookup(
            self.embedding_matrix, self.sentence)

    # 2. Bi-LSTM layer.
    with tf.name_scope('bilstm_layer'):
        forward_cell = rnn.BasicLSTMCell(num_units=self.hidden_size)
        backward_cell = rnn.BasicLSTMCell(num_units=self.hidden_size)
        if self.lstm_drop_out:
            forward_cell = rnn.DropoutWrapper(
                cell=forward_cell, output_keep_prob=self.dropout_keep_prob)
            backward_cell = rnn.DropoutWrapper(
                cell=backward_cell, output_keep_prob=self.dropout_keep_prob)

        # bidirectional_dynamic_rnn
        #   input:  [batch_size, sequence_length, embedding_dim]
        #   output: ((output_fw, output_bw), output_states), where each
        #           output_* is [batch_size, max_time, cell.output_size]
        rnn_result = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=forward_cell,
            cell_bw=backward_cell,
            inputs=self.embedded_words,
            dtype=tf.float32)
        (fw_output, bw_output), _ = rnn_result
        print("bidirectional_dynamic_rnn outputs: ",
              fw_output.get_shape(), bw_output.get_shape())

        # 3. Concatenate both directions on the feature axis:
        # [batch_size, sequence_length, hidden_size * 2]
        both_directions = tf.concat((fw_output, bw_output), axis=2)
        # Average over the time axis: [batch_size, hidden_size * 2].
        self.output_rnn_last = tf.reduce_mean(both_directions, axis=1)
        print('last cell output:', self.output_rnn_last.get_shape())

    # 4. Linear classifier and 5. softmax.
    with tf.name_scope('readout'):
        self.W_projection = tf.get_variable(
            name='linear_W_projection',
            shape=[self.hidden_size * 2, self.label_size],
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_projection = tf.get_variable(
            name='linear_b_projection', shape=[self.label_size])
        projected = tf.matmul(self.output_rnn_last, self.W_projection)
        self.logits = tf.add(projected, self.b_projection, name='logits')
        self.prediction_probs = tf.nn.softmax(self.logits)

    with tf.name_scope("loss"):
        # L2 penalty over the projection weights and bias, plus the
        # embedding matrix when it is trainable.
        penalty = tf.constant(0.0)
        if self.embedding_trainable:
            penalty += tf.nn.l2_loss(self.embedding_matrix)
        penalty += tf.nn.l2_loss(self.W_projection)
        penalty += tf.nn.l2_loss(self.b_projection)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.labels)
        self.loss = tf.reduce_mean(cross_entropy) + self.l2_reg_lambda * penalty

    with tf.name_scope("accuracy"):
        true_classes = tf.argmax(self.labels, 1)
        self.predictions = tf.argmax(self.logits, 1, name="predictions")
        hits = tf.cast(tf.equal(self.predictions, true_classes), "float")
        self.accuracy = tf.reduce_mean(hits, name="accuracy")
def __init__(self, args, training=True):
    """Build the RNN language-model graph.

    Args:
        args: hyper-parameter namespace. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``input_keep_prob``, ``output_keep_prob`` and ``grad_clip``.
            When ``training`` is false, ``batch_size`` and ``seq_length``
            are overwritten with 1 on this object.
        training: when true, build the training graph; otherwise each
            step's input is the embedding of the previous step's arg-max
            prediction.

    Raises:
        Exception: if ``args.model`` is not 'rnn', 'gru', 'lstm' or 'nas'.
    """
    self.args = args
    if not training:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = rnn.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn.BasicLSTMCell
    elif args.model == 'nas':
        cell_fn = rnn.NASCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # One cell per layer; dropout is added only while training and only
    # if at least one keep probability is below 1.
    cells = []
    for _ in range(args.num_layers):
        cell = cell_fn(args.rnn_size)
        if training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0):
            cell = rnn.DropoutWrapper(
                cell,
                input_keep_prob=args.input_keep_prob,
                output_keep_prob=args.output_keep_prob)
        cells.append(cell)

    self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

    embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    # Dropout beta testing: double check which keep probability should
    # apply to the embedded inputs (output_keep_prob is used here).
    if training and args.output_keep_prob:
        inputs = tf.nn.dropout(inputs, args.output_keep_prob)

    # Split into a list with one tensor per time step.
    inputs = tf.split(inputs, args.seq_length, 1)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        """Map the previous cell output to the embedding of its arg-max symbol."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if not training else None,
        scope='rnnlm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])

    # The cost is built once, inside the 'cost' name scope. An identical
    # unscoped copy used to be built first and immediately overwritten.
    with tf.name_scope('cost'):
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()

    # Gradients are clipped by global norm before being applied.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Instrument tensorboard.
    tf.summary.histogram('logits', self.logits)
    tf.summary.histogram('loss', loss)
    tf.summary.scalar('train_loss', self.cost)
def build_wp(self, id):
    """Create one weak-predictor network and register its pieces under ``id``.

    The network is a stack of LSTM cells with input dropout, followed by
    the fully connected layers listed in ``self.config.FC_LAYERS`` and a
    single-unit linear prediction layer. The cell, state placeholders,
    new state, predictions, loss terms, optimizer, variables, gradients,
    train op and saver are stored in the per-id dictionaries on ``self``.

    Args:
        id: key for the per-network dictionaries; also used as the
            variable-scope name.
    """
    print('creating %s weak predictor network...' % id)
    with self.graph.as_default():
        with tf.variable_scope(id) as scope:
            lstm_layers = [
                rnn.DropoutWrapper(
                    rnn.LSTMCell(self.config.LSTM_LAYER_SIZE),
                    input_keep_prob=self.keep_prob)
                for _ in range(self.config.LSTM_LAYERS)
            ]
            stacked_cell = rnn.MultiRNNCell(lstm_layers)
            self.rnn_cell[id] = stacked_cell

            # One (c, h) placeholder pair per layer for the initial state.
            layer_states = []
            for layer_size in stacked_cell.state_size:
                c_in = tf.placeholder(tf.float32, [None, layer_size.c])
                h_in = tf.placeholder(tf.float32, [None, layer_size.h])
                layer_states.append(tf.contrib.rnn.LSTMStateTuple(c_in, h_in))
            initial_state = tuple(layer_states)
            self.state[id] = initial_state

            # Batch size x time steps x features.
            output, final_state = tf.nn.dynamic_rnn(
                stacked_cell,
                self.input,
                initial_state=initial_state,
                sequence_length=self.seq_len)
            self.new_state[id] = final_state

            # Fully connected ReLU layers, each followed by dropout.
            for layer_idx, num_units in enumerate(self.config.FC_LAYERS):
                with tf.name_scope('fc_layer_%d' % layer_idx):
                    output = tf.contrib.layers.fully_connected(
                        output,
                        num_units,
                        activation_fn=tf.nn.relu,
                        scope='dense_%d' % layer_idx)
                    output = tf.nn.dropout(output, self.keep_prob)

            # Final layer to make the prediction.
            with tf.name_scope('prediction_layer'):
                predicted = tf.contrib.layers.fully_connected(
                    output, 1, activation_fn=None)
                self.returns[id] = predicted

            # Masked sum of squared errors, normalised by the mask total.
            with tf.name_scope('loss'):
                error = predicted - tf.expand_dims(self.labels, 2)
                masked_sq = tf.multiply(tf.square(error),
                                        tf.expand_dims(self.mask, 2))
                total_sq_error = tf.reduce_sum(masked_sq)
                self.sse[id] = total_sq_error
                mean_cost = total_sq_error / tf.reduce_sum(self.mask)
                self.cost[id] = mean_cost

            adam = tf.train.AdamOptimizer()
            self.optimizer[id] = adam
            scope_vars = tf.trainable_variables(scope.name)
            self.vars[id] = scope_vars
            gradient_pairs = adam.compute_gradients(mean_cost, var_list=scope_vars)
            self.grads_and_vars[id] = gradient_pairs
            self.train[id] = adam.apply_gradients(gradient_pairs)
            self.saver[id] = tf.train.Saver(
                tf.trainable_variables(scope.name), max_to_keep=None)
def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
    """Build the attention-based encoder/decoder summarisation graph.

    Args:
        reversed_dict: vocabulary mapping; its length is the vocabulary
            size and it is passed to ``get_init_embedding`` when
            ``args.glove`` is set during training.
        article_max_len: width of the encoder input placeholder ``X``.
        summary_max_len: width of the decoder placeholders and the
            maximum number of beam-search decoding steps.
        args: hyper-parameters. Reads ``embedding_size``, ``num_hidden``,
            ``num_layers``, ``learning_rate``, ``beam_width`` and, when
            training, ``keep_prob`` and ``glove``.
        forward_only: when false, build the training graph (``logits``,
            ``logits_reshape``, ``loss``, ``update``); when true, build
            the beam-search inference graph (``prediction``).
    """
    self.vocabulary_size = len(reversed_dict)
    self.embedding_size = args.embedding_size
    self.num_hidden = args.num_hidden
    self.num_layers = args.num_layers
    self.learning_rate = args.learning_rate
    self.beam_width = args.beam_width
    # Dropout is disabled (keep probability 1.0) for inference.
    self.keep_prob = 1.0 if forward_only else args.keep_prob
    self.cell = tf.nn.rnn_cell.BasicLSTMCell

    with tf.variable_scope("decoder/projection"):
        self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

    self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
    self.X = tf.placeholder(tf.int32, [None, article_max_len])
    self.X_len = tf.placeholder(tf.int32, [None])
    self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
    self.decoder_len = tf.placeholder(tf.int32, [None])
    self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
    self.global_step = tf.Variable(0, trainable=False)

    with tf.name_scope("embedding"):
        if not forward_only and args.glove:
            init_embeddings = tf.constant(
                get_init_embedding(reversed_dict, self.embedding_size),
                dtype=tf.float32)
        else:
            init_embeddings = tf.random_uniform(
                [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
        self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
        # Both embedded inputs are transposed to time-major layout.
        self.encoder_emb_inp = tf.transpose(
            tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
        self.decoder_emb_inp = tf.transpose(
            tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

    with tf.name_scope("encoder"):
        fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
        bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
        # Pass the configured keep probability to the wrappers. Without
        # it the wrappers were built with no keep probability and
        # self.keep_prob was never used, so no dropout was applied.
        # NOTE(review): output dropout is an assumption -- confirm that
        # output (rather than input/state) dropout is what was intended.
        fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
                    for cell in fw_cells]
        bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
                    for cell in bw_cells]

        encoder_outputs, encoder_state_fw, encoder_state_bw = \
            tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
        self.encoder_output = tf.concat(encoder_outputs, 2)
        # The decoder's initial state joins the forward and backward
        # states at index 0 of the returned per-layer state tuples.
        encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
        encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
        self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

    with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
        decoder_cell = self.cell(self.num_hidden * 2)

        if not forward_only:
            # Training: decode from self.decoder_emb_inp with attention.
            attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                self.num_hidden * 2, attention_states,
                memory_sequence_length=self.X_len, normalize=True)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell, attention_mechanism,
                attention_layer_size=self.num_hidden * 2)
            initial_state = decoder_cell.zero_state(
                dtype=tf.float32, batch_size=self.batch_size)
            initial_state = initial_state.clone(cell_state=self.encoder_state)
            helper = tf.contrib.seq2seq.TrainingHelper(
                self.decoder_emb_inp, self.decoder_len, time_major=True)
            decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True, scope=decoder_scope)
            self.decoder_output = outputs.rnn_output
            self.logits = tf.transpose(
                self.projection_layer(self.decoder_output), perm=[1, 0, 2])
            # Zero-pad the logits along axis 1 up to summary_max_len so
            # they line up with decoder_target.
            self.logits_reshape = tf.concat(
                [self.logits,
                 tf.zeros([self.batch_size,
                           summary_max_len - tf.shape(self.logits)[1],
                           self.vocabulary_size])],
                axis=1)
        else:
            # Inference: beam search over the tiled encoder outputs.
            tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                tf.transpose(self.encoder_output, perm=[1, 0, 2]),
                multiplier=self.beam_width)
            tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                self.encoder_state, multiplier=self.beam_width)
            tiled_seq_len = tf.contrib.seq2seq.tile_batch(
                self.X_len, multiplier=self.beam_width)
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                self.num_hidden * 2, tiled_encoder_output,
                memory_sequence_length=tiled_seq_len, normalize=True)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell, attention_mechanism,
                attention_layer_size=self.num_hidden * 2)
            initial_state = decoder_cell.zero_state(
                dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
            initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
            # NOTE(review): 2 and 3 are presumably the vocabulary ids of
            # the start and end tokens -- confirm against the dictionary.
            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=self.embeddings,
                start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                end_token=tf.constant(3),
                initial_state=initial_state,
                beam_width=self.beam_width,
                output_layer=self.projection_layer
            )
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True,
                maximum_iterations=summary_max_len, scope=decoder_scope)
            self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

    with tf.name_scope("loss"):
        if not forward_only:
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_reshape, labels=self.decoder_target)
            # Mask out positions beyond each target's length.
            weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
            self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.update = optimizer.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)
def __init__(self, config: BiLSTMConfig, is_training, input_ids, label_ids, seq_length, init_embedding=None):
    """Constructor for the (Bi)LSTM/GRU sequence-labelling model.

    Args:
        config: `BiLSTMConfig` instance. When ``is_training`` is false its
            ``embedding_dropout_prob`` and ``hidden_dropout_prob`` are
            overwritten with 0.0.
        is_training: bool. True for training model, false for eval model.
            Controls whether dropout will be applied.
        input_ids: int64 Tensor of shape [batch_size, seq_length, feat_size].
        label_ids: (optional) int64 Tensor of shape [batch_size, seq_length].
        seq_length: (optional) int64 Tensor of shape [batch_size].
        init_embedding: (optional) initial value for the embedding table;
            when None a truncated-normal initialised variable of shape
            [vocab_size, embedding_size] is created instead.

    Raises:
        ValueError: The config is invalid or one of the input tensor
            shapes is invalid (presumably raised by
            ``model_utils.get_shape_list`` -- confirm).
    """
    self.input_ids = input_ids
    self.label_ids = label_ids
    self.seq_length = seq_length
    self.is_training = is_training

    input_shape = model_utils.get_shape_list(input_ids, expected_rank=3)
    batch_size = input_shape[0]
    feat_size = input_shape[2]

    if not is_training:
        config.embedding_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0

    if init_embedding is None:
        embedding = tf.get_variable(
            shape=[config.vocab_size, config.embedding_size],
            dtype=tf.float32,
            name='embedding',
            initializer=tf.truncated_normal_initializer(stddev=0.02))
    else:
        embedding = tf.Variable(init_embedding, dtype=tf.float32, name='embedding')

    with tf.variable_scope('embedding'):
        x = tf.nn.embedding_lookup(embedding, input_ids)
        # Merge the per-position feature window into one vector.
        x = tf.reshape(x, [batch_size, -1, feat_size * config.embedding_size])
        x = model_utils.dropout(x, config.embedding_dropout_prob)

    def _make_cell():
        """Create one fresh recurrent cell with output dropout."""
        if config.rnn_cell == 'lstm':
            base_cell = tf.nn.rnn_cell.LSTMCell(config.hidden_size, name='basic_lstm_cell')
        else:
            base_cell = rnn.GRUCell(config.hidden_size)
        return rnn.DropoutWrapper(
            base_cell, output_keep_prob=1.0 - config.hidden_dropout_prob)

    with tf.variable_scope('rnn_cell'):
        # Every layer needs its own cell object: `[cell] * n` would put
        # the same instance in every layer of the stack.
        fw_multi_cell = rnn.MultiRNNCell(
            [_make_cell() for _ in range(config.num_hidden_layers)])
        bw_multi_cell = rnn.MultiRNNCell(
            [_make_cell() for _ in range(config.num_hidden_layers)])

    with tf.variable_scope('rnn'):
        if config.bi_direction:
            (forward_output, backword_output), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_multi_cell,
                cell_bw=bw_multi_cell,
                inputs=x,
                sequence_length=seq_length,
                dtype=tf.float32)
            output = tf.concat([forward_output, backword_output], axis=2)
        else:
            forward_output, _ = tf.nn.dynamic_rnn(
                cell=fw_multi_cell,
                inputs=x,
                sequence_length=seq_length,
                dtype=tf.float32)
            output = forward_output

    with tf.variable_scope('output'):
        logits = layers.fully_connected(
            inputs=output, num_outputs=config.num_classes, activation_fn=None)
        self.prediction = tf.argmax(logits, axis=-1)

    with tf.variable_scope('loss'):
        # Positions past each sequence's length get zero weight.
        weight = tf.sequence_mask(seq_length, dtype=tf.float32)
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=logits,
            targets=self.label_ids,
            weights=weight,
            average_across_timesteps=True,
            average_across_batch=True)
def op_cell():
    """Build a cell via ``cell()`` and wrap it with output dropout.

    The keep probability is taken from ``config.keep_prob``.
    """
    base_cell = cell()
    wrapped = rnn.DropoutWrapper(base_cell, output_keep_prob=config.keep_prob)
    return wrapped
Epoch = tf.placeholder("float") X = tf.placeholder("float", [None, num_steps, input_size]) y = tf.placeholder("float", [None, num_steps, output_size]) batch_size = tf.placeholder(tf.int32, []) # keep_prob = tf.placeholder(tf.float32) # 设置单层LSTM lstm_cell = rnn.BasicLSTMCell(num_units=hidden_size, forget_bias=1.0, state_is_tuple=True) # 设置dropout if mode == 'dropout' or mode == 'cost_limited': lstm_cell = rnn.DropoutWrapper(cell=lstm_cell, input_keep_prob=1.0, output_keep_prob=keep_prob) # Double-LSTM # mlstm_cell = rnn.MultiRNNCell([lstm_cell] * 2, state_is_tuple=True) # 初始化状态 init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32) # init_state = lstm_cell.zero_state(Batch_Size, dtype=tf.float32) # outputs, state = tf.nn.dynamic_rnn(mlstm_cell, inputs=X, initial_state=init_state, time_major=False) # h_state = outputs[:, -1, :] outputs = [] state = init_state with tf.variable_scope('RNN'):
def dec_cell(self, num):
    """Return a GRU cell named "dec_cell" with ``num`` units and state dropout.

    The state keep probability is taken from ``self.drop``.
    """
    gru = tfn.GRUCell(num, name="dec_cell")
    return tfn.DropoutWrapper(gru, state_keep_prob=self.drop)