def add_model(self, input_data):
    """Apply a zero-initialized linear layer followed by a softmax.

    Implements the core transformation of this model,
    y = softmax(x W + b), for a batch of inputs.  Both W and b are
    created (zero-initialized) under the 'softmax' variable scope.

    Args:
        input_data: Tensor of shape (batch_size, n_features).

    Returns:
        Tensor of shape (batch_size, n_classes) holding the predicted
        class distribution for each row of the batch.
    """
    n_features = self.config.n_features
    n_classes = self.config.n_classes
    with tf.variable_scope('softmax'):
        zero_init = tf.constant_initializer(0.0)
        weight_matrix = tf.get_variable(
            "weights", (n_features, n_classes), initializer=zero_init)
        bias_vector = tf.get_variable(
            "bias", (n_classes,), initializer=zero_init)
        predictions = softmax(tf.matmul(input_data, weight_matrix) + bias_vector)
    return predictions
def instance_norm(x, epsilon=1e-5):
    """Instance Normalization.

    Normalizes each sample independently over its spatial axes (1, 2),
    then applies a learned per-channel scale and offset.

    See Ulyanov, D., Vedaldi, A., & Lempitsky, V. (2016).
    Instance Normalization: The Missing Ingredient for Fast Stylization,
    Retrieved from http://arxiv.org/abs/1607.08022

    Parameters
    ----------
    x : tf.Tensor
        Input tensor; moments are taken over axes 1 and 2, and the
        scale/offset variables are sized by the last (channel) axis.
        # assumes NHWC layout — TODO confirm with callers
    epsilon : float, optional
        Small constant added to the variance for numerical stability.

    Returns
    -------
    tf.Tensor
        Normalized tensor with the same shape as ``x``.
    """
    with tf.variable_scope('instance_norm'):
        # keep_dims so mean/var broadcast back over the spatial axes.
        mean, var = tf.nn.moments(x, [1, 2], keep_dims=True)
        # Scale is initialized near 1, offset at 0 (identity transform).
        scale = tf.get_variable(
            name='scale',
            shape=[x.get_shape()[-1]],
            initializer=tf.truncated_normal_initializer(mean=1.0, stddev=0.02))
        offset = tf.get_variable(
            name='offset',
            shape=[x.get_shape()[-1]],
            initializer=tf.constant_initializer(0.0))
        out = scale * tf.div(x - mean, tf.sqrt(var + epsilon)) + offset
        return out
def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) with nunits cells.

    Runs one GRU step and also stashes the underlying weight/bias
    variables on ``self.W`` / ``self.b`` for external access.

    Args:
        inputs: Input tensor for the current time step.
        state: Previous hidden-state tensor.
        scope: Optional variable scope name (defaults to the class name).

    Returns:
        A ``(output, new_state)`` pair; for a GRU both are the same tensor.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            r, u = array_ops.split(1, 2, _linear([inputs, state],
                                                 2 * self._num_units, True, 1.0,
                                                 self.weights_init,
                                                 self.trainable, self.restore,
                                                 self.reuse))
            r, u = self._inner_activation(r), self._inner_activation(u)
        with tf.variable_scope("Candidate"):
            # Candidate activation uses the reset-gated previous state.
            c = self._activation(
                _linear([inputs, r * state], self._num_units, True, 0.,
                        self.weights_init, self.trainable, self.restore,
                        self.reuse))
        # Convex combination of old state and candidate via the update gate.
        new_h = u * state + (1 - u) * c
        self.W, self.b = list(), list()
        # Retrieve RNN Variables.  NOTE: this depends on _linear creating its
        # variables under a 'Linear' sub-scope with names 'Matrix'/'Bias'.
        with tf.variable_scope('Gates/Linear', reuse=True):
            self.W.append(tf.get_variable('Matrix'))
            self.b.append(tf.get_variable('Bias'))
        with tf.variable_scope('Candidate/Linear', reuse=True):
            self.W.append(tf.get_variable('Matrix'))
            self.b.append(tf.get_variable('Bias'))
    return new_h, new_h
def __init__(self, is_training, config):
    """Build a bag-of-embeddings binary classifier graph.

    Token embeddings are summed over the sequence axis and fed through a
    single softmax layer; training adds gradient clipping and SGD.

    Args:
        is_training: If False, the optimizer subgraph is not built.
        config: Object providing batch_size, hidden_size, max_len,
            vocab_size and max_grad_norm.
    """
    self.batch_size = batch_size = config.batch_size
    size = config.hidden_size
    self.max_len = max_len = config.max_len
    vocab_size = config.vocab_size
    self._input_data = tf.placeholder(tf.int32, [batch_size, config.max_len])
    self._targets = tf.placeholder(tf.int32, [batch_size])
    embedding = tf.get_variable("embedding", [vocab_size, size])
    inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    # Order-insensitive sentence representation: sum embeddings over time.
    output = tf.reduce_sum(inputs, 1)
    softmax_w = tf.get_variable("softmax_w", [size, 2])
    softmax_b = tf.get_variable("softmax_b", [2])
    logits = tf.matmul(output, softmax_w) + softmax_b
    prediction = tf.nn.softmax(logits)
    self._prediction = prediction
    # Positional-argument form of the old (pre-1.0) TF API.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self._targets)
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    if not is_training:
        return
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # NOTE(review): reads `self.lr` although only `self._lr` is assigned here;
    # presumably an `lr` property exists elsewhere on this class — verify.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def get_run_op():
    """Build a multi-GPU pipelined stack of matmul layers (benchmark graph).

    Splits the batch into FLAGS.num_cuts slices and chains them through
    FLAGS.num_layers fully-connected layers spread across GPUs, using
    control dependencies to enforce pipelined execution order.

    Returns:
        (init_op, train_op): variable initializer and the final no-op that
        depends on the last layer's matmuls.

    NOTE(review): Python 2 code (`xrange`); `slice_size` uses `/`, which
    would become float division under Python 3 — confirm intended runtime.
    The triple-quoted blocks below are dead code kept for reference.
    """
    # Create an optimizer that performs gradient descent.
    #opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    slice_size = FLAGS.batch_size / FLAGS.num_cuts
    print('Slice size:{}'.format(slice_size))
    data = None
    label = None
    last_fc = [tf.no_op()]
    with tf.device('/gpu:0'):
        data = tf.get_variable(
            name = 'data',
            shape=[slice_size, FLAGS.hidden_size],
            trainable=False)
        '''
        label = tf.get_variable(
            name = 'label',
            shape = [slice_size, FLAGS.hidden_size],
            trainable=False))
        with tf.variable_scope('fc_in'):
            weight_in = tf.zeros([1000, FLAGS.hidden_size])
            for k in xrange(FLAGS.num_cuts):
                with tf.control_dependencies([last_fc[-1]]):
                    last_fc.append(tf.matmul(data[k+1], weight_in))
        '''
        # Seed the pipeline: one copy of `data` per cut.
        for i in xrange(FLAGS.num_cuts):
            last_fc.append(data)
    for i in xrange(FLAGS.num_layers):
        # Map layers evenly onto the available GPUs.
        dev = '/gpu:%d' % (i * FLAGS.num_gpus / FLAGS.num_layers)
        with tf.device(dev), scopes.arg_scope([variables.variable], device=dev):
            tmp_fc = [tf.no_op()]
            with tf.variable_scope('fc%d' % i):
                w = tf.get_variable(
                    name='w',
                    shape=[FLAGS.hidden_size, FLAGS.hidden_size],
                    trainable=True)
                # Each cut waits on the previous cut of the same layer.
                for k in xrange(FLAGS.num_cuts):
                    with tf.control_dependencies([tmp_fc[-1]]):
                        tmp_fc.append(tf.matmul(last_fc[k+1], w))
            last_fc = tmp_fc
        if i == FLAGS.num_layers - 1:
            with tf.control_dependencies(last_fc):
                train_op = tf.no_op()
    '''
    with tf.device('/gpu:%d' % (FLAGS.num_gpus - 1)):
        tmp_fc = [tf.no_op()]
        with tf.variable_scope('fc_out'):
            weight_out = tf.zeros([FLAGS.hidden_size, 1000])
            for k in xrange(FLAGS.num_cuts):
                with tf.control_dependencies([tmp_fc[-1]]):
                    tmp_fc.append(tf.matmul(last_fc[k+1], weight_out))
        last_fc = tmp_fc
    loss = tf.nn_softmax_cross_entropy_with_logits(last_fc, labels, name='xentropy')
    grads = opt.compute_gradients(loss)
    apply_gradient_op = opt.apply_gradients(grads)
    train_op = tf.group(apply_gradient_op)
    '''
    init_op = tf.initialize_all_variables()
    return init_op, train_op
def add_model_vars(self):
    '''
    You model contains the following parameters:
        embedding: tensor(vocab_size, embed_size)
        W1: tensor(2* embed_size, embed_size)
        b1: tensor(1, embed_size)
        U: tensor(embed_size, output_size)
        bs: tensor(1, output_size)
    Hint: Add the tensorflow variables to the graph here and *reuse* them
    while building the compution graphs for composition and projection for
    each tree.
    Hint: Use a variable_scope "Composition" for the composition layer, and
    "Projection") for the linear transformations preceding the softmax.
    '''
    embed_size = self.config.embed_size
    vocab_size = len(self.vocab)
    output_size = self.config.label_size
    with tf.variable_scope('Composition'):
        ### YOUR CODE HERE
        embedding = tf.get_variable("embedding", shape=(vocab_size, embed_size))
        W1 = tf.get_variable("W1", shape=(2 * embed_size, embed_size))
        b1 = tf.get_variable("b1", shape=(1, embed_size))
        ### END YOUR CODE
    with tf.variable_scope('Projection'):
        ### YOUR CODE HERE
        U = tf.get_variable("U", shape=(embed_size, output_size))
        bs = tf.get_variable("bs", shape=(1, output_size))
        ### END YOUR CODE
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.lr)
    # dummy_total is a simple sum to ensure that the variables for the AdamOptimizer
    # are created for initialization and before restore the variables later.
    # It should never actually get executed.
    dummy_total = tf.constant(0.0)
    for v in tf.trainable_variables():
        dummy_total +=tf.reduce_sum(v)
    self.dummy_minimizer = self.optimizer.minimize(dummy_total)
def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM).

    Runs one LSTM step; also exposes the underlying weight matrix and
    bias on ``self.W`` / ``self.b``.

    Args:
        inputs: Input tensor for the current time step.
        state: Previous cell state — an LSTMStateTuple when
            ``self._state_is_tuple`` else a single concatenated tensor.
        scope: Optional variable scope name (defaults to the class name).

    Returns:
        ``(new_h, new_state)`` — the step output and the updated state in
        the same format as the incoming ``state``.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
        if self._state_is_tuple:
            c, h = state
        else:
            c, h = array_ops.split(1, 2, state)
        concat = _linear([inputs, h], 4 * self._num_units, True, 0.,
                         self.weights_init, self.trainable, self.restore,
                         self.reuse)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)
        # Forget-gate bias is added before the inner activation.
        new_c = (c * self._inner_activation(f + self._forget_bias) +
                 self._inner_activation(i) * self._activation(j))
        new_h = self._activation(new_c) * self._inner_activation(o)
        if self._state_is_tuple:
            new_state = _rnn_cell.LSTMStateTuple(new_c, new_h)
        else:
            new_state = array_ops.concat(1, [new_c, new_h])
        # Retrieve RNN Variables created by _linear under its 'Linear' scope.
        with tf.variable_scope('Linear', reuse=True):
            self.W = tf.get_variable('Matrix')
            self.b = tf.get_variable('Bias')
        return new_h, new_state
def instantiate_weights(self):
    """define all weights here

    Creates, under the 'embedding_projection' variable scope:
      - Embedding:        token embedding matrix [vocab_size, embed_size]
      - Embedding_label:  label embedding matrix [num_classes, embed_size]
      - W_projection/b_projection: final projection from the flattened
        sequence representation [sequence_length*d_model] to num_classes.
    """
    with tf.variable_scope("embedding_projection"):  # embedding matrix
        self.Embedding = tf.get_variable("Embedding", shape=[self.vocab_size, self.embed_size],initializer=self.initializer)  # [vocab_size,embed_size] tf.random_uniform([self.vocab_size, self.embed_size],-1.0,1.0)
        self.Embedding_label = tf.get_variable("Embedding_label", shape=[self.num_classes, self.embed_size],dtype=tf.float32)  #,initializer=self.initializer
        self.W_projection = tf.get_variable("W_projection", shape=[self.sequence_length*self.d_model, self.num_classes],initializer=self.initializer)  # [embed_size,label_size]
        self.b_projection = tf.get_variable("b_projection", shape=[self.num_classes])
def loss(self, logits, labels):
    """Adds loss ops to the computational graph.

    Hint: Use sparse_softmax_cross_entropy_with_logits
    Hint: Remember to add l2_loss (see tf.nn.l2_loss)
    Args:
        logits: tensor(num_nodes, output_size)
        labels: python list, len = num_nodes
    Returns:
        loss: tensor 0-D
    """
    loss = None
    # YOUR CODE HERE
    labels = tf.convert_to_tensor(labels, dtype=tf.int64)
    # Positional-argument form of the old (pre-1.0) TF API.
    softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
    l2 = self.config.l2
    # Fetch the weight matrices created in add_model_vars for L2 penalty;
    # biases are deliberately excluded from regularization.
    with tf.variable_scope('Composition', reuse=True):
        W1 = tf.get_variable("W1")
    with tf.variable_scope('Projection', reuse=True):
        U = tf.get_variable("U")
    l2_loss = tf.nn.l2_loss(W1) + tf.nn.l2_loss(U)
    l2_loss *= l2
    loss = tf.reduce_sum(softmax_loss) + l2_loss
    # END YOUR CODE
    return loss
def FullyConnected(x, out_dim, W_init=None, b_init=None, nl=tf.nn.relu, use_bias=True):
    """ Fully-Connected layer.

    :param x: a tensor to be flattened except the first dimension.
    :param out_dim: output dimension
    :param W_init: initializer for W. default to `uniform_unit_scaling_initializer`.
    :param b_init: initializer for b. default to zero initializer.
    :param nl: nonlinearity. default to `relu`.
    :param use_bias: whether to use bias. a boolean default to True
    :returns: a 2D tensor named 'output'
    """
    x = batch_flatten(x)
    in_dim = x.get_shape().as_list()[1]
    if W_init is None:
        #W_init = tf.truncated_normal_initializer(stddev=1 / math.sqrt(float(in_dim)))
        # factor=1.43 approximates the scaling used with ReLU activations.
        W_init = tf.uniform_unit_scaling_initializer(factor=1.43)
    if b_init is None:
        b_init = tf.constant_initializer()
    W = tf.get_variable('W', [in_dim, out_dim], initializer=W_init)
    if use_bias:
        b = tf.get_variable('b', [out_dim], initializer=b_init)
    prod = tf.nn.xw_plus_b(x, W, b) if use_bias else tf.matmul(x, W)
    return nl(prod, name='output')
def wide_model(numeric_input, category_input, vocabs):
    """Wide part of a wide&deep model: embed each categorical column and
    run a single linear layer over [numeric features ++ embeddings].

    Args:
        numeric_input: Tensor [batch, n_numeric] of numeric features.
        category_input: Tensor [batch, len(vocabs)] of integer category ids.
        vocabs: list of vocabulary sizes, one per categorical column.

    Returns:
        Tensor [batch, 1] of linear logits.
    """
    # Transpose so each row is one categorical column across the batch.
    transpose_category_input = tf.transpose(category_input)
    category_sum = None
    # Append embadding category to numeric_sum
    for i in range(0, len(vocabs)):
        # One 8-dim embedding table per categorical column.
        embedding = tf.get_variable("wideem" + str(i), [vocabs[i], 8],
                                    initializer=tf.contrib.layers.xavier_initializer()
                                    #partitioner=tf.fixed_size_partitioner(n_pss))
                                    #partitioner=tf.min_max_variable_partitioner(n_pss, 0, 2 << 10)
                                    )
        # Pick one column from category input
        col = tf.gather(transpose_category_input, [i])[0]
        #col = tf.nn.embedding_lookup(transpose_category_input, [i])[0]
        # Same as make [0001]*[w1,w2,w3,w4] = lookup w4
        #embedded_col = embedding_lookup(tf.identity(embedding), col)  # number * embedding output number
        embedded_col = embedding_ops.embedding_lookup_unique(embedding, col)
        if category_sum is None:
            category_sum = embedded_col
        else:
            category_sum = tf.concat([category_sum, embedded_col], 1)
    tf.set_random_seed(1)
    w = tf.get_variable("W", [numeric_input.shape[1] + category_sum.shape[1], 1],
                        initializer=tf.contrib.layers.xavier_initializer())
    wmodel_logits_sum = tf.matmul(tf.concat([numeric_input, category_sum], 1), w)
    return wmodel_logits_sum
def Linear(args, output_dim, bias=True, bias_init=0.0, scope=None):
    """Linear map over one or more 2-D tensors: concat(args) @ W (+ b).

    Args:
        args: a 2-D tensor or a list/tuple of 2-D tensors, each
            [batch, dim_i]; they are concatenated along axis 1.
        output_dim: size of the output's second dimension.
        bias: whether to add a learned bias term.
        bias_init: constant used to initialize the bias.
        scope: variable-scope name (defaults to "linear").

    Returns:
        A [batch, output_dim] tensor.

    Raises:
        ValueError: if any argument is not 2-D or has an unknown
            second dimension.
    """
    if not isinstance(args, (list, tuple)):
        args = [args]
    shapes = [a.get_shape().as_list() for a in args]
    input_dim = 0
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError("Linear is expecting 2d arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
        input_dim += shape[1]
    with tf.variable_scope(scope or "linear"):
        weight = tf.get_variable("W", (input_dim, output_dim))
        # Only concatenate when there is more than one input tensor.
        joined = args[0] if len(args) == 1 else tf.concat(1, args)
        result = tf.matmul(joined, weight)
        if not bias:
            return result
        bias_term = tf.get_variable("b", (output_dim,),
                                    initializer=tf.constant_initializer(bias_init))
    return result + bias_term
def _initialize_weights(self): all_weights = dict() # Encoding layers for i, n_hidden in enumerate(self.hidden_units): weight_name = 'encoder%d_W' % i bias_name = 'encoder%d_b' % i if i == 0: weight_shape = [self.n_input, n_hidden] else: weight_shape = [self.hidden_units[i-1], n_hidden] all_weights[weight_name] = tf.get_variable(weight_name, weight_shape, initializer=tf.contrib.layers.xavier_initializer()) all_weights[bias_name] = tf.get_variable(bias_name, [n_hidden], initializer=tf.constant_initializer(0.0)) # Decoding layers hidden_units_rev = self.hidden_units[::-1] for i, n_hidden in enumerate(hidden_units_rev): weight_name = 'decoder%d_W' % i bias_name = 'decoder%d_b' % i if i != len(hidden_units_rev) - 1: # not the last layer weight_shape = [n_hidden, hidden_units_rev[i+1]] else: weight_shape = [n_hidden, self.n_input] all_weights[weight_name] = tf.get_variable(weight_name, weight_shape, initializer=tf.contrib.layers.xavier_initializer()) all_weights[bias_name] = tf.get_variable(bias_name, [n_hidden], initializer=tf.constant_initializer(0.0)) return all_weights
def add_logits_op(self):
    """
    Adds logits to self

    Runs a bi-directional LSTM over the word embeddings, concatenates the
    forward/backward outputs, applies dropout, and projects each time step
    to per-tag scores.  Sets ``self.logits`` with shape
    [batch, ntime_steps, ntags].
    """
    with tf.variable_scope("bi-lstm"):
        lstm_fwrd_cell = tf.contrib.rnn.LSTMCell(self.hidden_size)
        lstm_back_cell = tf.contrib.rnn.LSTMCell(self.hidden_size)
        (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(lstm_fwrd_cell,
                                                                    lstm_back_cell,
                                                                    self.word_embeddings,
                                                                    sequence_length=self.sequence_lengths,
                                                                    dtype=tf.float32)
        # Concatenate the two directions along the feature axis.
        output = tf.concat([output_fw, output_bw], axis=-1)
        output = tf.nn.dropout(output, self.dropout)
    with tf.variable_scope("proj"):
        W = tf.get_variable("W", shape=[2*self.hidden_size, self.ntags],
                            dtype=tf.float32)
        b = tf.get_variable("b", shape=[self.ntags], dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        ntime_steps = tf.shape(output)[1]
        # Flatten to 2-D for the matmul, then restore the time dimension.
        output = tf.reshape(output, [-1, 2*self.hidden_size])
        pred = tf.matmul(output, W) + b
        self.logits = tf.reshape(pred, [-1, ntime_steps, self.ntags])
def init():
    """Build a toy Transformer Decoder with fixed hyper-parameters.

    Constructs the decoder-side embedding, placeholders and a dummy
    encoder output, then instantiates ``Decoder``.

    Returns:
        (decoder, Q, K_s): the Decoder instance, the query placeholder,
        and the decoder-input embedding used as keys.
    """
    d_model = 512
    d_k = 64
    d_v = 64
    sequence_length = 6  #5
    decoder_sent_length = 6
    h = 8
    batch_size = 4*32
    num_layer = 6
    # 2.set Q,K,V
    vocab_size = 1000
    embed_size = d_model
    initializer = tf.random_normal_initializer(stddev=0.1)
    Embedding = tf.get_variable("Embedding_d", shape=[vocab_size, embed_size],
                                initializer=initializer)
    decoder_input_x = tf.placeholder(tf.int32, [batch_size, decoder_sent_length],
                                     name="input_x")  # [4,10]
    print("1.decoder_input_x:", decoder_input_x)
    decoder_input_embedding = tf.nn.embedding_lookup(Embedding, decoder_input_x)  # [batch_size*sequence_length,embed_size]
    #Q = embedded_words  # [batch_size*sequence_length,embed_size]
    #K_s = embedded_words  # [batch_size*sequence_length,embed_size]
    #K_v_encoder = tf.placeholder(tf.float32, [batch_size,decoder_sent_length, d_model], name="input_x")  #sequence_length
    Q = tf.placeholder(tf.float32, [batch_size, sequence_length, d_model], name="input_x")
    K_s = decoder_input_embedding
    # Stand-in for the encoder output (a variable rather than a placeholder).
    K_v_encoder = tf.get_variable("v_variable",
                                  shape=[batch_size, decoder_sent_length, d_model],
                                  initializer=initializer)  #tf.float32,
    print("2.output from encoder:", K_v_encoder)
    mask = get_mask(decoder_sent_length)  #sequence_length
    decoder = Decoder(d_model, d_k, d_v, sequence_length, h, batch_size, Q, K_s,
                      K_v_encoder, decoder_sent_length, mask=mask, num_layer=num_layer)
    return decoder, Q, K_s
def __init__(self, session, np_matrix, rank, learning_rate=0.1):
    """Distributed non-negative matrix factorization via gradient descent.

    Factors ``np_matrix`` ~= W @ H with W on ps task 0 and H on ps task 1.
    Non-negativity is enforced softly: a penalty proportional to the total
    negative mass of W and H (|x| - x is 0 for x >= 0, 2|x| otherwise),
    scaled by the module-level constant INFINITY.
    # NOTE(review): INFINITY is defined elsewhere in this file — presumed
    # to be a large penalty weight; verify its value.

    Args:
        session: tf.Session used later for optimization steps.
        np_matrix: 2-D numpy array to factorize.
        rank: inner dimension of the factorization.
        learning_rate: SGD learning rate.
    """
    matrix = tf.constant(np_matrix, dtype=tf.float32)
    # Scale chosen so W@H initially has roughly the matrix's mean magnitude.
    scale = 2 * np.sqrt(np_matrix.mean() / rank)
    initializer = tf.random_uniform_initializer(maxval=scale)
    with tf.device('/job:ps/task:0'):
        self.matrix_W = tf.get_variable(
            "W", (np_matrix.shape[0], rank), initializer=initializer
        )
    with tf.device("/job:ps/task:1"):
        self.matrix_H = tf.get_variable(
            "H", (rank, np_matrix.shape[1]), initializer=initializer
        )
    matrix_WH = tf.matmul(self.matrix_W, self.matrix_H)
    # Squared Frobenius norm of the reconstruction error.
    f_norm = tf.reduce_sum(tf.pow(matrix - matrix_WH, 2))
    nn_w = tf.reduce_sum(tf.abs(self.matrix_W) - self.matrix_W)
    nn_h = tf.reduce_sum(tf.abs(self.matrix_H) - self.matrix_H)
    constraint = INFINITY * (nn_w + nn_h)
    self.loss = f_norm + constraint
    self.constraint = constraint
    self.session = session
    self.optimizer = tf.train.GradientDescentOptimizer(
        learning_rate
    ).minimize(self.loss)
def project_bilstm_layer(self, lstm_outputs, name=None):
    """
    hidden layer between lstm layer and logits
    :param lstm_outputs: [batch_size, num_steps, emb_size]
    :param name: optional variable-scope name (defaults to "project")
    :return: [batch_size, num_steps, num_tags]
    """
    with tf.variable_scope("project" if not name else name):
        # Hidden layer: 2*hidden_unit -> hidden_unit with tanh.
        with tf.variable_scope("hidden"):
            W = tf.get_variable("W", shape=[self.hidden_unit * 2, self.hidden_unit],
                                dtype=tf.float32, initializer=self.initializers.xavier_initializer())
            b = tf.get_variable("b", shape=[self.hidden_unit], dtype=tf.float32,
                                initializer=tf.zeros_initializer())
            # Flatten (batch, steps) into one axis for the matmul.
            output = tf.reshape(lstm_outputs, shape=[-1, self.hidden_unit * 2])
            hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
        # project to score of tags
        with tf.variable_scope("logits"):
            W = tf.get_variable("W", shape=[self.hidden_unit, self.num_labels],
                                dtype=tf.float32, initializer=self.initializers.xavier_initializer())
            b = tf.get_variable("b", shape=[self.num_labels], dtype=tf.float32,
                                initializer=tf.zeros_initializer())
            pred = tf.nn.xw_plus_b(hidden, W, b)
        return tf.reshape(pred, [-1, self.seq_length, self.num_labels])
def create_model(self):
    # Build an LSTM language model over a fixed (constant) embedding table.
    # Python 2 code (print statements, old tensorflow.models.rnn API).
    print "Setting up model",
    sys.stdout.flush()
    # placeholders for data + targets
    self._input_data = tf.placeholder(tf.int32, shape=(self.batch_size, self.num_steps))
    self._targets = tf.placeholder(tf.int32, [self.batch_size, self.num_steps])
    # set up lookup function; embeddings are frozen (tf.constant).
    self.embedding = tf.constant(self.saved_embedding, name="embedding")
    self.inputs = tf.nn.embedding_lookup(self.embedding, self._input_data)
    # lstm model
    self.lstm_cell = rnn_cell.BasicLSTMCell(self.lstm_size)
    self.cell = rnn_cell.MultiRNNCell([self.lstm_cell] * self.num_layers)
    self._initial_state = self.cell.zero_state(self.batch_size, tf.float32)
    from tensorflow.models.rnn import rnn
    # Unstack the time dimension into a list of per-step inputs.
    self.inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, self.num_steps, self.inputs)]
    self.outputs, self.states = rnn.rnn(self.cell, self.inputs, initial_state=self._initial_state)
    # Re-merge step outputs into one (batch*steps, lstm_size) matrix.
    self.output = tf.reshape(tf.concat(1, self.outputs), [-1, self.lstm_size])
    self.softmax_w = tf.get_variable("softmax_w", [self.lstm_size, self.vocab_size])
    self.softmax_b = tf.get_variable("softmax_b", [self.vocab_size])
    self.logits = tf.matmul(self.output, self.softmax_w) + self.softmax_b
    #print "self.states.get_shape():",self.states.get_shape()
    #print "tf.shape(self.states)",tf.shape(self.states)
    self._final_state = self.states
    self.saver = tf.train.Saver()
    #delete data to save memory if network is used for sampling only
    if self.only_for_sampling:
        del self.data
    print "done"
def affine_reuseable(x, shape):
    """ReLU affine layer using scope-reusable variables.

    Args:
        x: Input tensor of shape (batch, shape[0]).
        shape: Two-element sequence (in_dim, out_dim) for the weights.

    Returns:
        relu(x @ W + b), a tensor of shape (batch, shape[1]).
    """
    weights = tf.get_variable("W", shape,
                              initializer=tf.random_normal_initializer())
    biases = tf.get_variable("b", [shape[1]],
                             initializer=tf.constant_initializer(0.0))
    return tf.nn.relu(tf.matmul(x, weights) + biases)
def tf_baseline_conv2d():
    """Baseline TF char-conv graph used to cross-check CNTK via crosstalk.

    Builds a 1-D convolution over character embeddings, registers the
    filter/bias and the conv output with the crosstalk instance, then
    fetches/compares them inside a session.  Relies on module-level
    globals: batch_size, num_chars, char_emb_dim, filter_width,
    num_filters, cstk, input_data, workdir.
    """
    import tensorflow as tf
    import cntk.contrib.crosstalk.crosstalk_tensorflow as crtf
    ci = crtf.instance
    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, [batch_size, num_chars, char_emb_dim])
    filter_bank = tf.get_variable("char_filter_bank",
                                  shape=[filter_width, char_emb_dim, num_filters],
                                  dtype=tf.float32)
    bias = tf.get_variable("char_filter_biases", shape=[num_filters],
                           dtype=tf.float32)
    # conv1d then transpose so the output is NCHW-like for comparison.
    char_conv = tf.expand_dims(tf.transpose(tf.nn.conv1d(x, filter_bank, stride=1, padding='VALID') + bias, perm=[0,2,1]), -1)
    ci.watch(cstk.Conv2DArgs(W=crtf.find_trainable('char_filter_bank'),
                             b=crtf.find_trainable('char_filter_biases')),
             'conv2d', var_type=cstk.Conv2DAttr,
             attr=cstk.Conv2DAttr(filter_shape=(filter_width, char_emb_dim,),
                                  num_filters=num_filters))
    ci.watch(char_conv, 'conv2d_out', var_type=crtf.VariableType)  # note the output is transposed to NCHW
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        data = {x:input_data}
        ci.set_workdir(workdir)
        ci.set_data(sess, data)
        # Round-trip: save both watched values, reload the parameters,
        # and verify the conv output still matches.
        ci.fetch('conv2d_out', save=True)
        ci.fetch('conv2d', save=True)
        ci.assign('conv2d', load=True)
        assert ci.compare('conv2d_out')
        ci.reset()
        sess.close()
def _conv_layer(self, name, input_var, stride, in_channels, out_channels, options = {}):
    """Build one (de)convolution layer with optional batchnorm/dropout.

    Args:
        name: variable-scope name for the layer.
        input_var: NHWC input tensor.
        stride: spatial stride (also the upsampling factor when transposed).
        in_channels / out_channels: channel counts.
        options: dict of optional settings —
            'activation': 'relu' (default) | 'sigmoid' | 'none';
            'dropout': keep-probability complement (drop rate) or None;
            'padding': conv padding, default 'SAME';
            'batchnorm': apply batch norm (default True);
            'transpose': use conv2d_transpose (default False).

    Returns:
        The activated output tensor.

    Raises:
        Exception: if 'activation' is not one of the supported values.

    NOTE(review): a shared mutable default ``options={}`` is safe only
    because the dict is never mutated here.
    """
    activation = options.get('activation', 'relu')
    dropout = options.get('dropout', None)
    padding = options.get('padding', 'SAME')
    batchnorm = options.get('batchnorm', True)
    transpose = options.get('transpose', False)
    with tf.variable_scope(name) as scope:
        # conv2d_transpose stores filters as [h, w, out, in] — note the swap.
        if not transpose:
            filter_shape = [KERNEL_SIZE, KERNEL_SIZE, in_channels, out_channels]
        else:
            filter_shape = [KERNEL_SIZE, KERNEL_SIZE, out_channels, in_channels]
        # He-style init scaled by fan-in of the kernel.
        kernel = tf.get_variable(
            'weights',
            shape=filter_shape,
            initializer=tf.truncated_normal_initializer(stddev=math.sqrt(2.0 / KERNEL_SIZE / KERNEL_SIZE / in_channels)),
            dtype=tf.float32
        )
        biases = tf.get_variable(
            'biases',
            shape=[out_channels],
            initializer=tf.constant_initializer(0.0),
            dtype=tf.float32
        )
        if not transpose:
            output = tf.nn.bias_add(
                tf.nn.conv2d(
                    input_var,
                    kernel,
                    [1, stride, stride, 1],
                    padding=padding
                ),
                biases
            )
        else:
            # Output spatial size is computed dynamically from the input.
            batch = tf.shape(input_var)[0]
            side = tf.shape(input_var)[1]
            output = tf.nn.bias_add(
                tf.nn.conv2d_transpose(
                    input_var,
                    kernel,
                    [batch, side * stride, side * stride, out_channels],
                    [1, stride, stride, 1],
                    padding=padding
                ),
                biases
            )
        if batchnorm:
            output = tf.contrib.layers.batch_norm(output, center=True, scale=True, is_training=self.is_training, decay=0.99)
        if dropout is not None:
            output = tf.nn.dropout(output, keep_prob=1-dropout)
        if activation == 'relu':
            return tf.nn.relu(output, name=scope.name)
        elif activation == 'sigmoid':
            return tf.nn.sigmoid(output, name=scope.name)
        elif activation == 'none':
            return output
        else:
            raise Exception('invalid activation {} specified'.format(activation))
def testLSTMBasicToBlockPeeping(self):
    """Check the fused peephole LSTM against tf.nn.rnn_cell.LSTMCell.

    Runs the same random input through both implementations (with
    identical initializers) and asserts outputs, input gradients and
    weight gradients agree.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
        batch_size = 2
        input_size = 3
        cell_size = 4
        sequence_length = 5
        inputs = []
        for _ in range(sequence_length):
            inp = tf.convert_to_tensor(
                np.random.randn(batch_size, input_size), dtype=tf.float32)
            inputs.append(inp)
        initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212)
        # Reference: the non-fused, tuple-state peephole LSTM.
        with tf.variable_scope("basic", initializer=initializer):
            cell = tf.nn.rnn_cell.LSTMCell(cell_size,
                                           use_peepholes=True,
                                           state_is_tuple=True)
            outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32)
            sess.run([tf.initialize_all_variables()])
            basic_outputs = sess.run(outputs)
            basic_grads = sess.run(tf.gradients(outputs, inputs))
            basic_wgrads = sess.run(tf.gradients(outputs, tf.trainable_variables()))
        # Candidate: the fused block LSTM with explicit weight tensors.
        with tf.variable_scope("block", initializer=initializer):
            w = tf.get_variable("w",
                                shape=[input_size + cell_size, cell_size * 4],
                                dtype=tf.float32)
            # NOTE(review): `tf.zeros_initializer` passed uncalled — valid in
            # old TF where it was itself an initializer function; TF >= 1.0
            # would need `tf.zeros_initializer()`.
            b = tf.get_variable("b",
                                shape=[cell_size * 4],
                                dtype=tf.float32,
                                initializer=tf.zeros_initializer)
            wci = tf.get_variable("wci", shape=[cell_size], dtype=tf.float32)
            wcf = tf.get_variable("wcf", shape=[cell_size], dtype=tf.float32)
            wco = tf.get_variable("wco", shape=[cell_size], dtype=tf.float32)
            _, _, _, _, _, _, outputs = fused_lstm(
                tf.convert_to_tensor(sequence_length, dtype=tf.int64),
                inputs,
                w,
                b,
                wci=wci,
                wcf=wcf,
                wco=wco,
                cell_clip=0,
                use_peephole=True)
            sess.run([tf.initialize_all_variables()])
            block_outputs = sess.run(outputs)
            block_grads = sess.run(tf.gradients(outputs, inputs))
            block_wgrads = sess.run(tf.gradients(outputs, [w, b, wci, wcf, wco]))
        self.assertAllClose(basic_outputs, block_outputs)
        self.assertAllClose(basic_grads, block_grads)
        # Weight gradients use a looser tolerance.
        for basic, block in zip(basic_wgrads, block_wgrads):
            self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)
def _batch_norm(x, name, is_train):
    """ Apply a batch normalization layer.

    Normalizes with batch statistics while training and with the stored
    moving averages at inference time.

    Args:
        x: Input tensor; statistics are taken over all axes but the last.
        name: Variable-scope name for the layer's parameters.
        is_train: Scalar boolean tensor selecting train vs. inference mode.

    Returns:
        The batch-normalized tensor, same shape as ``x``.
    """
    with tf.variable_scope(name):
        inputs_shape = x.get_shape()
        axis = list(range(len(inputs_shape) - 1))
        param_shape = int(inputs_shape[-1])
        moving_mean = tf.get_variable('mean', [param_shape],
                                      initializer=tf.constant_initializer(0.0),
                                      trainable=False)
        moving_var = tf.get_variable('variance', [param_shape],
                                     initializer=tf.constant_initializer(1.0),
                                     trainable=False)
        beta = tf.get_variable('offset', [param_shape],
                               initializer=tf.constant_initializer(0.0))
        gamma = tf.get_variable('scale', [param_shape],
                                initializer=tf.constant_initializer(1.0))

        def mean_var_with_update():
            # Training branch: compute batch moments and update the
            # moving averages as a side effect.
            mean, var = tf.nn.moments(x, axis)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, 0.995)
            update_moving_var = moving_averages.assign_moving_average(
                moving_var, var, 0.995)
            # BUG FIX: the original appended the update ops to a *local*
            # `control_inputs` list, so nothing ever depended on them and
            # the moving averages were never updated inside tf.cond.  Tie
            # the updates directly to the branch's returned tensors.
            with tf.control_dependencies([update_moving_mean,
                                          update_moving_var]):
                return tf.identity(mean), tf.identity(var)

        def mean_var():
            # Inference branch: use the stored moving statistics.
            return tf.identity(moving_mean), tf.identity(moving_var)

        mean, var = tf.cond(is_train, mean_var_with_update, mean_var)
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-4)
    return normed
def add_projection(self, rnn_outputs):
    """Adds a projection layer.

    The projection layer transforms the hidden representation to a
    distribution over the vocabulary.

    Hint: Here are the dimensions of the variables you will need to
          create

          U:   (hidden_size, len(vocab))
          b_2: (len(vocab),)

    Args:
        rnn_outputs: List of length num_steps, each of whose elements should
                     be a tensor of shape (batch_size, embed_size).
    Returns:
        outputs: List of length num_steps, each a tensor of shape
                 (batch_size, len(vocab)
    """
    ### YOUR CODE HERE
    # BUG FIX: 'Projection Layer' contained a space, which is not a valid
    # TF scope name and raises ValueError; use an underscore instead.
    with tf.name_scope('Projection_Layer'):
        U = tf.get_variable('U', [self.config.hidden_size, len(self.vocab)])
        # BUG FIX: shape must be a sequence, not a bare int.
        b2 = tf.get_variable('b2', [len(self.vocab)])
        outputs = [tf.nn.softmax(tf.matmul(o, U) + b2) for o in rnn_outputs]
    ### END YOUR CODE
    return outputs
def logistic_regression(X, y, class_weight=None):
    """Creates logistic regression TensorFlow subgraph.

    Args:
        X: tensor or placeholder for input features,
           shape should be [batch_size, n_features].
        y: tensor or placeholder for target,
           shape should be [batch_size, n_classes].
        class_weight: tensor, [n_classes], where for each class
                      it has weight of the class. If not provided
                      will check if graph contains tensor `class_weight:0`.
                      If that is not provided either all ones are used.

    Returns:
        Predictions and loss tensors.
    """
    with tf.variable_scope('logistic_regression'):
        tf.histogram_summary('logistic_regression.X', X)
        tf.histogram_summary('logistic_regression.y', y)
        weights = tf.get_variable('weights', [X.get_shape()[1],
                                              y.get_shape()[-1]])
        bias = tf.get_variable('bias', [y.get_shape()[-1]])
        tf.histogram_summary('logistic_regression.weights', weights)
        tf.histogram_summary('logistic_regression.bias', bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph.
        # BUG FIX: `if not class_weight` invoked Tensor.__bool__, which
        # raises TypeError precisely when a class-weight tensor *is*
        # supplied; test for None explicitly instead.
        if class_weight is None:
            try:
                class_weight = tf.get_default_graph().get_tensor_by_name('class_weight:0')
            except KeyError:
                pass
        return softmax_classifier(X, y, weights, bias,
                                  class_weight=class_weight)
def testVarOpScopeReuseParam(self):
    """variable_op_scope: the `reuse` parameter and reuse_variables() give
    back the same variable names while name scopes get uniquified
    ("outer_1/...") on re-entry."""
    with self.test_session():
        with tf.variable_scope("outer") as outer:
            # First entry creates outer/tower/w and outer/default/w.
            with tf.variable_op_scope([], "tower", "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/tower/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer/tower/scope2/")
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/default/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer/default/scope2/")
        with tf.variable_scope(outer) as outer:
            # Re-entering: variable names unchanged, name scope becomes outer_1.
            with tf.variable_op_scope([], "tower", "default", reuse=True):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/tower/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer_1/tower/scope2/")
            outer.reuse_variables()
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/default/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer_1/default/scope2/")
def testVarOpScopeOuterScope(self):
    """variable_op_scope: passing an existing VariableScope object reuses
    its variable namespace while each re-entry uniquifies the name scope
    ("outer_1", "outer_2", ...)."""
    with self.test_session():
        with tf.variable_scope("outer") as outer:
            pass
        with tf.variable_op_scope([], outer, "default"):
            self.assertEqual(tf.get_variable("w", []).name, "outer/w:0")
            with tf.name_scope("scope2") as sc2:
                self.assertEqual(sc2, "outer_1/scope2/")
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/default/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer_1/default/scope2/")
        with tf.variable_op_scope([], outer, "default", reuse=True):
            self.assertEqual(tf.get_variable("w", []).name, "outer/w:0")
            with tf.name_scope("scope2") as sc2:
                self.assertEqual(sc2, "outer_2/scope2/")
            outer.reuse_variables()
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "outer/default/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "outer_2/default/scope2/")
def __init__(self, epsilon=1e-2, shape=()):
    """Running mean/std tracker backed by TF variables.

    Maintains a running sum, sum of squares and count (all float64,
    non-trainable) from which ``self.mean`` and ``self.std`` are derived;
    ``incfiltparams`` atomically adds new partial sums.

    Args:
        epsilon: initial value for the count and sum-of-squares — keeps
            the std well-defined before any data arrives.
        shape: shape of the tracked statistic.
    """
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape, initializer=tf.constant_initializer(0.0),
        name="runningsum", trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq", trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count", trainable=False)
    self.shape = shape
    self.mean = tf.to_float(self._sum / self._count)
    # Variance clipped below at 1e-2 to keep std away from zero.
    self.std = tf.sqrt(tf.maximum(
        tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
        1e-2))
    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    # U.function wraps the three assign_adds into one callable update.
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [],
        updates=[tf.assign_add(self._sum, newsum),
                 tf.assign_add(self._sumsq, newsumsq),
                 tf.assign_add(self._count, newcount)])
def testVarOpScope(self):
    """variable_op_scope: with a name, re-entry must raise on duplicate
    variables; with name=None the default name is uniquified
    ("default", "default_1", ...)."""
    with self.test_session():
        with tf.name_scope("scope1"):
            with tf.variable_op_scope([], "tower", "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "tower/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "scope1/tower/scope2/")
            with tf.variable_op_scope([], "tower", "default"):
                # Same explicit name without reuse: creating w again fails.
                with self.assertRaises(ValueError):
                    tf.get_variable("w", [])
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "scope1/tower_1/scope2/")
        with tf.name_scope("scope2"):
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "default/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "scope2/default/scope2/")
            with tf.variable_op_scope([], None, "default"):
                self.assertEqual(tf.get_variable("w", []).name,
                                 "default_1/w:0")
                with tf.name_scope("scope2") as sc2:
                    self.assertEqual(sc2, "scope2/default_1/scope2/")
def ternarize(x, thresh=0.05):
    """
    Implemented Trained Ternary Quantization:
    https://arxiv.org/abs/1612.01064

    Code modified from the authors' at:
    https://github.com/czhu95/ternarynet/blob/master/examples/Ternary-Net/ternary.py

    Quantizes ``x`` to {-w_n, 0, +w_p} where the positive/negative scales
    w_p, w_n are learned scalars and the threshold is a fixed fraction of
    max|x|.  Gradients pass straight through the sign (identity gradient).
    """
    shape = x.get_shape()
    # Threshold is relative to the largest magnitude; stop_gradient keeps
    # it out of the backward pass.
    thre_x = tf.stop_gradient(tf.reduce_max(tf.abs(x)) * thresh)
    w_p = tf.get_variable('Wp', initializer=1.0, dtype=tf.float32)
    w_n = tf.get_variable('Wn', initializer=1.0, dtype=tf.float32)
    tf.summary.scalar(w_p.op.name + '-summary', w_p)
    tf.summary.scalar(w_n.op.name + '-summary', w_n)
    mask = tf.ones(shape)
    # mask_np: w_p where x > t, w_n where x < -t, 1 elsewhere.
    mask_p = tf.where(x > thre_x, tf.ones(shape) * w_p, mask)
    mask_np = tf.where(x < -thre_x, tf.ones(shape) * w_n, mask_p)
    # mask_z: 0 inside the dead zone |x| < t, 1 outside.
    mask_z = tf.where((x < thre_x) & (x > - thre_x), tf.zeros(shape), mask)

    @tf.custom_gradient
    def _sign_mask(x):
        # Forward: sign(x) zeroed in the dead zone; backward: identity.
        return tf.sign(x) * mask_z, lambda dy: dy

    w = _sign_mask(x)
    w = w * mask_np
    tf.summary.histogram(w.name, w)
    return w
def zero_bias(shape, name=None):
    """Return a variable of the given shape initialized to all zeros.

    Typically used as the bias term of a linear layer.
    """
    zeros = tf.constant_initializer(0.0)
    return tf.get_variable(name, shape, initializer=zeros)
def weight_variable(scope, shape):
    """Create (or fetch) a Xavier-initialized weight tensor 'W' under `scope`."""
    with tf.variable_scope(scope):
        weights = tf.get_variable(
            'W', shape,
            initializer=tf.contrib.layers.xavier_initializer())
    return weights
def bias_variable(scope, shape):
    """Create (or fetch) a bias tensor 'b' under `scope`, initialized to 0.1."""
    with tf.variable_scope(scope):
        bias = tf.get_variable(
            'b', shape,
            initializer=tf.constant_initializer(0.1))
    return bias
def main(flag, load_existing_dump=False):
    """Entry point: train the dependency parser, or test from a checkpoint.

    Args:
        flag: Flags.TRAIN to train (then test); anything else restores the
            latest checkpoint and only evaluates.
        load_existing_dump: passed through to load_datasets; reuse a
            previously serialized dataset dump when True.
    """
    highlight_string("INITIALIZING")
    print "loading data.."

    dataset = load_datasets(load_existing_dump)  # load the datasets
    config = dataset.model_config  # training hyper-parameters

    print "word vocab Size: {}".format(len(dataset.word2idx))
    print "pos vocab Size: {}".format(len(dataset.pos2idx))
    print "dep vocab Size: {}".format(len(dataset.dep2idx))
    print "Training Size: {}".format(len(dataset.train_inputs[0]))
    print "valid data Size: {}".format(len(dataset.valid_data))
    print "test data Size: {}".format(len(dataset.test_data))
    print len(dataset.word2idx), len(dataset.word_embedding_matrix)
    print len(dataset.pos2idx), len(dataset.pos_embedding_matrix)
    print len(dataset.dep2idx), len(dataset.dep_embedding_matrix)

    # Make sure the model output directory exists before Saver writes to it.
    if not os.path.exists(os.path.join(DataConfig.data_dir_path, DataConfig.model_dir)):
        os.makedirs(os.path.join(DataConfig.data_dir_path, DataConfig.model_dir))

    with tf.Graph().as_default(), tf.Session() as sess:
        print "Building network...",
        start = time.time()
        with tf.variable_scope("model") as model_scope:
            # The neural-network parser model.
            model = ParserModel(config, dataset.word_embedding_matrix,
                                dataset.pos_embedding_matrix,
                                dataset.dep_embedding_matrix)
            saver = tf.train.Saver()  # for checkpointing
            """ model_scope.reuse_variables() -> no need to call tf.variable_scope(model_scope, reuse = True) again -> directly access variables & call functions inside this block itself. 
 -> ref: https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/variable_scope -> https://stackoverflow.com/questions/35919020/whats-the-difference-of-name-scope-and-a-variable-scope-in-tensorflow """
        print "took {:.2f} seconds\n".format(time.time() - start)

        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(os.path.join(DataConfig.data_dir_path, DataConfig.summary_dir, DataConfig.train_summ_dir), sess.graph)
        valid_writer = tf.summary.FileWriter(os.path.join(DataConfig.data_dir_path, DataConfig.summary_dir, DataConfig.test_summ_dir))

        if flag == Flags.TRAIN:  # training path
            # Variable initialization -> not needed for .restore()
            """ The variables to restore do not have to have been initialized, as restoring is itself a way to initialize variables. """
            sess.run(tf.global_variables_initializer())
            """ call 'assignment' after 'init' only, else 'assignment' will get reset by 'init' """
            # Copy the pretrained embeddings into the trainable matrices.
            sess.run(tf.assign(model.word_embedding_matrix, model.word_embeddings))
            sess.run(tf.assign(model.pos_embedding_matrix, model.pos_embeddings))
            sess.run(tf.assign(model.dep_embedding_matrix, model.dep_embeddings))

            highlight_string("TRAINING")
            model.print_trainable_varibles()  # print the layer definitions
            # Main training loop.
            model.fit(sess, saver, config, dataset, train_writer, valid_writer, merged)

            # Testing
            highlight_string("Testing")
            print "Restoring best found parameters on dev set"
            saver.restore(sess, os.path.join(DataConfig.data_dir_path, DataConfig.model_dir, DataConfig.model_name))
            # Predict dependency arcs with the restored model, then score.
            model.compute_dependencies(sess, dataset.test_data, dataset)
            test_UAS = model.get_UAS(dataset.test_data)  # unlabeled attachment score
            print "test UAS: {}".format(test_UAS * 100)

            train_writer.close()
            valid_writer.close()

            # visualize trained embeddings after complete training (not after each epoch)
            with tf.variable_scope(model_scope, reuse=True):
                pos_emb = tf.get_variable("feature_lookup/pos_embedding_matrix",
                                          [len(dataset.pos2idx.keys()), dataset.model_config.embedding_dim])
                visualize_sample_embeddings(sess, os.path.join(DataConfig.data_dir_path, DataConfig.model_dir),
                                            dataset.pos2idx.keys(), dataset.pos2idx, pos_emb)
            print "to Visualize Embeddings, run in terminal:"
            print "tensorboard --logdir=" + os.path.abspath(os.path.join(DataConfig.data_dir_path, DataConfig.model_dir))
        else:  # load existing checkpoint and only evaluate
            ckpt_path = tf.train.latest_checkpoint(os.path.join(DataConfig.data_dir_path, DataConfig.model_dir))
            if ckpt_path is not None:
                print "Found checkpoint! Restoring variables.."
                saver.restore(sess, ckpt_path)
                highlight_string("Testing")
                # Predict dependency arcs with the restored model, then score.
                model.compute_dependencies(sess, dataset.test_data, dataset)
                test_UAS = model.get_UAS(dataset.test_data)  # unlabeled attachment score
                print "test UAS: {}".format(test_UAS * 100)
                # model.run_valid_epoch(sess, dataset.valid_data, dataset)
                # valid_UAS = model.get_UAS(dataset.valid_data)
                # print "valid UAS: {}".format(valid_UAS * 100)
                highlight_string("Embedding Visualization")
                with tf.variable_scope(model_scope, reuse=True):
                    pos_emb = tf.get_variable("feature_lookup/pos_embedding_matrix",
                                              [len(dataset.pos2idx.keys()), dataset.model_config.embedding_dim])
                    visualize_sample_embeddings(sess, os.path.join(DataConfig.data_dir_path, DataConfig.model_dir),
                                                dataset.pos2idx.keys(), dataset.pos2idx, pos_emb)
                print "to Visualize Embeddings, run in terminal:"
                print "tensorboard --logdir=" + os.path.abspath(os.path.join(DataConfig.data_dir_path, DataConfig.model_dir))
            else:
                print "No checkpoint found!"
def conv_layers(self,input_x,name_scope,reuse_flag=False):
    """Main computation graph: 1. embedding --> 2. CONV-RELU-top-k pooling --> dropout.

    Args:
        input_x: int tensor of word ids, presumably [batch, sentence_length]
            — TODO confirm against caller.
        name_scope: prefix for the per-filter-size variable scopes.
        reuse_flag: reuse existing variables in those scopes when True.

    Returns:
        Dropout-regularized feature tensor of shape
        [batch, num_filters_total * 3] (see reshape note below).
    """
    # 1.=====>get embedding of words in the sentence
    embedded_words = tf.nn.embedding_lookup(self.Embedding,input_x)  # [None,sentence_length,embed_size]
    # Add a channel dim so the input meets tf.nn.conv2d's 4-D requirement.
    sentence_embeddings_expanded=tf.expand_dims(embedded_words,-1)  # [None,sentence_length,embed_size,1]

    # 2.=====>one convolution-pooling branch per filter size:
    #   a. create filter, b. conv2d, c. bias + ReLU, d. top-k pooling.
    pooled_outputs = []
    for i,filter_size in enumerate(self.filter_sizes):
        with tf.variable_scope(str(name_scope)+"convolution-pooling-%s" %filter_size,reuse=reuse_flag):
            # ====>a. filter: [height=filter_size, width=embed_size, in_ch=1, out_ch=num_filters]
            filter=tf.get_variable("filter-%s"%filter_size,[filter_size,self.embed_size,1,self.num_filters],initializer=self.initializer)
            # ====>b. VALID conv over the full embedding width:
            # output shape [batch, sequence_length - filter_size + 1, 1, num_filters]
            conv=tf.nn.conv2d(sentence_embeddings_expanded, filter, strides=[1,1,1,1], padding="VALID",name="conv")
            print("conv1.0:", conv)
            # NOTE: a batch-norm variant (self.batchnorm) was tried here and
            # is disabled; see version history for the experiment.
            # ====>c. bias + non-linearity
            b=tf.get_variable("b-%s"%filter_size,[self.num_filters])
            h=tf.nn.relu(tf.nn.bias_add(conv,b),"relu")  # [batch, seq_len - filter_size + 1, 1, num_filters]
            # Reshape so a (currently disabled) second conv layer could run
            # over the feature maps.
            h=tf.reshape(h,[-1,self.sequence_length-filter_size+1,self.num_filters,1])
            # NOTE: commented-out experiments lived here — a second and third
            # 1 x num_filters CONV-RELU layer (with residual add and optional
            # batch norm) and a plain max_pool over the full feature map.
            # They are disabled in favor of the top-k pooling below.
            # ====>d. top-k pooling over time instead of max pooling:
            h=tf.reshape(h,[-1,self.sequence_length - filter_size + 1,self.num_filters])  # [batch, seq_len - filter_size + 1, num_filters]
            h=tf.transpose(h, [0, 2, 1])  # [batch, num_filters, seq_len - filter_size + 1]
            h = tf.nn.top_k(h, k=self.top_k, name='top_k')[0]  # [batch, num_filters, top_k]
            h=tf.reshape(h,[-1,self.num_filters*self.top_k])  # [batch, num_filters * top_k]
            pooled_outputs.append(h)

    # 3.=====>concatenate the per-filter-size features along the feature axis.
    # e.g. x1,x2 of shape [3,3]: tf.concat([x1,x2],0) -> [6,3]; axis 1 -> [3,6]
    h_pool=tf.concat(pooled_outputs,1)  # [batch, sum over filter sizes of num_filters*top_k]
    # NOTE(review): this reshape assumes num_filters_total*3 equals the concat
    # width above (i.e. top_k/filter-size counts line up) — verify config.
    h_pool_flat=tf.reshape(h_pool,[-1,self.num_filters_total*3])
    print("h_pool_flat:",h_pool_flat)

    # 4.=====>dropout regularization.
    with tf.name_scope("dropout"):
        h=tf.nn.dropout(h_pool_flat,keep_prob=self.dropout_keep_prob)  # [None, num_filters_total*3]
    return h #,update_ema_conv1,update_ema_conv2
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    """Build the Tacotron inference graph (multi-speaker capable).

    Args:
        inputs: int tensor of character ids, [N, T_in].
        input_lengths: lengths of each input sequence, [N].
        num_speakers: number of speakers; 1 disables speaker conditioning.
        speaker_id: per-example speaker ids (used when num_speakers > 1).
        mel_targets: mel spectrogram targets (training only).
        linear_targets: linear spectrogram targets; their presence switches
            the decoder to teacher forcing (see is_training2).
        loss_coeff: per-example loss weights, stored for the loss graph.
        rnn_decoder_test_mode: disables training-mode behavior in the
            decoder prenet/CBHG when True.
        is_randomly_initialized: recorded on self for later use.
    """
    # NOTE(review): this becomes True whenever linear targets are fed — in
    # the original comments the author questions whether that is intended
    # during test.
    is_training2 = linear_targets is not None
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Character embeddings (hp.embedding_size, e.g. 256).
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation: pin row 0
            # (<PAD>) to zeros so it is constant and never trained — the
            # first row of the variable above is simply unused.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]),
                 char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(
            char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # e.g. 16
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(
                        stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(
                    speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                # DeepVoice style: speaker info enters via per-module
                # initial states instead of direct concatenation.
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size,
                        "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    # softsign: x / (abs(x) + 1)
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)
                    before_highway = deep_dense(speaker_embed,
                                                hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]
                # deepvoice does not use speaker_embed directly.
                speaker_embed = None
            elif hp.model_type == 'simple':
                # The simple model instead feeds speaker_embed into
                # DecoderPrenetWrapper / ConcatOutputAndAttentionWrapper
                # and concatenates it there.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unkown multi-speaker model type: {}".format(
                        hp.model_type))
        else:
            # Single-speaker: no speaker conditioning anywhere.
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # bidirectional GRU init state
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############
        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs,
            is_training,
            hp.enc_prenet_sizes,
            hp.dropout_prob,
            scope='prenet')  # enc_prenet_sizes e.g. [256, 128], dropout 0.5

        # CBHG encoder ==> (N, T_in, 2 * enc_rnn_size)
        encoder_outputs = cbhg(
            prenet_outputs,
            input_lengths,
            is_training,
            hp.enc_bank_size,
            hp.enc_bank_channel_size,
            hp.enc_maxpool_width,
            hp.enc_highway_depth,
            hp.enc_rnn_size,
            hp.enc_proj_sizes,
            hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############
        # For manual control of attention at synthesis time.
        self.is_manual_attention = tf.placeholder(
            tf.bool,
            shape=(),
            name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32,
            shape=[None, None, None],
            name="manual_alignments",
        )

        # Select the attention mechanism by hparam (attention_size e.g. 128).
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=False)
        elif hp.attention_type == 'bah_mon_norm':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unkown attention type: {}".format(
                hp.attention_type))

        # Wrap the attention RNN cell with the chosen mechanism. This is a
        # project-local AttentionWrapper that supports manual alignments.
        # output_attention=False and no attention_layer_size, so the
        # "attention" output is the raw context vector.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Decoder prenet (dec_prenet_sizes e.g. [256, 128]).
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        # Concatenate attention context vector and RNN cell output (plus
        # speaker_embed for the 'simple' model) into one output vector:
        # [ output(=cell_state) | attention (| speaker_embed) ].
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top); dec_rnn_size e.g. 256.
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):  # e.g. 2 residual GRU layers
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, dec_rnn_size]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrogram frames per RNN step.
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)

        # zero_state here already contains the attention initial state set
        # via the AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] is the AttentionWrapperState
            # (cell_state + attention + time + alignments + history);
            # its cell state was already seeded above. Seed the residual
            # GRU layers (indices 1..dec_layer_num) from the speaker embed.
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()

                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(
                            shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # Teacher forcing (inputs only used for batch-size inference).
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # e.g. max_iters=200

        # [N, T_out, M] — un-stack the r frames emitted per step.
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG: [N, T_out, post_rnn_size * 2]
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes,
                            hp.post_proj_width, scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            # 'simple' model: tile the speaker embedding over time and
            # concatenate it to the post-net features.
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        # Linear-spectrogram projection, [N, T_out, num_freq].
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

        # Grab alignments from the final decoder state. final_decoder_state
        # is a tuple (one entry per MultiRNNCell layer); index 0 holds the
        # AttentionWrapperState with the alignment history.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # [batch, encoder text length, decoder length]

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        else:
            log(' speaker embedding: None')
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def fc(input, num_output, name='fc'):
    """Fully connected layer: input @ W + b, with variables under scope `name`.

    W ~ N(0, 0.02); b initialized to zero. Returns a [batch, num_output] tensor.
    """
    with tf.variable_scope(name):
        num_input = input.get_shape()[1]
        weight_init = tf.random_normal_initializer(0.0, 0.02)
        W = tf.get_variable('w', [num_input, num_output], tf.float32, weight_init)
        b = tf.get_variable('b', [num_output],
                            initializer=tf.constant_initializer(0.0))
        return tf.matmul(input, W) + b
def weight(shape, name=None):
    """Return a weight variable of `shape`, drawn from N(0, 0.1)."""
    normal = tf.random_normal_initializer(0.0, 0.1)
    return tf.get_variable(name, shape, initializer=normal)
sparse = False p = 0.1 # sparsity parameter beta = 0.1 # sparsity regularization strength # setting up the graph: data = tf.placeholder( dtype=tf.float32, shape=(None, input_dim)) noiseless = tf.placeholder( dtype=tf.float32, shape=(None, input_dim)) # JUST FOR TRAINING # def get_noise(data): # noise = np.random.randn(*data.shape)*0.2 # return noise a = data for l in range(1,len(network_structure)): w = tf.get_variable(name="w%s" % l, initializer=tf.random_normal(shape=(network_structure[l-1], network_structure[l]), stddev=1./np.sqrt(network_structure[l-1]))) b = tf.get_variable(name="b%s" % l, shape=(1, network_structure[l]), initializer=tf.zeros_initializer) z = tf.matmul(a, w) + b a = tf.nn.sigmoid(z, name="a%s" % l) # last activation has name "a%s" % (len(network_structure)-1) # if l == coding_layer_index: # coding_layer_activations = a ################################ # Defining the loss operation: # ################################ loss_reconstruction = tf.losses.mean_squared_error(labels=noiseless, predictions=a) loss_sparsity = 0 if sparse: # getting the coding layer (the smallest hidden layer):
def positive_bias(shape, name=None):
    """Return a bias variable of `shape`, initialized to the constant 0.1.

    A small positive bias is a common default for ReLU layers.
    """
    const_point_one = tf.constant_initializer(0.1)
    return tf.get_variable(name, shape, initializer=const_point_one)
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      extras=None):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.
    extras: mutable config object. Must provide `max_distance`; this function
      creates `entity_pos_table_key` / `entity_pos_table_val` embedding tables
      on it, which are consumed by attention_layer via the same object.
      NOTE(review): extras=None will crash at the attribute assignments below
      — callers apparently always pass it; confirm.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError(
        "The width of the input tensor (%d) != hidden size (%d)" %
        (input_width, hidden_size))

  # Relative entity-position embedding tables, one row per signed distance
  # in [-max_distance, +max_distance]; shared across all layers via `extras`.
  extras.entity_pos_table_key = tf.get_variable(
      name='entity_pos_embeddings_key',
      shape=[extras.max_distance * 2 + 1, hidden_size],
      initializer=create_initializer(0.5))
  extras.entity_pos_table_val = tf.get_variable(
      name='entity_pos_embeddings_val',
      shape=[extras.max_distance * 2 + 1, hidden_size],
      initializer=create_initializer(0.5))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=
              attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length,
              extras=extras)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
# ---- Two-conv-layer CNN + softmax classifier for MNIST (28x28 grayscale) ----
X = tf.placeholder(tf.float32, [None, 784])
X_img = tf.reshape(X, [-1, 28, 28, 1])  # flat pixels back to NHWC images
Y = tf.placeholder(tf.float32, [None, 10])  # one-hot labels

# Conv block 1: 3x3, 32 filters -> ReLU -> 2x2 max-pool => [-1, 14, 14, 32]
W1 = tf.Variable(tf.random_normal([3,3,1,32], stddev=0.01))
L1 = tf.nn.conv2d(X_img, W1, strides=[1,1,1,1], padding='SAME')
L1 = tf.nn.relu(L1)
L1 = tf.nn.max_pool(L1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')

# Conv block 2: 3x3, 64 filters -> ReLU -> 2x2 max-pool => [-1, 7, 7, 64]
W2 = tf.Variable(tf.random_normal([3,3,32,64], stddev=0.01))
L2 = tf.nn.conv2d(L1, W2, strides=[1,1,1,1], padding='SAME')
L2 = tf.nn.relu(L2)
L2 = tf.nn.max_pool(L2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
L2 = tf.reshape(L2, [-1,7*7*64])  # flatten for the dense layer

# Final dense layer (Xavier-initialized) producing 10 class logits.
W3 = tf.get_variable('W3', shape=[7*7*64, 10], initializer = tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.random_normal([10]))
hypothesis = tf.matmul(L2, W3) + b

# Softmax cross-entropy loss on the logits, optimized with Adam.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print('Learning started. It takes sometime.')
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        # NOTE(review): this chunk ends here — the sess.run of the optimizer
        # on (batch_xs, batch_ys) presumably follows outside this view.
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, num_samples=512, forward_only=False):
    """Build a bucketed attention seq2seq model (legacy TF 0.x API).

    Args:
        source_vocab_size: size of the encoder vocabulary.
        target_vocab_size: size of the decoder vocabulary.
        buckets: list of (encoder_length, decoder_length) pairs; the last
            bucket must be the largest.
        size: number of units per layer (also used as the embedding size).
        max_gradient_norm: gradients are clipped to this global norm.
        batch_size: batch size used during training.
        learning_rate: initial learning rate; stored in a variable so it can
            be decayed by running `learning_rate_decay_op`.
        learning_rate_decay_factor: multiplicative decay applied by that op.
        num_samples: sample count for sampled softmax; values <= 0 or
            >= target_vocab_size fall back to full softmax.
        forward_only: if True, build only the forward pass (no update ops).
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            # labels must be a column vector for sampled_softmax_loss.
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                              num_samples,
                                              self.target_vocab_size)
        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = tf.nn.rnn_cell.GRUCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.nn.seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs: one placeholder per time step, sized for the
    # largest bucket.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) + output_projection[1]
                    for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model,
    # one clipped-gradient update op per bucket.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def _create_variables(self):
    """Create the embedding variables of the model.

    Trainable user-side embeddings and the item embedding are drawn from a
    truncated normal (mean 0, stddev 0.01); the item map is a frozen
    variable initialized from the precomputed `self.item_factors`.
    """
    def _trunc_normal_var(shape, name):
        # Shared initializer for all trainable embedding tables.
        init = tf.truncated_normal(shape=shape, mean=0.0, stddev=0.01)
        return tf.Variable(init, name=name, dtype=tf.float32)

    with tf.name_scope('embedding'):
        map_width = self.n_factors * self.embedding
        self.user_map_embedding = _trunc_normal_var([self.n_users, map_width], 'user_map')
        self.user_embedding = _trunc_normal_var([self.n_users, self.embedding], 'user_emb')
        # Non-trainable: initialized directly from the flattened item factors.
        flat_item_factors = tf.reshape(self.item_factors, [-1, map_width])
        self.item_map_embedding = tf.get_variable(name='item_map',
                                                  dtype=tf.float32,
                                                  initializer=flat_item_factors,
                                                  trainable=False)
        self.item_embedding = _trunc_normal_var([self.n_items, self.embedding], 'item_emb')
def __init__(self, is_training, batch_size):
    """Bidirectional-context word prediction model: two independent LSTMs
    encode the preceding ('Pre') and following ('Fol') context, their
    last-timestep outputs are concatenated and projected to vocab logits.

    :param is_training: is or not training, True/False
    :param batch_size: the size of one batch
    :param num_steps: the length of one lstm
    """
    # Network hyper-parameters.
    self.learning_rate = tf.Variable(float(LEARNING_RATE), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * LEARNING_RATE_DECAY_FACTOR)
    # NOTE(review): plain Python ints, not TF variables — the running-average
    # ops below bake the value in at graph-build time; confirm intended.
    self.global_step = 0
    self.global_epoch = 0
    self.batch_size = batch_size
    # Input layers, shaped batch_size x num_steps (time dimension is dynamic).
    self.pre_input = tf.placeholder(tf.int32, [batch_size,None])
    self.pre_input_seq_length = tf.placeholder(tf.int32, [batch_size,])
    self.fol_input = tf.placeholder(tf.int32, [batch_size,None])
    self.fol_input_seq_length = tf.placeholder(tf.int32, [batch_size,])
    # Expected output: one target word id per batch element.
    self.targets = tf.placeholder(tf.int32, [batch_size,])

    embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])  # embedding matrix

    # Preceding-context LSTM.
    with tf.variable_scope('Pre') as scope:
        pre_cell = tf.contrib.rnn.BasicLSTMCell(num_units=PRE_CONTEXT_HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True)
        if is_training:
            pre_cell = tf.contrib.rnn.DropoutWrapper(pre_cell, output_keep_prob=KEEP_PROB)
        pre_lstm_cell = tf.contrib.rnn.MultiRNNCell([pre_cell] * PRE_CONTEXT_NUM_LAYERS, state_is_tuple=True)
        pre_input = tf.nn.embedding_lookup(embedding, self.pre_input)  # word ids -> word vectors
        if is_training:
            pre_input = tf.nn.dropout(pre_input, KEEP_PROB)
        self.pre_initial_state = pre_lstm_cell.zero_state(batch_size, tf.float32)  # zero initial state
        pre_outputs, pre_states = tf.nn.dynamic_rnn(pre_lstm_cell, pre_input,
                                                    sequence_length=self.pre_input_seq_length,
                                                    initial_state=self.pre_initial_state, dtype=tf.float32)
        #tmp_output = pre_outputs[:, -1, :]  # previous output as next prediction input
        #pre_outputs, pre_states = pre_lstm_cell(tmp_output, pre_states)
        pre_outputs = pre_outputs[:, -1, :]  # keep last-timestep output only
        self.pre_final_state = pre_states  # final state of the preceding-context LSTM

    # Following-context LSTM (mirror of the block above).
    with tf.variable_scope('Fol') as scope:
        fol_cell = tf.contrib.rnn.BasicLSTMCell(num_units=FOL_CONTEXT_HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True)
        if is_training:
            fol_cell = tf.contrib.rnn.DropoutWrapper(fol_cell, output_keep_prob=KEEP_PROB)
        fol_lstm_cell = tf.contrib.rnn.MultiRNNCell([fol_cell] * FOL_CONTEXT_NUM_LAYERS, state_is_tuple=True)
        fol_input = tf.nn.embedding_lookup(embedding, self.fol_input)  # word ids -> word vectors
        if is_training:
            fol_input = tf.nn.dropout(fol_input, KEEP_PROB)
        self.fol_initial_state = fol_lstm_cell.zero_state(batch_size, tf.float32)  # zero initial state
        fol_outputs, fol_states = tf.nn.dynamic_rnn(fol_lstm_cell, fol_input,
                                                    sequence_length=self.fol_input_seq_length,
                                                    initial_state=self.fol_initial_state, dtype=tf.float32)
        #tmp_output = fol_outputs[:, -1, :]  # previous output as next prediction input
        #fol_outputs, fol_states = fol_lstm_cell(tmp_output, fol_states)
        fol_outputs = fol_outputs[:, -1, :]  # keep last-timestep output only
        self.fol_final_state = fol_states  # final state of the following-context LSTM

    # Alternative: weighted average of the two LSTM outputs.
    # self.output = tf.add(pre_outputs,fol_outputs)/2
    # Fully-connected layer
    # weight = tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE])
    # bias = tf.get_variable("bias", [VOCAB_SIZE])
    # self.logits = tf.matmul(self.output, weight) + bias

    # Simple concatenation of both context encodings.
    output = tf.concat([pre_outputs, fol_outputs], 1)
    # Fully-connected output layer.
    weight = tf.get_variable("weight", [2*HIDDEN_SIZE, VOCAB_SIZE])
    bias = tf.get_variable("bias", [VOCAB_SIZE])
    self.logits = tf.matmul(output, weight) + bias

    '''
    Define the cross-entropy loss and the mean loss.
    The most probable of the vocab_size logits is compared against the
    corresponding target to compute the loss value; the op below returns
    a 1-D tensor of shape [batch_size].
    '''
    #loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    #    [self.logits], [self.targets],
    #    [tf.ones([batch_size], dtype=tf.float32)])
    # softmax + cross-entropy loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.targets)

    # Track cost.
    with tf.variable_scope('cost') as scope:
        self.cost = tf.reduce_mean(loss)
        self.ave_cost = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        self.ave_cost_op = self.ave_cost.assign(tf.divide(
            tf.add(tf.multiply(self.ave_cost, self.global_step), self.cost),
            self.global_step+1))  # global_step starts at 0
        tf.summary.scalar('cost', self.cost)
        tf.summary.scalar('ave_cost', self.ave_cost)

    # Track accuracy.
    with tf.variable_scope('accuracy') as scope:
        correct_prediction = tf.equal(self.targets, tf.cast(tf.argmax(self.logits, -1), tf.int32))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        self.ave_accuracy = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        self.ave_accuracy_op = self.ave_accuracy.assign(tf.divide(
            tf.add(tf.multiply(self.ave_accuracy, self.global_step),self.accuracy),self.global_step+1))  # global_step starts at 0
        tf.summary.scalar('accuracy', self.accuracy)
        tf.summary.scalar('ave_accuracy', self.ave_accuracy)

    # Backprop ops are defined only when training.
    if not is_training:
        return
    # trainable_variables = tf.trainable_variables()
    # trainable_variables = tf.all_variables()
    # Clip gradients; define optimization method and training step.
    # grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
    # optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
    # self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
    self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
    self.merged_summary_op = tf.summary.merge_all()  # collect summary nodes
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Optionally adds token-type embeddings and learned position embeddings to
    `input_tensor`, then applies layer norm and dropout.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. Name of the token-type embedding
        table variable.
      use_position_embeddings: bool. Whether to add position embeddings.
      position_embedding_name: string. Name of the positional embedding table
        variable.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length this model might
        ever see; must be >= the sequence length of `input_tensor`.
      dropout_prob: float. Dropout probability applied to the final output.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    batch_size, seq_length, width = get_shape_list(input_tensor, expected_rank=3)

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError(
                "`token_type_ids` must be specified if`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # The token-type vocab is tiny, so a one-hot matmul is always faster
        # than a gather here.
        one_hot_ids = tf.one_hot(tf.reshape(token_type_ids, [-1]),
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.reshape(
            tf.matmul(one_hot_ids, token_type_table),
            [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # Learned table for positions [0, max_position_embeddings); the
            # current sequence only uses the prefix [0, seq_length), which we
            # slice out.
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())
            # Only the trailing (seq_length, width) dims matter; broadcast
            # over every leading dim (typically just the batch).
            broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
            output += tf.reshape(position_embeddings, broadcast_shape)

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def add_prediction_op(self):
    """Adds the unrolled RNN:

        h_0 = 0
        for t in 1 to T:
            o_t, h_t = cell(x_t, h_{t-1})
            o_drop_t = Dropout(o_t, dropout_rate)
            y_t = o_drop_t U + b_2

    Variables in the RNN cell are reused across timesteps via
    tf.get_variable_scope().reuse_variables().

    Returns:
        pred: tf.Tensor of shape (batch_size, max_length, n_classes)
    """
    x = self.add_embedding()
    dropout_rate = self.dropout_placeholder

    preds = []  # Predicted output at each timestep goes here.

    # Use the cell selected by the config: the plain RNNCell for Q2, the
    # GRUCell for Q3.
    if Config.cell == "rnn":
        cell = RNNCell(Config.n_features * Config.embed_size, Config.hidden_size)
    elif Config.cell == "gru":
        cell = GRUCell(Config.n_features * Config.embed_size, Config.hidden_size)
    else:
        raise ValueError("Unsuppported cell type: " + Config.cell)

    # Output projection U, b2 (xavier-initialized) and the zero initial state.
    U = tf.get_variable(
        'U',
        shape=(Config.hidden_size, Config.n_classes),
        initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable(
        'b2',
        shape=(Config.n_classes),
        initializer=tf.contrib.layers.xavier_initializer(seed=2))
    h = tf.zeros(shape=(tf.shape(x)[0], Config.hidden_size))

    with tf.variable_scope("RNN"):
        for time_step in range(self.max_length):
            # From the second timestep onwards reuse the cell's variables
            # instead of creating new ones.
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            output, h = cell(x[:, time_step, :], h)
            output = tf.nn.dropout(output, self.dropout_placeholder)
            output = tf.matmul(output, U) + b2
            preds.append(output)

    # Stack per-timestep predictions (time, batch, classes) and reorder to
    # (batch, time, classes).
    # FIX: removed stray Python-2 debug statement `print preds.shape`, which
    # is a SyntaxError under Python 3 and was debug leftover.
    preds = tf.stack(preds)
    preds = tf.transpose(preds, perm=[1, 0, 2])

    assert preds.get_shape().as_list() == [
        None, self.max_length, Config.n_classes
    ], "predictions are not of the right shape. Expected {}, got {}".format(
        [None, self.max_length, Config.n_classes], preds.get_shape().as_list())
    return preds
hr_imgs.append(img) tmp = img[:, :, 0] lr_img = misc.imresize(tmp, [height // scale, width // scale], interp='bicubic', mode='F') lr_imgs.append(lr_img / 255.0) bic_img = misc.imresize(lr_img, [height, width], interp='bicubic', mode='F') bic_imgs.append(bic_img / 255.0) pad = t // 2 lr_imgs = [lr_imgs[0]] * pad + lr_imgs + [lr_imgs[-1]] * pad bic_imgs = [bic_imgs[0]] * pad + bic_imgs + [bic_imgs[-1]] * pad print('files num:', len(lr_imgs)) lr = tf.placeholder(dtype=tf.float32, shape=[1, t, height // scale, width // scale, 1]) bic = tf.placeholder(dtype=tf.float32, shape=[1, height, width, 1]) tf_pre_sr = tf.get_variable('tf_pre_sr', shape=[1, height, width, 1], dtype=tf.float32, collections=[tf.GraphKeys.LOCAL_VARIABLES]) tf_pre_feat = tf.get_variable('tf_pre_feat', shape=[1, height // scale, width // scale, 128], dtype=tf.float32, collections=[tf.GraphKeys.LOCAL_VARIABLES]) with tf.variable_scope('video_sr'): m = model() local_sr, local_feat = m.local_net(lr, bic) local_sr = tf.clip_by_value(local_sr, 0, 1) refine_sr, refine_feat = m.refine_net(tf_pre_sr, tf_pre_feat, local_sr, local_feat) refine_sr = tf.clip_by_value(refine_sr, 0, 1) saver = tf.train.Saver()
def __init__(self, args, infer=False):
    """Build a character-aware language model: char embeddings -> TDNN
    convolutions -> optional highway layers -> LSTM -> per-step word logits,
    with clipped-SGD training ops (legacy tf.pack/tf.unpack API).

    Args:
        args: namespace with batch_size, num_unroll_steps, max_word_length,
            char_vocab_size, char_embed_size, rnn_size, rnn_layers, dropout,
            highway_layers, word_vocab_size, learning_rate, max_grad_norm.
        infer: unused here; kept for interface compatibility.
    """
    self.kernels = [1, 2, 3, 4, 5, 6, 7]
    self.kernel_features = [50, 100, 150, 200, 200, 200, 200]
    # FIX: the message tuple previously called len() with two arguments
    # (`len(self.kernels, len(self.kernel_features))`), which would raise
    # TypeError instead of formatting the assertion message.
    assert len(self.kernels) == len(self.kernel_features), \
        'kernels size:%d,kernel_feature size:%d' % (
            len(self.kernels), len(self.kernel_features))

    self.input_ = tf.placeholder(tf.int32,
                                 shape=[args.batch_size, args.num_unroll_steps, args.max_word_length],
                                 name="input")
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.num_unroll_steps],
                                  name='targets')
    target_list = tf.unpack(self.targets, axis=1)  # hjq
    ''' First, embed characters '''
    with tf.variable_scope('Embedding'):
        char_embedding_r = tf.get_variable(
            'char_embedding', [args.char_vocab_size, args.char_embed_size])
        # Zero out row 0 (the padding character) so it contributes nothing.
        char_embedinglist = tf.unpack(char_embedding_r)
        char_embedinglist[0] = tf.zeros([args.char_embed_size], dtype=tf.float32)
        self.char_embedding = tf.pack(char_embedinglist)
        # [batch_size x max_word_length, num_unroll_steps, char_embed_size]
        input_embedded = tf.nn.embedding_lookup(self.char_embedding, self.input_)
        input_embedded_s = tf.reshape(
            input_embedded, [-1, args.max_word_length, args.char_embed_size])
    ''' Second, apply convolutions '''
    # [batch_size x num_unroll_steps, cnn_size] where cnn_size=sum(kernel_features)
    input_cnn = tdnn(input_embedded_s, self.kernels, self.kernel_features)
    ''' Maybe apply Highway '''
    if args.highway_layers > 0:
        input_cnn = highway(input_cnn, input_cnn.get_shape()[-1],
                            num_layers=args.highway_layers)
    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):
        cell = tf.nn.rnn_cell.BasicLSTMCell(args.rnn_size,
                                            state_is_tuple=True,
                                            forget_bias=0.0)
        if args.dropout > 0.0:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1. - args.dropout)
        if args.rnn_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([cell] * args.rnn_layers,
                                               state_is_tuple=True)
        self.initial_rnn_state = cell.zero_state(args.batch_size, dtype=tf.float32)
        input_cnn = tf.reshape(input_cnn,
                               [args.batch_size, args.num_unroll_steps, -1])
        # A list of Tensor[batch_size x hidden], length num_unroll_steps.
        input_cnn2 = tf.unpack(input_cnn, axis=1)  # hjq
        outputs, state = tf.nn.rnn(cell, input_cnn2,
                                   initial_state=self.initial_rnn_state,
                                   dtype=tf.float32)  # origin
        self.final_rnn_state = state

    # Linear projection onto output (word) vocab, sharing weights across steps.
    self.logits = []
    with tf.variable_scope('WordEmbedding') as scope:
        for idx, output in enumerate(tf.unpack(outputs, axis=0)):
            if idx > 0:
                scope.reuse_variables()
            self.logits.append(linear(output, args.word_vocab_size))

    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, target_list),
        name='loss') / args.batch_size
    cost = self.loss / args.num_unroll_steps

    # Training ops: gradient clipping + SGD with a decayable learning rate.
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.learning_rate = tf.Variable(args.learning_rate, trainable=False,
                                     name='learning_rate')
    tvars = tf.trainable_variables()
    grads, self.global_norm = tf.clip_by_global_norm(
        tf.gradients(cost, tvars), args.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def train(data,
          batch_size=128,
          learning_rate=FLAGS.learning_rate,
          log_dir='./log',
          checkpoint_dir='./checkpoint',
          num_epochs=-1):
    """Build and train the two-conv-layer ABC (binary-approximation) network.

    Alternates alpha-training steps with the main optimizer step, evaluates
    and checkpoints once per epoch, and writes summaries to `log_dir`.

    Args:
        data: dataset object exposing `size` and `generate_batches` (or an
            imagenet pipeline when FLAGS.dataset == "imagenet").
        batch_size: mini-batch size.
        learning_rate: learning rate passed to the optimizer.
        log_dir: directory for TensorBoard summaries.
        checkpoint_dir: directory for model checkpoints.
        num_epochs: stop after this many epochs; -1 means run forever.
    """
    # Input pipeline lives on the CPU.
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            if FLAGS.dataset == "imagenet":
                x, yt = image_processing.distorted_inputs(
                    data,
                    batch_size=batch_size,
                    num_preprocess_threads=FLAGS.num_threads)
            else:
                x, yt = data.generate_batches(batch_size,
                                              num_threads=FLAGS.num_threads)
        global_step = tf.get_variable('global_step',
                                      shape=[],
                                      dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'
    with tf.device(device_str):
        alphas_training_operations = []
        # Convolution Layer 1 (ABC-approximated).
        W_conv1 = weight_variable(shape=([5, 5, 3, 32]), name="W_conv1")
        b_conv1 = bias_variable(shape=[32], name="b_conv1")
        alphas_training_op1, ABCLayer1, alphas_loss1 = ABC(
            W_conv1, b_conv1, no_binary_filters=5, no_ApproxConvLayers=5,
            padding="SAME")
        alphas_training_operations.append(alphas_training_op1)
        conv1 = ABCLayer1(x)
        pool1 = max_pool_2x2(conv1)
        bn_conv1 = tf.layers.batch_normalization(pool1, axis=-1)
        h_conv1 = tf.nn.relu(bn_conv1)
        # Convolution Layer 2 (weights restored from `values`).
        W_conv2 = tf.Variable(values["W_conv2"], name="W_conv2")
        b_conv2 = tf.Variable(values["b_conv2"], name="b_conv2")
        alphas_training_op2, ABCLayer2, alphas_loss2 = ABC(
            W_conv2, b_conv2, no_binary_filters=5, no_ApproxConvLayers=5,
            padding="SAME")
        alphas_training_operations.append(alphas_training_op2)
        conv2 = ABCLayer2(h_conv1)
        pool2 = max_pool_2x2(conv2)
        bn_conv2 = tf.layers.batch_normalization(pool2, axis=-1)
        h_conv2 = tf.nn.relu(bn_conv2)
        # Flatten conv2 output, then two dense layers to 10-class logits.
        h_conv2_flat = tf.reshape(h_conv2, shape=(-1, 7 * 7 * 64))
        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)
        W_fc2 = weight_variable([1024, 10])
        b_fc2 = bias_variable([10])
        model = tf.matmul(h_fc1, W_fc2) + b_fc2
        y = model
        graph_init = tf.global_variables_initializer()

        # Loss, accuracy and the optimizer (contrib wrapper handles the
        # optional learning-rate decay schedule).
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt,
                                                               logits=y))
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
        opt = tf.contrib.layers.optimize_loss(
            loss,
            global_step,
            learning_rate,
            'Adam',
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=None,
            learning_rate_decay_fn=learning_rate_decay_fn
            if FLAGS.using_learning_rate_decay_fn else None,
            update_ops=None,
            variables=None,
            name=None)
        # Exponential moving averages of loss/accuracy (and all trainables)
        # feed the training summaries.
        ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                                global_step,
                                                name='average')
        ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)
        loss_avg = ema.average(loss)
        tf.summary.scalar('loss/training', loss_avg)
        accuracy_avg = ema.average(accuracy)
        tf.summary.scalar('accuracy/training', accuracy_avg)
        # Abort the step if the loss ever becomes NaN/Inf.
        check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
        updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies([opt]):
            train_op = tf.group(*updates_collection)
        if FLAGS.summary:
            add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                          activation_list=tf.get_collection(
                              tf.GraphKeys.ACTIVATIONS),
                          var_list=tf.trainable_variables())
        summary_op = tf.summary.merge_all()

    # Configure options for session.
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))
    if FLAGS.resume:
        logging.info('resuming from ' + checkpoint_dir)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir + '/')
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint.
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
            return
    else:
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    num_batches = data.size[0] / batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = global_step.eval() / num_batches if FLAGS.resume else 0
    display_interval = FLAGS.display_interval or num_batches / 10
    test_interval = FLAGS.test_interval or num_batches / 2
    logging.info('num of trainable paramaters: %d' %
                 count_params(tf.trainable_variables()))
    tic = time.clock()
    while epoch != num_epochs:
        curr_step = 0
        logging.info('Started epoch %d' % epoch)
        while curr_step < data.size[0]:
            # Train the alphas of every ABC layer before each main step.
            for alpha_training_op in alphas_training_operations:
                for alpha_epoch in range(alpha_training_epochs):
                    sess.run(alpha_training_op)
            _, loss_val, step = sess.run([train_op, loss, global_step])
            curr_step += FLAGS.batch_size
        # End of epoch: log averaged metrics, checkpoint and evaluate.
        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess,
                   save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)
        test_top1, test_top5, test_loss = evaluate(
            model,
            FLAGS.dataset,
            batch_size=batch_size,
            checkpoint_dir=checkpoint_dir)
        logging.info('Test loss %.3f Test top1 %.3f Test top5 %.3f' %
                     (test_loss, test_top1, test_top5))
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test_top1', simple_value=test_top1)
        summary_out.value.add(tag='accuracy/test_top5', simple_value=test_top5)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()
        logging.info("Finished epoch %d " % epoch)
        epoch += 1
    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
    # FIX: was `toc = time._conv_block` (corrupted); pairs with `tic = time.clock()`.
    toc = time.clock()
    duration = toc - tic
    logging.info('Duration %.3f ' % (duration))
def decoding_w(self) -> tf.Variable:
    """Weight matrix projecting the decoder state onto vocabulary logits."""
    with tf.name_scope("output_projection"):
        w_shape = [self.output_projection_size, len(self.vocabulary)]
        return tf.get_variable(name="state_to_word_W",
                               shape=w_shape,
                               initializer=tf.glorot_normal_initializer())
def build_graph(top_k, is_train=True, num_classes=FLAGS.charset_size):
    """Build the VGG-A character classifier graph.

    When `is_train` is True, builds the training tower (loss, Adam train op,
    summaries) and returns its endpoints; otherwise builds a reusing
    validation tower and returns early with validation endpoints only.

    Args:
        top_k: k used for the top-k accuracy / prediction outputs.
        is_train: select training vs. validation tower.
        num_classes: number of output classes.

    Returns:
        dict of placeholders and graph endpoints (keys differ slightly
        between the training and validation variants).
    """
    images = tf.placeholder(dtype=tf.float32, shape=[None, 112, 112, 1], name='image_batch')
    labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')
    if is_train:
        # Fresh variables for the training tower.
        net, endpoints = vgg_a(images, num_classes=num_classes, is_training=True, reuse=False)
        pre_label = tf.argmax(net, 1)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=net, labels=labels))
        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(net, 1), labels), tf.float32))
        probabilities = tf.nn.softmax(net)
        predicted_val_top_k, predicted_index_top_k = tf.nn.top_k(probabilities, k=top_k)
        accuracy_in_top_k = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(probabilities, labels, top_k), tf.float32))
    else:
        # Validation tower reuses the training tower's variables and returns
        # early — the optimizer/summary section below is training-only.
        vali_net, vali_end_points = vgg_a(images, num_classes=num_classes, is_training=False, reuse=True)
        vali_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=vali_net, labels=labels))
        vali_pre_label = tf.argmax(vali_net, 1)
        vali_accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(vali_net, 1), labels), tf.float32))
        vali_probabilities = tf.nn.softmax(vali_net)
        vali_predicted_val_top_k, vali_predicted_index_top_k = tf.nn.top_k(
            vali_probabilities, k=top_k)
        vali_accuracy_in_top_k = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(vali_probabilities, labels, top_k), tf.float32))
        return {
            'images': images,
            'labels': labels,
            'logits': vali_net,
            'top_k': top_k,
            'loss': vali_loss,
            'accuracy': vali_accuracy,
            'predicted': vali_pre_label,
            'accuracy_top_k': vali_accuracy_in_top_k,
            'predicted_distribution': vali_probabilities,
            'predicted_index_top_k': vali_predicted_index_top_k,
            'predicted_val_top_k': vali_predicted_val_top_k
        }
    global_step = tf.get_variable("step", [], initializer=tf.constant_initializer(0.0), trainable=False)
    #rate = tf.train.exponential_decay(0.001, global_step, decay_steps=FLAGS.decay_step, decay_rate=0.97, staircase=True)
    opt = tf.train.AdamOptimizer(learning_rate=1e-4)
    # Run batch-norm (and other) update ops before every optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies([tf.group(*update_ops)]):
        train_op = opt.minimize(loss, global_step=global_step)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    merged_summary_op = tf.summary.merge_all()
    return {
        'images': images,
        'labels': labels,
        'logits': net,
        'top_k': top_k,
        'global_step': global_step,
        'train_op': train_op,
        'loss': loss,
        'accuracy': accuracy,
        'accuracy_top_k': accuracy_in_top_k,
        'merged_summary_op': merged_summary_op,
        'predicted': pre_label,
        'predicted_distribution': probabilities,
        'predicted_index_top_k': predicted_index_top_k,
        'predicted_val_top_k': predicted_val_top_k
    }
def build_and_restore_model(init_checkpoint, bert_config_file):
    """Build a BERT encoder with a SQuAD-style span head and restore weights.

    Creates the three standard BERT input placeholders, runs a frozen
    (``is_training=False``) BERT encoder, adds a 2-unit linear projection over
    every token position (start/end logits, left flattened as
    ``(batch*seq, 2)``), and initializes all variables whose names match
    entries in ``init_checkpoint``.

    Args:
        init_checkpoint: path to the pretrained BERT checkpoint.
        bert_config_file: path to the BertConfig JSON file.

    Returns:
        The list of trainable variables in the constructed graph.
    """
    input_ids = tf.placeholder(tf.int32, shape=(None, None), name='input_ids')
    input_mask = tf.placeholder(tf.int32, shape=(None, None), name='input_mask')
    segment_ids = tf.placeholder(tf.int32, shape=(None, None), name='segment_ids')

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=True,
        scope="bert")

    final_hidden = model.get_sequence_output()
    # Rank-3 sanity check on the encoder output; raises at build time if the
    # shape is unexpected. (Result intentionally unused.)
    modeling.get_shape_list(final_hidden, expected_rank=3)

    # Span head: one weight row per logit (start, end), shared across positions.
    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, bert_config.hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden, [-1, bert_config.hidden_size])
    logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    # Map checkpoint variables onto the graph and register them for
    # initialization; variables not present in the checkpoint keep their
    # random initializers.
    tvars = tf.trainable_variables()
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)
    return tvars
def decoding_b(self) -> tf.Variable:
    """Return the output-projection bias: one zero-initialized entry per vocabulary word."""
    vocab_size = len(self.vocabulary)
    with tf.name_scope("output_projection"):
        bias = tf.get_variable(
            name="state_to_word_b",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
    return bias
def model_fn(features, labels, mode):
    """Estimator-style model_fn for an ESIM-over-BERT sentence-pair classifier.

    Encodes the "a" and "b" inputs with a shared BERT/ESIM encoder, aligns
    them, pools each side, classifies the concatenated pair representation,
    and (in TRAIN mode) builds the train op.

    Note: this closure reads many outer-scope names (model_config,
    model_io_config, input_name, model_reuse, num_labels, label_tensor,
    classifier, model_io_fn, optimizer, opt_config, load_pretrained,
    init_checkpoint, exclude_scope_dict, not_storage_params) -- defined in the
    enclosing builder, not visible here.

    Returns:
        TRAIN mode: [train_op, loss, per_example_loss, logits]
        otherwise:  [loss, loss, per_example_loss, logits]
    """
    # Task variables live under a sub-scope when the language model is frozen.
    if model_io_config.fix_lm == True:
        scope = model_config.scope + "/task"
    else:
        scope = model_config.scope
    # Dropout only during training; zeroed for eval/predict.
    if mode == tf.estimator.ModeKeys.TRAIN:
        hidden_dropout_prob = model_config.hidden_dropout_prob
        attention_probs_dropout_prob = model_config.attention_probs_dropout_prob
        dropout_prob = model_config.dropout_prob
    else:
        hidden_dropout_prob = 0.0
        attention_probs_dropout_prob = 0.0
        dropout_prob = 0.0
    label_ids = features["label_ids"]
    repres_lst = {}
    # Encode each input ("a", "b", ...) with shared weights: the first pass
    # uses the caller-provided reuse flag, subsequent passes always reuse.
    for index, name in enumerate(input_name):
        if index > 0:
            reuse = True
        else:
            reuse = model_reuse
        repres_lst[name] = esim_bert_encoding(model_config, features, labels,
                                              mode, name, scope, dropout_prob,
                                              reuse=reuse)
    # Soft alignment (ESIM-style cross attention) between the two sides.
    a_output, b_output = alignment(model_config, repres_lst["a"], repres_lst["b"],
                                   features["input_mask_{}".format("a")],
                                   features["input_mask_{}".format("b")],
                                   scope, reuse=model_reuse)
    repres_a = esim_bert_pooling(model_config, a_output,
                                 features["input_mask_{}".format("a")],
                                 scope, dropout_prob, reuse=model_reuse)
    repres_b = esim_bert_pooling(model_config, b_output,
                                 features["input_mask_{}".format("b")],
                                 scope, dropout_prob, reuse=True)
    # Standard matching features: [a; b; |a-b|; a*b].
    pair_repres = tf.concat([repres_a, repres_b,
                             tf.abs(repres_a - repres_b),
                             repres_b * repres_a], axis=-1)
    with tf.variable_scope(scope, reuse=model_reuse):
        # Best-effort class weighting: if label_tensor is unusable as an
        # initializer (e.g. None, or shape/initializer mismatch raising at
        # get_variable time), silently fall back to unweighted loss.
        # NOTE(review): bare except is deliberate control flow here.
        try:
            label_ratio_table = tf.get_variable(
                name="label_ratio",
                shape=[num_labels,],
                initializer=tf.constant(label_tensor),
                trainable=False)
            ratio_weight = tf.nn.embedding_lookup(label_ratio_table, label_ids)
            print("==applying class weight==")
        except:
            ratio_weight = None
    (loss, per_example_loss, logits) = classifier.classifier(
        model_config, pair_repres, num_labels, label_ids, dropout_prob,
        ratio_weight)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Restore pretrained encoder weights before training starts.
        pretrained_tvars = model_io_fn.get_params(
            model_config.scope, not_storage_params=not_storage_params)
        if load_pretrained:
            model_io_fn.load_pretrained(pretrained_tvars, init_checkpoint,
                                        exclude_scope=exclude_scope_dict["task"])
    trainable_params = model_io_fn.get_params(
        scope, not_storage_params=not_storage_params)
    tvars = trainable_params
    # Storage params cover the full model scope (for the saver), which may be
    # wider than the trainable task scope.
    storage_params = model_io_fn.get_params(
        model_config.scope, not_storage_params=not_storage_params)
    model_io_fn.set_saver()
    for var in storage_params:
        print(var.name, var.get_shape(), "==storage params==")
    for var in tvars:
        print(var.name, var.get_shape(), "==trainable params==")
    if mode == tf.estimator.ModeKeys.TRAIN:
        model_io_fn.print_params(tvars, string=", trainable params")
        # Run UPDATE_OPS (e.g. batch-norm statistics) before each step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer_fn = optimizer.Optimizer(opt_config)
            train_op = optimizer_fn.get_train_op(loss, tvars,
                                                 opt_config.init_lr,
                                                 opt_config.num_train_steps)
        return [train_op, loss, per_example_loss, logits]
    else:
        model_io_fn.print_params(tvars, string=", trainable params")
        # Loss duplicated to keep the same 4-element shape as the TRAIN return.
        return [loss, loss, per_example_loss, logits]
def train():
    """Train CIFAR-10 for a number of steps across FLAGS.num_gpus GPU towers.

    Builds one model tower per GPU with shared variables, averages the tower
    gradients on the CPU, applies them with SGD under an exponentially
    decaying learning rate, maintains EMA shadows of all trainable variables,
    and runs the training loop with periodic logging, summaries, and
    checkpoints.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Counts train() step applications; equals batches * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Learning-rate schedule: staircase exponential decay.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        # One tower per GPU; all towers share variables.
        tower_grads = []
        for i in xrange(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    # Builds the whole model; variables are shared across towers.
                    loss = tower_loss(scope)
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    # Keep only the last tower's summaries.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

        # Synchronization point: average gradients over all towers.
        grads = average_gradients(tower_grads)

        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Gradient histograms.
        for grad, var in grads:
            # FIX: was `if grad:` -- compute_gradients yields None for
            # variables the loss doesn't depend on, and truth-testing a
            # Tensor/IndexedSlices is invalid; the correct check is `is not None`.
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Variable histograms.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # EMA shadows of all trainable variables (used at eval time).
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Single train op: gradient step + EMA update.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_summary(summaries)
        init = tf.initialize_all_variables()

        # allow_soft_placement is required: some ops have no GPU kernel.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 **unused_params):
    """Create a Deep Bag-of-Frames (DBoF) model.

    Samples frames, projects them into a learned cluster space, pools over
    time, applies one hidden layer, and delegates final classification to the
    configured video-level model.

    Args:
        model_input: frame-level features; assumed (batch, frames, feature_size)
            -- TODO confirm against the caller.
        vocab_size: number of output classes.
        num_frames: per-example valid-frame counts.
        iterations / add_batch_norm / sample_random_frames / cluster_size /
        hidden_size: optional overrides; fall back to the corresponding FLAGS.
        is_training: toggles batch-norm training mode.

    Returns:
        Whatever the configured video-level classifier's create_model returns.
    """
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    # Flatten frames into rows so the cluster projection is a single matmul.
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(reshaped_input,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="input_bn")

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="cluster_bn")
    else:
        # BUG FIX: was `initializer=tf.random_normal(stddev=...)` --
        # tf.random_normal is a tensor op (requiring a shape argument), not an
        # initializer; graph construction raised whenever batch norm was off.
        # tf.random_normal_initializer matches the weight initializer above.
        cluster_biases = tf.get_variable(
            "cluster_biases", [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    # Restore the time axis and pool it away.
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)
def create_variables(name, shape, initializer=tf.contrib.layers.xavier_initializer()):
    """Create a variable with L2 regularization and record an activation summary.

    Args:
        name: variable name.
        shape: variable shape.
        initializer: variable initializer (Xavier by default).

    Returns:
        The created tf.Variable.
    """
    l2_reg = tf.contrib.layers.l2_regularizer(scale=L2_value)
    var = tf.get_variable(name, shape=shape, initializer=initializer,
                          regularizer=l2_reg)
    activation_summary(var)
    return var
def _build_graph(self):
    """Assemble the Transformer graph: logits, masked loss/accuracy,
    warmup-scheduled Adam training, and summaries.

    Side effects: sets self.logits, self.preds, self.acc, self.y_smoothed,
    self.loss, self.mean_loss, self.global_step, self.learning_rate,
    self.update_op, self.optimizer, self.train_op, self.merged, self.saver.
    """
    with tf.name_scope('TRANSFORMER'):
        self.logits = self._transformer_layer(
            inputs=self.x,
            decoder_inputs=self.decoder_inputs,
            drop_rate=self._drop_rate,
            is_training=self._is_training)
    with tf.name_scope('Loss'):
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        # Accuracy: Remove <PAD> Characters
        # (token id 0 is treated as padding and excluded from the average).
        is_target = tf.to_float(tf.not_equal(self.y, 0))
        total_num = tf.reduce_sum(is_target)
        correct_num = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * is_target)
        self.acc = correct_num / total_num
        # Loss: Remove <PAD> Characters
        # Label smoothing is applied only while training.
        self.y_smoothed = tf.cond(
            pred=self._is_training,
            true_fn=lambda: label_smoother(
                tf.one_hot(self.y, depth=len(self.target_int2vocab))),
            false_fn=lambda: tf.one_hot(
                self.y, depth=len(self.target_int2vocab)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.y_smoothed)
        # Mean over non-PAD positions only.
        self.mean_loss = tf.reduce_sum(
            self.loss * is_target) / (tf.reduce_sum(is_target))
    with tf.name_scope('Training_Scheme'):
        # Starts at 1 so the warmup schedule never sees step 0.
        self.global_step = tf.get_variable(
            name='global_step',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(value=1, dtype=tf.int32),
            trainable=False)
        # "Attention Is All You Need"-style warmup then decay.
        self.learning_rate = warmup_learning_rate(
            d_model=self._num_units,
            step_num=self.global_step,
            warmup_step=self._warmup_step)
        # for batch normalization update
        self.update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate,
            beta1=0.9,
            beta2=0.98,
            epsilon=1e-9)
        with tf.control_dependencies(self.update_op):
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)
    with tf.name_scope('Summary'):
        tf.summary.scalar('accuracy', self.acc)
        tf.summary.scalar('mean_loss', self.mean_loss)
        tf.summary.scalar('learning_rate', self.learning_rate)
        self.merged = tf.summary.merge_all()
    self.saver = tf.train.Saver()
    print("Model is built...")
def __init__(self, args):
    '''
    Initialize the model: hyperparameters, embedding tables, encoder,
    entity-pool encoding, and the dual-pointer decoder.

    :param args: dict holding the hyperparameters
    '''
    self.is_train = args["is_train"]
    self.batch_size = args["batch_size"]
    self.keep_pob = args["keep_prob"]
    self.dropout_prob = 1.0 - self.keep_pob
    self.learning_rate = args["learning_rate"]
    self.relation_vocab_size = args["relation_vocab_size"]
    self.entity_vocab_size = args["entity_vocab_size"]
    self.entity_type_emb_size = args["entity_type_emb_size"]
    self.char_vocab_size = args["char_vocab_size"]
    self.char_emb_size = args["char_emb_size"]
    self.max_sentences = args["max_sentences"]
    self.word_maxlen = args["word_maxlen"]
    self.word_emb_table = args["embedding_table"]
    self.word_emb_size = args["word_emb_size"]
    self.filter_size = args["filter_size"]
    self.num_filter = args["num_filter"]
    self.max_entities = args["max_entities"]
    self.entity_max_tokens = args["entity_max_tokens"]
    self.entity_max_chars = args["entity_max_chars"]
    # Encoder / decoder parameters
    self.encoder_stack = args["encoder_stack"]
    self.encoder_max_step = args["encoder_max_step"]
    self.encoder_hidden = args["encoder_hidden"]
    self.decoder_hidden = args["decoder_hidden"]
    self.global_step = tf.get_variable(
        'global_step',
        shape=[],
        dtype='int32',
        initializer=tf.constant_initializer(0),
        trainable=False)
    # Initialize the model's input placeholders
    self._placeholder_init()

    # Word embedding table fine-tuned jointly with the model
    finetune_table = tf.get_variable(
        name="word_embedding_table_finetuning",
        initializer=self.word_emb_table,
        trainable=True,
        dtype=tf.float32)
    # Frozen word embedding table that keeps the pretrained values as-is
    fix_table = tf.get_variable(name="word_embedding_table_fix",
                                initializer=self.word_emb_table,
                                trainable=False,
                                dtype=tf.float32)
    # Randomly initialized character embedding table
    char_emb_table = tf.get_variable(
        "char_emb_table",
        shape=[self.char_vocab_size, self.char_emb_size],
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # Randomly initialized entity-type embedding table
    entity_type_emb_table = tf.get_variable(
        "entity_type_emb_table",
        shape=[self.entity_vocab_size, self.entity_type_emb_size],
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # One-hot sentence-index embedding table (identity matrix)
    sentence_id_emb_table = tf.eye(num_rows=self.max_sentences)
    # Sentence word embeddings
    context_embedding = self._context_embedding_layer(
        fix_table=fix_table,
        finetune_table=finetune_table,
        char_emb_table=char_emb_table)
    # Sentence entity-type embeddings
    entity_type_embedding = tf.nn.embedding_lookup(
        entity_type_emb_table, self.context_entity_type)
    # Sentence-index embeddings
    sentence_id_embedding = tf.nn.embedding_lookup(sentence_id_emb_table,
                                                   self.sentence_id)
    # entity token, character, type, position, sentence_id embedding
    entity_embedding = self._entity_pool_embedding(
        fix_table=fix_table,
        finetune_table=finetune_table,
        char_emb_table=char_emb_table,
        token_entities=self.entity_pool,
        char_entities=self.char_entity_pool)
    # Gather, per batch element, the embeddings of entities that appear in
    # the sentence (per-example lookup into that example's entity pool).
    context_entity_emb = []
    unstack_entity_pool = tf.unstack(entity_embedding, axis=0)
    unstack_context_entity_id = tf.unstack(self.context_entity_id, axis=0)
    for entity_pool, context in zip(unstack_entity_pool,
                                    unstack_context_entity_id):
        context_entity_emb.append(
            tf.nn.embedding_lookup(entity_pool, context))
    context_entity_emb = tf.stack(context_entity_emb, axis=0)
    # context token, character, entity_type, sentence_id embedding
    context_embedding = tf.concat([
        context_embedding, entity_type_embedding, sentence_id_embedding,
        context_entity_emb
    ],
                                  axis=-1)
    # Entity embeddings plus entity sentence-index embeddings
    entity_pool_type_emb = tf.nn.embedding_lookup(entity_type_emb_table,
                                                  self.entity_pool_type)
    entity_pool_sent_emb = tf.nn.embedding_lookup(sentence_id_emb_table,
                                                  self.entity_sent_id)
    entity_pool_emb = tf.concat(
        [entity_embedding, entity_pool_type_emb, entity_pool_sent_emb],
        axis=-1)
    # "none" vector for entities without a relation to point at
    # (plus a matching "pad" vector); both zero-initialized but trainable.
    none_emb = tf.get_variable(name="none_emb",
                               shape=[self.decoder_hidden],
                               initializer=tf.zeros_initializer)
    pad_emb = tf.get_variable(name="pad_emb",
                              shape=[self.decoder_hidden],
                              initializer=tf.zeros_initializer)
    pad_token = tf.expand_dims(tf.stack([pad_emb] * self.batch_size, 0),
                               axis=1,
                               name="pad_token")
    none_token = tf.expand_dims(tf.stack([none_emb] * self.batch_size, 0),
                                axis=1,
                                name="none_token")
    # Sentence encoding
    encoder_output, encoder_state = self._biGRU_encoding_layer(
        encoder_input=context_embedding,
        encoder_length=self.context_input_length,
        name="encoder_layer")
    # Entity encoding and sentence-entity attention
    pointing_mem, decoder_state = self._entity_encoding_layer(
        entity_pool_emb, encoder_output, encoder_state)
    # Targets the decoder can point to: [pad, none, entities...]
    self.pointing_target = tf.concat([pad_token, none_token, pointing_mem],
                                     axis=1)
    # Decoder input
    decoder_input = tf.concat([entity_pool_emb, pointing_mem], axis=-1)
    # Decoder layer and train op
    self._dual_pointer_decoder(decoder_input=decoder_input,
                               decoder_init_state=decoder_state,
                               decoder_hidden=self.decoder_hidden,
                               pointing_memory=self.pointing_target)