def calculate_softmax(prefix_words_id, hps):
    softmax_w = sharded_variable(
        'softmax_w', [hps.vocab_size, hps.projected_size], hps.num_shards)
    softmax_b = tf.get_variable('softmax_b', [hps.vocab_size])

    # Reassemble the sharded rows into the full [vocab_size, projected_size] matrix.
    full_softmax_w = tf.reshape(
        tf.concat(softmax_w, 1), [-1, hps.projected_size])
    full_softmax_w = full_softmax_w[:hps.vocab_size, :]

    logits = (tf.matmul(prefix_words_id, full_softmax_w, transpose_b=True) +
              softmax_b)
    softmax = tf.nn.softmax(logits)
    # Distribution over the vocabulary at the last prefix position
    # (the original indexed one past the end).
    return softmax[-1]
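
# calculate_softmax (and the _forward variants below) rely on a `sharded_variable`
# helper that is not shown here. The concat-along-axis-1-then-reshape pattern only
# reassembles the full [vocab_size, projected_size] matrix if the helper splits
# rows round-robin across equally sized shards (row r lives in shard r % num_shards).
# A minimal sketch of such a helper, under that assumption; the name, signature,
# and initializer below are illustrative, not the repo's actual code.
import tensorflow as tf


def sharded_variable_sketch(name, shape, num_shards, dtype=tf.float32):
    # Each shard holds ceil(shape[0] / num_shards) rows, so the reassembled
    # matrix may be slightly larger than shape[0]; callers slice it back down.
    shard_size = (shape[0] + num_shards - 1) // num_shards
    return [
        tf.get_variable("%s_%d" % (name, i), [shard_size, shape[1]], dtype=dtype)
        for i in range(num_shards)
    ]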
def _forward(self, gpu, x, y):
    print("Setting up forward pass on GPU:%d" % gpu)
    hps = self.hps
    self.initial_states = []
    for i in range(hps.num_layers):
        with tf.device("/gpu:%d" % gpu):
            state = (
                tf.Variable(tf.zeros([hps.batch_size, hps.state_size],
                                     dtype=getdtype(hps, True)),
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES],
                            name="state_c_%d_%d" % (gpu, i),
                            dtype=getdtype(hps, True)),
                tf.Variable(tf.zeros([hps.batch_size, hps.projected_size],
                                     dtype=getdtype(hps, True)),
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES],
                            name="state_h_%d_%d" % (gpu, i),
                            dtype=getdtype(hps, True)),
            )
            self.initial_states += [state]

    emb_vars = sharded_variable("emb", [hps.vocab_size, hps.emb_size],
                                hps.num_shards, dtype=getdtype(hps, False))

    x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
    if hps.keep_prob < 1.0:
        x = tf.nn.dropout(x, hps.keep_prob)

    inputs = [
        tf.squeeze(input=tf.cast(v, getdtype(hps, True)), axis=[1])
        for v in tf.split(value=x, num_or_size_splits=hps.num_steps, axis=1)
    ]

    for i in range(hps.num_layers):
        with tf.variable_scope("lstm_%d" % i) as scope:
            if hps.num_of_groups > 1:
                assert hps.fact_size is None
                print("Using G-LSTM")
                print("Using %d groups" % hps.num_of_groups)
                cell = GLSTMCell(num_units=hps.state_size,
                                 num_proj=hps.projected_size,
                                 number_of_groups=hps.num_of_groups)
            else:
                if hps.fact_size:
                    print("Using F-LSTM")
                    print("Using factorization: %d x %d x %d" %
                          (2 * hps.projected_size, int(hps.fact_size),
                           4 * hps.state_size))
                    cell = FLSTMCell(num_units=hps.state_size,
                                     num_proj=hps.projected_size,
                                     factor_size=int(hps.fact_size))
                else:
                    print("Using LSTMP")
                    cell = LSTMCell(num_units=hps.state_size,
                                    num_proj=hps.projected_size)

            state = tf.contrib.rnn.LSTMStateTuple(self.initial_states[i][0],
                                                  self.initial_states[i][1])
            if hps.use_residual:
                cell = ResidualWrapper(cell=cell)

            for t in range(hps.num_steps):
                if t > 0:
                    scope.reuse_variables()
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            # Carry the final LSTM state over to the next batch.
            with tf.control_dependencies([
                    self.initial_states[i][0].assign(state[0]),
                    self.initial_states[i][1].assign(state[1])
            ]):
                inputs[t] = tf.identity(inputs[t])

    # inputs = tf.reshape(tf.concat(1, inputs), [-1, hps.projected_size])
    inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])

    # Initialization ignores the fact that softmax_w is transposed.
    # That worked slightly better.
    softmax_w = sharded_variable("softmax_w",
                                 [hps.vocab_size, hps.projected_size],
                                 hps.num_shards)
    softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

    if hps.num_sampled == 0:
        full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                    [-1, hps.projected_size])
        full_softmax_w = full_softmax_w[:hps.vocab_size, :]

        logits = tf.matmul(tf.to_float(inputs), full_softmax_w,
                           transpose_b=True) + softmax_b
        targets = tf.reshape(y, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=targets)
    else:
        targets = tf.reshape(y, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, targets,
                                          tf.to_float(inputs),
                                          hps.num_sampled, hps.vocab_size)

    # loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
    loss = tf.reduce_mean(loss)
    return loss
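
# The "Using factorization: %d x %d x %d" message above describes the F-LSTM idea:
# the usual LSTM gate matrix of shape [2*projected_size, 4*state_size] is replaced
# by the product of two thinner factors through a fact_size bottleneck. A hedged
# sketch of that gate computation follows; it illustrates the technique with
# hypothetical variable names and assumes the cell input has projected_size
# features, as the printed sizes imply. It is not the repo's FLSTMCell.
def flstm_gates_sketch(x_t, h_prev, projected_size, state_size, fact_size):
    concat = tf.concat([x_t, h_prev], 1)  # [batch, 2*projected_size]
    w1 = tf.get_variable("fact_w1", [2 * projected_size, fact_size])
    w2 = tf.get_variable("fact_w2", [fact_size, 4 * state_size])
    b = tf.get_variable("fact_b", [4 * state_size],
                        initializer=tf.zeros_initializer())
    # Pre-activations for the i, j, f, o gates, computed via the low-rank factors.
    return tf.matmul(tf.matmul(concat, w1), w2) + b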
def _forward(self, gpu, x, y, w):
    hps = self.hps
    w = tf.to_float(w)
    self.initial_states = []
    for i in range(hps.num_layers):
        with tf.device("/gpu:%d" % gpu):
            v = tf.Variable(tf.zeros(
                [hps.batch_size, hps.state_size + hps.projected_size]),
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES],
                            name="state_%d_%d" % (gpu, i))
            self.initial_states += [v]

    emb_vars = sharded_variable("emb", [hps.vocab_size, hps.emb_size],
                                hps.num_shards)

    x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
    if hps.keep_prob < 1.0:
        x = tf.nn.dropout(x, hps.keep_prob)

    inputs = [tf.squeeze(v, [1]) for v in tf.split(1, hps.num_steps, x)]

    for i in range(hps.num_layers):
        with tf.variable_scope("lstm_%d" % i):
            cell = LSTMCell(hps.state_size, hps.emb_size,
                            num_proj=hps.projected_size)

            state = self.initial_states[i]
            for t in range(hps.num_steps):
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            with tf.control_dependencies(
                    [self.initial_states[i].assign(state)]):
                inputs[t] = tf.identity(inputs[t])

    inputs = tf.reshape(tf.concat(1, inputs), [-1, hps.projected_size])

    # Initialization ignores the fact that softmax_w is transposed.
    # That worked slightly better.
    softmax_w = sharded_variable("softmax_w",
                                 [hps.vocab_size, hps.projected_size],
                                 hps.num_shards)
    softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

    if hps.num_sampled == 0:
        full_softmax_w = tf.reshape(tf.concat(1, softmax_w),
                                    [-1, hps.projected_size])
        full_softmax_w = full_softmax_w[:hps.vocab_size, :]

        logits = tf.matmul(inputs, full_softmax_w,
                           transpose_b=True) + softmax_b
        # targets = tf.reshape(tf.transpose(self.y), [-1])
        targets = tf.reshape(y, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets)
    else:
        targets = tf.reshape(y, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b,
                                          tf.to_float(inputs), targets,
                                          hps.num_sampled, hps.vocab_size)

    loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
    return loss
def __init__(self, hps):
    self.x = tf.placeholder(tf.int32, [1])
    self.ind = tf.placeholder(tf.int32, name="ind")
    # self.ind_len = tf.placeholder(tf.int32, name="ind_len")
    self.initial_state = tf.Variable(
        tf.zeros([hps.batch_size, hps.state_size + hps.projected_size]),
        trainable=False,
        collections=[tf.GraphKeys.LOCAL_VARIABLES],
        name="state_0_0")

    emb_vars = sharded_variable(
        "emb", [hps.vocab_size, hps.emb_size],
        hps.num_shards)  # vocab_size is too big for a single variable

    x = tf.nn.embedding_lookup(emb_vars, self.x)  # [batch_size, steps, emb_size]
    if hps.keep_prob < 1.0:
        x = tf.nn.dropout(x, hps.keep_prob)

    # [batch_size, emb_size] * steps
    # inputs = [tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, 1)]
    inputs = x

    with tf.variable_scope("lstm_0"):
        cell = LSTMCell(hps.state_size, hps.emb_size,
                        num_proj=hps.projected_size)
        state = self.initial_state
        # The cell returns the projected output h(t) and the new (c, h) state.
        inputs, self.final_state = cell(inputs, state)
        if hps.keep_prob < 1.0:
            inputs = tf.nn.dropout(inputs, hps.keep_prob)

    inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])

    # Initialization ignores the fact that softmax_w is transposed.
    # That worked slightly better.
    softmax_w = sharded_variable("softmax_w",
                                 [hps.vocab_size, hps.projected_size],
                                 hps.num_shards)
    softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

    full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                [-1, hps.projected_size])
    full_softmax_w = full_softmax_w[:hps.vocab_size, :]

    # [batch_size*steps, vocab]
    self.logits = tf.matmul(inputs, full_softmax_w,
                            transpose_b=True) + softmax_b
    self.logits = tf.squeeze(self.logits)
    """
    ind_logits = tf.reshape(self.logits, [hps.vocab_size, -1])
    ind_logits = tf.gather(ind_logits, self.ind)
    print "ind_logits:", self.logits, ind_logits
    ind_logits = tf.reshape(ind_logits, [-1, self.ind_len])

    self.top_k = tf.minimum(self.ind_len, hps.arg_max)
    # for complete
    _, self.ind_index = tf.nn.top_k(ind_logits, self.top_k)
    print "ind_index:", self.ind_index
    """
    # for predict
    _, self.index = tf.nn.top_k(self.logits, hps.arg_max)

    # for ema
    ema = tf.train.ExponentialMovingAverage(decay=0.999)
    variables_to_average = find_trainable_variables("LSTM")
    self.avg_dict = ema.variables_to_restore(variables_to_average)
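
# Hypothetical usage of the prediction graph above: `avg_dict` maps the EMA
# shadow-variable names to the live variables, so a tf.train.Saver built from it
# restores the exponentially averaged weights before running top-k prediction.
# `PredictModel`, `word_id`, and `checkpoint_path` are illustrative names only.
def predict_top_k_sketch(hps, word_id, checkpoint_path):
    model = PredictModel(hps)  # the __init__ above
    saver = tf.train.Saver(model.avg_dict)
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())  # the non-trainable LSTM state
        saver.restore(sess, checkpoint_path)
        return sess.run(model.index, feed_dict={model.x: [word_id]})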
def _forward(self, gpu, x, y, mode="train"):
    hps = self.hps
    self.initial_states = []
    # Every layer gets an initial state of shape
    # [hps.batch_size, hps.state_size + hps.projected_size].
    for i in range(hps.num_layers):
        with tf.device("/gpu:%d" % gpu):
            v = tf.Variable(tf.zeros(
                [hps.batch_size, hps.state_size + hps.projected_size]),
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES],
                            name="state_%d_%d" % (gpu, i))
            # self.initial_states += [v]
            self.initial_states = v  # for num_layers == 1

    emb_vars = sharded_variable(
        "emb", [hps.vocab_size, hps.emb_size],
        hps.num_shards)  # vocab_size is too big for a single variable

    x = tf.nn.embedding_lookup(emb_vars, x)  # [batch_size, steps, emb_size]
    if hps.keep_prob < 1.0:
        x = tf.nn.dropout(x, hps.keep_prob)

    # [batch_size, emb_size] * steps
    inputs = [tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, 1)]

    for i in range(hps.num_layers):
        with tf.variable_scope("lstm_%d" % i):
            cell = LSTMCell(hps.state_size, hps.emb_size,
                            num_proj=hps.projected_size)

            state = self.initial_states
            for t in range(hps.num_steps):  # step
                # The cell returns the projected output for step t and the new state.
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            # with tf.control_dependencies([self.initial_states.assign(state)]):
            #     # for bi-lstm? or two-layer lstm
            #     inputs[t] = tf.identity(inputs[t])

    self.final_state = state

    # [batch_size*steps, projected_size]
    inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])

    # Initialization ignores the fact that softmax_w is transposed.
    # That worked slightly better.
    softmax_w = sharded_variable("softmax_w",
                                 [hps.vocab_size, hps.projected_size],
                                 hps.num_shards)
    softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

    if hps.num_sampled == 0:
        full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                    [-1, hps.projected_size])
        full_softmax_w = full_softmax_w[:hps.vocab_size, :]

        # [batch_size*steps, vocab]
        logits = tf.matmul(inputs, full_softmax_w,
                           transpose_b=True) + softmax_b
        print("train log logits:", logits)
        # targets = tf.reshape(tf.transpose(self.y), [-1])
        # index = tf.argmax(logits, axis=1)
        # _, index = tf.nn.top_k(logits, hps.arg_max)
        # print "index:", index
        targets = tf.reshape(y, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets, logits=logits)
        # loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
        loss = tf.reduce_mean(loss)
        if mode == "predict_next":
            return loss, logits
    else:
        targets = tf.reshape(y, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, targets,
                                          tf.to_float(inputs),
                                          hps.num_sampled, hps.vocab_size)
        loss = tf.reduce_mean(loss)
    return loss
def _forward(self, gpu, x, y, w):
    hps = self.hps
    w = tf.to_float(w)
    self.initial_states = []
    for i in range(hps.num_layers):
        with tf.device('/gpu:%d' % gpu):
            v = tf.Variable(tf.zeros(
                [self.batch_size, hps.state_size + hps.projected_size]),
                            trainable=False,
                            collections=['initial_state'],
                            name='state_%d_%d' % (gpu, i),
                            validate_shape=False)
            self.initial_states += [v]

    emb_vars = sharded_variable('emb', [hps.vocab_size, hps.emb_size],
                                hps.num_shards)

    x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
    if hps.keep_prob < 1.0:
        x = tf.nn.dropout(x, hps.keep_prob)

    inputs = [
        tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, axis=1)
    ]

    for i in range(hps.num_layers):
        with tf.variable_scope('lstm_%d' % i):
            cell = LSTMCell(hps.state_size, hps.emb_size,
                            num_proj=hps.projected_size)

            state = self.initial_states[i]
            for t in range(hps.num_steps):
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            with tf.control_dependencies(
                    [self.initial_states[i].assign(state)]):
                inputs[t] = tf.identity(inputs[t])

    inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])
    tf.add_to_collection('hidden_output', inputs)

    # Initialization ignores the fact that softmax_w is transposed.
    # That worked slightly better.
    softmax_w = sharded_variable('softmax_w',
                                 [hps.vocab_size, hps.projected_size],
                                 hps.num_shards)
    softmax_b = tf.get_variable('softmax_b', [hps.vocab_size])

    full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                [-1, hps.projected_size])
    full_softmax_w = full_softmax_w[:hps.vocab_size, :]

    logits = (tf.matmul(inputs, full_softmax_w, transpose_b=True) +
              softmax_b)
    softmax = tf.nn.softmax(logits)
    tf.add_to_collection('softmax', softmax)

    if hps.num_sampled == 0:
        # targets = tf.reshape(tf.transpose(self.y), [-1])
        targets = tf.reshape(y, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets, logits=logits)
    else:
        targets = tf.reshape(y, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                          biases=softmax_b,
                                          labels=targets,
                                          inputs=tf.to_float(inputs),
                                          num_sampled=hps.num_sampled,
                                          num_classes=hps.vocab_size)

    loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
    return loss
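
# The two tf.add_to_collection calls above expose intermediate tensors so a caller
# can fetch them without threading extra return values through _forward, and the
# per-position weights `w` scale each position's contribution to the averaged loss
# (e.g., weight 0 drops padding). A hypothetical fetch at inference time; the
# session and feed names below are illustrative, not part of the original code.
def fetch_softmax_sketch(sess, feed):
    softmax = tf.get_collection('softmax')[0]        # [batch*steps, vocab_size]
    hidden = tf.get_collection('hidden_output')[0]   # [batch*steps, projected_size]
    return sess.run([softmax, hidden], feed_dict=feed)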