def sample(self, time, outputs, state):
    del state
    # Return -1s where we did not sample, and sample_ids elsewhere
    select_sample = bernoulli_sample(
        probs=self.sampling_probability,
        dtype=dtypes.bool,
        sample_shape=self.batch_size,
        seed=self.scheduling_seed)
    return array_ops.where(
        select_sample,
        categorical_sample(logits=outputs, seed=self.seed),
        gen_array_ops.fill([self.batch_size], -1))
def alphas(shape, alpha_value, name=None):
    """Creates a tensor with all elements set to `alpha_value`.

    This operation returns a tensor of type `dtype` with shape `shape` and all
    elements set to `alpha_value`.

    Parameters
    ----------
    shape : A list of integers, a tuple of integers, or a 1-D `Tensor` of type `int32`.
        The shape of the desired tensor.
    alpha_value : `float32`, `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`
        The value used to fill the resulting `Tensor`.
    name : str
        A name for the operation (optional).

    Returns
    -------
    A `Tensor` with all elements set to `alpha_value`.

    Examples
    --------
    >>> tl.alphas([2, 3], tf.int32)  # [[alpha, alpha, alpha], [alpha, alpha, alpha]]
    """
    with ops.name_scope(name, "alphas", [shape]) as name:
        alpha_tensor = convert_to_tensor(alpha_value)
        alpha_dtype = dtypes.as_dtype(alpha_tensor.dtype).base_dtype
        if not isinstance(shape, ops.Tensor):
            try:
                shape = constant_op._tensor_shape_tensor_conversion_function(
                    tensor_shape.TensorShape(shape))
            except (TypeError, ValueError):
                shape = ops.convert_to_tensor(shape, dtype=dtypes.int32)
        if not shape._shape_tuple():
            shape = reshape(shape, [-1])  # Ensure it's a vector
        try:
            output = constant(alpha_value, shape=shape, dtype=alpha_dtype, name=name)
        except (TypeError, ValueError):
            output = fill(shape, constant(alpha_value, dtype=alpha_dtype), name=name)
        if output.dtype.base_dtype != alpha_dtype:
            raise AssertionError("Dtypes do not correspond: %s and %s" %
                                 (output.dtype.base_dtype, alpha_dtype))
        return output
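For reference, a minimal sketch of the same static/dynamic fallback written with only public TensorFlow 1.x ops (this is an illustration under those assumptions, not the library function itself): with a statically known shape the value can be baked into a constant, while a shape tensor has to fall back to fill.

import tensorflow as tf

def alphas_public(shape, alpha_value, name=None):
    with tf.name_scope(name, "alphas", [shape]):
        try:
            # static shapes: the value is embedded directly in a constant
            return tf.constant(alpha_value, shape=shape, name=name)
        except (TypeError, ValueError):
            # dynamic shape tensor: build the tensor at run time
            return tf.fill(shape, alpha_value, name=name)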
def sample(self, time, outputs, state, name=None):
    with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                        [time, outputs, state]):
        # Return -1s where we did not sample, and sample_ids elsewhere
        select_sampler = bernoulli.Bernoulli(
            probs=self._sampling_probability, dtype=dtypes.bool)
        select_sample = select_sampler.sample(
            sample_shape=self.batch_size, seed=self._scheduling_seed)
        sample_id_sampler = categorical.Categorical(logits=outputs)
        return array_ops.where(
            select_sample,
            sample_id_sampler.sample(seed=self._seed),
            gen_array_ops.fill([self.batch_size], -1))
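The same selection pattern can be written with public TensorFlow 1.x ops only; a hedged sketch, not the contrib helper's implementation, with logits, sampling_probability, and batch_size assumed to be supplied by the caller:

import tensorflow as tf

def sample_or_minus_one(logits, sampling_probability, batch_size, seed=None):
    # Bernoulli coin flip per example: True means "feed back a sampled id"
    select = tf.distributions.Bernoulli(
        probs=sampling_probability, dtype=tf.bool).sample(batch_size)
    sampled = tf.distributions.Categorical(logits=logits).sample(seed=seed)
    # -1 marks positions that keep the ground-truth input
    return tf.where(select, tf.cast(sampled, tf.int32),
                    tf.fill([batch_size], -1))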
def _create_slots(self, var_list):
    for v in var_list:
        with ops.colocate_with(v):
            dtype = v.dtype.base_dtype
            if v.get_shape().is_fully_defined():
                init = init_ops.constant_initializer(
                    self._initial_accumulator_value, dtype=dtype)
            else:
                # Use a Tensor instead of an initializer if the variable does
                # not have a static shape.
                init_constant = gen_array_ops.fill(
                    array_ops.shape(v), self._initial_accumulator_value)
                init = math_ops.cast(init_constant, dtype)
            self._get_or_make_slot_with_initializer(
                v, init, v.get_shape(), dtype, "accumulator", self._name)
def _create_vars(self, var_list, state):
    for v in var_list:
        # TODO(isaprykin): Delete colocate_with(v) from other optimizers and
        # confirm that colocation will happen anyway.
        dtype = v.dtype.base_dtype
        if v.get_shape().is_fully_defined():
            init = init_ops.constant_initializer(
                self._initial_accumulator_value, dtype=dtype)
        else:
            # Use a Tensor instead of an initializer if the variable does not
            # have a static shape.
            init_constant = gen_array_ops.fill(
                array_ops.shape(v), self._initial_accumulator_value)
            init = math_ops.cast(init_constant, dtype)
        state.create_slot_with_initializer(
            v, init, v.get_shape(), dtype, "accumulator")
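To show what the "accumulator" slot created above is for, here is an illustrative AdaGrad-style dense update; the function name and signature are assumptions for the sketch, not the optimizer's actual _apply_dense:

import tensorflow as tf

def adagrad_like_update(var, grad, accumulator, learning_rate, epsilon=1e-10):
    # accumulate squared gradients in the slot, then scale the step by
    # the square root of the running sum
    new_acc = tf.assign_add(accumulator, tf.square(grad))
    step = learning_rate * grad / (tf.sqrt(new_acc) + epsilon)
    return tf.assign_sub(var, step)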
def get_next_input(inp, out):
    next_input = inp.read(time)
    if self._prenet is not None:
        next_input = self._prenet(next_input)
        out = self._prenet(out)
    if self._sampling_prob > 0.:
        next_input = tf.stop_gradient(next_input)
        out = tf.stop_gradient(out)
        select_sampler = bernoulli.Bernoulli(
            probs=self._sampling_prob, dtype=dtypes.bool)
        select_sample = select_sampler.sample(
            sample_shape=(self.batch_size, 1), seed=self._seed)
        select_sample = tf.tile(select_sample, [1, self._last_dim])
        sample_ids = array_ops.where(
            select_sample,
            out,
            gen_array_ops.fill([self.batch_size, self._last_dim],
                               tf.cast(-20., self._dtype)))
        where_sampling = math_ops.cast(
            array_ops.where(sample_ids > -20), dtypes.int32)
        where_not_sampling = math_ops.cast(
            array_ops.where(sample_ids <= -20), dtypes.int32)
        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
        inputs_not_sampling = array_ops.gather_nd(next_input,
                                                  where_not_sampling)
        sampled_next_inputs = sample_ids_sampling
        base_shape = array_ops.shape(next_input)
        next_input = (
            array_ops.scatter_nd(indices=where_sampling,
                                 updates=sampled_next_inputs,
                                 shape=base_shape)
            + array_ops.scatter_nd(indices=where_not_sampling,
                                   updates=inputs_not_sampling,
                                   shape=base_shape))
    return next_input
def sample(self, time, outputs, state, name=None):
    with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                        [time, outputs, state]):
        # Return -1s where we did not sample, and sample_ids elsewhere
        select_sampler = bernoulli.Bernoulli(
            probs=self._sampling_probability, dtype=dtypes.bool)
        select_sample = select_sampler.sample(
            sample_shape=self.batch_size, seed=self._scheduling_seed)
        # self.logs = tf.Print(select_sample, [select_sample])
        # sample_id_sampler = categorical.Categorical(logits=outputs)
        sample_ids = math_ops.cast(
            math_ops.argmax(outputs, axis=-1), dtypes.int32)
        # select_sample = tf.ones(shape=(self.batch_size,), dtype=dtypes.bool, name="test")
        return array_ops.where(
            select_sample,
            sample_ids,
            gen_array_ops.fill([self.batch_size], -1))
def scheduled_sampling(self, batch_size, sampling_probability, true, estimate):
    with variable_scope.variable_scope("ScheduledEmbedding"):
        # Return -1s where we do not sample, and sample_ids elsewhere
        select_sampler = bernoulli.Bernoulli(
            probs=sampling_probability, dtype=tf.bool)
        select_sample = select_sampler.sample(sample_shape=batch_size)
        sample_ids = array_ops.where(
            select_sample,
            tf.range(batch_size),
            gen_array_ops.fill([batch_size], -1))
        where_sampling = math_ops.cast(
            array_ops.where(sample_ids > -1), tf.int32)
        where_not_sampling = math_ops.cast(
            array_ops.where(sample_ids <= -1), tf.int32)
        _estimate = array_ops.gather_nd(estimate, where_sampling)
        _true = array_ops.gather_nd(true, where_not_sampling)
        base_shape = array_ops.shape(true)
        result1 = array_ops.scatter_nd(
            indices=where_sampling, updates=_estimate, shape=base_shape)
        result2 = array_ops.scatter_nd(
            indices=where_not_sampling, updates=_true, shape=base_shape)
        result = result1 + result2
        return result
def gen_crossentropy(y_true, y_pred, q=0.7, k=-1.0):
    # Keep only the predicted probabilities of the true classes ("y_true") in "y_pred"
    y_ok = array_ops.boolean_mask(y_pred, gen_math_ops.equal(y_true, 1))
    # Convert to float64 so the operations below use a consistent dtype
    um = np.float64(1.)
    q = np.float64(q)
    if k == -1:  # generalized cross-entropy loss
        # mean[ (1 - y_ok^q) / q ]
        return K.mean(
            math_ops.divide(
                math_ops.subtract(um, math_ops.pow(y_ok, q)), q),
            axis=-1)
    else:  # truncated generalized cross-entropy loss
        k = np.float64(k)
        # if y_ok <= k:  (1 - k^q) / q    (no broadcasting in where())
        # else:          (1 - y_ok^q) / q
        vfunct = array_ops.where(
            gen_math_ops.less_equal(y_ok, k),
            gen_array_ops.fill(array_ops.shape(y_ok), (um - k**q) / q),
            math_ops.divide(math_ops.subtract(um, math_ops.pow(y_ok, q)), q))
        return K.mean(vfunct, axis=-1)  # mean of the values above
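A small hedged check of the loss above, assuming the modules the snippet relies on (numpy as np, the Keras backend as K, and the tensorflow.python.ops modules) are already in scope; float64 inputs keep all dtypes consistent with the conversions inside the function:

import tensorflow as tf

y_true = tf.constant([[0., 1., 0.], [1., 0., 0.]], dtype=tf.float64)
y_pred = tf.constant([[0.2, 0.7, 0.1], [0.6, 0.3, 0.1]], dtype=tf.float64)
loss = gen_crossentropy(y_true, y_pred, q=0.7, k=-1.0)
with tf.Session() as sess:
    # scalar: mean over true-class probabilities p of (1 - p**q) / q
    print(sess.run(loss))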
def scheduled_sampling_vocab_dist(hps, sampling_probability, output, embedding, inp, alpha=0):
    # borrowed ideas from https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper

    def soft_argmax(alpha, output):
        # alpha_exp = tf.exp(alpha * output)  # (batch_size, vocab_size)
        # one_hot_scores = alpha_exp / tf.reshape(tf.reduce_sum(alpha_exp, axis=1), [-1, 1])  # (batch_size, vocab_size)
        one_hot_scores = tf.nn.softmax(alpha * output)
        return one_hot_scores

    def soft_top_k(alpha, output, K):
        copy = tf.identity(output)
        p = []
        arg_top_k = []
        for k in range(K):
            sargmax = soft_argmax(alpha, copy)
            copy = (1 - sargmax) * copy
            p.append(tf.reduce_sum(sargmax * output, axis=1))
            arg_top_k.append(sargmax)
        return tf.stack(p, axis=1), tf.stack(arg_top_k)

    with variable_scope.variable_scope("ScheduledEmbedding"):
        # Return -1s where we did not sample, and sample_ids elsewhere
        select_sampler = bernoulli.Bernoulli(probs=sampling_probability,
                                             dtype=tf.bool)
        select_sample = select_sampler.sample(sample_shape=hps.batch_size)
        # equal to argmax{ Multinomial(output, total_count=1) }, our greedy search selection
        sample_id_sampler = categorical.Categorical(probs=output)
        sample_ids = array_ops.where(
            select_sample,
            sample_id_sampler.sample(seed=123),
            gen_array_ops.fill([hps.batch_size], -1))

        where_sampling = math_ops.cast(array_ops.where(sample_ids > -1), tf.int32)
        where_not_sampling = math_ops.cast(array_ops.where(sample_ids <= -1), tf.int32)

        if hps.greedy_scheduled_sampling:
            sample_ids = tf.argmax(output, axis=1, output_type=tf.int32)

        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
        inputs_not_sampling = array_ops.gather_nd(inp, where_not_sampling)

        if hps.E2EBackProp:
            if hps.hard_argmax:
                greedy_search_prob, greedy_search_sample = tf.nn.top_k(
                    output, k=hps.k)  # (batch_size, k)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                greedy_embedding = tf.nn.embedding_lookup(embedding, greedy_search_sample)
                normalized_embedding = tf.multiply(
                    tf.reshape(greedy_search_prob_normalized,
                               [hps.batch_size, hps.k, 1]), greedy_embedding)
                e2e_embedding = tf.reduce_mean(normalized_embedding, axis=1)
            else:
                e = []
                greedy_search_prob, greedy_search_sample = soft_top_k(
                    alpha, output, K=hps.k)  # (batch_size, k), (k, batch_size, vocab_size)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                for _ in range(hps.k):
                    a_k = greedy_search_sample[_]
                    e_k = tf.matmul(
                        tf.reshape(greedy_search_prob_normalized[:, _], [-1, 1]) * a_k,
                        embedding)
                    e.append(e_k)
                e2e_embedding = tf.reduce_sum(e, axis=0)  # (batch_size, emb_dim)
            sampled_next_inputs = array_ops.gather_nd(e2e_embedding, where_sampling)
        else:
            if hps.hard_argmax:
                sampled_next_inputs = tf.nn.embedding_lookup(embedding, sample_ids_sampling)
            else:
                # using soft argmax (greedy) proposed in: https://arxiv.org/abs/1704.06970
                # alpha_exp = tf.exp(alpha * (output_not_extended + G))  # (batch_size, vocab_size)
                # one_hot_scores = alpha_exp / tf.reduce_sum(alpha_exp, axis=1)  # (batch_size, vocab_size)
                one_hot_scores = soft_argmax(alpha, output)  # (batch_size, vocab_size)
                soft_argmax_embedding = tf.matmul(one_hot_scores, embedding)  # (batch_size, emb_size)
                sampled_next_inputs = array_ops.gather_nd(soft_argmax_embedding, where_sampling)

        base_shape = array_ops.shape(inp)
        result1 = array_ops.scatter_nd(
            indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
        result2 = array_ops.scatter_nd(
            indices=where_not_sampling, updates=inputs_not_sampling, shape=base_shape)
        return result1 + result2
def scheduled_sampling(hps, sampling_probability, output, embedding, inp, alpha=0):
    """No teacher forcing: sample the decoder input for the current step, using either
    the right word or the generated word.

    Args:
        hps: model hyperparameters
        sampling_probability: probability of the sampler for the current step
        output: decoder output for the previous step, (batch_size, extended_vsize)
        embedding: model embedding, (vocab_size, embed_dim)
        inp: decoder inputs for the current step, (batch_size, embed_dim)
        alpha: soft argmax argument
    """
    # borrowed ideas from https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
    vocab_size = embedding.get_shape()[0]

    def soft_argmax(alpha, _output):
        """Soft argmax is differentiable: it turns argmax into a soft vector
        computed over the original vocab distribution and re-normalizes it.

        Args:
            alpha: soft argmax argument
            _output: decoder output for every step, (batch_size, extended_vsize)
        Returns:
            (batch_size, vocab_size)
        """
        # new_oov_scores, (batch_size, 1)
        new_oov_scores = tf.reshape(
            _output[:, 0] + tf.reduce_sum(_output[:, vocab_size:], axis=1),
            [-1, 1])  # add the score of every OOV to the UNK score
        # _output, (batch_size, vocab_size)
        _output = tf.concat([new_oov_scores, _output[:, 1:vocab_size]],
                            axis=1)  # select only the vocab_size outputs
        _output = _output / tf.reshape(tf.reduce_sum(output, axis=1),
                                       [-1, 1])  # re-normalize scores
        # alpha_exp = tf.exp(alpha * _output)  # (batch_size, vocab_size)
        # one_hot_scores = alpha_exp / tf.reshape(tf.reduce_sum(alpha_exp, axis=1), [-1, 1])  # (batch_size, vocab_size)
        one_hot_scores = tf.nn.softmax(alpha * _output)
        return one_hot_scores

    def soft_top_k(alpha, _output, K):
        """
        Args:
            alpha: same as for soft argmax
            _output: decoder output
        Returns:
            (batch_size, K), (batch_size, K, vocab_size)
        """
        copy = tf.identity(_output)
        p = []
        arg_top_k = []
        for k in range(K):
            sargmax = soft_argmax(alpha, copy)
            copy = (1 - sargmax) * copy
            p.append(tf.reduce_sum(sargmax * _output, axis=1))
            arg_top_k.append(sargmax)
        return tf.stack(p, axis=1), tf.stack(arg_top_k)

    with variable_scope.variable_scope("ScheduledEmbedding"):
        # Return -1s where we did not sample, and sample_ids elsewhere.
        # Decide whether to sample, and which word to sample, for every example
        # in the current batch.
        select_sampler = bernoulli.Bernoulli(probs=sampling_probability,
                                             dtype=tf.bool)
        select_sample = select_sampler.sample(sample_shape=hps.batch_size)
        # equal to argmax{ Multinomial(output, total_count=1) }, our greedy search selection
        sample_id_sampler = categorical.Categorical(probs=output)
        sample_ids = array_ops.where(
            select_sample,
            sample_id_sampler.sample(seed=123),
            gen_array_ops.fill([hps.batch_size], -1))  # sample_ids, (batch_size)

        where_sampling = math_ops.cast(array_ops.where(sample_ids > -1), tf.int32)
        where_not_sampling = math_ops.cast(array_ops.where(sample_ids <= -1), tf.int32)

        if hps.greedy_scheduled_sampling:
            # sample_ids (batch_size, 1)
            sample_ids = tf.argmax(output, axis=1, output_type=tf.int32)

        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
        cond = tf.less(sample_ids_sampling, vocab_size)  # replace OOV with UNK
        sample_ids_sampling = tf.cast(cond, tf.int32) * sample_ids_sampling
        inputs_not_sampling = array_ops.gather_nd(inp, where_not_sampling)

        if hps.E2EBackProp:
            if hps.hard_argmax:
                greedy_search_prob, greedy_search_sample = tf.nn.top_k(
                    output, k=hps.k)  # (batch_size, k)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                cond = tf.less(greedy_search_sample, vocab_size)  # replace OOV with UNK
                greedy_search_sample = tf.cast(cond, tf.int32) * greedy_search_sample
                greedy_embedding = tf.nn.embedding_lookup(embedding, greedy_search_sample)
                normalized_embedding = tf.multiply(
                    tf.reshape(greedy_search_prob_normalized,
                               [hps.batch_size, hps.k, 1]), greedy_embedding)
                e2e_embedding = tf.reduce_mean(normalized_embedding, axis=1)
            else:
                e = []
                greedy_search_prob, greedy_search_sample = soft_top_k(
                    alpha, output, K=hps.k)  # (batch_size, k), (k, batch_size, vocab_size)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                for _ in range(hps.k):
                    a_k = greedy_search_sample[_]
                    e_k = tf.matmul(
                        tf.reshape(greedy_search_prob_normalized[:, _], [-1, 1]) * a_k,
                        embedding)
                    e.append(e_k)
                e2e_embedding = tf.reduce_sum(e, axis=0)  # (batch_size, emb_dim)
            sampled_next_inputs = array_ops.gather_nd(e2e_embedding, where_sampling)
        else:
            if hps.hard_argmax:
                sampled_next_inputs = tf.nn.embedding_lookup(embedding, sample_ids_sampling)
            else:
                # using soft argmax (greedy) proposed in: https://arxiv.org/abs/1704.06970
                # alpha_exp = tf.exp(alpha * (output_not_extended + G))  # (batch_size, vocab_size)
                # one_hot_scores = alpha_exp / tf.reduce_sum(alpha_exp, axis=1)  # (batch_size, vocab_size)
                one_hot_scores = soft_argmax(alpha, output)  # (batch_size, vocab_size)
                soft_argmax_embedding = tf.matmul(one_hot_scores, embedding)  # (batch_size, emb_size)
                sampled_next_inputs = array_ops.gather_nd(soft_argmax_embedding, where_sampling)

        base_shape = array_ops.shape(inp)
        result1 = array_ops.scatter_nd(
            indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
        result2 = array_ops.scatter_nd(
            indices=where_not_sampling, updates=inputs_not_sampling, shape=base_shape)
        return result1 + result2
def scheduled_sampling(hps, sampling_probability, output, embedding, inp):
    vocab_size = embedding.get_shape()[0].value
    with variable_scope.variable_scope("ScheduleEmbedding"):
        select_sampler = bernoulli.Bernoulli(probs=sampling_probability,
                                             dtype=tf.bool)
        select_sample = select_sampler.sample(sample_shape=hps.batch_size)
        sample_id_sampler = categorical.Categorical(probs=output)
        sample_ids = array_ops.where(
            select_sample,
            sample_id_sampler.sample(seed=123),
            gen_array_ops.fill([hps.batch_size], -1))
        where_sampling = math_ops.cast(array_ops.where(sample_ids > -1), tf.int32)
        where_not_sampling = math_ops.cast(array_ops.where(sample_ids <= -1), tf.int32)
        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
        cond = tf.less(sample_ids_sampling, vocab_size)
        sample_ids_sampling = tf.cast(cond, tf.int32) * sample_ids_sampling
        inputs_not_sampling = array_ops.gather_nd(inp, where_not_sampling)
        sampling_next_inputs = tf.nn.embedding_lookup(embedding, sample_ids_sampling)
        result1 = array_ops.scatter_nd(indices=where_sampling,
                                       updates=sampling_next_inputs,
                                       shape=array_ops.shape(inp))
        result2 = array_ops.scatter_nd(indices=where_not_sampling,
                                       updates=inputs_not_sampling,
                                       shape=array_ops.shape(inp))
        return result1 + result2
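The gather_nd/scatter_nd mixing step shared by the functions above can be isolated into a small public-API sketch; a hedged illustration of the pattern, not any of the implementations above, with inp and sampled_emb assumed to be (batch, emb) tensors and sample_ids a (batch,) int tensor holding -1 where no sample was drawn:

import tensorflow as tf

def mix_inputs(inp, sampled_emb, sample_ids):
    where_s = tf.cast(tf.where(sample_ids > -1), tf.int32)   # rows that sampled
    where_n = tf.cast(tf.where(sample_ids <= -1), tf.int32)  # rows kept as-is
    base_shape = tf.shape(inp)
    # scatter the two disjoint row sets back into one (batch, emb) tensor
    return (tf.scatter_nd(where_s, tf.gather_nd(sampled_emb, where_s), base_shape) +
            tf.scatter_nd(where_n, tf.gather_nd(inp, where_n), base_shape))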
def init():
    # Use a Tensor instead of an initializer if the variable does not have
    # a static shape.
    init_constant = gen_array_ops.fill(array_ops.shape(v),
                                       self._initial_accumulator_value)
    return math_ops.cast(init_constant, dtype)
def init(v=v, dtype=dtype):
    # Use a Tensor instead of an initializer if the variable does not have
    # a static shape.
    init_constant = gen_array_ops.fill(
        array_ops.shape(v), self._initial_accumulator_value)
    return math_ops.cast(init_constant, dtype)
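Why init is a closure: slot creation can defer building the fill op until the slot variable is actually constructed. A hedged sketch of a consumer of such a callable (the create_slot helper here is illustrative, not the optimizer's real slot API):

import tensorflow as tf

def create_slot(v, init, name):
    # evaluate the deferred initializer only at creation time
    initial = init() if callable(init) else init
    return tf.Variable(initial, trainable=False, name=name)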
def scheduled_sampling(hps, sampling_probability, output, embedding, inp, alpha=0):
    # borrowed ideas from https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
    vocab_size = embedding.get_shape()[0].value

    def soft_argmax(alpha, output):
        alpha_exp = tf.exp(alpha * output)  # (batch_size, vocab_size)
        one_hot_scores = alpha_exp / tf.reshape(
            tf.reduce_sum(alpha_exp, axis=1), [-1, 1])  # (batch_size, vocab_size)
        return one_hot_scores

    def soft_top_k(alpha, output, K):
        copy = tf.identity(output)
        p = []
        arg_top_k = []
        for k in range(K):
            sargmax = soft_argmax(alpha, copy)
            copy = (1 - sargmax) * copy
            p.append(tf.reduce_sum(sargmax * output, axis=1))
            # replace OOV with UNK if necessary
            mask = tf.equal(tf.reduce_max(sargmax, axis=1),
                            tf.reduce_max(sargmax[:, 0:vocab_size], axis=1))
            sargmax_truncated = tf.where(
                mask,
                sargmax[:, 0:vocab_size],
                tf.stack([tf.one_hot(0, vocab_size) for _ in range(hps.batch_size)]))
            arg_top_k.append(sargmax_truncated)
        return p, tf.stack(arg_top_k)

    with variable_scope.variable_scope("ScheduledEmbedding"):
        # Return -1s where we did not sample, and sample_ids elsewhere
        select_sampler = bernoulli.Bernoulli(probs=sampling_probability,
                                             dtype=tf.bool)
        select_sample = select_sampler.sample(sample_shape=hps.batch_size)
        # equal to argmax{ Multinomial(output, total_count=1) }, our greedy search selection
        sample_id_sampler = categorical.Categorical(probs=output)
        sample_ids = array_ops.where(
            select_sample,
            sample_id_sampler.sample(seed=123),
            gen_array_ops.fill([hps.batch_size], -1))

        where_sampling = math_ops.cast(array_ops.where(sample_ids > -1), tf.int32)
        where_not_sampling = math_ops.cast(array_ops.where(sample_ids <= -1), tf.int32)

        if hps.greedy_scheduled_sampling:
            sample_ids = tf.argmax(output, axis=1, output_type=tf.int32)

        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
        cond = tf.less(sample_ids_sampling, vocab_size)  # replace OOV with UNK
        sample_ids_sampling = tf.cast(cond, tf.int32) * sample_ids_sampling
        inputs_not_sampling = array_ops.gather_nd(inp, where_not_sampling)

        if hps.E2EBackProp:
            if hps.hard_argmax:
                greedy_search_prob, greedy_search_sample = tf.nn.top_k(
                    output, k=hps.k)  # (batch_size, k)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                cond = tf.less(greedy_search_sample, vocab_size)  # replace OOV with UNK
                greedy_search_sample = tf.cast(cond, tf.int32) * greedy_search_sample
                greedy_embedding = tf.nn.embedding_lookup(embedding, greedy_search_sample)
                normalized_embedding = tf.multiply(
                    tf.reshape(greedy_search_prob_normalized,
                               [hps.batch_size, hps.k, 1]), greedy_embedding)
                e2e_embedding = tf.reduce_sum(normalized_embedding, axis=1)
            else:
                e = []
                greedy_search_prob, greedy_search_sample = soft_top_k(
                    alpha, output, K=hps.k)  # (batch_size, k), (k, vocab_size)
                greedy_search_prob_normalized = greedy_search_prob / tf.reshape(
                    tf.reduce_sum(greedy_search_prob, axis=1), [-1, 1])
                for _ in range(hps.k):
                    a_k = greedy_search_sample[_]
                    e_k = tf.matmul(
                        tf.reshape(greedy_search_prob_normalized[:, _], [-1, 1]) * a_k,
                        embedding)
                    e.append(e_k)
                e2e_embedding = tf.reduce_sum(e, axis=0)  # (batch_size, emb_dim)
            sampled_next_inputs = array_ops.gather_nd(e2e_embedding, where_sampling)
        else:
            if hps.hard_argmax:
                sampled_next_inputs = tf.nn.embedding_lookup(embedding, sample_ids_sampling)
            else:
                # using soft argmax (greedy) proposed in: https://arxiv.org/abs/1704.06970
                if not hps.greedy_scheduled_sampling:
                    # Gumbel reparametrization trick: https://arxiv.org/abs/1704.06970
                    U = tf.random_uniform((hps.batch_size, vocab_size),
                                          10e-12, (1 - 10e-12))  # add a small number to avoid log(0)
                    G = -tf.log(-tf.log(U))
                else:
                    G = tf.zeros((hps.batch_size, vocab_size))
                # alpha_exp = tf.exp(alpha * (output_not_extended + G))  # (batch_size, vocab_size)
                # one_hot_scores = alpha_exp / tf.reduce_sum(alpha_exp, axis=1)  # (batch_size, vocab_size)
                one_hot_scores = soft_argmax(alpha, (output + G))  # (batch_size, vocab_size)
                sampled_next_inputs = tf.matmul(one_hot_scores, embedding)  # (batch_size, emb_size)

        base_shape = array_ops.shape(inp)
        result1 = array_ops.scatter_nd(
            indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
        result2 = array_ops.scatter_nd(
            indices=where_not_sampling, updates=inputs_not_sampling, shape=base_shape)
        return result1 + result2
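The Gumbel plus soft-argmax step used above can also be written as a standalone sketch with public TensorFlow 1.x ops; alpha acts as an inverse temperature, and the result approximates one-hot(argmax(logits)) while staying differentiable. Note this is a hedged illustration (the standard trick adds Gumbel noise to log-probabilities), not the code above verbatim:

import tensorflow as tf

def gumbel_soft_argmax(logits, alpha, seed=None):
    eps = 1e-10
    u = tf.random_uniform(tf.shape(logits), eps, 1.0 - eps, seed=seed)
    g = -tf.log(-tf.log(u))                      # Gumbel(0, 1) noise
    return tf.nn.softmax(alpha * (logits + g))   # near one-hot as alpha grows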