def testFuseResizeAndConv(self):
  with self.cached_session() as sess:
    inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
    input_op = constant_op.constant(
        np.array(inputs), shape=[1, 2, 3, 2], dtype=dtypes.float32)
    resize_op = image_ops.resize_bilinear(
        input_op, [12, 4], align_corners=False)
    weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
    weights_op = constant_op.constant(
        np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
    nn_ops.conv2d(
        resize_op, weights_op, [1, 1, 1, 1], padding="VALID", name="output")
    original_graph_def = sess.graph_def
    original_result = sess.run(["output:0"])
  optimized_graph_def = optimize_for_inference_lib.fuse_resize_and_conv(
      original_graph_def, ["output"])

  with self.cached_session() as sess:
    _ = importer.import_graph_def(
        optimized_graph_def, input_map={}, name="optimized")
    optimized_result = sess.run(["optimized/output:0"])

  self.assertAllClose(original_result, optimized_result)

  for node in optimized_graph_def.node:
    self.assertNotEqual("Conv2D", node.op)
    self.assertNotEqual("MirrorPad", node.op)
def _test_convolution(tensor_in_sizes, filter_in_sizes, dilations, strides,
                      padding, data_format):
  """One iteration of convolution with given shapes and attributes."""
  total_size_1 = 1
  total_size_2 = 1
  for s in tensor_in_sizes:
    total_size_1 *= s
  for s in filter_in_sizes:
    total_size_2 *= s
  # Initializes the input tensor with array containing incrementing
  # numbers from 1.
  data_array = [f * 1.0 for f in range(1, total_size_1 + 1)]
  filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)]

  with tf.Graph().as_default():
    in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
    in_filter = constant_op.constant(
        filter_array, shape=filter_in_sizes, dtype='float32')
    strides = [1] + strides + [1]
    dilations = [1] + dilations + [1]

    nn_ops.conv2d(in_data, in_filter,
                  strides=strides, padding=padding,
                  data_format=data_format)

    compare_tf_with_tvm(
        np.reshape(data_array, tensor_in_sizes).astype('float32'),
        'Placeholder:0', 'Conv2D:0')
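# Hedged usage sketch for the helper above: the shape/stride/dilation
# combinations below are illustrative assumptions, not cases copied from the
# original test suite, and they only run where compare_tf_with_tvm and the
# surrounding TVM test harness are available.
def _example_convolution_cases():
  _test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1],
                    'SAME', 'NHWC')
  _test_convolution([4, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2],
                    'VALID', 'NHWC')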
def testFusePadAndConv(self):
  with self.cached_session() as sess:
    inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
    input_op = constant_op.constant(
        np.array(inputs), shape=[1, 2, 3, 2], dtype=dtypes.float32)
    pad_op = array_ops.pad(input_op, [[0, 0], [1, 1], [2, 2], [0, 0]],
                           mode="REFLECT")
    weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
    weights_op = constant_op.constant(
        np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
    nn_ops.conv2d(
        pad_op, weights_op, [1, 1, 1, 1], padding="VALID", name="output")
    original_graph_def = sess.graph_def
    original_result = sess.run(["output:0"])
  optimized_graph_def = optimize_for_inference_lib.fuse_resize_and_conv(
      original_graph_def, ["output"])

  with self.cached_session() as sess:
    _ = importer.import_graph_def(
        optimized_graph_def, input_map={}, name="optimized")
    optimized_result = sess.run(["optimized/output:0"])

  self.assertAllClose(original_result, optimized_result)

  for node in optimized_graph_def.node:
    self.assertNotEqual("Conv2D", node.op)
    self.assertNotEqual("ResizeBilinear", node.op)
def build_graph(device, input_shape, filter_shape, strides, padding,
                num_iters):
  """builds a graph containing a sequence of conv2d operations.

  Args:
    device: String, the device to run on.
    input_shape: Shape of the input tensor.
    filter_shape: Shape of the filter tensor.
    strides: A list of ints. 1-D of length 4. The stride of sliding window for
      each dimension of input.
    padding: A string from: "SAME", "VALID". The type of padding algorithm to
      use.
    num_iters: number of iterations to run conv2d.

  Returns:
    An array of tensors to run()
  """
  with ops.device("/%s:0" % device):
    inp = variables.Variable(random_ops.truncated_normal(input_shape))
    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
    outputs = []
    conv2d_op = nn_ops.conv2d(inp, filt, strides, padding, data_format="NHWC")
    outputs.append(conv2d_op)
    for _ in range(1, num_iters):
      with ops.control_dependencies([conv2d_op]):
        conv2d_op = nn_ops.conv2d(
            inp, filt, strides, padding, data_format="NHWC")
        outputs.append(conv2d_op)
    return control_flow_ops.group(*outputs)
def testAtrousSequence(self):
  """Tests optimization of sequence of atrous convolutions.

  Verifies that a sequence of `atrous_conv2d` operations with identical `rate`
  parameters, 'SAME' `padding`, and `filters` with odd heights/widths:

      net = atrous_conv2d(net, filters1, rate, padding="SAME")
      net = atrous_conv2d(net, filters2, rate, padding="SAME")
      ...
      net = atrous_conv2d(net, filtersK, rate, padding="SAME")

  is equivalent to:

      pad = ...  # padding so that the input dims are multiples of rate
      net = space_to_batch(net, paddings=pad, block_size=rate)
      net = conv2d(net, filters1, strides=[1, 1, 1, 1], padding="SAME")
      net = conv2d(net, filters2, strides=[1, 1, 1, 1], padding="SAME")
      ...
      net = conv2d(net, filtersK, strides=[1, 1, 1, 1], padding="SAME")
      net = batch_to_space(net, crops=pad, block_size=rate)
  """
  padding = "SAME"  # The padding needs to be "SAME"
  np.random.seed(1)  # Make it reproducible.
  with self.session(use_gpu=True):
    # Input: [batch, height, width, input_depth]
    for height in range(15, 17):
      for width in range(15, 17):
        x_shape = [3, height, width, 2]
        x = np.random.random_sample(x_shape).astype(np.float32)

        for kernel in [1, 3, 5]:  # The kernel size needs to be odd.
          # Filter: [kernel_height, kernel_width, input_depth, output_depth]
          f_shape = [kernel, kernel, 2, 2]
          f = 1e-2 * np.random.random_sample(f_shape).astype(np.float32)

          for rate in range(2, 4):
            # y1: three atrous_conv2d in a row.
            y1 = nn_ops.atrous_conv2d(x, f, rate, padding=padding)
            y1 = nn_ops.atrous_conv2d(y1, f, rate, padding=padding)
            y1 = nn_ops.atrous_conv2d(y1, f, rate, padding=padding)
            # y2: space_to_batch, three conv2d in a row, batch_to_space
            pad_bottom = 0 if height % rate == 0 else rate - height % rate
            pad_right = 0 if width % rate == 0 else rate - width % rate
            pad = [[0, pad_bottom], [0, pad_right]]
            y2 = array_ops.space_to_batch(x, paddings=pad, block_size=rate)
            y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
            y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
            y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
            y2 = array_ops.batch_to_space(y2, crops=pad, block_size=rate)
            self.assertAllClose(
                y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
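# A quick companion check of the equivalence described in the docstring above:
# a single atrous_conv2d with 'SAME' padding also matches a plain conv2d with
# the same filter and a matching `dilations` argument.  This is a hedged
# sketch assuming TF 2.x eager execution, independent of the test above.
def _example_atrous_equals_dilated_conv2d():
  import numpy as np
  import tensorflow as tf

  x = tf.constant(np.random.rand(1, 16, 16, 2).astype(np.float32))
  f = tf.constant(np.random.rand(3, 3, 2, 2).astype(np.float32))
  rate = 2
  a = tf.nn.atrous_conv2d(x, f, rate, padding="SAME")
  b = tf.nn.conv2d(x, f, strides=[1, 1, 1, 1], padding="SAME",
                   dilations=[1, rate, rate, 1])
  np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-4, atol=1e-4)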
def _BuildSmallModel(self):
  image = array_ops.zeros([2, 6, 6, 3])
  kernel = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
  kernel = variable_scope.get_variable(
      'DW2', [2, 2, 6, 12],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
  return x
def testExtractPointwiseConv2dPatches(self): with ops.Graph().as_default(), self.test_session() as sess: batch_size = 10 image_height = image_width = 8 in_channels = out_channels = 3 kernel_height = kernel_width = 1 strides = [1, 1, 1, 1] padding = 'VALID' images = random_ops.random_uniform( [batch_size, image_height, image_width, in_channels], seed=0) kernel_shape = [kernel_height, kernel_width, in_channels, out_channels] kernel = random_ops.random_uniform(kernel_shape, seed=1) # Ensure shape matches expectation. patches = utils.extract_pointwise_conv2d_patches(images, kernel_shape) self.assertEqual(patches.shape.as_list(), [ batch_size, image_height, image_width, kernel_height, kernel_width, in_channels ]) # Ensure extract...patches() + matmul() and conv2d() implementation # give the same answer. outputs = nn_ops.conv2d(images, kernel, strides, padding) patches_flat = array_ops.reshape( patches, [-1, kernel_height * kernel_width * in_channels]) kernel_flat = array_ops.reshape(kernel, [-1, out_channels]) outputs_flat = math_ops.matmul(patches_flat, kernel_flat) outputs_, outputs_flat_ = sess.run([outputs, outputs_flat]) self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
def _tf_enc_attention_decoder(self, attention_states, last_enc_state, cell, num_heads=1, dtype=dtypes.float32, scope=None): """RNN decoder with attention for the sequence-to-sequence model. Args: return_encodings: If true, return encoder hidden states. Otherwise, return single step decoding tensors """ if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) with variable_scope.variable_scope(scope or "attention_decoder"): attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) # Hidden states multiplied with W1 v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) return [last_enc_state] + [hidden] + hidden_features + v
def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel, padding, strides, side_input_scale, side_input, biases): """Simulates the int8 fused 2-D convolution op using separate float ops. The arguments and return values have the same format, meanings and restrictions as the actual op. Args: conv_input_scale: A scalar 'float'. conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout. kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout. padding: A `string` from: `"SAME", "VALID"`. strides: A list of `ints`. side_input_scale: A scalar 'float'. side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout. biases: A `Tensor` of type `float32` in NCHW layout. Returns: A `Tensor` of type `qint8` in NCHW_VECT_C layout. """ conv_result = nn_ops.conv2d( NchwVectCToNchw(gen_array_ops.dequantize(conv_input, -128, 127)), OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127)), strides=strides, padding=padding, data_format="NCHW") * conv_input_scale conv_and_side_inputs = conv_result + side_input_scale * NchwVectCToNchw( gen_array_ops.dequantize(side_input, -128, 127)) logit = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW") result, _, _ = gen_array_ops.quantize_v2( NchwToNchwVectC(nn_ops.relu(logit)), -128, 127, dtypes.qint8) return result
def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor): """Clones layer_op with input_tensor and weight_tensor as new inputs.""" new_layer_name = layer_op.name.split('/')[-1] + '_Fold' if layer_op.type == 'Conv2D': return nn_ops.conv2d( input_tensor, weight_tensor, strides=layer_op.get_attr('strides'), padding=layer_op.get_attr('padding'), use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'), data_format=layer_op.get_attr('data_format'), name=new_layer_name) elif layer_op.type == 'MatMul': return math_ops.matmul( input_tensor, weight_tensor, transpose_a=layer_op.get_attr('transpose_a'), transpose_b=layer_op.get_attr('transpose_b'), name=new_layer_name) elif layer_op.type == 'DepthwiseConv2dNative': return nn.depthwise_conv2d( input_tensor, weight_tensor, strides=layer_op.get_attr('strides'), padding=layer_op.get_attr('padding'), name=new_layer_name) else: raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
def _strict_conv1d(x, h):
  """Return x * h for rank 1 tensors x and h."""
  with ops.op_scope([x, h], 'strict_conv1d'):
    x = array_ops.reshape(x, (1, -1, 1, 1))
    h = array_ops.reshape(h, (-1, 1, 1, 1))
    result = nn_ops.conv2d(x, h, [1, 1, 1, 1], 'SAME')
    return array_ops.reshape(result, [-1])
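# The reshape trick above maps 1-D filtering onto conv2d: the signal becomes
# an NHWC tensor with width = channels = 1 and the kernel becomes an
# [filter_height, 1, 1, 1] filter.  A minimal standalone sketch (assumes
# TF 2.x eager execution); note that conv2d computes cross-correlation, so for
# the symmetric odd-length kernel below the result lines up with
# np.correlate(..., mode='same').
def _example_conv1d_via_conv2d():
  import numpy as np
  import tensorflow as tf

  x = np.array([1., 2., 3., 4., 5.], dtype=np.float32)
  h = np.array([0.25, 0.5, 0.25], dtype=np.float32)
  x4 = tf.reshape(tf.constant(x), (1, -1, 1, 1))   # NHWC, W = C = 1
  h4 = tf.reshape(tf.constant(h), (-1, 1, 1, 1))   # filter: H, W, InC, OutC
  out = tf.reshape(
      tf.nn.conv2d(x4, h4, strides=[1, 1, 1, 1], padding="SAME"), [-1])
  np.testing.assert_allclose(out.numpy(),
                             np.correlate(x, h, mode="same"), rtol=1e-6)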
def _VerifyValues(self, input_sizes=None, filter_sizes=None, strides=None, dilations=None, padding=None, data_format_src="NHWC", data_format_dst="NHWC", expected=None): """Tests that tf.nn.conv2d produces the expected value. Args: input_sizes: Input tensor dimensions in [batch, input_rows, input_cols, input_depth]. filter_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols, input_depth, output_depth]. strides: Strides. dilations: RHS dilations. padding: Padding type. data_format_src: Data format input is in. data_format_dst: Data format verification will run and input is converted to. expected: Expected output. """ total_size_1 = np.prod(input_sizes) total_size_2 = np.prod(filter_sizes) x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes) x2 = np.arange(1, total_size_2 + 1, dtype=np.float32).reshape(filter_sizes) strides = [1] + strides + [1] if dilations is None: dilations = [1, 1] dilations = [1] + dilations + [1] # Convert between data formats. expected = test_utils.ConvertBetweenDataFormats(expected, data_format_src, data_format_dst) x1 = test_utils.ConvertBetweenDataFormats(x1, data_format_src, data_format_dst) input_sizes = test_utils.PermuteDimsBetweenDataFormats( input_sizes, data_format_src, data_format_dst) strides = test_utils.PermuteDimsBetweenDataFormats(strides, data_format_src, data_format_dst) dilations = test_utils.PermuteDimsBetweenDataFormats( dilations, data_format_src, data_format_dst) with self.test_session() as sess: t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes) t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes) with self.test_scope(): out = nn_ops.conv2d( t1, t2, strides=strides, padding=padding, data_format=data_format_dst, dilations=dilations) value = sess.run(out, {t1: x1, t2: x2}) self.assertAllClose(expected, value, 1e-3)
def testSmallNetwork(self):
  image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
  label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
  w = variables.Variable(
      random_ops.truncated_normal([5, 5, 1, 32], stddev=0.1))
  b = variables.Variable(random_ops.truncated_normal([32], stddev=0.1))
  conv = nn_ops.conv2d(image, w, strides=[1, 1, 1, 1], padding="SAME")
  h_conv = nn_ops.relu(conv + b)
  h_conv_flat = array_ops.reshape(h_conv, [1, -1])

  w_fc = variables.Variable(
      random_ops.truncated_normal([25088, 10], stddev=0.1))
  b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
  y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)

  cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
      label * math_ops.log(y_conv), reduction_indices=[1]))
  _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)

  mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
  report = cost_analyzer.GenerateCostReport(mg)

  self.assertTrue(b"MatMul" in report)
  self.assertTrue(b"ApplyAdam" in report)
  self.assertTrue(b"Conv2D" in report)
  self.assertTrue(b"Conv2DBackpropInput" in report)
  self.assertTrue(b"Conv2DBackpropFilter" in report)
  self.assertTrue(b"Softmax" in report)

  # Also print the report to make it easier to debug
  print("{}".format(report))
def ReferenceDepthwiseConv2D(input_tensor, filter_tensor, strides, padding,
                             data_format=None):
  # Reference implementation of depthwise convolution that uses regular
  # convolution.
  convs = []
  in_channels = filter_tensor.shape[2]
  # Use a custom implementation of depthwise conv2d using slicing.
  for channel in xrange(in_channels):
    # Slice the input along channel
    if data_format == "NCHW":
      input_slice = input_tensor[:, channel:channel+1, :, :]
    else:
      input_slice = input_tensor[:, :, :, channel:channel+1]

    # Slice the filters. Filters are H, W, InC, DepthMultiplier
    filter_slice = filter_tensor[:, :, channel:channel+1, :]
    # Do conv
    convs.append(nn_ops.conv2d(input_slice, filter_slice,
                               strides, padding,
                               data_format=data_format,
                               name="depthwise_slice_%d" % channel))

  # Concat along dimension.
  if data_format == "NCHW":
    return array_ops.concat(convs, 1)
  else:
    return array_ops.concat(convs, 3)
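# The per-channel slicing above is exactly how a depthwise convolution
# decomposes into ordinary convolutions.  A minimal self-contained check of
# that decomposition (assumes TF 2.x eager execution and uses tf.nn directly
# rather than the TF 1.x-internal helper above).
def _example_depthwise_as_sliced_conv2d():
  import numpy as np
  import tensorflow as tf

  x = tf.constant(np.random.rand(1, 8, 8, 3).astype(np.float32))  # NHWC input
  f = tf.constant(np.random.rand(3, 3, 3, 2).astype(np.float32))  # H, W, InC, multiplier

  # Per-channel conv2d slices, concatenated along channels ...
  slices = [
      tf.nn.conv2d(x[:, :, :, c:c + 1], f[:, :, c:c + 1, :],
                   strides=[1, 1, 1, 1], padding="SAME")
      for c in range(x.shape[3])
  ]
  manual = tf.concat(slices, axis=3)

  # ... match the fused depthwise op, channel ordering included.
  fused = tf.nn.depthwise_conv2d(x, f, strides=[1, 1, 1, 1], padding="SAME")
  np.testing.assert_allclose(manual.numpy(), fused.numpy(),
                             rtol=1e-5, atol=1e-5)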
def _VerifyValues(self, input_sizes, filter_sizes, stride, padding, expected):
  """Tests that tf.nn.conv2d produces the expected value.

  Args:
    input_sizes: Input tensor dimensions in
      [batch, input_rows, input_cols, input_depth].
    filter_sizes: Filter tensor dimensions in
      [kernel_rows, kernel_cols, input_depth, output_depth].
    stride: Stride.
    padding: Padding type.
    expected: Expected output.
  """
  total_size_1 = np.prod(input_sizes)
  total_size_2 = np.prod(filter_sizes)
  x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes)
  x2 = np.arange(1, total_size_2 + 1, dtype=np.float32).reshape(filter_sizes)
  strides = [1, stride, stride, 1]

  with self.test_session() as sess:
    with self.test_scope():
      t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
      t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
      out = nn_ops.conv2d(
          t1, t2, strides=strides, padding=padding, data_format="NHWC")
    value = sess.run(out, {t1: x1, t2: x2})
  self.assertArrayNear(expected, np.ravel(value), 1e-3)
def testGradientDilatedConv(self): if test.is_gpu_available(cuda_only=True): with self.test_session(use_gpu=True): for padding in ["SAME", "VALID"]: for stride in [1, 2]: np.random.seed(1) in_shape = [5, 8, 6, 4] in_val = constant_op.constant( 2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32) filter_shape = [3, 3, 4, 6] # Make a convolution op with the current settings, # just to easily get the shape of the output. conv_out = nn_ops.conv2d( in_val, array_ops.zeros(filter_shape), dilations=[1, 2, 2, 1], strides=[1, stride, stride, 1], padding=padding) out_backprop_shape = conv_out.get_shape().as_list() out_backprop_val = constant_op.constant( 2 * np.random.random_sample(out_backprop_shape) - 1, dtype=dtypes.float32) output = nn_ops.conv2d_backprop_filter( in_val, filter_shape, out_backprop_val, dilations=[1, 2, 2, 1], strides=[1, stride, stride, 1], padding=padding) err = gradient_checker.compute_gradient_error( [in_val, out_backprop_val], [in_shape, out_backprop_shape], output, filter_shape) print("conv2d_backprop_filter gradient err = %g " % err) err_tolerance = 2e-3 self.assertLess(err, err_tolerance)
def _Conv2DBackpropInputGrad(op, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t. the input and the filter
  """
  return [
      None,
      nn_ops.conv2d_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()),
      nn_ops.conv2d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]
def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, padding, name=None): """2-D convolution with separable filters. Performs a depthwise convolution that acts separately on channels followed by a pointwise convolution that mixes channels. Note that this is separability between dimensions `[1, 2]` and `3`, not spatial separability between dimensions `1` and `2`. In detail, output[b, i, j, k] = sum_{di, dj, q, r] input[b, strides[1] * i + di, strides[2] * j + dj, q] * depthwise_filter[di, dj, q, r] * pointwise_filter[0, 0, q * channel_multiplier + r, k] `strides` controls the strides for the depthwise convolution only, since the pointwise convolution has implicit strides of `[1, 1, 1, 1]`. Must have `strides[0] = strides[3] = 1`. For the most common case of the same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. Args: input: 4-D `Tensor` with shape `[batch, in_height, in_width, in_channels]`. depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width, in_channels, channel_multiplier]`. Contains `in_channels` convolutional filters of depth 1. pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier * in_channels, out_channels]`. Pointwise filter to mix channels after `depthwise_filter` has convolved spatially. strides: 1-D of size 4. The strides for the depthwise convolution for each dimension of `input`. padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. name: A name for this operation (optional). Returns: A 4-D `Tensor` of shape `[batch, out_height, out_width, out_channels]`. """ with ops.op_scope([input, depthwise_filter, pointwise_filter], name, "separable_conv2d") as name: input = ops.convert_to_tensor(input, name="tensor_in") depthwise_filter = ops.convert_to_tensor(depthwise_filter, name="depthwise_filter") pointwise_filter = ops.convert_to_tensor(pointwise_filter, name="pointwise_filter") if pointwise_filter.get_shape().ndims is not None: assert len(pointwise_filter.get_shape()) == 4 assert pointwise_filter.get_shape()[0] == 1 assert pointwise_filter.get_shape()[1] == 1 if depthwise_filter.get_shape().ndims and input.get_shape().ndims: channel_multiplier = depthwise_filter.get_shape()[3] in_channels = input.get_shape()[3] out_channels = pointwise_filter.get_shape()[3] # This would mean the separable convolutions is over-parametrized. assert channel_multiplier * in_channels < out_channels # The layout of the ops in the graph are expected to be as follows: # separable_conv2d // Conv2D op corresponding to the pointwise conv. # separable_conv2d/depthwise // Concat op for the deptwise outputs. # separable_conv2d/depthwise/depth0 // Conv2D op for depth 0 # separable_conv2d/depthwise/depth1 // Conv2D op for depth 1 # separable_conv2d/depthwise/depth2 // Conv2D op for depth 2 depthwise = depthwise_conv2d(input, depthwise_filter, strides, padding, name="depthwise") return nn_ops.conv2d(depthwise, pointwise_filter, [1, 1, 1, 1], padding="VALID", name=name)
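# The docstring above describes separable convolution as a depthwise stage
# followed by a 1x1 pointwise stage.  A minimal sketch of that decomposition
# (assumes TF 2.x eager execution and the public tf.nn API rather than the
# internal ops used above).
def _example_separable_conv2d_decomposition():
  import numpy as np
  import tensorflow as tf

  x = tf.constant(np.random.rand(1, 8, 8, 4).astype(np.float32))        # NHWC
  dw = tf.constant(np.random.rand(3, 3, 4, 2).astype(np.float32))       # depthwise: H, W, InC, mult
  pw = tf.constant(np.random.rand(1, 1, 4 * 2, 16).astype(np.float32))  # pointwise: 1, 1, InC*mult, OutC

  fused = tf.nn.separable_conv2d(x, dw, pw, strides=[1, 1, 1, 1],
                                 padding="SAME")
  manual = tf.nn.conv2d(
      tf.nn.depthwise_conv2d(x, dw, [1, 1, 1, 1], "SAME"),
      pw, [1, 1, 1, 1], "VALID")
  np.testing.assert_allclose(fused.numpy(), manual.numpy(),
                             rtol=1e-5, atol=1e-5)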
def func(inp):
  conv = nn_ops.conv2d(
      inp,
      filter=array_ops.ones([3, 3, 3, 16]),
      strides=[1, 1, 1, 1],
      padding='SAME')
  output = nn_ops.relu(conv, name='output')
  return output
def depthwise_conv2d(input, filter, strides, padding, name=None): """Depthwise 2-D convolution. Given an input tensor of shape `[batch, in_height, in_width, in_channels]` and a filter tensor of shape `[filter_height, filter_width, in_channels, channel_multiplier]` containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies a different filter to each input channel (expanding from 1 channel to `channel_multiplier` channels for each), then concatenates the results together. The output has `in_channels * channel_multiplier` channels. In detail, output[b, i, j, k * channel_multiplier + q] = sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] * filter[di, dj, k, q] Must have `strides[0] = strides[3] = 1`. For the most common case of the same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. Args: input: 4-D with shape `[batch, in_height, in_width, in_channels]`. filter: 4-D with shape `[filter_height, filter_width, in_channels, channel_multiplier]`. strides: 1-D of size 4. The stride of the sliding window for each dimension of `input`. padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) name: A name for this operation (optional). Returns: A 4-D `Tensor` of shape `[batch, out_height, out_width, in_channels * channel_multiplier].` """ with ops.op_scope([input, filter], name, "depthwise") as name: input = ops.convert_to_tensor(input, name="tensor_in") filter = ops.convert_to_tensor(filter, name="filter_in") # A shape is required to statically compute the number of separable filters. if filter.get_shape().ndims is not None: assert len(filter.get_shape()) == 4 in_channels = filter.get_shape()[2] # Sanity checks, if shape information is available for the inputs. if input.get_shape().ndims is not None: assert len(input.get_shape()) == 4 assert input.get_shape()[3] == in_channels, ( "Mismatched input depth %d and number of depthwise filters %d." % ( input.get_shape()[3].value, in_channels)) else: assert input.get_shape().ndims is not None, ( "Either tensor must provide static shape information.") assert input.get_shape().ndims == 4 in_channels = input.get_shape()[3] if in_channels == 1: return nn_ops.conv2d(input, filter, strides, padding, name=name) else: return nn_ops.depthwise_conv2d_native(input, filter, strides, padding, name=name)
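# The single-channel fast path at the end of the function above reduces a
# depthwise convolution to an ordinary conv2d, which is easy to verify
# directly.  A minimal sketch assuming TF 2.x eager execution.
def _example_single_channel_depthwise_is_conv2d():
  import numpy as np
  import tensorflow as tf

  x = tf.constant(np.random.rand(1, 5, 5, 1).astype(np.float32))
  f = tf.constant(np.random.rand(3, 3, 1, 4).astype(np.float32))
  a = tf.nn.depthwise_conv2d(x, f, [1, 1, 1, 1], "SAME")
  b = tf.nn.conv2d(x, f, [1, 1, 1, 1], "SAME")
  np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-5, atol=1e-5)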
def spatial_conv(batch, gain):
  s = array_ops.shape(batch)
  padded = array_ops.pad(batch, [[0, 0], [2, 2], [2, 2], [0, 0]], 'REFLECT')
  xt = array_ops.transpose(padded, [0, 3, 1, 2])
  xt = array_ops.reshape(xt, [s[0] * s[3], s[1] + 4, s[2] + 4, 1])
  conv_out = nn_ops.conv2d(xt, gaussian_filter * gain, [1] * 4, 'VALID')
  conv_xt = array_ops.reshape(conv_out, [s[0], s[3], s[1], s[2]])
  conv_xt = array_ops.transpose(conv_xt, [0, 2, 3, 1])
  return conv_xt
def testConv2dGradWRTFilter(self):
  x = constant_op.constant([0.5], dtype=dtypes.float32,
                           shape=[1, 4, 4, 3], name='input')
  f = array_ops.placeholder(
      dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
  y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
  self.run_test(f, y)
def BuildSplitableModel():
  """Build a small model that can be run partially in each step."""
  image = array_ops.zeros([2, 6, 6, 3])

  kernel1 = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r1 = nn_ops.conv2d(image, kernel1, [1, 2, 2, 1], padding='SAME')

  kernel2 = variable_scope.get_variable(
      'DW2', [2, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r2 = nn_ops.conv2d(image, kernel2, [1, 2, 2, 1], padding='SAME')

  r3 = r1 + r2
  return r1, r2, r3
def BuildSmallModel():
  """Build a small forward conv model."""
  image = array_ops.zeros([2, 6, 6, 3])
  _ = variable_scope.get_variable(
      'ScalarW', [],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  kernel = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
  kernel = variable_scope.get_variable(
      'DW2', [2, 2, 6, 12],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
  return x
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias, strides, padding, activation_mode, data_format, dtype): """Verifies the output values of the convolution function. Args: tensor_in_sizes: Input tensor dimensions in [batch, input_rows, input_cols, input_depth]. filter_in_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols, input_depth, output_depth]. bias: 1-D bias tensor of length output_depth. strides: Stride: [col_stride, row_stride] padding: Padding type. activation_mode: Activation mode. data_format: Format of the data tensors. dtype: Data type for inputs and outputs. Returns: Symbolic tensor value and reference value that can be used to execute the computation and verify the results. """ input_size = np.prod(tensor_in_sizes) filter_size = np.prod(filter_in_sizes) bias_size = filter_in_sizes[-1] # equals to output depth # Initializes the input tensor with array containing incrementing # numbers from 1. x1 = [f * 1.0 for f in range(1, input_size + 1)] x2 = [f * 1.0 for f in range(1, filter_size + 1)] # This is to guarantee that there is always negative values after # bias add so that we can test whether relu works correctly. x3 = bias with self.test_session(use_gpu=True): t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype) t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype) t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype) strides = [1] + strides + [1] if data_format == "NCHW": t1 = test_util.NHWCToNCHW(t1) strides = test_util.NHWCToNCHW(strides) output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( t1, t2, t3, strides=strides, padding=padding, data_format=data_format, activation_mode=activation_mode) ref_conv_output = nn_ops.conv2d( t1, t2, strides=strides, padding=padding, data_format=data_format) ref_bias_output = nn_ops.bias_add( ref_conv_output, t3, data_format=data_format) ref_output = nn_ops.relu(ref_bias_output) if data_format == "NCHW": output = test_util.NCHWToNHWC(output) ref_output = test_util.NCHWToNHWC(ref_output) return output, ref_output
def build_conv_bias_relu_graph(device, input_shape, filter_shape, strides, padding, num_iters, data_format): """builds a graph containing a sequence of conv2d operations. Args: device: String, the device to run on. input_shape: Shape of the input tensor. filter_shape: Shape of the filter tensor. strides: A list of ints. 1-D of length 4. The stride of sliding window for each dimension of input. padding: A string from: "SAME", "VALID". The type of padding algorithm to use. num_iters: number of iterations to run conv2d. data_format: data format string of input, 'NHWC' and 'NCHW' are supported. Returns: An array of tensors to run() """ if data_format == "NCHW": input_shape = [ input_shape[0], input_shape[3], input_shape[1], input_shape[2] ] with ops.device("/%s:0" % device): inp = variables.Variable(random_ops.truncated_normal(input_shape)) filt = variables.Variable(random_ops.truncated_normal(filter_shape)) bias_shape = [filter_shape[-1]] bias = variables.Variable(random_ops.truncated_normal(bias_shape)) outputs = [] conv2d_out = nn_ops.conv2d( inp, filt, strides, padding, data_format=data_format) bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format) relu_out = nn_ops.relu(bias_out) outputs.append(relu_out) for _ in range(1, num_iters): with ops.control_dependencies([relu_out]): conv2d_out = nn_ops.conv2d( inp, filt, strides, padding, data_format=data_format) bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format) relu_out = nn_ops.relu(bias_out) outputs.append(relu_out) return control_flow_ops.group(*outputs)
def _random_out_op(self, in_shape, filter_shape):
  # Choosing not to use array_ops.zeros() to prevent possible removal by
  # optimization.
  in_op = self._random_data_op(in_shape)
  filter_op = self._random_data_op(filter_shape)
  # Use the forward op's shape inference.
  conv_op = nn_ops.conv2d(
      in_op, filter_op, strides=_STRIDES, padding=_PADDING)
  out_shape = conv_op.get_shape()
  out_op = self._random_data_op(out_shape)
  return out_op
def decoder_type_1(decoder_hidden, attn_size, initializer=None):
  with vs.variable_scope("decoder_type_1", initializer=initializer):
    k = vs.get_variable("AttnDecW_%d" % 0, [1, 1, attn_size, 1],
                        initializer=initializer)
    hidden_features = nn_ops.conv2d(decoder_hidden, k, [1, 1, 1, 1], "SAME")
    # s will be (?, timesteps)
    s = math_ops.reduce_sum(math_ops.tanh(hidden_features), [2, 3])
  return s
def _CloneConv2d(self, op, inputs, new_name):
  input_tensor = inputs[0]
  weights = inputs[1]
  self._AssertConvShapes(op.name, input_tensor, weights)
  return nn_ops.conv2d(
      input_tensor,
      weights,
      strides=op.get_attr('strides'),
      padding=op.get_attr('padding'),
      use_cudnn_on_gpu=op.get_attr('use_cudnn_on_gpu'),
      data_format=op.get_attr('data_format'),
      name=new_name).op
def luong_general(hidden, decoder_hidden_state, initializer=None):
  # size of decoder layers
  attention_vec_size = hidden.get_shape()[3].value
  with vs.variable_scope("luong_general", initializer=initializer):
    # here we calculate the W_a * s_i-1 (W1 * h_1) part of the attention alignment
    k = vs.get_variable("AttnW_%d" % 0,
                        [1, 1, attention_vec_size, attention_vec_size],
                        initializer=initializer)
    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    s = math_ops.reduce_sum((hidden_features * decoder_hidden_state), [2, 3])
  return s
def rnn_decoder(self,encode_embed, attention_states, initial_state, cell, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False): """RNN decoder for the sequence-to-sequence model. """ with variable_scope.variable_scope(scope or "rnn_decoder"): batch_size = tf.shape(encode_embed[0])[0]# Needed for reshaping. attn_length = attention_states.get_shape()[1].value #number of output vector in sequence attn_size = attention_states.get_shape()[2].value #the dimension size of each output vector state_size = initial_state.get_shape()[1].value #the dimension size of state vector print(batch_size,attn_length,attn_size,state_size,"batch_size,attn_lengt,attn_size,state_size") # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. print(attention_states.get_shape(),"attention_states.get_shape()") hidden = tf.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] hidden_features2 = [] v = [] u = [] linear_w = [] linear_b = [] abstract_w = [] abstract_b = [] abstract_layers = [int((attn_size + state_size)/(2 + 2*i)) for i in xrange(2)] + [1] attention_vec_size = attn_size# Size of query vectors for attention. head_weights = [] for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))#[B,T,1,attn_vec_size] k2 = variable_scope.get_variable("AttnW2_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features2.append(nn_ops.conv2d(hidden, k2, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) u.append(variable_scope.get_variable("AttnU_%d" % a, [attention_vec_size])) head_weights.append(variable_scope.get_variable("head_weight_%d" % a,[1])) current_layer_size = attn_size + state_size linear_w.append(variable_scope.get_variable("linearW_%d" % a, [1,1,current_layer_size, 1])) linear_b.append(variable_scope.get_variable("linearB_%d" % a, [1])) abstract_w.append([]) abstract_b.append([]) for i in xrange(len(abstract_layers)): layer_size = abstract_layers[i] abstract_w[a].append(variable_scope.get_variable("Att_%d_layerW_%d" % (a,i), [1,1,current_layer_size, layer_size])) abstract_b[a].append(variable_scope.get_variable("Att_%d_layerB_%d" % (a,i), [layer_size])) current_layer_size = layer_size def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = []# Results of attention reads will be stored here. 
aw = []# Attention weights will be stored here tiled_query = tf.tile(tf.reshape(query, [-1, 1, 1, state_size]),[1,attn_length,1, 1]) print(hidden.get_shape(),"hidden.get_shape()") print(tiled_query.get_shape(),"tiled_query.get_shape()") concat_input = tf.concat(axis=3, values=[hidden, tiled_query]) #concat_input = tf.concat(3, [hidden, hidden]) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): s = None if self.hparams.att_strategy == 'multi': print('Attention: multiply') y = linear(query, attention_vec_size, True) y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) #s = math_ops.reduce_sum( # u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3]) s = math_ops.reduce_sum( hidden * math_ops.tanh(y), [2, 3]) #hidden_features[a] * math_ops.tanh(y), [2, 3]) elif self.hparams.att_strategy == 'multi_add': print('Attention: multiply_add') y = linear(query, attention_vec_size, True, scope='y') y2 = linear(query, attention_vec_size, True , scope='y2') y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) y2 = tf.reshape(y2, [-1, 1, 1, attention_vec_size]) #s = math_ops.reduce_sum( # u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3]) s = math_ops.reduce_sum( hidden * math_ops.tanh(y2), [2, 3]) s = s + math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) elif self.hparams.att_strategy == 'NTN': print('Attention: NTN') y = linear(query, attn_size, False) y = tf.tile(tf.reshape(y, [-1, 1, 1, attn_size]),[1,attn_length,1,1]) s = math_ops.reduce_sum(hidden * y, [2,3]) #bilnear s = s + math_ops.reduce_sum(nn_ops.conv2d(concat_input, linear_w[a], [1, 1, 1, 1], "SAME"), [2,3]) #linear s = s + linear_b[a] #bias #print(s.get_shape()) #s = tf.tanh(s) #non linear elif self.hparams.att_strategy == 'elu': print('Attention: elu') cur_input = concat_input #for i in xrange(len(abstract_layers)): # cur_input = tf.contrib.layers.fully_connected(cur_input, abstract_layers[i], activation_fn=tf.nn.elu) for i in xrange(len(abstract_layers)): cur_input = nn_ops.conv2d(cur_input, abstract_w[a][i], [1, 1, 1, 1], "SAME") cur_input = cur_input + abstract_b[a][i] cur_input = tf.nn.elu(cur_input) s = math_ops.reduce_sum(cur_input,[2,3]) else: print('Attention: add') y = linear(query, attention_vec_size, True) y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) att = s * head_weights[a]#nn_ops.softmax(s) aw.append(att) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( tf.reshape(att, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(tf.reshape(d, [-1, attn_size])) return aw, ds state = initial_state outputs = [] prev = None batch_attn_size = tf.stack([batch_size, attn_size]) batch_attw_size = tf.stack([batch_size, attn_length]) attns = [tf.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] attw = [1.0/attn_length * tf.ones(batch_attw_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns:# Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) # Directly use previous state attw, attns = attention(initial_state) aw = math_ops.reduce_sum(attw,0) output = tf.scalar_mul(1.0/float(num_heads), aw) output = output - tf.reduce_min(output,1,keep_dims=True) outputs.append(output) return outputs, state
def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = []# Results of attention reads will be stored here. aw = []# Attention weights will be stored here tiled_query = tf.tile(tf.reshape(query, [-1, 1, 1, state_size]),[1,attn_length,1, 1]) print(hidden.get_shape(),"hidden.get_shape()") print(tiled_query.get_shape(),"tiled_query.get_shape()") concat_input = tf.concat(axis=3, values=[hidden, tiled_query]) #concat_input = tf.concat(3, [hidden, hidden]) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): s = None if self.hparams.att_strategy == 'multi': print('Attention: multiply') y = linear(query, attention_vec_size, True) y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) #s = math_ops.reduce_sum( # u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3]) s = math_ops.reduce_sum( hidden * math_ops.tanh(y), [2, 3]) #hidden_features[a] * math_ops.tanh(y), [2, 3]) elif self.hparams.att_strategy == 'multi_add': print('Attention: multiply_add') y = linear(query, attention_vec_size, True, scope='y') y2 = linear(query, attention_vec_size, True , scope='y2') y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) y2 = tf.reshape(y2, [-1, 1, 1, attention_vec_size]) #s = math_ops.reduce_sum( # u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3]) s = math_ops.reduce_sum( hidden * math_ops.tanh(y2), [2, 3]) s = s + math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) elif self.hparams.att_strategy == 'NTN': print('Attention: NTN') y = linear(query, attn_size, False) y = tf.tile(tf.reshape(y, [-1, 1, 1, attn_size]),[1,attn_length,1,1]) s = math_ops.reduce_sum(hidden * y, [2,3]) #bilnear s = s + math_ops.reduce_sum(nn_ops.conv2d(concat_input, linear_w[a], [1, 1, 1, 1], "SAME"), [2,3]) #linear s = s + linear_b[a] #bias #print(s.get_shape()) #s = tf.tanh(s) #non linear elif self.hparams.att_strategy == 'elu': print('Attention: elu') cur_input = concat_input #for i in xrange(len(abstract_layers)): # cur_input = tf.contrib.layers.fully_connected(cur_input, abstract_layers[i], activation_fn=tf.nn.elu) for i in xrange(len(abstract_layers)): cur_input = nn_ops.conv2d(cur_input, abstract_w[a][i], [1, 1, 1, 1], "SAME") cur_input = cur_input + abstract_b[a][i] cur_input = tf.nn.elu(cur_input) s = math_ops.reduce_sum(cur_input,[2,3]) else: print('Attention: add') y = linear(query, attention_vec_size, True) y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) att = s * head_weights[a]#nn_ops.softmax(s) aw.append(att) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( tf.reshape(att, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(tf.reshape(d, [-1, attn_size])) return aw, ds
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell, feed_prev=True, dtype=dtypes.float32, scope=None): """RNN decoder with pointer net for the sequence-to-sequence model. Args: decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "pointer_decoder". Returns: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either i-th decoder_inputs. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). states: The state of each decoder cell in each time-step. This is a list with length len(decoder_inputs) -- one item for each time-step. Each item is a 2D Tensor of shape [batch_size x cell.state_size]. """ if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) with vs.variable_scope(scope or "point_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. input_size = decoder_inputs[0].get_shape()[1].value attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) attention_vec_size = attn_size # Size of query vectors for attention. k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size]) hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME") v = vs.get_variable("AttnV", [attention_vec_size]) states = [initial_state] def attention(query): """Point on hidden using hidden_features and query.""" with vs.variable_scope("Attention"): y = rnn_cell._linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v * math_ops.tanh(hidden_features + y), [2, 3]) return s outputs = [] prev = None batch_attn_size = array_ops.pack([batch_size, attn_size]) attns = array_ops.zeros(batch_attn_size, dtype=dtype) attns.set_shape([None, attn_size]) inps = [] for i in xrange(len(decoder_inputs)): if i > 0: vs.get_variable_scope().reuse_variables() inp = decoder_inputs[i] if feed_prev and i > 0: inp = tf.pack(decoder_inputs) inp = tf.transpose(inp, perm=[1, 0, 2]) inp = tf.reshape(inp, [-1, attn_length, input_size]) inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1) inp = tf.stop_gradient(inp) inps.append(inp) # Use the same inputs in inference, order internaly # Merge input and previous attentions into one vector of the right size. x = rnn_cell._linear([inp, attns], cell.output_size, True) # Run the RNN. 
cell_output, new_state = cell(x, states[-1]) states.append(new_state) # Run the attention mechanism. output = attention(new_state) outputs.append(output) return outputs, states, inps
def attention(decoder_state, temporal_e, coverage=None): """Calculate the context vector and attention distribution from the decoder state. Args: decoder_state: state of the decoder temporal_e: store previous attentions for temporal attention mechanism coverage: Optional. Previous timestep's coverage vector, shape (batch_size, max_enc_steps, 1, 1). Returns: context_vector: weighted sum of _enc_states attn_dist: attention distribution coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1) masked_e: store the attention score for temporal attention mechanism. """ with variable_scope.variable_scope("Attention"): # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper) decoder_features = linear( decoder_state, attention_vec_size, True) # shape (batch_size, attention_vec_size) decoder_features = tf.expand_dims( tf.expand_dims(decoder_features, 1), 1) # reshape to (batch_size, 1, 1, attention_vec_size) # We can't have coverage with matrix attention if not _hps.matrix_attention and use_coverage and coverage is not None: # non-first step of coverage # Multiply coverage vector by w_c to get coverage_features. coverage_features = nn_ops.conv2d( coverage, w_c, [1, 1, 1, 1], "SAME" ) # c has shape (batch_size, max_enc_steps, 1, attention_vec_size) # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn) e_not_masked = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3]) # shape (batch_size,max_enc_steps) masked_e = nn_ops.softmax( e_not_masked ) * enc_padding_mask # (batch_size, max_enc_steps) masked_sums = tf.reduce_sum(masked_e, axis=1) # shape (batch_size) masked_e = masked_e / tf.reshape(masked_sums, [-1, 1]) # Equation 3 in if _hps.use_temporal_attention: try: len_temporal_e = temporal_e.get_shape()[0] except: len_temporal_e = 0 if len_temporal_e == 0: attn_dist = masked_e else: masked_sums = tf.reduce_sum( temporal_e, axis=0 ) + 1e-10 # if it's zero due to masking we set it to a small value attn_dist = masked_e / masked_sums # (batch_size, max_enc_steps) else: attn_dist = masked_e masked_attn_sums = tf.reduce_sum(attn_dist, axis=1) attn_dist = attn_dist / tf.reshape(masked_attn_sums, [-1, 1]) # re-normalize # Update coverage vector coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) else: if _hps.matrix_attention: # Calculate h_d * W_attn * h_i, equation 2 in https://arxiv.org/pdf/1705.04304.pdf _dec_attn = tf.unstack( tf.matmul( tf.squeeze(decoder_features, axis=[1, 2]), w_attn), axis=0) # batch_size * (attention_vec_size) _enc_states_lst = tf.unstack( tf.squeeze(_enc_states, axis=2), axis=0 ) # batch_size * (max_enc_steps, attention_vec_size) e_not_masked = tf.squeeze( tf.stack([ tf.matmul(tf.reshape(_dec, [1, -1]), tf.transpose(_enc)) for _dec, _enc in zip(_dec_attn, _enc_states_lst) ]), axis=1) # (batch_size, max_enc_steps) masked_e = tf.exp( e_not_masked * enc_padding_mask) # (batch_size, max_enc_steps) else: # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) e_not_masked = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # calculate e, (batch_size, max_enc_steps) masked_e = nn_ops.softmax( e_not_masked ) * enc_padding_mask # (batch_size, max_enc_steps) masked_sums = tf.reduce_sum( masked_e, axis=1) # shape (batch_size) masked_e = masked_e / tf.reshape(masked_sums, [-1, 1]) if _hps.use_temporal_attention: try: len_temporal_e = temporal_e.get_shape()[0] except: len_temporal_e = 0 if len_temporal_e == 0: attn_dist = masked_e else: masked_sums = 
tf.reduce_sum( temporal_e, axis=0 ) + 1e-10 # if it's zero due to masking we set it to a small value attn_dist = masked_e / masked_sums # (batch_size, max_enc_steps) else: attn_dist = masked_e # Calculate attention distribution masked_attn_sums = tf.reduce_sum(attn_dist, axis=1) attn_dist = attn_dist / tf.reshape(masked_attn_sums, [-1, 1]) # re-normalize if use_coverage: # first step of training coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2) # initialize coverage # Calculate the context vector from attn_dist and _enc_states context_vector = math_ops.reduce_sum( array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * _enc_states, [1, 2]) # shape (batch_size, attn_size). context_vector = array_ops.reshape(context_vector, [-1, attn_size]) return context_vector, attn_dist, coverage, masked_e
def attention_decoder(_hps, v_size, _max_art_oovs, _enc_batch_extend_vocab, emb_dec_inputs, target_batch, _dec_in_state, _enc_states, enc_padding_mask, dec_padding_mask, cell, embedding, sampling_probability, alpha, unk_id, initial_state_attention=False, pointer_gen=True, use_coverage=False, prev_coverage=None, prev_decoder_outputs=[], prev_encoder_es=[]): """ Args: _hps: parameter of the models. v_size: vocab size. _max_art_oovs: size of the oov tokens in current batch. _enc_batch_extend_vocab: encoder extended vocab batch. emb_dec_inputs: A list of 2D Tensors [batch_size x emb_dim]. target_batch: The indices of the target words. shape (max_dec_steps, batch_size) _dec_in_state: 2D Tensor [batch_size x cell.state_size]. _enc_states: 3D Tensor [batch_size x max_enc_steps x attn_size]. enc_padding_mask: 2D Tensor [batch_size x max_enc_steps] containing 1s and 0s; indicates which of the encoder locations are padding (0) or a real token (1). dec_padding_mask: 2D Tensor [batch_size x max_dec_steps] containing 1s and 0s; indicates which of the decoder locations are padding (0) or a real token (1). cell: rnn_cell.RNNCell defining the cell function and size. embedding: embedding matrix [vocab_size, emb_dim]. sampling_probability: sampling probability for scheduled sampling. alpha: soft-argmax argument. initial_state_attention: Note that this attention decoder passes each decoder input through a linear layer with the previous step's context vector to get a modified version of the input. If initial_state_attention is False, on the first decoder step the "previous context vector" is just a zero vector. If initial_state_attention is True, we use _dec_in_state to (re)calculate the previous step's context vector. We set this to False for train/eval mode (because we call attention_decoder once for all decoder steps) and True for decode mode (because we call attention_decoder once for each decoder step). pointer_gen: boolean. If True, calculate the generation probability p_gen for each decoder step. use_coverage: boolean. If True, use coverage mechanism. prev_coverage: If not None, a tensor with shape (batch_size, max_enc_steps). The previous step's coverage vector. This is only not None in decode mode when using coverage. prev_decoder_outputs: if not empty, a tensor of (len(prev_decoder_steps), batch_size, hidden_dim). The previous decoder output used for calculating the intradecoder attention during decode mode prev_encoder_es: if not empty, a tensor of (len(prev_encoder_es), batch_size, hidden_dim). The previous attention vector used for calculating the temporal attention during decode mode. Returns: outputs: A list of the same length as emb_dec_inputs of 2D Tensors of shape [batch_size x cell.output_size]. The output vectors. state: The final state of the decoder. A tensor shape [batch_size x cell.state_size]. attn_dists: A list containing tensors of shape (batch_size,max_enc_steps). The attention distributions for each decoder step. p_gens: List of length emb_dim, containing tensors of shape [batch_size, 1]. The values of p_gen for each decoder step. Empty list if pointer_gen=False. coverage: Coverage vector on the last step computed. None if use_coverage=False. vocab_scores: vocab distribution. final_dists: final output distribution. samples: contains sampled tokens. greedy_search_samples: contains greedy tokens. temporal_e: contains temporal attention. 
""" with variable_scope.variable_scope("attention_decoder") as scope: batch_size = _enc_states.get_shape()[ 0] # if this line fails, it's because the batch size isn't defined attn_size = _enc_states.get_shape( )[2] # if this line fails, it's because the attention length isn't defined emb_size = emb_dec_inputs[0].get_shape()[ 1] # if this line fails, it's because the embedding isn't defined decoder_attn_size = _dec_in_state.c.get_shape()[1] tf.logging.info("batch_size %i, attn_size: %i, emb_size: %i", batch_size, attn_size, emb_size) # Reshape _enc_states (need to insert a dim) _enc_states = tf.expand_dims( _enc_states, axis=2) # now is shape (batch_size, max_enc_steps, 1, attn_size) # To calculate attention, we calculate # v^T tanh(W_h h_i + W_s s_t + b_attn) # where h_i is an encoder state, and s_t a decoder state. # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t). # We set it to be equal to the size of the encoder states. attention_vec_size = attn_size # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features if _hps.matrix_attention: w_attn = variable_scope.get_variable( "w_attn", [attention_vec_size, attention_vec_size]) if _hps.intradecoder: w_dec_attn = variable_scope.get_variable( "w_dec_attn", [decoder_attn_size, decoder_attn_size]) else: W_h = variable_scope.get_variable( "W_h", [1, 1, attn_size, attention_vec_size]) v = variable_scope.get_variable("v", [attention_vec_size]) encoder_features = nn_ops.conv2d(_enc_states, W_h, [ 1, 1, 1, 1 ], "SAME") # shape (batch_size,max_enc_steps,1,attention_vec_size) if _hps.intradecoder: W_h_d = variable_scope.get_variable( "W_h_d", [1, 1, decoder_attn_size, decoder_attn_size]) v_d = variable_scope.get_variable("v_d", [decoder_attn_size]) # Get the weight vectors v and w_c (w_c is for coverage) if use_coverage: with variable_scope.variable_scope("coverage"): w_c = variable_scope.get_variable( "w_c", [1, 1, 1, attention_vec_size]) if prev_coverage is not None: # for beam search mode with coverage # reshape from (batch_size, max_enc_steps) to (batch_size, max_enc_steps, 1, 1) prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3) def attention(decoder_state, temporal_e, coverage=None): """Calculate the context vector and attention distribution from the decoder state. Args: decoder_state: state of the decoder temporal_e: store previous attentions for temporal attention mechanism coverage: Optional. Previous timestep's coverage vector, shape (batch_size, max_enc_steps, 1, 1). Returns: context_vector: weighted sum of _enc_states attn_dist: attention distribution coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1) masked_e: store the attention score for temporal attention mechanism. """ with variable_scope.variable_scope("Attention"): # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper) decoder_features = linear( decoder_state, attention_vec_size, True) # shape (batch_size, attention_vec_size) decoder_features = tf.expand_dims( tf.expand_dims(decoder_features, 1), 1) # reshape to (batch_size, 1, 1, attention_vec_size) # We can't have coverage with matrix attention if not _hps.matrix_attention and use_coverage and coverage is not None: # non-first step of coverage # Multiply coverage vector by w_c to get coverage_features. 
coverage_features = nn_ops.conv2d( coverage, w_c, [1, 1, 1, 1], "SAME" ) # c has shape (batch_size, max_enc_steps, 1, attention_vec_size) # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn) e_not_masked = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3]) # shape (batch_size,max_enc_steps) masked_e = nn_ops.softmax( e_not_masked ) * enc_padding_mask # (batch_size, max_enc_steps) masked_sums = tf.reduce_sum(masked_e, axis=1) # shape (batch_size) masked_e = masked_e / tf.reshape(masked_sums, [-1, 1]) # Equation 3 in if _hps.use_temporal_attention: try: len_temporal_e = temporal_e.get_shape()[0] except: len_temporal_e = 0 if len_temporal_e == 0: attn_dist = masked_e else: masked_sums = tf.reduce_sum( temporal_e, axis=0 ) + 1e-10 # if it's zero due to masking we set it to a small value attn_dist = masked_e / masked_sums # (batch_size, max_enc_steps) else: attn_dist = masked_e masked_attn_sums = tf.reduce_sum(attn_dist, axis=1) attn_dist = attn_dist / tf.reshape(masked_attn_sums, [-1, 1]) # re-normalize # Update coverage vector coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) else: if _hps.matrix_attention: # Calculate h_d * W_attn * h_i, equation 2 in https://arxiv.org/pdf/1705.04304.pdf _dec_attn = tf.unstack( tf.matmul( tf.squeeze(decoder_features, axis=[1, 2]), w_attn), axis=0) # batch_size * (attention_vec_size) _enc_states_lst = tf.unstack( tf.squeeze(_enc_states, axis=2), axis=0 ) # batch_size * (max_enc_steps, attention_vec_size) e_not_masked = tf.squeeze( tf.stack([ tf.matmul(tf.reshape(_dec, [1, -1]), tf.transpose(_enc)) for _dec, _enc in zip(_dec_attn, _enc_states_lst) ]), axis=1) # (batch_size, max_enc_steps) masked_e = tf.exp( e_not_masked * enc_padding_mask) # (batch_size, max_enc_steps) else: # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) e_not_masked = math_ops.reduce_sum( v * math_ops.tanh(encoder_features + decoder_features), [2, 3]) # calculate e, (batch_size, max_enc_steps) masked_e = nn_ops.softmax( e_not_masked ) * enc_padding_mask # (batch_size, max_enc_steps) masked_sums = tf.reduce_sum( masked_e, axis=1) # shape (batch_size) masked_e = masked_e / tf.reshape(masked_sums, [-1, 1]) if _hps.use_temporal_attention: try: len_temporal_e = temporal_e.get_shape()[0] except: len_temporal_e = 0 if len_temporal_e == 0: attn_dist = masked_e else: masked_sums = tf.reduce_sum( temporal_e, axis=0 ) + 1e-10 # if it's zero due to masking we set it to a small value attn_dist = masked_e / masked_sums # (batch_size, max_enc_steps) else: attn_dist = masked_e # Calculate attention distribution masked_attn_sums = tf.reduce_sum(attn_dist, axis=1) attn_dist = attn_dist / tf.reshape(masked_attn_sums, [-1, 1]) # re-normalize if use_coverage: # first step of training coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2) # initialize coverage # Calculate the context vector from attn_dist and _enc_states context_vector = math_ops.reduce_sum( array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * _enc_states, [1, 2]) # shape (batch_size, attn_size). context_vector = array_ops.reshape(context_vector, [-1, attn_size]) return context_vector, attn_dist, coverage, masked_e def intra_decoder_attention(decoder_state, outputs): """Calculate the context vector and attention distribution from the decoder state. 
Args: decoder_state: state of the decoder outputs: list of decoder states for implementing intra-decoder mechanism, len(decoder_states) * (batch_size, hidden_dim) Returns: context_decoder_vector: weighted sum of _dec_states decoder_attn_dist: intra-decoder attention distribution """ attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape( )[1] # hidden_dim try: len_dec_states = outputs.get_shape()[0] except: len_dec_states = 0 attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape( )[1] # hidden_dim _decoder_states = tf.expand_dims( tf.reshape(outputs, [batch_size, -1, attn_dec_size]), axis=2 ) # now is shape (batch_size,len(decoder_states), 1, attn_size) _prev_decoder_features = nn_ops.conv2d( _decoder_states, W_h_d, [1, 1, 1, 1], "SAME" ) # shape (batch_size,len(decoder_states),1,attention_vec_size) with variable_scope.variable_scope("DecoderAttention"): # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper) try: decoder_features = linear( decoder_state, attention_dec_vec_size, True) # shape (batch_size, attention_vec_size) decoder_features = tf.expand_dims( tf.expand_dims(decoder_features, 1), 1 ) # reshape to (batch_size, 1, 1, attention_dec_vec_size) # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) if _hps.matrix_attention: # Calculate h_d * W_attn * h_d, equation 6 in https://arxiv.org/pdf/1705.04304.pdf _dec_attn = tf.matmul( tf.squeeze(decoder_features), w_dec_attn) # (batch_size, decoder_attn_size) _dec_states_lst = tf.unstack( tf.reshape(_prev_decoder_features, [batch_size, -1, decoder_attn_size]) ) # batch_size * (len(decoder_states), decoder_attn_size) e_not_masked = tf.reshape( tf.stack([ tf.matmul(_dec_attn, tf.transpose(k)) for k in _dec_states_lst ]), [batch_size, -1 ]) # (batch_size, len(decoder_states)) masked_e = tf.exp( e_not_masked * dec_padding_mask[:, :len_dec_states] ) # (batch_size, len(decoder_states)) else: # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn) e_not_masked = math_ops.reduce_sum( v_d * math_ops.tanh(_prev_decoder_features + decoder_features), [ 2, 3 ]) # calculate e, (batch_size,len(decoder_states)) masked_e = nn_ops.softmax( e_not_masked ) * dec_padding_mask[:, : len_dec_states] # (batch_size,len(decoder_states)) if len_dec_states <= 1: masked_e = array_ops.ones( [batch_size, 1]) # first step is filled with equal values masked_sums = tf.reshape( tf.reduce_sum(masked_e, axis=1), [-1, 1] ) # (batch_size,1), # if it's zero due to masking we set it to a small value decoder_attn_dist = masked_e / masked_sums # (batch_size,len(decoder_states)) context_decoder_vector = math_ops.reduce_sum( array_ops.reshape(decoder_attn_dist, [batch_size, -1, 1, 1]) * _decoder_states, [1, 2]) # (batch_size, attn_size) context_decoder_vector = array_ops.reshape( context_decoder_vector, [-1, attn_dec_size]) # (batch_size, attn_size) except: return array_ops.zeros( [batch_size, decoder_attn_size]), array_ops.zeros([batch_size, 0]) return context_decoder_vector, decoder_attn_dist outputs = [] temporal_e = [] attn_dists = [] vocab_scores = [] vocab_dists = [] final_dists = [] p_gens = [] samples = [ ] # this holds the words chosen by sampling based on the final distribution for each decoding step, list of max_dec_steps of (batch_size, 1) greedy_search_samples = [ ] # this holds the words chosen by greedy search (taking the max) on the final distribution for each decoding step, list of max_dec_steps of (batch_size, 1) sampling_rewards = [] # list of size max_dec_steps (batch_size, k) greedy_rewards = [] # list of size 
max_dec_steps (batch_size, k) state = _dec_in_state coverage = prev_coverage # initialize coverage to None or whatever was passed in context_vector = array_ops.zeros([batch_size, attn_size]) context_decoder_vector = array_ops.zeros( [batch_size, decoder_attn_size]) context_vector.set_shape([ None, attn_size ]) # Ensure the second shape of attention vectors is set. if initial_state_attention: # true in decode mode # Re-calculate the context vector from the previous step so that we can pass it through a linear layer with this step's input to get a modified version of the input context_vector, _, coverage, _ = attention( _dec_in_state, tf.stack(prev_encoder_es, axis=0), coverage ) # in decode mode, this is what updates the coverage vector if _hps.intradecoder: context_decoder_vector, _ = intra_decoder_attention( _dec_in_state, tf.stack(prev_decoder_outputs, axis=0)) for i, inp in enumerate(emb_dec_inputs): tf.logging.info("Adding attention_decoder timestep %i of %i", i, len(emb_dec_inputs)) if i > 0: variable_scope.get_variable_scope().reuse_variables() if _hps.mode in [ 'train', 'eval' ] and _hps.scheduled_sampling and i > 0: # start scheduled sampling after we received the first decoder's output # modify the input to next decoder using scheduled sampling if FLAGS.scheduled_sampling_final_dist: inp = scheduled_sampling(_hps, sampling_probability, final_dist, embedding, inp, alpha) else: inp = scheduled_sampling_vocab_dist( _hps, sampling_probability, vocab_dist, embedding, inp, alpha) # Merge input and previous attentions into one vector x of the same size as inp emb_dim = inp.get_shape().with_rank(2)[1] if emb_dim is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + [context_vector], emb_dim, True) # Run the decoder RNN cell. cell_output = decoder state cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: # always true in decode mode with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=True ): # you need this because you've already run the initial attention(...) 
call context_vector, attn_dist, _, masked_e = attention( state, tf.stack(prev_encoder_es, axis=0), coverage) # don't allow coverage to update if _hps.intradecoder: context_decoder_vector, _ = intra_decoder_attention( state, tf.stack(prev_decoder_outputs, axis=0)) else: context_vector, attn_dist, coverage, masked_e = attention( state, tf.stack(temporal_e, axis=0), coverage) if _hps.intradecoder: context_decoder_vector, _ = intra_decoder_attention( state, tf.stack(outputs, axis=0)) attn_dists.append(attn_dist) temporal_e.append(masked_e) with variable_scope.variable_scope("combined_context"): if _hps.intradecoder: context_vector = linear( [context_vector] + [context_decoder_vector], attn_size, False) # Calculate p_gen if pointer_gen: with tf.variable_scope('calculate_pgen'): p_gen = linear([context_vector, state.c, state.h, x], 1, True) # Tensor shape (batch_size, 1) p_gen = tf.sigmoid(p_gen) p_gens.append(p_gen) # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer # This is V[s_t, h*_t] + b in the paper with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + [context_vector], cell.output_size, True) outputs.append(output) # Add the output projection to obtain the vocabulary distribution with tf.variable_scope('output_projection'): if i > 0: tf.get_variable_scope().reuse_variables() trunc_norm_init = tf.truncated_normal_initializer( stddev=_hps.trunc_norm_init_std) w_out = tf.get_variable('w', [_hps.dec_hidden_dim, v_size], dtype=tf.float32, initializer=trunc_norm_init) # w_t_out = tf.transpose(w) v_out = tf.get_variable('v', [v_size], dtype=tf.float32, initializer=trunc_norm_init) if i > 0: tf.get_variable_scope().reuse_variables() if FLAGS.share_decoder_weights: # Eq. 13 in https://arxiv.org/pdf/1705.04304.pdf w_out = tf.transpose( math_ops.tanh( linear([embedding] + [tf.transpose(w_out)], _hps.dec_hidden_dim, bias=False))) score = tf.nn.xw_plus_b(output, w_out, v_out) if _hps.scheduled_sampling and not _hps.greedy_scheduled_sampling: # Gumbel reparametrization trick: https://arxiv.org/abs/1704.06970 U = tf.random_uniform( score.get_shape(), 10e-12, (1 - 10e-12)) # add a small number to avoid log(0) G = -tf.log(-tf.log(U)) score = score + G vocab_scores.append(score) # apply the linear layer vocab_dist = tf.nn.softmax(score) vocab_dists.append( vocab_dist ) # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file. 
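# _calc_final_dist is defined elsewhere in this model; in the pointer-generator
# formulation it is expected to mix the two distributions as
#   final_dist = p_gen * vocab_dist + (1 - p_gen) * attn_dist
# with attn_dist scattered onto the extended vocabulary that includes the
# article's OOV words.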
# For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution if _hps.pointer_gen: final_dist = _calc_final_dist(_hps, v_size, _max_art_oovs, _enc_batch_extend_vocab, p_gen, vocab_dist, attn_dist) else: # final distribution is just vocabulary distribution final_dist = vocab_dist final_dists.append(final_dist) # get the sampled token and greedy token # this will take the final_dist and sample from it for a total count of k (k samples) one_hot_k_samples = tf.distributions.Multinomial( total_count=1., probs=final_dist ).sample( _hps.k ) # sample k times according to https://arxiv.org/pdf/1705.04304.pdf, size (k, batch_size, extended_vsize) k_argmax = tf.argmax(one_hot_k_samples, axis=2, output_type=tf.int32) # (k, batch_size) k_sample = tf.transpose(k_argmax) # shape (batch_size, k) greedy_search_prob, greedy_search_sample = tf.nn.top_k( final_dist, k=_hps.k) # (batch_size, k) greedy_search_samples.append(greedy_search_sample) samples.append(k_sample) if FLAGS.use_discounted_rewards: _sampling_rewards = [] _greedy_rewards = [] for _ in range(_hps.k): rl_fscore = tf.reshape( rouge_l_fscore( tf.transpose(tf.stack(samples)[:, :, _]), target_batch), [-1, 1]) # shape (batch_size, 1) _sampling_rewards.append(tf.reshape(rl_fscore, [-1, 1])) rl_fscore = tf.reshape( rouge_l_fscore( tf.transpose( tf.stack(greedy_search_samples)[:, :, _]), target_batch), [-1, 1]) # shape (batch_size, 1) _greedy_rewards.append(tf.reshape(rl_fscore, [-1, 1])) sampling_rewards.append( tf.squeeze(tf.stack(_sampling_rewards, axis=1), axis=-1)) # (batch_size, k) greedy_rewards.append( tf.squeeze(tf.stack(_greedy_rewards, axis=1), axis=-1)) # (batch_size, k) if FLAGS.use_discounted_rewards: sampling_rewards = tf.stack(sampling_rewards) greedy_rewards = tf.stack(greedy_rewards) else: _sampling_rewards = [] _greedy_rewards = [] for _ in range(_hps.k): rl_fscore = rouge_l_fscore( tf.transpose(tf.stack(samples)[:, :, _]), target_batch) # shape (batch_size, 1) _sampling_rewards.append(tf.reshape(rl_fscore, [-1, 1])) rl_fscore = rouge_l_fscore( tf.transpose(tf.stack(greedy_search_samples)[:, :, _]), target_batch) # shape (batch_size, 1) _greedy_rewards.append(tf.reshape(rl_fscore, [-1, 1])) sampling_rewards = tf.squeeze(tf.stack(_sampling_rewards, axis=1), axis=-1) # (batch_size, k) greedy_rewards = tf.squeeze(tf.stack(_greedy_rewards, axis=1), axis=-1) # (batch_size, k) # If using coverage, reshape it if coverage is not None: coverage = array_ops.reshape(coverage, [batch_size, -1]) return (outputs, state, attn_dists, p_gens, coverage, vocab_scores, final_dists, samples, greedy_search_samples, temporal_e, sampling_rewards, greedy_rewards)
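The attention() closure above implements additive (Bahdanau-style) scoring with padding masks, plus the temporal normalisation from Paulus et al. (arXiv:1705.04304). The NumPy sketch below is only illustrative: the shapes, the prev_e history and the softmax helper are assumptions, not code lifted from the decoder.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

T, attn_size = 5, 8                                   # max_enc_steps, encoder state size
enc_states = np.random.randn(T, attn_size)            # h_i, one row per encoder step
dec_state = np.random.randn(attn_size)                # s_t, already sized to attn_size
W_h = np.random.randn(attn_size, attn_size)
W_s = np.random.randn(attn_size, attn_size)
v = np.random.randn(attn_size)
enc_padding_mask = np.array([1., 1., 1., 0., 0.])     # the last two steps are padding

# e_i = v^T tanh(W_h h_i + W_s s_t); mask out padding and re-normalise.
e = np.tanh(enc_states @ W_h + dec_state @ W_s) @ v   # (T,)
masked_e = softmax(e) * enc_padding_mask
attn_dist = masked_e / masked_e.sum()

# Temporal attention: divide by the attention mass each position received at
# earlier decoder steps, then re-normalise (prev_e is a made-up history here).
prev_e = np.abs(np.random.randn(3, T)) + 1e-10
temporal_dist = masked_e / prev_e.sum(axis=0)
temporal_dist = temporal_dist / temporal_dist.sum()

# Context vector: attention-weighted sum of the encoder states.
context_vector = attn_dist @ enc_states               # (attn_size,)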
def attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False): """RNN decoder with attention for the sequence-to-sequence model. In this context "attention" means that, during decoding, the RNN can look up information in the additional tensor attention_states, and it does this by focusing on a few entries from the tensor. This model has proven to yield especially good results in a number of sequence-to-sequence tasks. This implementation is based on http://arxiv.org/abs/1412.7449 (see below for details). It is recommended for complex sequence-to-sequence tasks. Args: decoder_inputs: A list of 2D Tensors [batch_size x input_size]. initial_state: 2D Tensor [batch_size x cell.state_size]. attention_states: 3D Tensor [batch_size x attn_length x attn_size]. cell: rnn_cell.RNNCell defining the cell function and size. output_size: Size of the output vectors; if None, we use cell.output_size. num_heads: Number of attention heads that read from attention_states. loop_function: If not None, this function will be applied to i-th output in order to generate i+1-th input, and decoder_inputs will be ignored, except for the first element ("GO" symbol). This can be used for decoding, but also for training to emulate http://arxiv.org/abs/1506.03099. Signature -- loop_function(prev, i) = next * prev is a 2D Tensor of shape [batch_size x output_size], * i is an integer, the step number (when advanced control is needed), * next is a 2D Tensor of shape [batch_size x input_size]. dtype: The dtype to use for the RNN initial state (default: tf.float32). scope: VariableScope for the created subgraph; default: "attention_decoder". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states -- useful when we wish to resume decoding from a previously stored decoder state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors of shape [batch_size x output_size]. These represent the generated outputs. Output i is computed from input i (which is either the i-th element of decoder_inputs or loop_function(output {i-1}, i)) as follows. First, we run the cell on a combination of the input and previous attention masks: cell_output, new_state = cell(linear(input, prev_attn), prev_state). Then, we calculate new attention masks: new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) and then we calculate the output: output = linear(cell_output, new_attn). state: The state of each decoder cell the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when num_heads is not positive, there are no inputs, shapes of attention_states are not set, or input size cannot be inferred from the input. """ if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 
attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) state = initial_state def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = array_ops.pack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = attention(initial_state) for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): inp = loop_function(prev, i) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + attns, input_size, True) # Run the RNN. cell_output, state = cell(x, state) # Run the attention mechanism. if i == 0 and initial_state_attention: with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + attns, output_size, True) if loop_function is not None: prev = output outputs.append(output) return outputs, state
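In the classic decoder above, W1 * h_t is computed for every encoder position at once by running a 1-by-1 convolution over the reshaped [batch, attn_length, 1, attn_size] tensor. A small NumPy sketch of why that is equivalent to applying the same matrix at every position; all sizes below are made up.

import numpy as np

batch, attn_length, attn_size, vec_size = 2, 7, 4, 4
hidden = np.random.randn(batch, attn_length, 1, attn_size)
k = np.random.randn(attn_size, vec_size)   # stands in for the [1, 1, attn_size, vec_size] kernel

# 1x1 "convolution": every (b, t) position is multiplied by the same kernel.
conv_like = np.einsum('btoa,av->btov', hidden, k)

# Equivalent flat matmul over all positions at once.
matmul = (hidden.reshape(-1, attn_size) @ k).reshape(batch, attn_length, 1, vec_size)
assert np.allclose(conv_like, matmul)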
def attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=None, scope=None, initial_state_attention=False): if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError( "With less than 1 heads, use a non-attention decoder.") if attention_states.get_shape()[2].value is None: raise ValueError("Shape[2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder", dtype=dtype) as scope: dtype = scope.dtype batch_size = array_ops.shape( decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value if attn_length is None: attn_length = array_ops.shape(attention_states)[1] attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] # TODO attention_vec_size = 100 #attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable( "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append( nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append( variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) state = initial_state def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. if nest.is_sequence(query): # If the query is a tuple, flatten it. query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = array_ops.stack([batch_size, attn_size]) attns = [ array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads) ] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = attention(initial_state) for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): inp = loop_function(prev, i) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from input: %s" % inp.name) x = linear([inp] + attns, input_size, True) # Run the RNN. cell_output, state = cell(x, state) # Run the attention mechanism. 
if i == 0 and initial_state_attention: with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=True): attns = attention(state) else: attns = attention(state) with variable_scope.variable_scope("AttnOutputProjection"): output = linear([cell_output] + attns, output_size, True) if loop_function is not None: prev = output outputs.append(output) return outputs, state
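This variant differs from the previous one mainly in that the query may be an LSTM-style state tuple, which is flattened and concatenated before the projection, and in that attention_vec_size is pinned to 100 by the TODO rather than derived from attn_size. A rough NumPy sketch of that query-preparation step, with invented sizes and an invented projection standing in for linear():

import numpy as np

batch, hidden, attention_vec_size = 3, 6, 100
c = np.random.randn(batch, hidden)                 # LSTM cell state
h = np.random.randn(batch, hidden)                 # LSTM hidden state

# nest.flatten + concat: the tuple state becomes one (batch, 2 * hidden) query.
query = np.concatenate([c, h], axis=1)

# linear(query, attention_vec_size, True) is a learned projection; W_q and b_q
# below are placeholders for it, not the decoder's variables.
W_q = np.random.randn(2 * hidden, attention_vec_size)
b_q = np.zeros(attention_vec_size)
y = query @ W_q + b_q                              # (batch, 100); the graph code
assert y.shape == (batch, attention_vec_size)      # reshapes this to (batch, 1, 1, 100)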
def attention_decoder(decoder_inputs, initial_for_state, initial_bac_state, attention_states, for_cell, bac_cell, maxout_size, output_size=None, num_heads=1, loop_function=None, embed_function=None, dtype=None, scope=None, embedding_size=620, initial_state_attention=False): """RNN decoder with attention for the sequence-to-sequence model. """ if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError( "With less than 1 heads, use a non-attention decoder.") if attention_states.get_shape()[2].value is None: raise ValueError("Shape[2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = for_cell.output_size with variable_scope.variable_scope(scope or "attention_decoder", dtype=dtype) as scope: dtype = scope.dtype batch_size = array_ops.shape( decoder_inputs[0])[0] # Needed for reshaping. # This is the number of encoders attn_length = attention_states.get_shape()[1].value if attn_length is None: attn_length = array_ops.shape(attention_states)[1] # This is the output dimension of each encoder attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. # hidden is just the attention_states reshaped to 4 dimensions hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) # hidden_features = [] # v = [] # Divide by two because attention_vec_size consists of both forward # and backward encoder attention_vec_size = attn_size // 2 # Size of query vectors for attention. # for a in range(num_heads): k = variable_scope.get_variable("AttnW_0", [1, 1, attn_size, attention_vec_size]) hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME") # v = variable_scope.get_variable("AttnV_0", [attention_vec_size]) # Set the end state of the reverse encoder as the initial state of decoder # Create a two layer RELU Feedforward network. # Uncomment below to not use FW # state = initial_bac_state # state = [None] * len(initial_state) state_size = initial_bac_state[0].get_shape()[1].value def attention(query): """Put attention masks on hidden using hidden_features and query.""" # ds = [] # Results of attention reads will be stored here. # if nest.is_sequence(query): # If the query is a tuple, flatten it. # query_list = nest.flatten(query) # for q in query_list: # Check that ndims == 2 if specified. # ndims = q.get_shape().ndims # if ndims: # assert ndims == 2 # query = array_ops.concat_v2(query_list, 1) # for a in range(num_heads): with variable_scope.variable_scope("Attention_0"): # y = linear(query, attention_vec_size, True) y = array_ops.reshape(query, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum(tf.multiply(y, hidden_features), [2, 3]) # s = math_ops.reduce_sum(v * math_ops.tanh(y*hidden_features), # [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. 
d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds = array_ops.reshape(d, [-1, attn_size]) return ds, a outputs = [] prev = None batch_attn_size = array_ops.pack([batch_size, attn_size]) # attns = [ # array_ops.zeros( # batch_attn_size, dtype=dtype) for _ in range(num_heads) # ] # state = initial_bac_state hidState = [None] * len(initial_for_state) state = [None] * len(initial_for_state) for i in range(len(initial_for_state)): with variable_scope.variable_scope("F_init_for_%d" % i): hidState[i] = tf.nn.relu(linear(initial_for_state[i], state_size, True, scope="Linear0"), name="relu0") state[i] = tf.nn.relu(linear(hidState[i], state_size, True, scope="Linear1"), name="relu1") # state[i] = tf.nn.relu(linear(y, state_size, True, scope="Linear1"), name="relu1") # state = for_cell.zero_state(batch_size, dtype) # for a in attns: # Ensure the second shape of attention vectors is set. # a.set_shape([None, attn_size]) # For first attention, use the input hidden state from backward decoder cell_output = state[0] contexts = [] for_output = [] collect_attn = [] with variable_scope.variable_scope("Decoder_For"): for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. # if loop_function is not None and prev is not None: # with variable_scope.variable_scope("loop_function", reuse=True): # inp = loop_function(prev, i) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] # if input_size.value is None: # raise ValueError("Could not infer input size from input: %s" % inp.name) # Below should almost always be false # x = linear([inp] + attns, input_size, True) # Run the RNN. context, a = attention(cell_output) contexts.append(context) collect_attn.append(a) inp_concat = tf.concat(1, [inp, context]) cell_output, state = for_cell(inp_concat, state) for_output.append(cell_output) # Run the attention mechanism. # with variable_scope.variable_scope("AttnOutputProjection"): # # attns is a list of heads. 
Here I just have one though # output = linear([cell_output] + attns, maxout_size, True) # output is t # output = maxout(t_tilda, maxout_size) # output = linear(t, output_size, True) # if loop_function is not None: # prev = output # outputs.append(output) # state = initial_for_state # state = initial_for_state hidState = [None] * len(initial_bac_state) state = [None] * len(initial_bac_state) for i in range(len(initial_bac_state)): with variable_scope.variable_scope("F_init_bac_%d" % i): hidState[i] = tf.nn.relu(linear(initial_bac_state[i], state_size, True, scope="Linear0"), name="relu0") state[i] = tf.nn.relu(linear(hidState[i], state_size, True, scope="Linear1"), name="relu1") # state[i] = tf.nn.relu(linear(y, state_size, True, scope="Linear1"), name="relu1") # state = bac_cell.zero_state(batch_size, dtype) bac_output = [] with variable_scope.variable_scope("Decoder_Back"): # for i, (inp, out) in enumerate(zip(reversed(input_attn[2:]), reversed(output_attn[:-2]))): for i, (inp, context) in enumerate( zip(reversed(decoder_inputs[2:]), reversed(contexts[:-2]))): if i > 0: variable_scope.get_variable_scope().reuse_variables() inp_concat = tf.concat(1, [inp, context]) cell_output, state = bac_cell(inp_concat, state) bac_output.insert(0, cell_output) q_vec = [] with variable_scope.variable_scope("OutputProjection"): for i, (for_out, bac_out, context) in enumerate(zip(for_output, bac_output, contexts)): # for i, (inp, out) in enumerate(zip(reversed(input_attn[2:]), reversed(output_attn[:-2]))): if i > 0: variable_scope.get_variable_scope().reuse_variables() t_tilda = linear(tf.concat(1, [ for_out, bac_out, decoder_inputs[i], decoder_inputs[i + 2], context ]), 2 * maxout_size, True, scope="sj") # temp2 = linear(tf.concat([decoder_inputs[i], decoder_inputs[i+2]]), 2*maxout_size, True, scope="ey") # temp3 = linear(context, 2*maxout_size, True, scope="ctxt") # t_tilda = temp + temp2 + temp3 t_output = maxout(t_tilda, maxout_size) output = linear(t_output, embedding_size, True, scope="t_tilda2") q_vec.append(output) # q_vec = q_vec[::-1] # print('collect_attn', len(collect_attn)) return q_vec, state, collect_attn
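The output projection in this bidirectional decoder concatenates the forward state, the backward state, the neighbouring inputs and the context, projects to 2 * maxout_size, and then applies a maxout unit. The maxout() helper is not defined in this file, so the sketch below only assumes the usual pool-size-2 formulation.

import numpy as np

def maxout(x, num_units):
    # Reduce groups of features with max(); 2 * num_units inputs -> num_units outputs.
    batch, features = x.shape
    assert features % num_units == 0
    return x.reshape(batch, num_units, features // num_units).max(axis=2)

t_tilda = np.random.randn(4, 2 * 5)      # (batch, 2 * maxout_size)
t_output = maxout(t_tilda, 5)            # (batch, maxout_size)
assert t_output.shape == (4, 5)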
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias, strides, padding, activation_mode, data_format, filter_format, dtype): """Verifies the output values of the convolution function. Args: tensor_in_sizes: Input tensor dimensions in [batch, input_rows, input_cols, input_depth]. filter_in_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols, input_depth, output_depth]. bias: 1-D bias tensor of length output_depth. strides: Stride: [col_stride, row_stride] padding: Padding type. activation_mode: Activation mode. data_format: Format of the data tensors. filter_format: Filter format to use for the fused convolution. dtype: Data type for inputs and outputs. Returns: Symbolic tensor value and reference value that can be used to execute the computation and verify the results. """ input_size = np.prod(tensor_in_sizes) filter_size = np.prod(filter_in_sizes) bias_size = filter_in_sizes[-1] # equals to output depth # Initializes the input tensor with array containing incrementing # numbers from 1. x1 = [f * 1.0 for f in range(1, input_size + 1)] x2 = [f * 1.0 for f in range(1, filter_size + 1)] # This is to guarantee that there are always negative values after # bias add so that we can test whether relu works correctly. x3 = bias t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype) t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype) fused_t2 = t2 if filter_format == "OIHW": fused_t2 = _HwioToOihw(t2) t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype) strides = [1] + strides + [1] if data_format == "NCHW": t1 = test_util.NHWCToNCHW(t1) strides = test_util.NHWCToNCHW(strides) output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( t1, fused_t2, t3, strides=strides, padding=padding, data_format=data_format, filter_format=filter_format, activation_mode=activation_mode) ref_conv_output = nn_ops.conv2d(t1, t2, strides=strides, padding=padding, data_format=data_format) ref_bias_output = nn_ops.bias_add(ref_conv_output, t3, data_format=data_format) ref_output = nn_ops.relu(ref_bias_output) if data_format == "NCHW": output = test_util.NCHWToNHWC(output) ref_output = test_util.NCHWToNHWC(ref_output) return output, ref_output
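For reference, the unfused path this test compares against is relu(bias_add(conv2d(x, w), b)). Below is a small NumPy sketch of that composition for a stride-1 VALID convolution in NHWC layout; the concrete shapes and bias values are illustrative only.

import numpy as np

def conv2d_valid(x, w):
    # x: (N, H, W, Cin), w: (kh, kw, Cin, Cout); stride 1, VALID padding.
    n, h, wd, cin = x.shape
    kh, kw, _, cout = w.shape
    out = np.zeros((n, h - kh + 1, wd - kw + 1, cout))
    for i in range(out.shape[1]):
        for j in range(out.shape[2]):
            patch = x[:, i:i + kh, j:j + kw, :]                     # (N, kh, kw, Cin)
            out[:, i, j, :] = np.tensordot(patch, w, axes=([1, 2, 3], [0, 1, 2]))
    return out

x = np.arange(1, 1 + 1 * 4 * 4 * 2, dtype=float).reshape(1, 4, 4, 2)  # incrementing inputs
w = np.arange(1, 1 + 2 * 2 * 2 * 3, dtype=float).reshape(2, 2, 2, 3)
b = np.array([-1000.0, 0.0, 1000.0])                                   # bias, then ReLU
ref = np.maximum(conv2d_valid(x, w) + b, 0.0)                          # relu(bias_add(conv(x, w), b))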