def ExpectScaledSize(self, spec, target_shape, factor=1):
  """Tests that the output of the graph of the given spec has target_shape."""
  with tf.Graph().as_default():
    with self.test_session() as sess:
      self.SetupInputs()
      # Only the placeholders are given at construction time.
      vgsl = vgslspecs.VGSLSpecs(self.ph_widths, self.ph_heights, True)
      outputs = vgsl.Build(self.ph_image, spec)
      # Compute the expected output widths from the given scale factor.
      target_widths = tf.div(self.in_widths, factor).eval()
      target_heights = tf.div(self.in_heights, factor).eval()
      # Run with the 'real' data.
      tf.global_variables_initializer().run()
      res_image, res_widths, res_heights = sess.run(
          [outputs, vgsl.GetLengths(2), vgsl.GetLengths(1)],
          feed_dict={
              self.ph_image: self.in_image,
              self.ph_widths: self.in_widths,
              self.ph_heights: self.in_heights
          })
      self.assertEqual(tuple(res_image.shape), target_shape)
      if target_shape[1] > 1:
        self.assertEqual(tuple(res_heights), tuple(target_heights))
      if target_shape[2] > 1:
        self.assertEqual(tuple(res_widths), tuple(target_widths))
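# A minimal usage sketch, not from the original test suite: the spec string,
# target shape, and the assumption that SetupInputs() builds a (4, 32, 32, 1)
# greyscale batch are all illustrative.
def testExampleConvPoolScaling(self):
  """Sketch: Cr5,5,16 is a same-size conv to depth 16; Mp2,2 halves h and w.

  Both spatial dimensions shrink by 2, so the expected widths/heights are
  derived with factor=2 and the target shape is (4, 16, 16, 16).
  """
  self.ExpectScaledSize('[Cr5,5,16 Mp2,2]', (4, 16, 16, 16), factor=2)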
def Build(self, input_pattern, input_spec, model_spec, output_spec,
          optimizer_type, num_preprocess_threads, reader):
  """Builds the model from the separate input/layers/output spec strings.

  Args:
    input_pattern: File pattern of the data in tfrecords of TF Example format.
    input_spec: Specification of the input layer:
      batchsize,height,width,depth (4 comma-separated integers)
        Training will run with batches of batchsize images, but runtime can
        use any batch size.
        height and/or width can be 0 or -1, indicating variable size,
        otherwise all images must be the given size.
        depth must be 1 or 3 to indicate greyscale or color.
        NOTE: 1-d image input, treating the y image dimension as depth, can
        be achieved using S1(1x0)1,3 as the first op in the model_spec, but
        the y-size of the input must then be fixed.
    model_spec: Model definition. See vgslspecs.py.
    output_spec: Output layer definition:
      O(2|1|0)(l|s|c)n output layer with n classes.
        2 (heatmap) Output is a 2-d vector map of the input (possibly at a
          different scale).
        1 (sequence) Output is a 1-d sequence of vector values.
        0 (value) Output is a 0-d single vector value.
        l uses a logistic non-linearity on the output, allowing multiple
          hot elements in any output vector value.
        s uses a softmax non-linearity, with one-hot output in each value.
        c uses a softmax with CTC. Can only be used with 1 (sequence).
        NOTE: Only O1s and O1c are currently supported.
    optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
    num_preprocess_threads: Number of threads to use for image processing.
    reader: Function that returns an actual reader to read Examples from
      input files. If None, uses tf.TFRecordReader().
  """
  self.global_step = tf.Variable(0, name='global_step', trainable=False)
  shape = _ParseInputSpec(input_spec)
  out_dims, out_func, num_classes = _ParseOutputSpec(output_spec)
  self.using_ctc = out_func == 'c'
  images, heights, widths, labels, sparse, _ = vgsl_input.ImageInput(
      input_pattern, num_preprocess_threads, shape, self.using_ctc, reader)
  self.labels = labels
  self.sparse_labels = sparse
  self.layers = vgslspecs.VGSLSpecs(widths, heights, self.mode == 'train')
  last_layer = self.layers.Build(images, model_spec)
  self._AddOutputs(last_layer, out_dims, out_func, num_classes)
  if self.mode == 'train':
    self._AddOptimizer(optimizer_type)

  # For saving the model across training and evaluation.
  self.saver = tf.train.Saver()
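# A minimal usage sketch, assuming 'model' is a VGSLImageModel constructed
# with mode='train'; every literal below (file pattern, spec strings, class
# count) is an illustrative assumption, not a value from this codebase.
def _ExampleBuildCall(model):
  """Sketch only: wiring Build for a CTC sequence model on greyscale input."""
  model.Build(
      input_pattern='/tmp/data/train-*',  # hypothetical tfrecords pattern
      input_spec='32,0,0,1',  # batch 32, variable height/width, greyscale
      model_spec='[Cr5,5,16 Mp3,3 Lfys64 Lbx100]',  # syntax per vgslspecs.py
      output_spec='O1c105',  # 1-d sequence output, softmax + CTC, 105 classes
      optimizer_type='Adam',
      num_preprocess_threads=4,
      reader=None)  # None -> tf.TFRecordReader()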