def construct_merger(self, n_spatial_dims, n_channels,
                     patch_shape, patch_cnn_spec, patch_mlp_spec,
                     merge_mlp_spec, response_mlp_spec,
                     batch_normalize, batch_normalize_patch,
                     task_name, hyperparameters, **kwargs):
    # construct patch interpretation network
    patch_transforms = []
    if task_name == "featurelevel_ucf101":
        # feature-level UCF-101: channel count is fixed by the precomputed features
        n_channels = 512 + 4096
        shape = self.cropper.output_shape
    else:
        if patch_cnn_spec == "pretrained":
            import pretrained
            patch_transforms.append(
                pretrained.get_patch_transform(**hyperparameters))
            shape = patch_transforms[-1].get_dim("output")
        elif patch_cnn_spec:
            patch_transforms.append(masonry.construct_cnn(
                name="patch_cnn",
                layer_specs=patch_cnn_spec,
                input_shape=patch_shape,
                n_channels=n_channels,
                batch_normalize=batch_normalize_patch))
            shape = patch_transforms[-1].get_dim("output")
        else:
            # no patch CNN: the flattener sees the raw cropped patch
            shape = (n_channels,) + tuple(patch_shape)
    patch_transforms.append(bricks.FeedforwardFlattener(input_shape=shape))
    if patch_mlp_spec:
        patch_transforms.append(masonry.construct_mlp(
            name="patch_mlp",
            hidden_dims=patch_mlp_spec,
            input_dim=patch_transforms[-1].output_dim,
            weights_init=initialization.Orthogonal(),
            biases_init=initialization.Constant(0),
            batch_normalize=batch_normalize_patch))
    self.patch_transform = bricks.FeedforwardSequence(
        [brick.apply for brick in patch_transforms], name="ffs")

    # construct theta interpretation network
    self.merge_mlp = masonry.construct_mlp(
        name="merge_mlp",
        input_dim=2 * n_spatial_dims,
        hidden_dims=merge_mlp_spec,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0),
        batch_normalize=batch_normalize)

    self.response_mlp = masonry.construct_mlp(
        name="response_mlp",
        hidden_dims=response_mlp_spec,
        input_dim=self.patch_transform.output_dim + self.merge_mlp.output_dim,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0),
        batch_normalize=batch_normalize)

    self.children.extend(
        [self.patch_transform, self.merge_mlp, self.response_mlp])
def __init__(self, hidden_dim, cropper, attention_state_name,
             hyperparameters, **kwargs):
    super(RecurrentAttentionModel, self).__init__(**kwargs)

    self.rnn = bricks.RecurrentStack(
        [bricks.LSTM(activation=bricks.Tanh(), dim=hidden_dim),
         bricks.LSTM(activation=bricks.Tanh(), dim=hidden_dim)],
        weights_init=initialization.NormalizedInitialization(
            initialization.IsotropicGaussian()),
        biases_init=initialization.Constant(0))

    # name of the RNN state that determines the parameters of the next glimpse
    self.attention_state_name = attention_state_name

    self.cropper = cropper
    self.construct_locator(**hyperparameters)
    self.construct_merger(**hyperparameters)

    self.embedder = bricks.Linear(
        name="embedder",
        input_dim=self.response_mlp.output_dim,
        # the LSTM expects inputs of dimension 4 * states (one slab per gate)
        output_dim=4 * self.rnn.get_dim("states"),
        use_bias=True,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(1))

    # don't let blocks touch my children
    self.initialization_config_pushed = True

    self.children.extend([self.rnn, self.cropper, self.embedder])

    # states aren't known until now
    self.apply.outputs = self.rnn.apply.outputs
    self.compute_initial_state.outputs = self.rnn.apply.outputs
def construct_locator(self, locate_mlp_spec, n_spatial_dims,
                      location_std, scale_std, batch_normalize, **kwargs):
    self.n_spatial_dims = n_spatial_dims

    self.locate_mlp = masonry.construct_mlp(
        name="locate_mlp",
        input_dim=self.get_dim(self.attention_state_name),
        hidden_dims=locate_mlp_spec,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0),
        batch_normalize=batch_normalize)
    self.theta_from_area = bricks.Linear(
        input_dim=self.locate_mlp.output_dim,
        output_dim=2 * n_spatial_dims,
        name="theta_from_area",
        # normalize columns because the fan-in is large
        weights_init=initialization.NormalizedInitialization(
            initialization.IsotropicGaussian()),
        # initialize location biases to zero and scale biases to one
        # so the model will zoom in by default
        biases_init=initialization.Constant(np.array(
            [0.] * n_spatial_dims + [1.] * n_spatial_dims)))

    self.T_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(12345)
    self.location_std = location_std
    self.scale_std = scale_std

    self.children.extend([self.locate_mlp, self.theta_from_area])
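# Hedged sketch (not in the original source): one plausible way the locator's
# pieces fit together at glimpse time. Splitting theta into (location, scale)
# halves is an assumption inferred from the bias initialization above, and
# _sample_theta_sketch is a hypothetical helper name.
def _sample_theta_sketch(self, h):
    theta = self.theta_from_area.apply(self.locate_mlp.apply(h))
    location = theta[:, :self.n_spatial_dims]
    scale = theta[:, self.n_spatial_dims:]
    # exploration noise, using the stds stored by construct_locator
    location += self.T_rng.normal(location.shape, std=self.location_std)
    scale += self.T_rng.normal(scale.shape, std=self.scale_std)
    return location, scale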
def construct_cnn(name, layer_specs, n_channels, input_shape, batch_normalize):
    ndim = len(input_shape)
    conv_module = {
        2: conv2d,
        3: conv3d,
    }[ndim]
    cnn = conv_module.ConvolutionalSequence(
        name=name,
        layers=[construct_cnn_layer(name="%s_%i" % (name, i),
                                    layer_spec=layer_spec,
                                    ndim=ndim,
                                    conv_module=conv_module,
                                    batch_normalize=batch_normalize)
                for i, layer_spec in enumerate(layer_specs)],
        num_channels=n_channels,
        image_size=tuple(input_shape),
        weights_init=initialization.ConvolutionalInitialization(
            initialization.Orthogonal()),
        # our activation function will handle the bias
        use_bias=False)
    # ensure output dim is determined
    cnn.push_allocation_config()
    # tell the activations what shapes they'll be dealing with
    for layer in cnn.layers:
        activation = util.get_conv_activation(layer, conv_module)
        assert isinstance(activation, bricks.NormalizedActivation)
        activation.shape = layer.get_dim("output")
        activation.broadcastable = [False] + ndim * [True]
    cnn.initialize()
    return cnn
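# Hedged usage sketch for construct_cnn; the layer_specs values here are made
# up for illustration, but the keys ("type", "size", "step", "num_filters",
# "border_mode") match what construct_cnn_layer pops below.
example_cnn = construct_cnn(
    name="example_cnn",
    layer_specs=[
        dict(type="conv", size=(5, 5), num_filters=32, border_mode=(2, 2)),
        dict(type="pool", size=(2, 2), step=(2, 2)),
    ],
    n_channels=3,
    input_shape=(28, 28),  # a 2D input shape selects the conv2d module
    batch_normalize=True)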
def __init__(self, hidden_dim, cropper, attention_state_name,
             hyperparameters, **kwargs):
    # we're no longer a brick, but we still need to make sure we
    # initialize everything
    self.children = []

    self.rnn = bricks.RecurrentStack(
        [bricks.LSTM(activation=bricks.Tanh(), dim=hidden_dim),
         bricks.LSTM(activation=bricks.Tanh(), dim=hidden_dim)],
        weights_init=initialization.NormalizedInitialization(
            initialization.IsotropicGaussian()),
        biases_init=initialization.Constant(0))

    # name of the RNN state that determines the parameters of the next glimpse
    self.attention_state_name = attention_state_name

    self.cropper = cropper
    self.construct_locator(**hyperparameters)
    self.construct_merger(**hyperparameters)

    self.embedder = bricks.Linear(
        name="embedder",
        input_dim=self.response_mlp.output_dim,
        output_dim=self.rnn.get_dim("inputs"),
        use_bias=True,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0))

    self.children.extend([self.rnn, self.cropper, self.embedder])
def construct_cnn_layer(name, layer_spec, conv_module, ndim, batch_normalize):
    type_ = layer_spec.pop("type", "conv")
    if type_ == "pool":
        layer = conv_module.MaxPooling(
            name=name,
            pooling_size=layer_spec.pop("size", (1,) * ndim),
            step=layer_spec.pop("step", (1,) * ndim))
    elif type_ == "conv":
        border_mode = layer_spec.pop("border_mode", (0,) * ndim)
        if not isinstance(border_mode, basestring):
            # conv bricks barf on list-type shape arguments :/
            border_mode = tuple(border_mode)
        activation = bricks.NormalizedActivation(
            name="activation",
            batch_normalize=batch_normalize)
        layer = conv_module.ConvolutionalActivation(
            name=name,
            activation=activation.apply,
            filter_size=tuple(layer_spec.pop("size", (1,) * ndim)),
            step=tuple(layer_spec.pop("step", (1,) * ndim)),
            num_filters=layer_spec.pop("num_filters", 1),
            border_mode=border_mode,
            # our activation function will handle the bias
            use_bias=False)
        # sigh. really REALLY do not use biases
        layer.convolution.use_bias = False
        # pooling layers have no .convolution, so this stays in the conv branch
        layer.convolution.weights_init = initialization.ConvolutionalInitialization(
            initialization.Orthogonal())
        layer.convolution.biases_init = initialization.Constant(0)
    if layer_spec:
        logger.warn("ignoring unknown layer specification keys [%s]"
                    % " ".join(layer_spec.keys()))
    return layer
def __init__(self, input_dim, n_classes, batch_normalize):
    self.input_dim = input_dim
    self.n_classes = n_classes
    # two-layer classifier: a bottleneck hidden layer, then raw class logits
    self.mlp = masonry.construct_mlp(
        name="mlp",
        activations=[None, bricks.Identity()],
        input_dim=input_dim,
        hidden_dims=[input_dim / 2, self.n_classes],
        batch_normalize=batch_normalize,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0))
    self.softmax = bricks.Softmax()
    self.children = [self.mlp, self.softmax]
def __init__(self, input_dim, n_classes, batch_normalize):
    self.input_dim = input_dim
    self.n_classes = n_classes
    # one softmax emitter per target, each with its own class count
    # TODO: use TensorLinear or some such
    self.emitters = [
        masonry.construct_mlp(
            activations=[None, bricks.Identity()],
            input_dim=input_dim,
            hidden_dims=[input_dim / 2, n],
            name="mlp_%i" % i,
            batch_normalize=batch_normalize,
            weights_init=initialization.Orthogonal(),
            biases_init=initialization.Constant(0))
        for i, n in enumerate(self.n_classes)]
    self.softmax = bricks.Softmax()
    self.children = self.emitters + [self.softmax]
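# Hedged sketch: how the two emitter constructors above might be invoked.
# SingleSoftmaxEmitter and MultiSoftmaxEmitter are hypothetical names for the
# enclosing classes, which this excerpt does not show.
single = SingleSoftmaxEmitter(input_dim=1024, n_classes=10,
                              batch_normalize=True)
# the multi-headed variant takes one class count per target
multi = MultiSoftmaxEmitter(input_dim=1024, n_classes=[101, 2],
                            batch_normalize=True)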
def construct_mlp(name, hidden_dims, input_dim, batch_normalize,
                  activations=None, weights_init=None, biases_init=None):
    if not hidden_dims:
        return bricks.FeedforwardIdentity(dim=input_dim)

    if not activations:
        activations = [bricks.Rectifier() for dim in hidden_dims]
    elif not isinstance(activations, collections.Iterable):
        activations = [activations] * len(hidden_dims)
    assert len(activations) == len(hidden_dims)

    if not weights_init:
        weights_init = initialization.Orthogonal()
    if not biases_init:
        biases_init = initialization.Constant(0)

    dims = [input_dim] + hidden_dims
    wrapped_activations = [
        bricks.NormalizedActivation(
            shape=[hidden_dim],
            name="activation_%i" % i,
            batch_normalize=batch_normalize,
            activation=activation)
        for i, (hidden_dim, activation)
        in enumerate(zip(hidden_dims, activations))]
    mlp = bricks.MLP(
        name=name,
        activations=wrapped_activations,
        # biases are handled by our activation function
        use_bias=False,
        dims=dims,
        weights_init=weights_init,
        biases_init=biases_init)
    return mlp
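# Hedged usage sketch for construct_mlp: a two-layer rectifier MLP with
# illustrative dimensions. Passing hidden_dims=[] would instead return a
# FeedforwardIdentity passthrough.
example_mlp = construct_mlp(
    name="example_mlp",
    hidden_dims=[256, 128],
    input_dim=784,
    batch_normalize=True)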
def construct_merger(self, n_spatial_dims, n_channels,
                     patch_shape, response_dim, patch_cnn_spec,
                     patch_mlp_spec, merge_mlp_spec, response_mlp_spec,
                     batch_normalize, batch_normalize_patch, **kwargs):
    # construct patch interpretation network
    patch_transforms = []
    if patch_cnn_spec:
        patch_transforms.append(masonry.construct_cnn(
            name="patch_cnn",
            layer_specs=patch_cnn_spec,
            input_shape=patch_shape,
            n_channels=n_channels,
            batch_normalize=batch_normalize_patch))
        shape = patch_transforms[-1].get_dim("output")
    else:
        shape = (n_channels,) + tuple(patch_shape)
    patch_transforms.append(bricks.FeedforwardFlattener(input_shape=shape))
    if patch_mlp_spec:
        patch_transforms.append(masonry.construct_mlp(
            name="patch_mlp",
            hidden_dims=patch_mlp_spec,
            input_dim=patch_transforms[-1].output_dim,
            weights_init=initialization.Orthogonal(),
            biases_init=initialization.Constant(0),
            batch_normalize=batch_normalize_patch))
    self.patch_transform = bricks.FeedforwardSequence(
        [brick.apply for brick in patch_transforms], name="ffs")

    # construct theta interpretation network
    self.merge_mlp = masonry.construct_mlp(
        name="merge_mlp",
        input_dim=2 * n_spatial_dims,
        hidden_dims=merge_mlp_spec,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0),
        batch_normalize=batch_normalize)

    # construct what-where merger network
    self.response_merge = bricks.Merge(
        input_names="area patch".split(),
        input_dims=[self.merge_mlp.output_dim,
                    self.patch_transform.output_dim],
        output_dim=response_dim,
        prototype=bricks.Linear(
            use_bias=False,
            weights_init=initialization.Orthogonal(),
            biases_init=initialization.Constant(0)),
        child_prefix="response_merge")
    self.response_merge_activation = bricks.NormalizedActivation(
        shape=[response_dim],
        name="response_merge_activation",
        batch_normalize=batch_normalize)
    self.response_mlp = masonry.construct_mlp(
        name="response_mlp",
        hidden_dims=response_mlp_spec,
        input_dim=response_dim,
        weights_init=initialization.Orthogonal(),
        biases_init=initialization.Constant(0),
        batch_normalize=batch_normalize)

    self.children.extend([
        self.response_merge_activation,
        self.response_merge,
        self.patch_transform,
        self.merge_mlp,
        self.response_mlp])
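# Hedged sketch of a hyperparameters dict as consumed (via **hyperparameters)
# by construct_locator and construct_merger above. The keys come from those
# signatures; every value here is illustrative only.
example_hyperparameters = dict(
    n_spatial_dims=2,
    n_channels=3,
    patch_shape=(24, 24),
    response_dim=512,
    patch_cnn_spec=[dict(type="conv", size=(3, 3), num_filters=16)],
    patch_mlp_spec=[512],
    merge_mlp_spec=[32],
    response_mlp_spec=[512],
    locate_mlp_spec=[64],
    location_std=0.1,
    scale_std=0.1,
    batch_normalize=True,
    batch_normalize_patch=True)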