def get_initial_state(self, x): input_shape = self.input_spec[0].shape init_nb_row = input_shape[self.row_axis] init_nb_col = input_shape[self.column_axis] base_initial_state = K.zeros_like( x) # (samples, timesteps) + image_shape non_channel_axis = -1 if self.data_format == 'channels_first' else -2 for _ in range(2): base_initial_state = K.sum(base_initial_state, axis=non_channel_axis) base_initial_state = K.sum(base_initial_state, axis=1) # (samples, nb_channels) initial_states = [] states_to_pass = ['r', 'c', 'e'] nlayers_to_pass = {u: self.nb_layers for u in states_to_pass} if self.extrap_start_time is not None: states_to_pass.append( 'ahat' ) # pass prediction in states so can use as actual for t+1 when extrapolating nlayers_to_pass['ahat'] = 1 for u in states_to_pass: for l in range(nlayers_to_pass[u]): ds_factor = 2**l nb_row = init_nb_row // ds_factor nb_col = init_nb_col // ds_factor if u in ['r', 'c']: stack_size = self.R_stack_sizes[l] elif u == 'e': stack_size = 2 * self.stack_sizes[l] elif u == 'ahat': stack_size = self.stack_sizes[l] output_size = stack_size * nb_row * nb_col # flattened size reducer = K.zeros((input_shape[self.channel_axis], output_size)) # (nb_channels, output_size) initial_state = K.dot(base_initial_state, reducer) # (samples, output_size) if self.data_format == 'channels_first': output_shp = (-1, stack_size, nb_row, nb_col) else: output_shp = (-1, nb_row, nb_col, stack_size) initial_state = K.reshape(initial_state, output_shp) initial_states += [initial_state] if K._BACKEND == 'theano': from theano import tensor as T # There is a known issue in the Theano scan op when dealing with inputs whose shape is 1 along a dimension. # In our case, this is a problem when training on grayscale images, and the below line fixes it. initial_states = [ T.unbroadcast(init_state, 0, 1) for init_state in initial_states ] if self.extrap_start_time is not None: initial_states += [ K.variable(0, int if K.backend() != 'tensorflow' else 'int32') ] # the last state will correspond to the current timestep return initial_states
def step(self, inputs, states): states = list(states) if self.teacher_force: readout = states.pop() ground_truth = states.pop() assert K.ndim(ground_truth) == 3, K.ndim(ground_truth) counter = states.pop() if K.backend() == 'tensorflow': with tf.control_dependencies(None): zero = K.cast(K.zeros((1, ))[0], 'int32') one = K.cast(K.zeros((1, ))[0], 'int32') else: zero = K.cast(K.zeros((1, ))[0], 'int32') one = K.cast(K.zeros((1, ))[0], 'int32') slices = [ slice(None), counter[0] - K.switch(counter[0], one, zero) ] + [slice(None)] * (K.ndim(ground_truth) - 2) ground_truth_slice = ground_truth[slices] readout = K.in_train_phase( K.switch(counter[0], ground_truth_slice, readout), readout) states.append(readout) if self.decode: model_input = states else: model_input = [inputs] + states shapes = [] for x in model_input: if hasattr(x, '_keras_shape'): shapes.append(x._keras_shape) del x._keras_shape # Else keras internals will get messed up. model_output = _to_list(self.model.call(model_input)) for x, s in zip(model_input, shapes): setattr(x, '_keras_shape', s) if self.decode: model_output.insert(1, model_input[0]) for tensor in model_output: tensor._uses_learning_phase = self.uses_learning_phase states = model_output[1:] output = model_output[0] if self.readout: states += [output] if self.teacher_force: states.insert(-1, counter + 1) states.insert(-1, ground_truth) return output, states
def get_initial_state(self, inputs): if type(self.model.input) is not list: return [] try: batch_size = K.int_shape(inputs)[0] except: batch_size = None state_shapes = list(map(K.int_shape, self.model.input[1:])) states = [] if self.readout: state_shapes.pop() # default value for initial_readout is handled in call() for shape in state_shapes: if None in shape[1:]: raise Exception( 'Only the batch dimension of a state can be left unspecified. Got state with shape ' + str(shape)) if shape[0] is None: ndim = K.ndim(inputs) z = K.zeros_like(inputs) slices = [slice(None)] + [0] * (ndim - 1) z = z[slices] # (batch_size,) state_ndim = len(shape) z = K.reshape(z, (-1, ) + (1, ) * (state_ndim - 1)) z = K.tile(z, (1, ) + tuple(shape[1:])) states.append(z) else: states.append(K.zeros(shape)) state_initializer = self.state_initializer if state_initializer: # some initializers don't accept symbolic shapes for i in range(len(state_shapes)): if state_shapes[i][0] is None: if hasattr(self, 'batch_size'): state_shapes[i] = ( self.batch_size, ) + state_shapes[i][1:] if None in state_shapes[i]: state_shapes[i] = K.shape(states[i]) num_state_init = len(state_initializer) num_state = self.num_states assert num_state_init == num_state, 'RNN has ' + str( num_state) + ' states, but was provided ' + str( num_state_init) + ' state initializers.' for i in range(len(states)): init = state_initializer[i] shape = state_shapes[i] try: if not isinstance(init, initializers.Zeros): states[i] = init(shape) except: raise Exception( 'Seems the initializer ' + init.__class__.__name__ + ' does not support symbolic shapes(' + str(shape) + '). Try providing the full input shape (include batch dimension) for you RecurrentModel.' ) return states
def __init__(self, name=None, **kwargs): if not name: prefix = 'optional_input_placeholder' name = prefix + '_' + str(K.get_uid(prefix)) kwargs['batch_input_shape'] = (2, ) super(_OptionalInputPlaceHolder, self).__init__(**kwargs) self.tensor = K.zeros(shape=(2, )) self.tensor._keras_shape = (2, ) self.tensor._uses_learning_phase = False self.tensor._keras_history = (self, 0, 0) Node(self, inbound_layers=[], node_indices=[], tensor_indices=[], input_tensors=[], output_tensors=[self.tensor], input_masks=[None], output_masks=[None], input_shapes=[], output_shapes=[(2, )]) self.build((2, ))
def call(self, inputs, initial_state=None, initial_readout=None, ground_truth=None, mask=None, training=None): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. if type(mask) is list: mask = mask[0] if self.model is None: raise Exception('Empty RecurrentModel.') num_req_states = self.num_states if self.readout: num_actual_states = num_req_states - 1 else: num_actual_states = num_req_states if type(inputs) is list: inputs_list = inputs[:] inputs = inputs_list.pop(0) initial_states = inputs_list[:num_actual_states] if len(initial_states) > 0: if self._is_optional_input_placeholder(initial_states[0]): initial_states = self.get_initial_state(inputs) inputs_list = inputs_list[num_actual_states:] if self.readout: initial_readout = inputs_list.pop(0) if self.teacher_force: ground_truth = inputs_list.pop() else: if initial_state is not None: if not isinstance(initial_state, (list, tuple)): initial_states = [initial_state] else: initial_states = list(initial_state) if self._is_optional_input_placeholder(initial_states[0]): initial_states = self.get_initial_state(inputs) elif self.stateful: initial_states = self.states else: initial_states = self.get_initial_state(inputs) if self.readout: if initial_readout is None or self._is_optional_input_placeholder( initial_readout): output_shape = K.int_shape(_to_list((self.model.output))[0]) output_ndim = len(output_shape) input_ndim = K.ndim(inputs) initial_readout = K.zeros_like(inputs) slices = [slice(None)] + [0] * (input_ndim - 1) initial_readout = initial_readout[slices] # (batch_size,) initial_readout = K.reshape(initial_readout, (-1, ) + (1, ) * (output_ndim - 1)) initial_readout = K.tile(initial_readout, (1, ) + tuple(output_shape[1:])) initial_states.append(initial_readout) if self.teacher_force: if ground_truth is None or self._is_optional_input_placeholder( ground_truth): raise Exception( 'ground_truth must be provided for RecurrentModel with teacher_force=True.' ) if K.backend() == 'tensorflow': with tf.control_dependencies(None): counter = K.zeros((1, )) else: counter = K.zeros((1, )) counter = K.cast(counter, 'int32') initial_states.insert(-1, counter) initial_states[-2] initial_states.insert(-1, ground_truth) num_req_states += 2 if len(initial_states) != num_req_states: raise ValueError('Layer requires ' + str(num_req_states) + ' states but was passed ' + str(len(initial_states)) + ' initial states.') input_shape = K.int_shape(inputs) if self.unroll and input_shape[1] is None: raise ValueError('Cannot unroll a RNN if the ' 'time dimension is undefined. \n' '- If using a Sequential model, ' 'specify the time dimension by passing ' 'an `input_shape` or `batch_input_shape` ' 'argument to your first layer. If your ' 'first layer is an Embedding, you can ' 'also use the `input_length` argument.\n' '- If using the functional API, specify ' 'the time dimension by passing a `shape` ' 'or `batch_shape` argument to your Input layer.') preprocessed_input = self.preprocess_input(inputs, training=None) constants = self.get_constants(inputs, training=None) if self.decode: initial_states.insert(0, inputs) preprocessed_input = K.zeros((1, self.output_length, 1)) input_length = self.output_length else: input_length = input_shape[1] if self.uses_learning_phase: with learning_phase_scope(0): last_output_test, outputs_test, states_test, updates = rnn( self.step, preprocessed_input, initial_states, go_backwards=self.go_backwards, mask=mask, constants=constants, unroll=self.unroll, input_length=input_length) with learning_phase_scope(1): last_output_train, outputs_train, states_train, updates = rnn( self.step, preprocessed_input, initial_states, go_backwards=self.go_backwards, mask=mask, constants=constants, unroll=self.unroll, input_length=input_length) last_output = K.in_train_phase(last_output_train, last_output_test, training=training) outputs = K.in_train_phase(outputs_train, outputs_test, training=training) states = [] for state_train, state_test in zip(states_train, states_test): states.append( K.in_train_phase(state_train, state_test, training=training)) else: last_output, outputs, states, updates = rnn( self.step, preprocessed_input, initial_states, go_backwards=self.go_backwards, mask=mask, constants=constants, unroll=self.unroll, input_length=input_length) states = list(states) if self.decode: states.pop(0) if self.readout: states.pop() if self.teacher_force: states.pop() states.pop() if len(updates) > 0: self.add_update(updates) if self.stateful: updates = [] for i in range(len(states)): updates.append((self.states[i], states[i])) self.add_update(updates, inputs) # Properly set learning phase if 0 < self.dropout + self.recurrent_dropout: last_output._uses_learning_phase = True outputs._uses_learning_phase = True if self.return_sequences: y = outputs else: y = last_output if self.return_states: return [y] + states else: return y
def get_updates(self, params, loss): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay)))) t = K.cast(self.iterations, K.floatx()) + 1 lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / ( 1. - K.pow(self.beta_1, t)) shapes = [K.get_variable_shape(p) for p in params] ms = [K.zeros(shape) for shape in shapes] vs = [K.zeros(shape) for shape in shapes] self.weights = [self.iterations] + ms + vs for p, g, m, v in zip(params, grads, ms, vs): # if a weight tensor (len > 1) use weight normalized parameterization # this is the only part changed w.r.t. keras.optimizers.Adam ps = K.get_variable_shape(p) if len(ps) > 1: # get weight normalization parameters V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads( p, g) # Adam containers for the 'g' parameter V_scaler_shape = K.get_variable_shape(V_scaler) m_g = K.zeros(V_scaler_shape) v_g = K.zeros(V_scaler_shape) # update g parameters m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g) new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon) self.updates.append(K.update(m_g, m_g_t)) self.updates.append(K.update(v_g, v_g_t)) # update V parameters m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V) new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) self.updates.append(K.update(m, m_t)) self.updates.append(K.update(v, v_t)) # if there are constraints we apply them to V, not W # if p in constraints: # c = constraints[p] # new_V_param = c(new_V_param) # wn param updates --> W updates add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler) else: # do optimization normally m_t = (self.beta_1 * m) + (1. - self.beta_1) * g v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) self.updates.append(K.update(m, m_t)) self.updates.append(K.update(v, v_t)) new_p = p_t # apply constraints # if p in constraints: # c = constraints[p] # new_p = c(new_p) self.updates.append(K.update(p, new_p)) return self.updates